diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..14f5756aa0c03176509bd70ffec821cc02f5dd8f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-10575/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-12690/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2115/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4230/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-6345/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-8460/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ec3daa06f23591a6040e6b86b735afdf596aa153 --- /dev/null +++ b/README.md @@ -0,0 +1,143 @@ +--- +library_name: peft +license: llama3.1 +base_model: meta-llama/Llama-3.1-8B +tags: +- generated_from_trainer +datasets: +- ugaoo/instruction_conciseoutput__clinical_trails +model-index: +- name: out/meta_llama_Llama_3.1_8B_ugaoo_instruction_conciseoutput__clinical_trails + results: [] +--- + + + +[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl) +
See axolotl config

+
+axolotl version: `0.8.0.dev0`
+```yaml
+base_model: meta-llama/Llama-3.1-8B
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: ugaoo/instruction_conciseoutput__clinical_trails
+    type: alpaca
+val_set_size: 0
+output_dir: ./out/meta_llama_Llama_3.1_8B_ugaoo_instruction_conciseoutput__clinical_trails
+
+sequence_len: 4000
+sample_packing: true
+pad_to_sequence_len: true
+
+adapter: qlora
+lora_r: 256
+lora_alpha: 512
+lora_dropout: 0.05
+lora_target_linear: true
+lora_target_modules:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - up_proj
+  - down_proj
+  - gate_proj
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+
+wandb_project: cosmosearch
+wandb_entity:
+wandb_watch:
+wandb_name: meta_llama_Llama_3.1_8B_ugaoo_instruction_conciseoutput__clinical_trails
+wandb_log_model:
+
+gradient_accumulation_steps: 3
+micro_batch_size: 4
+num_epochs: 6
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 5e-6
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 6
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+save_total_limit: 6
+special_tokens:
+  pad_token: <|end_of_text|>
+```
+
+</details><br>
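The config above trains a rank-256 QLoRA adapter on meta-llama/Llama-3.1-8B, with `embed_tokens` and `lm_head` saved in full alongside the LoRA weights. A minimal sketch of loading the published adapter for inference with `transformers` and `peft` might look like the following; the adapter repo id, the Alpaca-style prompt layout (inferred from `type: alpaca` in the config), and the example instruction are assumptions for illustration, not taken from this repository.

```python
# Sketch: load the base model, attach the LoRA adapter, and run one generation.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical Hub id -- replace with the actual location of this adapter.
ADAPTER_ID = "your-org/meta_llama_Llama_3.1_8B_ugaoo_instruction_conciseoutput__clinical_trails"

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    torch_dtype=torch.bfloat16,  # training ran with bf16; 4-bit loading would also work
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)  # the adapter repo ships its own tokenizer files
model = PeftModel.from_pretrained(base, ADAPTER_ID)
model.eval()

# Alpaca-style prompt, assumed from `type: alpaca`; the instruction text is illustrative only.
prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\nSummarize the eligibility criteria of the following clinical trial.\n\n"
    "### Response:\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to(base.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```

Note that because `lora_modules_to_save` includes the embeddings and LM head, the adapter checkpoint is large (about 3.4 GB per the `adapter_model.safetensors` pointer below) even though the trainable core is a LoRA.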

+ +# out/meta_llama_Llama_3.1_8B_ugaoo_instruction_conciseoutput__clinical_trails + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the ugaoo/instruction_conciseoutput__clinical_trails dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-06 +- train_batch_size: 4 +- eval_batch_size: 4 +- seed: 42 +- distributed_type: multi-GPU +- gradient_accumulation_steps: 3 +- total_train_batch_size: 12 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 100 +- num_epochs: 6.0 + +### Training results + + + +### Framework versions + +- PEFT 0.14.0 +- Transformers 4.49.0 +- Pytorch 2.5.1+cu124 +- Datasets 3.2.0 +- Tokenizers 0.21.0 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a86dd37eb82fb1d46b236ea471b6db3ec90b904d --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f07f4ee71740fb0bcf54ffe982c86734f08671b6b0cb55e4add89e30c744f1b +size 3443586272 diff --git a/checkpoint-10575/README.md b/checkpoint-10575/README.md new file mode 100644 index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4 --- /dev/null +++ b/checkpoint-10575/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### 
Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-10575/adapter_config.json b/checkpoint-10575/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a --- /dev/null +++ b/checkpoint-10575/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-10575/adapter_model.safetensors 
b/checkpoint-10575/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dd54bf712e5af1f879554a9d4b6c82513809423d --- /dev/null +++ b/checkpoint-10575/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:666c561a647dd6b4f82d8fea18727b69dbae4c01c62666d44986414a5bd74a87 +size 3443586272 diff --git a/checkpoint-10575/global_step10575/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-10575/global_step10575/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bd3486dad95f939747e6070cf659a4318c3eb7d --- /dev/null +++ b/checkpoint-10575/global_step10575/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e27a0f175b033408dbfdf484eb68a273b80f9d9aaa9c00f007f85016a6987a2 +size 20661195036 diff --git a/checkpoint-10575/global_step10575/mp_rank_00_model_states.pt b/checkpoint-10575/global_step10575/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ece1202286a91934faa857aeaf7d67de4f397fdf --- /dev/null +++ b/checkpoint-10575/global_step10575/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29cfba8bd4c3d53e2fb1257abbe7914c55537330662b4e6dcdd6468dcc783459 +size 3555326841 diff --git a/checkpoint-10575/latest b/checkpoint-10575/latest new file mode 100644 index 0000000000000000000000000000000000000000..56565e819bc8ef07835bdab964cc11cd67edcd70 --- /dev/null +++ b/checkpoint-10575/latest @@ -0,0 +1 @@ +global_step10575 \ No newline at end of file diff --git a/checkpoint-10575/rng_state.pth b/checkpoint-10575/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ffe45090af0d1ab26ff3adcca35aeeee802c4527 --- /dev/null +++ b/checkpoint-10575/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb84f257048c95820717aea86696c56a0aa84f41af814a650be05453aa1aa01 +size 14244 diff --git a/checkpoint-10575/scheduler.pt b/checkpoint-10575/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4e80fb5e19f1b1c29bd0920238b2b31259cc1e2 --- /dev/null +++ b/checkpoint-10575/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9f49b370776cfadd191a2189ce69293b7feb8856f50fc048212b0479f81c0ab +size 1064 diff --git a/checkpoint-10575/special_tokens_map.json b/checkpoint-10575/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-10575/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-10575/tokenizer.json b/checkpoint-10575/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-10575/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git 
a/checkpoint-10575/tokenizer_config.json b/checkpoint-10575/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5 --- /dev/null +++ b/checkpoint-10575/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": 
"<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": 
"<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": 
"<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": 
"<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": 
"<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": 
"<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": 
"<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": 
"<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": 
"<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": 
"<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": 
"<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-10575/trainer_state.json b/checkpoint-10575/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fd40e2b98cbd186a1474c3a0212450bb18500eef --- /dev/null +++ b/checkpoint-10575/trainer_state.json @@ -0,0 +1,74058 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 10575, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00047281323877068556, + "grad_norm": 5.163570880889893, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.4628, + "step": 1 + }, + { + "epoch": 0.0009456264775413711, + "grad_norm": 6.298020839691162, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.5003, + "step": 2 + }, + { + "epoch": 0.0014184397163120568, + "grad_norm": 5.853623390197754, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.4495, + "step": 3 + }, + { + "epoch": 0.0018912529550827422, + "grad_norm": 5.456025123596191, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.3798, + "step": 4 + }, + { + "epoch": 0.002364066193853428, + "grad_norm": 5.757407188415527, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.4515, + "step": 5 + }, + { + "epoch": 0.0028368794326241137, + "grad_norm": 5.872277736663818, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4424, + "step": 6 + }, + { + "epoch": 0.003309692671394799, + "grad_norm": 6.7816009521484375, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.4004, + "step": 7 + }, + { + "epoch": 0.0037825059101654845, + "grad_norm": 6.229667663574219, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.4494, + "step": 8 + }, + { + "epoch": 0.00425531914893617, + "grad_norm": 5.336202621459961, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.3916, + "step": 9 + }, + { + "epoch": 0.004728132387706856, + "grad_norm": 5.589445114135742, + "learning_rate": 5.000000000000001e-07, + "loss": 1.2318, + "step": 10 + }, + { + "epoch": 0.005200945626477541, + "grad_norm": 5.720539569854736, + "learning_rate": 5.5e-07, + "loss": 1.4367, + "step": 11 + }, + { + "epoch": 0.005673758865248227, + "grad_norm": 5.913913726806641, + "learning_rate": 6.000000000000001e-07, + "loss": 1.342, + "step": 12 + }, + { + "epoch": 0.006146572104018913, + "grad_norm": 5.899744987487793, + "learning_rate": 6.5e-07, + "loss": 1.4307, + "step": 
13 + }, + { + "epoch": 0.006619385342789598, + "grad_norm": 5.571037292480469, + "learning_rate": 7.000000000000001e-07, + "loss": 1.3372, + "step": 14 + }, + { + "epoch": 0.0070921985815602835, + "grad_norm": 5.480010509490967, + "learning_rate": 7.5e-07, + "loss": 1.3923, + "step": 15 + }, + { + "epoch": 0.007565011820330969, + "grad_norm": 5.254702091217041, + "learning_rate": 8.000000000000001e-07, + "loss": 1.2928, + "step": 16 + }, + { + "epoch": 0.008037825059101654, + "grad_norm": 6.090312480926514, + "learning_rate": 8.500000000000001e-07, + "loss": 1.4984, + "step": 17 + }, + { + "epoch": 0.00851063829787234, + "grad_norm": 5.689319610595703, + "learning_rate": 9.000000000000001e-07, + "loss": 1.4108, + "step": 18 + }, + { + "epoch": 0.008983451536643027, + "grad_norm": 5.386685848236084, + "learning_rate": 9.500000000000001e-07, + "loss": 1.425, + "step": 19 + }, + { + "epoch": 0.009456264775413711, + "grad_norm": 6.451584815979004, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5507, + "step": 20 + }, + { + "epoch": 0.009929078014184398, + "grad_norm": 5.37647008895874, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.4109, + "step": 21 + }, + { + "epoch": 0.010401891252955082, + "grad_norm": 4.716553211212158, + "learning_rate": 1.1e-06, + "loss": 1.2028, + "step": 22 + }, + { + "epoch": 0.010874704491725768, + "grad_norm": 4.950989723205566, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.3043, + "step": 23 + }, + { + "epoch": 0.011347517730496455, + "grad_norm": 4.688975811004639, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.2708, + "step": 24 + }, + { + "epoch": 0.01182033096926714, + "grad_norm": 4.905868053436279, + "learning_rate": 1.25e-06, + "loss": 1.3268, + "step": 25 + }, + { + "epoch": 0.012293144208037825, + "grad_norm": 4.503395080566406, + "learning_rate": 1.3e-06, + "loss": 1.1799, + "step": 26 + }, + { + "epoch": 0.01276595744680851, + "grad_norm": 4.77382230758667, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.3882, + "step": 27 + }, + { + "epoch": 0.013238770685579196, + "grad_norm": 4.734329700469971, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.3476, + "step": 28 + }, + { + "epoch": 0.013711583924349883, + "grad_norm": 4.775066375732422, + "learning_rate": 1.45e-06, + "loss": 1.2429, + "step": 29 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 4.978334426879883, + "learning_rate": 1.5e-06, + "loss": 1.2119, + "step": 30 + }, + { + "epoch": 0.014657210401891253, + "grad_norm": 4.506785869598389, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.3157, + "step": 31 + }, + { + "epoch": 0.015130023640661938, + "grad_norm": 4.007757186889648, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.1451, + "step": 32 + }, + { + "epoch": 0.015602836879432624, + "grad_norm": 3.6621618270874023, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.093, + "step": 33 + }, + { + "epoch": 0.01607565011820331, + "grad_norm": 3.8733766078948975, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.2289, + "step": 34 + }, + { + "epoch": 0.016548463356973995, + "grad_norm": 4.3391900062561035, + "learning_rate": 1.75e-06, + "loss": 1.1453, + "step": 35 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 3.287623643875122, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.0257, + "step": 36 + }, + { + "epoch": 0.017494089834515367, + "grad_norm": 3.591721773147583, + "learning_rate": 1.85e-06, + "loss": 0.9976, + "step": 37 + }, + { + "epoch": 0.017966903073286054, + "grad_norm": 4.028271675109863, + 
"learning_rate": 1.9000000000000002e-06, + "loss": 1.0773, + "step": 38 + }, + { + "epoch": 0.018439716312056736, + "grad_norm": 3.3543951511383057, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.1677, + "step": 39 + }, + { + "epoch": 0.018912529550827423, + "grad_norm": 3.807624340057373, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1232, + "step": 40 + }, + { + "epoch": 0.01938534278959811, + "grad_norm": 4.242797374725342, + "learning_rate": 2.05e-06, + "loss": 1.1819, + "step": 41 + }, + { + "epoch": 0.019858156028368795, + "grad_norm": 3.4574992656707764, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9878, + "step": 42 + }, + { + "epoch": 0.02033096926713948, + "grad_norm": 3.906695604324341, + "learning_rate": 2.15e-06, + "loss": 1.0592, + "step": 43 + }, + { + "epoch": 0.020803782505910164, + "grad_norm": 3.7543163299560547, + "learning_rate": 2.2e-06, + "loss": 1.0309, + "step": 44 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 3.3777148723602295, + "learning_rate": 2.25e-06, + "loss": 1.0664, + "step": 45 + }, + { + "epoch": 0.021749408983451537, + "grad_norm": 3.6003634929656982, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.0482, + "step": 46 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 3.3961377143859863, + "learning_rate": 2.35e-06, + "loss": 1.0252, + "step": 47 + }, + { + "epoch": 0.02269503546099291, + "grad_norm": 3.1601035594940186, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.0435, + "step": 48 + }, + { + "epoch": 0.023167848699763592, + "grad_norm": 3.4192967414855957, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.0935, + "step": 49 + }, + { + "epoch": 0.02364066193853428, + "grad_norm": 3.1225922107696533, + "learning_rate": 2.5e-06, + "loss": 0.8988, + "step": 50 + }, + { + "epoch": 0.024113475177304965, + "grad_norm": 3.1423380374908447, + "learning_rate": 2.55e-06, + "loss": 1.0159, + "step": 51 + }, + { + "epoch": 0.02458628841607565, + "grad_norm": 3.4782402515411377, + "learning_rate": 2.6e-06, + "loss": 1.0231, + "step": 52 + }, + { + "epoch": 0.025059101654846337, + "grad_norm": 3.8362693786621094, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.0725, + "step": 53 + }, + { + "epoch": 0.02553191489361702, + "grad_norm": 3.033294916152954, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.9377, + "step": 54 + }, + { + "epoch": 0.026004728132387706, + "grad_norm": 3.849741220474243, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.0046, + "step": 55 + }, + { + "epoch": 0.026477541371158392, + "grad_norm": 3.141876220703125, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9226, + "step": 56 + }, + { + "epoch": 0.02695035460992908, + "grad_norm": 2.773594856262207, + "learning_rate": 2.85e-06, + "loss": 0.8662, + "step": 57 + }, + { + "epoch": 0.027423167848699765, + "grad_norm": 3.1460225582122803, + "learning_rate": 2.9e-06, + "loss": 0.9304, + "step": 58 + }, + { + "epoch": 0.027895981087470448, + "grad_norm": 3.293583631515503, + "learning_rate": 2.95e-06, + "loss": 1.0374, + "step": 59 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 3.8190863132476807, + "learning_rate": 3e-06, + "loss": 0.971, + "step": 60 + }, + { + "epoch": 0.02884160756501182, + "grad_norm": 3.4566776752471924, + "learning_rate": 3.05e-06, + "loss": 0.9631, + "step": 61 + }, + { + "epoch": 0.029314420803782507, + "grad_norm": 3.355741500854492, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.0097, + "step": 62 + }, + { + "epoch": 0.029787234042553193, + "grad_norm": 
3.29746675491333, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.9459, + "step": 63 + }, + { + "epoch": 0.030260047281323876, + "grad_norm": 3.3122968673706055, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.8594, + "step": 64 + }, + { + "epoch": 0.030732860520094562, + "grad_norm": 3.477701187133789, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.9197, + "step": 65 + }, + { + "epoch": 0.031205673758865248, + "grad_norm": 3.3363406658172607, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.9478, + "step": 66 + }, + { + "epoch": 0.03167848699763593, + "grad_norm": 4.143295764923096, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0534, + "step": 67 + }, + { + "epoch": 0.03215130023640662, + "grad_norm": 3.2363274097442627, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9454, + "step": 68 + }, + { + "epoch": 0.032624113475177303, + "grad_norm": 3.198746681213379, + "learning_rate": 3.45e-06, + "loss": 0.9388, + "step": 69 + }, + { + "epoch": 0.03309692671394799, + "grad_norm": 3.5751023292541504, + "learning_rate": 3.5e-06, + "loss": 0.9444, + "step": 70 + }, + { + "epoch": 0.033569739952718676, + "grad_norm": 3.1745729446411133, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8683, + "step": 71 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 3.3210883140563965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.8811, + "step": 72 + }, + { + "epoch": 0.03451536643026005, + "grad_norm": 3.2502429485321045, + "learning_rate": 3.65e-06, + "loss": 1.0012, + "step": 73 + }, + { + "epoch": 0.034988179669030735, + "grad_norm": 3.44598126411438, + "learning_rate": 3.7e-06, + "loss": 0.9217, + "step": 74 + }, + { + "epoch": 0.03546099290780142, + "grad_norm": 3.439117431640625, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8976, + "step": 75 + }, + { + "epoch": 0.03593380614657211, + "grad_norm": 3.523627758026123, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8996, + "step": 76 + }, + { + "epoch": 0.03640661938534279, + "grad_norm": 3.3716015815734863, + "learning_rate": 3.85e-06, + "loss": 0.9061, + "step": 77 + }, + { + "epoch": 0.03687943262411347, + "grad_norm": 3.33518385887146, + "learning_rate": 3.900000000000001e-06, + "loss": 0.9371, + "step": 78 + }, + { + "epoch": 0.03735224586288416, + "grad_norm": 3.833829879760742, + "learning_rate": 3.95e-06, + "loss": 0.9669, + "step": 79 + }, + { + "epoch": 0.037825059101654845, + "grad_norm": 3.260446786880493, + "learning_rate": 4.000000000000001e-06, + "loss": 0.9449, + "step": 80 + }, + { + "epoch": 0.03829787234042553, + "grad_norm": 3.532451629638672, + "learning_rate": 4.05e-06, + "loss": 0.897, + "step": 81 + }, + { + "epoch": 0.03877068557919622, + "grad_norm": 3.1156492233276367, + "learning_rate": 4.1e-06, + "loss": 0.8463, + "step": 82 + }, + { + "epoch": 0.039243498817966904, + "grad_norm": 2.8801751136779785, + "learning_rate": 4.15e-06, + "loss": 0.8616, + "step": 83 + }, + { + "epoch": 0.03971631205673759, + "grad_norm": 3.072476863861084, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.8387, + "step": 84 + }, + { + "epoch": 0.04018912529550828, + "grad_norm": 2.9601376056671143, + "learning_rate": 4.25e-06, + "loss": 0.8538, + "step": 85 + }, + { + "epoch": 0.04066193853427896, + "grad_norm": 3.521664619445801, + "learning_rate": 4.3e-06, + "loss": 0.8894, + "step": 86 + }, + { + "epoch": 0.04113475177304964, + "grad_norm": 3.2670981884002686, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8387, + "step": 87 + }, + { + "epoch": 
0.04160756501182033, + "grad_norm": 3.422089099884033, + "learning_rate": 4.4e-06, + "loss": 0.7728, + "step": 88 + }, + { + "epoch": 0.042080378250591015, + "grad_norm": 3.414034128189087, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7968, + "step": 89 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 4.234285354614258, + "learning_rate": 4.5e-06, + "loss": 0.8502, + "step": 90 + }, + { + "epoch": 0.04302600472813239, + "grad_norm": 3.1446919441223145, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.8236, + "step": 91 + }, + { + "epoch": 0.043498817966903074, + "grad_norm": 3.683443307876587, + "learning_rate": 4.600000000000001e-06, + "loss": 0.9792, + "step": 92 + }, + { + "epoch": 0.04397163120567376, + "grad_norm": 3.664219617843628, + "learning_rate": 4.65e-06, + "loss": 0.8743, + "step": 93 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 3.369479179382324, + "learning_rate": 4.7e-06, + "loss": 0.8741, + "step": 94 + }, + { + "epoch": 0.04491725768321513, + "grad_norm": 3.694949150085449, + "learning_rate": 4.75e-06, + "loss": 0.7574, + "step": 95 + }, + { + "epoch": 0.04539007092198582, + "grad_norm": 3.5144498348236084, + "learning_rate": 4.800000000000001e-06, + "loss": 0.9934, + "step": 96 + }, + { + "epoch": 0.0458628841607565, + "grad_norm": 3.164451837539673, + "learning_rate": 4.85e-06, + "loss": 0.7463, + "step": 97 + }, + { + "epoch": 0.046335697399527184, + "grad_norm": 3.222785472869873, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7698, + "step": 98 + }, + { + "epoch": 0.04680851063829787, + "grad_norm": 2.9129555225372314, + "learning_rate": 4.95e-06, + "loss": 0.7856, + "step": 99 + }, + { + "epoch": 0.04728132387706856, + "grad_norm": 3.5061235427856445, + "learning_rate": 5e-06, + "loss": 0.8588, + "step": 100 + }, + { + "epoch": 0.04775413711583924, + "grad_norm": 3.2805044651031494, + "learning_rate": 4.999999922167982e-06, + "loss": 0.7643, + "step": 101 + }, + { + "epoch": 0.04822695035460993, + "grad_norm": 3.5461678504943848, + "learning_rate": 4.999999688671929e-06, + "loss": 0.8253, + "step": 102 + }, + { + "epoch": 0.048699763593380616, + "grad_norm": 3.2238264083862305, + "learning_rate": 4.99999929951186e-06, + "loss": 0.7622, + "step": 103 + }, + { + "epoch": 0.0491725768321513, + "grad_norm": 3.818955898284912, + "learning_rate": 4.999998754687795e-06, + "loss": 0.8471, + "step": 104 + }, + { + "epoch": 0.04964539007092199, + "grad_norm": 3.1252424716949463, + "learning_rate": 4.99999805419977e-06, + "loss": 0.8409, + "step": 105 + }, + { + "epoch": 0.050118203309692674, + "grad_norm": 3.604283571243286, + "learning_rate": 4.999997198047828e-06, + "loss": 0.9027, + "step": 106 + }, + { + "epoch": 0.050591016548463354, + "grad_norm": 3.6752424240112305, + "learning_rate": 4.999996186232023e-06, + "loss": 0.9336, + "step": 107 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 3.517557144165039, + "learning_rate": 4.9999950187524184e-06, + "loss": 0.8351, + "step": 108 + }, + { + "epoch": 0.051536643026004726, + "grad_norm": 3.427285671234131, + "learning_rate": 4.999993695609085e-06, + "loss": 0.8457, + "step": 109 + }, + { + "epoch": 0.05200945626477541, + "grad_norm": 3.2792510986328125, + "learning_rate": 4.999992216802107e-06, + "loss": 0.8391, + "step": 110 + }, + { + "epoch": 0.0524822695035461, + "grad_norm": 3.581094741821289, + "learning_rate": 4.999990582331576e-06, + "loss": 0.7533, + "step": 111 + }, + { + "epoch": 0.052955082742316785, + "grad_norm": 3.1667377948760986, + "learning_rate": 
4.999988792197593e-06, + "loss": 0.9562, + "step": 112 + }, + { + "epoch": 0.05342789598108747, + "grad_norm": 3.3609890937805176, + "learning_rate": 4.99998684640027e-06, + "loss": 0.8181, + "step": 113 + }, + { + "epoch": 0.05390070921985816, + "grad_norm": 3.260627269744873, + "learning_rate": 4.999984744939729e-06, + "loss": 0.8012, + "step": 114 + }, + { + "epoch": 0.054373522458628844, + "grad_norm": 3.4535653591156006, + "learning_rate": 4.9999824878160985e-06, + "loss": 0.919, + "step": 115 + }, + { + "epoch": 0.05484633569739953, + "grad_norm": 3.4880740642547607, + "learning_rate": 4.999980075029522e-06, + "loss": 0.8114, + "step": 116 + }, + { + "epoch": 0.05531914893617021, + "grad_norm": 3.2546932697296143, + "learning_rate": 4.999977506580147e-06, + "loss": 0.8274, + "step": 117 + }, + { + "epoch": 0.055791962174940896, + "grad_norm": 3.2762744426727295, + "learning_rate": 4.999974782468136e-06, + "loss": 0.9018, + "step": 118 + }, + { + "epoch": 0.05626477541371158, + "grad_norm": 3.42825984954834, + "learning_rate": 4.999971902693657e-06, + "loss": 0.8262, + "step": 119 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 3.082496404647827, + "learning_rate": 4.99996886725689e-06, + "loss": 0.8181, + "step": 120 + }, + { + "epoch": 0.057210401891252954, + "grad_norm": 3.322869300842285, + "learning_rate": 4.9999656761580225e-06, + "loss": 0.8382, + "step": 121 + }, + { + "epoch": 0.05768321513002364, + "grad_norm": 3.6365339756011963, + "learning_rate": 4.9999623293972555e-06, + "loss": 0.7489, + "step": 122 + }, + { + "epoch": 0.05815602836879433, + "grad_norm": 3.376352548599243, + "learning_rate": 4.999958826974796e-06, + "loss": 0.9012, + "step": 123 + }, + { + "epoch": 0.05862884160756501, + "grad_norm": 3.49088716506958, + "learning_rate": 4.999955168890862e-06, + "loss": 0.8999, + "step": 124 + }, + { + "epoch": 0.0591016548463357, + "grad_norm": 3.3265068531036377, + "learning_rate": 4.999951355145682e-06, + "loss": 0.8161, + "step": 125 + }, + { + "epoch": 0.059574468085106386, + "grad_norm": 3.697282314300537, + "learning_rate": 4.999947385739493e-06, + "loss": 0.9623, + "step": 126 + }, + { + "epoch": 0.06004728132387707, + "grad_norm": 2.7901928424835205, + "learning_rate": 4.999943260672542e-06, + "loss": 0.7371, + "step": 127 + }, + { + "epoch": 0.06052009456264775, + "grad_norm": 3.110319137573242, + "learning_rate": 4.999938979945086e-06, + "loss": 0.715, + "step": 128 + }, + { + "epoch": 0.06099290780141844, + "grad_norm": 3.2211520671844482, + "learning_rate": 4.999934543557392e-06, + "loss": 0.8888, + "step": 129 + }, + { + "epoch": 0.061465721040189124, + "grad_norm": 3.2466187477111816, + "learning_rate": 4.999929951509735e-06, + "loss": 0.9389, + "step": 130 + }, + { + "epoch": 0.06193853427895981, + "grad_norm": 3.3574399948120117, + "learning_rate": 4.999925203802403e-06, + "loss": 0.8263, + "step": 131 + }, + { + "epoch": 0.062411347517730496, + "grad_norm": 3.275601625442505, + "learning_rate": 4.99992030043569e-06, + "loss": 0.8338, + "step": 132 + }, + { + "epoch": 0.06288416075650118, + "grad_norm": 3.6011312007904053, + "learning_rate": 4.999915241409902e-06, + "loss": 0.8351, + "step": 133 + }, + { + "epoch": 0.06335697399527186, + "grad_norm": 2.969011068344116, + "learning_rate": 4.999910026725352e-06, + "loss": 0.79, + "step": 134 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 3.690784454345703, + "learning_rate": 4.999904656382369e-06, + "loss": 0.8209, + "step": 135 + }, + { + "epoch": 0.06430260047281323, + "grad_norm": 
3.3363115787506104, + "learning_rate": 4.999899130381283e-06, + "loss": 0.858, + "step": 136 + }, + { + "epoch": 0.06477541371158392, + "grad_norm": 3.206881523132324, + "learning_rate": 4.9998934487224405e-06, + "loss": 0.834, + "step": 137 + }, + { + "epoch": 0.06524822695035461, + "grad_norm": 2.773146152496338, + "learning_rate": 4.999887611406195e-06, + "loss": 0.7576, + "step": 138 + }, + { + "epoch": 0.0657210401891253, + "grad_norm": 3.307725667953491, + "learning_rate": 4.999881618432908e-06, + "loss": 0.7487, + "step": 139 + }, + { + "epoch": 0.06619385342789598, + "grad_norm": 4.273657321929932, + "learning_rate": 4.999875469802956e-06, + "loss": 0.8176, + "step": 140 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 3.0898005962371826, + "learning_rate": 4.999869165516719e-06, + "loss": 0.7578, + "step": 141 + }, + { + "epoch": 0.06713947990543735, + "grad_norm": 3.25150990486145, + "learning_rate": 4.9998627055745915e-06, + "loss": 0.7873, + "step": 142 + }, + { + "epoch": 0.06761229314420804, + "grad_norm": 2.9705755710601807, + "learning_rate": 4.999856089976974e-06, + "loss": 0.6473, + "step": 143 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 3.5658507347106934, + "learning_rate": 4.9998493187242804e-06, + "loss": 0.855, + "step": 144 + }, + { + "epoch": 0.06855791962174941, + "grad_norm": 3.3994076251983643, + "learning_rate": 4.99984239181693e-06, + "loss": 0.7926, + "step": 145 + }, + { + "epoch": 0.0690307328605201, + "grad_norm": 2.8266260623931885, + "learning_rate": 4.999835309255357e-06, + "loss": 0.7564, + "step": 146 + }, + { + "epoch": 0.06950354609929078, + "grad_norm": 3.1143875122070312, + "learning_rate": 4.999828071039999e-06, + "loss": 0.8398, + "step": 147 + }, + { + "epoch": 0.06997635933806147, + "grad_norm": 2.9364278316497803, + "learning_rate": 4.99982067717131e-06, + "loss": 0.7381, + "step": 148 + }, + { + "epoch": 0.07044917257683216, + "grad_norm": 3.4155616760253906, + "learning_rate": 4.999813127649748e-06, + "loss": 0.7933, + "step": 149 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 4.371236324310303, + "learning_rate": 4.999805422475784e-06, + "loss": 0.8292, + "step": 150 + }, + { + "epoch": 0.07139479905437353, + "grad_norm": 3.3967185020446777, + "learning_rate": 4.999797561649897e-06, + "loss": 0.8712, + "step": 151 + }, + { + "epoch": 0.07186761229314421, + "grad_norm": 3.343303680419922, + "learning_rate": 4.999789545172578e-06, + "loss": 0.8177, + "step": 152 + }, + { + "epoch": 0.07234042553191489, + "grad_norm": 3.040235757827759, + "learning_rate": 4.999781373044325e-06, + "loss": 0.7379, + "step": 153 + }, + { + "epoch": 0.07281323877068557, + "grad_norm": 3.4069204330444336, + "learning_rate": 4.999773045265647e-06, + "loss": 0.7939, + "step": 154 + }, + { + "epoch": 0.07328605200945626, + "grad_norm": 3.1939475536346436, + "learning_rate": 4.999764561837063e-06, + "loss": 0.8037, + "step": 155 + }, + { + "epoch": 0.07375886524822695, + "grad_norm": 4.452004909515381, + "learning_rate": 4.999755922759101e-06, + "loss": 0.8421, + "step": 156 + }, + { + "epoch": 0.07423167848699763, + "grad_norm": 3.2031240463256836, + "learning_rate": 4.999747128032298e-06, + "loss": 0.794, + "step": 157 + }, + { + "epoch": 0.07470449172576832, + "grad_norm": 3.175920009613037, + "learning_rate": 4.999738177657203e-06, + "loss": 0.759, + "step": 158 + }, + { + "epoch": 0.075177304964539, + "grad_norm": 3.7679688930511475, + "learning_rate": 4.9997290716343725e-06, + "loss": 0.8174, + "step": 159 + }, + { + "epoch": 
0.07565011820330969, + "grad_norm": 3.7020037174224854, + "learning_rate": 4.999719809964373e-06, + "loss": 0.7116, + "step": 160 + }, + { + "epoch": 0.07612293144208038, + "grad_norm": 4.357471942901611, + "learning_rate": 4.999710392647783e-06, + "loss": 0.7649, + "step": 161 + }, + { + "epoch": 0.07659574468085106, + "grad_norm": 3.3439087867736816, + "learning_rate": 4.999700819685187e-06, + "loss": 0.7907, + "step": 162 + }, + { + "epoch": 0.07706855791962175, + "grad_norm": 3.210815191268921, + "learning_rate": 4.999691091077182e-06, + "loss": 0.8446, + "step": 163 + }, + { + "epoch": 0.07754137115839244, + "grad_norm": 3.1029553413391113, + "learning_rate": 4.9996812068243735e-06, + "loss": 0.7232, + "step": 164 + }, + { + "epoch": 0.07801418439716312, + "grad_norm": 2.9389400482177734, + "learning_rate": 4.999671166927378e-06, + "loss": 0.7413, + "step": 165 + }, + { + "epoch": 0.07848699763593381, + "grad_norm": 3.7062697410583496, + "learning_rate": 4.9996609713868185e-06, + "loss": 0.8773, + "step": 166 + }, + { + "epoch": 0.0789598108747045, + "grad_norm": 3.2768924236297607, + "learning_rate": 4.999650620203332e-06, + "loss": 0.8046, + "step": 167 + }, + { + "epoch": 0.07943262411347518, + "grad_norm": 3.380373001098633, + "learning_rate": 4.999640113377561e-06, + "loss": 0.7529, + "step": 168 + }, + { + "epoch": 0.07990543735224587, + "grad_norm": 3.520022392272949, + "learning_rate": 4.999629450910162e-06, + "loss": 0.7352, + "step": 169 + }, + { + "epoch": 0.08037825059101655, + "grad_norm": 3.43269419670105, + "learning_rate": 4.999618632801796e-06, + "loss": 0.9371, + "step": 170 + }, + { + "epoch": 0.08085106382978724, + "grad_norm": 3.555877923965454, + "learning_rate": 4.99960765905314e-06, + "loss": 0.8276, + "step": 171 + }, + { + "epoch": 0.08132387706855793, + "grad_norm": 3.597050189971924, + "learning_rate": 4.999596529664874e-06, + "loss": 0.8164, + "step": 172 + }, + { + "epoch": 0.0817966903073286, + "grad_norm": 3.2002956867218018, + "learning_rate": 4.999585244637693e-06, + "loss": 0.7824, + "step": 173 + }, + { + "epoch": 0.08226950354609928, + "grad_norm": 3.527275562286377, + "learning_rate": 4.999573803972299e-06, + "loss": 0.8033, + "step": 174 + }, + { + "epoch": 0.08274231678486997, + "grad_norm": 3.5184452533721924, + "learning_rate": 4.999562207669405e-06, + "loss": 0.724, + "step": 175 + }, + { + "epoch": 0.08321513002364066, + "grad_norm": 3.6635067462921143, + "learning_rate": 4.999550455729732e-06, + "loss": 0.819, + "step": 176 + }, + { + "epoch": 0.08368794326241134, + "grad_norm": 3.192399740219116, + "learning_rate": 4.999538548154012e-06, + "loss": 0.7999, + "step": 177 + }, + { + "epoch": 0.08416075650118203, + "grad_norm": 3.0946953296661377, + "learning_rate": 4.999526484942988e-06, + "loss": 0.7367, + "step": 178 + }, + { + "epoch": 0.08463356973995272, + "grad_norm": 2.847198009490967, + "learning_rate": 4.99951426609741e-06, + "loss": 0.7536, + "step": 179 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.7674827575683594, + "learning_rate": 4.999501891618037e-06, + "loss": 0.701, + "step": 180 + }, + { + "epoch": 0.08557919621749409, + "grad_norm": 3.357933521270752, + "learning_rate": 4.999489361505643e-06, + "loss": 0.8331, + "step": 181 + }, + { + "epoch": 0.08605200945626477, + "grad_norm": 3.1464426517486572, + "learning_rate": 4.999476675761004e-06, + "loss": 0.7931, + "step": 182 + }, + { + "epoch": 0.08652482269503546, + "grad_norm": 3.310697078704834, + "learning_rate": 4.999463834384915e-06, + "loss": 0.753, + 
"step": 183 + }, + { + "epoch": 0.08699763593380615, + "grad_norm": 2.9794881343841553, + "learning_rate": 4.999450837378171e-06, + "loss": 0.7091, + "step": 184 + }, + { + "epoch": 0.08747044917257683, + "grad_norm": 3.0776889324188232, + "learning_rate": 4.999437684741584e-06, + "loss": 0.7226, + "step": 185 + }, + { + "epoch": 0.08794326241134752, + "grad_norm": 3.6657519340515137, + "learning_rate": 4.999424376475972e-06, + "loss": 0.845, + "step": 186 + }, + { + "epoch": 0.0884160756501182, + "grad_norm": 3.872718572616577, + "learning_rate": 4.999410912582164e-06, + "loss": 0.812, + "step": 187 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.9184508323669434, + "learning_rate": 4.9993972930609976e-06, + "loss": 0.6823, + "step": 188 + }, + { + "epoch": 0.08936170212765958, + "grad_norm": 3.5567142963409424, + "learning_rate": 4.999383517913321e-06, + "loss": 0.7614, + "step": 189 + }, + { + "epoch": 0.08983451536643026, + "grad_norm": 3.3688533306121826, + "learning_rate": 4.999369587139992e-06, + "loss": 0.858, + "step": 190 + }, + { + "epoch": 0.09030732860520095, + "grad_norm": 2.893223524093628, + "learning_rate": 4.99935550074188e-06, + "loss": 0.6761, + "step": 191 + }, + { + "epoch": 0.09078014184397164, + "grad_norm": 3.400225877761841, + "learning_rate": 4.999341258719859e-06, + "loss": 0.7531, + "step": 192 + }, + { + "epoch": 0.09125295508274232, + "grad_norm": 3.6167714595794678, + "learning_rate": 4.999326861074817e-06, + "loss": 0.8164, + "step": 193 + }, + { + "epoch": 0.091725768321513, + "grad_norm": 4.325016498565674, + "learning_rate": 4.9993123078076506e-06, + "loss": 0.7069, + "step": 194 + }, + { + "epoch": 0.09219858156028368, + "grad_norm": 3.195317029953003, + "learning_rate": 4.999297598919266e-06, + "loss": 0.726, + "step": 195 + }, + { + "epoch": 0.09267139479905437, + "grad_norm": 3.146530866622925, + "learning_rate": 4.999282734410579e-06, + "loss": 0.7888, + "step": 196 + }, + { + "epoch": 0.09314420803782505, + "grad_norm": 3.5166752338409424, + "learning_rate": 4.999267714282515e-06, + "loss": 0.8473, + "step": 197 + }, + { + "epoch": 0.09361702127659574, + "grad_norm": 3.3140196800231934, + "learning_rate": 4.99925253853601e-06, + "loss": 0.7233, + "step": 198 + }, + { + "epoch": 0.09408983451536643, + "grad_norm": 3.0318164825439453, + "learning_rate": 4.999237207172008e-06, + "loss": 0.7543, + "step": 199 + }, + { + "epoch": 0.09456264775413711, + "grad_norm": 3.662214756011963, + "learning_rate": 4.999221720191464e-06, + "loss": 0.7783, + "step": 200 + }, + { + "epoch": 0.0950354609929078, + "grad_norm": 3.452078104019165, + "learning_rate": 4.9992060775953425e-06, + "loss": 0.7868, + "step": 201 + }, + { + "epoch": 0.09550827423167849, + "grad_norm": 3.4051287174224854, + "learning_rate": 4.999190279384617e-06, + "loss": 0.7849, + "step": 202 + }, + { + "epoch": 0.09598108747044917, + "grad_norm": 3.1377196311950684, + "learning_rate": 4.999174325560271e-06, + "loss": 0.8364, + "step": 203 + }, + { + "epoch": 0.09645390070921986, + "grad_norm": 3.129473924636841, + "learning_rate": 4.999158216123299e-06, + "loss": 0.7458, + "step": 204 + }, + { + "epoch": 0.09692671394799054, + "grad_norm": 3.169548749923706, + "learning_rate": 4.999141951074703e-06, + "loss": 0.7256, + "step": 205 + }, + { + "epoch": 0.09739952718676123, + "grad_norm": 3.186009168624878, + "learning_rate": 4.999125530415495e-06, + "loss": 0.783, + "step": 206 + }, + { + "epoch": 0.09787234042553192, + "grad_norm": 3.0995123386383057, + "learning_rate": 
4.9991089541467e-06, + "loss": 0.7519, + "step": 207 + }, + { + "epoch": 0.0983451536643026, + "grad_norm": 3.1854088306427, + "learning_rate": 4.999092222269348e-06, + "loss": 0.7444, + "step": 208 + }, + { + "epoch": 0.09881796690307329, + "grad_norm": 3.1512246131896973, + "learning_rate": 4.999075334784482e-06, + "loss": 0.7882, + "step": 209 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 3.6199698448181152, + "learning_rate": 4.999058291693153e-06, + "loss": 0.8048, + "step": 210 + }, + { + "epoch": 0.09976359338061466, + "grad_norm": 2.956907272338867, + "learning_rate": 4.999041092996422e-06, + "loss": 0.7663, + "step": 211 + }, + { + "epoch": 0.10023640661938535, + "grad_norm": 3.3493971824645996, + "learning_rate": 4.99902373869536e-06, + "loss": 0.7639, + "step": 212 + }, + { + "epoch": 0.10070921985815603, + "grad_norm": 3.144812822341919, + "learning_rate": 4.9990062287910475e-06, + "loss": 0.7953, + "step": 213 + }, + { + "epoch": 0.10118203309692671, + "grad_norm": 3.5986971855163574, + "learning_rate": 4.998988563284576e-06, + "loss": 0.8297, + "step": 214 + }, + { + "epoch": 0.1016548463356974, + "grad_norm": 3.447584867477417, + "learning_rate": 4.998970742177044e-06, + "loss": 0.808, + "step": 215 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 3.791353940963745, + "learning_rate": 4.998952765469562e-06, + "loss": 0.8005, + "step": 216 + }, + { + "epoch": 0.10260047281323877, + "grad_norm": 3.4490807056427, + "learning_rate": 4.998934633163247e-06, + "loss": 0.8135, + "step": 217 + }, + { + "epoch": 0.10307328605200945, + "grad_norm": 3.1053314208984375, + "learning_rate": 4.998916345259232e-06, + "loss": 0.7888, + "step": 218 + }, + { + "epoch": 0.10354609929078014, + "grad_norm": 3.407862663269043, + "learning_rate": 4.9988979017586514e-06, + "loss": 0.7099, + "step": 219 + }, + { + "epoch": 0.10401891252955082, + "grad_norm": 3.116656541824341, + "learning_rate": 4.998879302662658e-06, + "loss": 0.8344, + "step": 220 + }, + { + "epoch": 0.10449172576832151, + "grad_norm": 3.339264154434204, + "learning_rate": 4.998860547972406e-06, + "loss": 0.8496, + "step": 221 + }, + { + "epoch": 0.1049645390070922, + "grad_norm": 3.251892566680908, + "learning_rate": 4.998841637689066e-06, + "loss": 0.7455, + "step": 222 + }, + { + "epoch": 0.10543735224586288, + "grad_norm": 4.098135471343994, + "learning_rate": 4.998822571813814e-06, + "loss": 0.7772, + "step": 223 + }, + { + "epoch": 0.10591016548463357, + "grad_norm": 3.9871134757995605, + "learning_rate": 4.998803350347837e-06, + "loss": 0.8261, + "step": 224 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 3.2822303771972656, + "learning_rate": 4.998783973292333e-06, + "loss": 0.8623, + "step": 225 + }, + { + "epoch": 0.10685579196217494, + "grad_norm": 3.0356857776641846, + "learning_rate": 4.998764440648507e-06, + "loss": 0.7426, + "step": 226 + }, + { + "epoch": 0.10732860520094563, + "grad_norm": 2.8932785987854004, + "learning_rate": 4.998744752417576e-06, + "loss": 0.6741, + "step": 227 + }, + { + "epoch": 0.10780141843971631, + "grad_norm": 3.085820436477661, + "learning_rate": 4.998724908600767e-06, + "loss": 0.6549, + "step": 228 + }, + { + "epoch": 0.108274231678487, + "grad_norm": 3.135829210281372, + "learning_rate": 4.998704909199314e-06, + "loss": 0.6702, + "step": 229 + }, + { + "epoch": 0.10874704491725769, + "grad_norm": 5.016134262084961, + "learning_rate": 4.9986847542144625e-06, + "loss": 0.7852, + "step": 230 + }, + { + "epoch": 0.10921985815602837, + "grad_norm": 
3.9056200981140137, + "learning_rate": 4.998664443647468e-06, + "loss": 0.9654, + "step": 231 + }, + { + "epoch": 0.10969267139479906, + "grad_norm": 3.0880749225616455, + "learning_rate": 4.998643977499595e-06, + "loss": 0.7579, + "step": 232 + }, + { + "epoch": 0.11016548463356975, + "grad_norm": 3.6893601417541504, + "learning_rate": 4.998623355772118e-06, + "loss": 0.713, + "step": 233 + }, + { + "epoch": 0.11063829787234042, + "grad_norm": 4.181536674499512, + "learning_rate": 4.998602578466319e-06, + "loss": 0.7331, + "step": 234 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 3.036386728286743, + "learning_rate": 4.998581645583496e-06, + "loss": 0.7115, + "step": 235 + }, + { + "epoch": 0.11158392434988179, + "grad_norm": 3.6333255767822266, + "learning_rate": 4.998560557124948e-06, + "loss": 0.7544, + "step": 236 + }, + { + "epoch": 0.11205673758865248, + "grad_norm": 2.926417827606201, + "learning_rate": 4.9985393130919915e-06, + "loss": 0.715, + "step": 237 + }, + { + "epoch": 0.11252955082742316, + "grad_norm": 2.969158172607422, + "learning_rate": 4.998517913485946e-06, + "loss": 0.7304, + "step": 238 + }, + { + "epoch": 0.11300236406619385, + "grad_norm": 3.5254971981048584, + "learning_rate": 4.9984963583081466e-06, + "loss": 0.7725, + "step": 239 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 3.7840335369110107, + "learning_rate": 4.998474647559936e-06, + "loss": 0.8685, + "step": 240 + }, + { + "epoch": 0.11394799054373522, + "grad_norm": 3.0333125591278076, + "learning_rate": 4.9984527812426625e-06, + "loss": 0.7793, + "step": 241 + }, + { + "epoch": 0.11442080378250591, + "grad_norm": 3.290159225463867, + "learning_rate": 4.99843075935769e-06, + "loss": 0.7158, + "step": 242 + }, + { + "epoch": 0.1148936170212766, + "grad_norm": 3.3935494422912598, + "learning_rate": 4.99840858190639e-06, + "loss": 0.7643, + "step": 243 + }, + { + "epoch": 0.11536643026004728, + "grad_norm": 3.333965539932251, + "learning_rate": 4.998386248890142e-06, + "loss": 0.7255, + "step": 244 + }, + { + "epoch": 0.11583924349881797, + "grad_norm": 2.8129613399505615, + "learning_rate": 4.998363760310339e-06, + "loss": 0.768, + "step": 245 + }, + { + "epoch": 0.11631205673758865, + "grad_norm": 2.8678107261657715, + "learning_rate": 4.998341116168378e-06, + "loss": 0.7403, + "step": 246 + }, + { + "epoch": 0.11678486997635934, + "grad_norm": 2.8898239135742188, + "learning_rate": 4.998318316465672e-06, + "loss": 0.6844, + "step": 247 + }, + { + "epoch": 0.11725768321513003, + "grad_norm": 3.139777898788452, + "learning_rate": 4.998295361203637e-06, + "loss": 0.7936, + "step": 248 + }, + { + "epoch": 0.11773049645390071, + "grad_norm": 3.393721103668213, + "learning_rate": 4.998272250383707e-06, + "loss": 0.8173, + "step": 249 + }, + { + "epoch": 0.1182033096926714, + "grad_norm": 3.240973949432373, + "learning_rate": 4.998248984007318e-06, + "loss": 0.8252, + "step": 250 + }, + { + "epoch": 0.11867612293144209, + "grad_norm": 3.384855031967163, + "learning_rate": 4.998225562075918e-06, + "loss": 0.7244, + "step": 251 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 3.1881816387176514, + "learning_rate": 4.9982019845909675e-06, + "loss": 0.6818, + "step": 252 + }, + { + "epoch": 0.11962174940898346, + "grad_norm": 2.888364553451538, + "learning_rate": 4.998178251553934e-06, + "loss": 0.6753, + "step": 253 + }, + { + "epoch": 0.12009456264775414, + "grad_norm": 3.630093812942505, + "learning_rate": 4.9981543629662944e-06, + "loss": 0.7995, + "step": 254 + }, + { + "epoch": 
0.12056737588652482, + "grad_norm": 2.9820947647094727, + "learning_rate": 4.998130318829537e-06, + "loss": 0.7478, + "step": 255 + }, + { + "epoch": 0.1210401891252955, + "grad_norm": 2.7094738483428955, + "learning_rate": 4.998106119145159e-06, + "loss": 0.7237, + "step": 256 + }, + { + "epoch": 0.12151300236406619, + "grad_norm": 3.1808104515075684, + "learning_rate": 4.9980817639146665e-06, + "loss": 0.7915, + "step": 257 + }, + { + "epoch": 0.12198581560283688, + "grad_norm": 3.1661291122436523, + "learning_rate": 4.998057253139575e-06, + "loss": 0.8053, + "step": 258 + }, + { + "epoch": 0.12245862884160756, + "grad_norm": 3.528749942779541, + "learning_rate": 4.998032586821413e-06, + "loss": 0.7946, + "step": 259 + }, + { + "epoch": 0.12293144208037825, + "grad_norm": 3.125964879989624, + "learning_rate": 4.998007764961716e-06, + "loss": 0.7569, + "step": 260 + }, + { + "epoch": 0.12340425531914893, + "grad_norm": 3.0778942108154297, + "learning_rate": 4.997982787562029e-06, + "loss": 0.7184, + "step": 261 + }, + { + "epoch": 0.12387706855791962, + "grad_norm": 3.3531930446624756, + "learning_rate": 4.997957654623906e-06, + "loss": 0.7586, + "step": 262 + }, + { + "epoch": 0.1243498817966903, + "grad_norm": 3.229278564453125, + "learning_rate": 4.997932366148913e-06, + "loss": 0.6092, + "step": 263 + }, + { + "epoch": 0.12482269503546099, + "grad_norm": 3.7286155223846436, + "learning_rate": 4.997906922138626e-06, + "loss": 0.7965, + "step": 264 + }, + { + "epoch": 0.12529550827423167, + "grad_norm": 3.300311803817749, + "learning_rate": 4.997881322594628e-06, + "loss": 0.7665, + "step": 265 + }, + { + "epoch": 0.12576832151300235, + "grad_norm": 3.411482572555542, + "learning_rate": 4.9978555675185115e-06, + "loss": 0.7253, + "step": 266 + }, + { + "epoch": 0.12624113475177304, + "grad_norm": 3.0884511470794678, + "learning_rate": 4.9978296569118825e-06, + "loss": 0.659, + "step": 267 + }, + { + "epoch": 0.12671394799054372, + "grad_norm": 3.0652925968170166, + "learning_rate": 4.9978035907763535e-06, + "loss": 0.6739, + "step": 268 + }, + { + "epoch": 0.1271867612293144, + "grad_norm": 3.280555009841919, + "learning_rate": 4.997777369113547e-06, + "loss": 0.8003, + "step": 269 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 2.980860948562622, + "learning_rate": 4.997750991925096e-06, + "loss": 0.7097, + "step": 270 + }, + { + "epoch": 0.12813238770685578, + "grad_norm": 3.301760673522949, + "learning_rate": 4.997724459212644e-06, + "loss": 0.7894, + "step": 271 + }, + { + "epoch": 0.12860520094562647, + "grad_norm": 2.9584903717041016, + "learning_rate": 4.997697770977841e-06, + "loss": 0.733, + "step": 272 + }, + { + "epoch": 0.12907801418439716, + "grad_norm": 3.5632214546203613, + "learning_rate": 4.99767092722235e-06, + "loss": 0.7228, + "step": 273 + }, + { + "epoch": 0.12955082742316784, + "grad_norm": 3.5900983810424805, + "learning_rate": 4.997643927947843e-06, + "loss": 0.7634, + "step": 274 + }, + { + "epoch": 0.13002364066193853, + "grad_norm": 3.332650661468506, + "learning_rate": 4.997616773156e-06, + "loss": 0.797, + "step": 275 + }, + { + "epoch": 0.13049645390070921, + "grad_norm": 3.1094167232513428, + "learning_rate": 4.997589462848512e-06, + "loss": 0.7849, + "step": 276 + }, + { + "epoch": 0.1309692671394799, + "grad_norm": 3.5359463691711426, + "learning_rate": 4.99756199702708e-06, + "loss": 0.6871, + "step": 277 + }, + { + "epoch": 0.1314420803782506, + "grad_norm": 3.190441846847534, + "learning_rate": 4.997534375693414e-06, + "loss": 0.6883, + 
"step": 278 + }, + { + "epoch": 0.13191489361702127, + "grad_norm": 3.063518762588501, + "learning_rate": 4.997506598849234e-06, + "loss": 0.7586, + "step": 279 + }, + { + "epoch": 0.13238770685579196, + "grad_norm": 3.4112050533294678, + "learning_rate": 4.997478666496269e-06, + "loss": 0.796, + "step": 280 + }, + { + "epoch": 0.13286052009456265, + "grad_norm": 3.231886386871338, + "learning_rate": 4.997450578636259e-06, + "loss": 0.7714, + "step": 281 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 3.279425621032715, + "learning_rate": 4.9974223352709515e-06, + "loss": 0.7793, + "step": 282 + }, + { + "epoch": 0.13380614657210402, + "grad_norm": 3.2154316902160645, + "learning_rate": 4.9973939364021075e-06, + "loss": 0.791, + "step": 283 + }, + { + "epoch": 0.1342789598108747, + "grad_norm": 3.2090768814086914, + "learning_rate": 4.9973653820314925e-06, + "loss": 0.6433, + "step": 284 + }, + { + "epoch": 0.1347517730496454, + "grad_norm": 3.1712026596069336, + "learning_rate": 4.997336672160886e-06, + "loss": 0.8128, + "step": 285 + }, + { + "epoch": 0.13522458628841608, + "grad_norm": 2.929229497909546, + "learning_rate": 4.997307806792076e-06, + "loss": 0.7594, + "step": 286 + }, + { + "epoch": 0.13569739952718676, + "grad_norm": 3.0363314151763916, + "learning_rate": 4.997278785926859e-06, + "loss": 0.7336, + "step": 287 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 3.1352357864379883, + "learning_rate": 4.997249609567042e-06, + "loss": 0.7225, + "step": 288 + }, + { + "epoch": 0.13664302600472814, + "grad_norm": 3.3171157836914062, + "learning_rate": 4.997220277714442e-06, + "loss": 0.7777, + "step": 289 + }, + { + "epoch": 0.13711583924349882, + "grad_norm": 3.050717353820801, + "learning_rate": 4.997190790370885e-06, + "loss": 0.6836, + "step": 290 + }, + { + "epoch": 0.1375886524822695, + "grad_norm": 3.0297694206237793, + "learning_rate": 4.997161147538208e-06, + "loss": 0.6883, + "step": 291 + }, + { + "epoch": 0.1380614657210402, + "grad_norm": 3.0566554069519043, + "learning_rate": 4.997131349218256e-06, + "loss": 0.6674, + "step": 292 + }, + { + "epoch": 0.13853427895981088, + "grad_norm": 3.799111843109131, + "learning_rate": 4.997101395412885e-06, + "loss": 0.8256, + "step": 293 + }, + { + "epoch": 0.13900709219858157, + "grad_norm": 3.1394248008728027, + "learning_rate": 4.9970712861239576e-06, + "loss": 0.7306, + "step": 294 + }, + { + "epoch": 0.13947990543735225, + "grad_norm": 3.0605666637420654, + "learning_rate": 4.997041021353352e-06, + "loss": 0.7212, + "step": 295 + }, + { + "epoch": 0.13995271867612294, + "grad_norm": 3.8813397884368896, + "learning_rate": 4.997010601102951e-06, + "loss": 0.769, + "step": 296 + }, + { + "epoch": 0.14042553191489363, + "grad_norm": 3.0514819622039795, + "learning_rate": 4.996980025374649e-06, + "loss": 0.7422, + "step": 297 + }, + { + "epoch": 0.1408983451536643, + "grad_norm": 2.9544146060943604, + "learning_rate": 4.99694929417035e-06, + "loss": 0.6912, + "step": 298 + }, + { + "epoch": 0.141371158392435, + "grad_norm": 3.2635602951049805, + "learning_rate": 4.996918407491966e-06, + "loss": 0.7395, + "step": 299 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 3.373882532119751, + "learning_rate": 4.996887365341423e-06, + "loss": 0.7799, + "step": 300 + }, + { + "epoch": 0.14231678486997637, + "grad_norm": 3.001128673553467, + "learning_rate": 4.996856167720652e-06, + "loss": 0.7168, + "step": 301 + }, + { + "epoch": 0.14278959810874706, + "grad_norm": 3.1026835441589355, + "learning_rate": 
4.996824814631595e-06, + "loss": 0.7492, + "step": 302 + }, + { + "epoch": 0.14326241134751774, + "grad_norm": 3.41947603225708, + "learning_rate": 4.996793306076205e-06, + "loss": 0.6659, + "step": 303 + }, + { + "epoch": 0.14373522458628843, + "grad_norm": 3.2272400856018066, + "learning_rate": 4.996761642056444e-06, + "loss": 0.7184, + "step": 304 + }, + { + "epoch": 0.14420803782505912, + "grad_norm": 2.9488935470581055, + "learning_rate": 4.996729822574284e-06, + "loss": 0.7451, + "step": 305 + }, + { + "epoch": 0.14468085106382977, + "grad_norm": 3.268231153488159, + "learning_rate": 4.9966978476317065e-06, + "loss": 0.7798, + "step": 306 + }, + { + "epoch": 0.14515366430260046, + "grad_norm": 3.9086556434631348, + "learning_rate": 4.996665717230701e-06, + "loss": 0.7871, + "step": 307 + }, + { + "epoch": 0.14562647754137115, + "grad_norm": 3.3483879566192627, + "learning_rate": 4.996633431373269e-06, + "loss": 0.7415, + "step": 308 + }, + { + "epoch": 0.14609929078014183, + "grad_norm": 2.839400053024292, + "learning_rate": 4.99660099006142e-06, + "loss": 0.7192, + "step": 309 + }, + { + "epoch": 0.14657210401891252, + "grad_norm": 3.177302598953247, + "learning_rate": 4.996568393297175e-06, + "loss": 0.755, + "step": 310 + }, + { + "epoch": 0.1470449172576832, + "grad_norm": 3.5477044582366943, + "learning_rate": 4.996535641082563e-06, + "loss": 0.7531, + "step": 311 + }, + { + "epoch": 0.1475177304964539, + "grad_norm": 3.418576717376709, + "learning_rate": 4.996502733419624e-06, + "loss": 0.8009, + "step": 312 + }, + { + "epoch": 0.14799054373522458, + "grad_norm": 3.711341619491577, + "learning_rate": 4.996469670310407e-06, + "loss": 0.7362, + "step": 313 + }, + { + "epoch": 0.14846335697399526, + "grad_norm": 3.2419373989105225, + "learning_rate": 4.99643645175697e-06, + "loss": 0.7761, + "step": 314 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 3.121858835220337, + "learning_rate": 4.996403077761381e-06, + "loss": 0.6495, + "step": 315 + }, + { + "epoch": 0.14940898345153664, + "grad_norm": 3.123054265975952, + "learning_rate": 4.996369548325719e-06, + "loss": 0.7444, + "step": 316 + }, + { + "epoch": 0.14988179669030732, + "grad_norm": 2.780880928039551, + "learning_rate": 4.996335863452072e-06, + "loss": 0.672, + "step": 317 + }, + { + "epoch": 0.150354609929078, + "grad_norm": 3.3738629817962646, + "learning_rate": 4.996302023142536e-06, + "loss": 0.7972, + "step": 318 + }, + { + "epoch": 0.1508274231678487, + "grad_norm": 3.4874777793884277, + "learning_rate": 4.99626802739922e-06, + "loss": 0.8252, + "step": 319 + }, + { + "epoch": 0.15130023640661938, + "grad_norm": 3.7074787616729736, + "learning_rate": 4.9962338762242395e-06, + "loss": 0.8216, + "step": 320 + }, + { + "epoch": 0.15177304964539007, + "grad_norm": 3.281912326812744, + "learning_rate": 4.996199569619721e-06, + "loss": 0.8175, + "step": 321 + }, + { + "epoch": 0.15224586288416075, + "grad_norm": 2.9485340118408203, + "learning_rate": 4.996165107587801e-06, + "loss": 0.707, + "step": 322 + }, + { + "epoch": 0.15271867612293144, + "grad_norm": 3.3757646083831787, + "learning_rate": 4.996130490130625e-06, + "loss": 0.7955, + "step": 323 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 2.962181568145752, + "learning_rate": 4.996095717250349e-06, + "loss": 0.7067, + "step": 324 + }, + { + "epoch": 0.1536643026004728, + "grad_norm": 3.114272356033325, + "learning_rate": 4.996060788949136e-06, + "loss": 0.7486, + "step": 325 + }, + { + "epoch": 0.1541371158392435, + "grad_norm": 
3.0621590614318848, + "learning_rate": 4.996025705229165e-06, + "loss": 0.6547, + "step": 326 + }, + { + "epoch": 0.15460992907801419, + "grad_norm": 2.8745882511138916, + "learning_rate": 4.995990466092616e-06, + "loss": 0.6435, + "step": 327 + }, + { + "epoch": 0.15508274231678487, + "grad_norm": 2.90841007232666, + "learning_rate": 4.995955071541686e-06, + "loss": 0.7331, + "step": 328 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 2.694580316543579, + "learning_rate": 4.9959195215785784e-06, + "loss": 0.6731, + "step": 329 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 3.158083438873291, + "learning_rate": 4.995883816205507e-06, + "loss": 0.7257, + "step": 330 + }, + { + "epoch": 0.15650118203309693, + "grad_norm": 3.3234715461730957, + "learning_rate": 4.995847955424694e-06, + "loss": 0.7389, + "step": 331 + }, + { + "epoch": 0.15697399527186762, + "grad_norm": 2.9406495094299316, + "learning_rate": 4.995811939238373e-06, + "loss": 0.643, + "step": 332 + }, + { + "epoch": 0.1574468085106383, + "grad_norm": 3.3191726207733154, + "learning_rate": 4.995775767648785e-06, + "loss": 0.7879, + "step": 333 + }, + { + "epoch": 0.157919621749409, + "grad_norm": 3.711925745010376, + "learning_rate": 4.995739440658185e-06, + "loss": 0.7586, + "step": 334 + }, + { + "epoch": 0.15839243498817968, + "grad_norm": 9.573421478271484, + "learning_rate": 4.995702958268833e-06, + "loss": 0.7842, + "step": 335 + }, + { + "epoch": 0.15886524822695036, + "grad_norm": 3.4154508113861084, + "learning_rate": 4.995666320483001e-06, + "loss": 0.6735, + "step": 336 + }, + { + "epoch": 0.15933806146572105, + "grad_norm": 3.4169859886169434, + "learning_rate": 4.995629527302971e-06, + "loss": 0.741, + "step": 337 + }, + { + "epoch": 0.15981087470449173, + "grad_norm": 3.287503242492676, + "learning_rate": 4.9955925787310335e-06, + "loss": 0.7139, + "step": 338 + }, + { + "epoch": 0.16028368794326242, + "grad_norm": 3.288409471511841, + "learning_rate": 4.995555474769488e-06, + "loss": 0.7636, + "step": 339 + }, + { + "epoch": 0.1607565011820331, + "grad_norm": 2.8021693229675293, + "learning_rate": 4.995518215420646e-06, + "loss": 0.5883, + "step": 340 + }, + { + "epoch": 0.1612293144208038, + "grad_norm": 2.7038564682006836, + "learning_rate": 4.995480800686827e-06, + "loss": 0.657, + "step": 341 + }, + { + "epoch": 0.16170212765957448, + "grad_norm": 3.2370235919952393, + "learning_rate": 4.9954432305703615e-06, + "loss": 0.6999, + "step": 342 + }, + { + "epoch": 0.16217494089834517, + "grad_norm": 2.8666412830352783, + "learning_rate": 4.995405505073588e-06, + "loss": 0.7199, + "step": 343 + }, + { + "epoch": 0.16264775413711585, + "grad_norm": 3.6467232704162598, + "learning_rate": 4.995367624198856e-06, + "loss": 0.7317, + "step": 344 + }, + { + "epoch": 0.16312056737588654, + "grad_norm": 2.7576327323913574, + "learning_rate": 4.9953295879485246e-06, + "loss": 0.647, + "step": 345 + }, + { + "epoch": 0.1635933806146572, + "grad_norm": 2.922232151031494, + "learning_rate": 4.995291396324959e-06, + "loss": 0.6686, + "step": 346 + }, + { + "epoch": 0.16406619385342788, + "grad_norm": 2.8693501949310303, + "learning_rate": 4.995253049330542e-06, + "loss": 0.6756, + "step": 347 + }, + { + "epoch": 0.16453900709219857, + "grad_norm": 3.671865701675415, + "learning_rate": 4.995214546967658e-06, + "loss": 0.7347, + "step": 348 + }, + { + "epoch": 0.16501182033096926, + "grad_norm": 3.024219274520874, + "learning_rate": 4.995175889238706e-06, + "loss": 0.7547, + "step": 349 + }, + { + "epoch": 
0.16548463356973994, + "grad_norm": 2.8470778465270996, + "learning_rate": 4.995137076146091e-06, + "loss": 0.6764, + "step": 350 + }, + { + "epoch": 0.16595744680851063, + "grad_norm": 2.905057907104492, + "learning_rate": 4.9950981076922324e-06, + "loss": 0.6814, + "step": 351 + }, + { + "epoch": 0.16643026004728131, + "grad_norm": 3.504377841949463, + "learning_rate": 4.995058983879555e-06, + "loss": 0.7145, + "step": 352 + }, + { + "epoch": 0.166903073286052, + "grad_norm": 3.0029661655426025, + "learning_rate": 4.995019704710495e-06, + "loss": 0.7114, + "step": 353 + }, + { + "epoch": 0.1673758865248227, + "grad_norm": 2.8666274547576904, + "learning_rate": 4.994980270187499e-06, + "loss": 0.7416, + "step": 354 + }, + { + "epoch": 0.16784869976359337, + "grad_norm": 3.1644718647003174, + "learning_rate": 4.994940680313021e-06, + "loss": 0.661, + "step": 355 + }, + { + "epoch": 0.16832151300236406, + "grad_norm": 3.050391674041748, + "learning_rate": 4.994900935089527e-06, + "loss": 0.7243, + "step": 356 + }, + { + "epoch": 0.16879432624113475, + "grad_norm": 2.985466480255127, + "learning_rate": 4.994861034519491e-06, + "loss": 0.6917, + "step": 357 + }, + { + "epoch": 0.16926713947990543, + "grad_norm": 2.909342050552368, + "learning_rate": 4.9948209786053995e-06, + "loss": 0.6636, + "step": 358 + }, + { + "epoch": 0.16973995271867612, + "grad_norm": 3.2214784622192383, + "learning_rate": 4.9947807673497435e-06, + "loss": 0.7903, + "step": 359 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 2.5654983520507812, + "learning_rate": 4.994740400755029e-06, + "loss": 0.6129, + "step": 360 + }, + { + "epoch": 0.1706855791962175, + "grad_norm": 3.775646448135376, + "learning_rate": 4.99469987882377e-06, + "loss": 0.7145, + "step": 361 + }, + { + "epoch": 0.17115839243498818, + "grad_norm": 2.8965413570404053, + "learning_rate": 4.994659201558487e-06, + "loss": 0.7177, + "step": 362 + }, + { + "epoch": 0.17163120567375886, + "grad_norm": 3.485597848892212, + "learning_rate": 4.9946183689617146e-06, + "loss": 0.8107, + "step": 363 + }, + { + "epoch": 0.17210401891252955, + "grad_norm": 3.277839183807373, + "learning_rate": 4.994577381035995e-06, + "loss": 0.691, + "step": 364 + }, + { + "epoch": 0.17257683215130024, + "grad_norm": 2.8807685375213623, + "learning_rate": 4.99453623778388e-06, + "loss": 0.7627, + "step": 365 + }, + { + "epoch": 0.17304964539007092, + "grad_norm": 3.0659940242767334, + "learning_rate": 4.994494939207932e-06, + "loss": 0.6858, + "step": 366 + }, + { + "epoch": 0.1735224586288416, + "grad_norm": 3.0881855487823486, + "learning_rate": 4.994453485310723e-06, + "loss": 0.8212, + "step": 367 + }, + { + "epoch": 0.1739952718676123, + "grad_norm": 2.7199201583862305, + "learning_rate": 4.994411876094832e-06, + "loss": 0.6516, + "step": 368 + }, + { + "epoch": 0.17446808510638298, + "grad_norm": 2.955889940261841, + "learning_rate": 4.994370111562851e-06, + "loss": 0.6579, + "step": 369 + }, + { + "epoch": 0.17494089834515367, + "grad_norm": 3.1321663856506348, + "learning_rate": 4.994328191717382e-06, + "loss": 0.6891, + "step": 370 + }, + { + "epoch": 0.17541371158392435, + "grad_norm": 3.0560388565063477, + "learning_rate": 4.994286116561034e-06, + "loss": 0.7243, + "step": 371 + }, + { + "epoch": 0.17588652482269504, + "grad_norm": 3.1560704708099365, + "learning_rate": 4.994243886096425e-06, + "loss": 0.7262, + "step": 372 + }, + { + "epoch": 0.17635933806146573, + "grad_norm": 2.913541316986084, + "learning_rate": 4.994201500326187e-06, + "loss": 0.7318, + 
"step": 373 + }, + { + "epoch": 0.1768321513002364, + "grad_norm": 3.098376512527466, + "learning_rate": 4.994158959252958e-06, + "loss": 0.6419, + "step": 374 + }, + { + "epoch": 0.1773049645390071, + "grad_norm": 2.977508544921875, + "learning_rate": 4.994116262879387e-06, + "loss": 0.6709, + "step": 375 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 3.168186902999878, + "learning_rate": 4.994073411208133e-06, + "loss": 0.6608, + "step": 376 + }, + { + "epoch": 0.17825059101654847, + "grad_norm": 3.436844825744629, + "learning_rate": 4.994030404241864e-06, + "loss": 0.7227, + "step": 377 + }, + { + "epoch": 0.17872340425531916, + "grad_norm": 2.8998289108276367, + "learning_rate": 4.993987241983258e-06, + "loss": 0.6512, + "step": 378 + }, + { + "epoch": 0.17919621749408984, + "grad_norm": 3.407191514968872, + "learning_rate": 4.993943924435002e-06, + "loss": 0.616, + "step": 379 + }, + { + "epoch": 0.17966903073286053, + "grad_norm": 3.744858741760254, + "learning_rate": 4.993900451599793e-06, + "loss": 0.8599, + "step": 380 + }, + { + "epoch": 0.18014184397163122, + "grad_norm": 3.486283779144287, + "learning_rate": 4.993856823480338e-06, + "loss": 0.6634, + "step": 381 + }, + { + "epoch": 0.1806146572104019, + "grad_norm": 2.895719051361084, + "learning_rate": 4.993813040079355e-06, + "loss": 0.6972, + "step": 382 + }, + { + "epoch": 0.1810874704491726, + "grad_norm": 2.814133882522583, + "learning_rate": 4.993769101399569e-06, + "loss": 0.6271, + "step": 383 + }, + { + "epoch": 0.18156028368794327, + "grad_norm": 2.8609800338745117, + "learning_rate": 4.993725007443715e-06, + "loss": 0.6481, + "step": 384 + }, + { + "epoch": 0.18203309692671396, + "grad_norm": 3.2829644680023193, + "learning_rate": 4.99368075821454e-06, + "loss": 0.7999, + "step": 385 + }, + { + "epoch": 0.18250591016548465, + "grad_norm": 3.1417458057403564, + "learning_rate": 4.993636353714798e-06, + "loss": 0.6972, + "step": 386 + }, + { + "epoch": 0.1829787234042553, + "grad_norm": 3.0679385662078857, + "learning_rate": 4.993591793947256e-06, + "loss": 0.667, + "step": 387 + }, + { + "epoch": 0.183451536643026, + "grad_norm": 3.1387410163879395, + "learning_rate": 4.993547078914686e-06, + "loss": 0.7618, + "step": 388 + }, + { + "epoch": 0.18392434988179668, + "grad_norm": 2.9181406497955322, + "learning_rate": 4.993502208619872e-06, + "loss": 0.7391, + "step": 389 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 2.8952157497406006, + "learning_rate": 4.993457183065611e-06, + "loss": 0.6988, + "step": 390 + }, + { + "epoch": 0.18486997635933805, + "grad_norm": 3.2274813652038574, + "learning_rate": 4.993412002254704e-06, + "loss": 0.688, + "step": 391 + }, + { + "epoch": 0.18534278959810874, + "grad_norm": 3.4693779945373535, + "learning_rate": 4.993366666189965e-06, + "loss": 0.6634, + "step": 392 + }, + { + "epoch": 0.18581560283687942, + "grad_norm": 3.5358526706695557, + "learning_rate": 4.993321174874217e-06, + "loss": 0.7343, + "step": 393 + }, + { + "epoch": 0.1862884160756501, + "grad_norm": 3.013338088989258, + "learning_rate": 4.993275528310292e-06, + "loss": 0.7579, + "step": 394 + }, + { + "epoch": 0.1867612293144208, + "grad_norm": 2.694772720336914, + "learning_rate": 4.993229726501033e-06, + "loss": 0.718, + "step": 395 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 3.070612907409668, + "learning_rate": 4.9931837694492915e-06, + "loss": 0.6438, + "step": 396 + }, + { + "epoch": 0.18770685579196217, + "grad_norm": 2.9193027019500732, + "learning_rate": 4.993137657157928e-06, 
+ "loss": 0.6788, + "step": 397 + }, + { + "epoch": 0.18817966903073285, + "grad_norm": 3.047682046890259, + "learning_rate": 4.993091389629816e-06, + "loss": 0.6826, + "step": 398 + }, + { + "epoch": 0.18865248226950354, + "grad_norm": 2.9629905223846436, + "learning_rate": 4.993044966867834e-06, + "loss": 0.7196, + "step": 399 + }, + { + "epoch": 0.18912529550827423, + "grad_norm": 3.0692050457000732, + "learning_rate": 4.992998388874874e-06, + "loss": 0.7015, + "step": 400 + }, + { + "epoch": 0.1895981087470449, + "grad_norm": 3.5427212715148926, + "learning_rate": 4.992951655653836e-06, + "loss": 0.8292, + "step": 401 + }, + { + "epoch": 0.1900709219858156, + "grad_norm": 2.643526554107666, + "learning_rate": 4.992904767207629e-06, + "loss": 0.624, + "step": 402 + }, + { + "epoch": 0.19054373522458629, + "grad_norm": 3.1185996532440186, + "learning_rate": 4.992857723539173e-06, + "loss": 0.7354, + "step": 403 + }, + { + "epoch": 0.19101654846335697, + "grad_norm": 3.006856679916382, + "learning_rate": 4.992810524651398e-06, + "loss": 0.7752, + "step": 404 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 2.9913275241851807, + "learning_rate": 4.9927631705472425e-06, + "loss": 0.7306, + "step": 405 + }, + { + "epoch": 0.19196217494089834, + "grad_norm": 2.6794071197509766, + "learning_rate": 4.992715661229655e-06, + "loss": 0.6136, + "step": 406 + }, + { + "epoch": 0.19243498817966903, + "grad_norm": 3.5933966636657715, + "learning_rate": 4.992667996701593e-06, + "loss": 0.7024, + "step": 407 + }, + { + "epoch": 0.19290780141843972, + "grad_norm": 2.862187623977661, + "learning_rate": 4.992620176966025e-06, + "loss": 0.692, + "step": 408 + }, + { + "epoch": 0.1933806146572104, + "grad_norm": 3.076845407485962, + "learning_rate": 4.9925722020259286e-06, + "loss": 0.7475, + "step": 409 + }, + { + "epoch": 0.1938534278959811, + "grad_norm": 3.372919797897339, + "learning_rate": 4.9925240718842895e-06, + "loss": 0.6886, + "step": 410 + }, + { + "epoch": 0.19432624113475178, + "grad_norm": 2.922977924346924, + "learning_rate": 4.992475786544108e-06, + "loss": 0.7049, + "step": 411 + }, + { + "epoch": 0.19479905437352246, + "grad_norm": 2.908034324645996, + "learning_rate": 4.992427346008387e-06, + "loss": 0.6498, + "step": 412 + }, + { + "epoch": 0.19527186761229315, + "grad_norm": 3.096723794937134, + "learning_rate": 4.992378750280144e-06, + "loss": 0.7151, + "step": 413 + }, + { + "epoch": 0.19574468085106383, + "grad_norm": 2.895237684249878, + "learning_rate": 4.992329999362405e-06, + "loss": 0.7277, + "step": 414 + }, + { + "epoch": 0.19621749408983452, + "grad_norm": 2.718230724334717, + "learning_rate": 4.9922810932582065e-06, + "loss": 0.6375, + "step": 415 + }, + { + "epoch": 0.1966903073286052, + "grad_norm": 3.187743663787842, + "learning_rate": 4.992232031970592e-06, + "loss": 0.6528, + "step": 416 + }, + { + "epoch": 0.1971631205673759, + "grad_norm": 2.996406316757202, + "learning_rate": 4.992182815502616e-06, + "loss": 0.6552, + "step": 417 + }, + { + "epoch": 0.19763593380614658, + "grad_norm": 3.301084041595459, + "learning_rate": 4.992133443857345e-06, + "loss": 0.7061, + "step": 418 + }, + { + "epoch": 0.19810874704491727, + "grad_norm": 3.7874677181243896, + "learning_rate": 4.992083917037853e-06, + "loss": 0.7859, + "step": 419 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 3.124253511428833, + "learning_rate": 4.992034235047222e-06, + "loss": 0.7615, + "step": 420 + }, + { + "epoch": 0.19905437352245864, + "grad_norm": 3.0488970279693604, + 
"learning_rate": 4.991984397888546e-06, + "loss": 0.6916, + "step": 421 + }, + { + "epoch": 0.19952718676122932, + "grad_norm": 3.1241321563720703, + "learning_rate": 4.991934405564929e-06, + "loss": 0.7055, + "step": 422 + }, + { + "epoch": 0.2, + "grad_norm": 3.396632432937622, + "learning_rate": 4.991884258079484e-06, + "loss": 0.7675, + "step": 423 + }, + { + "epoch": 0.2004728132387707, + "grad_norm": 3.7776873111724854, + "learning_rate": 4.9918339554353316e-06, + "loss": 0.7371, + "step": 424 + }, + { + "epoch": 0.20094562647754138, + "grad_norm": 3.3356032371520996, + "learning_rate": 4.991783497635606e-06, + "loss": 0.6778, + "step": 425 + }, + { + "epoch": 0.20141843971631207, + "grad_norm": 2.988856792449951, + "learning_rate": 4.9917328846834474e-06, + "loss": 0.6795, + "step": 426 + }, + { + "epoch": 0.20189125295508276, + "grad_norm": 3.264183282852173, + "learning_rate": 4.99168211658201e-06, + "loss": 0.7707, + "step": 427 + }, + { + "epoch": 0.20236406619385341, + "grad_norm": 3.878068208694458, + "learning_rate": 4.991631193334451e-06, + "loss": 0.857, + "step": 428 + }, + { + "epoch": 0.2028368794326241, + "grad_norm": 3.6377553939819336, + "learning_rate": 4.991580114943943e-06, + "loss": 0.8033, + "step": 429 + }, + { + "epoch": 0.2033096926713948, + "grad_norm": 2.95393967628479, + "learning_rate": 4.991528881413667e-06, + "loss": 0.6809, + "step": 430 + }, + { + "epoch": 0.20378250591016547, + "grad_norm": 3.058704376220703, + "learning_rate": 4.9914774927468125e-06, + "loss": 0.6664, + "step": 431 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 2.7783217430114746, + "learning_rate": 4.9914259489465795e-06, + "loss": 0.6478, + "step": 432 + }, + { + "epoch": 0.20472813238770685, + "grad_norm": 2.4825217723846436, + "learning_rate": 4.991374250016177e-06, + "loss": 0.6598, + "step": 433 + }, + { + "epoch": 0.20520094562647753, + "grad_norm": 2.8753600120544434, + "learning_rate": 4.991322395958824e-06, + "loss": 0.6947, + "step": 434 + }, + { + "epoch": 0.20567375886524822, + "grad_norm": 3.2339367866516113, + "learning_rate": 4.99127038677775e-06, + "loss": 0.8201, + "step": 435 + }, + { + "epoch": 0.2061465721040189, + "grad_norm": 2.9065537452697754, + "learning_rate": 4.991218222476193e-06, + "loss": 0.6679, + "step": 436 + }, + { + "epoch": 0.2066193853427896, + "grad_norm": 3.283228874206543, + "learning_rate": 4.991165903057401e-06, + "loss": 0.8039, + "step": 437 + }, + { + "epoch": 0.20709219858156028, + "grad_norm": 3.429872751235962, + "learning_rate": 4.991113428524631e-06, + "loss": 0.7392, + "step": 438 + }, + { + "epoch": 0.20756501182033096, + "grad_norm": 3.118943452835083, + "learning_rate": 4.991060798881152e-06, + "loss": 0.6794, + "step": 439 + }, + { + "epoch": 0.20803782505910165, + "grad_norm": 3.395970106124878, + "learning_rate": 4.99100801413024e-06, + "loss": 0.6862, + "step": 440 + }, + { + "epoch": 0.20851063829787234, + "grad_norm": 2.869191884994507, + "learning_rate": 4.99095507427518e-06, + "loss": 0.6076, + "step": 441 + }, + { + "epoch": 0.20898345153664302, + "grad_norm": 3.1934661865234375, + "learning_rate": 4.990901979319272e-06, + "loss": 0.6927, + "step": 442 + }, + { + "epoch": 0.2094562647754137, + "grad_norm": 2.9068603515625, + "learning_rate": 4.990848729265819e-06, + "loss": 0.6864, + "step": 443 + }, + { + "epoch": 0.2099290780141844, + "grad_norm": 3.0535948276519775, + "learning_rate": 4.9907953241181375e-06, + "loss": 0.6396, + "step": 444 + }, + { + "epoch": 0.21040189125295508, + "grad_norm": 
2.871511459350586, + "learning_rate": 4.990741763879554e-06, + "loss": 0.6743, + "step": 445 + }, + { + "epoch": 0.21087470449172577, + "grad_norm": 2.9184393882751465, + "learning_rate": 4.9906880485534015e-06, + "loss": 0.6786, + "step": 446 + }, + { + "epoch": 0.21134751773049645, + "grad_norm": 3.0628271102905273, + "learning_rate": 4.990634178143026e-06, + "loss": 0.6326, + "step": 447 + }, + { + "epoch": 0.21182033096926714, + "grad_norm": 3.7878305912017822, + "learning_rate": 4.990580152651782e-06, + "loss": 0.7944, + "step": 448 + }, + { + "epoch": 0.21229314420803783, + "grad_norm": 2.8577189445495605, + "learning_rate": 4.990525972083031e-06, + "loss": 0.71, + "step": 449 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 3.307769775390625, + "learning_rate": 4.99047163644015e-06, + "loss": 0.6893, + "step": 450 + }, + { + "epoch": 0.2132387706855792, + "grad_norm": 2.7391717433929443, + "learning_rate": 4.990417145726519e-06, + "loss": 0.712, + "step": 451 + }, + { + "epoch": 0.21371158392434988, + "grad_norm": 2.938044786453247, + "learning_rate": 4.990362499945534e-06, + "loss": 0.7516, + "step": 452 + }, + { + "epoch": 0.21418439716312057, + "grad_norm": 2.7831056118011475, + "learning_rate": 4.990307699100595e-06, + "loss": 0.6168, + "step": 453 + }, + { + "epoch": 0.21465721040189126, + "grad_norm": 2.907977342605591, + "learning_rate": 4.990252743195116e-06, + "loss": 0.6706, + "step": 454 + }, + { + "epoch": 0.21513002364066194, + "grad_norm": 3.7882161140441895, + "learning_rate": 4.990197632232517e-06, + "loss": 0.6847, + "step": 455 + }, + { + "epoch": 0.21560283687943263, + "grad_norm": 2.899716854095459, + "learning_rate": 4.990142366216232e-06, + "loss": 0.6699, + "step": 456 + }, + { + "epoch": 0.21607565011820332, + "grad_norm": 2.907003879547119, + "learning_rate": 4.990086945149701e-06, + "loss": 0.6864, + "step": 457 + }, + { + "epoch": 0.216548463356974, + "grad_norm": 3.2407333850860596, + "learning_rate": 4.9900313690363736e-06, + "loss": 0.692, + "step": 458 + }, + { + "epoch": 0.2170212765957447, + "grad_norm": 2.9055583477020264, + "learning_rate": 4.989975637879712e-06, + "loss": 0.7113, + "step": 459 + }, + { + "epoch": 0.21749408983451538, + "grad_norm": 2.9836206436157227, + "learning_rate": 4.989919751683184e-06, + "loss": 0.6673, + "step": 460 + }, + { + "epoch": 0.21796690307328606, + "grad_norm": 3.371035575866699, + "learning_rate": 4.989863710450273e-06, + "loss": 0.7181, + "step": 461 + }, + { + "epoch": 0.21843971631205675, + "grad_norm": 2.9636635780334473, + "learning_rate": 4.989807514184465e-06, + "loss": 0.6082, + "step": 462 + }, + { + "epoch": 0.21891252955082743, + "grad_norm": 2.9634664058685303, + "learning_rate": 4.9897511628892615e-06, + "loss": 0.7086, + "step": 463 + }, + { + "epoch": 0.21938534278959812, + "grad_norm": 3.154763698577881, + "learning_rate": 4.98969465656817e-06, + "loss": 0.7027, + "step": 464 + }, + { + "epoch": 0.2198581560283688, + "grad_norm": 2.9959890842437744, + "learning_rate": 4.98963799522471e-06, + "loss": 0.6498, + "step": 465 + }, + { + "epoch": 0.2203309692671395, + "grad_norm": 3.5470590591430664, + "learning_rate": 4.989581178862408e-06, + "loss": 0.7199, + "step": 466 + }, + { + "epoch": 0.22080378250591018, + "grad_norm": 7.1873369216918945, + "learning_rate": 4.989524207484802e-06, + "loss": 0.6676, + "step": 467 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 3.1099541187286377, + "learning_rate": 4.98946708109544e-06, + "loss": 0.6785, + "step": 468 + }, + { + "epoch": 
0.22174940898345152, + "grad_norm": 2.830991506576538, + "learning_rate": 4.9894097996978795e-06, + "loss": 0.6456, + "step": 469 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.0212316513061523, + "learning_rate": 4.989352363295687e-06, + "loss": 0.6048, + "step": 470 + }, + { + "epoch": 0.2226950354609929, + "grad_norm": 3.18776798248291, + "learning_rate": 4.989294771892437e-06, + "loss": 0.7078, + "step": 471 + }, + { + "epoch": 0.22316784869976358, + "grad_norm": 2.9972598552703857, + "learning_rate": 4.989237025491717e-06, + "loss": 0.7082, + "step": 472 + }, + { + "epoch": 0.22364066193853427, + "grad_norm": 3.4935688972473145, + "learning_rate": 4.989179124097123e-06, + "loss": 0.8199, + "step": 473 + }, + { + "epoch": 0.22411347517730495, + "grad_norm": 2.6485543251037598, + "learning_rate": 4.9891210677122595e-06, + "loss": 0.6371, + "step": 474 + }, + { + "epoch": 0.22458628841607564, + "grad_norm": 2.969233512878418, + "learning_rate": 4.989062856340742e-06, + "loss": 0.6879, + "step": 475 + }, + { + "epoch": 0.22505910165484633, + "grad_norm": 2.881875514984131, + "learning_rate": 4.989004489986194e-06, + "loss": 0.7415, + "step": 476 + }, + { + "epoch": 0.225531914893617, + "grad_norm": 2.624540090560913, + "learning_rate": 4.98894596865225e-06, + "loss": 0.6522, + "step": 477 + }, + { + "epoch": 0.2260047281323877, + "grad_norm": 3.61075496673584, + "learning_rate": 4.988887292342555e-06, + "loss": 0.7109, + "step": 478 + }, + { + "epoch": 0.2264775413711584, + "grad_norm": 2.9368972778320312, + "learning_rate": 4.988828461060762e-06, + "loss": 0.6843, + "step": 479 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 3.0670197010040283, + "learning_rate": 4.988769474810533e-06, + "loss": 0.6807, + "step": 480 + }, + { + "epoch": 0.22742316784869976, + "grad_norm": 2.9662792682647705, + "learning_rate": 4.988710333595542e-06, + "loss": 0.6796, + "step": 481 + }, + { + "epoch": 0.22789598108747045, + "grad_norm": 2.971235752105713, + "learning_rate": 4.988651037419472e-06, + "loss": 0.696, + "step": 482 + }, + { + "epoch": 0.22836879432624113, + "grad_norm": 2.931884527206421, + "learning_rate": 4.988591586286013e-06, + "loss": 0.7323, + "step": 483 + }, + { + "epoch": 0.22884160756501182, + "grad_norm": 2.8114213943481445, + "learning_rate": 4.988531980198868e-06, + "loss": 0.6584, + "step": 484 + }, + { + "epoch": 0.2293144208037825, + "grad_norm": 3.2785916328430176, + "learning_rate": 4.98847221916175e-06, + "loss": 0.7514, + "step": 485 + }, + { + "epoch": 0.2297872340425532, + "grad_norm": 3.0520215034484863, + "learning_rate": 4.988412303178377e-06, + "loss": 0.7564, + "step": 486 + }, + { + "epoch": 0.23026004728132388, + "grad_norm": 3.181002616882324, + "learning_rate": 4.988352232252483e-06, + "loss": 0.6768, + "step": 487 + }, + { + "epoch": 0.23073286052009456, + "grad_norm": 3.4953625202178955, + "learning_rate": 4.988292006387805e-06, + "loss": 0.7143, + "step": 488 + }, + { + "epoch": 0.23120567375886525, + "grad_norm": 3.326571226119995, + "learning_rate": 4.988231625588096e-06, + "loss": 0.7318, + "step": 489 + }, + { + "epoch": 0.23167848699763594, + "grad_norm": 3.09614634513855, + "learning_rate": 4.988171089857113e-06, + "loss": 0.6574, + "step": 490 + }, + { + "epoch": 0.23215130023640662, + "grad_norm": 2.7439446449279785, + "learning_rate": 4.9881103991986265e-06, + "loss": 0.6637, + "step": 491 + }, + { + "epoch": 0.2326241134751773, + "grad_norm": 3.0681190490722656, + "learning_rate": 4.988049553616416e-06, + "loss": 0.6326, + "step": 
492 + }, + { + "epoch": 0.233096926713948, + "grad_norm": 3.0757341384887695, + "learning_rate": 4.98798855311427e-06, + "loss": 0.695, + "step": 493 + }, + { + "epoch": 0.23356973995271868, + "grad_norm": 2.8637635707855225, + "learning_rate": 4.987927397695985e-06, + "loss": 0.6598, + "step": 494 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 3.3641068935394287, + "learning_rate": 4.9878660873653715e-06, + "loss": 0.7435, + "step": 495 + }, + { + "epoch": 0.23451536643026005, + "grad_norm": 3.5025596618652344, + "learning_rate": 4.987804622126245e-06, + "loss": 0.735, + "step": 496 + }, + { + "epoch": 0.23498817966903074, + "grad_norm": 2.9298837184906006, + "learning_rate": 4.987743001982434e-06, + "loss": 0.7063, + "step": 497 + }, + { + "epoch": 0.23546099290780143, + "grad_norm": 2.70358943939209, + "learning_rate": 4.987681226937774e-06, + "loss": 0.6799, + "step": 498 + }, + { + "epoch": 0.2359338061465721, + "grad_norm": 3.027871608734131, + "learning_rate": 4.9876192969961125e-06, + "loss": 0.6881, + "step": 499 + }, + { + "epoch": 0.2364066193853428, + "grad_norm": 3.362306594848633, + "learning_rate": 4.987557212161304e-06, + "loss": 0.7906, + "step": 500 + }, + { + "epoch": 0.23687943262411348, + "grad_norm": 3.3136050701141357, + "learning_rate": 4.987494972437217e-06, + "loss": 0.6878, + "step": 501 + }, + { + "epoch": 0.23735224586288417, + "grad_norm": 3.017089605331421, + "learning_rate": 4.9874325778277255e-06, + "loss": 0.7279, + "step": 502 + }, + { + "epoch": 0.23782505910165486, + "grad_norm": 2.8300516605377197, + "learning_rate": 4.987370028336714e-06, + "loss": 0.6864, + "step": 503 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 3.201860189437866, + "learning_rate": 4.987307323968077e-06, + "loss": 0.7531, + "step": 504 + }, + { + "epoch": 0.23877068557919623, + "grad_norm": 2.685396194458008, + "learning_rate": 4.987244464725721e-06, + "loss": 0.5849, + "step": 505 + }, + { + "epoch": 0.23924349881796692, + "grad_norm": 2.8715312480926514, + "learning_rate": 4.987181450613557e-06, + "loss": 0.675, + "step": 506 + }, + { + "epoch": 0.2397163120567376, + "grad_norm": 2.813908815383911, + "learning_rate": 4.987118281635511e-06, + "loss": 0.6841, + "step": 507 + }, + { + "epoch": 0.2401891252955083, + "grad_norm": 3.2738473415374756, + "learning_rate": 4.987054957795514e-06, + "loss": 0.7158, + "step": 508 + }, + { + "epoch": 0.24066193853427895, + "grad_norm": 2.896134376525879, + "learning_rate": 4.986991479097511e-06, + "loss": 0.7542, + "step": 509 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 3.0390403270721436, + "learning_rate": 4.986927845545454e-06, + "loss": 0.6733, + "step": 510 + }, + { + "epoch": 0.24160756501182032, + "grad_norm": 3.0300254821777344, + "learning_rate": 4.9868640571433044e-06, + "loss": 0.722, + "step": 511 + }, + { + "epoch": 0.242080378250591, + "grad_norm": 3.3037352561950684, + "learning_rate": 4.986800113895035e-06, + "loss": 0.6811, + "step": 512 + }, + { + "epoch": 0.2425531914893617, + "grad_norm": 3.0358474254608154, + "learning_rate": 4.986736015804627e-06, + "loss": 0.7348, + "step": 513 + }, + { + "epoch": 0.24302600472813238, + "grad_norm": 3.108792304992676, + "learning_rate": 4.986671762876071e-06, + "loss": 0.6096, + "step": 514 + }, + { + "epoch": 0.24349881796690306, + "grad_norm": 3.1316237449645996, + "learning_rate": 4.986607355113367e-06, + "loss": 0.6357, + "step": 515 + }, + { + "epoch": 0.24397163120567375, + "grad_norm": 3.3095219135284424, + "learning_rate": 4.986542792520528e-06, + 
"loss": 0.7515, + "step": 516 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 3.4775984287261963, + "learning_rate": 4.986478075101572e-06, + "loss": 0.7104, + "step": 517 + }, + { + "epoch": 0.24491725768321512, + "grad_norm": 3.341708183288574, + "learning_rate": 4.986413202860528e-06, + "loss": 0.7339, + "step": 518 + }, + { + "epoch": 0.2453900709219858, + "grad_norm": 2.9646966457366943, + "learning_rate": 4.986348175801438e-06, + "loss": 0.6032, + "step": 519 + }, + { + "epoch": 0.2458628841607565, + "grad_norm": 3.1853902339935303, + "learning_rate": 4.986282993928349e-06, + "loss": 0.6925, + "step": 520 + }, + { + "epoch": 0.24633569739952718, + "grad_norm": 3.286909818649292, + "learning_rate": 4.98621765724532e-06, + "loss": 0.7447, + "step": 521 + }, + { + "epoch": 0.24680851063829787, + "grad_norm": 3.2255051136016846, + "learning_rate": 4.986152165756419e-06, + "loss": 0.7747, + "step": 522 + }, + { + "epoch": 0.24728132387706855, + "grad_norm": 3.002352237701416, + "learning_rate": 4.986086519465724e-06, + "loss": 0.6472, + "step": 523 + }, + { + "epoch": 0.24775413711583924, + "grad_norm": 3.4738974571228027, + "learning_rate": 4.986020718377322e-06, + "loss": 0.7381, + "step": 524 + }, + { + "epoch": 0.24822695035460993, + "grad_norm": 3.4470200538635254, + "learning_rate": 4.985954762495312e-06, + "loss": 0.6878, + "step": 525 + }, + { + "epoch": 0.2486997635933806, + "grad_norm": 2.9219350814819336, + "learning_rate": 4.985888651823799e-06, + "loss": 0.6317, + "step": 526 + }, + { + "epoch": 0.2491725768321513, + "grad_norm": 3.061767101287842, + "learning_rate": 4.985822386366899e-06, + "loss": 0.6842, + "step": 527 + }, + { + "epoch": 0.24964539007092199, + "grad_norm": 3.0291247367858887, + "learning_rate": 4.985755966128742e-06, + "loss": 0.6852, + "step": 528 + }, + { + "epoch": 0.25011820330969264, + "grad_norm": 2.964280843734741, + "learning_rate": 4.985689391113457e-06, + "loss": 0.7738, + "step": 529 + }, + { + "epoch": 0.25059101654846333, + "grad_norm": 3.058302164077759, + "learning_rate": 4.9856226613251955e-06, + "loss": 0.6677, + "step": 530 + }, + { + "epoch": 0.251063829787234, + "grad_norm": 3.345141649246216, + "learning_rate": 4.985555776768109e-06, + "loss": 0.7837, + "step": 531 + }, + { + "epoch": 0.2515366430260047, + "grad_norm": 3.565031051635742, + "learning_rate": 4.9854887374463636e-06, + "loss": 0.7231, + "step": 532 + }, + { + "epoch": 0.2520094562647754, + "grad_norm": 2.7953789234161377, + "learning_rate": 4.985421543364132e-06, + "loss": 0.6102, + "step": 533 + }, + { + "epoch": 0.2524822695035461, + "grad_norm": 2.887606620788574, + "learning_rate": 4.9853541945256e-06, + "loss": 0.6289, + "step": 534 + }, + { + "epoch": 0.25295508274231676, + "grad_norm": 3.1480495929718018, + "learning_rate": 4.985286690934961e-06, + "loss": 0.6348, + "step": 535 + }, + { + "epoch": 0.25342789598108745, + "grad_norm": 2.8912761211395264, + "learning_rate": 4.985219032596416e-06, + "loss": 0.595, + "step": 536 + }, + { + "epoch": 0.25390070921985813, + "grad_norm": 2.947936534881592, + "learning_rate": 4.98515121951418e-06, + "loss": 0.6196, + "step": 537 + }, + { + "epoch": 0.2543735224586288, + "grad_norm": 3.1085827350616455, + "learning_rate": 4.985083251692474e-06, + "loss": 0.6387, + "step": 538 + }, + { + "epoch": 0.2548463356973995, + "grad_norm": 3.1688334941864014, + "learning_rate": 4.985015129135531e-06, + "loss": 0.7055, + "step": 539 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 3.075042963027954, + "learning_rate": 
4.984946851847593e-06, + "loss": 0.7515, + "step": 540 + }, + { + "epoch": 0.2557919621749409, + "grad_norm": 3.1933093070983887, + "learning_rate": 4.98487841983291e-06, + "loss": 0.7054, + "step": 541 + }, + { + "epoch": 0.25626477541371157, + "grad_norm": 3.043473958969116, + "learning_rate": 4.984809833095744e-06, + "loss": 0.6281, + "step": 542 + }, + { + "epoch": 0.25673758865248225, + "grad_norm": 3.0532584190368652, + "learning_rate": 4.9847410916403645e-06, + "loss": 0.6155, + "step": 543 + }, + { + "epoch": 0.25721040189125294, + "grad_norm": 3.608480215072632, + "learning_rate": 4.984672195471053e-06, + "loss": 0.7363, + "step": 544 + }, + { + "epoch": 0.2576832151300236, + "grad_norm": 2.7491862773895264, + "learning_rate": 4.9846031445921e-06, + "loss": 0.6594, + "step": 545 + }, + { + "epoch": 0.2581560283687943, + "grad_norm": 2.8602418899536133, + "learning_rate": 4.984533939007802e-06, + "loss": 0.6742, + "step": 546 + }, + { + "epoch": 0.258628841607565, + "grad_norm": 3.1782007217407227, + "learning_rate": 4.98446457872247e-06, + "loss": 0.731, + "step": 547 + }, + { + "epoch": 0.2591016548463357, + "grad_norm": 2.796147584915161, + "learning_rate": 4.984395063740423e-06, + "loss": 0.6617, + "step": 548 + }, + { + "epoch": 0.25957446808510637, + "grad_norm": 2.8392202854156494, + "learning_rate": 4.984325394065991e-06, + "loss": 0.6753, + "step": 549 + }, + { + "epoch": 0.26004728132387706, + "grad_norm": 3.134672164916992, + "learning_rate": 4.984255569703508e-06, + "loss": 0.7222, + "step": 550 + }, + { + "epoch": 0.26052009456264774, + "grad_norm": 2.734330177307129, + "learning_rate": 4.984185590657325e-06, + "loss": 0.6098, + "step": 551 + }, + { + "epoch": 0.26099290780141843, + "grad_norm": 3.739010810852051, + "learning_rate": 4.984115456931798e-06, + "loss": 0.7457, + "step": 552 + }, + { + "epoch": 0.2614657210401891, + "grad_norm": 2.8412528038024902, + "learning_rate": 4.9840451685312925e-06, + "loss": 0.6972, + "step": 553 + }, + { + "epoch": 0.2619385342789598, + "grad_norm": 3.017395496368408, + "learning_rate": 4.983974725460188e-06, + "loss": 0.6887, + "step": 554 + }, + { + "epoch": 0.2624113475177305, + "grad_norm": 3.2746949195861816, + "learning_rate": 4.98390412772287e-06, + "loss": 0.7047, + "step": 555 + }, + { + "epoch": 0.2628841607565012, + "grad_norm": 3.1561965942382812, + "learning_rate": 4.983833375323732e-06, + "loss": 0.7726, + "step": 556 + }, + { + "epoch": 0.26335697399527186, + "grad_norm": 3.2367217540740967, + "learning_rate": 4.9837624682671816e-06, + "loss": 0.6348, + "step": 557 + }, + { + "epoch": 0.26382978723404255, + "grad_norm": 2.8195858001708984, + "learning_rate": 4.983691406557633e-06, + "loss": 0.6387, + "step": 558 + }, + { + "epoch": 0.26430260047281323, + "grad_norm": 3.349820852279663, + "learning_rate": 4.983620190199511e-06, + "loss": 0.6776, + "step": 559 + }, + { + "epoch": 0.2647754137115839, + "grad_norm": 2.8025588989257812, + "learning_rate": 4.98354881919725e-06, + "loss": 0.6512, + "step": 560 + }, + { + "epoch": 0.2652482269503546, + "grad_norm": 2.9125499725341797, + "learning_rate": 4.983477293555295e-06, + "loss": 0.7024, + "step": 561 + }, + { + "epoch": 0.2657210401891253, + "grad_norm": 3.3479275703430176, + "learning_rate": 4.983405613278098e-06, + "loss": 0.688, + "step": 562 + }, + { + "epoch": 0.266193853427896, + "grad_norm": 3.123971462249756, + "learning_rate": 4.983333778370123e-06, + "loss": 0.6743, + "step": 563 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.891625165939331, + 
"learning_rate": 4.983261788835843e-06, + "loss": 0.5971, + "step": 564 + }, + { + "epoch": 0.26713947990543735, + "grad_norm": 3.5066864490509033, + "learning_rate": 4.98318964467974e-06, + "loss": 0.6958, + "step": 565 + }, + { + "epoch": 0.26761229314420804, + "grad_norm": 2.570547342300415, + "learning_rate": 4.983117345906306e-06, + "loss": 0.609, + "step": 566 + }, + { + "epoch": 0.2680851063829787, + "grad_norm": 3.005106210708618, + "learning_rate": 4.983044892520044e-06, + "loss": 0.6791, + "step": 567 + }, + { + "epoch": 0.2685579196217494, + "grad_norm": 3.429675340652466, + "learning_rate": 4.982972284525463e-06, + "loss": 0.6625, + "step": 568 + }, + { + "epoch": 0.2690307328605201, + "grad_norm": 3.825657367706299, + "learning_rate": 4.982899521927086e-06, + "loss": 0.6368, + "step": 569 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 2.8699095249176025, + "learning_rate": 4.982826604729443e-06, + "loss": 0.6425, + "step": 570 + }, + { + "epoch": 0.26997635933806147, + "grad_norm": 3.1688714027404785, + "learning_rate": 4.982753532937074e-06, + "loss": 0.6904, + "step": 571 + }, + { + "epoch": 0.27044917257683215, + "grad_norm": 3.3889992237091064, + "learning_rate": 4.98268030655453e-06, + "loss": 0.7575, + "step": 572 + }, + { + "epoch": 0.27092198581560284, + "grad_norm": 3.108315944671631, + "learning_rate": 4.982606925586367e-06, + "loss": 0.6648, + "step": 573 + }, + { + "epoch": 0.2713947990543735, + "grad_norm": 3.209831953048706, + "learning_rate": 4.982533390037159e-06, + "loss": 0.657, + "step": 574 + }, + { + "epoch": 0.2718676122931442, + "grad_norm": 3.1740927696228027, + "learning_rate": 4.982459699911482e-06, + "loss": 0.7262, + "step": 575 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 3.0190417766571045, + "learning_rate": 4.982385855213924e-06, + "loss": 0.6368, + "step": 576 + }, + { + "epoch": 0.2728132387706856, + "grad_norm": 3.05049467086792, + "learning_rate": 4.982311855949084e-06, + "loss": 0.72, + "step": 577 + }, + { + "epoch": 0.27328605200945627, + "grad_norm": 2.984816551208496, + "learning_rate": 4.98223770212157e-06, + "loss": 0.6856, + "step": 578 + }, + { + "epoch": 0.27375886524822696, + "grad_norm": 2.744969606399536, + "learning_rate": 4.982163393735998e-06, + "loss": 0.6023, + "step": 579 + }, + { + "epoch": 0.27423167848699764, + "grad_norm": 3.170564889907837, + "learning_rate": 4.982088930796996e-06, + "loss": 0.6678, + "step": 580 + }, + { + "epoch": 0.27470449172576833, + "grad_norm": 2.8686118125915527, + "learning_rate": 4.982014313309199e-06, + "loss": 0.6157, + "step": 581 + }, + { + "epoch": 0.275177304964539, + "grad_norm": 2.8768694400787354, + "learning_rate": 4.981939541277254e-06, + "loss": 0.6566, + "step": 582 + }, + { + "epoch": 0.2756501182033097, + "grad_norm": 2.621481418609619, + "learning_rate": 4.981864614705818e-06, + "loss": 0.7372, + "step": 583 + }, + { + "epoch": 0.2761229314420804, + "grad_norm": 3.527374267578125, + "learning_rate": 4.981789533599554e-06, + "loss": 0.6485, + "step": 584 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 3.3141074180603027, + "learning_rate": 4.981714297963138e-06, + "loss": 0.6816, + "step": 585 + }, + { + "epoch": 0.27706855791962176, + "grad_norm": 2.9247069358825684, + "learning_rate": 4.981638907801255e-06, + "loss": 0.7217, + "step": 586 + }, + { + "epoch": 0.27754137115839245, + "grad_norm": 2.875236749649048, + "learning_rate": 4.981563363118599e-06, + "loss": 0.6662, + "step": 587 + }, + { + "epoch": 0.27801418439716313, + "grad_norm": 
2.9540364742279053, + "learning_rate": 4.981487663919874e-06, + "loss": 0.7225, + "step": 588 + }, + { + "epoch": 0.2784869976359338, + "grad_norm": 2.90889310836792, + "learning_rate": 4.981411810209793e-06, + "loss": 0.6054, + "step": 589 + }, + { + "epoch": 0.2789598108747045, + "grad_norm": 2.8541409969329834, + "learning_rate": 4.981335801993078e-06, + "loss": 0.6539, + "step": 590 + }, + { + "epoch": 0.2794326241134752, + "grad_norm": 3.1600730419158936, + "learning_rate": 4.981259639274465e-06, + "loss": 0.6415, + "step": 591 + }, + { + "epoch": 0.2799054373522459, + "grad_norm": 3.569376230239868, + "learning_rate": 4.981183322058693e-06, + "loss": 0.6944, + "step": 592 + }, + { + "epoch": 0.28037825059101656, + "grad_norm": 3.067667007446289, + "learning_rate": 4.981106850350515e-06, + "loss": 0.7378, + "step": 593 + }, + { + "epoch": 0.28085106382978725, + "grad_norm": 3.082073450088501, + "learning_rate": 4.981030224154693e-06, + "loss": 0.693, + "step": 594 + }, + { + "epoch": 0.28132387706855794, + "grad_norm": 2.902932643890381, + "learning_rate": 4.980953443475998e-06, + "loss": 0.6549, + "step": 595 + }, + { + "epoch": 0.2817966903073286, + "grad_norm": 2.6821181774139404, + "learning_rate": 4.980876508319211e-06, + "loss": 0.6231, + "step": 596 + }, + { + "epoch": 0.2822695035460993, + "grad_norm": 3.1747355461120605, + "learning_rate": 4.9807994186891215e-06, + "loss": 0.6826, + "step": 597 + }, + { + "epoch": 0.28274231678487, + "grad_norm": 2.6975860595703125, + "learning_rate": 4.980722174590531e-06, + "loss": 0.6669, + "step": 598 + }, + { + "epoch": 0.2832151300236407, + "grad_norm": 2.924285650253296, + "learning_rate": 4.9806447760282486e-06, + "loss": 0.689, + "step": 599 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 2.941417694091797, + "learning_rate": 4.980567223007093e-06, + "loss": 0.6672, + "step": 600 + }, + { + "epoch": 0.28416075650118205, + "grad_norm": 2.8582186698913574, + "learning_rate": 4.980489515531892e-06, + "loss": 0.6229, + "step": 601 + }, + { + "epoch": 0.28463356973995274, + "grad_norm": 2.6462013721466064, + "learning_rate": 4.9804116536074865e-06, + "loss": 0.606, + "step": 602 + }, + { + "epoch": 0.2851063829787234, + "grad_norm": 2.9029998779296875, + "learning_rate": 4.980333637238723e-06, + "loss": 0.5915, + "step": 603 + }, + { + "epoch": 0.2855791962174941, + "grad_norm": 3.9359042644500732, + "learning_rate": 4.980255466430462e-06, + "loss": 0.7035, + "step": 604 + }, + { + "epoch": 0.2860520094562648, + "grad_norm": 3.200524091720581, + "learning_rate": 4.980177141187566e-06, + "loss": 0.7156, + "step": 605 + }, + { + "epoch": 0.2865248226950355, + "grad_norm": 3.1708686351776123, + "learning_rate": 4.980098661514916e-06, + "loss": 0.746, + "step": 606 + }, + { + "epoch": 0.28699763593380617, + "grad_norm": 2.8926830291748047, + "learning_rate": 4.980020027417397e-06, + "loss": 0.6282, + "step": 607 + }, + { + "epoch": 0.28747044917257686, + "grad_norm": 3.0526294708251953, + "learning_rate": 4.979941238899906e-06, + "loss": 0.6594, + "step": 608 + }, + { + "epoch": 0.28794326241134754, + "grad_norm": 2.9869306087493896, + "learning_rate": 4.9798622959673486e-06, + "loss": 0.7771, + "step": 609 + }, + { + "epoch": 0.28841607565011823, + "grad_norm": 2.7894513607025146, + "learning_rate": 4.979783198624638e-06, + "loss": 0.6819, + "step": 610 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 2.958575963973999, + "learning_rate": 4.9797039468767025e-06, + "loss": 0.6474, + "step": 611 + }, + { + "epoch": 
0.28936170212765955, + "grad_norm": 3.423748016357422, + "learning_rate": 4.979624540728475e-06, + "loss": 0.7389, + "step": 612 + }, + { + "epoch": 0.28983451536643023, + "grad_norm": 2.9641635417938232, + "learning_rate": 4.9795449801849e-06, + "loss": 0.6005, + "step": 613 + }, + { + "epoch": 0.2903073286052009, + "grad_norm": 3.02274227142334, + "learning_rate": 4.979465265250933e-06, + "loss": 0.6358, + "step": 614 + }, + { + "epoch": 0.2907801418439716, + "grad_norm": 3.0562758445739746, + "learning_rate": 4.979385395931534e-06, + "loss": 0.6313, + "step": 615 + }, + { + "epoch": 0.2912529550827423, + "grad_norm": 3.301816701889038, + "learning_rate": 4.97930537223168e-06, + "loss": 0.7264, + "step": 616 + }, + { + "epoch": 0.291725768321513, + "grad_norm": 2.975360870361328, + "learning_rate": 4.979225194156351e-06, + "loss": 0.613, + "step": 617 + }, + { + "epoch": 0.29219858156028367, + "grad_norm": 2.9245030879974365, + "learning_rate": 4.97914486171054e-06, + "loss": 0.6646, + "step": 618 + }, + { + "epoch": 0.29267139479905435, + "grad_norm": 3.1336188316345215, + "learning_rate": 4.979064374899249e-06, + "loss": 0.6421, + "step": 619 + }, + { + "epoch": 0.29314420803782504, + "grad_norm": 3.6298763751983643, + "learning_rate": 4.978983733727491e-06, + "loss": 0.6433, + "step": 620 + }, + { + "epoch": 0.2936170212765957, + "grad_norm": 2.919597625732422, + "learning_rate": 4.9789029382002845e-06, + "loss": 0.6288, + "step": 621 + }, + { + "epoch": 0.2940898345153664, + "grad_norm": 3.2206127643585205, + "learning_rate": 4.978821988322662e-06, + "loss": 0.7102, + "step": 622 + }, + { + "epoch": 0.2945626477541371, + "grad_norm": 3.1767101287841797, + "learning_rate": 4.978740884099664e-06, + "loss": 0.6722, + "step": 623 + }, + { + "epoch": 0.2950354609929078, + "grad_norm": 3.3425452709198, + "learning_rate": 4.97865962553634e-06, + "loss": 0.6492, + "step": 624 + }, + { + "epoch": 0.29550827423167847, + "grad_norm": 3.0408358573913574, + "learning_rate": 4.97857821263775e-06, + "loss": 0.6522, + "step": 625 + }, + { + "epoch": 0.29598108747044916, + "grad_norm": 2.8144783973693848, + "learning_rate": 4.978496645408963e-06, + "loss": 0.7237, + "step": 626 + }, + { + "epoch": 0.29645390070921984, + "grad_norm": 3.7010560035705566, + "learning_rate": 4.978414923855057e-06, + "loss": 0.7509, + "step": 627 + }, + { + "epoch": 0.29692671394799053, + "grad_norm": 2.9438371658325195, + "learning_rate": 4.978333047981122e-06, + "loss": 0.6244, + "step": 628 + }, + { + "epoch": 0.2973995271867612, + "grad_norm": 3.285982370376587, + "learning_rate": 4.978251017792255e-06, + "loss": 0.7553, + "step": 629 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 3.7021138668060303, + "learning_rate": 4.978168833293564e-06, + "loss": 0.7859, + "step": 630 + }, + { + "epoch": 0.2983451536643026, + "grad_norm": 3.481858730316162, + "learning_rate": 4.9780864944901654e-06, + "loss": 0.7146, + "step": 631 + }, + { + "epoch": 0.2988179669030733, + "grad_norm": 3.693824529647827, + "learning_rate": 4.978004001387188e-06, + "loss": 0.6608, + "step": 632 + }, + { + "epoch": 0.29929078014184396, + "grad_norm": 3.0069146156311035, + "learning_rate": 4.9779213539897665e-06, + "loss": 0.6506, + "step": 633 + }, + { + "epoch": 0.29976359338061465, + "grad_norm": 3.037644147872925, + "learning_rate": 4.977838552303048e-06, + "loss": 0.6487, + "step": 634 + }, + { + "epoch": 0.30023640661938533, + "grad_norm": 3.018554449081421, + "learning_rate": 4.977755596332188e-06, + "loss": 0.6128, + "step": 635 + }, 
+ { + "epoch": 0.300709219858156, + "grad_norm": 3.000312089920044, + "learning_rate": 4.977672486082351e-06, + "loss": 0.6431, + "step": 636 + }, + { + "epoch": 0.3011820330969267, + "grad_norm": 2.836803913116455, + "learning_rate": 4.977589221558713e-06, + "loss": 0.5914, + "step": 637 + }, + { + "epoch": 0.3016548463356974, + "grad_norm": 3.080469846725464, + "learning_rate": 4.977505802766457e-06, + "loss": 0.7265, + "step": 638 + }, + { + "epoch": 0.3021276595744681, + "grad_norm": 3.2245471477508545, + "learning_rate": 4.97742222971078e-06, + "loss": 0.6895, + "step": 639 + }, + { + "epoch": 0.30260047281323876, + "grad_norm": 3.559006452560425, + "learning_rate": 4.977338502396882e-06, + "loss": 0.7439, + "step": 640 + }, + { + "epoch": 0.30307328605200945, + "grad_norm": 2.9116289615631104, + "learning_rate": 4.9772546208299795e-06, + "loss": 0.6907, + "step": 641 + }, + { + "epoch": 0.30354609929078014, + "grad_norm": 3.3645524978637695, + "learning_rate": 4.977170585015295e-06, + "loss": 0.6983, + "step": 642 + }, + { + "epoch": 0.3040189125295508, + "grad_norm": 3.080148458480835, + "learning_rate": 4.977086394958058e-06, + "loss": 0.7016, + "step": 643 + }, + { + "epoch": 0.3044917257683215, + "grad_norm": 2.9276750087738037, + "learning_rate": 4.977002050663515e-06, + "loss": 0.6509, + "step": 644 + }, + { + "epoch": 0.3049645390070922, + "grad_norm": 3.183609962463379, + "learning_rate": 4.976917552136914e-06, + "loss": 0.6814, + "step": 645 + }, + { + "epoch": 0.3054373522458629, + "grad_norm": 3.0980000495910645, + "learning_rate": 4.976832899383519e-06, + "loss": 0.6319, + "step": 646 + }, + { + "epoch": 0.30591016548463357, + "grad_norm": 3.211376190185547, + "learning_rate": 4.9767480924086e-06, + "loss": 0.6365, + "step": 647 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 3.214430093765259, + "learning_rate": 4.976663131217437e-06, + "loss": 0.6006, + "step": 648 + }, + { + "epoch": 0.30685579196217494, + "grad_norm": 3.0914318561553955, + "learning_rate": 4.976578015815321e-06, + "loss": 0.7162, + "step": 649 + }, + { + "epoch": 0.3073286052009456, + "grad_norm": 2.7644500732421875, + "learning_rate": 4.976492746207551e-06, + "loss": 0.6045, + "step": 650 + }, + { + "epoch": 0.3078014184397163, + "grad_norm": 3.1913280487060547, + "learning_rate": 4.9764073223994374e-06, + "loss": 0.6796, + "step": 651 + }, + { + "epoch": 0.308274231678487, + "grad_norm": 2.8919692039489746, + "learning_rate": 4.976321744396299e-06, + "loss": 0.6683, + "step": 652 + }, + { + "epoch": 0.3087470449172577, + "grad_norm": 2.862234115600586, + "learning_rate": 4.976236012203463e-06, + "loss": 0.6631, + "step": 653 + }, + { + "epoch": 0.30921985815602837, + "grad_norm": 2.9708092212677, + "learning_rate": 4.976150125826268e-06, + "loss": 0.6326, + "step": 654 + }, + { + "epoch": 0.30969267139479906, + "grad_norm": 2.892465353012085, + "learning_rate": 4.976064085270063e-06, + "loss": 0.6574, + "step": 655 + }, + { + "epoch": 0.31016548463356974, + "grad_norm": 3.9215126037597656, + "learning_rate": 4.975977890540205e-06, + "loss": 0.7351, + "step": 656 + }, + { + "epoch": 0.31063829787234043, + "grad_norm": 2.9544081687927246, + "learning_rate": 4.975891541642059e-06, + "loss": 0.7264, + "step": 657 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 2.995035409927368, + "learning_rate": 4.975805038581005e-06, + "loss": 0.7405, + "step": 658 + }, + { + "epoch": 0.3115839243498818, + "grad_norm": 2.9653120040893555, + "learning_rate": 4.975718381362427e-06, + "loss": 0.679, + 
"step": 659 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 2.93976092338562, + "learning_rate": 4.9756315699917205e-06, + "loss": 0.627, + "step": 660 + }, + { + "epoch": 0.3125295508274232, + "grad_norm": 3.106522560119629, + "learning_rate": 4.9755446044742915e-06, + "loss": 0.6329, + "step": 661 + }, + { + "epoch": 0.31300236406619386, + "grad_norm": 3.0238280296325684, + "learning_rate": 4.975457484815554e-06, + "loss": 0.6643, + "step": 662 + }, + { + "epoch": 0.31347517730496455, + "grad_norm": 2.943528175354004, + "learning_rate": 4.9753702110209356e-06, + "loss": 0.668, + "step": 663 + }, + { + "epoch": 0.31394799054373523, + "grad_norm": 2.6840121746063232, + "learning_rate": 4.9752827830958676e-06, + "loss": 0.5482, + "step": 664 + }, + { + "epoch": 0.3144208037825059, + "grad_norm": 2.823875904083252, + "learning_rate": 4.975195201045794e-06, + "loss": 0.7017, + "step": 665 + }, + { + "epoch": 0.3148936170212766, + "grad_norm": 3.148181200027466, + "learning_rate": 4.975107464876168e-06, + "loss": 0.747, + "step": 666 + }, + { + "epoch": 0.3153664302600473, + "grad_norm": 2.630584478378296, + "learning_rate": 4.9750195745924545e-06, + "loss": 0.5987, + "step": 667 + }, + { + "epoch": 0.315839243498818, + "grad_norm": 3.075866460800171, + "learning_rate": 4.974931530200124e-06, + "loss": 0.664, + "step": 668 + }, + { + "epoch": 0.31631205673758866, + "grad_norm": 2.947197914123535, + "learning_rate": 4.974843331704659e-06, + "loss": 0.631, + "step": 669 + }, + { + "epoch": 0.31678486997635935, + "grad_norm": 3.519646644592285, + "learning_rate": 4.974754979111552e-06, + "loss": 0.7154, + "step": 670 + }, + { + "epoch": 0.31725768321513004, + "grad_norm": 2.8687186241149902, + "learning_rate": 4.974666472426305e-06, + "loss": 0.6366, + "step": 671 + }, + { + "epoch": 0.3177304964539007, + "grad_norm": 2.6966612339019775, + "learning_rate": 4.974577811654426e-06, + "loss": 0.7112, + "step": 672 + }, + { + "epoch": 0.3182033096926714, + "grad_norm": 3.1390228271484375, + "learning_rate": 4.974488996801439e-06, + "loss": 0.6882, + "step": 673 + }, + { + "epoch": 0.3186761229314421, + "grad_norm": 3.4667599201202393, + "learning_rate": 4.974400027872871e-06, + "loss": 0.7153, + "step": 674 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 2.9632184505462646, + "learning_rate": 4.974310904874265e-06, + "loss": 0.7081, + "step": 675 + }, + { + "epoch": 0.31962174940898347, + "grad_norm": 3.46150279045105, + "learning_rate": 4.9742216278111666e-06, + "loss": 0.6242, + "step": 676 + }, + { + "epoch": 0.32009456264775416, + "grad_norm": 3.380403757095337, + "learning_rate": 4.974132196689137e-06, + "loss": 0.6863, + "step": 677 + }, + { + "epoch": 0.32056737588652484, + "grad_norm": 3.4279606342315674, + "learning_rate": 4.974042611513746e-06, + "loss": 0.6388, + "step": 678 + }, + { + "epoch": 0.3210401891252955, + "grad_norm": 2.634523391723633, + "learning_rate": 4.973952872290568e-06, + "loss": 0.6038, + "step": 679 + }, + { + "epoch": 0.3215130023640662, + "grad_norm": 3.19693922996521, + "learning_rate": 4.973862979025194e-06, + "loss": 0.6383, + "step": 680 + }, + { + "epoch": 0.3219858156028369, + "grad_norm": 3.437692165374756, + "learning_rate": 4.973772931723218e-06, + "loss": 0.7288, + "step": 681 + }, + { + "epoch": 0.3224586288416076, + "grad_norm": 2.506301164627075, + "learning_rate": 4.97368273039025e-06, + "loss": 0.5707, + "step": 682 + }, + { + "epoch": 0.3229314420803783, + "grad_norm": 3.0942845344543457, + "learning_rate": 4.9735923750319044e-06, + 
"loss": 0.6348, + "step": 683 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 3.0889835357666016, + "learning_rate": 4.973501865653809e-06, + "loss": 0.6697, + "step": 684 + }, + { + "epoch": 0.32387706855791965, + "grad_norm": 3.0391931533813477, + "learning_rate": 4.973411202261598e-06, + "loss": 0.7091, + "step": 685 + }, + { + "epoch": 0.32434988179669033, + "grad_norm": 3.0333497524261475, + "learning_rate": 4.973320384860917e-06, + "loss": 0.6403, + "step": 686 + }, + { + "epoch": 0.324822695035461, + "grad_norm": 2.9714622497558594, + "learning_rate": 4.973229413457421e-06, + "loss": 0.6977, + "step": 687 + }, + { + "epoch": 0.3252955082742317, + "grad_norm": 3.057558298110962, + "learning_rate": 4.973138288056774e-06, + "loss": 0.7236, + "step": 688 + }, + { + "epoch": 0.3257683215130024, + "grad_norm": 2.921093463897705, + "learning_rate": 4.97304700866465e-06, + "loss": 0.576, + "step": 689 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 3.0287256240844727, + "learning_rate": 4.972955575286732e-06, + "loss": 0.7077, + "step": 690 + }, + { + "epoch": 0.32671394799054376, + "grad_norm": 2.8621346950531006, + "learning_rate": 4.972863987928716e-06, + "loss": 0.6952, + "step": 691 + }, + { + "epoch": 0.3271867612293144, + "grad_norm": 2.631359100341797, + "learning_rate": 4.9727722465963006e-06, + "loss": 0.6931, + "step": 692 + }, + { + "epoch": 0.3276595744680851, + "grad_norm": 2.8484320640563965, + "learning_rate": 4.972680351295201e-06, + "loss": 0.6292, + "step": 693 + }, + { + "epoch": 0.32813238770685577, + "grad_norm": 2.593001365661621, + "learning_rate": 4.972588302031138e-06, + "loss": 0.5942, + "step": 694 + }, + { + "epoch": 0.32860520094562645, + "grad_norm": 2.6321065425872803, + "learning_rate": 4.972496098809844e-06, + "loss": 0.65, + "step": 695 + }, + { + "epoch": 0.32907801418439714, + "grad_norm": 3.2516732215881348, + "learning_rate": 4.972403741637059e-06, + "loss": 0.7385, + "step": 696 + }, + { + "epoch": 0.3295508274231678, + "grad_norm": 3.180854320526123, + "learning_rate": 4.972311230518535e-06, + "loss": 0.6569, + "step": 697 + }, + { + "epoch": 0.3300236406619385, + "grad_norm": 4.161016941070557, + "learning_rate": 4.972218565460031e-06, + "loss": 0.6416, + "step": 698 + }, + { + "epoch": 0.3304964539007092, + "grad_norm": 3.153897762298584, + "learning_rate": 4.972125746467317e-06, + "loss": 0.7196, + "step": 699 + }, + { + "epoch": 0.3309692671394799, + "grad_norm": 2.9595556259155273, + "learning_rate": 4.972032773546173e-06, + "loss": 0.7093, + "step": 700 + }, + { + "epoch": 0.33144208037825057, + "grad_norm": 3.1086833477020264, + "learning_rate": 4.9719396467023875e-06, + "loss": 0.6963, + "step": 701 + }, + { + "epoch": 0.33191489361702126, + "grad_norm": 2.958921432495117, + "learning_rate": 4.971846365941759e-06, + "loss": 0.6518, + "step": 702 + }, + { + "epoch": 0.33238770685579194, + "grad_norm": 2.8745479583740234, + "learning_rate": 4.971752931270096e-06, + "loss": 0.696, + "step": 703 + }, + { + "epoch": 0.33286052009456263, + "grad_norm": 3.224358558654785, + "learning_rate": 4.971659342693217e-06, + "loss": 0.6769, + "step": 704 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.696319580078125, + "learning_rate": 4.9715656002169486e-06, + "loss": 0.6833, + "step": 705 + }, + { + "epoch": 0.333806146572104, + "grad_norm": 2.9283502101898193, + "learning_rate": 4.971471703847127e-06, + "loss": 0.6784, + "step": 706 + }, + { + "epoch": 0.3342789598108747, + "grad_norm": 2.654914140701294, + "learning_rate": 
4.9713776535896e-06, + "loss": 0.6337, + "step": 707 + }, + { + "epoch": 0.3347517730496454, + "grad_norm": 3.041555643081665, + "learning_rate": 4.971283449450224e-06, + "loss": 0.6227, + "step": 708 + }, + { + "epoch": 0.33522458628841606, + "grad_norm": 2.893008232116699, + "learning_rate": 4.971189091434863e-06, + "loss": 0.655, + "step": 709 + }, + { + "epoch": 0.33569739952718675, + "grad_norm": 2.8806653022766113, + "learning_rate": 4.971094579549393e-06, + "loss": 0.7077, + "step": 710 + }, + { + "epoch": 0.33617021276595743, + "grad_norm": 3.4830048084259033, + "learning_rate": 4.9709999137996986e-06, + "loss": 0.7461, + "step": 711 + }, + { + "epoch": 0.3366430260047281, + "grad_norm": 3.155444860458374, + "learning_rate": 4.970905094191674e-06, + "loss": 0.652, + "step": 712 + }, + { + "epoch": 0.3371158392434988, + "grad_norm": 2.7608706951141357, + "learning_rate": 4.970810120731225e-06, + "loss": 0.684, + "step": 713 + }, + { + "epoch": 0.3375886524822695, + "grad_norm": 2.8209474086761475, + "learning_rate": 4.970714993424265e-06, + "loss": 0.6009, + "step": 714 + }, + { + "epoch": 0.3380614657210402, + "grad_norm": 3.6532654762268066, + "learning_rate": 4.9706197122767145e-06, + "loss": 0.702, + "step": 715 + }, + { + "epoch": 0.33853427895981086, + "grad_norm": 2.6276566982269287, + "learning_rate": 4.970524277294508e-06, + "loss": 0.6338, + "step": 716 + }, + { + "epoch": 0.33900709219858155, + "grad_norm": 3.509871482849121, + "learning_rate": 4.970428688483589e-06, + "loss": 0.6853, + "step": 717 + }, + { + "epoch": 0.33947990543735224, + "grad_norm": 5.332682132720947, + "learning_rate": 4.970332945849906e-06, + "loss": 0.6684, + "step": 718 + }, + { + "epoch": 0.3399527186761229, + "grad_norm": 2.718801975250244, + "learning_rate": 4.970237049399424e-06, + "loss": 0.6676, + "step": 719 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 3.891003131866455, + "learning_rate": 4.970140999138112e-06, + "loss": 0.7043, + "step": 720 + }, + { + "epoch": 0.3408983451536643, + "grad_norm": 2.8863155841827393, + "learning_rate": 4.970044795071951e-06, + "loss": 0.6563, + "step": 721 + }, + { + "epoch": 0.341371158392435, + "grad_norm": 3.2527518272399902, + "learning_rate": 4.969948437206932e-06, + "loss": 0.7244, + "step": 722 + }, + { + "epoch": 0.34184397163120567, + "grad_norm": 2.9726758003234863, + "learning_rate": 4.969851925549054e-06, + "loss": 0.6548, + "step": 723 + }, + { + "epoch": 0.34231678486997635, + "grad_norm": 3.118309497833252, + "learning_rate": 4.969755260104327e-06, + "loss": 0.7293, + "step": 724 + }, + { + "epoch": 0.34278959810874704, + "grad_norm": 3.373068332672119, + "learning_rate": 4.969658440878769e-06, + "loss": 0.6444, + "step": 725 + }, + { + "epoch": 0.3432624113475177, + "grad_norm": 2.7157437801361084, + "learning_rate": 4.969561467878409e-06, + "loss": 0.642, + "step": 726 + }, + { + "epoch": 0.3437352245862884, + "grad_norm": 2.58929705619812, + "learning_rate": 4.969464341109285e-06, + "loss": 0.6165, + "step": 727 + }, + { + "epoch": 0.3442080378250591, + "grad_norm": 2.8811306953430176, + "learning_rate": 4.969367060577445e-06, + "loss": 0.7127, + "step": 728 + }, + { + "epoch": 0.3446808510638298, + "grad_norm": 3.494358539581299, + "learning_rate": 4.969269626288946e-06, + "loss": 0.7103, + "step": 729 + }, + { + "epoch": 0.34515366430260047, + "grad_norm": 2.9753928184509277, + "learning_rate": 4.969172038249855e-06, + "loss": 0.6911, + "step": 730 + }, + { + "epoch": 0.34562647754137116, + "grad_norm": 3.2885913848876953, + 
"learning_rate": 4.969074296466247e-06, + "loss": 0.6968, + "step": 731 + }, + { + "epoch": 0.34609929078014184, + "grad_norm": 2.7564568519592285, + "learning_rate": 4.968976400944211e-06, + "loss": 0.6843, + "step": 732 + }, + { + "epoch": 0.34657210401891253, + "grad_norm": 2.9255006313323975, + "learning_rate": 4.96887835168984e-06, + "loss": 0.6024, + "step": 733 + }, + { + "epoch": 0.3470449172576832, + "grad_norm": 3.1808290481567383, + "learning_rate": 4.968780148709239e-06, + "loss": 0.7377, + "step": 734 + }, + { + "epoch": 0.3475177304964539, + "grad_norm": 2.956666946411133, + "learning_rate": 4.968681792008523e-06, + "loss": 0.65, + "step": 735 + }, + { + "epoch": 0.3479905437352246, + "grad_norm": 2.9631855487823486, + "learning_rate": 4.9685832815938175e-06, + "loss": 0.677, + "step": 736 + }, + { + "epoch": 0.3484633569739953, + "grad_norm": 2.501917600631714, + "learning_rate": 4.968484617471256e-06, + "loss": 0.6282, + "step": 737 + }, + { + "epoch": 0.34893617021276596, + "grad_norm": 2.750779628753662, + "learning_rate": 4.968385799646981e-06, + "loss": 0.6507, + "step": 738 + }, + { + "epoch": 0.34940898345153665, + "grad_norm": 2.872300624847412, + "learning_rate": 4.968286828127146e-06, + "loss": 0.5949, + "step": 739 + }, + { + "epoch": 0.34988179669030733, + "grad_norm": 2.6316142082214355, + "learning_rate": 4.9681877029179124e-06, + "loss": 0.6328, + "step": 740 + }, + { + "epoch": 0.350354609929078, + "grad_norm": 3.244364023208618, + "learning_rate": 4.968088424025454e-06, + "loss": 0.7393, + "step": 741 + }, + { + "epoch": 0.3508274231678487, + "grad_norm": 2.620465040206909, + "learning_rate": 4.967988991455951e-06, + "loss": 0.6797, + "step": 742 + }, + { + "epoch": 0.3513002364066194, + "grad_norm": 2.854513645172119, + "learning_rate": 4.967889405215596e-06, + "loss": 0.6368, + "step": 743 + }, + { + "epoch": 0.3517730496453901, + "grad_norm": 2.579854726791382, + "learning_rate": 4.9677896653105886e-06, + "loss": 0.6489, + "step": 744 + }, + { + "epoch": 0.35224586288416077, + "grad_norm": 3.0697381496429443, + "learning_rate": 4.96768977174714e-06, + "loss": 0.6313, + "step": 745 + }, + { + "epoch": 0.35271867612293145, + "grad_norm": 3.369338035583496, + "learning_rate": 4.96758972453147e-06, + "loss": 0.7416, + "step": 746 + }, + { + "epoch": 0.35319148936170214, + "grad_norm": 2.836221933364868, + "learning_rate": 4.967489523669807e-06, + "loss": 0.6422, + "step": 747 + }, + { + "epoch": 0.3536643026004728, + "grad_norm": 2.929579496383667, + "learning_rate": 4.967389169168392e-06, + "loss": 0.6482, + "step": 748 + }, + { + "epoch": 0.3541371158392435, + "grad_norm": 2.9243831634521484, + "learning_rate": 4.967288661033472e-06, + "loss": 0.5813, + "step": 749 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 3.7555336952209473, + "learning_rate": 4.967187999271306e-06, + "loss": 0.6501, + "step": 750 + }, + { + "epoch": 0.3550827423167849, + "grad_norm": 3.4279143810272217, + "learning_rate": 4.9670871838881615e-06, + "loss": 0.6326, + "step": 751 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 2.875066041946411, + "learning_rate": 4.9669862148903166e-06, + "loss": 0.664, + "step": 752 + }, + { + "epoch": 0.35602836879432626, + "grad_norm": 3.130394697189331, + "learning_rate": 4.966885092284057e-06, + "loss": 0.706, + "step": 753 + }, + { + "epoch": 0.35650118203309694, + "grad_norm": 2.9606287479400635, + "learning_rate": 4.96678381607568e-06, + "loss": 0.693, + "step": 754 + }, + { + "epoch": 0.35697399527186763, + "grad_norm": 
3.0584909915924072, + "learning_rate": 4.966682386271491e-06, + "loss": 0.6034, + "step": 755 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 2.8215200901031494, + "learning_rate": 4.966580802877805e-06, + "loss": 0.6217, + "step": 756 + }, + { + "epoch": 0.357919621749409, + "grad_norm": 2.7348055839538574, + "learning_rate": 4.966479065900949e-06, + "loss": 0.6194, + "step": 757 + }, + { + "epoch": 0.3583924349881797, + "grad_norm": 3.2347466945648193, + "learning_rate": 4.966377175347257e-06, + "loss": 0.6377, + "step": 758 + }, + { + "epoch": 0.3588652482269504, + "grad_norm": 3.311845302581787, + "learning_rate": 4.966275131223072e-06, + "loss": 0.6234, + "step": 759 + }, + { + "epoch": 0.35933806146572106, + "grad_norm": 3.0384368896484375, + "learning_rate": 4.96617293353475e-06, + "loss": 0.609, + "step": 760 + }, + { + "epoch": 0.35981087470449175, + "grad_norm": 3.516854763031006, + "learning_rate": 4.966070582288653e-06, + "loss": 0.6627, + "step": 761 + }, + { + "epoch": 0.36028368794326243, + "grad_norm": 3.2425215244293213, + "learning_rate": 4.9659680774911534e-06, + "loss": 0.7355, + "step": 762 + }, + { + "epoch": 0.3607565011820331, + "grad_norm": 3.2665750980377197, + "learning_rate": 4.965865419148636e-06, + "loss": 0.6787, + "step": 763 + }, + { + "epoch": 0.3612293144208038, + "grad_norm": 2.729428291320801, + "learning_rate": 4.96576260726749e-06, + "loss": 0.6272, + "step": 764 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 3.299969434738159, + "learning_rate": 4.965659641854119e-06, + "loss": 0.6552, + "step": 765 + }, + { + "epoch": 0.3621749408983452, + "grad_norm": 2.7090916633605957, + "learning_rate": 4.965556522914934e-06, + "loss": 0.6661, + "step": 766 + }, + { + "epoch": 0.36264775413711586, + "grad_norm": 2.488846778869629, + "learning_rate": 4.965453250456355e-06, + "loss": 0.5821, + "step": 767 + }, + { + "epoch": 0.36312056737588655, + "grad_norm": 2.5267233848571777, + "learning_rate": 4.965349824484813e-06, + "loss": 0.5593, + "step": 768 + }, + { + "epoch": 0.36359338061465724, + "grad_norm": 3.0646679401397705, + "learning_rate": 4.965246245006748e-06, + "loss": 0.6341, + "step": 769 + }, + { + "epoch": 0.3640661938534279, + "grad_norm": 2.9877712726593018, + "learning_rate": 4.965142512028609e-06, + "loss": 0.7202, + "step": 770 + }, + { + "epoch": 0.3645390070921986, + "grad_norm": 3.7494113445281982, + "learning_rate": 4.965038625556854e-06, + "loss": 0.7643, + "step": 771 + }, + { + "epoch": 0.3650118203309693, + "grad_norm": 2.8382890224456787, + "learning_rate": 4.964934585597954e-06, + "loss": 0.6522, + "step": 772 + }, + { + "epoch": 0.3654846335697399, + "grad_norm": 3.091655731201172, + "learning_rate": 4.9648303921583854e-06, + "loss": 0.7117, + "step": 773 + }, + { + "epoch": 0.3659574468085106, + "grad_norm": 3.0608325004577637, + "learning_rate": 4.964726045244635e-06, + "loss": 0.6538, + "step": 774 + }, + { + "epoch": 0.3664302600472813, + "grad_norm": 2.8492867946624756, + "learning_rate": 4.964621544863203e-06, + "loss": 0.6079, + "step": 775 + }, + { + "epoch": 0.366903073286052, + "grad_norm": 3.0669894218444824, + "learning_rate": 4.964516891020594e-06, + "loss": 0.6223, + "step": 776 + }, + { + "epoch": 0.36737588652482267, + "grad_norm": 3.089984893798828, + "learning_rate": 4.964412083723325e-06, + "loss": 0.671, + "step": 777 + }, + { + "epoch": 0.36784869976359336, + "grad_norm": 2.905242443084717, + "learning_rate": 4.964307122977921e-06, + "loss": 0.62, + "step": 778 + }, + { + "epoch": 
0.36832151300236404, + "grad_norm": 3.954436779022217, + "learning_rate": 4.964202008790918e-06, + "loss": 0.6535, + "step": 779 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 2.6026058197021484, + "learning_rate": 4.9640967411688615e-06, + "loss": 0.5865, + "step": 780 + }, + { + "epoch": 0.3692671394799054, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.963991320118306e-06, + "loss": 0.6698, + "step": 781 + }, + { + "epoch": 0.3697399527186761, + "grad_norm": 2.9411263465881348, + "learning_rate": 4.963885745645815e-06, + "loss": 0.6173, + "step": 782 + }, + { + "epoch": 0.3702127659574468, + "grad_norm": 2.5679805278778076, + "learning_rate": 4.963780017757962e-06, + "loss": 0.6285, + "step": 783 + }, + { + "epoch": 0.3706855791962175, + "grad_norm": 3.3100640773773193, + "learning_rate": 4.963674136461332e-06, + "loss": 0.5968, + "step": 784 + }, + { + "epoch": 0.37115839243498816, + "grad_norm": 3.1293699741363525, + "learning_rate": 4.963568101762515e-06, + "loss": 0.697, + "step": 785 + }, + { + "epoch": 0.37163120567375885, + "grad_norm": 3.043853759765625, + "learning_rate": 4.963461913668115e-06, + "loss": 0.5881, + "step": 786 + }, + { + "epoch": 0.37210401891252953, + "grad_norm": 3.07351016998291, + "learning_rate": 4.963355572184744e-06, + "loss": 0.6307, + "step": 787 + }, + { + "epoch": 0.3725768321513002, + "grad_norm": 2.7381317615509033, + "learning_rate": 4.9632490773190225e-06, + "loss": 0.716, + "step": 788 + }, + { + "epoch": 0.3730496453900709, + "grad_norm": 2.892221450805664, + "learning_rate": 4.963142429077582e-06, + "loss": 0.6867, + "step": 789 + }, + { + "epoch": 0.3735224586288416, + "grad_norm": 3.133122205734253, + "learning_rate": 4.963035627467064e-06, + "loss": 0.659, + "step": 790 + }, + { + "epoch": 0.3739952718676123, + "grad_norm": 3.032599925994873, + "learning_rate": 4.962928672494116e-06, + "loss": 0.6848, + "step": 791 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 3.0076355934143066, + "learning_rate": 4.9628215641654e-06, + "loss": 0.6549, + "step": 792 + }, + { + "epoch": 0.37494089834515365, + "grad_norm": 2.8904454708099365, + "learning_rate": 4.962714302487585e-06, + "loss": 0.6484, + "step": 793 + }, + { + "epoch": 0.37541371158392434, + "grad_norm": 2.881364107131958, + "learning_rate": 4.9626068874673486e-06, + "loss": 0.721, + "step": 794 + }, + { + "epoch": 0.375886524822695, + "grad_norm": 3.11668062210083, + "learning_rate": 4.962499319111379e-06, + "loss": 0.7824, + "step": 795 + }, + { + "epoch": 0.3763593380614657, + "grad_norm": 2.9201436042785645, + "learning_rate": 4.962391597426374e-06, + "loss": 0.6911, + "step": 796 + }, + { + "epoch": 0.3768321513002364, + "grad_norm": 2.926598072052002, + "learning_rate": 4.962283722419043e-06, + "loss": 0.6715, + "step": 797 + }, + { + "epoch": 0.3773049645390071, + "grad_norm": 2.7267675399780273, + "learning_rate": 4.962175694096101e-06, + "loss": 0.6111, + "step": 798 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 3.194031000137329, + "learning_rate": 4.962067512464275e-06, + "loss": 0.6558, + "step": 799 + }, + { + "epoch": 0.37825059101654845, + "grad_norm": 2.6249136924743652, + "learning_rate": 4.9619591775303e-06, + "loss": 0.6166, + "step": 800 + }, + { + "epoch": 0.37872340425531914, + "grad_norm": 2.6356167793273926, + "learning_rate": 4.961850689300923e-06, + "loss": 0.6112, + "step": 801 + }, + { + "epoch": 0.3791962174940898, + "grad_norm": 3.030724287033081, + "learning_rate": 4.961742047782898e-06, + "loss": 0.6511, + "step": 802 + }, + 
{ + "epoch": 0.3796690307328605, + "grad_norm": 3.4987757205963135, + "learning_rate": 4.96163325298299e-06, + "loss": 0.5888, + "step": 803 + }, + { + "epoch": 0.3801418439716312, + "grad_norm": 3.0371780395507812, + "learning_rate": 4.961524304907974e-06, + "loss": 0.6385, + "step": 804 + }, + { + "epoch": 0.3806146572104019, + "grad_norm": 3.302570104598999, + "learning_rate": 4.961415203564632e-06, + "loss": 0.6515, + "step": 805 + }, + { + "epoch": 0.38108747044917257, + "grad_norm": 2.7597038745880127, + "learning_rate": 4.961305948959759e-06, + "loss": 0.6126, + "step": 806 + }, + { + "epoch": 0.38156028368794326, + "grad_norm": 2.789811849594116, + "learning_rate": 4.9611965411001575e-06, + "loss": 0.6601, + "step": 807 + }, + { + "epoch": 0.38203309692671394, + "grad_norm": 3.0403921604156494, + "learning_rate": 4.961086979992639e-06, + "loss": 0.6947, + "step": 808 + }, + { + "epoch": 0.38250591016548463, + "grad_norm": 3.2139980792999268, + "learning_rate": 4.960977265644026e-06, + "loss": 0.6876, + "step": 809 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.918515205383301, + "learning_rate": 4.960867398061149e-06, + "loss": 0.5997, + "step": 810 + }, + { + "epoch": 0.383451536643026, + "grad_norm": 3.197636604309082, + "learning_rate": 4.9607573772508495e-06, + "loss": 0.5754, + "step": 811 + }, + { + "epoch": 0.3839243498817967, + "grad_norm": 2.8848466873168945, + "learning_rate": 4.960647203219979e-06, + "loss": 0.6424, + "step": 812 + }, + { + "epoch": 0.3843971631205674, + "grad_norm": 3.4810187816619873, + "learning_rate": 4.960536875975397e-06, + "loss": 0.6851, + "step": 813 + }, + { + "epoch": 0.38486997635933806, + "grad_norm": 3.713934898376465, + "learning_rate": 4.960426395523972e-06, + "loss": 0.6122, + "step": 814 + }, + { + "epoch": 0.38534278959810875, + "grad_norm": 2.862600803375244, + "learning_rate": 4.960315761872585e-06, + "loss": 0.6493, + "step": 815 + }, + { + "epoch": 0.38581560283687943, + "grad_norm": 3.133882522583008, + "learning_rate": 4.960204975028123e-06, + "loss": 0.7535, + "step": 816 + }, + { + "epoch": 0.3862884160756501, + "grad_norm": 3.1526732444763184, + "learning_rate": 4.960094034997485e-06, + "loss": 0.6512, + "step": 817 + }, + { + "epoch": 0.3867612293144208, + "grad_norm": 2.7213544845581055, + "learning_rate": 4.959982941787579e-06, + "loss": 0.6121, + "step": 818 + }, + { + "epoch": 0.3872340425531915, + "grad_norm": 3.4935851097106934, + "learning_rate": 4.9598716954053214e-06, + "loss": 0.7852, + "step": 819 + }, + { + "epoch": 0.3877068557919622, + "grad_norm": 2.691016435623169, + "learning_rate": 4.9597602958576395e-06, + "loss": 0.6861, + "step": 820 + }, + { + "epoch": 0.38817966903073287, + "grad_norm": 2.8621015548706055, + "learning_rate": 4.959648743151469e-06, + "loss": 0.6262, + "step": 821 + }, + { + "epoch": 0.38865248226950355, + "grad_norm": 3.3887462615966797, + "learning_rate": 4.959537037293758e-06, + "loss": 0.7103, + "step": 822 + }, + { + "epoch": 0.38912529550827424, + "grad_norm": 2.7565438747406006, + "learning_rate": 4.95942517829146e-06, + "loss": 0.6471, + "step": 823 + }, + { + "epoch": 0.3895981087470449, + "grad_norm": 2.7920358180999756, + "learning_rate": 4.959313166151541e-06, + "loss": 0.6239, + "step": 824 + }, + { + "epoch": 0.3900709219858156, + "grad_norm": 3.18904185295105, + "learning_rate": 4.959201000880973e-06, + "loss": 0.7461, + "step": 825 + }, + { + "epoch": 0.3905437352245863, + "grad_norm": 2.727872371673584, + "learning_rate": 4.959088682486743e-06, + "loss": 0.6333, 
+ "step": 826 + }, + { + "epoch": 0.391016548463357, + "grad_norm": 2.906378746032715, + "learning_rate": 4.958976210975844e-06, + "loss": 0.7547, + "step": 827 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 2.96482515335083, + "learning_rate": 4.958863586355278e-06, + "loss": 0.6312, + "step": 828 + }, + { + "epoch": 0.39196217494089836, + "grad_norm": 3.2890889644622803, + "learning_rate": 4.958750808632059e-06, + "loss": 0.6943, + "step": 829 + }, + { + "epoch": 0.39243498817966904, + "grad_norm": 2.7004311084747314, + "learning_rate": 4.958637877813207e-06, + "loss": 0.5918, + "step": 830 + }, + { + "epoch": 0.39290780141843973, + "grad_norm": 2.7487950325012207, + "learning_rate": 4.9585247939057566e-06, + "loss": 0.6201, + "step": 831 + }, + { + "epoch": 0.3933806146572104, + "grad_norm": 2.7873897552490234, + "learning_rate": 4.958411556916747e-06, + "loss": 0.6268, + "step": 832 + }, + { + "epoch": 0.3938534278959811, + "grad_norm": 2.8501343727111816, + "learning_rate": 4.958298166853229e-06, + "loss": 0.7119, + "step": 833 + }, + { + "epoch": 0.3943262411347518, + "grad_norm": 3.0391547679901123, + "learning_rate": 4.958184623722265e-06, + "loss": 0.6375, + "step": 834 + }, + { + "epoch": 0.3947990543735225, + "grad_norm": 2.850520133972168, + "learning_rate": 4.958070927530922e-06, + "loss": 0.5962, + "step": 835 + }, + { + "epoch": 0.39527186761229316, + "grad_norm": 3.351914644241333, + "learning_rate": 4.957957078286281e-06, + "loss": 0.7247, + "step": 836 + }, + { + "epoch": 0.39574468085106385, + "grad_norm": 2.9559543132781982, + "learning_rate": 4.957843075995431e-06, + "loss": 0.6571, + "step": 837 + }, + { + "epoch": 0.39621749408983453, + "grad_norm": 3.225785255432129, + "learning_rate": 4.95772892066547e-06, + "loss": 0.7074, + "step": 838 + }, + { + "epoch": 0.3966903073286052, + "grad_norm": 2.7842373847961426, + "learning_rate": 4.957614612303505e-06, + "loss": 0.6469, + "step": 839 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 4.249724864959717, + "learning_rate": 4.957500150916655e-06, + "loss": 0.741, + "step": 840 + }, + { + "epoch": 0.3976359338061466, + "grad_norm": 3.138221263885498, + "learning_rate": 4.957385536512046e-06, + "loss": 0.6676, + "step": 841 + }, + { + "epoch": 0.3981087470449173, + "grad_norm": 3.456423759460449, + "learning_rate": 4.957270769096816e-06, + "loss": 0.6877, + "step": 842 + }, + { + "epoch": 0.39858156028368796, + "grad_norm": 2.8676278591156006, + "learning_rate": 4.957155848678109e-06, + "loss": 0.5986, + "step": 843 + }, + { + "epoch": 0.39905437352245865, + "grad_norm": 2.705324411392212, + "learning_rate": 4.957040775263082e-06, + "loss": 0.6356, + "step": 844 + }, + { + "epoch": 0.39952718676122934, + "grad_norm": 3.0767486095428467, + "learning_rate": 4.9569255488589e-06, + "loss": 0.6844, + "step": 845 + }, + { + "epoch": 0.4, + "grad_norm": 2.7787704467773438, + "learning_rate": 4.956810169472736e-06, + "loss": 0.6641, + "step": 846 + }, + { + "epoch": 0.4004728132387707, + "grad_norm": 2.584277868270874, + "learning_rate": 4.956694637111777e-06, + "loss": 0.6256, + "step": 847 + }, + { + "epoch": 0.4009456264775414, + "grad_norm": 2.751641273498535, + "learning_rate": 4.956578951783215e-06, + "loss": 0.5954, + "step": 848 + }, + { + "epoch": 0.4014184397163121, + "grad_norm": 3.0181658267974854, + "learning_rate": 4.956463113494253e-06, + "loss": 0.6569, + "step": 849 + }, + { + "epoch": 0.40189125295508277, + "grad_norm": 3.0933220386505127, + "learning_rate": 4.956347122252104e-06, + "loss": 
0.6248, + "step": 850 + }, + { + "epoch": 0.40236406619385345, + "grad_norm": 3.3767428398132324, + "learning_rate": 4.956230978063991e-06, + "loss": 0.719, + "step": 851 + }, + { + "epoch": 0.40283687943262414, + "grad_norm": 3.7666573524475098, + "learning_rate": 4.956114680937145e-06, + "loss": 0.6467, + "step": 852 + }, + { + "epoch": 0.4033096926713948, + "grad_norm": 2.9836843013763428, + "learning_rate": 4.955998230878808e-06, + "loss": 0.6993, + "step": 853 + }, + { + "epoch": 0.4037825059101655, + "grad_norm": 2.981497049331665, + "learning_rate": 4.955881627896229e-06, + "loss": 0.6578, + "step": 854 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 3.1369056701660156, + "learning_rate": 4.955764871996672e-06, + "loss": 0.6763, + "step": 855 + }, + { + "epoch": 0.40472813238770683, + "grad_norm": 2.7675817012786865, + "learning_rate": 4.9556479631874036e-06, + "loss": 0.6488, + "step": 856 + }, + { + "epoch": 0.4052009456264775, + "grad_norm": 3.035334825515747, + "learning_rate": 4.9555309014757034e-06, + "loss": 0.7076, + "step": 857 + }, + { + "epoch": 0.4056737588652482, + "grad_norm": 3.493704319000244, + "learning_rate": 4.955413686868862e-06, + "loss": 0.6773, + "step": 858 + }, + { + "epoch": 0.4061465721040189, + "grad_norm": 3.245487928390503, + "learning_rate": 4.9552963193741765e-06, + "loss": 0.6915, + "step": 859 + }, + { + "epoch": 0.4066193853427896, + "grad_norm": 3.189969539642334, + "learning_rate": 4.955178798998956e-06, + "loss": 0.7318, + "step": 860 + }, + { + "epoch": 0.40709219858156026, + "grad_norm": 2.7987146377563477, + "learning_rate": 4.955061125750517e-06, + "loss": 0.6162, + "step": 861 + }, + { + "epoch": 0.40756501182033095, + "grad_norm": 3.020118474960327, + "learning_rate": 4.954943299636187e-06, + "loss": 0.6678, + "step": 862 + }, + { + "epoch": 0.40803782505910163, + "grad_norm": 2.715463876724243, + "learning_rate": 4.954825320663302e-06, + "loss": 0.668, + "step": 863 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 2.595050096511841, + "learning_rate": 4.9547071888392085e-06, + "loss": 0.6557, + "step": 864 + }, + { + "epoch": 0.408983451536643, + "grad_norm": 3.131596088409424, + "learning_rate": 4.954588904171261e-06, + "loss": 0.6548, + "step": 865 + }, + { + "epoch": 0.4094562647754137, + "grad_norm": 2.5742313861846924, + "learning_rate": 4.954470466666827e-06, + "loss": 0.6592, + "step": 866 + }, + { + "epoch": 0.4099290780141844, + "grad_norm": 2.8612802028656006, + "learning_rate": 4.9543518763332785e-06, + "loss": 0.5391, + "step": 867 + }, + { + "epoch": 0.41040189125295506, + "grad_norm": 2.8973186016082764, + "learning_rate": 4.954233133178001e-06, + "loss": 0.6649, + "step": 868 + }, + { + "epoch": 0.41087470449172575, + "grad_norm": 2.802525043487549, + "learning_rate": 4.954114237208388e-06, + "loss": 0.6212, + "step": 869 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 2.5919506549835205, + "learning_rate": 4.953995188431843e-06, + "loss": 0.6596, + "step": 870 + }, + { + "epoch": 0.4118203309692671, + "grad_norm": 3.139169454574585, + "learning_rate": 4.953875986855777e-06, + "loss": 0.6799, + "step": 871 + }, + { + "epoch": 0.4122931442080378, + "grad_norm": 3.99727725982666, + "learning_rate": 4.953756632487614e-06, + "loss": 0.6519, + "step": 872 + }, + { + "epoch": 0.4127659574468085, + "grad_norm": 3.238706350326538, + "learning_rate": 4.953637125334784e-06, + "loss": 0.7361, + "step": 873 + }, + { + "epoch": 0.4132387706855792, + "grad_norm": 2.780019998550415, + "learning_rate": 
4.9535174654047295e-06, + "loss": 0.6406, + "step": 874 + }, + { + "epoch": 0.41371158392434987, + "grad_norm": 2.7629551887512207, + "learning_rate": 4.953397652704901e-06, + "loss": 0.6131, + "step": 875 + }, + { + "epoch": 0.41418439716312055, + "grad_norm": 2.8008246421813965, + "learning_rate": 4.9532776872427585e-06, + "loss": 0.6464, + "step": 876 + }, + { + "epoch": 0.41465721040189124, + "grad_norm": 3.0970115661621094, + "learning_rate": 4.953157569025772e-06, + "loss": 0.7066, + "step": 877 + }, + { + "epoch": 0.4151300236406619, + "grad_norm": 2.8375589847564697, + "learning_rate": 4.9530372980614195e-06, + "loss": 0.6551, + "step": 878 + }, + { + "epoch": 0.4156028368794326, + "grad_norm": 2.718843936920166, + "learning_rate": 4.952916874357191e-06, + "loss": 0.5947, + "step": 879 + }, + { + "epoch": 0.4160756501182033, + "grad_norm": 2.7104697227478027, + "learning_rate": 4.952796297920585e-06, + "loss": 0.6708, + "step": 880 + }, + { + "epoch": 0.416548463356974, + "grad_norm": 2.8223445415496826, + "learning_rate": 4.952675568759108e-06, + "loss": 0.6214, + "step": 881 + }, + { + "epoch": 0.41702127659574467, + "grad_norm": 2.6598153114318848, + "learning_rate": 4.952554686880279e-06, + "loss": 0.6116, + "step": 882 + }, + { + "epoch": 0.41749408983451536, + "grad_norm": 2.8639824390411377, + "learning_rate": 4.952433652291623e-06, + "loss": 0.5971, + "step": 883 + }, + { + "epoch": 0.41796690307328604, + "grad_norm": 2.9578304290771484, + "learning_rate": 4.952312465000677e-06, + "loss": 0.6785, + "step": 884 + }, + { + "epoch": 0.41843971631205673, + "grad_norm": 2.872144937515259, + "learning_rate": 4.952191125014987e-06, + "loss": 0.6772, + "step": 885 + }, + { + "epoch": 0.4189125295508274, + "grad_norm": 2.7513675689697266, + "learning_rate": 4.952069632342108e-06, + "loss": 0.702, + "step": 886 + }, + { + "epoch": 0.4193853427895981, + "grad_norm": 2.9275078773498535, + "learning_rate": 4.951947986989606e-06, + "loss": 0.589, + "step": 887 + }, + { + "epoch": 0.4198581560283688, + "grad_norm": 2.740549325942993, + "learning_rate": 4.951826188965053e-06, + "loss": 0.5942, + "step": 888 + }, + { + "epoch": 0.4203309692671395, + "grad_norm": 2.92452073097229, + "learning_rate": 4.951704238276035e-06, + "loss": 0.6819, + "step": 889 + }, + { + "epoch": 0.42080378250591016, + "grad_norm": 2.842491865158081, + "learning_rate": 4.951582134930144e-06, + "loss": 0.6304, + "step": 890 + }, + { + "epoch": 0.42127659574468085, + "grad_norm": 2.613478422164917, + "learning_rate": 4.951459878934983e-06, + "loss": 0.6912, + "step": 891 + }, + { + "epoch": 0.42174940898345153, + "grad_norm": 3.2408607006073, + "learning_rate": 4.951337470298165e-06, + "loss": 0.6755, + "step": 892 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 3.1022439002990723, + "learning_rate": 4.9512149090273125e-06, + "loss": 0.6138, + "step": 893 + }, + { + "epoch": 0.4226950354609929, + "grad_norm": 2.6418895721435547, + "learning_rate": 4.951092195130055e-06, + "loss": 0.639, + "step": 894 + }, + { + "epoch": 0.4231678486997636, + "grad_norm": 3.010744333267212, + "learning_rate": 4.950969328614035e-06, + "loss": 0.7102, + "step": 895 + }, + { + "epoch": 0.4236406619385343, + "grad_norm": 2.673292636871338, + "learning_rate": 4.950846309486901e-06, + "loss": 0.5676, + "step": 896 + }, + { + "epoch": 0.42411347517730497, + "grad_norm": 3.6974737644195557, + "learning_rate": 4.950723137756314e-06, + "loss": 0.5722, + "step": 897 + }, + { + "epoch": 0.42458628841607565, + "grad_norm": 3.69028902053833, 
+ "learning_rate": 4.9505998134299435e-06, + "loss": 0.6337, + "step": 898 + }, + { + "epoch": 0.42505910165484634, + "grad_norm": 3.2136125564575195, + "learning_rate": 4.950476336515469e-06, + "loss": 0.6469, + "step": 899 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.7396016120910645, + "learning_rate": 4.950352707020577e-06, + "loss": 0.6656, + "step": 900 + }, + { + "epoch": 0.4260047281323877, + "grad_norm": 2.825416088104248, + "learning_rate": 4.950228924952967e-06, + "loss": 0.6298, + "step": 901 + }, + { + "epoch": 0.4264775413711584, + "grad_norm": 3.401658535003662, + "learning_rate": 4.950104990320345e-06, + "loss": 0.778, + "step": 902 + }, + { + "epoch": 0.4269503546099291, + "grad_norm": 2.7002272605895996, + "learning_rate": 4.9499809031304294e-06, + "loss": 0.6536, + "step": 903 + }, + { + "epoch": 0.42742316784869977, + "grad_norm": 2.62386417388916, + "learning_rate": 4.949856663390945e-06, + "loss": 0.6629, + "step": 904 + }, + { + "epoch": 0.42789598108747046, + "grad_norm": 2.584247589111328, + "learning_rate": 4.94973227110963e-06, + "loss": 0.5813, + "step": 905 + }, + { + "epoch": 0.42836879432624114, + "grad_norm": 3.4365768432617188, + "learning_rate": 4.9496077262942265e-06, + "loss": 0.7648, + "step": 906 + }, + { + "epoch": 0.42884160756501183, + "grad_norm": 2.8993639945983887, + "learning_rate": 4.949483028952492e-06, + "loss": 0.6696, + "step": 907 + }, + { + "epoch": 0.4293144208037825, + "grad_norm": 2.922809362411499, + "learning_rate": 4.94935817909219e-06, + "loss": 0.6892, + "step": 908 + }, + { + "epoch": 0.4297872340425532, + "grad_norm": 2.85478138923645, + "learning_rate": 4.9492331767210944e-06, + "loss": 0.536, + "step": 909 + }, + { + "epoch": 0.4302600472813239, + "grad_norm": 2.8639259338378906, + "learning_rate": 4.949108021846988e-06, + "loss": 0.634, + "step": 910 + }, + { + "epoch": 0.4307328605200946, + "grad_norm": 3.0533697605133057, + "learning_rate": 4.948982714477664e-06, + "loss": 0.6318, + "step": 911 + }, + { + "epoch": 0.43120567375886526, + "grad_norm": 2.331674814224243, + "learning_rate": 4.9488572546209255e-06, + "loss": 0.6562, + "step": 912 + }, + { + "epoch": 0.43167848699763595, + "grad_norm": 3.0154623985290527, + "learning_rate": 4.9487316422845835e-06, + "loss": 0.6675, + "step": 913 + }, + { + "epoch": 0.43215130023640663, + "grad_norm": 2.7354514598846436, + "learning_rate": 4.948605877476459e-06, + "loss": 0.6012, + "step": 914 + }, + { + "epoch": 0.4326241134751773, + "grad_norm": 2.863736629486084, + "learning_rate": 4.948479960204383e-06, + "loss": 0.6062, + "step": 915 + }, + { + "epoch": 0.433096926713948, + "grad_norm": 3.01998233795166, + "learning_rate": 4.948353890476197e-06, + "loss": 0.6749, + "step": 916 + }, + { + "epoch": 0.4335697399527187, + "grad_norm": 2.7550456523895264, + "learning_rate": 4.94822766829975e-06, + "loss": 0.6507, + "step": 917 + }, + { + "epoch": 0.4340425531914894, + "grad_norm": 3.370572805404663, + "learning_rate": 4.948101293682901e-06, + "loss": 0.714, + "step": 918 + }, + { + "epoch": 0.43451536643026006, + "grad_norm": 2.9736790657043457, + "learning_rate": 4.947974766633519e-06, + "loss": 0.729, + "step": 919 + }, + { + "epoch": 0.43498817966903075, + "grad_norm": 3.1036548614501953, + "learning_rate": 4.947848087159483e-06, + "loss": 0.7547, + "step": 920 + }, + { + "epoch": 0.43546099290780144, + "grad_norm": 2.895094871520996, + "learning_rate": 4.947721255268679e-06, + "loss": 0.6089, + "step": 921 + }, + { + "epoch": 0.4359338061465721, + "grad_norm": 
2.798476219177246, + "learning_rate": 4.947594270969005e-06, + "loss": 0.5432, + "step": 922 + }, + { + "epoch": 0.4364066193853428, + "grad_norm": 2.7675702571868896, + "learning_rate": 4.94746713426837e-06, + "loss": 0.5693, + "step": 923 + }, + { + "epoch": 0.4368794326241135, + "grad_norm": 2.6851553916931152, + "learning_rate": 4.947339845174687e-06, + "loss": 0.6503, + "step": 924 + }, + { + "epoch": 0.4373522458628842, + "grad_norm": 2.909635543823242, + "learning_rate": 4.947212403695883e-06, + "loss": 0.6494, + "step": 925 + }, + { + "epoch": 0.43782505910165487, + "grad_norm": 2.604526996612549, + "learning_rate": 4.947084809839894e-06, + "loss": 0.6349, + "step": 926 + }, + { + "epoch": 0.43829787234042555, + "grad_norm": 3.118149518966675, + "learning_rate": 4.946957063614664e-06, + "loss": 0.6219, + "step": 927 + }, + { + "epoch": 0.43877068557919624, + "grad_norm": 2.7452616691589355, + "learning_rate": 4.9468291650281465e-06, + "loss": 0.6096, + "step": 928 + }, + { + "epoch": 0.4392434988179669, + "grad_norm": 3.30098819732666, + "learning_rate": 4.946701114088307e-06, + "loss": 0.6277, + "step": 929 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 2.789482593536377, + "learning_rate": 4.946572910803116e-06, + "loss": 0.7, + "step": 930 + }, + { + "epoch": 0.4401891252955083, + "grad_norm": 2.7283935546875, + "learning_rate": 4.946444555180559e-06, + "loss": 0.5375, + "step": 931 + }, + { + "epoch": 0.440661938534279, + "grad_norm": 3.101304054260254, + "learning_rate": 4.946316047228627e-06, + "loss": 0.6131, + "step": 932 + }, + { + "epoch": 0.44113475177304967, + "grad_norm": 3.573908805847168, + "learning_rate": 4.946187386955321e-06, + "loss": 0.7073, + "step": 933 + }, + { + "epoch": 0.44160756501182036, + "grad_norm": 3.214979648590088, + "learning_rate": 4.946058574368653e-06, + "loss": 0.6508, + "step": 934 + }, + { + "epoch": 0.44208037825059104, + "grad_norm": 3.145082712173462, + "learning_rate": 4.945929609476643e-06, + "loss": 0.64, + "step": 935 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 2.991780996322632, + "learning_rate": 4.945800492287321e-06, + "loss": 0.6315, + "step": 936 + }, + { + "epoch": 0.44302600472813236, + "grad_norm": 3.2441139221191406, + "learning_rate": 4.945671222808727e-06, + "loss": 0.7144, + "step": 937 + }, + { + "epoch": 0.44349881796690305, + "grad_norm": 2.9397029876708984, + "learning_rate": 4.94554180104891e-06, + "loss": 0.6818, + "step": 938 + }, + { + "epoch": 0.44397163120567373, + "grad_norm": 3.2471461296081543, + "learning_rate": 4.945412227015929e-06, + "loss": 0.6921, + "step": 939 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 3.0882487297058105, + "learning_rate": 4.945282500717851e-06, + "loss": 0.718, + "step": 940 + }, + { + "epoch": 0.4449172576832151, + "grad_norm": 2.6035783290863037, + "learning_rate": 4.945152622162753e-06, + "loss": 0.621, + "step": 941 + }, + { + "epoch": 0.4453900709219858, + "grad_norm": 2.83659029006958, + "learning_rate": 4.945022591358724e-06, + "loss": 0.6403, + "step": 942 + }, + { + "epoch": 0.4458628841607565, + "grad_norm": 2.824463129043579, + "learning_rate": 4.944892408313859e-06, + "loss": 0.6594, + "step": 943 + }, + { + "epoch": 0.44633569739952716, + "grad_norm": 2.753735065460205, + "learning_rate": 4.9447620730362645e-06, + "loss": 0.6116, + "step": 944 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 3.0659725666046143, + "learning_rate": 4.944631585534056e-06, + "loss": 0.5983, + "step": 945 + }, + { + "epoch": 0.44728132387706854, + 
"grad_norm": 2.969113349914551, + "learning_rate": 4.944500945815357e-06, + "loss": 0.6859, + "step": 946 + }, + { + "epoch": 0.4477541371158392, + "grad_norm": 2.810303211212158, + "learning_rate": 4.944370153888303e-06, + "loss": 0.7025, + "step": 947 + }, + { + "epoch": 0.4482269503546099, + "grad_norm": 3.027721643447876, + "learning_rate": 4.944239209761038e-06, + "loss": 0.7268, + "step": 948 + }, + { + "epoch": 0.4486997635933806, + "grad_norm": 2.661503314971924, + "learning_rate": 4.944108113441716e-06, + "loss": 0.6702, + "step": 949 + }, + { + "epoch": 0.4491725768321513, + "grad_norm": 2.738591432571411, + "learning_rate": 4.943976864938498e-06, + "loss": 0.6728, + "step": 950 + }, + { + "epoch": 0.44964539007092197, + "grad_norm": 3.447505474090576, + "learning_rate": 4.943845464259557e-06, + "loss": 0.6586, + "step": 951 + }, + { + "epoch": 0.45011820330969265, + "grad_norm": 3.0968854427337646, + "learning_rate": 4.943713911413075e-06, + "loss": 0.7666, + "step": 952 + }, + { + "epoch": 0.45059101654846334, + "grad_norm": 2.4113779067993164, + "learning_rate": 4.943582206407244e-06, + "loss": 0.6173, + "step": 953 + }, + { + "epoch": 0.451063829787234, + "grad_norm": 2.6357979774475098, + "learning_rate": 4.943450349250263e-06, + "loss": 0.5589, + "step": 954 + }, + { + "epoch": 0.4515366430260047, + "grad_norm": 2.9182233810424805, + "learning_rate": 4.9433183399503425e-06, + "loss": 0.6252, + "step": 955 + }, + { + "epoch": 0.4520094562647754, + "grad_norm": 2.832740306854248, + "learning_rate": 4.943186178515703e-06, + "loss": 0.6882, + "step": 956 + }, + { + "epoch": 0.4524822695035461, + "grad_norm": 2.9508981704711914, + "learning_rate": 4.943053864954574e-06, + "loss": 0.5722, + "step": 957 + }, + { + "epoch": 0.4529550827423168, + "grad_norm": 3.044729471206665, + "learning_rate": 4.9429213992751925e-06, + "loss": 0.6772, + "step": 958 + }, + { + "epoch": 0.45342789598108746, + "grad_norm": 2.606003522872925, + "learning_rate": 4.9427887814858075e-06, + "loss": 0.6445, + "step": 959 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 2.4634225368499756, + "learning_rate": 4.942656011594676e-06, + "loss": 0.6151, + "step": 960 + }, + { + "epoch": 0.45437352245862883, + "grad_norm": 2.8872334957122803, + "learning_rate": 4.942523089610066e-06, + "loss": 0.6255, + "step": 961 + }, + { + "epoch": 0.4548463356973995, + "grad_norm": 2.870605707168579, + "learning_rate": 4.942390015540253e-06, + "loss": 0.7481, + "step": 962 + }, + { + "epoch": 0.4553191489361702, + "grad_norm": 2.952680826187134, + "learning_rate": 4.942256789393524e-06, + "loss": 0.5556, + "step": 963 + }, + { + "epoch": 0.4557919621749409, + "grad_norm": 2.623680353164673, + "learning_rate": 4.9421234111781725e-06, + "loss": 0.6115, + "step": 964 + }, + { + "epoch": 0.4562647754137116, + "grad_norm": 2.6933600902557373, + "learning_rate": 4.941989880902505e-06, + "loss": 0.6102, + "step": 965 + }, + { + "epoch": 0.45673758865248226, + "grad_norm": 2.6047189235687256, + "learning_rate": 4.941856198574836e-06, + "loss": 0.612, + "step": 966 + }, + { + "epoch": 0.45721040189125295, + "grad_norm": 2.779186725616455, + "learning_rate": 4.9417223642034885e-06, + "loss": 0.5424, + "step": 967 + }, + { + "epoch": 0.45768321513002364, + "grad_norm": 2.6177165508270264, + "learning_rate": 4.941588377796795e-06, + "loss": 0.4661, + "step": 968 + }, + { + "epoch": 0.4581560283687943, + "grad_norm": 2.959676742553711, + "learning_rate": 4.941454239363101e-06, + "loss": 0.6966, + "step": 969 + }, + { + "epoch": 
0.458628841607565, + "grad_norm": 2.9788379669189453, + "learning_rate": 4.941319948910756e-06, + "loss": 0.6181, + "step": 970 + }, + { + "epoch": 0.4591016548463357, + "grad_norm": 4.642750263214111, + "learning_rate": 4.941185506448122e-06, + "loss": 0.5602, + "step": 971 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 2.793002128601074, + "learning_rate": 4.941050911983572e-06, + "loss": 0.602, + "step": 972 + }, + { + "epoch": 0.46004728132387707, + "grad_norm": 2.6833035945892334, + "learning_rate": 4.9409161655254845e-06, + "loss": 0.5549, + "step": 973 + }, + { + "epoch": 0.46052009456264775, + "grad_norm": 3.905032157897949, + "learning_rate": 4.94078126708225e-06, + "loss": 0.6335, + "step": 974 + }, + { + "epoch": 0.46099290780141844, + "grad_norm": 2.922609329223633, + "learning_rate": 4.94064621666227e-06, + "loss": 0.5839, + "step": 975 + }, + { + "epoch": 0.4614657210401891, + "grad_norm": 2.8277416229248047, + "learning_rate": 4.940511014273952e-06, + "loss": 0.629, + "step": 976 + }, + { + "epoch": 0.4619385342789598, + "grad_norm": 3.07511043548584, + "learning_rate": 4.940375659925714e-06, + "loss": 0.7058, + "step": 977 + }, + { + "epoch": 0.4624113475177305, + "grad_norm": 3.65043044090271, + "learning_rate": 4.940240153625984e-06, + "loss": 0.7174, + "step": 978 + }, + { + "epoch": 0.4628841607565012, + "grad_norm": 2.755167245864868, + "learning_rate": 4.9401044953832e-06, + "loss": 0.6548, + "step": 979 + }, + { + "epoch": 0.46335697399527187, + "grad_norm": 2.9881057739257812, + "learning_rate": 4.939968685205808e-06, + "loss": 0.6245, + "step": 980 + }, + { + "epoch": 0.46382978723404256, + "grad_norm": 2.9484212398529053, + "learning_rate": 4.939832723102266e-06, + "loss": 0.655, + "step": 981 + }, + { + "epoch": 0.46430260047281324, + "grad_norm": 2.898918628692627, + "learning_rate": 4.939696609081038e-06, + "loss": 0.6178, + "step": 982 + }, + { + "epoch": 0.46477541371158393, + "grad_norm": 2.7052435874938965, + "learning_rate": 4.9395603431506e-06, + "loss": 0.6393, + "step": 983 + }, + { + "epoch": 0.4652482269503546, + "grad_norm": 2.5610013008117676, + "learning_rate": 4.939423925319436e-06, + "loss": 0.4847, + "step": 984 + }, + { + "epoch": 0.4657210401891253, + "grad_norm": 3.229083299636841, + "learning_rate": 4.939287355596042e-06, + "loss": 0.6473, + "step": 985 + }, + { + "epoch": 0.466193853427896, + "grad_norm": 2.907097816467285, + "learning_rate": 4.9391506339889195e-06, + "loss": 0.652, + "step": 986 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 2.6929478645324707, + "learning_rate": 4.939013760506582e-06, + "loss": 0.6175, + "step": 987 + }, + { + "epoch": 0.46713947990543736, + "grad_norm": 3.414813280105591, + "learning_rate": 4.938876735157554e-06, + "loss": 0.7597, + "step": 988 + }, + { + "epoch": 0.46761229314420805, + "grad_norm": 3.297360420227051, + "learning_rate": 4.938739557950365e-06, + "loss": 0.6824, + "step": 989 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 3.083155393600464, + "learning_rate": 4.938602228893557e-06, + "loss": 0.6505, + "step": 990 + }, + { + "epoch": 0.4685579196217494, + "grad_norm": 2.9781153202056885, + "learning_rate": 4.938464747995681e-06, + "loss": 0.666, + "step": 991 + }, + { + "epoch": 0.4690307328605201, + "grad_norm": 3.1494534015655518, + "learning_rate": 4.9383271152652975e-06, + "loss": 0.6422, + "step": 992 + }, + { + "epoch": 0.4695035460992908, + "grad_norm": 2.547868490219116, + "learning_rate": 4.938189330710976e-06, + "loss": 0.5766, + "step": 993 + }, + { + 
"epoch": 0.4699763593380615, + "grad_norm": 2.684736967086792, + "learning_rate": 4.938051394341297e-06, + "loss": 0.6407, + "step": 994 + }, + { + "epoch": 0.47044917257683216, + "grad_norm": 2.9619693756103516, + "learning_rate": 4.937913306164847e-06, + "loss": 0.6936, + "step": 995 + }, + { + "epoch": 0.47092198581560285, + "grad_norm": 2.9698498249053955, + "learning_rate": 4.937775066190227e-06, + "loss": 0.6464, + "step": 996 + }, + { + "epoch": 0.47139479905437354, + "grad_norm": 3.121049642562866, + "learning_rate": 4.937636674426042e-06, + "loss": 0.6383, + "step": 997 + }, + { + "epoch": 0.4718676122931442, + "grad_norm": 3.113672971725464, + "learning_rate": 4.93749813088091e-06, + "loss": 0.6892, + "step": 998 + }, + { + "epoch": 0.4723404255319149, + "grad_norm": 3.126113176345825, + "learning_rate": 4.937359435563458e-06, + "loss": 0.6728, + "step": 999 + }, + { + "epoch": 0.4728132387706856, + "grad_norm": 3.353966236114502, + "learning_rate": 4.937220588482321e-06, + "loss": 0.6041, + "step": 1000 + }, + { + "epoch": 0.4732860520094563, + "grad_norm": 2.8860628604888916, + "learning_rate": 4.937081589646144e-06, + "loss": 0.6798, + "step": 1001 + }, + { + "epoch": 0.47375886524822697, + "grad_norm": 3.0510590076446533, + "learning_rate": 4.936942439063584e-06, + "loss": 0.5841, + "step": 1002 + }, + { + "epoch": 0.47423167848699765, + "grad_norm": 2.6998369693756104, + "learning_rate": 4.936803136743303e-06, + "loss": 0.6403, + "step": 1003 + }, + { + "epoch": 0.47470449172576834, + "grad_norm": 2.875347137451172, + "learning_rate": 4.9366636826939765e-06, + "loss": 0.5811, + "step": 1004 + }, + { + "epoch": 0.475177304964539, + "grad_norm": 2.9122262001037598, + "learning_rate": 4.936524076924287e-06, + "loss": 0.6852, + "step": 1005 + }, + { + "epoch": 0.4756501182033097, + "grad_norm": 2.5167057514190674, + "learning_rate": 4.9363843194429265e-06, + "loss": 0.5367, + "step": 1006 + }, + { + "epoch": 0.4761229314420804, + "grad_norm": 2.5745551586151123, + "learning_rate": 4.9362444102585985e-06, + "loss": 0.6241, + "step": 1007 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 2.5024216175079346, + "learning_rate": 4.9361043493800125e-06, + "loss": 0.6133, + "step": 1008 + }, + { + "epoch": 0.47706855791962177, + "grad_norm": 2.7281384468078613, + "learning_rate": 4.935964136815892e-06, + "loss": 0.6834, + "step": 1009 + }, + { + "epoch": 0.47754137115839246, + "grad_norm": 3.0118913650512695, + "learning_rate": 4.935823772574965e-06, + "loss": 0.6922, + "step": 1010 + }, + { + "epoch": 0.47801418439716314, + "grad_norm": 3.016216993331909, + "learning_rate": 4.935683256665973e-06, + "loss": 0.6653, + "step": 1011 + }, + { + "epoch": 0.47848699763593383, + "grad_norm": 2.9526784420013428, + "learning_rate": 4.9355425890976636e-06, + "loss": 0.6423, + "step": 1012 + }, + { + "epoch": 0.4789598108747045, + "grad_norm": 6.222797393798828, + "learning_rate": 4.9354017698787985e-06, + "loss": 0.5884, + "step": 1013 + }, + { + "epoch": 0.4794326241134752, + "grad_norm": 2.6553597450256348, + "learning_rate": 4.935260799018143e-06, + "loss": 0.6624, + "step": 1014 + }, + { + "epoch": 0.4799054373522459, + "grad_norm": 3.0942065715789795, + "learning_rate": 4.935119676524475e-06, + "loss": 0.6623, + "step": 1015 + }, + { + "epoch": 0.4803782505910166, + "grad_norm": 2.626359224319458, + "learning_rate": 4.934978402406585e-06, + "loss": 0.6195, + "step": 1016 + }, + { + "epoch": 0.4808510638297872, + "grad_norm": 2.7954699993133545, + "learning_rate": 4.934836976673265e-06, 
+ "loss": 0.5545, + "step": 1017 + }, + { + "epoch": 0.4813238770685579, + "grad_norm": 2.913557291030884, + "learning_rate": 4.934695399333324e-06, + "loss": 0.6288, + "step": 1018 + }, + { + "epoch": 0.4817966903073286, + "grad_norm": 3.1043739318847656, + "learning_rate": 4.9345536703955746e-06, + "loss": 0.6771, + "step": 1019 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 2.789357900619507, + "learning_rate": 4.934411789868845e-06, + "loss": 0.6227, + "step": 1020 + }, + { + "epoch": 0.48274231678486995, + "grad_norm": 2.480609655380249, + "learning_rate": 4.934269757761967e-06, + "loss": 0.5779, + "step": 1021 + }, + { + "epoch": 0.48321513002364064, + "grad_norm": 2.7946252822875977, + "learning_rate": 4.934127574083785e-06, + "loss": 0.6166, + "step": 1022 + }, + { + "epoch": 0.4836879432624113, + "grad_norm": 3.0670509338378906, + "learning_rate": 4.933985238843153e-06, + "loss": 0.7766, + "step": 1023 + }, + { + "epoch": 0.484160756501182, + "grad_norm": 2.8567559719085693, + "learning_rate": 4.933842752048932e-06, + "loss": 0.5088, + "step": 1024 + }, + { + "epoch": 0.4846335697399527, + "grad_norm": 2.5674657821655273, + "learning_rate": 4.933700113709996e-06, + "loss": 0.6036, + "step": 1025 + }, + { + "epoch": 0.4851063829787234, + "grad_norm": 2.782339096069336, + "learning_rate": 4.933557323835224e-06, + "loss": 0.5335, + "step": 1026 + }, + { + "epoch": 0.48557919621749407, + "grad_norm": 2.6334071159362793, + "learning_rate": 4.93341438243351e-06, + "loss": 0.6327, + "step": 1027 + }, + { + "epoch": 0.48605200945626476, + "grad_norm": 3.0853965282440186, + "learning_rate": 4.933271289513751e-06, + "loss": 0.7102, + "step": 1028 + }, + { + "epoch": 0.48652482269503544, + "grad_norm": 2.619997501373291, + "learning_rate": 4.933128045084859e-06, + "loss": 0.6138, + "step": 1029 + }, + { + "epoch": 0.48699763593380613, + "grad_norm": 2.8316116333007812, + "learning_rate": 4.932984649155753e-06, + "loss": 0.6346, + "step": 1030 + }, + { + "epoch": 0.4874704491725768, + "grad_norm": 3.153486490249634, + "learning_rate": 4.932841101735361e-06, + "loss": 0.7626, + "step": 1031 + }, + { + "epoch": 0.4879432624113475, + "grad_norm": 3.1831274032592773, + "learning_rate": 4.9326974028326214e-06, + "loss": 0.6607, + "step": 1032 + }, + { + "epoch": 0.4884160756501182, + "grad_norm": 2.791078567504883, + "learning_rate": 4.932553552456481e-06, + "loss": 0.6141, + "step": 1033 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 2.627263307571411, + "learning_rate": 4.932409550615898e-06, + "loss": 0.6777, + "step": 1034 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.8550007343292236, + "learning_rate": 4.932265397319838e-06, + "loss": 0.6379, + "step": 1035 + }, + { + "epoch": 0.48983451536643025, + "grad_norm": 4.505824089050293, + "learning_rate": 4.932121092577276e-06, + "loss": 0.5892, + "step": 1036 + }, + { + "epoch": 0.49030732860520093, + "grad_norm": 3.100191116333008, + "learning_rate": 4.931976636397199e-06, + "loss": 0.6443, + "step": 1037 + }, + { + "epoch": 0.4907801418439716, + "grad_norm": 2.921494245529175, + "learning_rate": 4.9318320287886e-06, + "loss": 0.6821, + "step": 1038 + }, + { + "epoch": 0.4912529550827423, + "grad_norm": 4.577807903289795, + "learning_rate": 4.931687269760485e-06, + "loss": 0.5946, + "step": 1039 + }, + { + "epoch": 0.491725768321513, + "grad_norm": 2.7347636222839355, + "learning_rate": 4.931542359321865e-06, + "loss": 0.5689, + "step": 1040 + }, + { + "epoch": 0.4921985815602837, + "grad_norm": 2.5289158821105957, + 
"learning_rate": 4.931397297481765e-06, + "loss": 0.5632, + "step": 1041 + }, + { + "epoch": 0.49267139479905436, + "grad_norm": 3.3518471717834473, + "learning_rate": 4.9312520842492165e-06, + "loss": 0.6349, + "step": 1042 + }, + { + "epoch": 0.49314420803782505, + "grad_norm": 3.0469748973846436, + "learning_rate": 4.931106719633261e-06, + "loss": 0.5734, + "step": 1043 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 3.104682445526123, + "learning_rate": 4.930961203642951e-06, + "loss": 0.6101, + "step": 1044 + }, + { + "epoch": 0.4940898345153664, + "grad_norm": 2.776705503463745, + "learning_rate": 4.930815536287346e-06, + "loss": 0.6397, + "step": 1045 + }, + { + "epoch": 0.4945626477541371, + "grad_norm": 2.760380983352661, + "learning_rate": 4.930669717575516e-06, + "loss": 0.668, + "step": 1046 + }, + { + "epoch": 0.4950354609929078, + "grad_norm": 2.70084547996521, + "learning_rate": 4.930523747516541e-06, + "loss": 0.5729, + "step": 1047 + }, + { + "epoch": 0.4955082742316785, + "grad_norm": 2.7319583892822266, + "learning_rate": 4.930377626119511e-06, + "loss": 0.6258, + "step": 1048 + }, + { + "epoch": 0.49598108747044917, + "grad_norm": 3.2515223026275635, + "learning_rate": 4.930231353393521e-06, + "loss": 0.7412, + "step": 1049 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 3.0646486282348633, + "learning_rate": 4.930084929347682e-06, + "loss": 0.5809, + "step": 1050 + }, + { + "epoch": 0.49692671394799054, + "grad_norm": 3.1621921062469482, + "learning_rate": 4.9299383539911096e-06, + "loss": 0.6282, + "step": 1051 + }, + { + "epoch": 0.4973995271867612, + "grad_norm": 2.864713191986084, + "learning_rate": 4.929791627332931e-06, + "loss": 0.6263, + "step": 1052 + }, + { + "epoch": 0.4978723404255319, + "grad_norm": 3.181016683578491, + "learning_rate": 4.929644749382283e-06, + "loss": 0.5697, + "step": 1053 + }, + { + "epoch": 0.4983451536643026, + "grad_norm": 2.9064836502075195, + "learning_rate": 4.929497720148309e-06, + "loss": 0.6161, + "step": 1054 + }, + { + "epoch": 0.4988179669030733, + "grad_norm": 3.058112859725952, + "learning_rate": 4.9293505396401655e-06, + "loss": 0.6477, + "step": 1055 + }, + { + "epoch": 0.49929078014184397, + "grad_norm": 2.5227596759796143, + "learning_rate": 4.929203207867016e-06, + "loss": 0.5819, + "step": 1056 + }, + { + "epoch": 0.49976359338061466, + "grad_norm": 3.386862277984619, + "learning_rate": 4.929055724838035e-06, + "loss": 0.7342, + "step": 1057 + }, + { + "epoch": 0.5002364066193853, + "grad_norm": 3.368346929550171, + "learning_rate": 4.928908090562404e-06, + "loss": 0.6622, + "step": 1058 + }, + { + "epoch": 0.500709219858156, + "grad_norm": 2.9108314514160156, + "learning_rate": 4.928760305049317e-06, + "loss": 0.6598, + "step": 1059 + }, + { + "epoch": 0.5011820330969267, + "grad_norm": 2.822305917739868, + "learning_rate": 4.928612368307977e-06, + "loss": 0.5841, + "step": 1060 + }, + { + "epoch": 0.5016548463356973, + "grad_norm": 2.689131259918213, + "learning_rate": 4.928464280347592e-06, + "loss": 0.6631, + "step": 1061 + }, + { + "epoch": 0.502127659574468, + "grad_norm": 3.337214946746826, + "learning_rate": 4.9283160411773864e-06, + "loss": 0.6105, + "step": 1062 + }, + { + "epoch": 0.5026004728132387, + "grad_norm": 3.035911798477173, + "learning_rate": 4.928167650806588e-06, + "loss": 0.6981, + "step": 1063 + }, + { + "epoch": 0.5030732860520094, + "grad_norm": 2.8820855617523193, + "learning_rate": 4.9280191092444375e-06, + "loss": 0.6408, + "step": 1064 + }, + { + "epoch": 
0.5035460992907801, + "grad_norm": 3.080432415008545, + "learning_rate": 4.927870416500183e-06, + "loss": 0.6398, + "step": 1065 + }, + { + "epoch": 0.5040189125295508, + "grad_norm": 2.761612892150879, + "learning_rate": 4.927721572583084e-06, + "loss": 0.6126, + "step": 1066 + }, + { + "epoch": 0.5044917257683215, + "grad_norm": 2.8561882972717285, + "learning_rate": 4.927572577502408e-06, + "loss": 0.584, + "step": 1067 + }, + { + "epoch": 0.5049645390070922, + "grad_norm": 3.3386311531066895, + "learning_rate": 4.927423431267432e-06, + "loss": 0.6666, + "step": 1068 + }, + { + "epoch": 0.5054373522458628, + "grad_norm": 2.632906675338745, + "learning_rate": 4.927274133887443e-06, + "loss": 0.632, + "step": 1069 + }, + { + "epoch": 0.5059101654846335, + "grad_norm": 2.8737308979034424, + "learning_rate": 4.927124685371737e-06, + "loss": 0.6051, + "step": 1070 + }, + { + "epoch": 0.5063829787234042, + "grad_norm": 3.042222738265991, + "learning_rate": 4.926975085729619e-06, + "loss": 0.6954, + "step": 1071 + }, + { + "epoch": 0.5068557919621749, + "grad_norm": 3.3341481685638428, + "learning_rate": 4.926825334970404e-06, + "loss": 0.7148, + "step": 1072 + }, + { + "epoch": 0.5073286052009456, + "grad_norm": 2.7415387630462646, + "learning_rate": 4.926675433103418e-06, + "loss": 0.5456, + "step": 1073 + }, + { + "epoch": 0.5078014184397163, + "grad_norm": 2.7545325756073, + "learning_rate": 4.926525380137993e-06, + "loss": 0.6213, + "step": 1074 + }, + { + "epoch": 0.508274231678487, + "grad_norm": 2.9153690338134766, + "learning_rate": 4.926375176083472e-06, + "loss": 0.6466, + "step": 1075 + }, + { + "epoch": 0.5087470449172576, + "grad_norm": 4.210638523101807, + "learning_rate": 4.926224820949209e-06, + "loss": 0.6192, + "step": 1076 + }, + { + "epoch": 0.5092198581560283, + "grad_norm": 2.4357898235321045, + "learning_rate": 4.926074314744565e-06, + "loss": 0.594, + "step": 1077 + }, + { + "epoch": 0.509692671394799, + "grad_norm": 2.8004701137542725, + "learning_rate": 4.92592365747891e-06, + "loss": 0.6276, + "step": 1078 + }, + { + "epoch": 0.5101654846335697, + "grad_norm": 2.920675039291382, + "learning_rate": 4.925772849161628e-06, + "loss": 0.6043, + "step": 1079 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.791555404663086, + "learning_rate": 4.9256218898021055e-06, + "loss": 0.6837, + "step": 1080 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 3.1702463626861572, + "learning_rate": 4.925470779409746e-06, + "loss": 0.668, + "step": 1081 + }, + { + "epoch": 0.5115839243498818, + "grad_norm": 2.7149479389190674, + "learning_rate": 4.925319517993955e-06, + "loss": 0.5842, + "step": 1082 + }, + { + "epoch": 0.5120567375886524, + "grad_norm": 2.916311025619507, + "learning_rate": 4.925168105564153e-06, + "loss": 0.6893, + "step": 1083 + }, + { + "epoch": 0.5125295508274231, + "grad_norm": 2.917654514312744, + "learning_rate": 4.925016542129767e-06, + "loss": 0.6513, + "step": 1084 + }, + { + "epoch": 0.5130023640661938, + "grad_norm": 2.5568928718566895, + "learning_rate": 4.924864827700234e-06, + "loss": 0.6177, + "step": 1085 + }, + { + "epoch": 0.5134751773049645, + "grad_norm": 2.816720485687256, + "learning_rate": 4.924712962285001e-06, + "loss": 0.5833, + "step": 1086 + }, + { + "epoch": 0.5139479905437352, + "grad_norm": 2.6989188194274902, + "learning_rate": 4.9245609458935235e-06, + "loss": 0.6332, + "step": 1087 + }, + { + "epoch": 0.5144208037825059, + "grad_norm": 2.959599494934082, + "learning_rate": 4.924408778535268e-06, + "loss": 0.626, + "step": 
1088 + }, + { + "epoch": 0.5148936170212766, + "grad_norm": 2.872814416885376, + "learning_rate": 4.924256460219708e-06, + "loss": 0.6407, + "step": 1089 + }, + { + "epoch": 0.5153664302600472, + "grad_norm": 2.6989097595214844, + "learning_rate": 4.924103990956329e-06, + "loss": 0.6391, + "step": 1090 + }, + { + "epoch": 0.5158392434988179, + "grad_norm": 2.986492156982422, + "learning_rate": 4.9239513707546235e-06, + "loss": 0.6911, + "step": 1091 + }, + { + "epoch": 0.5163120567375886, + "grad_norm": 3.069920301437378, + "learning_rate": 4.9237985996240954e-06, + "loss": 0.671, + "step": 1092 + }, + { + "epoch": 0.5167848699763593, + "grad_norm": 2.8214917182922363, + "learning_rate": 4.9236456775742555e-06, + "loss": 0.5885, + "step": 1093 + }, + { + "epoch": 0.51725768321513, + "grad_norm": 2.9416961669921875, + "learning_rate": 4.923492604614627e-06, + "loss": 0.6293, + "step": 1094 + }, + { + "epoch": 0.5177304964539007, + "grad_norm": 2.761780023574829, + "learning_rate": 4.923339380754741e-06, + "loss": 0.649, + "step": 1095 + }, + { + "epoch": 0.5182033096926714, + "grad_norm": 2.7648792266845703, + "learning_rate": 4.923186006004138e-06, + "loss": 0.5906, + "step": 1096 + }, + { + "epoch": 0.518676122931442, + "grad_norm": 3.5535428524017334, + "learning_rate": 4.923032480372367e-06, + "loss": 0.7138, + "step": 1097 + }, + { + "epoch": 0.5191489361702127, + "grad_norm": 2.6252479553222656, + "learning_rate": 4.922878803868988e-06, + "loss": 0.5499, + "step": 1098 + }, + { + "epoch": 0.5196217494089834, + "grad_norm": 2.901002883911133, + "learning_rate": 4.9227249765035715e-06, + "loss": 0.6991, + "step": 1099 + }, + { + "epoch": 0.5200945626477541, + "grad_norm": 2.621877431869507, + "learning_rate": 4.9225709982856925e-06, + "loss": 0.6269, + "step": 1100 + }, + { + "epoch": 0.5205673758865248, + "grad_norm": 2.872483015060425, + "learning_rate": 4.92241686922494e-06, + "loss": 0.6657, + "step": 1101 + }, + { + "epoch": 0.5210401891252955, + "grad_norm": 2.730447769165039, + "learning_rate": 4.922262589330912e-06, + "loss": 0.6061, + "step": 1102 + }, + { + "epoch": 0.5215130023640662, + "grad_norm": 2.646247386932373, + "learning_rate": 4.922108158613213e-06, + "loss": 0.5923, + "step": 1103 + }, + { + "epoch": 0.5219858156028369, + "grad_norm": 2.6488895416259766, + "learning_rate": 4.92195357708146e-06, + "loss": 0.6293, + "step": 1104 + }, + { + "epoch": 0.5224586288416075, + "grad_norm": 2.756338357925415, + "learning_rate": 4.921798844745278e-06, + "loss": 0.6374, + "step": 1105 + }, + { + "epoch": 0.5229314420803782, + "grad_norm": 3.1441280841827393, + "learning_rate": 4.921643961614301e-06, + "loss": 0.6652, + "step": 1106 + }, + { + "epoch": 0.5234042553191489, + "grad_norm": 3.050002098083496, + "learning_rate": 4.921488927698172e-06, + "loss": 0.6809, + "step": 1107 + }, + { + "epoch": 0.5238770685579196, + "grad_norm": 2.71750807762146, + "learning_rate": 4.921333743006547e-06, + "loss": 0.6266, + "step": 1108 + }, + { + "epoch": 0.5243498817966903, + "grad_norm": 2.8439245223999023, + "learning_rate": 4.921178407549086e-06, + "loss": 0.5663, + "step": 1109 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 3.0722241401672363, + "learning_rate": 4.921022921335464e-06, + "loss": 0.6791, + "step": 1110 + }, + { + "epoch": 0.5252955082742317, + "grad_norm": 3.4381656646728516, + "learning_rate": 4.920867284375358e-06, + "loss": 0.6687, + "step": 1111 + }, + { + "epoch": 0.5257683215130023, + "grad_norm": 2.819812774658203, + "learning_rate": 4.920711496678463e-06, 
+ "loss": 0.6299, + "step": 1112 + }, + { + "epoch": 0.526241134751773, + "grad_norm": 3.6587414741516113, + "learning_rate": 4.9205555582544765e-06, + "loss": 0.7392, + "step": 1113 + }, + { + "epoch": 0.5267139479905437, + "grad_norm": 2.774296522140503, + "learning_rate": 4.920399469113109e-06, + "loss": 0.6652, + "step": 1114 + }, + { + "epoch": 0.5271867612293144, + "grad_norm": 2.7480580806732178, + "learning_rate": 4.920243229264081e-06, + "loss": 0.596, + "step": 1115 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 3.213057518005371, + "learning_rate": 4.920086838717119e-06, + "loss": 0.6986, + "step": 1116 + }, + { + "epoch": 0.5281323877068558, + "grad_norm": 2.940546989440918, + "learning_rate": 4.919930297481962e-06, + "loss": 0.6481, + "step": 1117 + }, + { + "epoch": 0.5286052009456265, + "grad_norm": 2.5970494747161865, + "learning_rate": 4.9197736055683555e-06, + "loss": 0.5658, + "step": 1118 + }, + { + "epoch": 0.5290780141843971, + "grad_norm": 4.49385404586792, + "learning_rate": 4.919616762986057e-06, + "loss": 0.605, + "step": 1119 + }, + { + "epoch": 0.5295508274231678, + "grad_norm": 2.971857786178589, + "learning_rate": 4.919459769744833e-06, + "loss": 0.6539, + "step": 1120 + }, + { + "epoch": 0.5300236406619385, + "grad_norm": 2.6192965507507324, + "learning_rate": 4.919302625854457e-06, + "loss": 0.6226, + "step": 1121 + }, + { + "epoch": 0.5304964539007092, + "grad_norm": 2.665088176727295, + "learning_rate": 4.919145331324716e-06, + "loss": 0.6647, + "step": 1122 + }, + { + "epoch": 0.5309692671394799, + "grad_norm": 2.612126111984253, + "learning_rate": 4.918987886165403e-06, + "loss": 0.6965, + "step": 1123 + }, + { + "epoch": 0.5314420803782506, + "grad_norm": 3.80017352104187, + "learning_rate": 4.9188302903863205e-06, + "loss": 0.7396, + "step": 1124 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 2.781752824783325, + "learning_rate": 4.918672543997282e-06, + "loss": 0.5985, + "step": 1125 + }, + { + "epoch": 0.532387706855792, + "grad_norm": 2.6067914962768555, + "learning_rate": 4.91851464700811e-06, + "loss": 0.6159, + "step": 1126 + }, + { + "epoch": 0.5328605200945626, + "grad_norm": 2.670807123184204, + "learning_rate": 4.918356599428636e-06, + "loss": 0.5958, + "step": 1127 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 2.608611822128296, + "learning_rate": 4.9181984012687e-06, + "loss": 0.5768, + "step": 1128 + }, + { + "epoch": 0.533806146572104, + "grad_norm": 2.586764097213745, + "learning_rate": 4.918040052538154e-06, + "loss": 0.661, + "step": 1129 + }, + { + "epoch": 0.5342789598108747, + "grad_norm": 3.1317451000213623, + "learning_rate": 4.917881553246856e-06, + "loss": 0.6626, + "step": 1130 + }, + { + "epoch": 0.5347517730496454, + "grad_norm": 2.7135281562805176, + "learning_rate": 4.917722903404676e-06, + "loss": 0.6572, + "step": 1131 + }, + { + "epoch": 0.5352245862884161, + "grad_norm": 3.4546358585357666, + "learning_rate": 4.917564103021493e-06, + "loss": 0.5597, + "step": 1132 + }, + { + "epoch": 0.5356973995271868, + "grad_norm": 3.0943493843078613, + "learning_rate": 4.917405152107193e-06, + "loss": 0.7258, + "step": 1133 + }, + { + "epoch": 0.5361702127659574, + "grad_norm": 2.6069352626800537, + "learning_rate": 4.917246050671674e-06, + "loss": 0.6209, + "step": 1134 + }, + { + "epoch": 0.5366430260047281, + "grad_norm": 2.584883689880371, + "learning_rate": 4.917086798724844e-06, + "loss": 0.658, + "step": 1135 + }, + { + "epoch": 0.5371158392434988, + "grad_norm": 3.001976490020752, + "learning_rate": 
4.9169273962766166e-06, + "loss": 0.6306, + "step": 1136 + }, + { + "epoch": 0.5375886524822695, + "grad_norm": 2.5013928413391113, + "learning_rate": 4.916767843336918e-06, + "loss": 0.572, + "step": 1137 + }, + { + "epoch": 0.5380614657210402, + "grad_norm": 2.9114553928375244, + "learning_rate": 4.916608139915684e-06, + "loss": 0.5841, + "step": 1138 + }, + { + "epoch": 0.5385342789598109, + "grad_norm": 2.8878467082977295, + "learning_rate": 4.9164482860228564e-06, + "loss": 0.6654, + "step": 1139 + }, + { + "epoch": 0.5390070921985816, + "grad_norm": 2.9827866554260254, + "learning_rate": 4.91628828166839e-06, + "loss": 0.6674, + "step": 1140 + }, + { + "epoch": 0.5394799054373522, + "grad_norm": 3.8696281909942627, + "learning_rate": 4.916128126862248e-06, + "loss": 0.6241, + "step": 1141 + }, + { + "epoch": 0.5399527186761229, + "grad_norm": 2.9556291103363037, + "learning_rate": 4.915967821614402e-06, + "loss": 0.6478, + "step": 1142 + }, + { + "epoch": 0.5404255319148936, + "grad_norm": 2.392942428588867, + "learning_rate": 4.915807365934834e-06, + "loss": 0.6097, + "step": 1143 + }, + { + "epoch": 0.5408983451536643, + "grad_norm": 3.032235860824585, + "learning_rate": 4.915646759833534e-06, + "loss": 0.7193, + "step": 1144 + }, + { + "epoch": 0.541371158392435, + "grad_norm": 2.840416193008423, + "learning_rate": 4.915486003320501e-06, + "loss": 0.5506, + "step": 1145 + }, + { + "epoch": 0.5418439716312057, + "grad_norm": 2.5438895225524902, + "learning_rate": 4.915325096405747e-06, + "loss": 0.6487, + "step": 1146 + }, + { + "epoch": 0.5423167848699764, + "grad_norm": 2.544334650039673, + "learning_rate": 4.9151640390992905e-06, + "loss": 0.6168, + "step": 1147 + }, + { + "epoch": 0.542789598108747, + "grad_norm": 2.8535678386688232, + "learning_rate": 4.91500283141116e-06, + "loss": 0.678, + "step": 1148 + }, + { + "epoch": 0.5432624113475177, + "grad_norm": 2.8086955547332764, + "learning_rate": 4.9148414733513915e-06, + "loss": 0.6473, + "step": 1149 + }, + { + "epoch": 0.5437352245862884, + "grad_norm": 2.4709885120391846, + "learning_rate": 4.914679964930034e-06, + "loss": 0.6797, + "step": 1150 + }, + { + "epoch": 0.5442080378250591, + "grad_norm": 2.8546934127807617, + "learning_rate": 4.9145183061571435e-06, + "loss": 0.6247, + "step": 1151 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 2.991184711456299, + "learning_rate": 4.9143564970427844e-06, + "loss": 0.5977, + "step": 1152 + }, + { + "epoch": 0.5451536643026005, + "grad_norm": 3.011216402053833, + "learning_rate": 4.914194537597033e-06, + "loss": 0.7005, + "step": 1153 + }, + { + "epoch": 0.5456264775413712, + "grad_norm": 2.807521343231201, + "learning_rate": 4.9140324278299744e-06, + "loss": 0.5412, + "step": 1154 + }, + { + "epoch": 0.5460992907801419, + "grad_norm": 3.0401229858398438, + "learning_rate": 4.913870167751701e-06, + "loss": 0.6394, + "step": 1155 + }, + { + "epoch": 0.5465721040189125, + "grad_norm": 2.853914976119995, + "learning_rate": 4.913707757372317e-06, + "loss": 0.6745, + "step": 1156 + }, + { + "epoch": 0.5470449172576832, + "grad_norm": 4.505620956420898, + "learning_rate": 4.913545196701935e-06, + "loss": 0.6668, + "step": 1157 + }, + { + "epoch": 0.5475177304964539, + "grad_norm": 3.0505781173706055, + "learning_rate": 4.913382485750676e-06, + "loss": 0.6926, + "step": 1158 + }, + { + "epoch": 0.5479905437352246, + "grad_norm": 2.798435688018799, + "learning_rate": 4.913219624528672e-06, + "loss": 0.605, + "step": 1159 + }, + { + "epoch": 0.5484633569739953, + "grad_norm": 
2.7814908027648926, + "learning_rate": 4.913056613046065e-06, + "loss": 0.6678, + "step": 1160 + }, + { + "epoch": 0.548936170212766, + "grad_norm": 3.2089321613311768, + "learning_rate": 4.9128934513130025e-06, + "loss": 0.5995, + "step": 1161 + }, + { + "epoch": 0.5494089834515367, + "grad_norm": 2.7699952125549316, + "learning_rate": 4.9127301393396455e-06, + "loss": 0.7062, + "step": 1162 + }, + { + "epoch": 0.5498817966903073, + "grad_norm": 2.859368324279785, + "learning_rate": 4.912566677136162e-06, + "loss": 0.6063, + "step": 1163 + }, + { + "epoch": 0.550354609929078, + "grad_norm": 2.727334499359131, + "learning_rate": 4.91240306471273e-06, + "loss": 0.6848, + "step": 1164 + }, + { + "epoch": 0.5508274231678487, + "grad_norm": 2.6017510890960693, + "learning_rate": 4.912239302079537e-06, + "loss": 0.5808, + "step": 1165 + }, + { + "epoch": 0.5513002364066194, + "grad_norm": 3.539583206176758, + "learning_rate": 4.912075389246781e-06, + "loss": 0.7053, + "step": 1166 + }, + { + "epoch": 0.5517730496453901, + "grad_norm": 2.918280601501465, + "learning_rate": 4.911911326224666e-06, + "loss": 0.5904, + "step": 1167 + }, + { + "epoch": 0.5522458628841608, + "grad_norm": 3.0067362785339355, + "learning_rate": 4.9117471130234095e-06, + "loss": 0.6392, + "step": 1168 + }, + { + "epoch": 0.5527186761229315, + "grad_norm": 2.4374797344207764, + "learning_rate": 4.911582749653236e-06, + "loss": 0.5793, + "step": 1169 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 3.121182918548584, + "learning_rate": 4.911418236124378e-06, + "loss": 0.6636, + "step": 1170 + }, + { + "epoch": 0.5536643026004728, + "grad_norm": 3.1289851665496826, + "learning_rate": 4.91125357244708e-06, + "loss": 0.656, + "step": 1171 + }, + { + "epoch": 0.5541371158392435, + "grad_norm": 2.7034592628479004, + "learning_rate": 4.911088758631596e-06, + "loss": 0.6001, + "step": 1172 + }, + { + "epoch": 0.5546099290780142, + "grad_norm": 2.710146188735962, + "learning_rate": 4.910923794688187e-06, + "loss": 0.6007, + "step": 1173 + }, + { + "epoch": 0.5550827423167849, + "grad_norm": 2.5424487590789795, + "learning_rate": 4.910758680627124e-06, + "loss": 0.5193, + "step": 1174 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.615893602371216, + "learning_rate": 4.91059341645869e-06, + "loss": 0.5525, + "step": 1175 + }, + { + "epoch": 0.5560283687943263, + "grad_norm": 3.3179728984832764, + "learning_rate": 4.910428002193174e-06, + "loss": 0.7285, + "step": 1176 + }, + { + "epoch": 0.556501182033097, + "grad_norm": 2.7234175205230713, + "learning_rate": 4.910262437840875e-06, + "loss": 0.574, + "step": 1177 + }, + { + "epoch": 0.5569739952718676, + "grad_norm": 3.0416605472564697, + "learning_rate": 4.9100967234121034e-06, + "loss": 0.5623, + "step": 1178 + }, + { + "epoch": 0.5574468085106383, + "grad_norm": 3.067786455154419, + "learning_rate": 4.909930858917177e-06, + "loss": 0.6491, + "step": 1179 + }, + { + "epoch": 0.557919621749409, + "grad_norm": 3.0037379264831543, + "learning_rate": 4.909764844366422e-06, + "loss": 0.5696, + "step": 1180 + }, + { + "epoch": 0.5583924349881797, + "grad_norm": 2.966179609298706, + "learning_rate": 4.909598679770178e-06, + "loss": 0.6042, + "step": 1181 + }, + { + "epoch": 0.5588652482269504, + "grad_norm": 2.6000657081604004, + "learning_rate": 4.909432365138789e-06, + "loss": 0.5883, + "step": 1182 + }, + { + "epoch": 0.5593380614657211, + "grad_norm": 2.6794495582580566, + "learning_rate": 4.909265900482612e-06, + "loss": 0.6809, + "step": 1183 + }, + { + "epoch": 
0.5598108747044918, + "grad_norm": 2.6765122413635254, + "learning_rate": 4.9090992858120115e-06, + "loss": 0.6601, + "step": 1184 + }, + { + "epoch": 0.5602836879432624, + "grad_norm": 2.6051928997039795, + "learning_rate": 4.908932521137363e-06, + "loss": 0.5946, + "step": 1185 + }, + { + "epoch": 0.5607565011820331, + "grad_norm": 3.0405542850494385, + "learning_rate": 4.908765606469048e-06, + "loss": 0.6998, + "step": 1186 + }, + { + "epoch": 0.5612293144208038, + "grad_norm": 2.7975668907165527, + "learning_rate": 4.908598541817462e-06, + "loss": 0.6218, + "step": 1187 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 2.5367627143859863, + "learning_rate": 4.908431327193005e-06, + "loss": 0.6354, + "step": 1188 + }, + { + "epoch": 0.5621749408983452, + "grad_norm": 3.7939631938934326, + "learning_rate": 4.908263962606091e-06, + "loss": 0.6376, + "step": 1189 + }, + { + "epoch": 0.5626477541371159, + "grad_norm": 2.864079475402832, + "learning_rate": 4.908096448067139e-06, + "loss": 0.5485, + "step": 1190 + }, + { + "epoch": 0.5631205673758866, + "grad_norm": 2.7855563163757324, + "learning_rate": 4.9079287835865804e-06, + "loss": 0.6645, + "step": 1191 + }, + { + "epoch": 0.5635933806146572, + "grad_norm": 2.6156625747680664, + "learning_rate": 4.9077609691748556e-06, + "loss": 0.5751, + "step": 1192 + }, + { + "epoch": 0.5640661938534279, + "grad_norm": 3.0475659370422363, + "learning_rate": 4.907593004842412e-06, + "loss": 0.6739, + "step": 1193 + }, + { + "epoch": 0.5645390070921986, + "grad_norm": 2.9176738262176514, + "learning_rate": 4.9074248905997104e-06, + "loss": 0.6493, + "step": 1194 + }, + { + "epoch": 0.5650118203309693, + "grad_norm": 2.6168384552001953, + "learning_rate": 4.907256626457216e-06, + "loss": 0.6154, + "step": 1195 + }, + { + "epoch": 0.56548463356974, + "grad_norm": 2.893980026245117, + "learning_rate": 4.907088212425408e-06, + "loss": 0.5808, + "step": 1196 + }, + { + "epoch": 0.5659574468085107, + "grad_norm": 3.3832836151123047, + "learning_rate": 4.90691964851477e-06, + "loss": 0.7888, + "step": 1197 + }, + { + "epoch": 0.5664302600472814, + "grad_norm": 3.088932752609253, + "learning_rate": 4.906750934735801e-06, + "loss": 0.6516, + "step": 1198 + }, + { + "epoch": 0.566903073286052, + "grad_norm": 2.494471549987793, + "learning_rate": 4.906582071099004e-06, + "loss": 0.6286, + "step": 1199 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 2.716550588607788, + "learning_rate": 4.906413057614895e-06, + "loss": 0.5939, + "step": 1200 + }, + { + "epoch": 0.5678486997635934, + "grad_norm": 2.5821073055267334, + "learning_rate": 4.906243894293995e-06, + "loss": 0.6668, + "step": 1201 + }, + { + "epoch": 0.5683215130023641, + "grad_norm": 3.651787042617798, + "learning_rate": 4.90607458114684e-06, + "loss": 0.6124, + "step": 1202 + }, + { + "epoch": 0.5687943262411348, + "grad_norm": 2.7567858695983887, + "learning_rate": 4.9059051181839705e-06, + "loss": 0.6656, + "step": 1203 + }, + { + "epoch": 0.5692671394799055, + "grad_norm": 2.8067586421966553, + "learning_rate": 4.90573550541594e-06, + "loss": 0.6306, + "step": 1204 + }, + { + "epoch": 0.5697399527186762, + "grad_norm": 2.6136393547058105, + "learning_rate": 4.905565742853307e-06, + "loss": 0.5992, + "step": 1205 + }, + { + "epoch": 0.5702127659574469, + "grad_norm": 2.899049758911133, + "learning_rate": 4.905395830506644e-06, + "loss": 0.621, + "step": 1206 + }, + { + "epoch": 0.5706855791962175, + "grad_norm": 3.036583185195923, + "learning_rate": 4.9052257683865294e-06, + "loss": 0.652, 
+ "step": 1207 + }, + { + "epoch": 0.5711583924349882, + "grad_norm": 2.7947216033935547, + "learning_rate": 4.905055556503553e-06, + "loss": 0.6636, + "step": 1208 + }, + { + "epoch": 0.5716312056737589, + "grad_norm": 3.1646955013275146, + "learning_rate": 4.9048851948683135e-06, + "loss": 0.6376, + "step": 1209 + }, + { + "epoch": 0.5721040189125296, + "grad_norm": 2.8175766468048096, + "learning_rate": 4.904714683491417e-06, + "loss": 0.5929, + "step": 1210 + }, + { + "epoch": 0.5725768321513003, + "grad_norm": 2.923923969268799, + "learning_rate": 4.904544022383483e-06, + "loss": 0.6633, + "step": 1211 + }, + { + "epoch": 0.573049645390071, + "grad_norm": 2.7471134662628174, + "learning_rate": 4.9043732115551356e-06, + "loss": 0.6551, + "step": 1212 + }, + { + "epoch": 0.5735224586288417, + "grad_norm": 2.8660807609558105, + "learning_rate": 4.90420225101701e-06, + "loss": 0.6423, + "step": 1213 + }, + { + "epoch": 0.5739952718676123, + "grad_norm": 2.769247531890869, + "learning_rate": 4.904031140779754e-06, + "loss": 0.5982, + "step": 1214 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.9043145179748535, + "learning_rate": 4.90385988085402e-06, + "loss": 0.5843, + "step": 1215 + }, + { + "epoch": 0.5749408983451537, + "grad_norm": 2.6639609336853027, + "learning_rate": 4.903688471250471e-06, + "loss": 0.5858, + "step": 1216 + }, + { + "epoch": 0.5754137115839244, + "grad_norm": 2.6967573165893555, + "learning_rate": 4.903516911979781e-06, + "loss": 0.5755, + "step": 1217 + }, + { + "epoch": 0.5758865248226951, + "grad_norm": 2.8865857124328613, + "learning_rate": 4.903345203052633e-06, + "loss": 0.6051, + "step": 1218 + }, + { + "epoch": 0.5763593380614658, + "grad_norm": 2.381979465484619, + "learning_rate": 4.903173344479717e-06, + "loss": 0.5727, + "step": 1219 + }, + { + "epoch": 0.5768321513002365, + "grad_norm": 2.7717981338500977, + "learning_rate": 4.903001336271734e-06, + "loss": 0.6406, + "step": 1220 + }, + { + "epoch": 0.577304964539007, + "grad_norm": 2.6431570053100586, + "learning_rate": 4.902829178439395e-06, + "loss": 0.6226, + "step": 1221 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 2.8090415000915527, + "learning_rate": 4.902656870993419e-06, + "loss": 0.5761, + "step": 1222 + }, + { + "epoch": 0.5782505910165484, + "grad_norm": 2.4769368171691895, + "learning_rate": 4.902484413944535e-06, + "loss": 0.5602, + "step": 1223 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 2.693316698074341, + "learning_rate": 4.902311807303481e-06, + "loss": 0.5222, + "step": 1224 + }, + { + "epoch": 0.5791962174940898, + "grad_norm": 2.7623913288116455, + "learning_rate": 4.902139051081004e-06, + "loss": 0.6978, + "step": 1225 + }, + { + "epoch": 0.5796690307328605, + "grad_norm": 2.6133766174316406, + "learning_rate": 4.901966145287863e-06, + "loss": 0.5802, + "step": 1226 + }, + { + "epoch": 0.5801418439716312, + "grad_norm": 2.7345972061157227, + "learning_rate": 4.901793089934821e-06, + "loss": 0.6294, + "step": 1227 + }, + { + "epoch": 0.5806146572104018, + "grad_norm": 2.7545835971832275, + "learning_rate": 4.9016198850326555e-06, + "loss": 0.6085, + "step": 1228 + }, + { + "epoch": 0.5810874704491725, + "grad_norm": 2.6947758197784424, + "learning_rate": 4.90144653059215e-06, + "loss": 0.6025, + "step": 1229 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 2.692967414855957, + "learning_rate": 4.901273026624099e-06, + "loss": 0.5715, + "step": 1230 + }, + { + "epoch": 0.5820330969267139, + "grad_norm": 2.78347110748291, + "learning_rate": 
4.901099373139307e-06, + "loss": 0.6063, + "step": 1231 + }, + { + "epoch": 0.5825059101654846, + "grad_norm": 2.346496343612671, + "learning_rate": 4.900925570148585e-06, + "loss": 0.5869, + "step": 1232 + }, + { + "epoch": 0.5829787234042553, + "grad_norm": 2.606639862060547, + "learning_rate": 4.900751617662755e-06, + "loss": 0.6197, + "step": 1233 + }, + { + "epoch": 0.583451536643026, + "grad_norm": 2.5825929641723633, + "learning_rate": 4.900577515692649e-06, + "loss": 0.6721, + "step": 1234 + }, + { + "epoch": 0.5839243498817966, + "grad_norm": 2.731349468231201, + "learning_rate": 4.900403264249107e-06, + "loss": 0.6273, + "step": 1235 + }, + { + "epoch": 0.5843971631205673, + "grad_norm": 3.2133874893188477, + "learning_rate": 4.90022886334298e-06, + "loss": 0.6231, + "step": 1236 + }, + { + "epoch": 0.584869976359338, + "grad_norm": 2.9213852882385254, + "learning_rate": 4.900054312985127e-06, + "loss": 0.6677, + "step": 1237 + }, + { + "epoch": 0.5853427895981087, + "grad_norm": 2.815425157546997, + "learning_rate": 4.899879613186414e-06, + "loss": 0.6405, + "step": 1238 + }, + { + "epoch": 0.5858156028368794, + "grad_norm": 2.730782985687256, + "learning_rate": 4.899704763957721e-06, + "loss": 0.6233, + "step": 1239 + }, + { + "epoch": 0.5862884160756501, + "grad_norm": 2.6432766914367676, + "learning_rate": 4.899529765309936e-06, + "loss": 0.6267, + "step": 1240 + }, + { + "epoch": 0.5867612293144208, + "grad_norm": 2.616215229034424, + "learning_rate": 4.899354617253953e-06, + "loss": 0.6268, + "step": 1241 + }, + { + "epoch": 0.5872340425531914, + "grad_norm": 2.7630255222320557, + "learning_rate": 4.899179319800679e-06, + "loss": 0.6348, + "step": 1242 + }, + { + "epoch": 0.5877068557919621, + "grad_norm": 2.785095453262329, + "learning_rate": 4.899003872961029e-06, + "loss": 0.5839, + "step": 1243 + }, + { + "epoch": 0.5881796690307328, + "grad_norm": 2.9050328731536865, + "learning_rate": 4.898828276745927e-06, + "loss": 0.651, + "step": 1244 + }, + { + "epoch": 0.5886524822695035, + "grad_norm": 2.958092212677002, + "learning_rate": 4.8986525311663065e-06, + "loss": 0.6395, + "step": 1245 + }, + { + "epoch": 0.5891252955082742, + "grad_norm": 2.952310800552368, + "learning_rate": 4.898476636233111e-06, + "loss": 0.6731, + "step": 1246 + }, + { + "epoch": 0.5895981087470449, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.898300591957293e-06, + "loss": 0.7015, + "step": 1247 + }, + { + "epoch": 0.5900709219858156, + "grad_norm": 2.8941752910614014, + "learning_rate": 4.898124398349813e-06, + "loss": 0.6452, + "step": 1248 + }, + { + "epoch": 0.5905437352245863, + "grad_norm": 2.9809536933898926, + "learning_rate": 4.897948055421642e-06, + "loss": 0.5736, + "step": 1249 + }, + { + "epoch": 0.5910165484633569, + "grad_norm": 2.927046775817871, + "learning_rate": 4.897771563183761e-06, + "loss": 0.5918, + "step": 1250 + }, + { + "epoch": 0.5914893617021276, + "grad_norm": 2.865020275115967, + "learning_rate": 4.897594921647158e-06, + "loss": 0.6924, + "step": 1251 + }, + { + "epoch": 0.5919621749408983, + "grad_norm": 2.7406699657440186, + "learning_rate": 4.897418130822832e-06, + "loss": 0.509, + "step": 1252 + }, + { + "epoch": 0.592434988179669, + "grad_norm": 2.781606912612915, + "learning_rate": 4.897241190721791e-06, + "loss": 0.5555, + "step": 1253 + }, + { + "epoch": 0.5929078014184397, + "grad_norm": 2.79209303855896, + "learning_rate": 4.8970641013550535e-06, + "loss": 0.6722, + "step": 1254 + }, + { + "epoch": 0.5933806146572104, + "grad_norm": 
3.0672268867492676, + "learning_rate": 4.896886862733645e-06, + "loss": 0.6366, + "step": 1255 + }, + { + "epoch": 0.5938534278959811, + "grad_norm": 2.7456953525543213, + "learning_rate": 4.896709474868602e-06, + "loss": 0.6246, + "step": 1256 + }, + { + "epoch": 0.5943262411347517, + "grad_norm": 3.6731202602386475, + "learning_rate": 4.896531937770968e-06, + "loss": 0.668, + "step": 1257 + }, + { + "epoch": 0.5947990543735224, + "grad_norm": 2.6056087017059326, + "learning_rate": 4.8963542514518e-06, + "loss": 0.5815, + "step": 1258 + }, + { + "epoch": 0.5952718676122931, + "grad_norm": 2.719698905944824, + "learning_rate": 4.89617641592216e-06, + "loss": 0.6058, + "step": 1259 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 2.625838279724121, + "learning_rate": 4.895998431193121e-06, + "loss": 0.6143, + "step": 1260 + }, + { + "epoch": 0.5962174940898345, + "grad_norm": 2.7166085243225098, + "learning_rate": 4.895820297275767e-06, + "loss": 0.5187, + "step": 1261 + }, + { + "epoch": 0.5966903073286052, + "grad_norm": 2.7544102668762207, + "learning_rate": 4.8956420141811875e-06, + "loss": 0.5928, + "step": 1262 + }, + { + "epoch": 0.5971631205673759, + "grad_norm": 2.6678333282470703, + "learning_rate": 4.895463581920484e-06, + "loss": 0.611, + "step": 1263 + }, + { + "epoch": 0.5976359338061465, + "grad_norm": 2.853384494781494, + "learning_rate": 4.895285000504768e-06, + "loss": 0.642, + "step": 1264 + }, + { + "epoch": 0.5981087470449172, + "grad_norm": 2.637852430343628, + "learning_rate": 4.895106269945158e-06, + "loss": 0.6308, + "step": 1265 + }, + { + "epoch": 0.5985815602836879, + "grad_norm": 2.9880387783050537, + "learning_rate": 4.8949273902527826e-06, + "loss": 0.5781, + "step": 1266 + }, + { + "epoch": 0.5990543735224586, + "grad_norm": 3.5984015464782715, + "learning_rate": 4.89474836143878e-06, + "loss": 0.5865, + "step": 1267 + }, + { + "epoch": 0.5995271867612293, + "grad_norm": 2.719855546951294, + "learning_rate": 4.8945691835142975e-06, + "loss": 0.6393, + "step": 1268 + }, + { + "epoch": 0.6, + "grad_norm": 2.7885141372680664, + "learning_rate": 4.894389856490492e-06, + "loss": 0.66, + "step": 1269 + }, + { + "epoch": 0.6004728132387707, + "grad_norm": 2.698819875717163, + "learning_rate": 4.894210380378529e-06, + "loss": 0.6144, + "step": 1270 + }, + { + "epoch": 0.6009456264775414, + "grad_norm": 2.278045654296875, + "learning_rate": 4.894030755189584e-06, + "loss": 0.5609, + "step": 1271 + }, + { + "epoch": 0.601418439716312, + "grad_norm": 2.8729357719421387, + "learning_rate": 4.893850980934841e-06, + "loss": 0.6715, + "step": 1272 + }, + { + "epoch": 0.6018912529550827, + "grad_norm": 2.8541221618652344, + "learning_rate": 4.893671057625495e-06, + "loss": 0.6787, + "step": 1273 + }, + { + "epoch": 0.6023640661938534, + "grad_norm": 2.4561476707458496, + "learning_rate": 4.893490985272748e-06, + "loss": 0.6331, + "step": 1274 + }, + { + "epoch": 0.6028368794326241, + "grad_norm": 2.565739154815674, + "learning_rate": 4.893310763887812e-06, + "loss": 0.587, + "step": 1275 + }, + { + "epoch": 0.6033096926713948, + "grad_norm": 2.384951591491699, + "learning_rate": 4.8931303934819095e-06, + "loss": 0.5358, + "step": 1276 + }, + { + "epoch": 0.6037825059101655, + "grad_norm": 2.380808115005493, + "learning_rate": 4.89294987406627e-06, + "loss": 0.5402, + "step": 1277 + }, + { + "epoch": 0.6042553191489362, + "grad_norm": 2.764815092086792, + "learning_rate": 4.892769205652136e-06, + "loss": 0.6103, + "step": 1278 + }, + { + "epoch": 0.6047281323877068, + 
"grad_norm": 2.463350296020508, + "learning_rate": 4.892588388250754e-06, + "loss": 0.5937, + "step": 1279 + }, + { + "epoch": 0.6052009456264775, + "grad_norm": 3.099689245223999, + "learning_rate": 4.8924074218733855e-06, + "loss": 0.6354, + "step": 1280 + }, + { + "epoch": 0.6056737588652482, + "grad_norm": 2.804450035095215, + "learning_rate": 4.892226306531297e-06, + "loss": 0.6595, + "step": 1281 + }, + { + "epoch": 0.6061465721040189, + "grad_norm": 3.1559767723083496, + "learning_rate": 4.892045042235765e-06, + "loss": 0.6664, + "step": 1282 + }, + { + "epoch": 0.6066193853427896, + "grad_norm": 2.844341993331909, + "learning_rate": 4.891863628998079e-06, + "loss": 0.7454, + "step": 1283 + }, + { + "epoch": 0.6070921985815603, + "grad_norm": 2.686602830886841, + "learning_rate": 4.891682066829532e-06, + "loss": 0.6755, + "step": 1284 + }, + { + "epoch": 0.607565011820331, + "grad_norm": 2.736457347869873, + "learning_rate": 4.8915003557414285e-06, + "loss": 0.6305, + "step": 1285 + }, + { + "epoch": 0.6080378250591016, + "grad_norm": 2.661362409591675, + "learning_rate": 4.891318495745086e-06, + "loss": 0.5958, + "step": 1286 + }, + { + "epoch": 0.6085106382978723, + "grad_norm": 2.707348108291626, + "learning_rate": 4.8911364868518255e-06, + "loss": 0.5824, + "step": 1287 + }, + { + "epoch": 0.608983451536643, + "grad_norm": 2.9798858165740967, + "learning_rate": 4.890954329072981e-06, + "loss": 0.5981, + "step": 1288 + }, + { + "epoch": 0.6094562647754137, + "grad_norm": 2.6285455226898193, + "learning_rate": 4.890772022419895e-06, + "loss": 0.6194, + "step": 1289 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 2.9254322052001953, + "learning_rate": 4.890589566903917e-06, + "loss": 0.6002, + "step": 1290 + }, + { + "epoch": 0.6104018912529551, + "grad_norm": 2.6458325386047363, + "learning_rate": 4.89040696253641e-06, + "loss": 0.5457, + "step": 1291 + }, + { + "epoch": 0.6108747044917258, + "grad_norm": 2.508242607116699, + "learning_rate": 4.890224209328743e-06, + "loss": 0.6168, + "step": 1292 + }, + { + "epoch": 0.6113475177304964, + "grad_norm": 3.034785509109497, + "learning_rate": 4.890041307292296e-06, + "loss": 0.664, + "step": 1293 + }, + { + "epoch": 0.6118203309692671, + "grad_norm": 3.52469539642334, + "learning_rate": 4.889858256438455e-06, + "loss": 0.7301, + "step": 1294 + }, + { + "epoch": 0.6122931442080378, + "grad_norm": 2.9145348072052, + "learning_rate": 4.889675056778622e-06, + "loss": 0.6494, + "step": 1295 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 2.831829071044922, + "learning_rate": 4.8894917083242e-06, + "loss": 0.6064, + "step": 1296 + }, + { + "epoch": 0.6132387706855792, + "grad_norm": 2.6883130073547363, + "learning_rate": 4.889308211086608e-06, + "loss": 0.5642, + "step": 1297 + }, + { + "epoch": 0.6137115839243499, + "grad_norm": 3.0605485439300537, + "learning_rate": 4.889124565077269e-06, + "loss": 0.6695, + "step": 1298 + }, + { + "epoch": 0.6141843971631206, + "grad_norm": 3.44062876701355, + "learning_rate": 4.88894077030762e-06, + "loss": 0.6415, + "step": 1299 + }, + { + "epoch": 0.6146572104018913, + "grad_norm": 2.5970818996429443, + "learning_rate": 4.888756826789105e-06, + "loss": 0.6518, + "step": 1300 + }, + { + "epoch": 0.6151300236406619, + "grad_norm": 4.2233567237854, + "learning_rate": 4.8885727345331755e-06, + "loss": 0.6555, + "step": 1301 + }, + { + "epoch": 0.6156028368794326, + "grad_norm": 2.645385503768921, + "learning_rate": 4.888388493551297e-06, + "loss": 0.6762, + "step": 1302 + }, + { + "epoch": 
0.6160756501182033, + "grad_norm": 2.907954454421997, + "learning_rate": 4.8882041038549385e-06, + "loss": 0.6526, + "step": 1303 + }, + { + "epoch": 0.616548463356974, + "grad_norm": 2.482771873474121, + "learning_rate": 4.888019565455583e-06, + "loss": 0.628, + "step": 1304 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 2.7165915966033936, + "learning_rate": 4.88783487836472e-06, + "loss": 0.5743, + "step": 1305 + }, + { + "epoch": 0.6174940898345154, + "grad_norm": 3.095627546310425, + "learning_rate": 4.88765004259385e-06, + "loss": 0.627, + "step": 1306 + }, + { + "epoch": 0.6179669030732861, + "grad_norm": 2.5018465518951416, + "learning_rate": 4.8874650581544805e-06, + "loss": 0.5215, + "step": 1307 + }, + { + "epoch": 0.6184397163120567, + "grad_norm": 3.094337224960327, + "learning_rate": 4.8872799250581316e-06, + "loss": 0.6979, + "step": 1308 + }, + { + "epoch": 0.6189125295508274, + "grad_norm": 3.1002209186553955, + "learning_rate": 4.887094643316329e-06, + "loss": 0.6565, + "step": 1309 + }, + { + "epoch": 0.6193853427895981, + "grad_norm": 2.551431894302368, + "learning_rate": 4.88690921294061e-06, + "loss": 0.5748, + "step": 1310 + }, + { + "epoch": 0.6198581560283688, + "grad_norm": 2.8282904624938965, + "learning_rate": 4.886723633942521e-06, + "loss": 0.676, + "step": 1311 + }, + { + "epoch": 0.6203309692671395, + "grad_norm": 2.8887810707092285, + "learning_rate": 4.886537906333617e-06, + "loss": 0.5971, + "step": 1312 + }, + { + "epoch": 0.6208037825059102, + "grad_norm": 2.9989118576049805, + "learning_rate": 4.886352030125462e-06, + "loss": 0.6341, + "step": 1313 + }, + { + "epoch": 0.6212765957446809, + "grad_norm": 2.8042776584625244, + "learning_rate": 4.886166005329629e-06, + "loss": 0.6578, + "step": 1314 + }, + { + "epoch": 0.6217494089834515, + "grad_norm": 2.4980967044830322, + "learning_rate": 4.8859798319577026e-06, + "loss": 0.6711, + "step": 1315 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 2.762369155883789, + "learning_rate": 4.885793510021274e-06, + "loss": 0.5747, + "step": 1316 + }, + { + "epoch": 0.6226950354609929, + "grad_norm": 3.136327028274536, + "learning_rate": 4.885607039531945e-06, + "loss": 0.7544, + "step": 1317 + }, + { + "epoch": 0.6231678486997636, + "grad_norm": 2.8736963272094727, + "learning_rate": 4.885420420501327e-06, + "loss": 0.6603, + "step": 1318 + }, + { + "epoch": 0.6236406619385343, + "grad_norm": 2.766237497329712, + "learning_rate": 4.885233652941039e-06, + "loss": 0.581, + "step": 1319 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 2.4740939140319824, + "learning_rate": 4.88504673686271e-06, + "loss": 0.6335, + "step": 1320 + }, + { + "epoch": 0.6245862884160757, + "grad_norm": 3.324795961380005, + "learning_rate": 4.884859672277978e-06, + "loss": 0.6019, + "step": 1321 + }, + { + "epoch": 0.6250591016548463, + "grad_norm": 3.521327257156372, + "learning_rate": 4.884672459198493e-06, + "loss": 0.6104, + "step": 1322 + }, + { + "epoch": 0.625531914893617, + "grad_norm": 2.7728071212768555, + "learning_rate": 4.884485097635909e-06, + "loss": 0.6714, + "step": 1323 + }, + { + "epoch": 0.6260047281323877, + "grad_norm": 3.0738155841827393, + "learning_rate": 4.884297587601895e-06, + "loss": 0.604, + "step": 1324 + }, + { + "epoch": 0.6264775413711584, + "grad_norm": 2.719240427017212, + "learning_rate": 4.884109929108124e-06, + "loss": 0.6795, + "step": 1325 + }, + { + "epoch": 0.6269503546099291, + "grad_norm": 2.4108200073242188, + "learning_rate": 4.883922122166282e-06, + "loss": 0.5846, + 
"step": 1326 + }, + { + "epoch": 0.6274231678486998, + "grad_norm": 2.393899917602539, + "learning_rate": 4.883734166788063e-06, + "loss": 0.6188, + "step": 1327 + }, + { + "epoch": 0.6278959810874705, + "grad_norm": 4.555255889892578, + "learning_rate": 4.883546062985169e-06, + "loss": 0.5962, + "step": 1328 + }, + { + "epoch": 0.6283687943262412, + "grad_norm": 2.571075439453125, + "learning_rate": 4.883357810769315e-06, + "loss": 0.6165, + "step": 1329 + }, + { + "epoch": 0.6288416075650118, + "grad_norm": 2.553115129470825, + "learning_rate": 4.8831694101522185e-06, + "loss": 0.6787, + "step": 1330 + }, + { + "epoch": 0.6293144208037825, + "grad_norm": 3.2564642429351807, + "learning_rate": 4.882980861145614e-06, + "loss": 0.659, + "step": 1331 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 2.535216808319092, + "learning_rate": 4.882792163761241e-06, + "loss": 0.6176, + "step": 1332 + }, + { + "epoch": 0.6302600472813239, + "grad_norm": 3.097921848297119, + "learning_rate": 4.882603318010847e-06, + "loss": 0.6822, + "step": 1333 + }, + { + "epoch": 0.6307328605200946, + "grad_norm": 2.8135175704956055, + "learning_rate": 4.882414323906192e-06, + "loss": 0.6782, + "step": 1334 + }, + { + "epoch": 0.6312056737588653, + "grad_norm": 2.724634885787964, + "learning_rate": 4.882225181459044e-06, + "loss": 0.6545, + "step": 1335 + }, + { + "epoch": 0.631678486997636, + "grad_norm": 2.9585227966308594, + "learning_rate": 4.882035890681179e-06, + "loss": 0.6218, + "step": 1336 + }, + { + "epoch": 0.6321513002364066, + "grad_norm": 2.6952011585235596, + "learning_rate": 4.881846451584385e-06, + "loss": 0.6, + "step": 1337 + }, + { + "epoch": 0.6326241134751773, + "grad_norm": 3.1400704383850098, + "learning_rate": 4.881656864180455e-06, + "loss": 0.6687, + "step": 1338 + }, + { + "epoch": 0.633096926713948, + "grad_norm": 2.8382487297058105, + "learning_rate": 4.881467128481197e-06, + "loss": 0.574, + "step": 1339 + }, + { + "epoch": 0.6335697399527187, + "grad_norm": 2.8520095348358154, + "learning_rate": 4.881277244498422e-06, + "loss": 0.6582, + "step": 1340 + }, + { + "epoch": 0.6340425531914894, + "grad_norm": 2.703498363494873, + "learning_rate": 4.881087212243956e-06, + "loss": 0.7224, + "step": 1341 + }, + { + "epoch": 0.6345153664302601, + "grad_norm": 3.697205066680908, + "learning_rate": 4.880897031729629e-06, + "loss": 0.6582, + "step": 1342 + }, + { + "epoch": 0.6349881796690308, + "grad_norm": 2.7625808715820312, + "learning_rate": 4.880706702967284e-06, + "loss": 0.574, + "step": 1343 + }, + { + "epoch": 0.6354609929078014, + "grad_norm": 2.949984073638916, + "learning_rate": 4.880516225968771e-06, + "loss": 0.66, + "step": 1344 + }, + { + "epoch": 0.6359338061465721, + "grad_norm": 2.548269748687744, + "learning_rate": 4.8803256007459525e-06, + "loss": 0.642, + "step": 1345 + }, + { + "epoch": 0.6364066193853428, + "grad_norm": 2.5102174282073975, + "learning_rate": 4.8801348273106945e-06, + "loss": 0.6238, + "step": 1346 + }, + { + "epoch": 0.6368794326241135, + "grad_norm": 2.9847946166992188, + "learning_rate": 4.8799439056748786e-06, + "loss": 0.5416, + "step": 1347 + }, + { + "epoch": 0.6373522458628842, + "grad_norm": 2.8711049556732178, + "learning_rate": 4.879752835850391e-06, + "loss": 0.6427, + "step": 1348 + }, + { + "epoch": 0.6378250591016549, + "grad_norm": 2.7901716232299805, + "learning_rate": 4.879561617849129e-06, + "loss": 0.6026, + "step": 1349 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 2.659778356552124, + "learning_rate": 
4.879370251682999e-06, + "loss": 0.6623, + "step": 1350 + }, + { + "epoch": 0.6387706855791963, + "grad_norm": 3.224386692047119, + "learning_rate": 4.879178737363917e-06, + "loss": 0.6485, + "step": 1351 + }, + { + "epoch": 0.6392434988179669, + "grad_norm": 2.6385605335235596, + "learning_rate": 4.8789870749038076e-06, + "loss": 0.5866, + "step": 1352 + }, + { + "epoch": 0.6397163120567376, + "grad_norm": 2.807713270187378, + "learning_rate": 4.8787952643146045e-06, + "loss": 0.6537, + "step": 1353 + }, + { + "epoch": 0.6401891252955083, + "grad_norm": 2.5689280033111572, + "learning_rate": 4.878603305608251e-06, + "loss": 0.6216, + "step": 1354 + }, + { + "epoch": 0.640661938534279, + "grad_norm": 2.7347843647003174, + "learning_rate": 4.8784111987967e-06, + "loss": 0.6318, + "step": 1355 + }, + { + "epoch": 0.6411347517730497, + "grad_norm": 2.5210378170013428, + "learning_rate": 4.878218943891911e-06, + "loss": 0.5472, + "step": 1356 + }, + { + "epoch": 0.6416075650118204, + "grad_norm": 2.866785764694214, + "learning_rate": 4.878026540905858e-06, + "loss": 0.7108, + "step": 1357 + }, + { + "epoch": 0.642080378250591, + "grad_norm": 2.923314332962036, + "learning_rate": 4.877833989850519e-06, + "loss": 0.5557, + "step": 1358 + }, + { + "epoch": 0.6425531914893617, + "grad_norm": 2.925463914871216, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.6382, + "step": 1359 + }, + { + "epoch": 0.6430260047281324, + "grad_norm": 2.909644365310669, + "learning_rate": 4.877448443579952e-06, + "loss": 0.5603, + "step": 1360 + }, + { + "epoch": 0.6434988179669031, + "grad_norm": 3.501148223876953, + "learning_rate": 4.8772554483887306e-06, + "loss": 0.6722, + "step": 1361 + }, + { + "epoch": 0.6439716312056738, + "grad_norm": 2.823765516281128, + "learning_rate": 4.877062305176235e-06, + "loss": 0.6408, + "step": 1362 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 2.9807584285736084, + "learning_rate": 4.8768690139544935e-06, + "loss": 0.5984, + "step": 1363 + }, + { + "epoch": 0.6449172576832152, + "grad_norm": 2.8411378860473633, + "learning_rate": 4.8766755747355405e-06, + "loss": 0.6231, + "step": 1364 + }, + { + "epoch": 0.6453900709219859, + "grad_norm": 3.158952236175537, + "learning_rate": 4.8764819875314215e-06, + "loss": 0.6441, + "step": 1365 + }, + { + "epoch": 0.6458628841607565, + "grad_norm": 2.9614369869232178, + "learning_rate": 4.876288252354189e-06, + "loss": 0.6308, + "step": 1366 + }, + { + "epoch": 0.6463356973995272, + "grad_norm": 3.073805570602417, + "learning_rate": 4.876094369215907e-06, + "loss": 0.6046, + "step": 1367 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 2.719189405441284, + "learning_rate": 4.875900338128648e-06, + "loss": 0.6082, + "step": 1368 + }, + { + "epoch": 0.6472813238770686, + "grad_norm": 2.676726818084717, + "learning_rate": 4.8757061591044914e-06, + "loss": 0.6344, + "step": 1369 + }, + { + "epoch": 0.6477541371158393, + "grad_norm": 2.955256938934326, + "learning_rate": 4.87551183215553e-06, + "loss": 0.6506, + "step": 1370 + }, + { + "epoch": 0.64822695035461, + "grad_norm": 2.5672218799591064, + "learning_rate": 4.875317357293864e-06, + "loss": 0.5284, + "step": 1371 + }, + { + "epoch": 0.6486997635933807, + "grad_norm": 2.5860238075256348, + "learning_rate": 4.875122734531602e-06, + "loss": 0.667, + "step": 1372 + }, + { + "epoch": 0.6491725768321513, + "grad_norm": 3.1037003993988037, + "learning_rate": 4.8749279638808605e-06, + "loss": 0.6902, + "step": 1373 + }, + { + "epoch": 0.649645390070922, + "grad_norm": 
2.7715282440185547, + "learning_rate": 4.874733045353769e-06, + "loss": 0.6291, + "step": 1374 + }, + { + "epoch": 0.6501182033096927, + "grad_norm": 2.527071475982666, + "learning_rate": 4.874537978962463e-06, + "loss": 0.5565, + "step": 1375 + }, + { + "epoch": 0.6505910165484634, + "grad_norm": 2.722092628479004, + "learning_rate": 4.874342764719091e-06, + "loss": 0.5724, + "step": 1376 + }, + { + "epoch": 0.6510638297872341, + "grad_norm": 2.6342411041259766, + "learning_rate": 4.874147402635805e-06, + "loss": 0.6308, + "step": 1377 + }, + { + "epoch": 0.6515366430260048, + "grad_norm": 2.3850719928741455, + "learning_rate": 4.8739518927247695e-06, + "loss": 0.5692, + "step": 1378 + }, + { + "epoch": 0.6520094562647755, + "grad_norm": 2.9787259101867676, + "learning_rate": 4.873756234998161e-06, + "loss": 0.6953, + "step": 1379 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 2.634141683578491, + "learning_rate": 4.873560429468159e-06, + "loss": 0.6077, + "step": 1380 + }, + { + "epoch": 0.6529550827423168, + "grad_norm": 2.803046941757202, + "learning_rate": 4.873364476146958e-06, + "loss": 0.6657, + "step": 1381 + }, + { + "epoch": 0.6534278959810875, + "grad_norm": 2.762827157974243, + "learning_rate": 4.8731683750467574e-06, + "loss": 0.6061, + "step": 1382 + }, + { + "epoch": 0.6539007092198581, + "grad_norm": 2.6654391288757324, + "learning_rate": 4.872972126179768e-06, + "loss": 0.6387, + "step": 1383 + }, + { + "epoch": 0.6543735224586288, + "grad_norm": 2.4363625049591064, + "learning_rate": 4.872775729558209e-06, + "loss": 0.5623, + "step": 1384 + }, + { + "epoch": 0.6548463356973995, + "grad_norm": 2.528959035873413, + "learning_rate": 4.87257918519431e-06, + "loss": 0.5609, + "step": 1385 + }, + { + "epoch": 0.6553191489361702, + "grad_norm": 2.718383312225342, + "learning_rate": 4.872382493100309e-06, + "loss": 0.5575, + "step": 1386 + }, + { + "epoch": 0.6557919621749408, + "grad_norm": 2.660841226577759, + "learning_rate": 4.872185653288453e-06, + "loss": 0.6106, + "step": 1387 + }, + { + "epoch": 0.6562647754137115, + "grad_norm": 2.508753538131714, + "learning_rate": 4.871988665770997e-06, + "loss": 0.5705, + "step": 1388 + }, + { + "epoch": 0.6567375886524822, + "grad_norm": 2.5134334564208984, + "learning_rate": 4.871791530560208e-06, + "loss": 0.5592, + "step": 1389 + }, + { + "epoch": 0.6572104018912529, + "grad_norm": 2.7475597858428955, + "learning_rate": 4.871594247668361e-06, + "loss": 0.6277, + "step": 1390 + }, + { + "epoch": 0.6576832151300236, + "grad_norm": 2.793616533279419, + "learning_rate": 4.871396817107739e-06, + "loss": 0.595, + "step": 1391 + }, + { + "epoch": 0.6581560283687943, + "grad_norm": 2.8285086154937744, + "learning_rate": 4.871199238890635e-06, + "loss": 0.6094, + "step": 1392 + }, + { + "epoch": 0.658628841607565, + "grad_norm": 2.74124813079834, + "learning_rate": 4.871001513029352e-06, + "loss": 0.6296, + "step": 1393 + }, + { + "epoch": 0.6591016548463356, + "grad_norm": 2.761237621307373, + "learning_rate": 4.870803639536202e-06, + "loss": 0.5702, + "step": 1394 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 2.761038064956665, + "learning_rate": 4.870605618423504e-06, + "loss": 0.6195, + "step": 1395 + }, + { + "epoch": 0.660047281323877, + "grad_norm": 2.8812482357025146, + "learning_rate": 4.870407449703589e-06, + "loss": 0.616, + "step": 1396 + }, + { + "epoch": 0.6605200945626477, + "grad_norm": 2.9966578483581543, + "learning_rate": 4.870209133388797e-06, + "loss": 0.6547, + "step": 1397 + }, + { + "epoch": 
0.6609929078014184, + "grad_norm": 2.7969017028808594, + "learning_rate": 4.870010669491474e-06, + "loss": 0.5762, + "step": 1398 + }, + { + "epoch": 0.6614657210401891, + "grad_norm": 2.557783842086792, + "learning_rate": 4.86981205802398e-06, + "loss": 0.6184, + "step": 1399 + }, + { + "epoch": 0.6619385342789598, + "grad_norm": 2.5393927097320557, + "learning_rate": 4.86961329899868e-06, + "loss": 0.5953, + "step": 1400 + }, + { + "epoch": 0.6624113475177305, + "grad_norm": 2.7745981216430664, + "learning_rate": 4.86941439242795e-06, + "loss": 0.5967, + "step": 1401 + }, + { + "epoch": 0.6628841607565011, + "grad_norm": 2.650381326675415, + "learning_rate": 4.869215338324176e-06, + "loss": 0.5667, + "step": 1402 + }, + { + "epoch": 0.6633569739952718, + "grad_norm": 2.583169937133789, + "learning_rate": 4.869016136699751e-06, + "loss": 0.549, + "step": 1403 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 2.984978437423706, + "learning_rate": 4.868816787567079e-06, + "loss": 0.5931, + "step": 1404 + }, + { + "epoch": 0.6643026004728132, + "grad_norm": 3.1947181224823, + "learning_rate": 4.868617290938573e-06, + "loss": 0.5473, + "step": 1405 + }, + { + "epoch": 0.6647754137115839, + "grad_norm": 2.562927007675171, + "learning_rate": 4.868417646826654e-06, + "loss": 0.6878, + "step": 1406 + }, + { + "epoch": 0.6652482269503546, + "grad_norm": 2.8741261959075928, + "learning_rate": 4.868217855243754e-06, + "loss": 0.6312, + "step": 1407 + }, + { + "epoch": 0.6657210401891253, + "grad_norm": 2.9834797382354736, + "learning_rate": 4.868017916202312e-06, + "loss": 0.5624, + "step": 1408 + }, + { + "epoch": 0.6661938534278959, + "grad_norm": 2.6935982704162598, + "learning_rate": 4.8678178297147785e-06, + "loss": 0.5857, + "step": 1409 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.8200576305389404, + "learning_rate": 4.86761759579361e-06, + "loss": 0.6153, + "step": 1410 + }, + { + "epoch": 0.6671394799054373, + "grad_norm": 2.831425189971924, + "learning_rate": 4.867417214451276e-06, + "loss": 0.6495, + "step": 1411 + }, + { + "epoch": 0.667612293144208, + "grad_norm": 2.733565092086792, + "learning_rate": 4.867216685700253e-06, + "loss": 0.6036, + "step": 1412 + }, + { + "epoch": 0.6680851063829787, + "grad_norm": 3.0609400272369385, + "learning_rate": 4.867016009553027e-06, + "loss": 0.6773, + "step": 1413 + }, + { + "epoch": 0.6685579196217494, + "grad_norm": 2.665452241897583, + "learning_rate": 4.866815186022093e-06, + "loss": 0.6256, + "step": 1414 + }, + { + "epoch": 0.6690307328605201, + "grad_norm": 2.9480721950531006, + "learning_rate": 4.866614215119956e-06, + "loss": 0.535, + "step": 1415 + }, + { + "epoch": 0.6695035460992907, + "grad_norm": 2.5514180660247803, + "learning_rate": 4.866413096859128e-06, + "loss": 0.6588, + "step": 1416 + }, + { + "epoch": 0.6699763593380614, + "grad_norm": 3.3442373275756836, + "learning_rate": 4.866211831252134e-06, + "loss": 0.5754, + "step": 1417 + }, + { + "epoch": 0.6704491725768321, + "grad_norm": 2.521467685699463, + "learning_rate": 4.866010418311504e-06, + "loss": 0.5546, + "step": 1418 + }, + { + "epoch": 0.6709219858156028, + "grad_norm": 2.930706262588501, + "learning_rate": 4.865808858049781e-06, + "loss": 0.589, + "step": 1419 + }, + { + "epoch": 0.6713947990543735, + "grad_norm": 2.6298375129699707, + "learning_rate": 4.865607150479513e-06, + "loss": 0.5915, + "step": 1420 + }, + { + "epoch": 0.6718676122931442, + "grad_norm": 2.9554293155670166, + "learning_rate": 4.8654052956132615e-06, + "loss": 0.6654, + 
"step": 1421 + }, + { + "epoch": 0.6723404255319149, + "grad_norm": 3.2706902027130127, + "learning_rate": 4.865203293463593e-06, + "loss": 0.7115, + "step": 1422 + }, + { + "epoch": 0.6728132387706856, + "grad_norm": 3.041539430618286, + "learning_rate": 4.865001144043088e-06, + "loss": 0.5818, + "step": 1423 + }, + { + "epoch": 0.6732860520094562, + "grad_norm": 3.1314544677734375, + "learning_rate": 4.864798847364331e-06, + "loss": 0.5822, + "step": 1424 + }, + { + "epoch": 0.6737588652482269, + "grad_norm": 2.5301461219787598, + "learning_rate": 4.86459640343992e-06, + "loss": 0.5525, + "step": 1425 + }, + { + "epoch": 0.6742316784869976, + "grad_norm": 2.809295892715454, + "learning_rate": 4.864393812282458e-06, + "loss": 0.6768, + "step": 1426 + }, + { + "epoch": 0.6747044917257683, + "grad_norm": 2.794664144515991, + "learning_rate": 4.864191073904562e-06, + "loss": 0.5793, + "step": 1427 + }, + { + "epoch": 0.675177304964539, + "grad_norm": 2.7771105766296387, + "learning_rate": 4.863988188318854e-06, + "loss": 0.6453, + "step": 1428 + }, + { + "epoch": 0.6756501182033097, + "grad_norm": 2.6431946754455566, + "learning_rate": 4.863785155537967e-06, + "loss": 0.5877, + "step": 1429 + }, + { + "epoch": 0.6761229314420804, + "grad_norm": 2.951353073120117, + "learning_rate": 4.863581975574544e-06, + "loss": 0.6793, + "step": 1430 + }, + { + "epoch": 0.676595744680851, + "grad_norm": 3.1336071491241455, + "learning_rate": 4.863378648441235e-06, + "loss": 0.6695, + "step": 1431 + }, + { + "epoch": 0.6770685579196217, + "grad_norm": 2.735982656478882, + "learning_rate": 4.8631751741507e-06, + "loss": 0.5239, + "step": 1432 + }, + { + "epoch": 0.6775413711583924, + "grad_norm": 2.7085206508636475, + "learning_rate": 4.862971552715611e-06, + "loss": 0.6837, + "step": 1433 + }, + { + "epoch": 0.6780141843971631, + "grad_norm": 3.136528730392456, + "learning_rate": 4.8627677841486436e-06, + "loss": 0.683, + "step": 1434 + }, + { + "epoch": 0.6784869976359338, + "grad_norm": 2.7879369258880615, + "learning_rate": 4.862563868462486e-06, + "loss": 0.608, + "step": 1435 + }, + { + "epoch": 0.6789598108747045, + "grad_norm": 2.7937729358673096, + "learning_rate": 4.862359805669837e-06, + "loss": 0.6131, + "step": 1436 + }, + { + "epoch": 0.6794326241134752, + "grad_norm": 2.5988364219665527, + "learning_rate": 4.862155595783401e-06, + "loss": 0.6303, + "step": 1437 + }, + { + "epoch": 0.6799054373522458, + "grad_norm": 3.251070499420166, + "learning_rate": 4.861951238815894e-06, + "loss": 0.7246, + "step": 1438 + }, + { + "epoch": 0.6803782505910165, + "grad_norm": 2.646759271621704, + "learning_rate": 4.861746734780039e-06, + "loss": 0.6313, + "step": 1439 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 2.773866891860962, + "learning_rate": 4.861542083688573e-06, + "loss": 0.6463, + "step": 1440 + }, + { + "epoch": 0.6813238770685579, + "grad_norm": 2.759965658187866, + "learning_rate": 4.861337285554235e-06, + "loss": 0.5428, + "step": 1441 + }, + { + "epoch": 0.6817966903073286, + "grad_norm": 3.3250818252563477, + "learning_rate": 4.861132340389779e-06, + "loss": 0.6522, + "step": 1442 + }, + { + "epoch": 0.6822695035460993, + "grad_norm": 2.661797523498535, + "learning_rate": 4.860927248207965e-06, + "loss": 0.5871, + "step": 1443 + }, + { + "epoch": 0.68274231678487, + "grad_norm": 2.706289052963257, + "learning_rate": 4.860722009021563e-06, + "loss": 0.6651, + "step": 1444 + }, + { + "epoch": 0.6832151300236406, + "grad_norm": 2.8459298610687256, + "learning_rate": 
4.860516622843354e-06, + "loss": 0.5827, + "step": 1445 + }, + { + "epoch": 0.6836879432624113, + "grad_norm": 3.1041831970214844, + "learning_rate": 4.860311089686125e-06, + "loss": 0.6727, + "step": 1446 + }, + { + "epoch": 0.684160756501182, + "grad_norm": 2.9382801055908203, + "learning_rate": 4.8601054095626746e-06, + "loss": 0.6002, + "step": 1447 + }, + { + "epoch": 0.6846335697399527, + "grad_norm": 2.782475471496582, + "learning_rate": 4.859899582485808e-06, + "loss": 0.6951, + "step": 1448 + }, + { + "epoch": 0.6851063829787234, + "grad_norm": 3.313894510269165, + "learning_rate": 4.859693608468343e-06, + "loss": 0.6363, + "step": 1449 + }, + { + "epoch": 0.6855791962174941, + "grad_norm": 3.1639695167541504, + "learning_rate": 4.8594874875231045e-06, + "loss": 0.7002, + "step": 1450 + }, + { + "epoch": 0.6860520094562648, + "grad_norm": 2.6762218475341797, + "learning_rate": 4.859281219662926e-06, + "loss": 0.6246, + "step": 1451 + }, + { + "epoch": 0.6865248226950355, + "grad_norm": 2.8368663787841797, + "learning_rate": 4.85907480490065e-06, + "loss": 0.5906, + "step": 1452 + }, + { + "epoch": 0.6869976359338061, + "grad_norm": 2.887373208999634, + "learning_rate": 4.858868243249131e-06, + "loss": 0.5931, + "step": 1453 + }, + { + "epoch": 0.6874704491725768, + "grad_norm": 2.8115322589874268, + "learning_rate": 4.858661534721229e-06, + "loss": 0.6337, + "step": 1454 + }, + { + "epoch": 0.6879432624113475, + "grad_norm": 2.8470499515533447, + "learning_rate": 4.8584546793298174e-06, + "loss": 0.632, + "step": 1455 + }, + { + "epoch": 0.6884160756501182, + "grad_norm": 2.8229613304138184, + "learning_rate": 4.8582476770877725e-06, + "loss": 0.6494, + "step": 1456 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 2.4235479831695557, + "learning_rate": 4.858040528007987e-06, + "loss": 0.5709, + "step": 1457 + }, + { + "epoch": 0.6893617021276596, + "grad_norm": 2.9348199367523193, + "learning_rate": 4.857833232103356e-06, + "loss": 0.5404, + "step": 1458 + }, + { + "epoch": 0.6898345153664303, + "grad_norm": 2.8274219036102295, + "learning_rate": 4.857625789386789e-06, + "loss": 0.701, + "step": 1459 + }, + { + "epoch": 0.6903073286052009, + "grad_norm": 3.136929988861084, + "learning_rate": 4.857418199871203e-06, + "loss": 0.6971, + "step": 1460 + }, + { + "epoch": 0.6907801418439716, + "grad_norm": 2.8987185955047607, + "learning_rate": 4.8572104635695214e-06, + "loss": 0.6613, + "step": 1461 + }, + { + "epoch": 0.6912529550827423, + "grad_norm": 2.5073442459106445, + "learning_rate": 4.857002580494681e-06, + "loss": 0.6032, + "step": 1462 + }, + { + "epoch": 0.691725768321513, + "grad_norm": 2.7019522190093994, + "learning_rate": 4.856794550659625e-06, + "loss": 0.567, + "step": 1463 + }, + { + "epoch": 0.6921985815602837, + "grad_norm": 2.4795594215393066, + "learning_rate": 4.8565863740773054e-06, + "loss": 0.5777, + "step": 1464 + }, + { + "epoch": 0.6926713947990544, + "grad_norm": 3.032506227493286, + "learning_rate": 4.856378050760687e-06, + "loss": 0.607, + "step": 1465 + }, + { + "epoch": 0.6931442080378251, + "grad_norm": 3.052091121673584, + "learning_rate": 4.85616958072274e-06, + "loss": 0.591, + "step": 1466 + }, + { + "epoch": 0.6936170212765957, + "grad_norm": 2.704831838607788, + "learning_rate": 4.855960963976443e-06, + "loss": 0.6528, + "step": 1467 + }, + { + "epoch": 0.6940898345153664, + "grad_norm": 2.680995225906372, + "learning_rate": 4.855752200534788e-06, + "loss": 0.6294, + "step": 1468 + }, + { + "epoch": 0.6945626477541371, + "grad_norm": 
2.3948659896850586, + "learning_rate": 4.855543290410774e-06, + "loss": 0.6091, + "step": 1469 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 2.6407411098480225, + "learning_rate": 4.855334233617407e-06, + "loss": 0.5572, + "step": 1470 + }, + { + "epoch": 0.6955082742316785, + "grad_norm": 2.5526835918426514, + "learning_rate": 4.8551250301677064e-06, + "loss": 0.5432, + "step": 1471 + }, + { + "epoch": 0.6959810874704492, + "grad_norm": 3.1237430572509766, + "learning_rate": 4.8549156800746965e-06, + "loss": 0.5944, + "step": 1472 + }, + { + "epoch": 0.6964539007092199, + "grad_norm": 2.8112540245056152, + "learning_rate": 4.854706183351412e-06, + "loss": 0.604, + "step": 1473 + }, + { + "epoch": 0.6969267139479906, + "grad_norm": 2.664644479751587, + "learning_rate": 4.8544965400109e-06, + "loss": 0.5647, + "step": 1474 + }, + { + "epoch": 0.6973995271867612, + "grad_norm": 3.26310133934021, + "learning_rate": 4.854286750066212e-06, + "loss": 0.6999, + "step": 1475 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 2.9717442989349365, + "learning_rate": 4.8540768135304115e-06, + "loss": 0.6655, + "step": 1476 + }, + { + "epoch": 0.6983451536643026, + "grad_norm": 2.5302982330322266, + "learning_rate": 4.85386673041657e-06, + "loss": 0.6384, + "step": 1477 + }, + { + "epoch": 0.6988179669030733, + "grad_norm": 2.864877700805664, + "learning_rate": 4.853656500737769e-06, + "loss": 0.6834, + "step": 1478 + }, + { + "epoch": 0.699290780141844, + "grad_norm": 2.5522031784057617, + "learning_rate": 4.853446124507098e-06, + "loss": 0.5929, + "step": 1479 + }, + { + "epoch": 0.6997635933806147, + "grad_norm": 3.096477746963501, + "learning_rate": 4.853235601737656e-06, + "loss": 0.5737, + "step": 1480 + }, + { + "epoch": 0.7002364066193854, + "grad_norm": 2.884779214859009, + "learning_rate": 4.853024932442552e-06, + "loss": 0.6362, + "step": 1481 + }, + { + "epoch": 0.700709219858156, + "grad_norm": 3.368558406829834, + "learning_rate": 4.852814116634903e-06, + "loss": 0.6721, + "step": 1482 + }, + { + "epoch": 0.7011820330969267, + "grad_norm": 2.742414951324463, + "learning_rate": 4.852603154327837e-06, + "loss": 0.6212, + "step": 1483 + }, + { + "epoch": 0.7016548463356974, + "grad_norm": 2.53454852104187, + "learning_rate": 4.8523920455344864e-06, + "loss": 0.6675, + "step": 1484 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 2.9354238510131836, + "learning_rate": 4.852180790267999e-06, + "loss": 0.6692, + "step": 1485 + }, + { + "epoch": 0.7026004728132388, + "grad_norm": 2.585070848464966, + "learning_rate": 4.8519693885415274e-06, + "loss": 0.6215, + "step": 1486 + }, + { + "epoch": 0.7030732860520095, + "grad_norm": 2.9047999382019043, + "learning_rate": 4.851757840368235e-06, + "loss": 0.6231, + "step": 1487 + }, + { + "epoch": 0.7035460992907802, + "grad_norm": 3.0930933952331543, + "learning_rate": 4.851546145761295e-06, + "loss": 0.7267, + "step": 1488 + }, + { + "epoch": 0.7040189125295508, + "grad_norm": 3.0224719047546387, + "learning_rate": 4.8513343047338875e-06, + "loss": 0.6293, + "step": 1489 + }, + { + "epoch": 0.7044917257683215, + "grad_norm": 2.5758471488952637, + "learning_rate": 4.851122317299203e-06, + "loss": 0.5855, + "step": 1490 + }, + { + "epoch": 0.7049645390070922, + "grad_norm": 2.579272508621216, + "learning_rate": 4.850910183470441e-06, + "loss": 0.582, + "step": 1491 + }, + { + "epoch": 0.7054373522458629, + "grad_norm": 2.8148300647735596, + "learning_rate": 4.85069790326081e-06, + "loss": 0.6396, + "step": 1492 + }, + { + "epoch": 
0.7059101654846336, + "grad_norm": 2.6380527019500732, + "learning_rate": 4.850485476683528e-06, + "loss": 0.6114, + "step": 1493 + }, + { + "epoch": 0.7063829787234043, + "grad_norm": 2.7736263275146484, + "learning_rate": 4.850272903751823e-06, + "loss": 0.6683, + "step": 1494 + }, + { + "epoch": 0.706855791962175, + "grad_norm": 3.1958179473876953, + "learning_rate": 4.8500601844789285e-06, + "loss": 0.6265, + "step": 1495 + }, + { + "epoch": 0.7073286052009456, + "grad_norm": 3.783212423324585, + "learning_rate": 4.8498473188780916e-06, + "loss": 0.6078, + "step": 1496 + }, + { + "epoch": 0.7078014184397163, + "grad_norm": 2.6656646728515625, + "learning_rate": 4.849634306962566e-06, + "loss": 0.5756, + "step": 1497 + }, + { + "epoch": 0.708274231678487, + "grad_norm": 2.757141590118408, + "learning_rate": 4.849421148745615e-06, + "loss": 0.5596, + "step": 1498 + }, + { + "epoch": 0.7087470449172577, + "grad_norm": 3.0391886234283447, + "learning_rate": 4.849207844240511e-06, + "loss": 0.5293, + "step": 1499 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 2.981912851333618, + "learning_rate": 4.848994393460535e-06, + "loss": 0.598, + "step": 1500 + }, + { + "epoch": 0.7096926713947991, + "grad_norm": 2.5470798015594482, + "learning_rate": 4.848780796418978e-06, + "loss": 0.6266, + "step": 1501 + }, + { + "epoch": 0.7101654846335698, + "grad_norm": 2.8394415378570557, + "learning_rate": 4.8485670531291415e-06, + "loss": 0.6844, + "step": 1502 + }, + { + "epoch": 0.7106382978723405, + "grad_norm": 3.2023508548736572, + "learning_rate": 4.848353163604331e-06, + "loss": 0.6134, + "step": 1503 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 2.98245906829834, + "learning_rate": 4.848139127857867e-06, + "loss": 0.7084, + "step": 1504 + }, + { + "epoch": 0.7115839243498818, + "grad_norm": 2.5917441844940186, + "learning_rate": 4.847924945903076e-06, + "loss": 0.5676, + "step": 1505 + }, + { + "epoch": 0.7120567375886525, + "grad_norm": 2.8736681938171387, + "learning_rate": 4.847710617753294e-06, + "loss": 0.6304, + "step": 1506 + }, + { + "epoch": 0.7125295508274232, + "grad_norm": 2.7832682132720947, + "learning_rate": 4.847496143421866e-06, + "loss": 0.5705, + "step": 1507 + }, + { + "epoch": 0.7130023640661939, + "grad_norm": 2.480560779571533, + "learning_rate": 4.847281522922147e-06, + "loss": 0.5595, + "step": 1508 + }, + { + "epoch": 0.7134751773049646, + "grad_norm": 2.357675313949585, + "learning_rate": 4.847066756267499e-06, + "loss": 0.5065, + "step": 1509 + }, + { + "epoch": 0.7139479905437353, + "grad_norm": 2.632669448852539, + "learning_rate": 4.846851843471296e-06, + "loss": 0.6949, + "step": 1510 + }, + { + "epoch": 0.7144208037825059, + "grad_norm": 2.7691073417663574, + "learning_rate": 4.84663678454692e-06, + "loss": 0.6638, + "step": 1511 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 2.5647685527801514, + "learning_rate": 4.846421579507761e-06, + "loss": 0.6098, + "step": 1512 + }, + { + "epoch": 0.7153664302600473, + "grad_norm": 2.476701021194458, + "learning_rate": 4.846206228367218e-06, + "loss": 0.592, + "step": 1513 + }, + { + "epoch": 0.715839243498818, + "grad_norm": 2.805727958679199, + "learning_rate": 4.845990731138702e-06, + "loss": 0.5466, + "step": 1514 + }, + { + "epoch": 0.7163120567375887, + "grad_norm": 2.551392078399658, + "learning_rate": 4.84577508783563e-06, + "loss": 0.6039, + "step": 1515 + }, + { + "epoch": 0.7167848699763594, + "grad_norm": 2.6861350536346436, + "learning_rate": 4.845559298471429e-06, + "loss": 0.6427, + 
"step": 1516 + }, + { + "epoch": 0.7172576832151301, + "grad_norm": 3.1908371448516846, + "learning_rate": 4.845343363059535e-06, + "loss": 0.5447, + "step": 1517 + }, + { + "epoch": 0.7177304964539007, + "grad_norm": 2.9021761417388916, + "learning_rate": 4.845127281613394e-06, + "loss": 0.5836, + "step": 1518 + }, + { + "epoch": 0.7182033096926714, + "grad_norm": 2.476670742034912, + "learning_rate": 4.844911054146461e-06, + "loss": 0.5863, + "step": 1519 + }, + { + "epoch": 0.7186761229314421, + "grad_norm": 2.662935495376587, + "learning_rate": 4.844694680672198e-06, + "loss": 0.5678, + "step": 1520 + }, + { + "epoch": 0.7191489361702128, + "grad_norm": 2.677896738052368, + "learning_rate": 4.844478161204079e-06, + "loss": 0.6195, + "step": 1521 + }, + { + "epoch": 0.7196217494089835, + "grad_norm": 2.781921863555908, + "learning_rate": 4.844261495755585e-06, + "loss": 0.643, + "step": 1522 + }, + { + "epoch": 0.7200945626477542, + "grad_norm": 3.0157392024993896, + "learning_rate": 4.844044684340206e-06, + "loss": 0.7559, + "step": 1523 + }, + { + "epoch": 0.7205673758865249, + "grad_norm": 2.8109354972839355, + "learning_rate": 4.843827726971444e-06, + "loss": 0.6264, + "step": 1524 + }, + { + "epoch": 0.7210401891252955, + "grad_norm": 3.0953569412231445, + "learning_rate": 4.8436106236628064e-06, + "loss": 0.6429, + "step": 1525 + }, + { + "epoch": 0.7215130023640662, + "grad_norm": 2.6850643157958984, + "learning_rate": 4.843393374427812e-06, + "loss": 0.6598, + "step": 1526 + }, + { + "epoch": 0.7219858156028369, + "grad_norm": 3.043480634689331, + "learning_rate": 4.8431759792799874e-06, + "loss": 0.6331, + "step": 1527 + }, + { + "epoch": 0.7224586288416076, + "grad_norm": 2.723870038986206, + "learning_rate": 4.842958438232868e-06, + "loss": 0.6259, + "step": 1528 + }, + { + "epoch": 0.7229314420803783, + "grad_norm": 2.822492837905884, + "learning_rate": 4.842740751300002e-06, + "loss": 0.6554, + "step": 1529 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.7866315841674805, + "learning_rate": 4.842522918494941e-06, + "loss": 0.6991, + "step": 1530 + }, + { + "epoch": 0.7238770685579197, + "grad_norm": 2.8881826400756836, + "learning_rate": 4.84230493983125e-06, + "loss": 0.5876, + "step": 1531 + }, + { + "epoch": 0.7243498817966904, + "grad_norm": 2.7456939220428467, + "learning_rate": 4.8420868153225e-06, + "loss": 0.6188, + "step": 1532 + }, + { + "epoch": 0.724822695035461, + "grad_norm": 3.0257532596588135, + "learning_rate": 4.841868544982274e-06, + "loss": 0.63, + "step": 1533 + }, + { + "epoch": 0.7252955082742317, + "grad_norm": 3.1581954956054688, + "learning_rate": 4.841650128824164e-06, + "loss": 0.7214, + "step": 1534 + }, + { + "epoch": 0.7257683215130024, + "grad_norm": 2.9174306392669678, + "learning_rate": 4.841431566861767e-06, + "loss": 0.704, + "step": 1535 + }, + { + "epoch": 0.7262411347517731, + "grad_norm": 2.5019054412841797, + "learning_rate": 4.8412128591086935e-06, + "loss": 0.6298, + "step": 1536 + }, + { + "epoch": 0.7267139479905438, + "grad_norm": 2.724285125732422, + "learning_rate": 4.840994005578562e-06, + "loss": 0.6289, + "step": 1537 + }, + { + "epoch": 0.7271867612293145, + "grad_norm": 2.5882341861724854, + "learning_rate": 4.840775006284998e-06, + "loss": 0.6355, + "step": 1538 + }, + { + "epoch": 0.7276595744680852, + "grad_norm": 3.1281991004943848, + "learning_rate": 4.840555861241638e-06, + "loss": 0.5551, + "step": 1539 + }, + { + "epoch": 0.7281323877068558, + "grad_norm": 2.6064817905426025, + "learning_rate": 
4.840336570462127e-06, + "loss": 0.5543, + "step": 1540 + }, + { + "epoch": 0.7286052009456265, + "grad_norm": 2.67112398147583, + "learning_rate": 4.840117133960122e-06, + "loss": 0.6044, + "step": 1541 + }, + { + "epoch": 0.7290780141843972, + "grad_norm": 2.838022232055664, + "learning_rate": 4.839897551749282e-06, + "loss": 0.6814, + "step": 1542 + }, + { + "epoch": 0.7295508274231679, + "grad_norm": 2.8897151947021484, + "learning_rate": 4.839677823843283e-06, + "loss": 0.593, + "step": 1543 + }, + { + "epoch": 0.7300236406619386, + "grad_norm": 2.9238014221191406, + "learning_rate": 4.839457950255805e-06, + "loss": 0.5544, + "step": 1544 + }, + { + "epoch": 0.7304964539007093, + "grad_norm": 3.016876459121704, + "learning_rate": 4.839237931000538e-06, + "loss": 0.6099, + "step": 1545 + }, + { + "epoch": 0.7309692671394799, + "grad_norm": 2.9415392875671387, + "learning_rate": 4.839017766091182e-06, + "loss": 0.6413, + "step": 1546 + }, + { + "epoch": 0.7314420803782505, + "grad_norm": 2.658067226409912, + "learning_rate": 4.838797455541446e-06, + "loss": 0.6534, + "step": 1547 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 2.460358142852783, + "learning_rate": 4.838576999365049e-06, + "loss": 0.5307, + "step": 1548 + }, + { + "epoch": 0.7323877068557919, + "grad_norm": 2.5818674564361572, + "learning_rate": 4.838356397575716e-06, + "loss": 0.6265, + "step": 1549 + }, + { + "epoch": 0.7328605200945626, + "grad_norm": 3.009197473526001, + "learning_rate": 4.838135650187183e-06, + "loss": 0.6957, + "step": 1550 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 2.738543748855591, + "learning_rate": 4.837914757213196e-06, + "loss": 0.646, + "step": 1551 + }, + { + "epoch": 0.733806146572104, + "grad_norm": 2.8208494186401367, + "learning_rate": 4.837693718667508e-06, + "loss": 0.5936, + "step": 1552 + }, + { + "epoch": 0.7342789598108747, + "grad_norm": 3.1574649810791016, + "learning_rate": 4.837472534563883e-06, + "loss": 0.6455, + "step": 1553 + }, + { + "epoch": 0.7347517730496453, + "grad_norm": 2.6737420558929443, + "learning_rate": 4.837251204916093e-06, + "loss": 0.5921, + "step": 1554 + }, + { + "epoch": 0.735224586288416, + "grad_norm": 2.424983024597168, + "learning_rate": 4.837029729737918e-06, + "loss": 0.6346, + "step": 1555 + }, + { + "epoch": 0.7356973995271867, + "grad_norm": 2.5163493156433105, + "learning_rate": 4.836808109043151e-06, + "loss": 0.6061, + "step": 1556 + }, + { + "epoch": 0.7361702127659574, + "grad_norm": 2.8377044200897217, + "learning_rate": 4.836586342845588e-06, + "loss": 0.611, + "step": 1557 + }, + { + "epoch": 0.7366430260047281, + "grad_norm": 2.5929181575775146, + "learning_rate": 4.83636443115904e-06, + "loss": 0.5496, + "step": 1558 + }, + { + "epoch": 0.7371158392434988, + "grad_norm": 2.5017223358154297, + "learning_rate": 4.836142373997323e-06, + "loss": 0.6235, + "step": 1559 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 2.822500228881836, + "learning_rate": 4.835920171374265e-06, + "loss": 0.6147, + "step": 1560 + }, + { + "epoch": 0.7380614657210401, + "grad_norm": 2.7234230041503906, + "learning_rate": 4.8356978233037e-06, + "loss": 0.6228, + "step": 1561 + }, + { + "epoch": 0.7385342789598108, + "grad_norm": 2.9565515518188477, + "learning_rate": 4.835475329799472e-06, + "loss": 0.5728, + "step": 1562 + }, + { + "epoch": 0.7390070921985815, + "grad_norm": 2.4356038570404053, + "learning_rate": 4.835252690875438e-06, + "loss": 0.6723, + "step": 1563 + }, + { + "epoch": 0.7394799054373522, + "grad_norm": 
2.765913248062134, + "learning_rate": 4.835029906545458e-06, + "loss": 0.5805, + "step": 1564 + }, + { + "epoch": 0.7399527186761229, + "grad_norm": 2.4481914043426514, + "learning_rate": 4.834806976823405e-06, + "loss": 0.599, + "step": 1565 + }, + { + "epoch": 0.7404255319148936, + "grad_norm": 2.620779514312744, + "learning_rate": 4.834583901723158e-06, + "loss": 0.63, + "step": 1566 + }, + { + "epoch": 0.7408983451536643, + "grad_norm": 2.654426097869873, + "learning_rate": 4.83436068125861e-06, + "loss": 0.6544, + "step": 1567 + }, + { + "epoch": 0.741371158392435, + "grad_norm": 2.589623212814331, + "learning_rate": 4.834137315443656e-06, + "loss": 0.5596, + "step": 1568 + }, + { + "epoch": 0.7418439716312056, + "grad_norm": 2.572883129119873, + "learning_rate": 4.833913804292209e-06, + "loss": 0.5974, + "step": 1569 + }, + { + "epoch": 0.7423167848699763, + "grad_norm": 2.8744914531707764, + "learning_rate": 4.833690147818181e-06, + "loss": 0.5364, + "step": 1570 + }, + { + "epoch": 0.742789598108747, + "grad_norm": 2.9800851345062256, + "learning_rate": 4.833466346035502e-06, + "loss": 0.6287, + "step": 1571 + }, + { + "epoch": 0.7432624113475177, + "grad_norm": 2.627784490585327, + "learning_rate": 4.833242398958105e-06, + "loss": 0.621, + "step": 1572 + }, + { + "epoch": 0.7437352245862884, + "grad_norm": 2.5187721252441406, + "learning_rate": 4.833018306599933e-06, + "loss": 0.5901, + "step": 1573 + }, + { + "epoch": 0.7442080378250591, + "grad_norm": 2.4843688011169434, + "learning_rate": 4.832794068974944e-06, + "loss": 0.6336, + "step": 1574 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 2.774911880493164, + "learning_rate": 4.832569686097096e-06, + "loss": 0.6091, + "step": 1575 + }, + { + "epoch": 0.7451536643026004, + "grad_norm": 3.2562527656555176, + "learning_rate": 4.8323451579803615e-06, + "loss": 0.7686, + "step": 1576 + }, + { + "epoch": 0.7456264775413711, + "grad_norm": 2.799570083618164, + "learning_rate": 4.832120484638721e-06, + "loss": 0.6233, + "step": 1577 + }, + { + "epoch": 0.7460992907801418, + "grad_norm": 2.661893367767334, + "learning_rate": 4.831895666086164e-06, + "loss": 0.5841, + "step": 1578 + }, + { + "epoch": 0.7465721040189125, + "grad_norm": 3.0382652282714844, + "learning_rate": 4.831670702336689e-06, + "loss": 0.5769, + "step": 1579 + }, + { + "epoch": 0.7470449172576832, + "grad_norm": 2.676398515701294, + "learning_rate": 4.831445593404304e-06, + "loss": 0.619, + "step": 1580 + }, + { + "epoch": 0.7475177304964539, + "grad_norm": 2.717916965484619, + "learning_rate": 4.831220339303024e-06, + "loss": 0.5787, + "step": 1581 + }, + { + "epoch": 0.7479905437352246, + "grad_norm": 2.3918066024780273, + "learning_rate": 4.830994940046876e-06, + "loss": 0.5108, + "step": 1582 + }, + { + "epoch": 0.7484633569739952, + "grad_norm": 2.709144115447998, + "learning_rate": 4.830769395649895e-06, + "loss": 0.6875, + "step": 1583 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 2.8711116313934326, + "learning_rate": 4.830543706126123e-06, + "loss": 0.6745, + "step": 1584 + }, + { + "epoch": 0.7494089834515366, + "grad_norm": 2.612339496612549, + "learning_rate": 4.830317871489614e-06, + "loss": 0.5738, + "step": 1585 + }, + { + "epoch": 0.7498817966903073, + "grad_norm": 2.4355857372283936, + "learning_rate": 4.830091891754429e-06, + "loss": 0.5907, + "step": 1586 + }, + { + "epoch": 0.750354609929078, + "grad_norm": 2.676051378250122, + "learning_rate": 4.829865766934638e-06, + "loss": 0.6628, + "step": 1587 + }, + { + "epoch": 
0.7508274231678487, + "grad_norm": 2.66489839553833, + "learning_rate": 4.829639497044323e-06, + "loss": 0.5984, + "step": 1588 + }, + { + "epoch": 0.7513002364066194, + "grad_norm": 2.5358035564422607, + "learning_rate": 4.829413082097572e-06, + "loss": 0.5867, + "step": 1589 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 2.6530144214630127, + "learning_rate": 4.8291865221084815e-06, + "loss": 0.5917, + "step": 1590 + }, + { + "epoch": 0.7522458628841607, + "grad_norm": 2.5160958766937256, + "learning_rate": 4.82895981709116e-06, + "loss": 0.6347, + "step": 1591 + }, + { + "epoch": 0.7527186761229314, + "grad_norm": 2.61592698097229, + "learning_rate": 4.8287329670597225e-06, + "loss": 0.5472, + "step": 1592 + }, + { + "epoch": 0.7531914893617021, + "grad_norm": 2.7528622150421143, + "learning_rate": 4.828505972028296e-06, + "loss": 0.5842, + "step": 1593 + }, + { + "epoch": 0.7536643026004728, + "grad_norm": 2.8154072761535645, + "learning_rate": 4.828278832011011e-06, + "loss": 0.5757, + "step": 1594 + }, + { + "epoch": 0.7541371158392435, + "grad_norm": 3.118515729904175, + "learning_rate": 4.828051547022013e-06, + "loss": 0.6472, + "step": 1595 + }, + { + "epoch": 0.7546099290780142, + "grad_norm": 2.452033758163452, + "learning_rate": 4.827824117075453e-06, + "loss": 0.5571, + "step": 1596 + }, + { + "epoch": 0.7550827423167848, + "grad_norm": 2.984388828277588, + "learning_rate": 4.827596542185492e-06, + "loss": 0.6656, + "step": 1597 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 2.61356782913208, + "learning_rate": 4.8273688223663014e-06, + "loss": 0.6444, + "step": 1598 + }, + { + "epoch": 0.7560283687943262, + "grad_norm": 2.8967196941375732, + "learning_rate": 4.8271409576320595e-06, + "loss": 0.6457, + "step": 1599 + }, + { + "epoch": 0.7565011820330969, + "grad_norm": 2.852367639541626, + "learning_rate": 4.826912947996954e-06, + "loss": 0.5629, + "step": 1600 + }, + { + "epoch": 0.7569739952718676, + "grad_norm": 2.905280590057373, + "learning_rate": 4.826684793475182e-06, + "loss": 0.6245, + "step": 1601 + }, + { + "epoch": 0.7574468085106383, + "grad_norm": 2.6156530380249023, + "learning_rate": 4.826456494080951e-06, + "loss": 0.5869, + "step": 1602 + }, + { + "epoch": 0.757919621749409, + "grad_norm": 2.6490228176116943, + "learning_rate": 4.826228049828475e-06, + "loss": 0.5461, + "step": 1603 + }, + { + "epoch": 0.7583924349881797, + "grad_norm": 2.9626693725585938, + "learning_rate": 4.825999460731978e-06, + "loss": 0.6842, + "step": 1604 + }, + { + "epoch": 0.7588652482269503, + "grad_norm": 2.6866023540496826, + "learning_rate": 4.825770726805695e-06, + "loss": 0.5726, + "step": 1605 + }, + { + "epoch": 0.759338061465721, + "grad_norm": 2.5525858402252197, + "learning_rate": 4.825541848063866e-06, + "loss": 0.6061, + "step": 1606 + }, + { + "epoch": 0.7598108747044917, + "grad_norm": 2.703977584838867, + "learning_rate": 4.825312824520743e-06, + "loss": 0.6726, + "step": 1607 + }, + { + "epoch": 0.7602836879432624, + "grad_norm": 2.856534957885742, + "learning_rate": 4.825083656190588e-06, + "loss": 0.625, + "step": 1608 + }, + { + "epoch": 0.7607565011820331, + "grad_norm": 2.8564887046813965, + "learning_rate": 4.824854343087668e-06, + "loss": 0.7251, + "step": 1609 + }, + { + "epoch": 0.7612293144208038, + "grad_norm": 2.327650308609009, + "learning_rate": 4.824624885226262e-06, + "loss": 0.526, + "step": 1610 + }, + { + "epoch": 0.7617021276595745, + "grad_norm": 3.0025737285614014, + "learning_rate": 4.824395282620659e-06, + "loss": 0.6043, + 
"step": 1611 + }, + { + "epoch": 0.7621749408983451, + "grad_norm": 2.5441737174987793, + "learning_rate": 4.824165535285152e-06, + "loss": 0.6276, + "step": 1612 + }, + { + "epoch": 0.7626477541371158, + "grad_norm": 2.4177372455596924, + "learning_rate": 4.823935643234049e-06, + "loss": 0.6419, + "step": 1613 + }, + { + "epoch": 0.7631205673758865, + "grad_norm": 2.9210550785064697, + "learning_rate": 4.823705606481664e-06, + "loss": 0.5663, + "step": 1614 + }, + { + "epoch": 0.7635933806146572, + "grad_norm": 2.6353724002838135, + "learning_rate": 4.82347542504232e-06, + "loss": 0.5669, + "step": 1615 + }, + { + "epoch": 0.7640661938534279, + "grad_norm": 2.419081926345825, + "learning_rate": 4.823245098930349e-06, + "loss": 0.5777, + "step": 1616 + }, + { + "epoch": 0.7645390070921986, + "grad_norm": 2.5077571868896484, + "learning_rate": 4.823014628160093e-06, + "loss": 0.5924, + "step": 1617 + }, + { + "epoch": 0.7650118203309693, + "grad_norm": 2.816056251525879, + "learning_rate": 4.822784012745902e-06, + "loss": 0.7273, + "step": 1618 + }, + { + "epoch": 0.76548463356974, + "grad_norm": 2.7163147926330566, + "learning_rate": 4.8225532527021366e-06, + "loss": 0.5545, + "step": 1619 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 2.4784302711486816, + "learning_rate": 4.822322348043164e-06, + "loss": 0.556, + "step": 1620 + }, + { + "epoch": 0.7664302600472813, + "grad_norm": 2.712467670440674, + "learning_rate": 4.822091298783361e-06, + "loss": 0.6501, + "step": 1621 + }, + { + "epoch": 0.766903073286052, + "grad_norm": 2.7217724323272705, + "learning_rate": 4.821860104937115e-06, + "loss": 0.5989, + "step": 1622 + }, + { + "epoch": 0.7673758865248227, + "grad_norm": 2.5622854232788086, + "learning_rate": 4.821628766518821e-06, + "loss": 0.5263, + "step": 1623 + }, + { + "epoch": 0.7678486997635934, + "grad_norm": 3.230923891067505, + "learning_rate": 4.821397283542884e-06, + "loss": 0.6707, + "step": 1624 + }, + { + "epoch": 0.7683215130023641, + "grad_norm": 2.37929105758667, + "learning_rate": 4.821165656023718e-06, + "loss": 0.6124, + "step": 1625 + }, + { + "epoch": 0.7687943262411348, + "grad_norm": 2.9811325073242188, + "learning_rate": 4.820933883975745e-06, + "loss": 0.6435, + "step": 1626 + }, + { + "epoch": 0.7692671394799054, + "grad_norm": 2.887380838394165, + "learning_rate": 4.820701967413395e-06, + "loss": 0.621, + "step": 1627 + }, + { + "epoch": 0.7697399527186761, + "grad_norm": 2.6762876510620117, + "learning_rate": 4.820469906351109e-06, + "loss": 0.5713, + "step": 1628 + }, + { + "epoch": 0.7702127659574468, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.820237700803337e-06, + "loss": 0.6136, + "step": 1629 + }, + { + "epoch": 0.7706855791962175, + "grad_norm": 2.7244746685028076, + "learning_rate": 4.820005350784539e-06, + "loss": 0.5816, + "step": 1630 + }, + { + "epoch": 0.7711583924349882, + "grad_norm": 2.9293999671936035, + "learning_rate": 4.8197728563091795e-06, + "loss": 0.6649, + "step": 1631 + }, + { + "epoch": 0.7716312056737589, + "grad_norm": 2.4402127265930176, + "learning_rate": 4.819540217391736e-06, + "loss": 0.6481, + "step": 1632 + }, + { + "epoch": 0.7721040189125296, + "grad_norm": 3.083941698074341, + "learning_rate": 4.819307434046694e-06, + "loss": 0.6951, + "step": 1633 + }, + { + "epoch": 0.7725768321513002, + "grad_norm": 2.544952392578125, + "learning_rate": 4.819074506288548e-06, + "loss": 0.539, + "step": 1634 + }, + { + "epoch": 0.7730496453900709, + "grad_norm": 2.7791268825531006, + "learning_rate": 
4.818841434131801e-06, + "loss": 0.5827, + "step": 1635 + }, + { + "epoch": 0.7735224586288416, + "grad_norm": 2.7349796295166016, + "learning_rate": 4.818608217590967e-06, + "loss": 0.5584, + "step": 1636 + }, + { + "epoch": 0.7739952718676123, + "grad_norm": 2.637652635574341, + "learning_rate": 4.818374856680565e-06, + "loss": 0.6386, + "step": 1637 + }, + { + "epoch": 0.774468085106383, + "grad_norm": 2.9821584224700928, + "learning_rate": 4.818141351415127e-06, + "loss": 0.6734, + "step": 1638 + }, + { + "epoch": 0.7749408983451537, + "grad_norm": 2.992938995361328, + "learning_rate": 4.817907701809192e-06, + "loss": 0.5899, + "step": 1639 + }, + { + "epoch": 0.7754137115839244, + "grad_norm": 4.35719633102417, + "learning_rate": 4.8176739078773076e-06, + "loss": 0.6281, + "step": 1640 + }, + { + "epoch": 0.775886524822695, + "grad_norm": 2.838146209716797, + "learning_rate": 4.8174399696340315e-06, + "loss": 0.5766, + "step": 1641 + }, + { + "epoch": 0.7763593380614657, + "grad_norm": 3.3116989135742188, + "learning_rate": 4.81720588709393e-06, + "loss": 0.6409, + "step": 1642 + }, + { + "epoch": 0.7768321513002364, + "grad_norm": 2.9843590259552, + "learning_rate": 4.816971660271579e-06, + "loss": 0.6108, + "step": 1643 + }, + { + "epoch": 0.7773049645390071, + "grad_norm": 2.843770742416382, + "learning_rate": 4.816737289181562e-06, + "loss": 0.6053, + "step": 1644 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 2.7608556747436523, + "learning_rate": 4.816502773838473e-06, + "loss": 0.5854, + "step": 1645 + }, + { + "epoch": 0.7782505910165485, + "grad_norm": 3.343682289123535, + "learning_rate": 4.816268114256914e-06, + "loss": 0.6329, + "step": 1646 + }, + { + "epoch": 0.7787234042553192, + "grad_norm": 2.769768476486206, + "learning_rate": 4.816033310451496e-06, + "loss": 0.6242, + "step": 1647 + }, + { + "epoch": 0.7791962174940898, + "grad_norm": 2.989851713180542, + "learning_rate": 4.815798362436838e-06, + "loss": 0.6493, + "step": 1648 + }, + { + "epoch": 0.7796690307328605, + "grad_norm": 3.170736312866211, + "learning_rate": 4.8155632702275716e-06, + "loss": 0.6341, + "step": 1649 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 2.7372522354125977, + "learning_rate": 4.815328033838334e-06, + "loss": 0.5445, + "step": 1650 + }, + { + "epoch": 0.7806146572104019, + "grad_norm": 2.6947238445281982, + "learning_rate": 4.8150926532837715e-06, + "loss": 0.6437, + "step": 1651 + }, + { + "epoch": 0.7810874704491726, + "grad_norm": 2.472323179244995, + "learning_rate": 4.81485712857854e-06, + "loss": 0.5751, + "step": 1652 + }, + { + "epoch": 0.7815602836879433, + "grad_norm": 2.791114091873169, + "learning_rate": 4.814621459737308e-06, + "loss": 0.5996, + "step": 1653 + }, + { + "epoch": 0.782033096926714, + "grad_norm": 3.1957521438598633, + "learning_rate": 4.814385646774745e-06, + "loss": 0.5803, + "step": 1654 + }, + { + "epoch": 0.7825059101654847, + "grad_norm": 2.4120798110961914, + "learning_rate": 4.8141496897055364e-06, + "loss": 0.5814, + "step": 1655 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 2.9262423515319824, + "learning_rate": 4.813913588544374e-06, + "loss": 0.6292, + "step": 1656 + }, + { + "epoch": 0.783451536643026, + "grad_norm": 2.8251047134399414, + "learning_rate": 4.813677343305959e-06, + "loss": 0.6787, + "step": 1657 + }, + { + "epoch": 0.7839243498817967, + "grad_norm": 2.931659698486328, + "learning_rate": 4.8134409540050005e-06, + "loss": 0.6163, + "step": 1658 + }, + { + "epoch": 0.7843971631205674, + "grad_norm": 
2.7160706520080566, + "learning_rate": 4.813204420656219e-06, + "loss": 0.6831, + "step": 1659 + }, + { + "epoch": 0.7848699763593381, + "grad_norm": 3.2134454250335693, + "learning_rate": 4.81296774327434e-06, + "loss": 0.6002, + "step": 1660 + }, + { + "epoch": 0.7853427895981088, + "grad_norm": 2.4002513885498047, + "learning_rate": 4.812730921874103e-06, + "loss": 0.5488, + "step": 1661 + }, + { + "epoch": 0.7858156028368795, + "grad_norm": 2.5559282302856445, + "learning_rate": 4.812493956470251e-06, + "loss": 0.5802, + "step": 1662 + }, + { + "epoch": 0.7862884160756501, + "grad_norm": 2.57478404045105, + "learning_rate": 4.812256847077541e-06, + "loss": 0.646, + "step": 1663 + }, + { + "epoch": 0.7867612293144208, + "grad_norm": 2.811851978302002, + "learning_rate": 4.812019593710736e-06, + "loss": 0.6245, + "step": 1664 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 2.5228829383850098, + "learning_rate": 4.811782196384609e-06, + "loss": 0.5949, + "step": 1665 + }, + { + "epoch": 0.7877068557919622, + "grad_norm": 2.744096040725708, + "learning_rate": 4.8115446551139415e-06, + "loss": 0.6006, + "step": 1666 + }, + { + "epoch": 0.7881796690307329, + "grad_norm": 3.129242420196533, + "learning_rate": 4.811306969913524e-06, + "loss": 0.7251, + "step": 1667 + }, + { + "epoch": 0.7886524822695036, + "grad_norm": 2.7855660915374756, + "learning_rate": 4.811069140798156e-06, + "loss": 0.6534, + "step": 1668 + }, + { + "epoch": 0.7891252955082743, + "grad_norm": 2.836603879928589, + "learning_rate": 4.810831167782647e-06, + "loss": 0.6661, + "step": 1669 + }, + { + "epoch": 0.789598108747045, + "grad_norm": 2.5339887142181396, + "learning_rate": 4.810593050881813e-06, + "loss": 0.5354, + "step": 1670 + }, + { + "epoch": 0.7900709219858156, + "grad_norm": 2.9553709030151367, + "learning_rate": 4.810354790110482e-06, + "loss": 0.6001, + "step": 1671 + }, + { + "epoch": 0.7905437352245863, + "grad_norm": 2.6581788063049316, + "learning_rate": 4.8101163854834885e-06, + "loss": 0.6802, + "step": 1672 + }, + { + "epoch": 0.791016548463357, + "grad_norm": 3.2002551555633545, + "learning_rate": 4.809877837015677e-06, + "loss": 0.6641, + "step": 1673 + }, + { + "epoch": 0.7914893617021277, + "grad_norm": 2.918792963027954, + "learning_rate": 4.809639144721902e-06, + "loss": 0.6758, + "step": 1674 + }, + { + "epoch": 0.7919621749408984, + "grad_norm": 2.7993946075439453, + "learning_rate": 4.8094003086170245e-06, + "loss": 0.5889, + "step": 1675 + }, + { + "epoch": 0.7924349881796691, + "grad_norm": 2.3698952198028564, + "learning_rate": 4.809161328715916e-06, + "loss": 0.6244, + "step": 1676 + }, + { + "epoch": 0.7929078014184398, + "grad_norm": 2.8891594409942627, + "learning_rate": 4.808922205033458e-06, + "loss": 0.5835, + "step": 1677 + }, + { + "epoch": 0.7933806146572104, + "grad_norm": 2.838345766067505, + "learning_rate": 4.808682937584537e-06, + "loss": 0.6907, + "step": 1678 + }, + { + "epoch": 0.7938534278959811, + "grad_norm": 2.8443174362182617, + "learning_rate": 4.808443526384053e-06, + "loss": 0.6692, + "step": 1679 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 2.7355034351348877, + "learning_rate": 4.808203971446913e-06, + "loss": 0.5799, + "step": 1680 + }, + { + "epoch": 0.7947990543735225, + "grad_norm": 2.7108020782470703, + "learning_rate": 4.807964272788033e-06, + "loss": 0.652, + "step": 1681 + }, + { + "epoch": 0.7952718676122932, + "grad_norm": 2.397650957107544, + "learning_rate": 4.807724430422338e-06, + "loss": 0.5418, + "step": 1682 + }, + { + "epoch": 
0.7957446808510639, + "grad_norm": 2.4981582164764404, + "learning_rate": 4.807484444364762e-06, + "loss": 0.5731, + "step": 1683 + }, + { + "epoch": 0.7962174940898346, + "grad_norm": 2.7943713665008545, + "learning_rate": 4.8072443146302475e-06, + "loss": 0.5913, + "step": 1684 + }, + { + "epoch": 0.7966903073286052, + "grad_norm": 2.5691423416137695, + "learning_rate": 4.807004041233746e-06, + "loss": 0.6475, + "step": 1685 + }, + { + "epoch": 0.7971631205673759, + "grad_norm": 3.2367498874664307, + "learning_rate": 4.8067636241902195e-06, + "loss": 0.675, + "step": 1686 + }, + { + "epoch": 0.7976359338061466, + "grad_norm": 3.000595808029175, + "learning_rate": 4.806523063514637e-06, + "loss": 0.5481, + "step": 1687 + }, + { + "epoch": 0.7981087470449173, + "grad_norm": 2.702014207839966, + "learning_rate": 4.806282359221976e-06, + "loss": 0.5993, + "step": 1688 + }, + { + "epoch": 0.798581560283688, + "grad_norm": 2.383671998977661, + "learning_rate": 4.806041511327226e-06, + "loss": 0.562, + "step": 1689 + }, + { + "epoch": 0.7990543735224587, + "grad_norm": 2.6965041160583496, + "learning_rate": 4.8058005198453834e-06, + "loss": 0.5955, + "step": 1690 + }, + { + "epoch": 0.7995271867612294, + "grad_norm": 2.5906765460968018, + "learning_rate": 4.805559384791453e-06, + "loss": 0.5151, + "step": 1691 + }, + { + "epoch": 0.8, + "grad_norm": 2.5454652309417725, + "learning_rate": 4.8053181061804475e-06, + "loss": 0.5843, + "step": 1692 + }, + { + "epoch": 0.8004728132387707, + "grad_norm": 2.661343812942505, + "learning_rate": 4.8050766840273935e-06, + "loss": 0.5995, + "step": 1693 + }, + { + "epoch": 0.8009456264775414, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.8048351183473215e-06, + "loss": 0.5676, + "step": 1694 + }, + { + "epoch": 0.8014184397163121, + "grad_norm": 2.5936667919158936, + "learning_rate": 4.804593409155274e-06, + "loss": 0.6291, + "step": 1695 + }, + { + "epoch": 0.8018912529550828, + "grad_norm": 2.6902432441711426, + "learning_rate": 4.804351556466299e-06, + "loss": 0.6114, + "step": 1696 + }, + { + "epoch": 0.8023640661938535, + "grad_norm": 2.7764673233032227, + "learning_rate": 4.804109560295457e-06, + "loss": 0.5768, + "step": 1697 + }, + { + "epoch": 0.8028368794326242, + "grad_norm": 2.9587221145629883, + "learning_rate": 4.803867420657816e-06, + "loss": 0.6048, + "step": 1698 + }, + { + "epoch": 0.8033096926713948, + "grad_norm": 2.9238998889923096, + "learning_rate": 4.803625137568453e-06, + "loss": 0.6329, + "step": 1699 + }, + { + "epoch": 0.8037825059101655, + "grad_norm": 2.70473313331604, + "learning_rate": 4.803382711042455e-06, + "loss": 0.5427, + "step": 1700 + }, + { + "epoch": 0.8042553191489362, + "grad_norm": 3.1604979038238525, + "learning_rate": 4.803140141094914e-06, + "loss": 0.626, + "step": 1701 + }, + { + "epoch": 0.8047281323877069, + "grad_norm": 2.9567699432373047, + "learning_rate": 4.802897427740936e-06, + "loss": 0.5319, + "step": 1702 + }, + { + "epoch": 0.8052009456264776, + "grad_norm": 2.90983247756958, + "learning_rate": 4.802654570995632e-06, + "loss": 0.586, + "step": 1703 + }, + { + "epoch": 0.8056737588652483, + "grad_norm": 2.783480167388916, + "learning_rate": 4.8024115708741255e-06, + "loss": 0.5773, + "step": 1704 + }, + { + "epoch": 0.806146572104019, + "grad_norm": 3.3307793140411377, + "learning_rate": 4.802168427391547e-06, + "loss": 0.6257, + "step": 1705 + }, + { + "epoch": 0.8066193853427897, + "grad_norm": 3.0475001335144043, + "learning_rate": 4.801925140563034e-06, + "loss": 0.6612, + "step": 
1706 + }, + { + "epoch": 0.8070921985815603, + "grad_norm": 2.8278894424438477, + "learning_rate": 4.8016817104037375e-06, + "loss": 0.6449, + "step": 1707 + }, + { + "epoch": 0.807565011820331, + "grad_norm": 2.760244369506836, + "learning_rate": 4.801438136928812e-06, + "loss": 0.7007, + "step": 1708 + }, + { + "epoch": 0.8080378250591016, + "grad_norm": 2.827634572982788, + "learning_rate": 4.801194420153427e-06, + "loss": 0.6418, + "step": 1709 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.8655009269714355, + "learning_rate": 4.800950560092754e-06, + "loss": 0.6231, + "step": 1710 + }, + { + "epoch": 0.808983451536643, + "grad_norm": 2.738112688064575, + "learning_rate": 4.800706556761981e-06, + "loss": 0.6463, + "step": 1711 + }, + { + "epoch": 0.8094562647754137, + "grad_norm": 2.4781179428100586, + "learning_rate": 4.800462410176296e-06, + "loss": 0.5365, + "step": 1712 + }, + { + "epoch": 0.8099290780141843, + "grad_norm": 2.6049838066101074, + "learning_rate": 4.800218120350906e-06, + "loss": 0.6035, + "step": 1713 + }, + { + "epoch": 0.810401891252955, + "grad_norm": 2.9089980125427246, + "learning_rate": 4.79997368730102e-06, + "loss": 0.5828, + "step": 1714 + }, + { + "epoch": 0.8108747044917257, + "grad_norm": 2.831871747970581, + "learning_rate": 4.799729111041857e-06, + "loss": 0.5953, + "step": 1715 + }, + { + "epoch": 0.8113475177304964, + "grad_norm": 2.5611300468444824, + "learning_rate": 4.799484391588647e-06, + "loss": 0.6302, + "step": 1716 + }, + { + "epoch": 0.8118203309692671, + "grad_norm": 2.744070053100586, + "learning_rate": 4.799239528956625e-06, + "loss": 0.5561, + "step": 1717 + }, + { + "epoch": 0.8122931442080378, + "grad_norm": 2.7344231605529785, + "learning_rate": 4.798994523161041e-06, + "loss": 0.6317, + "step": 1718 + }, + { + "epoch": 0.8127659574468085, + "grad_norm": 2.3420889377593994, + "learning_rate": 4.798749374217149e-06, + "loss": 0.5415, + "step": 1719 + }, + { + "epoch": 0.8132387706855791, + "grad_norm": 2.57384991645813, + "learning_rate": 4.798504082140212e-06, + "loss": 0.6383, + "step": 1720 + }, + { + "epoch": 0.8137115839243498, + "grad_norm": 2.8819844722747803, + "learning_rate": 4.798258646945505e-06, + "loss": 0.6355, + "step": 1721 + }, + { + "epoch": 0.8141843971631205, + "grad_norm": 2.908123254776001, + "learning_rate": 4.79801306864831e-06, + "loss": 0.701, + "step": 1722 + }, + { + "epoch": 0.8146572104018912, + "grad_norm": 2.6500701904296875, + "learning_rate": 4.797767347263917e-06, + "loss": 0.6152, + "step": 1723 + }, + { + "epoch": 0.8151300236406619, + "grad_norm": 2.5513017177581787, + "learning_rate": 4.797521482807628e-06, + "loss": 0.6241, + "step": 1724 + }, + { + "epoch": 0.8156028368794326, + "grad_norm": 2.6239185333251953, + "learning_rate": 4.7972754752947495e-06, + "loss": 0.6072, + "step": 1725 + }, + { + "epoch": 0.8160756501182033, + "grad_norm": 2.673436403274536, + "learning_rate": 4.797029324740601e-06, + "loss": 0.5802, + "step": 1726 + }, + { + "epoch": 0.816548463356974, + "grad_norm": 2.533831834793091, + "learning_rate": 4.796783031160508e-06, + "loss": 0.5566, + "step": 1727 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 2.9806582927703857, + "learning_rate": 4.796536594569807e-06, + "loss": 0.6945, + "step": 1728 + }, + { + "epoch": 0.8174940898345153, + "grad_norm": 2.7093560695648193, + "learning_rate": 4.796290014983842e-06, + "loss": 0.7143, + "step": 1729 + }, + { + "epoch": 0.817966903073286, + "grad_norm": 2.814507246017456, + "learning_rate": 4.796043292417967e-06, 
+ "loss": 0.6122, + "step": 1730 + }, + { + "epoch": 0.8184397163120567, + "grad_norm": 2.537156820297241, + "learning_rate": 4.795796426887543e-06, + "loss": 0.6229, + "step": 1731 + }, + { + "epoch": 0.8189125295508274, + "grad_norm": 2.4878013134002686, + "learning_rate": 4.795549418407944e-06, + "loss": 0.5442, + "step": 1732 + }, + { + "epoch": 0.8193853427895981, + "grad_norm": 2.839383363723755, + "learning_rate": 4.795302266994548e-06, + "loss": 0.6717, + "step": 1733 + }, + { + "epoch": 0.8198581560283688, + "grad_norm": 3.1981801986694336, + "learning_rate": 4.795054972662744e-06, + "loss": 0.6596, + "step": 1734 + }, + { + "epoch": 0.8203309692671394, + "grad_norm": 2.781730890274048, + "learning_rate": 4.79480753542793e-06, + "loss": 0.5845, + "step": 1735 + }, + { + "epoch": 0.8208037825059101, + "grad_norm": 2.689948558807373, + "learning_rate": 4.794559955305513e-06, + "loss": 0.5928, + "step": 1736 + }, + { + "epoch": 0.8212765957446808, + "grad_norm": 2.7267637252807617, + "learning_rate": 4.7943122323109105e-06, + "loss": 0.5224, + "step": 1737 + }, + { + "epoch": 0.8217494089834515, + "grad_norm": 2.4346601963043213, + "learning_rate": 4.794064366459544e-06, + "loss": 0.6431, + "step": 1738 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 2.7440176010131836, + "learning_rate": 4.793816357766849e-06, + "loss": 0.6083, + "step": 1739 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 2.6558027267456055, + "learning_rate": 4.793568206248268e-06, + "loss": 0.698, + "step": 1740 + }, + { + "epoch": 0.8231678486997636, + "grad_norm": 2.591658353805542, + "learning_rate": 4.793319911919251e-06, + "loss": 0.6601, + "step": 1741 + }, + { + "epoch": 0.8236406619385342, + "grad_norm": 2.5431172847747803, + "learning_rate": 4.79307147479526e-06, + "loss": 0.5917, + "step": 1742 + }, + { + "epoch": 0.8241134751773049, + "grad_norm": 2.7335588932037354, + "learning_rate": 4.792822894891762e-06, + "loss": 0.5925, + "step": 1743 + }, + { + "epoch": 0.8245862884160756, + "grad_norm": 2.2500839233398438, + "learning_rate": 4.792574172224237e-06, + "loss": 0.4984, + "step": 1744 + }, + { + "epoch": 0.8250591016548463, + "grad_norm": 2.691343069076538, + "learning_rate": 4.79232530680817e-06, + "loss": 0.6262, + "step": 1745 + }, + { + "epoch": 0.825531914893617, + "grad_norm": 2.612204074859619, + "learning_rate": 4.792076298659058e-06, + "loss": 0.5822, + "step": 1746 + }, + { + "epoch": 0.8260047281323877, + "grad_norm": 3.0163519382476807, + "learning_rate": 4.791827147792406e-06, + "loss": 0.6263, + "step": 1747 + }, + { + "epoch": 0.8264775413711584, + "grad_norm": 2.742183208465576, + "learning_rate": 4.791577854223727e-06, + "loss": 0.6628, + "step": 1748 + }, + { + "epoch": 0.826950354609929, + "grad_norm": 2.872213840484619, + "learning_rate": 4.791328417968542e-06, + "loss": 0.6332, + "step": 1749 + }, + { + "epoch": 0.8274231678486997, + "grad_norm": 2.725006580352783, + "learning_rate": 4.7910788390423844e-06, + "loss": 0.6266, + "step": 1750 + }, + { + "epoch": 0.8278959810874704, + "grad_norm": 3.0366697311401367, + "learning_rate": 4.790829117460793e-06, + "loss": 0.6403, + "step": 1751 + }, + { + "epoch": 0.8283687943262411, + "grad_norm": 2.594881772994995, + "learning_rate": 4.790579253239318e-06, + "loss": 0.521, + "step": 1752 + }, + { + "epoch": 0.8288416075650118, + "grad_norm": 2.4496347904205322, + "learning_rate": 4.790329246393517e-06, + "loss": 0.54, + "step": 1753 + }, + { + "epoch": 0.8293144208037825, + "grad_norm": 3.102278470993042, + 
"learning_rate": 4.790079096938956e-06, + "loss": 0.6142, + "step": 1754 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 2.4645912647247314, + "learning_rate": 4.789828804891212e-06, + "loss": 0.5212, + "step": 1755 + }, + { + "epoch": 0.8302600472813239, + "grad_norm": 2.7482516765594482, + "learning_rate": 4.789578370265868e-06, + "loss": 0.6712, + "step": 1756 + }, + { + "epoch": 0.8307328605200945, + "grad_norm": 2.61360502243042, + "learning_rate": 4.7893277930785195e-06, + "loss": 0.6367, + "step": 1757 + }, + { + "epoch": 0.8312056737588652, + "grad_norm": 2.79028058052063, + "learning_rate": 4.789077073344767e-06, + "loss": 0.5099, + "step": 1758 + }, + { + "epoch": 0.8316784869976359, + "grad_norm": 2.647662401199341, + "learning_rate": 4.788826211080222e-06, + "loss": 0.6698, + "step": 1759 + }, + { + "epoch": 0.8321513002364066, + "grad_norm": 3.0214831829071045, + "learning_rate": 4.7885752063005055e-06, + "loss": 0.6121, + "step": 1760 + }, + { + "epoch": 0.8326241134751773, + "grad_norm": 2.8244032859802246, + "learning_rate": 4.788324059021247e-06, + "loss": 0.6921, + "step": 1761 + }, + { + "epoch": 0.833096926713948, + "grad_norm": 3.1501076221466064, + "learning_rate": 4.788072769258082e-06, + "loss": 0.6872, + "step": 1762 + }, + { + "epoch": 0.8335697399527187, + "grad_norm": 2.6989903450012207, + "learning_rate": 4.7878213370266594e-06, + "loss": 0.5884, + "step": 1763 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 2.6982665061950684, + "learning_rate": 4.787569762342633e-06, + "loss": 0.6112, + "step": 1764 + }, + { + "epoch": 0.83451536643026, + "grad_norm": 2.6918323040008545, + "learning_rate": 4.7873180452216685e-06, + "loss": 0.5315, + "step": 1765 + }, + { + "epoch": 0.8349881796690307, + "grad_norm": 2.5494401454925537, + "learning_rate": 4.78706618567944e-06, + "loss": 0.5909, + "step": 1766 + }, + { + "epoch": 0.8354609929078014, + "grad_norm": 2.7532095909118652, + "learning_rate": 4.786814183731627e-06, + "loss": 0.5566, + "step": 1767 + }, + { + "epoch": 0.8359338061465721, + "grad_norm": 2.550865888595581, + "learning_rate": 4.786562039393923e-06, + "loss": 0.555, + "step": 1768 + }, + { + "epoch": 0.8364066193853428, + "grad_norm": 2.4477791786193848, + "learning_rate": 4.786309752682028e-06, + "loss": 0.5844, + "step": 1769 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 2.6982262134552, + "learning_rate": 4.7860573236116485e-06, + "loss": 0.6136, + "step": 1770 + }, + { + "epoch": 0.8373522458628841, + "grad_norm": 2.456263542175293, + "learning_rate": 4.785804752198503e-06, + "loss": 0.5055, + "step": 1771 + }, + { + "epoch": 0.8378250591016548, + "grad_norm": 2.428544521331787, + "learning_rate": 4.78555203845832e-06, + "loss": 0.5859, + "step": 1772 + }, + { + "epoch": 0.8382978723404255, + "grad_norm": 2.1782307624816895, + "learning_rate": 4.785299182406833e-06, + "loss": 0.5325, + "step": 1773 + }, + { + "epoch": 0.8387706855791962, + "grad_norm": 3.137956142425537, + "learning_rate": 4.785046184059786e-06, + "loss": 0.6097, + "step": 1774 + }, + { + "epoch": 0.8392434988179669, + "grad_norm": 2.6269001960754395, + "learning_rate": 4.7847930434329336e-06, + "loss": 0.5972, + "step": 1775 + }, + { + "epoch": 0.8397163120567376, + "grad_norm": 2.732659339904785, + "learning_rate": 4.784539760542037e-06, + "loss": 0.6054, + "step": 1776 + }, + { + "epoch": 0.8401891252955083, + "grad_norm": 2.5346736907958984, + "learning_rate": 4.784286335402866e-06, + "loss": 0.5521, + "step": 1777 + }, + { + "epoch": 0.840661938534279, + 
"grad_norm": 3.1420228481292725, + "learning_rate": 4.784032768031202e-06, + "loss": 0.6165, + "step": 1778 + }, + { + "epoch": 0.8411347517730496, + "grad_norm": 3.073793411254883, + "learning_rate": 4.783779058442831e-06, + "loss": 0.6414, + "step": 1779 + }, + { + "epoch": 0.8416075650118203, + "grad_norm": 2.6621336936950684, + "learning_rate": 4.783525206653554e-06, + "loss": 0.5836, + "step": 1780 + }, + { + "epoch": 0.842080378250591, + "grad_norm": 2.7029049396514893, + "learning_rate": 4.7832712126791745e-06, + "loss": 0.5897, + "step": 1781 + }, + { + "epoch": 0.8425531914893617, + "grad_norm": 2.4733822345733643, + "learning_rate": 4.783017076535509e-06, + "loss": 0.5913, + "step": 1782 + }, + { + "epoch": 0.8430260047281324, + "grad_norm": 2.8119473457336426, + "learning_rate": 4.782762798238381e-06, + "loss": 0.6105, + "step": 1783 + }, + { + "epoch": 0.8434988179669031, + "grad_norm": 2.5290818214416504, + "learning_rate": 4.782508377803622e-06, + "loss": 0.6119, + "step": 1784 + }, + { + "epoch": 0.8439716312056738, + "grad_norm": 3.193472385406494, + "learning_rate": 4.782253815247076e-06, + "loss": 0.6665, + "step": 1785 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 3.206759452819824, + "learning_rate": 4.781999110584592e-06, + "loss": 0.6012, + "step": 1786 + }, + { + "epoch": 0.8449172576832151, + "grad_norm": 2.6227457523345947, + "learning_rate": 4.781744263832029e-06, + "loss": 0.5845, + "step": 1787 + }, + { + "epoch": 0.8453900709219858, + "grad_norm": 2.838365316390991, + "learning_rate": 4.781489275005257e-06, + "loss": 0.5695, + "step": 1788 + }, + { + "epoch": 0.8458628841607565, + "grad_norm": 2.8348326683044434, + "learning_rate": 4.78123414412015e-06, + "loss": 0.6136, + "step": 1789 + }, + { + "epoch": 0.8463356973995272, + "grad_norm": 2.5698344707489014, + "learning_rate": 4.780978871192597e-06, + "loss": 0.6576, + "step": 1790 + }, + { + "epoch": 0.8468085106382979, + "grad_norm": 2.5198330879211426, + "learning_rate": 4.780723456238492e-06, + "loss": 0.5521, + "step": 1791 + }, + { + "epoch": 0.8472813238770686, + "grad_norm": 3.001325845718384, + "learning_rate": 4.780467899273737e-06, + "loss": 0.6075, + "step": 1792 + }, + { + "epoch": 0.8477541371158392, + "grad_norm": 2.7732746601104736, + "learning_rate": 4.780212200314247e-06, + "loss": 0.6245, + "step": 1793 + }, + { + "epoch": 0.8482269503546099, + "grad_norm": 2.6950337886810303, + "learning_rate": 4.77995635937594e-06, + "loss": 0.5723, + "step": 1794 + }, + { + "epoch": 0.8486997635933806, + "grad_norm": 2.82051420211792, + "learning_rate": 4.779700376474749e-06, + "loss": 0.6184, + "step": 1795 + }, + { + "epoch": 0.8491725768321513, + "grad_norm": 2.757791757583618, + "learning_rate": 4.779444251626611e-06, + "loss": 0.608, + "step": 1796 + }, + { + "epoch": 0.849645390070922, + "grad_norm": 2.394108533859253, + "learning_rate": 4.779187984847475e-06, + "loss": 0.6174, + "step": 1797 + }, + { + "epoch": 0.8501182033096927, + "grad_norm": 2.427562713623047, + "learning_rate": 4.778931576153296e-06, + "loss": 0.5618, + "step": 1798 + }, + { + "epoch": 0.8505910165484634, + "grad_norm": 2.891268491744995, + "learning_rate": 4.778675025560042e-06, + "loss": 0.6865, + "step": 1799 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.665534257888794, + "learning_rate": 4.778418333083685e-06, + "loss": 0.5852, + "step": 1800 + }, + { + "epoch": 0.8515366430260047, + "grad_norm": 2.5492889881134033, + "learning_rate": 4.7781614987402095e-06, + "loss": 0.5161, + "step": 1801 + }, + { + 
"epoch": 0.8520094562647754, + "grad_norm": 2.400177001953125, + "learning_rate": 4.777904522545607e-06, + "loss": 0.5128, + "step": 1802 + }, + { + "epoch": 0.8524822695035461, + "grad_norm": 2.3949809074401855, + "learning_rate": 4.777647404515878e-06, + "loss": 0.571, + "step": 1803 + }, + { + "epoch": 0.8529550827423168, + "grad_norm": 2.3624472618103027, + "learning_rate": 4.7773901446670325e-06, + "loss": 0.5486, + "step": 1804 + }, + { + "epoch": 0.8534278959810875, + "grad_norm": 2.711366891860962, + "learning_rate": 4.7771327430150885e-06, + "loss": 0.5667, + "step": 1805 + }, + { + "epoch": 0.8539007092198582, + "grad_norm": 2.7681493759155273, + "learning_rate": 4.776875199576073e-06, + "loss": 0.5686, + "step": 1806 + }, + { + "epoch": 0.8543735224586289, + "grad_norm": 3.0369436740875244, + "learning_rate": 4.776617514366023e-06, + "loss": 0.6635, + "step": 1807 + }, + { + "epoch": 0.8548463356973995, + "grad_norm": 2.919649600982666, + "learning_rate": 4.776359687400983e-06, + "loss": 0.5749, + "step": 1808 + }, + { + "epoch": 0.8553191489361702, + "grad_norm": 2.7986185550689697, + "learning_rate": 4.776101718697007e-06, + "loss": 0.559, + "step": 1809 + }, + { + "epoch": 0.8557919621749409, + "grad_norm": 2.5951223373413086, + "learning_rate": 4.775843608270158e-06, + "loss": 0.5654, + "step": 1810 + }, + { + "epoch": 0.8562647754137116, + "grad_norm": 2.674138069152832, + "learning_rate": 4.775585356136505e-06, + "loss": 0.5286, + "step": 1811 + }, + { + "epoch": 0.8567375886524823, + "grad_norm": 3.045437812805176, + "learning_rate": 4.775326962312131e-06, + "loss": 0.6185, + "step": 1812 + }, + { + "epoch": 0.857210401891253, + "grad_norm": 2.6145293712615967, + "learning_rate": 4.775068426813124e-06, + "loss": 0.6075, + "step": 1813 + }, + { + "epoch": 0.8576832151300237, + "grad_norm": 2.6320106983184814, + "learning_rate": 4.7748097496555824e-06, + "loss": 0.561, + "step": 1814 + }, + { + "epoch": 0.8581560283687943, + "grad_norm": 2.5038623809814453, + "learning_rate": 4.774550930855612e-06, + "loss": 0.593, + "step": 1815 + }, + { + "epoch": 0.858628841607565, + "grad_norm": 2.8168089389801025, + "learning_rate": 4.774291970429329e-06, + "loss": 0.5196, + "step": 1816 + }, + { + "epoch": 0.8591016548463357, + "grad_norm": 2.778130292892456, + "learning_rate": 4.774032868392858e-06, + "loss": 0.5984, + "step": 1817 + }, + { + "epoch": 0.8595744680851064, + "grad_norm": 2.536458730697632, + "learning_rate": 4.7737736247623305e-06, + "loss": 0.568, + "step": 1818 + }, + { + "epoch": 0.8600472813238771, + "grad_norm": 2.6669719219207764, + "learning_rate": 4.77351423955389e-06, + "loss": 0.6233, + "step": 1819 + }, + { + "epoch": 0.8605200945626478, + "grad_norm": 2.578242540359497, + "learning_rate": 4.773254712783687e-06, + "loss": 0.579, + "step": 1820 + }, + { + "epoch": 0.8609929078014185, + "grad_norm": 2.816664457321167, + "learning_rate": 4.772995044467881e-06, + "loss": 0.6635, + "step": 1821 + }, + { + "epoch": 0.8614657210401891, + "grad_norm": 3.1111979484558105, + "learning_rate": 4.77273523462264e-06, + "loss": 0.6372, + "step": 1822 + }, + { + "epoch": 0.8619385342789598, + "grad_norm": 2.764552354812622, + "learning_rate": 4.772475283264142e-06, + "loss": 0.6216, + "step": 1823 + }, + { + "epoch": 0.8624113475177305, + "grad_norm": 2.9126830101013184, + "learning_rate": 4.772215190408572e-06, + "loss": 0.6396, + "step": 1824 + }, + { + "epoch": 0.8628841607565012, + "grad_norm": 2.7502307891845703, + "learning_rate": 4.7719549560721264e-06, + "loss": 
0.6186, + "step": 1825 + }, + { + "epoch": 0.8633569739952719, + "grad_norm": 2.6279006004333496, + "learning_rate": 4.771694580271007e-06, + "loss": 0.5557, + "step": 1826 + }, + { + "epoch": 0.8638297872340426, + "grad_norm": 2.996563196182251, + "learning_rate": 4.7714340630214276e-06, + "loss": 0.6259, + "step": 1827 + }, + { + "epoch": 0.8643026004728133, + "grad_norm": 3.231323480606079, + "learning_rate": 4.771173404339609e-06, + "loss": 0.5473, + "step": 1828 + }, + { + "epoch": 0.864775413711584, + "grad_norm": 3.143519878387451, + "learning_rate": 4.770912604241781e-06, + "loss": 0.593, + "step": 1829 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 2.515484094619751, + "learning_rate": 4.770651662744184e-06, + "loss": 0.538, + "step": 1830 + }, + { + "epoch": 0.8657210401891253, + "grad_norm": 2.629058837890625, + "learning_rate": 4.770390579863064e-06, + "loss": 0.5745, + "step": 1831 + }, + { + "epoch": 0.866193853427896, + "grad_norm": 2.5826802253723145, + "learning_rate": 4.770129355614677e-06, + "loss": 0.6397, + "step": 1832 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 2.954623222351074, + "learning_rate": 4.769867990015289e-06, + "loss": 0.6106, + "step": 1833 + }, + { + "epoch": 0.8671394799054374, + "grad_norm": 2.742192268371582, + "learning_rate": 4.769606483081175e-06, + "loss": 0.6902, + "step": 1834 + }, + { + "epoch": 0.8676122931442081, + "grad_norm": 2.2619097232818604, + "learning_rate": 4.769344834828618e-06, + "loss": 0.5414, + "step": 1835 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 2.7384188175201416, + "learning_rate": 4.769083045273908e-06, + "loss": 0.5787, + "step": 1836 + }, + { + "epoch": 0.8685579196217494, + "grad_norm": 2.6734485626220703, + "learning_rate": 4.768821114433346e-06, + "loss": 0.5923, + "step": 1837 + }, + { + "epoch": 0.8690307328605201, + "grad_norm": 2.286140203475952, + "learning_rate": 4.768559042323243e-06, + "loss": 0.5822, + "step": 1838 + }, + { + "epoch": 0.8695035460992908, + "grad_norm": 3.0243725776672363, + "learning_rate": 4.768296828959915e-06, + "loss": 0.6623, + "step": 1839 + }, + { + "epoch": 0.8699763593380615, + "grad_norm": 2.4026312828063965, + "learning_rate": 4.768034474359689e-06, + "loss": 0.5554, + "step": 1840 + }, + { + "epoch": 0.8704491725768322, + "grad_norm": 2.7469029426574707, + "learning_rate": 4.767771978538903e-06, + "loss": 0.6316, + "step": 1841 + }, + { + "epoch": 0.8709219858156029, + "grad_norm": 2.729659080505371, + "learning_rate": 4.767509341513899e-06, + "loss": 0.5807, + "step": 1842 + }, + { + "epoch": 0.8713947990543736, + "grad_norm": 2.5336945056915283, + "learning_rate": 4.76724656330103e-06, + "loss": 0.6109, + "step": 1843 + }, + { + "epoch": 0.8718676122931442, + "grad_norm": 2.519880533218384, + "learning_rate": 4.76698364391666e-06, + "loss": 0.5313, + "step": 1844 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 2.698862075805664, + "learning_rate": 4.766720583377159e-06, + "loss": 0.5953, + "step": 1845 + }, + { + "epoch": 0.8728132387706856, + "grad_norm": 3.0195560455322266, + "learning_rate": 4.766457381698907e-06, + "loss": 0.5965, + "step": 1846 + }, + { + "epoch": 0.8732860520094563, + "grad_norm": 2.5972697734832764, + "learning_rate": 4.766194038898291e-06, + "loss": 0.6014, + "step": 1847 + }, + { + "epoch": 0.873758865248227, + "grad_norm": 2.7132294178009033, + "learning_rate": 4.76593055499171e-06, + "loss": 0.5638, + "step": 1848 + }, + { + "epoch": 0.8742316784869977, + "grad_norm": 2.7134575843811035, + "learning_rate": 
4.765666929995568e-06, + "loss": 0.52, + "step": 1849 + }, + { + "epoch": 0.8747044917257684, + "grad_norm": 2.3804993629455566, + "learning_rate": 4.765403163926282e-06, + "loss": 0.5435, + "step": 1850 + }, + { + "epoch": 0.875177304964539, + "grad_norm": 2.8782761096954346, + "learning_rate": 4.765139256800274e-06, + "loss": 0.5843, + "step": 1851 + }, + { + "epoch": 0.8756501182033097, + "grad_norm": 2.836209774017334, + "learning_rate": 4.764875208633977e-06, + "loss": 0.6667, + "step": 1852 + }, + { + "epoch": 0.8761229314420804, + "grad_norm": 2.608851194381714, + "learning_rate": 4.764611019443831e-06, + "loss": 0.5436, + "step": 1853 + }, + { + "epoch": 0.8765957446808511, + "grad_norm": 2.788738965988159, + "learning_rate": 4.764346689246288e-06, + "loss": 0.7331, + "step": 1854 + }, + { + "epoch": 0.8770685579196218, + "grad_norm": 2.524277687072754, + "learning_rate": 4.764082218057805e-06, + "loss": 0.5067, + "step": 1855 + }, + { + "epoch": 0.8775413711583925, + "grad_norm": 3.7559316158294678, + "learning_rate": 4.763817605894851e-06, + "loss": 0.6809, + "step": 1856 + }, + { + "epoch": 0.8780141843971632, + "grad_norm": 2.9070613384246826, + "learning_rate": 4.763552852773899e-06, + "loss": 0.5913, + "step": 1857 + }, + { + "epoch": 0.8784869976359339, + "grad_norm": 2.7050609588623047, + "learning_rate": 4.7632879587114386e-06, + "loss": 0.6074, + "step": 1858 + }, + { + "epoch": 0.8789598108747045, + "grad_norm": 2.891134262084961, + "learning_rate": 4.76302292372396e-06, + "loss": 0.5939, + "step": 1859 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 2.8581702709198, + "learning_rate": 4.762757747827968e-06, + "loss": 0.5972, + "step": 1860 + }, + { + "epoch": 0.8799054373522459, + "grad_norm": 2.8266196250915527, + "learning_rate": 4.762492431039971e-06, + "loss": 0.5993, + "step": 1861 + }, + { + "epoch": 0.8803782505910166, + "grad_norm": 2.4853954315185547, + "learning_rate": 4.762226973376493e-06, + "loss": 0.6388, + "step": 1862 + }, + { + "epoch": 0.8808510638297873, + "grad_norm": 3.2212886810302734, + "learning_rate": 4.761961374854059e-06, + "loss": 0.6698, + "step": 1863 + }, + { + "epoch": 0.881323877068558, + "grad_norm": 3.1254501342773438, + "learning_rate": 4.761695635489211e-06, + "loss": 0.5263, + "step": 1864 + }, + { + "epoch": 0.8817966903073287, + "grad_norm": 2.6891462802886963, + "learning_rate": 4.761429755298491e-06, + "loss": 0.5359, + "step": 1865 + }, + { + "epoch": 0.8822695035460993, + "grad_norm": 2.8557538986206055, + "learning_rate": 4.761163734298457e-06, + "loss": 0.5933, + "step": 1866 + }, + { + "epoch": 0.88274231678487, + "grad_norm": 2.53548264503479, + "learning_rate": 4.7608975725056724e-06, + "loss": 0.6397, + "step": 1867 + }, + { + "epoch": 0.8832151300236407, + "grad_norm": 3.0237956047058105, + "learning_rate": 4.76063126993671e-06, + "loss": 0.6845, + "step": 1868 + }, + { + "epoch": 0.8836879432624114, + "grad_norm": 3.222886800765991, + "learning_rate": 4.76036482660815e-06, + "loss": 0.6055, + "step": 1869 + }, + { + "epoch": 0.8841607565011821, + "grad_norm": 3.1867551803588867, + "learning_rate": 4.760098242536584e-06, + "loss": 0.6592, + "step": 1870 + }, + { + "epoch": 0.8846335697399527, + "grad_norm": 2.782209873199463, + "learning_rate": 4.7598315177386115e-06, + "loss": 0.5833, + "step": 1871 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 2.899871587753296, + "learning_rate": 4.759564652230838e-06, + "loss": 0.6129, + "step": 1872 + }, + { + "epoch": 0.885579196217494, + "grad_norm": 
2.5690579414367676, + "learning_rate": 4.759297646029882e-06, + "loss": 0.5827, + "step": 1873 + }, + { + "epoch": 0.8860520094562647, + "grad_norm": 2.666130304336548, + "learning_rate": 4.759030499152368e-06, + "loss": 0.5272, + "step": 1874 + }, + { + "epoch": 0.8865248226950354, + "grad_norm": 2.7030911445617676, + "learning_rate": 4.758763211614932e-06, + "loss": 0.6415, + "step": 1875 + }, + { + "epoch": 0.8869976359338061, + "grad_norm": 2.717512845993042, + "learning_rate": 4.7584957834342135e-06, + "loss": 0.5827, + "step": 1876 + }, + { + "epoch": 0.8874704491725768, + "grad_norm": 2.665823459625244, + "learning_rate": 4.758228214626867e-06, + "loss": 0.6209, + "step": 1877 + }, + { + "epoch": 0.8879432624113475, + "grad_norm": 2.636653184890747, + "learning_rate": 4.75796050520955e-06, + "loss": 0.6413, + "step": 1878 + }, + { + "epoch": 0.8884160756501182, + "grad_norm": 2.585115671157837, + "learning_rate": 4.7576926551989345e-06, + "loss": 0.5518, + "step": 1879 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.808526039123535, + "learning_rate": 4.757424664611697e-06, + "loss": 0.5717, + "step": 1880 + }, + { + "epoch": 0.8893617021276595, + "grad_norm": 3.5957939624786377, + "learning_rate": 4.757156533464524e-06, + "loss": 0.6323, + "step": 1881 + }, + { + "epoch": 0.8898345153664302, + "grad_norm": 2.5003883838653564, + "learning_rate": 4.756888261774111e-06, + "loss": 0.5937, + "step": 1882 + }, + { + "epoch": 0.8903073286052009, + "grad_norm": 2.749061346054077, + "learning_rate": 4.756619849557161e-06, + "loss": 0.6642, + "step": 1883 + }, + { + "epoch": 0.8907801418439716, + "grad_norm": 2.6757891178131104, + "learning_rate": 4.756351296830389e-06, + "loss": 0.5887, + "step": 1884 + }, + { + "epoch": 0.8912529550827423, + "grad_norm": 2.811925172805786, + "learning_rate": 4.756082603610516e-06, + "loss": 0.6571, + "step": 1885 + }, + { + "epoch": 0.891725768321513, + "grad_norm": 2.5054616928100586, + "learning_rate": 4.755813769914271e-06, + "loss": 0.6312, + "step": 1886 + }, + { + "epoch": 0.8921985815602836, + "grad_norm": 2.7518467903137207, + "learning_rate": 4.755544795758395e-06, + "loss": 0.6685, + "step": 1887 + }, + { + "epoch": 0.8926713947990543, + "grad_norm": 2.7527287006378174, + "learning_rate": 4.755275681159634e-06, + "loss": 0.5886, + "step": 1888 + }, + { + "epoch": 0.893144208037825, + "grad_norm": 2.6162452697753906, + "learning_rate": 4.755006426134745e-06, + "loss": 0.546, + "step": 1889 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 2.4016737937927246, + "learning_rate": 4.754737030700495e-06, + "loss": 0.5726, + "step": 1890 + }, + { + "epoch": 0.8940898345153664, + "grad_norm": 2.528327703475952, + "learning_rate": 4.754467494873656e-06, + "loss": 0.5682, + "step": 1891 + }, + { + "epoch": 0.8945626477541371, + "grad_norm": 2.3139286041259766, + "learning_rate": 4.7541978186710115e-06, + "loss": 0.6108, + "step": 1892 + }, + { + "epoch": 0.8950354609929078, + "grad_norm": 2.7269136905670166, + "learning_rate": 4.753928002109354e-06, + "loss": 0.5875, + "step": 1893 + }, + { + "epoch": 0.8955082742316784, + "grad_norm": 4.425495147705078, + "learning_rate": 4.753658045205482e-06, + "loss": 0.5572, + "step": 1894 + }, + { + "epoch": 0.8959810874704491, + "grad_norm": 2.535409927368164, + "learning_rate": 4.753387947976206e-06, + "loss": 0.5868, + "step": 1895 + }, + { + "epoch": 0.8964539007092198, + "grad_norm": 2.722458600997925, + "learning_rate": 4.753117710438343e-06, + "loss": 0.5935, + "step": 1896 + }, + { + "epoch": 
0.8969267139479905, + "grad_norm": 2.743861436843872, + "learning_rate": 4.75284733260872e-06, + "loss": 0.572, + "step": 1897 + }, + { + "epoch": 0.8973995271867612, + "grad_norm": 2.60640549659729, + "learning_rate": 4.752576814504173e-06, + "loss": 0.567, + "step": 1898 + }, + { + "epoch": 0.8978723404255319, + "grad_norm": 2.7486042976379395, + "learning_rate": 4.7523061561415435e-06, + "loss": 0.5768, + "step": 1899 + }, + { + "epoch": 0.8983451536643026, + "grad_norm": 3.8410251140594482, + "learning_rate": 4.752035357537686e-06, + "loss": 0.6034, + "step": 1900 + }, + { + "epoch": 0.8988179669030733, + "grad_norm": 3.0935890674591064, + "learning_rate": 4.751764418709462e-06, + "loss": 0.5644, + "step": 1901 + }, + { + "epoch": 0.8992907801418439, + "grad_norm": 2.7989892959594727, + "learning_rate": 4.751493339673742e-06, + "loss": 0.656, + "step": 1902 + }, + { + "epoch": 0.8997635933806146, + "grad_norm": 3.6940557956695557, + "learning_rate": 4.751222120447403e-06, + "loss": 0.6632, + "step": 1903 + }, + { + "epoch": 0.9002364066193853, + "grad_norm": 2.3428797721862793, + "learning_rate": 4.750950761047335e-06, + "loss": 0.4485, + "step": 1904 + }, + { + "epoch": 0.900709219858156, + "grad_norm": 2.622544050216675, + "learning_rate": 4.750679261490432e-06, + "loss": 0.5857, + "step": 1905 + }, + { + "epoch": 0.9011820330969267, + "grad_norm": 2.4911322593688965, + "learning_rate": 4.750407621793601e-06, + "loss": 0.5618, + "step": 1906 + }, + { + "epoch": 0.9016548463356974, + "grad_norm": 2.6434662342071533, + "learning_rate": 4.750135841973755e-06, + "loss": 0.6057, + "step": 1907 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 3.115443706512451, + "learning_rate": 4.749863922047817e-06, + "loss": 0.6064, + "step": 1908 + }, + { + "epoch": 0.9026004728132387, + "grad_norm": 2.5671091079711914, + "learning_rate": 4.749591862032718e-06, + "loss": 0.5625, + "step": 1909 + }, + { + "epoch": 0.9030732860520094, + "grad_norm": 3.2008655071258545, + "learning_rate": 4.749319661945398e-06, + "loss": 0.5547, + "step": 1910 + }, + { + "epoch": 0.9035460992907801, + "grad_norm": 2.905987024307251, + "learning_rate": 4.749047321802805e-06, + "loss": 0.6033, + "step": 1911 + }, + { + "epoch": 0.9040189125295508, + "grad_norm": 3.1456053256988525, + "learning_rate": 4.748774841621897e-06, + "loss": 0.5651, + "step": 1912 + }, + { + "epoch": 0.9044917257683215, + "grad_norm": 2.8116416931152344, + "learning_rate": 4.748502221419641e-06, + "loss": 0.5853, + "step": 1913 + }, + { + "epoch": 0.9049645390070922, + "grad_norm": 3.123835325241089, + "learning_rate": 4.748229461213011e-06, + "loss": 0.5427, + "step": 1914 + }, + { + "epoch": 0.9054373522458629, + "grad_norm": 2.4750146865844727, + "learning_rate": 4.747956561018989e-06, + "loss": 0.6517, + "step": 1915 + }, + { + "epoch": 0.9059101654846335, + "grad_norm": 2.6174299716949463, + "learning_rate": 4.7476835208545705e-06, + "loss": 0.6119, + "step": 1916 + }, + { + "epoch": 0.9063829787234042, + "grad_norm": 2.7390382289886475, + "learning_rate": 4.747410340736755e-06, + "loss": 0.5664, + "step": 1917 + }, + { + "epoch": 0.9068557919621749, + "grad_norm": 2.7940444946289062, + "learning_rate": 4.747137020682552e-06, + "loss": 0.5628, + "step": 1918 + }, + { + "epoch": 0.9073286052009456, + "grad_norm": 2.477365016937256, + "learning_rate": 4.7468635607089795e-06, + "loss": 0.5261, + "step": 1919 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 2.7016685009002686, + "learning_rate": 4.746589960833066e-06, + "loss": 0.5576, 
+ "step": 1920 + }, + { + "epoch": 0.908274231678487, + "grad_norm": 2.8806519508361816, + "learning_rate": 4.746316221071846e-06, + "loss": 0.5925, + "step": 1921 + }, + { + "epoch": 0.9087470449172577, + "grad_norm": 3.0315234661102295, + "learning_rate": 4.746042341442365e-06, + "loss": 0.6142, + "step": 1922 + }, + { + "epoch": 0.9092198581560283, + "grad_norm": 4.2446160316467285, + "learning_rate": 4.745768321961676e-06, + "loss": 0.5352, + "step": 1923 + }, + { + "epoch": 0.909692671394799, + "grad_norm": 2.6517012119293213, + "learning_rate": 4.745494162646841e-06, + "loss": 0.6118, + "step": 1924 + }, + { + "epoch": 0.9101654846335697, + "grad_norm": 2.774900197982788, + "learning_rate": 4.7452198635149304e-06, + "loss": 0.572, + "step": 1925 + }, + { + "epoch": 0.9106382978723404, + "grad_norm": 3.0133683681488037, + "learning_rate": 4.744945424583024e-06, + "loss": 0.5897, + "step": 1926 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 2.7344839572906494, + "learning_rate": 4.744670845868211e-06, + "loss": 0.6207, + "step": 1927 + }, + { + "epoch": 0.9115839243498818, + "grad_norm": 2.636578321456909, + "learning_rate": 4.744396127387586e-06, + "loss": 0.6687, + "step": 1928 + }, + { + "epoch": 0.9120567375886525, + "grad_norm": 2.8663458824157715, + "learning_rate": 4.744121269158255e-06, + "loss": 0.5002, + "step": 1929 + }, + { + "epoch": 0.9125295508274232, + "grad_norm": 2.661079168319702, + "learning_rate": 4.743846271197333e-06, + "loss": 0.5848, + "step": 1930 + }, + { + "epoch": 0.9130023640661938, + "grad_norm": 2.881256341934204, + "learning_rate": 4.743571133521943e-06, + "loss": 0.5911, + "step": 1931 + }, + { + "epoch": 0.9134751773049645, + "grad_norm": 2.5540573596954346, + "learning_rate": 4.743295856149217e-06, + "loss": 0.5647, + "step": 1932 + }, + { + "epoch": 0.9139479905437352, + "grad_norm": 2.7060387134552, + "learning_rate": 4.743020439096293e-06, + "loss": 0.6267, + "step": 1933 + }, + { + "epoch": 0.9144208037825059, + "grad_norm": 2.694481372833252, + "learning_rate": 4.742744882380323e-06, + "loss": 0.6283, + "step": 1934 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 2.711555242538452, + "learning_rate": 4.7424691860184625e-06, + "loss": 0.5784, + "step": 1935 + }, + { + "epoch": 0.9153664302600473, + "grad_norm": 2.9077224731445312, + "learning_rate": 4.742193350027879e-06, + "loss": 0.5948, + "step": 1936 + }, + { + "epoch": 0.915839243498818, + "grad_norm": 2.9824187755584717, + "learning_rate": 4.7419173744257476e-06, + "loss": 0.6115, + "step": 1937 + }, + { + "epoch": 0.9163120567375886, + "grad_norm": 2.5127830505371094, + "learning_rate": 4.7416412592292515e-06, + "loss": 0.5803, + "step": 1938 + }, + { + "epoch": 0.9167848699763593, + "grad_norm": 3.1307175159454346, + "learning_rate": 4.741365004455583e-06, + "loss": 0.5657, + "step": 1939 + }, + { + "epoch": 0.91725768321513, + "grad_norm": 2.8205273151397705, + "learning_rate": 4.741088610121944e-06, + "loss": 0.6145, + "step": 1940 + }, + { + "epoch": 0.9177304964539007, + "grad_norm": 2.6119720935821533, + "learning_rate": 4.7408120762455444e-06, + "loss": 0.6058, + "step": 1941 + }, + { + "epoch": 0.9182033096926714, + "grad_norm": 2.421276092529297, + "learning_rate": 4.7405354028436025e-06, + "loss": 0.5973, + "step": 1942 + }, + { + "epoch": 0.9186761229314421, + "grad_norm": 2.9846808910369873, + "learning_rate": 4.740258589933346e-06, + "loss": 0.6892, + "step": 1943 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 2.6899871826171875, + "learning_rate": 
4.739981637532009e-06, + "loss": 0.5705, + "step": 1944 + }, + { + "epoch": 0.9196217494089834, + "grad_norm": 2.8636131286621094, + "learning_rate": 4.739704545656839e-06, + "loss": 0.5775, + "step": 1945 + }, + { + "epoch": 0.9200945626477541, + "grad_norm": 2.7659449577331543, + "learning_rate": 4.739427314325087e-06, + "loss": 0.5823, + "step": 1946 + }, + { + "epoch": 0.9205673758865248, + "grad_norm": 4.71295166015625, + "learning_rate": 4.739149943554016e-06, + "loss": 0.5601, + "step": 1947 + }, + { + "epoch": 0.9210401891252955, + "grad_norm": 2.642636775970459, + "learning_rate": 4.738872433360896e-06, + "loss": 0.5278, + "step": 1948 + }, + { + "epoch": 0.9215130023640662, + "grad_norm": 2.4658217430114746, + "learning_rate": 4.7385947837630065e-06, + "loss": 0.6392, + "step": 1949 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 2.851602792739868, + "learning_rate": 4.738316994777636e-06, + "loss": 0.6164, + "step": 1950 + }, + { + "epoch": 0.9224586288416076, + "grad_norm": 2.394226551055908, + "learning_rate": 4.738039066422081e-06, + "loss": 0.5556, + "step": 1951 + }, + { + "epoch": 0.9229314420803783, + "grad_norm": 2.7985100746154785, + "learning_rate": 4.737760998713647e-06, + "loss": 0.5799, + "step": 1952 + }, + { + "epoch": 0.9234042553191489, + "grad_norm": 2.5974674224853516, + "learning_rate": 4.737482791669648e-06, + "loss": 0.6984, + "step": 1953 + }, + { + "epoch": 0.9238770685579196, + "grad_norm": 2.707636594772339, + "learning_rate": 4.737204445307406e-06, + "loss": 0.5548, + "step": 1954 + }, + { + "epoch": 0.9243498817966903, + "grad_norm": 2.7882707118988037, + "learning_rate": 4.736925959644254e-06, + "loss": 0.6026, + "step": 1955 + }, + { + "epoch": 0.924822695035461, + "grad_norm": 2.474482774734497, + "learning_rate": 4.7366473346975304e-06, + "loss": 0.5832, + "step": 1956 + }, + { + "epoch": 0.9252955082742317, + "grad_norm": 2.6196324825286865, + "learning_rate": 4.736368570484585e-06, + "loss": 0.5861, + "step": 1957 + }, + { + "epoch": 0.9257683215130024, + "grad_norm": 2.826864004135132, + "learning_rate": 4.736089667022775e-06, + "loss": 0.6173, + "step": 1958 + }, + { + "epoch": 0.926241134751773, + "grad_norm": 2.414473056793213, + "learning_rate": 4.735810624329466e-06, + "loss": 0.5753, + "step": 1959 + }, + { + "epoch": 0.9267139479905437, + "grad_norm": 2.8037970066070557, + "learning_rate": 4.7355314424220335e-06, + "loss": 0.6207, + "step": 1960 + }, + { + "epoch": 0.9271867612293144, + "grad_norm": 2.645458698272705, + "learning_rate": 4.735252121317861e-06, + "loss": 0.5959, + "step": 1961 + }, + { + "epoch": 0.9276595744680851, + "grad_norm": 2.7983884811401367, + "learning_rate": 4.734972661034339e-06, + "loss": 0.5696, + "step": 1962 + }, + { + "epoch": 0.9281323877068558, + "grad_norm": 3.0568997859954834, + "learning_rate": 4.73469306158887e-06, + "loss": 0.6194, + "step": 1963 + }, + { + "epoch": 0.9286052009456265, + "grad_norm": 2.7205135822296143, + "learning_rate": 4.734413322998863e-06, + "loss": 0.5292, + "step": 1964 + }, + { + "epoch": 0.9290780141843972, + "grad_norm": 3.3168489933013916, + "learning_rate": 4.734133445281735e-06, + "loss": 0.5654, + "step": 1965 + }, + { + "epoch": 0.9295508274231679, + "grad_norm": 3.0095653533935547, + "learning_rate": 4.733853428454916e-06, + "loss": 0.6508, + "step": 1966 + }, + { + "epoch": 0.9300236406619385, + "grad_norm": 2.7726712226867676, + "learning_rate": 4.733573272535838e-06, + "loss": 0.644, + "step": 1967 + }, + { + "epoch": 0.9304964539007092, + "grad_norm": 
2.474397659301758, + "learning_rate": 4.7332929775419456e-06, + "loss": 0.5479, + "step": 1968 + }, + { + "epoch": 0.9309692671394799, + "grad_norm": 2.4518635272979736, + "learning_rate": 4.733012543490693e-06, + "loss": 0.6, + "step": 1969 + }, + { + "epoch": 0.9314420803782506, + "grad_norm": 2.9292192459106445, + "learning_rate": 4.73273197039954e-06, + "loss": 0.6647, + "step": 1970 + }, + { + "epoch": 0.9319148936170213, + "grad_norm": 2.425004720687866, + "learning_rate": 4.732451258285958e-06, + "loss": 0.6338, + "step": 1971 + }, + { + "epoch": 0.932387706855792, + "grad_norm": 2.904479503631592, + "learning_rate": 4.7321704071674255e-06, + "loss": 0.5923, + "step": 1972 + }, + { + "epoch": 0.9328605200945627, + "grad_norm": 2.477085590362549, + "learning_rate": 4.731889417061428e-06, + "loss": 0.5984, + "step": 1973 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 2.585240364074707, + "learning_rate": 4.731608287985465e-06, + "loss": 0.558, + "step": 1974 + }, + { + "epoch": 0.933806146572104, + "grad_norm": 2.658714532852173, + "learning_rate": 4.731327019957039e-06, + "loss": 0.5567, + "step": 1975 + }, + { + "epoch": 0.9342789598108747, + "grad_norm": 2.7593026161193848, + "learning_rate": 4.731045612993662e-06, + "loss": 0.5772, + "step": 1976 + }, + { + "epoch": 0.9347517730496454, + "grad_norm": 2.4386026859283447, + "learning_rate": 4.7307640671128585e-06, + "loss": 0.6199, + "step": 1977 + }, + { + "epoch": 0.9352245862884161, + "grad_norm": 2.681910514831543, + "learning_rate": 4.730482382332158e-06, + "loss": 0.5971, + "step": 1978 + }, + { + "epoch": 0.9356973995271868, + "grad_norm": 3.7593860626220703, + "learning_rate": 4.7302005586691e-06, + "loss": 0.6346, + "step": 1979 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.5789096355438232, + "learning_rate": 4.729918596141232e-06, + "loss": 0.5684, + "step": 1980 + }, + { + "epoch": 0.9366430260047282, + "grad_norm": 3.0607335567474365, + "learning_rate": 4.729636494766111e-06, + "loss": 0.6223, + "step": 1981 + }, + { + "epoch": 0.9371158392434988, + "grad_norm": 2.906643867492676, + "learning_rate": 4.729354254561303e-06, + "loss": 0.6513, + "step": 1982 + }, + { + "epoch": 0.9375886524822695, + "grad_norm": 3.192430019378662, + "learning_rate": 4.7290718755443795e-06, + "loss": 0.5095, + "step": 1983 + }, + { + "epoch": 0.9380614657210402, + "grad_norm": 2.661536931991577, + "learning_rate": 4.7287893577329255e-06, + "loss": 0.5525, + "step": 1984 + }, + { + "epoch": 0.9385342789598109, + "grad_norm": 2.8436734676361084, + "learning_rate": 4.728506701144531e-06, + "loss": 0.6323, + "step": 1985 + }, + { + "epoch": 0.9390070921985816, + "grad_norm": 2.75544810295105, + "learning_rate": 4.728223905796796e-06, + "loss": 0.6018, + "step": 1986 + }, + { + "epoch": 0.9394799054373523, + "grad_norm": 3.0652759075164795, + "learning_rate": 4.727940971707329e-06, + "loss": 0.62, + "step": 1987 + }, + { + "epoch": 0.939952718676123, + "grad_norm": 2.802567720413208, + "learning_rate": 4.727657898893747e-06, + "loss": 0.5809, + "step": 1988 + }, + { + "epoch": 0.9404255319148936, + "grad_norm": 2.6208512783050537, + "learning_rate": 4.7273746873736745e-06, + "loss": 0.5762, + "step": 1989 + }, + { + "epoch": 0.9408983451536643, + "grad_norm": 2.5901873111724854, + "learning_rate": 4.727091337164748e-06, + "loss": 0.6111, + "step": 1990 + }, + { + "epoch": 0.941371158392435, + "grad_norm": 3.002347707748413, + "learning_rate": 4.726807848284609e-06, + "loss": 0.6419, + "step": 1991 + }, + { + "epoch": 
0.9418439716312057, + "grad_norm": 2.522151470184326, + "learning_rate": 4.72652422075091e-06, + "loss": 0.642, + "step": 1992 + }, + { + "epoch": 0.9423167848699764, + "grad_norm": 2.5571532249450684, + "learning_rate": 4.726240454581311e-06, + "loss": 0.5729, + "step": 1993 + }, + { + "epoch": 0.9427895981087471, + "grad_norm": 2.7704918384552, + "learning_rate": 4.72595654979348e-06, + "loss": 0.6816, + "step": 1994 + }, + { + "epoch": 0.9432624113475178, + "grad_norm": 2.517040491104126, + "learning_rate": 4.7256725064050955e-06, + "loss": 0.5782, + "step": 1995 + }, + { + "epoch": 0.9437352245862884, + "grad_norm": 2.613955020904541, + "learning_rate": 4.725388324433843e-06, + "loss": 0.6291, + "step": 1996 + }, + { + "epoch": 0.9442080378250591, + "grad_norm": 2.848891258239746, + "learning_rate": 4.725104003897418e-06, + "loss": 0.6544, + "step": 1997 + }, + { + "epoch": 0.9446808510638298, + "grad_norm": 3.0162429809570312, + "learning_rate": 4.724819544813523e-06, + "loss": 0.6301, + "step": 1998 + }, + { + "epoch": 0.9451536643026005, + "grad_norm": 2.613614559173584, + "learning_rate": 4.72453494719987e-06, + "loss": 0.5829, + "step": 1999 + }, + { + "epoch": 0.9456264775413712, + "grad_norm": 2.4838767051696777, + "learning_rate": 4.724250211074182e-06, + "loss": 0.6042, + "step": 2000 + }, + { + "epoch": 0.9460992907801419, + "grad_norm": 2.526470899581909, + "learning_rate": 4.723965336454185e-06, + "loss": 0.6167, + "step": 2001 + }, + { + "epoch": 0.9465721040189126, + "grad_norm": 2.504506826400757, + "learning_rate": 4.723680323357618e-06, + "loss": 0.6061, + "step": 2002 + }, + { + "epoch": 0.9470449172576832, + "grad_norm": 3.0547544956207275, + "learning_rate": 4.723395171802228e-06, + "loss": 0.6619, + "step": 2003 + }, + { + "epoch": 0.9475177304964539, + "grad_norm": 2.8692407608032227, + "learning_rate": 4.723109881805771e-06, + "loss": 0.5985, + "step": 2004 + }, + { + "epoch": 0.9479905437352246, + "grad_norm": 2.7929654121398926, + "learning_rate": 4.7228244533860094e-06, + "loss": 0.5869, + "step": 2005 + }, + { + "epoch": 0.9484633569739953, + "grad_norm": 2.764869451522827, + "learning_rate": 4.7225388865607146e-06, + "loss": 0.6288, + "step": 2006 + }, + { + "epoch": 0.948936170212766, + "grad_norm": 2.7656404972076416, + "learning_rate": 4.722253181347671e-06, + "loss": 0.5831, + "step": 2007 + }, + { + "epoch": 0.9494089834515367, + "grad_norm": 2.6698336601257324, + "learning_rate": 4.7219673377646635e-06, + "loss": 0.6087, + "step": 2008 + }, + { + "epoch": 0.9498817966903074, + "grad_norm": 2.524935722351074, + "learning_rate": 4.7216813558294946e-06, + "loss": 0.5675, + "step": 2009 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 2.5998785495758057, + "learning_rate": 4.721395235559969e-06, + "loss": 0.5667, + "step": 2010 + }, + { + "epoch": 0.9508274231678487, + "grad_norm": 2.758021354675293, + "learning_rate": 4.721108976973902e-06, + "loss": 0.4931, + "step": 2011 + }, + { + "epoch": 0.9513002364066194, + "grad_norm": 2.767695903778076, + "learning_rate": 4.72082258008912e-06, + "loss": 0.5778, + "step": 2012 + }, + { + "epoch": 0.9517730496453901, + "grad_norm": 2.982314348220825, + "learning_rate": 4.720536044923453e-06, + "loss": 0.6096, + "step": 2013 + }, + { + "epoch": 0.9522458628841608, + "grad_norm": 2.7608799934387207, + "learning_rate": 4.720249371494743e-06, + "loss": 0.6242, + "step": 2014 + }, + { + "epoch": 0.9527186761229315, + "grad_norm": 2.60054349899292, + "learning_rate": 4.71996255982084e-06, + "loss": 0.6249, + "step": 
2015 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 2.654355764389038, + "learning_rate": 4.719675609919603e-06, + "loss": 0.6327, + "step": 2016 + }, + { + "epoch": 0.9536643026004729, + "grad_norm": 2.589404582977295, + "learning_rate": 4.719388521808899e-06, + "loss": 0.6357, + "step": 2017 + }, + { + "epoch": 0.9541371158392435, + "grad_norm": 2.8016581535339355, + "learning_rate": 4.719101295506603e-06, + "loss": 0.5901, + "step": 2018 + }, + { + "epoch": 0.9546099290780142, + "grad_norm": 3.1408045291900635, + "learning_rate": 4.7188139310306e-06, + "loss": 0.598, + "step": 2019 + }, + { + "epoch": 0.9550827423167849, + "grad_norm": 2.7432665824890137, + "learning_rate": 4.718526428398783e-06, + "loss": 0.5508, + "step": 2020 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 2.947800874710083, + "learning_rate": 4.718238787629053e-06, + "loss": 0.6439, + "step": 2021 + }, + { + "epoch": 0.9560283687943263, + "grad_norm": 2.50828218460083, + "learning_rate": 4.71795100873932e-06, + "loss": 0.5441, + "step": 2022 + }, + { + "epoch": 0.956501182033097, + "grad_norm": 2.8558974266052246, + "learning_rate": 4.717663091747503e-06, + "loss": 0.5416, + "step": 2023 + }, + { + "epoch": 0.9569739952718677, + "grad_norm": 2.4803316593170166, + "learning_rate": 4.71737503667153e-06, + "loss": 0.5317, + "step": 2024 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 4.36754035949707, + "learning_rate": 4.717086843529336e-06, + "loss": 0.5808, + "step": 2025 + }, + { + "epoch": 0.957919621749409, + "grad_norm": 2.730185031890869, + "learning_rate": 4.7167985123388665e-06, + "loss": 0.5257, + "step": 2026 + }, + { + "epoch": 0.9583924349881797, + "grad_norm": 2.8136069774627686, + "learning_rate": 4.716510043118074e-06, + "loss": 0.5836, + "step": 2027 + }, + { + "epoch": 0.9588652482269504, + "grad_norm": 2.793975353240967, + "learning_rate": 4.71622143588492e-06, + "loss": 0.5706, + "step": 2028 + }, + { + "epoch": 0.9593380614657211, + "grad_norm": 2.3883821964263916, + "learning_rate": 4.7159326906573745e-06, + "loss": 0.5291, + "step": 2029 + }, + { + "epoch": 0.9598108747044918, + "grad_norm": 2.6135976314544678, + "learning_rate": 4.715643807453417e-06, + "loss": 0.6199, + "step": 2030 + }, + { + "epoch": 0.9602836879432625, + "grad_norm": 2.6245670318603516, + "learning_rate": 4.715354786291035e-06, + "loss": 0.5585, + "step": 2031 + }, + { + "epoch": 0.9607565011820332, + "grad_norm": 2.7870967388153076, + "learning_rate": 4.715065627188225e-06, + "loss": 0.6196, + "step": 2032 + }, + { + "epoch": 0.9612293144208038, + "grad_norm": 2.6983911991119385, + "learning_rate": 4.714776330162991e-06, + "loss": 0.6424, + "step": 2033 + }, + { + "epoch": 0.9617021276595744, + "grad_norm": 2.3221919536590576, + "learning_rate": 4.7144868952333465e-06, + "loss": 0.568, + "step": 2034 + }, + { + "epoch": 0.9621749408983451, + "grad_norm": 2.9408178329467773, + "learning_rate": 4.714197322417314e-06, + "loss": 0.6175, + "step": 2035 + }, + { + "epoch": 0.9626477541371158, + "grad_norm": 2.404057264328003, + "learning_rate": 4.713907611732921e-06, + "loss": 0.4943, + "step": 2036 + }, + { + "epoch": 0.9631205673758865, + "grad_norm": 3.547607660293579, + "learning_rate": 4.71361776319821e-06, + "loss": 0.5488, + "step": 2037 + }, + { + "epoch": 0.9635933806146572, + "grad_norm": 2.679614543914795, + "learning_rate": 4.713327776831227e-06, + "loss": 0.6234, + "step": 2038 + }, + { + "epoch": 0.9640661938534278, + "grad_norm": 2.526914119720459, + "learning_rate": 4.7130376526500286e-06, + 
"loss": 0.5891, + "step": 2039 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 2.6953470706939697, + "learning_rate": 4.71274739067268e-06, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 0.9650118203309692, + "grad_norm": 2.546660900115967, + "learning_rate": 4.712456990917254e-06, + "loss": 0.6185, + "step": 2041 + }, + { + "epoch": 0.9654846335697399, + "grad_norm": 3.3920490741729736, + "learning_rate": 4.712166453401832e-06, + "loss": 0.587, + "step": 2042 + }, + { + "epoch": 0.9659574468085106, + "grad_norm": 2.5961573123931885, + "learning_rate": 4.711875778144504e-06, + "loss": 0.6105, + "step": 2043 + }, + { + "epoch": 0.9664302600472813, + "grad_norm": 2.5111498832702637, + "learning_rate": 4.711584965163372e-06, + "loss": 0.5533, + "step": 2044 + }, + { + "epoch": 0.966903073286052, + "grad_norm": 2.4878132343292236, + "learning_rate": 4.7112940144765405e-06, + "loss": 0.5604, + "step": 2045 + }, + { + "epoch": 0.9673758865248226, + "grad_norm": 2.5714077949523926, + "learning_rate": 4.711002926102128e-06, + "loss": 0.5794, + "step": 2046 + }, + { + "epoch": 0.9678486997635933, + "grad_norm": 2.7069091796875, + "learning_rate": 4.710711700058257e-06, + "loss": 0.594, + "step": 2047 + }, + { + "epoch": 0.968321513002364, + "grad_norm": 2.8104631900787354, + "learning_rate": 4.710420336363063e-06, + "loss": 0.6247, + "step": 2048 + }, + { + "epoch": 0.9687943262411347, + "grad_norm": 2.8464386463165283, + "learning_rate": 4.7101288350346865e-06, + "loss": 0.6162, + "step": 2049 + }, + { + "epoch": 0.9692671394799054, + "grad_norm": 2.7187976837158203, + "learning_rate": 4.709837196091279e-06, + "loss": 0.6109, + "step": 2050 + }, + { + "epoch": 0.9697399527186761, + "grad_norm": 2.556734085083008, + "learning_rate": 4.709545419550999e-06, + "loss": 0.6297, + "step": 2051 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 2.937195062637329, + "learning_rate": 4.709253505432014e-06, + "loss": 0.6862, + "step": 2052 + }, + { + "epoch": 0.9706855791962175, + "grad_norm": 2.792175531387329, + "learning_rate": 4.7089614537525015e-06, + "loss": 0.6105, + "step": 2053 + }, + { + "epoch": 0.9711583924349881, + "grad_norm": 2.625636100769043, + "learning_rate": 4.708669264530644e-06, + "loss": 0.5849, + "step": 2054 + }, + { + "epoch": 0.9716312056737588, + "grad_norm": 2.6752610206604004, + "learning_rate": 4.708376937784637e-06, + "loss": 0.5949, + "step": 2055 + }, + { + "epoch": 0.9721040189125295, + "grad_norm": 2.6072793006896973, + "learning_rate": 4.708084473532681e-06, + "loss": 0.5776, + "step": 2056 + }, + { + "epoch": 0.9725768321513002, + "grad_norm": 2.728632926940918, + "learning_rate": 4.707791871792988e-06, + "loss": 0.6352, + "step": 2057 + }, + { + "epoch": 0.9730496453900709, + "grad_norm": 2.5841758251190186, + "learning_rate": 4.707499132583775e-06, + "loss": 0.5488, + "step": 2058 + }, + { + "epoch": 0.9735224586288416, + "grad_norm": 2.8464293479919434, + "learning_rate": 4.707206255923271e-06, + "loss": 0.7051, + "step": 2059 + }, + { + "epoch": 0.9739952718676123, + "grad_norm": 2.547297239303589, + "learning_rate": 4.706913241829712e-06, + "loss": 0.5937, + "step": 2060 + }, + { + "epoch": 0.9744680851063829, + "grad_norm": 2.6572306156158447, + "learning_rate": 4.706620090321341e-06, + "loss": 0.6041, + "step": 2061 + }, + { + "epoch": 0.9749408983451536, + "grad_norm": 2.3262805938720703, + "learning_rate": 4.706326801416414e-06, + "loss": 0.5144, + "step": 2062 + }, + { + "epoch": 0.9754137115839243, + "grad_norm": 2.9693965911865234, + 
"learning_rate": 4.706033375133191e-06, + "loss": 0.551, + "step": 2063 + }, + { + "epoch": 0.975886524822695, + "grad_norm": 2.5993731021881104, + "learning_rate": 4.7057398114899435e-06, + "loss": 0.6143, + "step": 2064 + }, + { + "epoch": 0.9763593380614657, + "grad_norm": 2.453336477279663, + "learning_rate": 4.70544611050495e-06, + "loss": 0.6093, + "step": 2065 + }, + { + "epoch": 0.9768321513002364, + "grad_norm": 2.898629665374756, + "learning_rate": 4.705152272196497e-06, + "loss": 0.6007, + "step": 2066 + }, + { + "epoch": 0.9773049645390071, + "grad_norm": 2.7990612983703613, + "learning_rate": 4.7048582965828815e-06, + "loss": 0.6687, + "step": 2067 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 2.635284423828125, + "learning_rate": 4.704564183682408e-06, + "loss": 0.5564, + "step": 2068 + }, + { + "epoch": 0.9782505910165484, + "grad_norm": 3.014547109603882, + "learning_rate": 4.704269933513389e-06, + "loss": 0.6084, + "step": 2069 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 2.659357786178589, + "learning_rate": 4.703975546094147e-06, + "loss": 0.6031, + "step": 2070 + }, + { + "epoch": 0.9791962174940898, + "grad_norm": 2.326932668685913, + "learning_rate": 4.703681021443013e-06, + "loss": 0.5859, + "step": 2071 + }, + { + "epoch": 0.9796690307328605, + "grad_norm": 2.958803653717041, + "learning_rate": 4.7033863595783235e-06, + "loss": 0.5586, + "step": 2072 + }, + { + "epoch": 0.9801418439716312, + "grad_norm": 2.921386957168579, + "learning_rate": 4.703091560518427e-06, + "loss": 0.6126, + "step": 2073 + }, + { + "epoch": 0.9806146572104019, + "grad_norm": 2.6500775814056396, + "learning_rate": 4.702796624281679e-06, + "loss": 0.5678, + "step": 2074 + }, + { + "epoch": 0.9810874704491725, + "grad_norm": 2.7740228176116943, + "learning_rate": 4.702501550886445e-06, + "loss": 0.6067, + "step": 2075 + }, + { + "epoch": 0.9815602836879432, + "grad_norm": 2.3296213150024414, + "learning_rate": 4.702206340351096e-06, + "loss": 0.5247, + "step": 2076 + }, + { + "epoch": 0.9820330969267139, + "grad_norm": 2.748300790786743, + "learning_rate": 4.701910992694016e-06, + "loss": 0.5197, + "step": 2077 + }, + { + "epoch": 0.9825059101654846, + "grad_norm": 2.250985622406006, + "learning_rate": 4.7016155079335926e-06, + "loss": 0.5214, + "step": 2078 + }, + { + "epoch": 0.9829787234042553, + "grad_norm": 2.389845848083496, + "learning_rate": 4.701319886088226e-06, + "loss": 0.519, + "step": 2079 + }, + { + "epoch": 0.983451536643026, + "grad_norm": 2.818220853805542, + "learning_rate": 4.701024127176322e-06, + "loss": 0.607, + "step": 2080 + }, + { + "epoch": 0.9839243498817967, + "grad_norm": 3.4058034420013428, + "learning_rate": 4.700728231216297e-06, + "loss": 0.5711, + "step": 2081 + }, + { + "epoch": 0.9843971631205674, + "grad_norm": 2.5297787189483643, + "learning_rate": 4.700432198226575e-06, + "loss": 0.5979, + "step": 2082 + }, + { + "epoch": 0.984869976359338, + "grad_norm": 3.0548105239868164, + "learning_rate": 4.7001360282255885e-06, + "loss": 0.6041, + "step": 2083 + }, + { + "epoch": 0.9853427895981087, + "grad_norm": 2.8983733654022217, + "learning_rate": 4.699839721231779e-06, + "loss": 0.5926, + "step": 2084 + }, + { + "epoch": 0.9858156028368794, + "grad_norm": 3.2717764377593994, + "learning_rate": 4.699543277263596e-06, + "loss": 0.6477, + "step": 2085 + }, + { + "epoch": 0.9862884160756501, + "grad_norm": 3.03729248046875, + "learning_rate": 4.699246696339497e-06, + "loss": 0.6786, + "step": 2086 + }, + { + "epoch": 0.9867612293144208, + 
"grad_norm": 2.852301597595215, + "learning_rate": 4.698949978477951e-06, + "loss": 0.6565, + "step": 2087 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 2.843485116958618, + "learning_rate": 4.698653123697431e-06, + "loss": 0.6627, + "step": 2088 + }, + { + "epoch": 0.9877068557919622, + "grad_norm": 2.6315064430236816, + "learning_rate": 4.698356132016423e-06, + "loss": 0.6577, + "step": 2089 + }, + { + "epoch": 0.9881796690307328, + "grad_norm": 2.7482151985168457, + "learning_rate": 4.698059003453417e-06, + "loss": 0.5514, + "step": 2090 + }, + { + "epoch": 0.9886524822695035, + "grad_norm": 2.826673746109009, + "learning_rate": 4.6977617380269145e-06, + "loss": 0.565, + "step": 2091 + }, + { + "epoch": 0.9891252955082742, + "grad_norm": 3.0273752212524414, + "learning_rate": 4.697464335755427e-06, + "loss": 0.6331, + "step": 2092 + }, + { + "epoch": 0.9895981087470449, + "grad_norm": 2.7551653385162354, + "learning_rate": 4.6971667966574695e-06, + "loss": 0.6486, + "step": 2093 + }, + { + "epoch": 0.9900709219858156, + "grad_norm": 2.656299114227295, + "learning_rate": 4.696869120751571e-06, + "loss": 0.6562, + "step": 2094 + }, + { + "epoch": 0.9905437352245863, + "grad_norm": 2.785322904586792, + "learning_rate": 4.696571308056265e-06, + "loss": 0.5892, + "step": 2095 + }, + { + "epoch": 0.991016548463357, + "grad_norm": 2.9334635734558105, + "learning_rate": 4.696273358590095e-06, + "loss": 0.6346, + "step": 2096 + }, + { + "epoch": 0.9914893617021276, + "grad_norm": 2.7944300174713135, + "learning_rate": 4.695975272371613e-06, + "loss": 0.5859, + "step": 2097 + }, + { + "epoch": 0.9919621749408983, + "grad_norm": 2.5416972637176514, + "learning_rate": 4.695677049419381e-06, + "loss": 0.5658, + "step": 2098 + }, + { + "epoch": 0.992434988179669, + "grad_norm": 2.4056856632232666, + "learning_rate": 4.695378689751966e-06, + "loss": 0.5121, + "step": 2099 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 2.614548683166504, + "learning_rate": 4.695080193387948e-06, + "loss": 0.5961, + "step": 2100 + }, + { + "epoch": 0.9933806146572104, + "grad_norm": 2.8966517448425293, + "learning_rate": 4.69478156034591e-06, + "loss": 0.5985, + "step": 2101 + }, + { + "epoch": 0.9938534278959811, + "grad_norm": 2.9514098167419434, + "learning_rate": 4.694482790644448e-06, + "loss": 0.5677, + "step": 2102 + }, + { + "epoch": 0.9943262411347518, + "grad_norm": 2.4326791763305664, + "learning_rate": 4.694183884302165e-06, + "loss": 0.5698, + "step": 2103 + }, + { + "epoch": 0.9947990543735225, + "grad_norm": 2.9242892265319824, + "learning_rate": 4.6938848413376735e-06, + "loss": 0.6245, + "step": 2104 + }, + { + "epoch": 0.9952718676122931, + "grad_norm": 2.9134104251861572, + "learning_rate": 4.693585661769593e-06, + "loss": 0.6164, + "step": 2105 + }, + { + "epoch": 0.9957446808510638, + "grad_norm": 2.472564458847046, + "learning_rate": 4.693286345616551e-06, + "loss": 0.5616, + "step": 2106 + }, + { + "epoch": 0.9962174940898345, + "grad_norm": 3.2456448078155518, + "learning_rate": 4.692986892897186e-06, + "loss": 0.6977, + "step": 2107 + }, + { + "epoch": 0.9966903073286052, + "grad_norm": 3.4032769203186035, + "learning_rate": 4.692687303630143e-06, + "loss": 0.643, + "step": 2108 + }, + { + "epoch": 0.9971631205673759, + "grad_norm": 2.722200870513916, + "learning_rate": 4.692387577834076e-06, + "loss": 0.5873, + "step": 2109 + }, + { + "epoch": 0.9976359338061466, + "grad_norm": 2.687532663345337, + "learning_rate": 4.692087715527648e-06, + "loss": 0.5423, + "step": 2110 + }, + 
{ + "epoch": 0.9981087470449173, + "grad_norm": 2.578613042831421, + "learning_rate": 4.6917877167295305e-06, + "loss": 0.5689, + "step": 2111 + }, + { + "epoch": 0.9985815602836879, + "grad_norm": 3.1806094646453857, + "learning_rate": 4.691487581458402e-06, + "loss": 0.6133, + "step": 2112 + }, + { + "epoch": 0.9990543735224586, + "grad_norm": 2.4449520111083984, + "learning_rate": 4.691187309732952e-06, + "loss": 0.5841, + "step": 2113 + }, + { + "epoch": 0.9995271867612293, + "grad_norm": 2.908749580383301, + "learning_rate": 4.690886901571875e-06, + "loss": 0.534, + "step": 2114 + }, + { + "epoch": 1.0, + "grad_norm": 4.019968032836914, + "learning_rate": 4.6905863569938785e-06, + "loss": 0.596, + "step": 2115 + }, + { + "epoch": 1.0004728132387706, + "grad_norm": 2.4319307804107666, + "learning_rate": 4.690285676017675e-06, + "loss": 0.4973, + "step": 2116 + }, + { + "epoch": 1.0009456264775414, + "grad_norm": 2.6366477012634277, + "learning_rate": 4.689984858661986e-06, + "loss": 0.5682, + "step": 2117 + }, + { + "epoch": 1.001418439716312, + "grad_norm": 2.815114974975586, + "learning_rate": 4.689683904945542e-06, + "loss": 0.5616, + "step": 2118 + }, + { + "epoch": 1.0018912529550827, + "grad_norm": 2.6680490970611572, + "learning_rate": 4.689382814887084e-06, + "loss": 0.5161, + "step": 2119 + }, + { + "epoch": 1.0023640661938533, + "grad_norm": 2.7406351566314697, + "learning_rate": 4.689081588505358e-06, + "loss": 0.4937, + "step": 2120 + }, + { + "epoch": 1.0028368794326241, + "grad_norm": 2.2832298278808594, + "learning_rate": 4.68878022581912e-06, + "loss": 0.4986, + "step": 2121 + }, + { + "epoch": 1.0033096926713947, + "grad_norm": 2.5525307655334473, + "learning_rate": 4.688478726847136e-06, + "loss": 0.4909, + "step": 2122 + }, + { + "epoch": 1.0037825059101655, + "grad_norm": 2.9843199253082275, + "learning_rate": 4.688177091608176e-06, + "loss": 0.6046, + "step": 2123 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 2.5231106281280518, + "learning_rate": 4.687875320121024e-06, + "loss": 0.5423, + "step": 2124 + }, + { + "epoch": 1.0047281323877069, + "grad_norm": 2.567599058151245, + "learning_rate": 4.68757341240447e-06, + "loss": 0.5092, + "step": 2125 + }, + { + "epoch": 1.0052009456264774, + "grad_norm": 2.768111228942871, + "learning_rate": 4.687271368477311e-06, + "loss": 0.5175, + "step": 2126 + }, + { + "epoch": 1.0056737588652482, + "grad_norm": 2.7223286628723145, + "learning_rate": 4.686969188358355e-06, + "loss": 0.5412, + "step": 2127 + }, + { + "epoch": 1.0061465721040188, + "grad_norm": 2.488299608230591, + "learning_rate": 4.686666872066418e-06, + "loss": 0.5288, + "step": 2128 + }, + { + "epoch": 1.0066193853427896, + "grad_norm": 2.882981777191162, + "learning_rate": 4.6863644196203215e-06, + "loss": 0.6117, + "step": 2129 + }, + { + "epoch": 1.0070921985815602, + "grad_norm": 3.0019447803497314, + "learning_rate": 4.686061831038901e-06, + "loss": 0.5308, + "step": 2130 + }, + { + "epoch": 1.007565011820331, + "grad_norm": 3.0056138038635254, + "learning_rate": 4.685759106340996e-06, + "loss": 0.5833, + "step": 2131 + }, + { + "epoch": 1.0080378250591016, + "grad_norm": 2.5709075927734375, + "learning_rate": 4.685456245545454e-06, + "loss": 0.5071, + "step": 2132 + }, + { + "epoch": 1.0085106382978724, + "grad_norm": 2.4641504287719727, + "learning_rate": 4.685153248671136e-06, + "loss": 0.4813, + "step": 2133 + }, + { + "epoch": 1.008983451536643, + "grad_norm": 2.374413013458252, + "learning_rate": 4.684850115736906e-06, + "loss": 0.5179, + 
"step": 2134 + }, + { + "epoch": 1.0094562647754137, + "grad_norm": 2.6504571437835693, + "learning_rate": 4.684546846761641e-06, + "loss": 0.437, + "step": 2135 + }, + { + "epoch": 1.0099290780141843, + "grad_norm": 2.5977871417999268, + "learning_rate": 4.684243441764221e-06, + "loss": 0.497, + "step": 2136 + }, + { + "epoch": 1.010401891252955, + "grad_norm": 2.4950785636901855, + "learning_rate": 4.683939900763541e-06, + "loss": 0.5624, + "step": 2137 + }, + { + "epoch": 1.0108747044917257, + "grad_norm": 3.065718412399292, + "learning_rate": 4.6836362237785e-06, + "loss": 0.512, + "step": 2138 + }, + { + "epoch": 1.0113475177304965, + "grad_norm": 2.7419207096099854, + "learning_rate": 4.6833324108280045e-06, + "loss": 0.5585, + "step": 2139 + }, + { + "epoch": 1.011820330969267, + "grad_norm": 2.623610496520996, + "learning_rate": 4.6830284619309744e-06, + "loss": 0.5163, + "step": 2140 + }, + { + "epoch": 1.0122931442080378, + "grad_norm": 2.774322986602783, + "learning_rate": 4.682724377106334e-06, + "loss": 0.527, + "step": 2141 + }, + { + "epoch": 1.0127659574468084, + "grad_norm": 2.959935188293457, + "learning_rate": 4.682420156373017e-06, + "loss": 0.6166, + "step": 2142 + }, + { + "epoch": 1.0132387706855792, + "grad_norm": 2.584026336669922, + "learning_rate": 4.682115799749968e-06, + "loss": 0.5086, + "step": 2143 + }, + { + "epoch": 1.0137115839243498, + "grad_norm": 2.6039700508117676, + "learning_rate": 4.6818113072561346e-06, + "loss": 0.49, + "step": 2144 + }, + { + "epoch": 1.0141843971631206, + "grad_norm": 2.466381072998047, + "learning_rate": 4.681506678910479e-06, + "loss": 0.4959, + "step": 2145 + }, + { + "epoch": 1.0146572104018912, + "grad_norm": 2.432636260986328, + "learning_rate": 4.681201914731969e-06, + "loss": 0.5057, + "step": 2146 + }, + { + "epoch": 1.015130023640662, + "grad_norm": 2.6134090423583984, + "learning_rate": 4.680897014739579e-06, + "loss": 0.4874, + "step": 2147 + }, + { + "epoch": 1.0156028368794325, + "grad_norm": 2.774481773376465, + "learning_rate": 4.680591978952295e-06, + "loss": 0.4967, + "step": 2148 + }, + { + "epoch": 1.0160756501182033, + "grad_norm": 2.66050124168396, + "learning_rate": 4.68028680738911e-06, + "loss": 0.4932, + "step": 2149 + }, + { + "epoch": 1.016548463356974, + "grad_norm": 3.020594835281372, + "learning_rate": 4.679981500069026e-06, + "loss": 0.5788, + "step": 2150 + }, + { + "epoch": 1.0170212765957447, + "grad_norm": 2.697758436203003, + "learning_rate": 4.679676057011053e-06, + "loss": 0.5441, + "step": 2151 + }, + { + "epoch": 1.0174940898345153, + "grad_norm": 6.986445903778076, + "learning_rate": 4.679370478234209e-06, + "loss": 0.6483, + "step": 2152 + }, + { + "epoch": 1.017966903073286, + "grad_norm": 2.6637115478515625, + "learning_rate": 4.679064763757522e-06, + "loss": 0.5859, + "step": 2153 + }, + { + "epoch": 1.0184397163120567, + "grad_norm": 2.7501862049102783, + "learning_rate": 4.678758913600027e-06, + "loss": 0.5745, + "step": 2154 + }, + { + "epoch": 1.0189125295508275, + "grad_norm": 2.7959372997283936, + "learning_rate": 4.678452927780768e-06, + "loss": 0.5076, + "step": 2155 + }, + { + "epoch": 1.019385342789598, + "grad_norm": 2.4377388954162598, + "learning_rate": 4.678146806318798e-06, + "loss": 0.5061, + "step": 2156 + }, + { + "epoch": 1.0198581560283688, + "grad_norm": 2.5478947162628174, + "learning_rate": 4.677840549233176e-06, + "loss": 0.4941, + "step": 2157 + }, + { + "epoch": 1.0203309692671394, + "grad_norm": 3.0956528186798096, + "learning_rate": 4.677534156542973e-06, 
+ "loss": 0.5879, + "step": 2158 + }, + { + "epoch": 1.0208037825059102, + "grad_norm": 2.5247607231140137, + "learning_rate": 4.6772276282672666e-06, + "loss": 0.5532, + "step": 2159 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 3.1972787380218506, + "learning_rate": 4.676920964425143e-06, + "loss": 0.6081, + "step": 2160 + }, + { + "epoch": 1.0217494089834516, + "grad_norm": 2.6173388957977295, + "learning_rate": 4.6766141650356955e-06, + "loss": 0.5001, + "step": 2161 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 2.9914398193359375, + "learning_rate": 4.676307230118029e-06, + "loss": 0.5566, + "step": 2162 + }, + { + "epoch": 1.022695035460993, + "grad_norm": 2.8011834621429443, + "learning_rate": 4.676000159691254e-06, + "loss": 0.4909, + "step": 2163 + }, + { + "epoch": 1.0231678486997635, + "grad_norm": 2.6049559116363525, + "learning_rate": 4.67569295377449e-06, + "loss": 0.5018, + "step": 2164 + }, + { + "epoch": 1.0236406619385343, + "grad_norm": 2.8175013065338135, + "learning_rate": 4.675385612386866e-06, + "loss": 0.5309, + "step": 2165 + }, + { + "epoch": 1.0241134751773049, + "grad_norm": 2.854696750640869, + "learning_rate": 4.675078135547519e-06, + "loss": 0.5627, + "step": 2166 + }, + { + "epoch": 1.0245862884160757, + "grad_norm": 3.1856436729431152, + "learning_rate": 4.674770523275594e-06, + "loss": 0.5475, + "step": 2167 + }, + { + "epoch": 1.0250591016548463, + "grad_norm": 2.8289129734039307, + "learning_rate": 4.674462775590244e-06, + "loss": 0.5878, + "step": 2168 + }, + { + "epoch": 1.025531914893617, + "grad_norm": 2.8824517726898193, + "learning_rate": 4.6741548925106325e-06, + "loss": 0.4392, + "step": 2169 + }, + { + "epoch": 1.0260047281323876, + "grad_norm": 2.7044589519500732, + "learning_rate": 4.673846874055928e-06, + "loss": 0.5264, + "step": 2170 + }, + { + "epoch": 1.0264775413711584, + "grad_norm": 2.575035810470581, + "learning_rate": 4.673538720245312e-06, + "loss": 0.4615, + "step": 2171 + }, + { + "epoch": 1.026950354609929, + "grad_norm": 2.48168683052063, + "learning_rate": 4.67323043109797e-06, + "loss": 0.4404, + "step": 2172 + }, + { + "epoch": 1.0274231678486998, + "grad_norm": 2.926593065261841, + "learning_rate": 4.672922006633098e-06, + "loss": 0.54, + "step": 2173 + }, + { + "epoch": 1.0278959810874704, + "grad_norm": 2.4610698223114014, + "learning_rate": 4.672613446869901e-06, + "loss": 0.5555, + "step": 2174 + }, + { + "epoch": 1.0283687943262412, + "grad_norm": 3.026901960372925, + "learning_rate": 4.672304751827592e-06, + "loss": 0.62, + "step": 2175 + }, + { + "epoch": 1.0288416075650118, + "grad_norm": 2.3946213722229004, + "learning_rate": 4.671995921525391e-06, + "loss": 0.5228, + "step": 2176 + }, + { + "epoch": 1.0293144208037825, + "grad_norm": 2.985020399093628, + "learning_rate": 4.671686955982528e-06, + "loss": 0.6256, + "step": 2177 + }, + { + "epoch": 1.0297872340425531, + "grad_norm": 3.0910139083862305, + "learning_rate": 4.671377855218239e-06, + "loss": 0.5893, + "step": 2178 + }, + { + "epoch": 1.030260047281324, + "grad_norm": 2.507805109024048, + "learning_rate": 4.6710686192517744e-06, + "loss": 0.5329, + "step": 2179 + }, + { + "epoch": 1.0307328605200945, + "grad_norm": 2.4514641761779785, + "learning_rate": 4.670759248102386e-06, + "loss": 0.4585, + "step": 2180 + }, + { + "epoch": 1.0312056737588653, + "grad_norm": 2.742838144302368, + "learning_rate": 4.670449741789337e-06, + "loss": 0.6255, + "step": 2181 + }, + { + "epoch": 1.0316784869976359, + "grad_norm": 2.374349594116211, + 
"learning_rate": 4.670140100331901e-06, + "loss": 0.5049, + "step": 2182 + }, + { + "epoch": 1.0321513002364067, + "grad_norm": 2.78894305229187, + "learning_rate": 4.669830323749356e-06, + "loss": 0.6061, + "step": 2183 + }, + { + "epoch": 1.0326241134751772, + "grad_norm": 2.7195091247558594, + "learning_rate": 4.6695204120609905e-06, + "loss": 0.592, + "step": 2184 + }, + { + "epoch": 1.033096926713948, + "grad_norm": 2.824411630630493, + "learning_rate": 4.6692103652861035e-06, + "loss": 0.5666, + "step": 2185 + }, + { + "epoch": 1.0335697399527186, + "grad_norm": 2.4981014728546143, + "learning_rate": 4.6689001834439975e-06, + "loss": 0.5045, + "step": 2186 + }, + { + "epoch": 1.0340425531914894, + "grad_norm": 2.7375214099884033, + "learning_rate": 4.668589866553988e-06, + "loss": 0.5305, + "step": 2187 + }, + { + "epoch": 1.03451536643026, + "grad_norm": 2.625345468521118, + "learning_rate": 4.668279414635396e-06, + "loss": 0.4819, + "step": 2188 + }, + { + "epoch": 1.0349881796690308, + "grad_norm": 2.60479736328125, + "learning_rate": 4.667968827707553e-06, + "loss": 0.55, + "step": 2189 + }, + { + "epoch": 1.0354609929078014, + "grad_norm": 2.642014741897583, + "learning_rate": 4.667658105789797e-06, + "loss": 0.5264, + "step": 2190 + }, + { + "epoch": 1.0359338061465722, + "grad_norm": 2.5439083576202393, + "learning_rate": 4.667347248901476e-06, + "loss": 0.4657, + "step": 2191 + }, + { + "epoch": 1.0364066193853427, + "grad_norm": 2.5537586212158203, + "learning_rate": 4.667036257061945e-06, + "loss": 0.527, + "step": 2192 + }, + { + "epoch": 1.0368794326241135, + "grad_norm": 2.595466375350952, + "learning_rate": 4.666725130290569e-06, + "loss": 0.5336, + "step": 2193 + }, + { + "epoch": 1.037352245862884, + "grad_norm": 3.5106313228607178, + "learning_rate": 4.666413868606719e-06, + "loss": 0.5176, + "step": 2194 + }, + { + "epoch": 1.037825059101655, + "grad_norm": 2.931553363800049, + "learning_rate": 4.666102472029778e-06, + "loss": 0.549, + "step": 2195 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 2.4325125217437744, + "learning_rate": 4.665790940579133e-06, + "loss": 0.5095, + "step": 2196 + }, + { + "epoch": 1.0387706855791963, + "grad_norm": 2.708477258682251, + "learning_rate": 4.665479274274184e-06, + "loss": 0.5264, + "step": 2197 + }, + { + "epoch": 1.0392434988179668, + "grad_norm": 2.905977487564087, + "learning_rate": 4.665167473134335e-06, + "loss": 0.5575, + "step": 2198 + }, + { + "epoch": 1.0397163120567376, + "grad_norm": 2.428938865661621, + "learning_rate": 4.664855537179003e-06, + "loss": 0.5099, + "step": 2199 + }, + { + "epoch": 1.0401891252955082, + "grad_norm": 2.8432137966156006, + "learning_rate": 4.6645434664276075e-06, + "loss": 0.5331, + "step": 2200 + }, + { + "epoch": 1.040661938534279, + "grad_norm": 2.5185136795043945, + "learning_rate": 4.6642312608995825e-06, + "loss": 0.5217, + "step": 2201 + }, + { + "epoch": 1.0411347517730496, + "grad_norm": 2.556607723236084, + "learning_rate": 4.663918920614366e-06, + "loss": 0.4431, + "step": 2202 + }, + { + "epoch": 1.0416075650118204, + "grad_norm": 3.1271166801452637, + "learning_rate": 4.663606445591407e-06, + "loss": 0.5398, + "step": 2203 + }, + { + "epoch": 1.042080378250591, + "grad_norm": 2.573680877685547, + "learning_rate": 4.663293835850162e-06, + "loss": 0.4713, + "step": 2204 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 2.5230324268341064, + "learning_rate": 4.662981091410096e-06, + "loss": 0.5571, + "step": 2205 + }, + { + "epoch": 1.0430260047281323, + 
"grad_norm": 2.552182912826538, + "learning_rate": 4.662668212290681e-06, + "loss": 0.5173, + "step": 2206 + }, + { + "epoch": 1.0434988179669031, + "grad_norm": 2.832345724105835, + "learning_rate": 4.6623551985113995e-06, + "loss": 0.525, + "step": 2207 + }, + { + "epoch": 1.0439716312056737, + "grad_norm": 2.9729080200195312, + "learning_rate": 4.6620420500917416e-06, + "loss": 0.6308, + "step": 2208 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 2.618187665939331, + "learning_rate": 4.661728767051206e-06, + "loss": 0.4942, + "step": 2209 + }, + { + "epoch": 1.044917257683215, + "grad_norm": 2.515566349029541, + "learning_rate": 4.661415349409299e-06, + "loss": 0.5229, + "step": 2210 + }, + { + "epoch": 1.0453900709219859, + "grad_norm": 2.8651459217071533, + "learning_rate": 4.6611017971855356e-06, + "loss": 0.5029, + "step": 2211 + }, + { + "epoch": 1.0458628841607565, + "grad_norm": 2.502405881881714, + "learning_rate": 4.660788110399439e-06, + "loss": 0.4732, + "step": 2212 + }, + { + "epoch": 1.0463356973995273, + "grad_norm": 2.540668249130249, + "learning_rate": 4.660474289070541e-06, + "loss": 0.547, + "step": 2213 + }, + { + "epoch": 1.0468085106382978, + "grad_norm": 2.803469181060791, + "learning_rate": 4.660160333218384e-06, + "loss": 0.5441, + "step": 2214 + }, + { + "epoch": 1.0472813238770686, + "grad_norm": 3.233325481414795, + "learning_rate": 4.659846242862514e-06, + "loss": 0.4457, + "step": 2215 + }, + { + "epoch": 1.0477541371158392, + "grad_norm": 2.549548387527466, + "learning_rate": 4.659532018022489e-06, + "loss": 0.5684, + "step": 2216 + }, + { + "epoch": 1.04822695035461, + "grad_norm": 2.6112852096557617, + "learning_rate": 4.659217658717875e-06, + "loss": 0.5323, + "step": 2217 + }, + { + "epoch": 1.0486997635933806, + "grad_norm": 2.347418785095215, + "learning_rate": 4.658903164968245e-06, + "loss": 0.5349, + "step": 2218 + }, + { + "epoch": 1.0491725768321514, + "grad_norm": 2.695502281188965, + "learning_rate": 4.658588536793182e-06, + "loss": 0.4883, + "step": 2219 + }, + { + "epoch": 1.049645390070922, + "grad_norm": 2.7575674057006836, + "learning_rate": 4.658273774212275e-06, + "loss": 0.5517, + "step": 2220 + }, + { + "epoch": 1.0501182033096927, + "grad_norm": 2.787855386734009, + "learning_rate": 4.6579588772451245e-06, + "loss": 0.5744, + "step": 2221 + }, + { + "epoch": 1.0505910165484633, + "grad_norm": 3.0699398517608643, + "learning_rate": 4.657643845911337e-06, + "loss": 0.5258, + "step": 2222 + }, + { + "epoch": 1.0510638297872341, + "grad_norm": 2.652040719985962, + "learning_rate": 4.657328680230527e-06, + "loss": 0.5141, + "step": 2223 + }, + { + "epoch": 1.0515366430260047, + "grad_norm": 2.6896369457244873, + "learning_rate": 4.657013380222322e-06, + "loss": 0.5139, + "step": 2224 + }, + { + "epoch": 1.0520094562647755, + "grad_norm": 2.551839590072632, + "learning_rate": 4.65669794590635e-06, + "loss": 0.5099, + "step": 2225 + }, + { + "epoch": 1.052482269503546, + "grad_norm": 2.8543262481689453, + "learning_rate": 4.656382377302255e-06, + "loss": 0.6085, + "step": 2226 + }, + { + "epoch": 1.0529550827423169, + "grad_norm": 2.871469259262085, + "learning_rate": 4.656066674429685e-06, + "loss": 0.6108, + "step": 2227 + }, + { + "epoch": 1.0534278959810874, + "grad_norm": 2.4840824604034424, + "learning_rate": 4.655750837308296e-06, + "loss": 0.4994, + "step": 2228 + }, + { + "epoch": 1.0539007092198582, + "grad_norm": 2.5203280448913574, + "learning_rate": 4.6554348659577555e-06, + "loss": 0.4928, + "step": 2229 + }, + { + 
"epoch": 1.0543735224586288, + "grad_norm": 2.9327683448791504, + "learning_rate": 4.655118760397737e-06, + "loss": 0.6324, + "step": 2230 + }, + { + "epoch": 1.0548463356973996, + "grad_norm": 2.6766855716705322, + "learning_rate": 4.654802520647924e-06, + "loss": 0.5178, + "step": 2231 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 2.8438873291015625, + "learning_rate": 4.654486146728006e-06, + "loss": 0.509, + "step": 2232 + }, + { + "epoch": 1.055791962174941, + "grad_norm": 2.538661241531372, + "learning_rate": 4.6541696386576826e-06, + "loss": 0.5463, + "step": 2233 + }, + { + "epoch": 1.0562647754137116, + "grad_norm": 2.829030990600586, + "learning_rate": 4.653852996456662e-06, + "loss": 0.5404, + "step": 2234 + }, + { + "epoch": 1.0567375886524824, + "grad_norm": 2.5657269954681396, + "learning_rate": 4.653536220144659e-06, + "loss": 0.5479, + "step": 2235 + }, + { + "epoch": 1.057210401891253, + "grad_norm": 2.6641297340393066, + "learning_rate": 4.653219309741399e-06, + "loss": 0.5503, + "step": 2236 + }, + { + "epoch": 1.0576832151300237, + "grad_norm": 2.966350555419922, + "learning_rate": 4.652902265266615e-06, + "loss": 0.6404, + "step": 2237 + }, + { + "epoch": 1.0581560283687943, + "grad_norm": 2.462430000305176, + "learning_rate": 4.6525850867400455e-06, + "loss": 0.4885, + "step": 2238 + }, + { + "epoch": 1.058628841607565, + "grad_norm": 2.1791880130767822, + "learning_rate": 4.652267774181443e-06, + "loss": 0.4405, + "step": 2239 + }, + { + "epoch": 1.0591016548463357, + "grad_norm": 2.5473732948303223, + "learning_rate": 4.651950327610563e-06, + "loss": 0.5295, + "step": 2240 + }, + { + "epoch": 1.0595744680851065, + "grad_norm": 2.70904803276062, + "learning_rate": 4.651632747047172e-06, + "loss": 0.5169, + "step": 2241 + }, + { + "epoch": 1.060047281323877, + "grad_norm": 3.8442928791046143, + "learning_rate": 4.651315032511045e-06, + "loss": 0.5473, + "step": 2242 + }, + { + "epoch": 1.0605200945626478, + "grad_norm": 2.8613383769989014, + "learning_rate": 4.650997184021963e-06, + "loss": 0.5445, + "step": 2243 + }, + { + "epoch": 1.0609929078014184, + "grad_norm": 2.5995829105377197, + "learning_rate": 4.6506792015997184e-06, + "loss": 0.5525, + "step": 2244 + }, + { + "epoch": 1.0614657210401892, + "grad_norm": 2.5465996265411377, + "learning_rate": 4.650361085264111e-06, + "loss": 0.5093, + "step": 2245 + }, + { + "epoch": 1.0619385342789598, + "grad_norm": 2.46553111076355, + "learning_rate": 4.650042835034948e-06, + "loss": 0.5375, + "step": 2246 + }, + { + "epoch": 1.0624113475177306, + "grad_norm": 2.6907830238342285, + "learning_rate": 4.649724450932045e-06, + "loss": 0.572, + "step": 2247 + }, + { + "epoch": 1.0628841607565012, + "grad_norm": 3.0671346187591553, + "learning_rate": 4.649405932975226e-06, + "loss": 0.4974, + "step": 2248 + }, + { + "epoch": 1.063356973995272, + "grad_norm": 2.5392491817474365, + "learning_rate": 4.649087281184325e-06, + "loss": 0.524, + "step": 2249 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 2.7498562335968018, + "learning_rate": 4.648768495579183e-06, + "loss": 0.5801, + "step": 2250 + }, + { + "epoch": 1.0643026004728133, + "grad_norm": 2.8536248207092285, + "learning_rate": 4.648449576179649e-06, + "loss": 0.5384, + "step": 2251 + }, + { + "epoch": 1.064775413711584, + "grad_norm": 2.7062792778015137, + "learning_rate": 4.64813052300558e-06, + "loss": 0.5262, + "step": 2252 + }, + { + "epoch": 1.0652482269503547, + "grad_norm": 2.798650026321411, + "learning_rate": 4.647811336076841e-06, + "loss": 
0.5719, + "step": 2253 + }, + { + "epoch": 1.0657210401891253, + "grad_norm": 2.9793951511383057, + "learning_rate": 4.647492015413311e-06, + "loss": 0.5377, + "step": 2254 + }, + { + "epoch": 1.066193853427896, + "grad_norm": 2.572129011154175, + "learning_rate": 4.647172561034868e-06, + "loss": 0.4791, + "step": 2255 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 3.7490930557250977, + "learning_rate": 4.646852972961405e-06, + "loss": 0.5423, + "step": 2256 + }, + { + "epoch": 1.0671394799054374, + "grad_norm": 2.626255750656128, + "learning_rate": 4.646533251212821e-06, + "loss": 0.5558, + "step": 2257 + }, + { + "epoch": 1.067612293144208, + "grad_norm": 2.8408126831054688, + "learning_rate": 4.646213395809023e-06, + "loss": 0.55, + "step": 2258 + }, + { + "epoch": 1.0680851063829788, + "grad_norm": 3.255606174468994, + "learning_rate": 4.645893406769929e-06, + "loss": 0.547, + "step": 2259 + }, + { + "epoch": 1.0685579196217494, + "grad_norm": 2.4352102279663086, + "learning_rate": 4.645573284115461e-06, + "loss": 0.4898, + "step": 2260 + }, + { + "epoch": 1.0690307328605202, + "grad_norm": 2.408634662628174, + "learning_rate": 4.6452530278655535e-06, + "loss": 0.5264, + "step": 2261 + }, + { + "epoch": 1.0695035460992908, + "grad_norm": 2.4220449924468994, + "learning_rate": 4.644932638040146e-06, + "loss": 0.5166, + "step": 2262 + }, + { + "epoch": 1.0699763593380616, + "grad_norm": 2.9188082218170166, + "learning_rate": 4.644612114659188e-06, + "loss": 0.5611, + "step": 2263 + }, + { + "epoch": 1.0704491725768321, + "grad_norm": 2.906557083129883, + "learning_rate": 4.644291457742638e-06, + "loss": 0.5515, + "step": 2264 + }, + { + "epoch": 1.070921985815603, + "grad_norm": 2.9039015769958496, + "learning_rate": 4.643970667310462e-06, + "loss": 0.5732, + "step": 2265 + }, + { + "epoch": 1.0713947990543735, + "grad_norm": 2.9985480308532715, + "learning_rate": 4.643649743382632e-06, + "loss": 0.563, + "step": 2266 + }, + { + "epoch": 1.0718676122931443, + "grad_norm": 2.5780906677246094, + "learning_rate": 4.6433286859791335e-06, + "loss": 0.502, + "step": 2267 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 2.590209722518921, + "learning_rate": 4.643007495119955e-06, + "loss": 0.4995, + "step": 2268 + }, + { + "epoch": 1.0728132387706855, + "grad_norm": 2.378894805908203, + "learning_rate": 4.642686170825097e-06, + "loss": 0.4886, + "step": 2269 + }, + { + "epoch": 1.0732860520094563, + "grad_norm": 2.6826229095458984, + "learning_rate": 4.642364713114567e-06, + "loss": 0.465, + "step": 2270 + }, + { + "epoch": 1.073758865248227, + "grad_norm": 2.627819538116455, + "learning_rate": 4.64204312200838e-06, + "loss": 0.4954, + "step": 2271 + }, + { + "epoch": 1.0742316784869976, + "grad_norm": 2.993021249771118, + "learning_rate": 4.641721397526561e-06, + "loss": 0.5073, + "step": 2272 + }, + { + "epoch": 1.0747044917257682, + "grad_norm": 2.719052791595459, + "learning_rate": 4.64139953968914e-06, + "loss": 0.538, + "step": 2273 + }, + { + "epoch": 1.075177304964539, + "grad_norm": 2.729252576828003, + "learning_rate": 4.6410775485161605e-06, + "loss": 0.552, + "step": 2274 + }, + { + "epoch": 1.0756501182033098, + "grad_norm": 2.924142599105835, + "learning_rate": 4.640755424027671e-06, + "loss": 0.522, + "step": 2275 + }, + { + "epoch": 1.0761229314420804, + "grad_norm": 3.329162120819092, + "learning_rate": 4.640433166243728e-06, + "loss": 0.5965, + "step": 2276 + }, + { + "epoch": 1.076595744680851, + "grad_norm": 2.9810245037078857, + "learning_rate": 
4.640110775184396e-06, + "loss": 0.5653, + "step": 2277 + }, + { + "epoch": 1.0770685579196217, + "grad_norm": 2.61772084236145, + "learning_rate": 4.639788250869751e-06, + "loss": 0.5382, + "step": 2278 + }, + { + "epoch": 1.0775413711583925, + "grad_norm": 2.741225004196167, + "learning_rate": 4.639465593319874e-06, + "loss": 0.4866, + "step": 2279 + }, + { + "epoch": 1.0780141843971631, + "grad_norm": 2.7945218086242676, + "learning_rate": 4.639142802554856e-06, + "loss": 0.4711, + "step": 2280 + }, + { + "epoch": 1.0784869976359337, + "grad_norm": 2.4282329082489014, + "learning_rate": 4.638819878594795e-06, + "loss": 0.4911, + "step": 2281 + }, + { + "epoch": 1.0789598108747045, + "grad_norm": 2.551741361618042, + "learning_rate": 4.638496821459799e-06, + "loss": 0.453, + "step": 2282 + }, + { + "epoch": 1.0794326241134753, + "grad_norm": 2.5622754096984863, + "learning_rate": 4.638173631169983e-06, + "loss": 0.5983, + "step": 2283 + }, + { + "epoch": 1.0799054373522459, + "grad_norm": 2.7748284339904785, + "learning_rate": 4.6378503077454715e-06, + "loss": 0.5143, + "step": 2284 + }, + { + "epoch": 1.0803782505910164, + "grad_norm": 2.7693238258361816, + "learning_rate": 4.637526851206394e-06, + "loss": 0.5929, + "step": 2285 + }, + { + "epoch": 1.0808510638297872, + "grad_norm": 2.705548048019409, + "learning_rate": 4.637203261572893e-06, + "loss": 0.5577, + "step": 2286 + }, + { + "epoch": 1.081323877068558, + "grad_norm": 2.739307165145874, + "learning_rate": 4.636879538865117e-06, + "loss": 0.5676, + "step": 2287 + }, + { + "epoch": 1.0817966903073286, + "grad_norm": 2.514059543609619, + "learning_rate": 4.636555683103221e-06, + "loss": 0.5001, + "step": 2288 + }, + { + "epoch": 1.0822695035460992, + "grad_norm": 2.7166874408721924, + "learning_rate": 4.636231694307372e-06, + "loss": 0.5411, + "step": 2289 + }, + { + "epoch": 1.08274231678487, + "grad_norm": 2.7661683559417725, + "learning_rate": 4.635907572497741e-06, + "loss": 0.6353, + "step": 2290 + }, + { + "epoch": 1.0832151300236406, + "grad_norm": 2.598381996154785, + "learning_rate": 4.635583317694512e-06, + "loss": 0.5213, + "step": 2291 + }, + { + "epoch": 1.0836879432624114, + "grad_norm": 2.821491003036499, + "learning_rate": 4.6352589299178744e-06, + "loss": 0.6172, + "step": 2292 + }, + { + "epoch": 1.084160756501182, + "grad_norm": 2.5422823429107666, + "learning_rate": 4.634934409188025e-06, + "loss": 0.5245, + "step": 2293 + }, + { + "epoch": 1.0846335697399527, + "grad_norm": 2.8264620304107666, + "learning_rate": 4.634609755525173e-06, + "loss": 0.5004, + "step": 2294 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 2.3286643028259277, + "learning_rate": 4.63428496894953e-06, + "loss": 0.4561, + "step": 2295 + }, + { + "epoch": 1.085579196217494, + "grad_norm": 2.462005376815796, + "learning_rate": 4.633960049481321e-06, + "loss": 0.4948, + "step": 2296 + }, + { + "epoch": 1.0860520094562647, + "grad_norm": 2.760258913040161, + "learning_rate": 4.633634997140777e-06, + "loss": 0.5407, + "step": 2297 + }, + { + "epoch": 1.0865248226950355, + "grad_norm": 3.0234217643737793, + "learning_rate": 4.633309811948138e-06, + "loss": 0.4914, + "step": 2298 + }, + { + "epoch": 1.086997635933806, + "grad_norm": 2.8380849361419678, + "learning_rate": 4.63298449392365e-06, + "loss": 0.5562, + "step": 2299 + }, + { + "epoch": 1.0874704491725768, + "grad_norm": 2.6201648712158203, + "learning_rate": 4.632659043087572e-06, + "loss": 0.5882, + "step": 2300 + }, + { + "epoch": 1.0879432624113474, + "grad_norm": 
2.586339235305786, + "learning_rate": 4.632333459460165e-06, + "loss": 0.4991, + "step": 2301 + }, + { + "epoch": 1.0884160756501182, + "grad_norm": 2.500115394592285, + "learning_rate": 4.632007743061705e-06, + "loss": 0.552, + "step": 2302 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 2.816390037536621, + "learning_rate": 4.63168189391247e-06, + "loss": 0.5301, + "step": 2303 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 2.975400924682617, + "learning_rate": 4.631355912032753e-06, + "loss": 0.6056, + "step": 2304 + }, + { + "epoch": 1.0898345153664302, + "grad_norm": 2.747985363006592, + "learning_rate": 4.631029797442846e-06, + "loss": 0.5335, + "step": 2305 + }, + { + "epoch": 1.090307328605201, + "grad_norm": 2.609281539916992, + "learning_rate": 4.630703550163059e-06, + "loss": 0.5189, + "step": 2306 + }, + { + "epoch": 1.0907801418439715, + "grad_norm": 2.624131202697754, + "learning_rate": 4.630377170213705e-06, + "loss": 0.5646, + "step": 2307 + }, + { + "epoch": 1.0912529550827423, + "grad_norm": 2.6186959743499756, + "learning_rate": 4.630050657615107e-06, + "loss": 0.5187, + "step": 2308 + }, + { + "epoch": 1.091725768321513, + "grad_norm": 2.9961764812469482, + "learning_rate": 4.629724012387594e-06, + "loss": 0.6207, + "step": 2309 + }, + { + "epoch": 1.0921985815602837, + "grad_norm": 2.665799140930176, + "learning_rate": 4.629397234551505e-06, + "loss": 0.5046, + "step": 2310 + }, + { + "epoch": 1.0926713947990543, + "grad_norm": 2.6154725551605225, + "learning_rate": 4.629070324127187e-06, + "loss": 0.5553, + "step": 2311 + }, + { + "epoch": 1.093144208037825, + "grad_norm": 2.702967643737793, + "learning_rate": 4.628743281134996e-06, + "loss": 0.5159, + "step": 2312 + }, + { + "epoch": 1.0936170212765957, + "grad_norm": 2.578080177307129, + "learning_rate": 4.628416105595295e-06, + "loss": 0.4934, + "step": 2313 + }, + { + "epoch": 1.0940898345153665, + "grad_norm": 2.8763060569763184, + "learning_rate": 4.628088797528456e-06, + "loss": 0.5404, + "step": 2314 + }, + { + "epoch": 1.094562647754137, + "grad_norm": 2.5301198959350586, + "learning_rate": 4.6277613569548585e-06, + "loss": 0.524, + "step": 2315 + }, + { + "epoch": 1.0950354609929078, + "grad_norm": 2.559903144836426, + "learning_rate": 4.627433783894892e-06, + "loss": 0.5177, + "step": 2316 + }, + { + "epoch": 1.0955082742316784, + "grad_norm": 2.430863380432129, + "learning_rate": 4.627106078368952e-06, + "loss": 0.5368, + "step": 2317 + }, + { + "epoch": 1.0959810874704492, + "grad_norm": 2.687567949295044, + "learning_rate": 4.626778240397444e-06, + "loss": 0.5385, + "step": 2318 + }, + { + "epoch": 1.0964539007092198, + "grad_norm": 3.053466558456421, + "learning_rate": 4.62645027000078e-06, + "loss": 0.5814, + "step": 2319 + }, + { + "epoch": 1.0969267139479906, + "grad_norm": 2.4612979888916016, + "learning_rate": 4.6261221671993815e-06, + "loss": 0.5069, + "step": 2320 + }, + { + "epoch": 1.0973995271867611, + "grad_norm": 2.6153628826141357, + "learning_rate": 4.625793932013679e-06, + "loss": 0.5422, + "step": 2321 + }, + { + "epoch": 1.097872340425532, + "grad_norm": 2.8918874263763428, + "learning_rate": 4.62546556446411e-06, + "loss": 0.5326, + "step": 2322 + }, + { + "epoch": 1.0983451536643025, + "grad_norm": 3.62565279006958, + "learning_rate": 4.625137064571119e-06, + "loss": 0.5164, + "step": 2323 + }, + { + "epoch": 1.0988179669030733, + "grad_norm": 2.4285085201263428, + "learning_rate": 4.624808432355164e-06, + "loss": 0.5084, + "step": 2324 + }, + { + "epoch": 
1.099290780141844, + "grad_norm": 2.593979835510254, + "learning_rate": 4.624479667836702e-06, + "loss": 0.4986, + "step": 2325 + }, + { + "epoch": 1.0997635933806147, + "grad_norm": 2.490752935409546, + "learning_rate": 4.624150771036208e-06, + "loss": 0.5296, + "step": 2326 + }, + { + "epoch": 1.1002364066193853, + "grad_norm": 2.67694091796875, + "learning_rate": 4.6238217419741595e-06, + "loss": 0.5229, + "step": 2327 + }, + { + "epoch": 1.100709219858156, + "grad_norm": 2.594147205352783, + "learning_rate": 4.623492580671044e-06, + "loss": 0.4916, + "step": 2328 + }, + { + "epoch": 1.1011820330969266, + "grad_norm": 2.943472385406494, + "learning_rate": 4.623163287147356e-06, + "loss": 0.5591, + "step": 2329 + }, + { + "epoch": 1.1016548463356974, + "grad_norm": 2.569410562515259, + "learning_rate": 4.622833861423601e-06, + "loss": 0.4648, + "step": 2330 + }, + { + "epoch": 1.102127659574468, + "grad_norm": 2.5490405559539795, + "learning_rate": 4.6225043035202886e-06, + "loss": 0.5493, + "step": 2331 + }, + { + "epoch": 1.1026004728132388, + "grad_norm": 2.5964598655700684, + "learning_rate": 4.622174613457941e-06, + "loss": 0.5358, + "step": 2332 + }, + { + "epoch": 1.1030732860520094, + "grad_norm": 2.6456820964813232, + "learning_rate": 4.621844791257085e-06, + "loss": 0.5864, + "step": 2333 + }, + { + "epoch": 1.1035460992907802, + "grad_norm": 2.861180067062378, + "learning_rate": 4.621514836938259e-06, + "loss": 0.6064, + "step": 2334 + }, + { + "epoch": 1.1040189125295508, + "grad_norm": 2.8199548721313477, + "learning_rate": 4.621184750522005e-06, + "loss": 0.5244, + "step": 2335 + }, + { + "epoch": 1.1044917257683216, + "grad_norm": 2.7398853302001953, + "learning_rate": 4.6208545320288795e-06, + "loss": 0.5496, + "step": 2336 + }, + { + "epoch": 1.1049645390070921, + "grad_norm": 2.7941031455993652, + "learning_rate": 4.620524181479441e-06, + "loss": 0.5496, + "step": 2337 + }, + { + "epoch": 1.105437352245863, + "grad_norm": 2.973785161972046, + "learning_rate": 4.620193698894259e-06, + "loss": 0.5492, + "step": 2338 + }, + { + "epoch": 1.1059101654846335, + "grad_norm": 2.650355815887451, + "learning_rate": 4.6198630842939144e-06, + "loss": 0.5392, + "step": 2339 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 2.9092214107513428, + "learning_rate": 4.61953233769899e-06, + "loss": 0.5305, + "step": 2340 + }, + { + "epoch": 1.1068557919621749, + "grad_norm": 2.6329731941223145, + "learning_rate": 4.61920145913008e-06, + "loss": 0.5031, + "step": 2341 + }, + { + "epoch": 1.1073286052009457, + "grad_norm": 2.7214207649230957, + "learning_rate": 4.618870448607788e-06, + "loss": 0.5536, + "step": 2342 + }, + { + "epoch": 1.1078014184397162, + "grad_norm": 2.873119592666626, + "learning_rate": 4.618539306152724e-06, + "loss": 0.4531, + "step": 2343 + }, + { + "epoch": 1.108274231678487, + "grad_norm": 2.701042413711548, + "learning_rate": 4.618208031785507e-06, + "loss": 0.5217, + "step": 2344 + }, + { + "epoch": 1.1087470449172576, + "grad_norm": 2.7189881801605225, + "learning_rate": 4.6178766255267635e-06, + "loss": 0.6205, + "step": 2345 + }, + { + "epoch": 1.1092198581560284, + "grad_norm": 2.546382188796997, + "learning_rate": 4.61754508739713e-06, + "loss": 0.5475, + "step": 2346 + }, + { + "epoch": 1.109692671394799, + "grad_norm": 2.8429276943206787, + "learning_rate": 4.617213417417249e-06, + "loss": 0.4809, + "step": 2347 + }, + { + "epoch": 1.1101654846335698, + "grad_norm": 2.9515812397003174, + "learning_rate": 4.616881615607772e-06, + "loss": 0.5067, + 
"step": 2348 + }, + { + "epoch": 1.1106382978723404, + "grad_norm": 2.5910723209381104, + "learning_rate": 4.616549681989358e-06, + "loss": 0.5368, + "step": 2349 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 2.80855655670166, + "learning_rate": 4.616217616582678e-06, + "loss": 0.5827, + "step": 2350 + }, + { + "epoch": 1.1115839243498817, + "grad_norm": 2.604383945465088, + "learning_rate": 4.6158854194084044e-06, + "loss": 0.5716, + "step": 2351 + }, + { + "epoch": 1.1120567375886525, + "grad_norm": 3.0585904121398926, + "learning_rate": 4.6155530904872246e-06, + "loss": 0.4998, + "step": 2352 + }, + { + "epoch": 1.112529550827423, + "grad_norm": 2.660961627960205, + "learning_rate": 4.61522062983983e-06, + "loss": 0.4533, + "step": 2353 + }, + { + "epoch": 1.113002364066194, + "grad_norm": 2.8042070865631104, + "learning_rate": 4.614888037486923e-06, + "loss": 0.5592, + "step": 2354 + }, + { + "epoch": 1.1134751773049645, + "grad_norm": 2.681664228439331, + "learning_rate": 4.61455531344921e-06, + "loss": 0.5439, + "step": 2355 + }, + { + "epoch": 1.1139479905437353, + "grad_norm": 2.905054807662964, + "learning_rate": 4.61422245774741e-06, + "loss": 0.5497, + "step": 2356 + }, + { + "epoch": 1.1144208037825059, + "grad_norm": 2.7979753017425537, + "learning_rate": 4.6138894704022484e-06, + "loss": 0.5374, + "step": 2357 + }, + { + "epoch": 1.1148936170212767, + "grad_norm": 2.965611696243286, + "learning_rate": 4.613556351434458e-06, + "loss": 0.5145, + "step": 2358 + }, + { + "epoch": 1.1153664302600472, + "grad_norm": 2.583134889602661, + "learning_rate": 4.613223100864782e-06, + "loss": 0.535, + "step": 2359 + }, + { + "epoch": 1.115839243498818, + "grad_norm": 2.5979621410369873, + "learning_rate": 4.61288971871397e-06, + "loss": 0.5514, + "step": 2360 + }, + { + "epoch": 1.1163120567375886, + "grad_norm": 3.0117669105529785, + "learning_rate": 4.612556205002779e-06, + "loss": 0.5266, + "step": 2361 + }, + { + "epoch": 1.1167848699763594, + "grad_norm": 2.425133466720581, + "learning_rate": 4.612222559751976e-06, + "loss": 0.4838, + "step": 2362 + }, + { + "epoch": 1.11725768321513, + "grad_norm": 2.5102691650390625, + "learning_rate": 4.611888782982337e-06, + "loss": 0.3947, + "step": 2363 + }, + { + "epoch": 1.1177304964539008, + "grad_norm": 3.0327367782592773, + "learning_rate": 4.611554874714645e-06, + "loss": 0.5753, + "step": 2364 + }, + { + "epoch": 1.1182033096926713, + "grad_norm": 2.4561009407043457, + "learning_rate": 4.6112208349696875e-06, + "loss": 0.5054, + "step": 2365 + }, + { + "epoch": 1.1186761229314421, + "grad_norm": 3.3898050785064697, + "learning_rate": 4.610886663768267e-06, + "loss": 0.5946, + "step": 2366 + }, + { + "epoch": 1.1191489361702127, + "grad_norm": 2.8112242221832275, + "learning_rate": 4.61055236113119e-06, + "loss": 0.5475, + "step": 2367 + }, + { + "epoch": 1.1196217494089835, + "grad_norm": 3.152946710586548, + "learning_rate": 4.610217927079272e-06, + "loss": 0.5165, + "step": 2368 + }, + { + "epoch": 1.120094562647754, + "grad_norm": 2.7847867012023926, + "learning_rate": 4.609883361633336e-06, + "loss": 0.5533, + "step": 2369 + }, + { + "epoch": 1.1205673758865249, + "grad_norm": 2.6376686096191406, + "learning_rate": 4.6095486648142155e-06, + "loss": 0.4942, + "step": 2370 + }, + { + "epoch": 1.1210401891252955, + "grad_norm": 3.123072862625122, + "learning_rate": 4.609213836642749e-06, + "loss": 0.616, + "step": 2371 + }, + { + "epoch": 1.1215130023640663, + "grad_norm": 2.802694320678711, + "learning_rate": 
4.608878877139786e-06, + "loss": 0.5323, + "step": 2372 + }, + { + "epoch": 1.1219858156028368, + "grad_norm": 2.3567938804626465, + "learning_rate": 4.6085437863261825e-06, + "loss": 0.4822, + "step": 2373 + }, + { + "epoch": 1.1224586288416076, + "grad_norm": 2.553112030029297, + "learning_rate": 4.608208564222804e-06, + "loss": 0.5447, + "step": 2374 + }, + { + "epoch": 1.1229314420803782, + "grad_norm": 3.0020132064819336, + "learning_rate": 4.607873210850521e-06, + "loss": 0.6486, + "step": 2375 + }, + { + "epoch": 1.123404255319149, + "grad_norm": 2.832442045211792, + "learning_rate": 4.607537726230216e-06, + "loss": 0.5257, + "step": 2376 + }, + { + "epoch": 1.1238770685579196, + "grad_norm": 2.471527099609375, + "learning_rate": 4.607202110382778e-06, + "loss": 0.4816, + "step": 2377 + }, + { + "epoch": 1.1243498817966904, + "grad_norm": 2.4232118129730225, + "learning_rate": 4.606866363329105e-06, + "loss": 0.5533, + "step": 2378 + }, + { + "epoch": 1.124822695035461, + "grad_norm": 2.477506637573242, + "learning_rate": 4.6065304850901025e-06, + "loss": 0.5223, + "step": 2379 + }, + { + "epoch": 1.1252955082742317, + "grad_norm": 3.54127836227417, + "learning_rate": 4.6061944756866824e-06, + "loss": 0.6514, + "step": 2380 + }, + { + "epoch": 1.1257683215130023, + "grad_norm": 2.5148677825927734, + "learning_rate": 4.605858335139768e-06, + "loss": 0.4864, + "step": 2381 + }, + { + "epoch": 1.1262411347517731, + "grad_norm": 2.8363659381866455, + "learning_rate": 4.605522063470289e-06, + "loss": 0.5034, + "step": 2382 + }, + { + "epoch": 1.1267139479905437, + "grad_norm": 2.4996654987335205, + "learning_rate": 4.605185660699184e-06, + "loss": 0.4126, + "step": 2383 + }, + { + "epoch": 1.1271867612293145, + "grad_norm": 2.352543830871582, + "learning_rate": 4.604849126847398e-06, + "loss": 0.5224, + "step": 2384 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 2.60101056098938, + "learning_rate": 4.6045124619358875e-06, + "loss": 0.4867, + "step": 2385 + }, + { + "epoch": 1.1281323877068559, + "grad_norm": 2.9471068382263184, + "learning_rate": 4.604175665985613e-06, + "loss": 0.6474, + "step": 2386 + }, + { + "epoch": 1.1286052009456264, + "grad_norm": 2.5933351516723633, + "learning_rate": 4.603838739017546e-06, + "loss": 0.5081, + "step": 2387 + }, + { + "epoch": 1.1290780141843972, + "grad_norm": 2.3740346431732178, + "learning_rate": 4.6035016810526665e-06, + "loss": 0.4438, + "step": 2388 + }, + { + "epoch": 1.1295508274231678, + "grad_norm": 2.675020217895508, + "learning_rate": 4.6031644921119614e-06, + "loss": 0.4968, + "step": 2389 + }, + { + "epoch": 1.1300236406619386, + "grad_norm": 2.599472999572754, + "learning_rate": 4.602827172216424e-06, + "loss": 0.5131, + "step": 2390 + }, + { + "epoch": 1.1304964539007092, + "grad_norm": 2.8176097869873047, + "learning_rate": 4.602489721387061e-06, + "loss": 0.5549, + "step": 2391 + }, + { + "epoch": 1.13096926713948, + "grad_norm": 2.466914176940918, + "learning_rate": 4.602152139644881e-06, + "loss": 0.5052, + "step": 2392 + }, + { + "epoch": 1.1314420803782506, + "grad_norm": 2.8938796520233154, + "learning_rate": 4.601814427010905e-06, + "loss": 0.6181, + "step": 2393 + }, + { + "epoch": 1.1319148936170214, + "grad_norm": 2.7390825748443604, + "learning_rate": 4.601476583506161e-06, + "loss": 0.5178, + "step": 2394 + }, + { + "epoch": 1.132387706855792, + "grad_norm": 3.180112838745117, + "learning_rate": 4.601138609151685e-06, + "loss": 0.6071, + "step": 2395 + }, + { + "epoch": 1.1328605200945627, + "grad_norm": 
2.9282350540161133, + "learning_rate": 4.600800503968521e-06, + "loss": 0.5557, + "step": 2396 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 2.6689717769622803, + "learning_rate": 4.6004622679777215e-06, + "loss": 0.4679, + "step": 2397 + }, + { + "epoch": 1.133806146572104, + "grad_norm": 2.651582956314087, + "learning_rate": 4.600123901200347e-06, + "loss": 0.4907, + "step": 2398 + }, + { + "epoch": 1.1342789598108747, + "grad_norm": 2.5702924728393555, + "learning_rate": 4.599785403657464e-06, + "loss": 0.4919, + "step": 2399 + }, + { + "epoch": 1.1347517730496455, + "grad_norm": 2.636812448501587, + "learning_rate": 4.599446775370153e-06, + "loss": 0.5091, + "step": 2400 + }, + { + "epoch": 1.135224586288416, + "grad_norm": 2.5965442657470703, + "learning_rate": 4.599108016359497e-06, + "loss": 0.5035, + "step": 2401 + }, + { + "epoch": 1.1356973995271868, + "grad_norm": 2.689732313156128, + "learning_rate": 4.5987691266465885e-06, + "loss": 0.5307, + "step": 2402 + }, + { + "epoch": 1.1361702127659574, + "grad_norm": 2.7256956100463867, + "learning_rate": 4.59843010625253e-06, + "loss": 0.5066, + "step": 2403 + }, + { + "epoch": 1.1366430260047282, + "grad_norm": 2.726020574569702, + "learning_rate": 4.59809095519843e-06, + "loss": 0.4805, + "step": 2404 + }, + { + "epoch": 1.1371158392434988, + "grad_norm": 2.703339099884033, + "learning_rate": 4.597751673505406e-06, + "loss": 0.4992, + "step": 2405 + }, + { + "epoch": 1.1375886524822696, + "grad_norm": 2.54455304145813, + "learning_rate": 4.5974122611945835e-06, + "loss": 0.5251, + "step": 2406 + }, + { + "epoch": 1.1380614657210402, + "grad_norm": 2.623507022857666, + "learning_rate": 4.597072718287096e-06, + "loss": 0.4831, + "step": 2407 + }, + { + "epoch": 1.138534278959811, + "grad_norm": 2.653590202331543, + "learning_rate": 4.596733044804086e-06, + "loss": 0.5646, + "step": 2408 + }, + { + "epoch": 1.1390070921985815, + "grad_norm": 2.8230600357055664, + "learning_rate": 4.5963932407667035e-06, + "loss": 0.514, + "step": 2409 + }, + { + "epoch": 1.1394799054373523, + "grad_norm": 2.6077451705932617, + "learning_rate": 4.5960533061961065e-06, + "loss": 0.4713, + "step": 2410 + }, + { + "epoch": 1.139952718676123, + "grad_norm": 2.3945798873901367, + "learning_rate": 4.595713241113461e-06, + "loss": 0.466, + "step": 2411 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 2.8100006580352783, + "learning_rate": 4.595373045539941e-06, + "loss": 0.5365, + "step": 2412 + }, + { + "epoch": 1.1408983451536643, + "grad_norm": 2.6825881004333496, + "learning_rate": 4.59503271949673e-06, + "loss": 0.4457, + "step": 2413 + }, + { + "epoch": 1.141371158392435, + "grad_norm": 2.969435691833496, + "learning_rate": 4.594692263005016e-06, + "loss": 0.5459, + "step": 2414 + }, + { + "epoch": 1.1418439716312057, + "grad_norm": 2.4103164672851562, + "learning_rate": 4.594351676086002e-06, + "loss": 0.4573, + "step": 2415 + }, + { + "epoch": 1.1423167848699765, + "grad_norm": 2.9450128078460693, + "learning_rate": 4.594010958760892e-06, + "loss": 0.5529, + "step": 2416 + }, + { + "epoch": 1.142789598108747, + "grad_norm": 2.6416335105895996, + "learning_rate": 4.593670111050901e-06, + "loss": 0.5153, + "step": 2417 + }, + { + "epoch": 1.1432624113475178, + "grad_norm": 2.473177194595337, + "learning_rate": 4.593329132977253e-06, + "loss": 0.4962, + "step": 2418 + }, + { + "epoch": 1.1437352245862884, + "grad_norm": 2.4494502544403076, + "learning_rate": 4.592988024561179e-06, + "loss": 0.5182, + "step": 2419 + }, + { + "epoch": 
1.1442080378250592, + "grad_norm": 2.773930311203003, + "learning_rate": 4.592646785823918e-06, + "loss": 0.4442, + "step": 2420 + }, + { + "epoch": 1.1446808510638298, + "grad_norm": 2.4733314514160156, + "learning_rate": 4.592305416786718e-06, + "loss": 0.5106, + "step": 2421 + }, + { + "epoch": 1.1451536643026006, + "grad_norm": 2.6870038509368896, + "learning_rate": 4.591963917470834e-06, + "loss": 0.5316, + "step": 2422 + }, + { + "epoch": 1.1456264775413711, + "grad_norm": 2.8989531993865967, + "learning_rate": 4.591622287897529e-06, + "loss": 0.5906, + "step": 2423 + }, + { + "epoch": 1.1460992907801417, + "grad_norm": 2.6349124908447266, + "learning_rate": 4.591280528088077e-06, + "loss": 0.6225, + "step": 2424 + }, + { + "epoch": 1.1465721040189125, + "grad_norm": 3.19022274017334, + "learning_rate": 4.5909386380637555e-06, + "loss": 0.555, + "step": 2425 + }, + { + "epoch": 1.1470449172576833, + "grad_norm": 3.1473541259765625, + "learning_rate": 4.5905966178458535e-06, + "loss": 0.537, + "step": 2426 + }, + { + "epoch": 1.147517730496454, + "grad_norm": 2.6996145248413086, + "learning_rate": 4.590254467455667e-06, + "loss": 0.565, + "step": 2427 + }, + { + "epoch": 1.1479905437352245, + "grad_norm": 2.830188274383545, + "learning_rate": 4.5899121869145015e-06, + "loss": 0.6773, + "step": 2428 + }, + { + "epoch": 1.1484633569739953, + "grad_norm": 2.4937260150909424, + "learning_rate": 4.589569776243667e-06, + "loss": 0.5484, + "step": 2429 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 2.54011869430542, + "learning_rate": 4.589227235464486e-06, + "loss": 0.5307, + "step": 2430 + }, + { + "epoch": 1.1494089834515366, + "grad_norm": 2.8764214515686035, + "learning_rate": 4.5888845645982845e-06, + "loss": 0.5296, + "step": 2431 + }, + { + "epoch": 1.1498817966903072, + "grad_norm": 2.637033462524414, + "learning_rate": 4.588541763666402e-06, + "loss": 0.5975, + "step": 2432 + }, + { + "epoch": 1.150354609929078, + "grad_norm": 2.8534255027770996, + "learning_rate": 4.5881988326901815e-06, + "loss": 0.5431, + "step": 2433 + }, + { + "epoch": 1.1508274231678488, + "grad_norm": 2.8546559810638428, + "learning_rate": 4.587855771690976e-06, + "loss": 0.469, + "step": 2434 + }, + { + "epoch": 1.1513002364066194, + "grad_norm": 2.9084973335266113, + "learning_rate": 4.587512580690146e-06, + "loss": 0.5566, + "step": 2435 + }, + { + "epoch": 1.15177304964539, + "grad_norm": 3.0993130207061768, + "learning_rate": 4.587169259709063e-06, + "loss": 0.5612, + "step": 2436 + }, + { + "epoch": 1.1522458628841608, + "grad_norm": 10.847400665283203, + "learning_rate": 4.5868258087691e-06, + "loss": 0.4678, + "step": 2437 + }, + { + "epoch": 1.1527186761229316, + "grad_norm": 2.6648571491241455, + "learning_rate": 4.586482227891645e-06, + "loss": 0.5951, + "step": 2438 + }, + { + "epoch": 1.1531914893617021, + "grad_norm": 2.529043197631836, + "learning_rate": 4.586138517098091e-06, + "loss": 0.5048, + "step": 2439 + }, + { + "epoch": 1.1536643026004727, + "grad_norm": 2.833904504776001, + "learning_rate": 4.585794676409839e-06, + "loss": 0.536, + "step": 2440 + }, + { + "epoch": 1.1541371158392435, + "grad_norm": 3.507657766342163, + "learning_rate": 4.585450705848298e-06, + "loss": 0.5954, + "step": 2441 + }, + { + "epoch": 1.1546099290780143, + "grad_norm": 2.6108388900756836, + "learning_rate": 4.585106605434887e-06, + "loss": 0.5684, + "step": 2442 + }, + { + "epoch": 1.1550827423167849, + "grad_norm": 2.490708589553833, + "learning_rate": 4.58476237519103e-06, + "loss": 0.4678, + 
"step": 2443 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 2.8192343711853027, + "learning_rate": 4.584418015138161e-06, + "loss": 0.5291, + "step": 2444 + }, + { + "epoch": 1.1560283687943262, + "grad_norm": 3.0878679752349854, + "learning_rate": 4.584073525297722e-06, + "loss": 0.5691, + "step": 2445 + }, + { + "epoch": 1.156501182033097, + "grad_norm": 3.1444318294525146, + "learning_rate": 4.583728905691163e-06, + "loss": 0.5643, + "step": 2446 + }, + { + "epoch": 1.1569739952718676, + "grad_norm": 3.02382230758667, + "learning_rate": 4.583384156339942e-06, + "loss": 0.6008, + "step": 2447 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 2.5942490100860596, + "learning_rate": 4.583039277265525e-06, + "loss": 0.5105, + "step": 2448 + }, + { + "epoch": 1.157919621749409, + "grad_norm": 2.938608407974243, + "learning_rate": 4.582694268489386e-06, + "loss": 0.5123, + "step": 2449 + }, + { + "epoch": 1.1583924349881798, + "grad_norm": 2.4622268676757812, + "learning_rate": 4.5823491300330075e-06, + "loss": 0.4538, + "step": 2450 + }, + { + "epoch": 1.1588652482269504, + "grad_norm": 2.4380505084991455, + "learning_rate": 4.5820038619178795e-06, + "loss": 0.4682, + "step": 2451 + }, + { + "epoch": 1.159338061465721, + "grad_norm": 2.479896068572998, + "learning_rate": 4.581658464165501e-06, + "loss": 0.4877, + "step": 2452 + }, + { + "epoch": 1.1598108747044917, + "grad_norm": 2.3373546600341797, + "learning_rate": 4.5813129367973765e-06, + "loss": 0.445, + "step": 2453 + }, + { + "epoch": 1.1602836879432625, + "grad_norm": 2.8586013317108154, + "learning_rate": 4.5809672798350214e-06, + "loss": 0.5232, + "step": 2454 + }, + { + "epoch": 1.160756501182033, + "grad_norm": 3.2302439212799072, + "learning_rate": 4.5806214932999595e-06, + "loss": 0.5336, + "step": 2455 + }, + { + "epoch": 1.1612293144208037, + "grad_norm": 3.1005783081054688, + "learning_rate": 4.580275577213721e-06, + "loss": 0.5123, + "step": 2456 + }, + { + "epoch": 1.1617021276595745, + "grad_norm": 2.7131073474884033, + "learning_rate": 4.579929531597842e-06, + "loss": 0.5648, + "step": 2457 + }, + { + "epoch": 1.1621749408983453, + "grad_norm": 2.5067050457000732, + "learning_rate": 4.579583356473874e-06, + "loss": 0.5324, + "step": 2458 + }, + { + "epoch": 1.1626477541371159, + "grad_norm": 2.7870543003082275, + "learning_rate": 4.579237051863366e-06, + "loss": 0.5094, + "step": 2459 + }, + { + "epoch": 1.1631205673758864, + "grad_norm": 2.739196300506592, + "learning_rate": 4.578890617787887e-06, + "loss": 0.5103, + "step": 2460 + }, + { + "epoch": 1.1635933806146572, + "grad_norm": 2.7108185291290283, + "learning_rate": 4.578544054269003e-06, + "loss": 0.533, + "step": 2461 + }, + { + "epoch": 1.1640661938534278, + "grad_norm": 3.028005361557007, + "learning_rate": 4.578197361328295e-06, + "loss": 0.636, + "step": 2462 + }, + { + "epoch": 1.1645390070921986, + "grad_norm": 2.4855129718780518, + "learning_rate": 4.5778505389873505e-06, + "loss": 0.501, + "step": 2463 + }, + { + "epoch": 1.1650118203309692, + "grad_norm": 2.6314198970794678, + "learning_rate": 4.577503587267764e-06, + "loss": 0.5812, + "step": 2464 + }, + { + "epoch": 1.16548463356974, + "grad_norm": 2.4209671020507812, + "learning_rate": 4.5771565061911385e-06, + "loss": 0.5168, + "step": 2465 + }, + { + "epoch": 1.1659574468085105, + "grad_norm": 2.526388645172119, + "learning_rate": 4.576809295779085e-06, + "loss": 0.5047, + "step": 2466 + }, + { + "epoch": 1.1664302600472813, + "grad_norm": 2.8278191089630127, + "learning_rate": 
4.576461956053224e-06, + "loss": 0.4759, + "step": 2467 + }, + { + "epoch": 1.166903073286052, + "grad_norm": 2.7862167358398438, + "learning_rate": 4.576114487035182e-06, + "loss": 0.5492, + "step": 2468 + }, + { + "epoch": 1.1673758865248227, + "grad_norm": 2.6303019523620605, + "learning_rate": 4.575766888746594e-06, + "loss": 0.5538, + "step": 2469 + }, + { + "epoch": 1.1678486997635933, + "grad_norm": 2.613104820251465, + "learning_rate": 4.5754191612091034e-06, + "loss": 0.5114, + "step": 2470 + }, + { + "epoch": 1.168321513002364, + "grad_norm": 2.653958320617676, + "learning_rate": 4.5750713044443625e-06, + "loss": 0.5858, + "step": 2471 + }, + { + "epoch": 1.1687943262411347, + "grad_norm": 3.1143975257873535, + "learning_rate": 4.574723318474031e-06, + "loss": 0.5193, + "step": 2472 + }, + { + "epoch": 1.1692671394799055, + "grad_norm": 3.05454421043396, + "learning_rate": 4.574375203319775e-06, + "loss": 0.464, + "step": 2473 + }, + { + "epoch": 1.169739952718676, + "grad_norm": 2.66626238822937, + "learning_rate": 4.574026959003272e-06, + "loss": 0.4988, + "step": 2474 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 2.8871963024139404, + "learning_rate": 4.573678585546203e-06, + "loss": 0.5557, + "step": 2475 + }, + { + "epoch": 1.1706855791962174, + "grad_norm": 2.592949628829956, + "learning_rate": 4.573330082970262e-06, + "loss": 0.5178, + "step": 2476 + }, + { + "epoch": 1.1711583924349882, + "grad_norm": 2.9111456871032715, + "learning_rate": 4.572981451297148e-06, + "loss": 0.5712, + "step": 2477 + }, + { + "epoch": 1.1716312056737588, + "grad_norm": 2.8152248859405518, + "learning_rate": 4.57263269054857e-06, + "loss": 0.5548, + "step": 2478 + }, + { + "epoch": 1.1721040189125296, + "grad_norm": 3.0292418003082275, + "learning_rate": 4.572283800746241e-06, + "loss": 0.5937, + "step": 2479 + }, + { + "epoch": 1.1725768321513002, + "grad_norm": 3.454618215560913, + "learning_rate": 4.571934781911886e-06, + "loss": 0.5537, + "step": 2480 + }, + { + "epoch": 1.173049645390071, + "grad_norm": 2.7817866802215576, + "learning_rate": 4.571585634067239e-06, + "loss": 0.5649, + "step": 2481 + }, + { + "epoch": 1.1735224586288415, + "grad_norm": 2.7989349365234375, + "learning_rate": 4.571236357234037e-06, + "loss": 0.5448, + "step": 2482 + }, + { + "epoch": 1.1739952718676123, + "grad_norm": 2.8863933086395264, + "learning_rate": 4.57088695143403e-06, + "loss": 0.63, + "step": 2483 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 2.5738039016723633, + "learning_rate": 4.570537416688972e-06, + "loss": 0.4702, + "step": 2484 + }, + { + "epoch": 1.1749408983451537, + "grad_norm": 3.003643274307251, + "learning_rate": 4.570187753020629e-06, + "loss": 0.5918, + "step": 2485 + }, + { + "epoch": 1.1754137115839243, + "grad_norm": 2.8619167804718018, + "learning_rate": 4.569837960450772e-06, + "loss": 0.5268, + "step": 2486 + }, + { + "epoch": 1.175886524822695, + "grad_norm": 2.876077175140381, + "learning_rate": 4.569488039001181e-06, + "loss": 0.4915, + "step": 2487 + }, + { + "epoch": 1.1763593380614656, + "grad_norm": 3.407115936279297, + "learning_rate": 4.569137988693644e-06, + "loss": 0.5761, + "step": 2488 + }, + { + "epoch": 1.1768321513002364, + "grad_norm": 2.7292826175689697, + "learning_rate": 4.568787809549958e-06, + "loss": 0.541, + "step": 2489 + }, + { + "epoch": 1.177304964539007, + "grad_norm": 2.8805999755859375, + "learning_rate": 4.568437501591926e-06, + "loss": 0.6223, + "step": 2490 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 
2.9264373779296875, + "learning_rate": 4.56808706484136e-06, + "loss": 0.6081, + "step": 2491 + }, + { + "epoch": 1.1782505910165484, + "grad_norm": 2.5167033672332764, + "learning_rate": 4.567736499320082e-06, + "loss": 0.5393, + "step": 2492 + }, + { + "epoch": 1.1787234042553192, + "grad_norm": 3.4647862911224365, + "learning_rate": 4.567385805049918e-06, + "loss": 0.4826, + "step": 2493 + }, + { + "epoch": 1.1791962174940898, + "grad_norm": 2.9824202060699463, + "learning_rate": 4.5670349820527055e-06, + "loss": 0.541, + "step": 2494 + }, + { + "epoch": 1.1796690307328606, + "grad_norm": 2.997105836868286, + "learning_rate": 4.5666840303502885e-06, + "loss": 0.5771, + "step": 2495 + }, + { + "epoch": 1.1801418439716311, + "grad_norm": 2.8728017807006836, + "learning_rate": 4.56633294996452e-06, + "loss": 0.4877, + "step": 2496 + }, + { + "epoch": 1.180614657210402, + "grad_norm": 2.626498222351074, + "learning_rate": 4.5659817409172565e-06, + "loss": 0.5296, + "step": 2497 + }, + { + "epoch": 1.1810874704491725, + "grad_norm": 2.87037992477417, + "learning_rate": 4.565630403230371e-06, + "loss": 0.539, + "step": 2498 + }, + { + "epoch": 1.1815602836879433, + "grad_norm": 2.5719685554504395, + "learning_rate": 4.5652789369257375e-06, + "loss": 0.5653, + "step": 2499 + }, + { + "epoch": 1.1820330969267139, + "grad_norm": 2.4842135906219482, + "learning_rate": 4.56492734202524e-06, + "loss": 0.515, + "step": 2500 + }, + { + "epoch": 1.1825059101654847, + "grad_norm": 2.640951156616211, + "learning_rate": 4.564575618550773e-06, + "loss": 0.5601, + "step": 2501 + }, + { + "epoch": 1.1829787234042553, + "grad_norm": 2.624394655227661, + "learning_rate": 4.564223766524234e-06, + "loss": 0.5551, + "step": 2502 + }, + { + "epoch": 1.183451536643026, + "grad_norm": 3.014537811279297, + "learning_rate": 4.563871785967533e-06, + "loss": 0.5212, + "step": 2503 + }, + { + "epoch": 1.1839243498817966, + "grad_norm": 2.8756890296936035, + "learning_rate": 4.563519676902585e-06, + "loss": 0.5132, + "step": 2504 + }, + { + "epoch": 1.1843971631205674, + "grad_norm": 2.636781692504883, + "learning_rate": 4.5631674393513145e-06, + "loss": 0.5323, + "step": 2505 + }, + { + "epoch": 1.184869976359338, + "grad_norm": 2.7233786582946777, + "learning_rate": 4.562815073335655e-06, + "loss": 0.5608, + "step": 2506 + }, + { + "epoch": 1.1853427895981088, + "grad_norm": 2.7158713340759277, + "learning_rate": 4.562462578877546e-06, + "loss": 0.5373, + "step": 2507 + }, + { + "epoch": 1.1858156028368794, + "grad_norm": 2.9754762649536133, + "learning_rate": 4.562109955998936e-06, + "loss": 0.5712, + "step": 2508 + }, + { + "epoch": 1.1862884160756502, + "grad_norm": 2.8815054893493652, + "learning_rate": 4.561757204721781e-06, + "loss": 0.6126, + "step": 2509 + }, + { + "epoch": 1.1867612293144207, + "grad_norm": 2.866319417953491, + "learning_rate": 4.561404325068045e-06, + "loss": 0.506, + "step": 2510 + }, + { + "epoch": 1.1872340425531915, + "grad_norm": 2.6187376976013184, + "learning_rate": 4.561051317059701e-06, + "loss": 0.4674, + "step": 2511 + }, + { + "epoch": 1.1877068557919621, + "grad_norm": 2.642552137374878, + "learning_rate": 4.560698180718729e-06, + "loss": 0.4793, + "step": 2512 + }, + { + "epoch": 1.188179669030733, + "grad_norm": 2.7815041542053223, + "learning_rate": 4.560344916067117e-06, + "loss": 0.5034, + "step": 2513 + }, + { + "epoch": 1.1886524822695035, + "grad_norm": 2.70853590965271, + "learning_rate": 4.559991523126862e-06, + "loss": 0.4811, + "step": 2514 + }, + { + "epoch": 
1.1891252955082743, + "grad_norm": 2.7049436569213867, + "learning_rate": 4.559638001919967e-06, + "loss": 0.547, + "step": 2515 + }, + { + "epoch": 1.1895981087470449, + "grad_norm": 2.766773223876953, + "learning_rate": 4.559284352468445e-06, + "loss": 0.5362, + "step": 2516 + }, + { + "epoch": 1.1900709219858157, + "grad_norm": 3.0064334869384766, + "learning_rate": 4.558930574794316e-06, + "loss": 0.5915, + "step": 2517 + }, + { + "epoch": 1.1905437352245862, + "grad_norm": 2.4899885654449463, + "learning_rate": 4.558576668919609e-06, + "loss": 0.4379, + "step": 2518 + }, + { + "epoch": 1.191016548463357, + "grad_norm": 2.925963878631592, + "learning_rate": 4.558222634866358e-06, + "loss": 0.5389, + "step": 2519 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 6.087667465209961, + "learning_rate": 4.55786847265661e-06, + "loss": 0.4777, + "step": 2520 + }, + { + "epoch": 1.1919621749408984, + "grad_norm": 2.4560582637786865, + "learning_rate": 4.5575141823124145e-06, + "loss": 0.5576, + "step": 2521 + }, + { + "epoch": 1.192434988179669, + "grad_norm": 3.184252977371216, + "learning_rate": 4.557159763855834e-06, + "loss": 0.5151, + "step": 2522 + }, + { + "epoch": 1.1929078014184398, + "grad_norm": 2.359722137451172, + "learning_rate": 4.556805217308935e-06, + "loss": 0.478, + "step": 2523 + }, + { + "epoch": 1.1933806146572103, + "grad_norm": 3.0821568965911865, + "learning_rate": 4.5564505426937935e-06, + "loss": 0.5784, + "step": 2524 + }, + { + "epoch": 1.1938534278959811, + "grad_norm": 2.9905128479003906, + "learning_rate": 4.5560957400324936e-06, + "loss": 0.6087, + "step": 2525 + }, + { + "epoch": 1.1943262411347517, + "grad_norm": 2.462102174758911, + "learning_rate": 4.555740809347128e-06, + "loss": 0.4739, + "step": 2526 + }, + { + "epoch": 1.1947990543735225, + "grad_norm": 2.7931067943573, + "learning_rate": 4.555385750659796e-06, + "loss": 0.4961, + "step": 2527 + }, + { + "epoch": 1.195271867612293, + "grad_norm": 2.660320997238159, + "learning_rate": 4.555030563992607e-06, + "loss": 0.487, + "step": 2528 + }, + { + "epoch": 1.195744680851064, + "grad_norm": 2.8135557174682617, + "learning_rate": 4.554675249367675e-06, + "loss": 0.5269, + "step": 2529 + }, + { + "epoch": 1.1962174940898345, + "grad_norm": 2.661933422088623, + "learning_rate": 4.554319806807126e-06, + "loss": 0.4723, + "step": 2530 + }, + { + "epoch": 1.1966903073286053, + "grad_norm": 2.568176507949829, + "learning_rate": 4.553964236333089e-06, + "loss": 0.5258, + "step": 2531 + }, + { + "epoch": 1.1971631205673758, + "grad_norm": 2.6890947818756104, + "learning_rate": 4.553608537967705e-06, + "loss": 0.4965, + "step": 2532 + }, + { + "epoch": 1.1976359338061466, + "grad_norm": 3.133470058441162, + "learning_rate": 4.553252711733124e-06, + "loss": 0.5423, + "step": 2533 + }, + { + "epoch": 1.1981087470449172, + "grad_norm": 2.7086687088012695, + "learning_rate": 4.552896757651498e-06, + "loss": 0.5326, + "step": 2534 + }, + { + "epoch": 1.198581560283688, + "grad_norm": 2.8411715030670166, + "learning_rate": 4.552540675744994e-06, + "loss": 0.5793, + "step": 2535 + }, + { + "epoch": 1.1990543735224586, + "grad_norm": 3.041077136993408, + "learning_rate": 4.552184466035782e-06, + "loss": 0.5068, + "step": 2536 + }, + { + "epoch": 1.1995271867612294, + "grad_norm": 2.5921192169189453, + "learning_rate": 4.551828128546041e-06, + "loss": 0.5189, + "step": 2537 + }, + { + "epoch": 1.2, + "grad_norm": 2.923305034637451, + "learning_rate": 4.5514716632979605e-06, + "loss": 0.516, + "step": 2538 + }, + { + 
"epoch": 1.2004728132387708, + "grad_norm": 2.7083024978637695, + "learning_rate": 4.551115070313734e-06, + "loss": 0.4825, + "step": 2539 + }, + { + "epoch": 1.2009456264775413, + "grad_norm": 2.746842384338379, + "learning_rate": 4.550758349615567e-06, + "loss": 0.5691, + "step": 2540 + }, + { + "epoch": 1.2014184397163121, + "grad_norm": 2.6596429347991943, + "learning_rate": 4.550401501225669e-06, + "loss": 0.5983, + "step": 2541 + }, + { + "epoch": 1.2018912529550827, + "grad_norm": 2.9057931900024414, + "learning_rate": 4.550044525166261e-06, + "loss": 0.5069, + "step": 2542 + }, + { + "epoch": 1.2023640661938535, + "grad_norm": 2.6139039993286133, + "learning_rate": 4.5496874214595686e-06, + "loss": 0.5102, + "step": 2543 + }, + { + "epoch": 1.202836879432624, + "grad_norm": 2.630286455154419, + "learning_rate": 4.5493301901278285e-06, + "loss": 0.4902, + "step": 2544 + }, + { + "epoch": 1.2033096926713949, + "grad_norm": 2.639174222946167, + "learning_rate": 4.548972831193284e-06, + "loss": 0.4566, + "step": 2545 + }, + { + "epoch": 1.2037825059101654, + "grad_norm": 2.9569664001464844, + "learning_rate": 4.548615344678186e-06, + "loss": 0.5636, + "step": 2546 + }, + { + "epoch": 1.2042553191489362, + "grad_norm": 2.981734037399292, + "learning_rate": 4.5482577306047924e-06, + "loss": 0.4884, + "step": 2547 + }, + { + "epoch": 1.2047281323877068, + "grad_norm": 2.6760342121124268, + "learning_rate": 4.547899988995371e-06, + "loss": 0.5426, + "step": 2548 + }, + { + "epoch": 1.2052009456264776, + "grad_norm": 2.825805902481079, + "learning_rate": 4.547542119872198e-06, + "loss": 0.4989, + "step": 2549 + }, + { + "epoch": 1.2056737588652482, + "grad_norm": 2.856426954269409, + "learning_rate": 4.547184123257555e-06, + "loss": 0.5734, + "step": 2550 + }, + { + "epoch": 1.206146572104019, + "grad_norm": 2.555682420730591, + "learning_rate": 4.5468259991737334e-06, + "loss": 0.5299, + "step": 2551 + }, + { + "epoch": 1.2066193853427896, + "grad_norm": 2.6324024200439453, + "learning_rate": 4.546467747643032e-06, + "loss": 0.5906, + "step": 2552 + }, + { + "epoch": 1.2070921985815604, + "grad_norm": 3.4145350456237793, + "learning_rate": 4.546109368687757e-06, + "loss": 0.5153, + "step": 2553 + }, + { + "epoch": 1.207565011820331, + "grad_norm": 2.658691644668579, + "learning_rate": 4.545750862330225e-06, + "loss": 0.5759, + "step": 2554 + }, + { + "epoch": 1.2080378250591017, + "grad_norm": 3.162605047225952, + "learning_rate": 4.545392228592755e-06, + "loss": 0.5379, + "step": 2555 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 2.8631198406219482, + "learning_rate": 4.545033467497681e-06, + "loss": 0.5959, + "step": 2556 + }, + { + "epoch": 1.208983451536643, + "grad_norm": 2.457109212875366, + "learning_rate": 4.54467457906734e-06, + "loss": 0.4864, + "step": 2557 + }, + { + "epoch": 1.2094562647754137, + "grad_norm": 2.5307061672210693, + "learning_rate": 4.544315563324078e-06, + "loss": 0.5308, + "step": 2558 + }, + { + "epoch": 1.2099290780141845, + "grad_norm": 2.8482773303985596, + "learning_rate": 4.543956420290251e-06, + "loss": 0.5126, + "step": 2559 + }, + { + "epoch": 1.210401891252955, + "grad_norm": 2.4990832805633545, + "learning_rate": 4.5435971499882195e-06, + "loss": 0.4534, + "step": 2560 + }, + { + "epoch": 1.2108747044917259, + "grad_norm": 2.6292665004730225, + "learning_rate": 4.543237752440354e-06, + "loss": 0.4434, + "step": 2561 + }, + { + "epoch": 1.2113475177304964, + "grad_norm": 2.865983247756958, + "learning_rate": 4.542878227669033e-06, + "loss": 
0.5667, + "step": 2562 + }, + { + "epoch": 1.2118203309692672, + "grad_norm": 2.745614528656006, + "learning_rate": 4.542518575696644e-06, + "loss": 0.4724, + "step": 2563 + }, + { + "epoch": 1.2122931442080378, + "grad_norm": 2.8562581539154053, + "learning_rate": 4.5421587965455785e-06, + "loss": 0.5405, + "step": 2564 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 2.6670095920562744, + "learning_rate": 4.5417988902382385e-06, + "loss": 0.5432, + "step": 2565 + }, + { + "epoch": 1.2132387706855792, + "grad_norm": 2.9320743083953857, + "learning_rate": 4.541438856797036e-06, + "loss": 0.5862, + "step": 2566 + }, + { + "epoch": 1.21371158392435, + "grad_norm": 2.577505588531494, + "learning_rate": 4.541078696244386e-06, + "loss": 0.4742, + "step": 2567 + }, + { + "epoch": 1.2141843971631205, + "grad_norm": 3.4476120471954346, + "learning_rate": 4.540718408602717e-06, + "loss": 0.5903, + "step": 2568 + }, + { + "epoch": 1.2146572104018913, + "grad_norm": 2.816210985183716, + "learning_rate": 4.540357993894459e-06, + "loss": 0.5033, + "step": 2569 + }, + { + "epoch": 1.215130023640662, + "grad_norm": 3.0806639194488525, + "learning_rate": 4.539997452142058e-06, + "loss": 0.6064, + "step": 2570 + }, + { + "epoch": 1.2156028368794327, + "grad_norm": 2.563060760498047, + "learning_rate": 4.5396367833679586e-06, + "loss": 0.5597, + "step": 2571 + }, + { + "epoch": 1.2160756501182033, + "grad_norm": 3.1014397144317627, + "learning_rate": 4.5392759875946215e-06, + "loss": 0.54, + "step": 2572 + }, + { + "epoch": 1.216548463356974, + "grad_norm": 3.124190330505371, + "learning_rate": 4.53891506484451e-06, + "loss": 0.5122, + "step": 2573 + }, + { + "epoch": 1.2170212765957447, + "grad_norm": 2.6688716411590576, + "learning_rate": 4.538554015140097e-06, + "loss": 0.5615, + "step": 2574 + }, + { + "epoch": 1.2174940898345155, + "grad_norm": 2.775543689727783, + "learning_rate": 4.538192838503866e-06, + "loss": 0.496, + "step": 2575 + }, + { + "epoch": 1.217966903073286, + "grad_norm": 2.7877283096313477, + "learning_rate": 4.537831534958303e-06, + "loss": 0.4995, + "step": 2576 + }, + { + "epoch": 1.2184397163120568, + "grad_norm": 2.824810028076172, + "learning_rate": 4.537470104525906e-06, + "loss": 0.5481, + "step": 2577 + }, + { + "epoch": 1.2189125295508274, + "grad_norm": 2.801269292831421, + "learning_rate": 4.53710854722918e-06, + "loss": 0.5628, + "step": 2578 + }, + { + "epoch": 1.2193853427895982, + "grad_norm": 2.7780683040618896, + "learning_rate": 4.536746863090637e-06, + "loss": 0.4845, + "step": 2579 + }, + { + "epoch": 1.2198581560283688, + "grad_norm": 2.536010265350342, + "learning_rate": 4.536385052132798e-06, + "loss": 0.4771, + "step": 2580 + }, + { + "epoch": 1.2203309692671396, + "grad_norm": 2.768775701522827, + "learning_rate": 4.536023114378191e-06, + "loss": 0.5366, + "step": 2581 + }, + { + "epoch": 1.2208037825059102, + "grad_norm": 2.658125877380371, + "learning_rate": 4.535661049849352e-06, + "loss": 0.524, + "step": 2582 + }, + { + "epoch": 1.2212765957446807, + "grad_norm": 2.558696746826172, + "learning_rate": 4.535298858568825e-06, + "loss": 0.5482, + "step": 2583 + }, + { + "epoch": 1.2217494089834515, + "grad_norm": 2.5284535884857178, + "learning_rate": 4.534936540559164e-06, + "loss": 0.4454, + "step": 2584 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 7.617330074310303, + "learning_rate": 4.534574095842927e-06, + "loss": 0.5615, + "step": 2585 + }, + { + "epoch": 1.222695035460993, + "grad_norm": 2.9120311737060547, + "learning_rate": 
4.534211524442682e-06, + "loss": 0.5624, + "step": 2586 + }, + { + "epoch": 1.2231678486997635, + "grad_norm": 2.5004289150238037, + "learning_rate": 4.533848826381005e-06, + "loss": 0.4743, + "step": 2587 + }, + { + "epoch": 1.2236406619385343, + "grad_norm": 2.8395533561706543, + "learning_rate": 4.53348600168048e-06, + "loss": 0.4457, + "step": 2588 + }, + { + "epoch": 1.224113475177305, + "grad_norm": 2.832211494445801, + "learning_rate": 4.533123050363699e-06, + "loss": 0.5559, + "step": 2589 + }, + { + "epoch": 1.2245862884160756, + "grad_norm": 2.6318583488464355, + "learning_rate": 4.53275997245326e-06, + "loss": 0.5281, + "step": 2590 + }, + { + "epoch": 1.2250591016548462, + "grad_norm": 3.0509233474731445, + "learning_rate": 4.532396767971771e-06, + "loss": 0.6003, + "step": 2591 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 2.6863620281219482, + "learning_rate": 4.532033436941847e-06, + "loss": 0.5219, + "step": 2592 + }, + { + "epoch": 1.2260047281323878, + "grad_norm": 2.401463747024536, + "learning_rate": 4.5316699793861104e-06, + "loss": 0.5994, + "step": 2593 + }, + { + "epoch": 1.2264775413711584, + "grad_norm": 2.613517999649048, + "learning_rate": 4.531306395327194e-06, + "loss": 0.5785, + "step": 2594 + }, + { + "epoch": 1.226950354609929, + "grad_norm": 2.5016374588012695, + "learning_rate": 4.530942684787735e-06, + "loss": 0.5695, + "step": 2595 + }, + { + "epoch": 1.2274231678486998, + "grad_norm": 2.576464891433716, + "learning_rate": 4.53057884779038e-06, + "loss": 0.4427, + "step": 2596 + }, + { + "epoch": 1.2278959810874706, + "grad_norm": 2.5688700675964355, + "learning_rate": 4.530214884357785e-06, + "loss": 0.4966, + "step": 2597 + }, + { + "epoch": 1.2283687943262411, + "grad_norm": 3.179013729095459, + "learning_rate": 4.52985079451261e-06, + "loss": 0.5239, + "step": 2598 + }, + { + "epoch": 1.2288416075650117, + "grad_norm": 2.6015284061431885, + "learning_rate": 4.529486578277527e-06, + "loss": 0.5135, + "step": 2599 + }, + { + "epoch": 1.2293144208037825, + "grad_norm": 2.3029589653015137, + "learning_rate": 4.529122235675214e-06, + "loss": 0.4044, + "step": 2600 + }, + { + "epoch": 1.2297872340425533, + "grad_norm": 2.994093656539917, + "learning_rate": 4.528757766728357e-06, + "loss": 0.5419, + "step": 2601 + }, + { + "epoch": 1.2302600472813239, + "grad_norm": 2.6297390460968018, + "learning_rate": 4.52839317145965e-06, + "loss": 0.488, + "step": 2602 + }, + { + "epoch": 1.2307328605200945, + "grad_norm": 2.4814043045043945, + "learning_rate": 4.528028449891793e-06, + "loss": 0.4917, + "step": 2603 + }, + { + "epoch": 1.2312056737588652, + "grad_norm": 3.6052863597869873, + "learning_rate": 4.527663602047499e-06, + "loss": 0.5301, + "step": 2604 + }, + { + "epoch": 1.231678486997636, + "grad_norm": 2.6984751224517822, + "learning_rate": 4.5272986279494825e-06, + "loss": 0.5253, + "step": 2605 + }, + { + "epoch": 1.2321513002364066, + "grad_norm": 2.514000415802002, + "learning_rate": 4.526933527620469e-06, + "loss": 0.5661, + "step": 2606 + }, + { + "epoch": 1.2326241134751772, + "grad_norm": 2.890921115875244, + "learning_rate": 4.526568301083195e-06, + "loss": 0.5585, + "step": 2607 + }, + { + "epoch": 1.233096926713948, + "grad_norm": 2.6390011310577393, + "learning_rate": 4.526202948360397e-06, + "loss": 0.5168, + "step": 2608 + }, + { + "epoch": 1.2335697399527188, + "grad_norm": 2.7370636463165283, + "learning_rate": 4.5258374694748266e-06, + "loss": 0.5453, + "step": 2609 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 
2.8203976154327393, + "learning_rate": 4.52547186444924e-06, + "loss": 0.5763, + "step": 2610 + }, + { + "epoch": 1.23451536643026, + "grad_norm": 2.7567849159240723, + "learning_rate": 4.5251061333064025e-06, + "loss": 0.5194, + "step": 2611 + }, + { + "epoch": 1.2349881796690307, + "grad_norm": 2.767519474029541, + "learning_rate": 4.524740276069085e-06, + "loss": 0.5355, + "step": 2612 + }, + { + "epoch": 1.2354609929078015, + "grad_norm": 3.072035312652588, + "learning_rate": 4.5243742927600695e-06, + "loss": 0.5391, + "step": 2613 + }, + { + "epoch": 1.2359338061465721, + "grad_norm": 2.5957462787628174, + "learning_rate": 4.524008183402143e-06, + "loss": 0.5645, + "step": 2614 + }, + { + "epoch": 1.2364066193853427, + "grad_norm": 2.774897575378418, + "learning_rate": 4.523641948018101e-06, + "loss": 0.5576, + "step": 2615 + }, + { + "epoch": 1.2368794326241135, + "grad_norm": 2.635887622833252, + "learning_rate": 4.5232755866307496e-06, + "loss": 0.5254, + "step": 2616 + }, + { + "epoch": 1.2373522458628843, + "grad_norm": 2.4860997200012207, + "learning_rate": 4.522909099262899e-06, + "loss": 0.4692, + "step": 2617 + }, + { + "epoch": 1.2378250591016549, + "grad_norm": 2.595513105392456, + "learning_rate": 4.522542485937369e-06, + "loss": 0.5166, + "step": 2618 + }, + { + "epoch": 1.2382978723404254, + "grad_norm": 2.961474895477295, + "learning_rate": 4.522175746676986e-06, + "loss": 0.5455, + "step": 2619 + }, + { + "epoch": 1.2387706855791962, + "grad_norm": 2.813889741897583, + "learning_rate": 4.521808881504588e-06, + "loss": 0.5249, + "step": 2620 + }, + { + "epoch": 1.239243498817967, + "grad_norm": 2.8434813022613525, + "learning_rate": 4.521441890443015e-06, + "loss": 0.472, + "step": 2621 + }, + { + "epoch": 1.2397163120567376, + "grad_norm": 2.4264845848083496, + "learning_rate": 4.521074773515119e-06, + "loss": 0.4783, + "step": 2622 + }, + { + "epoch": 1.2401891252955082, + "grad_norm": 2.615169048309326, + "learning_rate": 4.520707530743761e-06, + "loss": 0.5324, + "step": 2623 + }, + { + "epoch": 1.240661938534279, + "grad_norm": 2.6772537231445312, + "learning_rate": 4.520340162151803e-06, + "loss": 0.5224, + "step": 2624 + }, + { + "epoch": 1.2411347517730495, + "grad_norm": 2.683393955230713, + "learning_rate": 4.519972667762124e-06, + "loss": 0.4863, + "step": 2625 + }, + { + "epoch": 1.2416075650118203, + "grad_norm": 3.0335750579833984, + "learning_rate": 4.519605047597603e-06, + "loss": 0.544, + "step": 2626 + }, + { + "epoch": 1.242080378250591, + "grad_norm": 2.8694353103637695, + "learning_rate": 4.519237301681132e-06, + "loss": 0.5576, + "step": 2627 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 3.217808246612549, + "learning_rate": 4.518869430035609e-06, + "loss": 0.5459, + "step": 2628 + }, + { + "epoch": 1.2430260047281323, + "grad_norm": 2.7700083255767822, + "learning_rate": 4.518501432683937e-06, + "loss": 0.5579, + "step": 2629 + }, + { + "epoch": 1.243498817966903, + "grad_norm": 2.4759175777435303, + "learning_rate": 4.5181333096490335e-06, + "loss": 0.5049, + "step": 2630 + }, + { + "epoch": 1.2439716312056737, + "grad_norm": 2.8652584552764893, + "learning_rate": 4.517765060953818e-06, + "loss": 0.5366, + "step": 2631 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 2.776334524154663, + "learning_rate": 4.517396686621218e-06, + "loss": 0.5677, + "step": 2632 + }, + { + "epoch": 1.244917257683215, + "grad_norm": 2.676708221435547, + "learning_rate": 4.517028186674174e-06, + "loss": 0.5055, + "step": 2633 + }, + { + "epoch": 
1.2453900709219858, + "grad_norm": 2.6851537227630615, + "learning_rate": 4.516659561135629e-06, + "loss": 0.5537, + "step": 2634 + }, + { + "epoch": 1.2458628841607564, + "grad_norm": 2.619971513748169, + "learning_rate": 4.516290810028536e-06, + "loss": 0.5765, + "step": 2635 + }, + { + "epoch": 1.2463356973995272, + "grad_norm": 2.7302334308624268, + "learning_rate": 4.515921933375855e-06, + "loss": 0.5611, + "step": 2636 + }, + { + "epoch": 1.2468085106382978, + "grad_norm": 2.5005829334259033, + "learning_rate": 4.5155529312005554e-06, + "loss": 0.442, + "step": 2637 + }, + { + "epoch": 1.2472813238770686, + "grad_norm": 2.713587522506714, + "learning_rate": 4.515183803525612e-06, + "loss": 0.5023, + "step": 2638 + }, + { + "epoch": 1.2477541371158392, + "grad_norm": 2.5146236419677734, + "learning_rate": 4.514814550374009e-06, + "loss": 0.5195, + "step": 2639 + }, + { + "epoch": 1.24822695035461, + "grad_norm": 2.761060953140259, + "learning_rate": 4.51444517176874e-06, + "loss": 0.5138, + "step": 2640 + }, + { + "epoch": 1.2486997635933805, + "grad_norm": 3.082329273223877, + "learning_rate": 4.5140756677328026e-06, + "loss": 0.6105, + "step": 2641 + }, + { + "epoch": 1.2491725768321513, + "grad_norm": 2.6933493614196777, + "learning_rate": 4.513706038289205e-06, + "loss": 0.5185, + "step": 2642 + }, + { + "epoch": 1.249645390070922, + "grad_norm": 2.515856981277466, + "learning_rate": 4.513336283460962e-06, + "loss": 0.5375, + "step": 2643 + }, + { + "epoch": 1.2501182033096927, + "grad_norm": 2.8553731441497803, + "learning_rate": 4.512966403271096e-06, + "loss": 0.5582, + "step": 2644 + }, + { + "epoch": 1.2505910165484633, + "grad_norm": 2.640880823135376, + "learning_rate": 4.5125963977426405e-06, + "loss": 0.5125, + "step": 2645 + }, + { + "epoch": 1.251063829787234, + "grad_norm": 2.9845943450927734, + "learning_rate": 4.512226266898631e-06, + "loss": 0.4749, + "step": 2646 + }, + { + "epoch": 1.2515366430260046, + "grad_norm": 2.5131032466888428, + "learning_rate": 4.511856010762116e-06, + "loss": 0.4764, + "step": 2647 + }, + { + "epoch": 1.2520094562647754, + "grad_norm": 2.370638370513916, + "learning_rate": 4.511485629356148e-06, + "loss": 0.5153, + "step": 2648 + }, + { + "epoch": 1.252482269503546, + "grad_norm": 2.912461996078491, + "learning_rate": 4.511115122703791e-06, + "loss": 0.6117, + "step": 2649 + }, + { + "epoch": 1.2529550827423168, + "grad_norm": 2.7308082580566406, + "learning_rate": 4.510744490828113e-06, + "loss": 0.5076, + "step": 2650 + }, + { + "epoch": 1.2534278959810874, + "grad_norm": 2.8524296283721924, + "learning_rate": 4.510373733752193e-06, + "loss": 0.542, + "step": 2651 + }, + { + "epoch": 1.2539007092198582, + "grad_norm": 2.799377202987671, + "learning_rate": 4.5100028514991145e-06, + "loss": 0.486, + "step": 2652 + }, + { + "epoch": 1.2543735224586288, + "grad_norm": 2.7248027324676514, + "learning_rate": 4.509631844091973e-06, + "loss": 0.4972, + "step": 2653 + }, + { + "epoch": 1.2548463356973996, + "grad_norm": 2.8041458129882812, + "learning_rate": 4.5092607115538686e-06, + "loss": 0.588, + "step": 2654 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 2.679417133331299, + "learning_rate": 4.50888945390791e-06, + "loss": 0.4639, + "step": 2655 + }, + { + "epoch": 1.255791962174941, + "grad_norm": 3.1049270629882812, + "learning_rate": 4.508518071177214e-06, + "loss": 0.5857, + "step": 2656 + }, + { + "epoch": 1.2562647754137115, + "grad_norm": 2.8590362071990967, + "learning_rate": 4.508146563384904e-06, + "loss": 0.5451, + 
"step": 2657 + }, + { + "epoch": 1.2567375886524823, + "grad_norm": 2.9774081707000732, + "learning_rate": 4.507774930554114e-06, + "loss": 0.5493, + "step": 2658 + }, + { + "epoch": 1.2572104018912529, + "grad_norm": 2.617643356323242, + "learning_rate": 4.507403172707983e-06, + "loss": 0.5472, + "step": 2659 + }, + { + "epoch": 1.2576832151300237, + "grad_norm": 2.9195587635040283, + "learning_rate": 4.507031289869658e-06, + "loss": 0.5403, + "step": 2660 + }, + { + "epoch": 1.2581560283687943, + "grad_norm": 2.706089496612549, + "learning_rate": 4.506659282062295e-06, + "loss": 0.4899, + "step": 2661 + }, + { + "epoch": 1.258628841607565, + "grad_norm": 2.8229358196258545, + "learning_rate": 4.506287149309057e-06, + "loss": 0.5336, + "step": 2662 + }, + { + "epoch": 1.2591016548463356, + "grad_norm": 2.5295674800872803, + "learning_rate": 4.505914891633117e-06, + "loss": 0.4806, + "step": 2663 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 3.098208427429199, + "learning_rate": 4.505542509057651e-06, + "loss": 0.6039, + "step": 2664 + }, + { + "epoch": 1.260047281323877, + "grad_norm": 2.5118041038513184, + "learning_rate": 4.5051700016058475e-06, + "loss": 0.5279, + "step": 2665 + }, + { + "epoch": 1.2605200945626478, + "grad_norm": 2.6901369094848633, + "learning_rate": 4.5047973693009005e-06, + "loss": 0.5515, + "step": 2666 + }, + { + "epoch": 1.2609929078014184, + "grad_norm": 2.5622377395629883, + "learning_rate": 4.504424612166012e-06, + "loss": 0.5405, + "step": 2667 + }, + { + "epoch": 1.2614657210401892, + "grad_norm": 2.685751438140869, + "learning_rate": 4.5040517302243915e-06, + "loss": 0.5797, + "step": 2668 + }, + { + "epoch": 1.2619385342789597, + "grad_norm": 2.8525350093841553, + "learning_rate": 4.503678723499259e-06, + "loss": 0.5561, + "step": 2669 + }, + { + "epoch": 1.2624113475177305, + "grad_norm": 2.803386926651001, + "learning_rate": 4.503305592013836e-06, + "loss": 0.5376, + "step": 2670 + }, + { + "epoch": 1.2628841607565011, + "grad_norm": 2.78633189201355, + "learning_rate": 4.502932335791359e-06, + "loss": 0.4739, + "step": 2671 + }, + { + "epoch": 1.263356973995272, + "grad_norm": 2.8337297439575195, + "learning_rate": 4.502558954855069e-06, + "loss": 0.5406, + "step": 2672 + }, + { + "epoch": 1.2638297872340425, + "grad_norm": 2.610275983810425, + "learning_rate": 4.502185449228213e-06, + "loss": 0.5343, + "step": 2673 + }, + { + "epoch": 1.2643026004728133, + "grad_norm": 2.7842252254486084, + "learning_rate": 4.501811818934048e-06, + "loss": 0.532, + "step": 2674 + }, + { + "epoch": 1.2647754137115839, + "grad_norm": 2.4472389221191406, + "learning_rate": 4.501438063995839e-06, + "loss": 0.4976, + "step": 2675 + }, + { + "epoch": 1.2652482269503547, + "grad_norm": 3.076580762863159, + "learning_rate": 4.501064184436858e-06, + "loss": 0.507, + "step": 2676 + }, + { + "epoch": 1.2657210401891252, + "grad_norm": 2.5952908992767334, + "learning_rate": 4.500690180280384e-06, + "loss": 0.5498, + "step": 2677 + }, + { + "epoch": 1.266193853427896, + "grad_norm": 2.476943016052246, + "learning_rate": 4.500316051549706e-06, + "loss": 0.557, + "step": 2678 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 2.730579376220703, + "learning_rate": 4.499941798268118e-06, + "loss": 0.4975, + "step": 2679 + }, + { + "epoch": 1.2671394799054374, + "grad_norm": 2.7916698455810547, + "learning_rate": 4.499567420458924e-06, + "loss": 0.5673, + "step": 2680 + }, + { + "epoch": 1.267612293144208, + "grad_norm": 2.4249091148376465, + "learning_rate": 
4.4991929181454355e-06, + "loss": 0.4836, + "step": 2681 + }, + { + "epoch": 1.2680851063829788, + "grad_norm": 2.661911725997925, + "learning_rate": 4.498818291350969e-06, + "loss": 0.5332, + "step": 2682 + }, + { + "epoch": 1.2685579196217494, + "grad_norm": 2.693657875061035, + "learning_rate": 4.498443540098852e-06, + "loss": 0.5257, + "step": 2683 + }, + { + "epoch": 1.2690307328605201, + "grad_norm": 2.609386682510376, + "learning_rate": 4.4980686644124195e-06, + "loss": 0.4918, + "step": 2684 + }, + { + "epoch": 1.2695035460992907, + "grad_norm": 3.2104930877685547, + "learning_rate": 4.4976936643150124e-06, + "loss": 0.6097, + "step": 2685 + }, + { + "epoch": 1.2699763593380615, + "grad_norm": 2.707860231399536, + "learning_rate": 4.49731853982998e-06, + "loss": 0.5109, + "step": 2686 + }, + { + "epoch": 1.270449172576832, + "grad_norm": 3.5046379566192627, + "learning_rate": 4.49694329098068e-06, + "loss": 0.5883, + "step": 2687 + }, + { + "epoch": 1.270921985815603, + "grad_norm": 2.5362324714660645, + "learning_rate": 4.496567917790477e-06, + "loss": 0.5301, + "step": 2688 + }, + { + "epoch": 1.2713947990543735, + "grad_norm": 2.7095518112182617, + "learning_rate": 4.496192420282746e-06, + "loss": 0.4772, + "step": 2689 + }, + { + "epoch": 1.2718676122931443, + "grad_norm": 2.416433095932007, + "learning_rate": 4.495816798480865e-06, + "loss": 0.5012, + "step": 2690 + }, + { + "epoch": 1.2723404255319148, + "grad_norm": 2.5362391471862793, + "learning_rate": 4.495441052408224e-06, + "loss": 0.5197, + "step": 2691 + }, + { + "epoch": 1.2728132387706856, + "grad_norm": 2.9093947410583496, + "learning_rate": 4.495065182088218e-06, + "loss": 0.4893, + "step": 2692 + }, + { + "epoch": 1.2732860520094562, + "grad_norm": 2.520470142364502, + "learning_rate": 4.494689187544251e-06, + "loss": 0.5072, + "step": 2693 + }, + { + "epoch": 1.273758865248227, + "grad_norm": 2.4385125637054443, + "learning_rate": 4.494313068799735e-06, + "loss": 0.4923, + "step": 2694 + }, + { + "epoch": 1.2742316784869976, + "grad_norm": 2.636852502822876, + "learning_rate": 4.493936825878089e-06, + "loss": 0.5409, + "step": 2695 + }, + { + "epoch": 1.2747044917257684, + "grad_norm": 2.7027053833007812, + "learning_rate": 4.493560458802741e-06, + "loss": 0.5906, + "step": 2696 + }, + { + "epoch": 1.275177304964539, + "grad_norm": 2.58752179145813, + "learning_rate": 4.493183967597123e-06, + "loss": 0.5292, + "step": 2697 + }, + { + "epoch": 1.2756501182033098, + "grad_norm": 2.7658379077911377, + "learning_rate": 4.49280735228468e-06, + "loss": 0.5613, + "step": 2698 + }, + { + "epoch": 1.2761229314420803, + "grad_norm": 3.272688388824463, + "learning_rate": 4.492430612888861e-06, + "loss": 0.5654, + "step": 2699 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 2.806819438934326, + "learning_rate": 4.492053749433125e-06, + "loss": 0.5388, + "step": 2700 + }, + { + "epoch": 1.2770685579196217, + "grad_norm": 2.879727602005005, + "learning_rate": 4.491676761940936e-06, + "loss": 0.5033, + "step": 2701 + }, + { + "epoch": 1.2775413711583925, + "grad_norm": 2.733347177505493, + "learning_rate": 4.4912996504357695e-06, + "loss": 0.5113, + "step": 2702 + }, + { + "epoch": 1.278014184397163, + "grad_norm": 2.7431252002716064, + "learning_rate": 4.490922414941104e-06, + "loss": 0.5417, + "step": 2703 + }, + { + "epoch": 1.2784869976359339, + "grad_norm": 2.9287240505218506, + "learning_rate": 4.490545055480431e-06, + "loss": 0.5875, + "step": 2704 + }, + { + "epoch": 1.2789598108747045, + "grad_norm": 
2.576775550842285, + "learning_rate": 4.490167572077244e-06, + "loss": 0.5176, + "step": 2705 + }, + { + "epoch": 1.2794326241134752, + "grad_norm": 2.4335594177246094, + "learning_rate": 4.4897899647550505e-06, + "loss": 0.4749, + "step": 2706 + }, + { + "epoch": 1.2799054373522458, + "grad_norm": 2.6798062324523926, + "learning_rate": 4.489412233537361e-06, + "loss": 0.5439, + "step": 2707 + }, + { + "epoch": 1.2803782505910166, + "grad_norm": 2.8440675735473633, + "learning_rate": 4.489034378447693e-06, + "loss": 0.552, + "step": 2708 + }, + { + "epoch": 1.2808510638297872, + "grad_norm": 2.9059503078460693, + "learning_rate": 4.488656399509577e-06, + "loss": 0.5667, + "step": 2709 + }, + { + "epoch": 1.281323877068558, + "grad_norm": 2.7415006160736084, + "learning_rate": 4.488278296746548e-06, + "loss": 0.5676, + "step": 2710 + }, + { + "epoch": 1.2817966903073286, + "grad_norm": 2.4584875106811523, + "learning_rate": 4.487900070182147e-06, + "loss": 0.4787, + "step": 2711 + }, + { + "epoch": 1.2822695035460994, + "grad_norm": 2.990940809249878, + "learning_rate": 4.487521719839924e-06, + "loss": 0.5239, + "step": 2712 + }, + { + "epoch": 1.28274231678487, + "grad_norm": 3.075201988220215, + "learning_rate": 4.487143245743441e-06, + "loss": 0.5103, + "step": 2713 + }, + { + "epoch": 1.2832151300236407, + "grad_norm": 2.543341875076294, + "learning_rate": 4.486764647916259e-06, + "loss": 0.5475, + "step": 2714 + }, + { + "epoch": 1.2836879432624113, + "grad_norm": 2.9927213191986084, + "learning_rate": 4.486385926381957e-06, + "loss": 0.4923, + "step": 2715 + }, + { + "epoch": 1.284160756501182, + "grad_norm": 2.4220657348632812, + "learning_rate": 4.486007081164111e-06, + "loss": 0.543, + "step": 2716 + }, + { + "epoch": 1.2846335697399527, + "grad_norm": 2.468214988708496, + "learning_rate": 4.4856281122863134e-06, + "loss": 0.5248, + "step": 2717 + }, + { + "epoch": 1.2851063829787235, + "grad_norm": 2.633711099624634, + "learning_rate": 4.48524901977216e-06, + "loss": 0.4764, + "step": 2718 + }, + { + "epoch": 1.285579196217494, + "grad_norm": 2.8399546146392822, + "learning_rate": 4.484869803645254e-06, + "loss": 0.5503, + "step": 2719 + }, + { + "epoch": 1.2860520094562649, + "grad_norm": 2.769063949584961, + "learning_rate": 4.484490463929209e-06, + "loss": 0.5468, + "step": 2720 + }, + { + "epoch": 1.2865248226950354, + "grad_norm": 2.617863893508911, + "learning_rate": 4.4841110006476465e-06, + "loss": 0.5906, + "step": 2721 + }, + { + "epoch": 1.2869976359338062, + "grad_norm": 2.7639541625976562, + "learning_rate": 4.4837314138241905e-06, + "loss": 0.552, + "step": 2722 + }, + { + "epoch": 1.2874704491725768, + "grad_norm": 2.7711129188537598, + "learning_rate": 4.483351703482478e-06, + "loss": 0.5229, + "step": 2723 + }, + { + "epoch": 1.2879432624113476, + "grad_norm": 2.611205577850342, + "learning_rate": 4.482971869646152e-06, + "loss": 0.5055, + "step": 2724 + }, + { + "epoch": 1.2884160756501182, + "grad_norm": 2.8602211475372314, + "learning_rate": 4.482591912338862e-06, + "loss": 0.5561, + "step": 2725 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 2.5882298946380615, + "learning_rate": 4.4822118315842675e-06, + "loss": 0.5555, + "step": 2726 + }, + { + "epoch": 1.2893617021276595, + "grad_norm": 2.7533531188964844, + "learning_rate": 4.481831627406033e-06, + "loss": 0.5346, + "step": 2727 + }, + { + "epoch": 1.2898345153664303, + "grad_norm": 2.4296958446502686, + "learning_rate": 4.481451299827835e-06, + "loss": 0.4915, + "step": 2728 + }, + { + "epoch": 
1.290307328605201, + "grad_norm": 2.4403445720672607, + "learning_rate": 4.481070848873352e-06, + "loss": 0.5648, + "step": 2729 + }, + { + "epoch": 1.2907801418439715, + "grad_norm": 2.473224401473999, + "learning_rate": 4.480690274566274e-06, + "loss": 0.4849, + "step": 2730 + }, + { + "epoch": 1.2912529550827423, + "grad_norm": 2.637899875640869, + "learning_rate": 4.480309576930297e-06, + "loss": 0.4968, + "step": 2731 + }, + { + "epoch": 1.291725768321513, + "grad_norm": 2.7156927585601807, + "learning_rate": 4.479928755989127e-06, + "loss": 0.4759, + "step": 2732 + }, + { + "epoch": 1.2921985815602837, + "grad_norm": 2.632786989212036, + "learning_rate": 4.479547811766475e-06, + "loss": 0.5468, + "step": 2733 + }, + { + "epoch": 1.2926713947990542, + "grad_norm": 2.529218912124634, + "learning_rate": 4.479166744286061e-06, + "loss": 0.4852, + "step": 2734 + }, + { + "epoch": 1.293144208037825, + "grad_norm": 2.561978340148926, + "learning_rate": 4.4787855535716115e-06, + "loss": 0.546, + "step": 2735 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 2.3684909343719482, + "learning_rate": 4.478404239646862e-06, + "loss": 0.5369, + "step": 2736 + }, + { + "epoch": 1.2940898345153664, + "grad_norm": 2.8940367698669434, + "learning_rate": 4.4780228025355566e-06, + "loss": 0.568, + "step": 2737 + }, + { + "epoch": 1.294562647754137, + "grad_norm": 2.6950316429138184, + "learning_rate": 4.477641242261445e-06, + "loss": 0.4576, + "step": 2738 + }, + { + "epoch": 1.2950354609929078, + "grad_norm": 2.4211716651916504, + "learning_rate": 4.4772595588482835e-06, + "loss": 0.4341, + "step": 2739 + }, + { + "epoch": 1.2955082742316786, + "grad_norm": 3.141097068786621, + "learning_rate": 4.47687775231984e-06, + "loss": 0.5944, + "step": 2740 + }, + { + "epoch": 1.2959810874704492, + "grad_norm": 3.077522039413452, + "learning_rate": 4.476495822699887e-06, + "loss": 0.5786, + "step": 2741 + }, + { + "epoch": 1.2964539007092197, + "grad_norm": 2.708139419555664, + "learning_rate": 4.476113770012206e-06, + "loss": 0.5014, + "step": 2742 + }, + { + "epoch": 1.2969267139479905, + "grad_norm": 2.7572035789489746, + "learning_rate": 4.475731594280586e-06, + "loss": 0.594, + "step": 2743 + }, + { + "epoch": 1.2973995271867613, + "grad_norm": 2.673126459121704, + "learning_rate": 4.475349295528822e-06, + "loss": 0.5317, + "step": 2744 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 2.6757819652557373, + "learning_rate": 4.4749668737807195e-06, + "loss": 0.5614, + "step": 2745 + }, + { + "epoch": 1.2983451536643025, + "grad_norm": 2.7077620029449463, + "learning_rate": 4.47458432906009e-06, + "loss": 0.4916, + "step": 2746 + }, + { + "epoch": 1.2988179669030733, + "grad_norm": 2.446570873260498, + "learning_rate": 4.474201661390752e-06, + "loss": 0.5005, + "step": 2747 + }, + { + "epoch": 1.299290780141844, + "grad_norm": 2.642695665359497, + "learning_rate": 4.473818870796533e-06, + "loss": 0.5048, + "step": 2748 + }, + { + "epoch": 1.2997635933806146, + "grad_norm": 2.519824743270874, + "learning_rate": 4.4734359573012686e-06, + "loss": 0.5131, + "step": 2749 + }, + { + "epoch": 1.3002364066193852, + "grad_norm": 2.5901925563812256, + "learning_rate": 4.4730529209287995e-06, + "loss": 0.4582, + "step": 2750 + }, + { + "epoch": 1.300709219858156, + "grad_norm": 2.6789121627807617, + "learning_rate": 4.472669761702978e-06, + "loss": 0.5685, + "step": 2751 + }, + { + "epoch": 1.3011820330969268, + "grad_norm": 2.408003807067871, + "learning_rate": 4.472286479647659e-06, + "loss": 0.4329, + 
"step": 2752 + }, + { + "epoch": 1.3016548463356974, + "grad_norm": 2.681403398513794, + "learning_rate": 4.47190307478671e-06, + "loss": 0.4853, + "step": 2753 + }, + { + "epoch": 1.302127659574468, + "grad_norm": 2.9923183917999268, + "learning_rate": 4.4715195471440025e-06, + "loss": 0.5184, + "step": 2754 + }, + { + "epoch": 1.3026004728132388, + "grad_norm": 2.5100321769714355, + "learning_rate": 4.471135896743418e-06, + "loss": 0.5148, + "step": 2755 + }, + { + "epoch": 1.3030732860520096, + "grad_norm": 2.267881393432617, + "learning_rate": 4.4707521236088444e-06, + "loss": 0.5028, + "step": 2756 + }, + { + "epoch": 1.3035460992907801, + "grad_norm": 2.7779829502105713, + "learning_rate": 4.4703682277641775e-06, + "loss": 0.5724, + "step": 2757 + }, + { + "epoch": 1.3040189125295507, + "grad_norm": 2.4262194633483887, + "learning_rate": 4.4699842092333205e-06, + "loss": 0.5341, + "step": 2758 + }, + { + "epoch": 1.3044917257683215, + "grad_norm": 2.8682050704956055, + "learning_rate": 4.469600068040185e-06, + "loss": 0.6114, + "step": 2759 + }, + { + "epoch": 1.3049645390070923, + "grad_norm": 2.647853374481201, + "learning_rate": 4.46921580420869e-06, + "loss": 0.5107, + "step": 2760 + }, + { + "epoch": 1.3054373522458629, + "grad_norm": 2.561998128890991, + "learning_rate": 4.468831417762762e-06, + "loss": 0.6019, + "step": 2761 + }, + { + "epoch": 1.3059101654846335, + "grad_norm": 2.763425350189209, + "learning_rate": 4.468446908726334e-06, + "loss": 0.572, + "step": 2762 + }, + { + "epoch": 1.3063829787234043, + "grad_norm": 2.7052934169769287, + "learning_rate": 4.468062277123348e-06, + "loss": 0.4876, + "step": 2763 + }, + { + "epoch": 1.306855791962175, + "grad_norm": 2.997845411300659, + "learning_rate": 4.467677522977755e-06, + "loss": 0.5683, + "step": 2764 + }, + { + "epoch": 1.3073286052009456, + "grad_norm": 2.503129005432129, + "learning_rate": 4.46729264631351e-06, + "loss": 0.4951, + "step": 2765 + }, + { + "epoch": 1.3078014184397162, + "grad_norm": 2.617492437362671, + "learning_rate": 4.466907647154578e-06, + "loss": 0.5054, + "step": 2766 + }, + { + "epoch": 1.308274231678487, + "grad_norm": 2.934967279434204, + "learning_rate": 4.4665225255249315e-06, + "loss": 0.5299, + "step": 2767 + }, + { + "epoch": 1.3087470449172578, + "grad_norm": 2.787252187728882, + "learning_rate": 4.46613728144855e-06, + "loss": 0.4652, + "step": 2768 + }, + { + "epoch": 1.3092198581560284, + "grad_norm": 2.567439556121826, + "learning_rate": 4.465751914949422e-06, + "loss": 0.538, + "step": 2769 + }, + { + "epoch": 1.309692671394799, + "grad_norm": 2.6386024951934814, + "learning_rate": 4.4653664260515416e-06, + "loss": 0.464, + "step": 2770 + }, + { + "epoch": 1.3101654846335697, + "grad_norm": 2.966848134994507, + "learning_rate": 4.464980814778912e-06, + "loss": 0.4889, + "step": 2771 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 2.571256637573242, + "learning_rate": 4.464595081155542e-06, + "loss": 0.4979, + "step": 2772 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 2.774203062057495, + "learning_rate": 4.4642092252054515e-06, + "loss": 0.5366, + "step": 2773 + }, + { + "epoch": 1.3115839243498817, + "grad_norm": 2.682969331741333, + "learning_rate": 4.463823246952666e-06, + "loss": 0.5118, + "step": 2774 + }, + { + "epoch": 1.3120567375886525, + "grad_norm": 2.4873905181884766, + "learning_rate": 4.463437146421217e-06, + "loss": 0.5548, + "step": 2775 + }, + { + "epoch": 1.3125295508274233, + "grad_norm": 2.6769661903381348, + "learning_rate": 
4.463050923635147e-06, + "loss": 0.5023, + "step": 2776 + }, + { + "epoch": 1.3130023640661939, + "grad_norm": 2.7190892696380615, + "learning_rate": 4.462664578618503e-06, + "loss": 0.5546, + "step": 2777 + }, + { + "epoch": 1.3134751773049644, + "grad_norm": 2.8193624019622803, + "learning_rate": 4.462278111395343e-06, + "loss": 0.5265, + "step": 2778 + }, + { + "epoch": 1.3139479905437352, + "grad_norm": 2.7324538230895996, + "learning_rate": 4.461891521989728e-06, + "loss": 0.5449, + "step": 2779 + }, + { + "epoch": 1.314420803782506, + "grad_norm": 2.87320876121521, + "learning_rate": 4.4615048104257305e-06, + "loss": 0.5367, + "step": 2780 + }, + { + "epoch": 1.3148936170212766, + "grad_norm": 2.6777031421661377, + "learning_rate": 4.4611179767274306e-06, + "loss": 0.5026, + "step": 2781 + }, + { + "epoch": 1.3153664302600472, + "grad_norm": 3.714524269104004, + "learning_rate": 4.460731020918913e-06, + "loss": 0.569, + "step": 2782 + }, + { + "epoch": 1.315839243498818, + "grad_norm": 2.7493600845336914, + "learning_rate": 4.460343943024273e-06, + "loss": 0.5826, + "step": 2783 + }, + { + "epoch": 1.3163120567375888, + "grad_norm": 2.6544079780578613, + "learning_rate": 4.459956743067609e-06, + "loss": 0.5399, + "step": 2784 + }, + { + "epoch": 1.3167848699763594, + "grad_norm": 2.4338037967681885, + "learning_rate": 4.459569421073036e-06, + "loss": 0.5186, + "step": 2785 + }, + { + "epoch": 1.31725768321513, + "grad_norm": 2.9312374591827393, + "learning_rate": 4.459181977064665e-06, + "loss": 0.5571, + "step": 2786 + }, + { + "epoch": 1.3177304964539007, + "grad_norm": 2.5988922119140625, + "learning_rate": 4.458794411066624e-06, + "loss": 0.5926, + "step": 2787 + }, + { + "epoch": 1.3182033096926715, + "grad_norm": 2.5193772315979004, + "learning_rate": 4.458406723103044e-06, + "loss": 0.5243, + "step": 2788 + }, + { + "epoch": 1.318676122931442, + "grad_norm": 2.8653743267059326, + "learning_rate": 4.458018913198066e-06, + "loss": 0.5421, + "step": 2789 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 2.486245632171631, + "learning_rate": 4.457630981375834e-06, + "loss": 0.4862, + "step": 2790 + }, + { + "epoch": 1.3196217494089835, + "grad_norm": 3.155435800552368, + "learning_rate": 4.457242927660506e-06, + "loss": 0.5386, + "step": 2791 + }, + { + "epoch": 1.3200945626477543, + "grad_norm": 3.102023124694824, + "learning_rate": 4.456854752076242e-06, + "loss": 0.5527, + "step": 2792 + }, + { + "epoch": 1.3205673758865248, + "grad_norm": 2.7995986938476562, + "learning_rate": 4.456466454647215e-06, + "loss": 0.4364, + "step": 2793 + }, + { + "epoch": 1.3210401891252954, + "grad_norm": 2.8328311443328857, + "learning_rate": 4.456078035397599e-06, + "loss": 0.5516, + "step": 2794 + }, + { + "epoch": 1.3215130023640662, + "grad_norm": 2.606161594390869, + "learning_rate": 4.455689494351581e-06, + "loss": 0.5042, + "step": 2795 + }, + { + "epoch": 1.321985815602837, + "grad_norm": 2.6344757080078125, + "learning_rate": 4.455300831533354e-06, + "loss": 0.4807, + "step": 2796 + }, + { + "epoch": 1.3224586288416076, + "grad_norm": 2.8539786338806152, + "learning_rate": 4.454912046967118e-06, + "loss": 0.4694, + "step": 2797 + }, + { + "epoch": 1.3229314420803782, + "grad_norm": 2.849066734313965, + "learning_rate": 4.454523140677081e-06, + "loss": 0.5037, + "step": 2798 + }, + { + "epoch": 1.323404255319149, + "grad_norm": 2.6803371906280518, + "learning_rate": 4.454134112687458e-06, + "loss": 0.4959, + "step": 2799 + }, + { + "epoch": 1.3238770685579198, + "grad_norm": 
3.0546066761016846, + "learning_rate": 4.453744963022473e-06, + "loss": 0.5935, + "step": 2800 + }, + { + "epoch": 1.3243498817966903, + "grad_norm": 2.625602960586548, + "learning_rate": 4.453355691706356e-06, + "loss": 0.5349, + "step": 2801 + }, + { + "epoch": 1.324822695035461, + "grad_norm": 2.7568554878234863, + "learning_rate": 4.452966298763345e-06, + "loss": 0.5012, + "step": 2802 + }, + { + "epoch": 1.3252955082742317, + "grad_norm": 2.940427303314209, + "learning_rate": 4.452576784217686e-06, + "loss": 0.5246, + "step": 2803 + }, + { + "epoch": 1.3257683215130025, + "grad_norm": 2.5485289096832275, + "learning_rate": 4.452187148093633e-06, + "loss": 0.5282, + "step": 2804 + }, + { + "epoch": 1.326241134751773, + "grad_norm": 2.8152987957000732, + "learning_rate": 4.4517973904154455e-06, + "loss": 0.5468, + "step": 2805 + }, + { + "epoch": 1.3267139479905437, + "grad_norm": 2.9399688243865967, + "learning_rate": 4.451407511207393e-06, + "loss": 0.5586, + "step": 2806 + }, + { + "epoch": 1.3271867612293144, + "grad_norm": 2.3870036602020264, + "learning_rate": 4.451017510493751e-06, + "loss": 0.4807, + "step": 2807 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 3.4667887687683105, + "learning_rate": 4.450627388298805e-06, + "loss": 0.5571, + "step": 2808 + }, + { + "epoch": 1.3281323877068558, + "grad_norm": 2.685986042022705, + "learning_rate": 4.450237144646844e-06, + "loss": 0.5525, + "step": 2809 + }, + { + "epoch": 1.3286052009456264, + "grad_norm": 2.8529131412506104, + "learning_rate": 4.449846779562168e-06, + "loss": 0.491, + "step": 2810 + }, + { + "epoch": 1.3290780141843972, + "grad_norm": 2.7360332012176514, + "learning_rate": 4.449456293069082e-06, + "loss": 0.5574, + "step": 2811 + }, + { + "epoch": 1.3295508274231678, + "grad_norm": 2.4656026363372803, + "learning_rate": 4.4490656851919015e-06, + "loss": 0.4678, + "step": 2812 + }, + { + "epoch": 1.3300236406619386, + "grad_norm": 2.602651357650757, + "learning_rate": 4.448674955954947e-06, + "loss": 0.5118, + "step": 2813 + }, + { + "epoch": 1.3304964539007091, + "grad_norm": 3.0129756927490234, + "learning_rate": 4.448284105382548e-06, + "loss": 0.6136, + "step": 2814 + }, + { + "epoch": 1.33096926713948, + "grad_norm": 2.8499927520751953, + "learning_rate": 4.447893133499039e-06, + "loss": 0.5286, + "step": 2815 + }, + { + "epoch": 1.3314420803782505, + "grad_norm": 2.8320744037628174, + "learning_rate": 4.447502040328767e-06, + "loss": 0.5186, + "step": 2816 + }, + { + "epoch": 1.3319148936170213, + "grad_norm": 2.499950885772705, + "learning_rate": 4.447110825896084e-06, + "loss": 0.5338, + "step": 2817 + }, + { + "epoch": 1.3323877068557919, + "grad_norm": 2.530895233154297, + "learning_rate": 4.446719490225346e-06, + "loss": 0.5151, + "step": 2818 + }, + { + "epoch": 1.3328605200945627, + "grad_norm": 2.5276098251342773, + "learning_rate": 4.446328033340921e-06, + "loss": 0.5424, + "step": 2819 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.90218186378479, + "learning_rate": 4.4459364552671845e-06, + "loss": 0.5747, + "step": 2820 + }, + { + "epoch": 1.333806146572104, + "grad_norm": 2.500943183898926, + "learning_rate": 4.445544756028518e-06, + "loss": 0.5459, + "step": 2821 + }, + { + "epoch": 1.3342789598108746, + "grad_norm": 2.960374355316162, + "learning_rate": 4.44515293564931e-06, + "loss": 0.6092, + "step": 2822 + }, + { + "epoch": 1.3347517730496454, + "grad_norm": 2.813671827316284, + "learning_rate": 4.444760994153958e-06, + "loss": 0.5536, + "step": 2823 + }, + { + "epoch": 
1.335224586288416, + "grad_norm": 2.7147483825683594, + "learning_rate": 4.444368931566867e-06, + "loss": 0.5291, + "step": 2824 + }, + { + "epoch": 1.3356973995271868, + "grad_norm": 2.710101842880249, + "learning_rate": 4.443976747912447e-06, + "loss": 0.5138, + "step": 2825 + }, + { + "epoch": 1.3361702127659574, + "grad_norm": 2.711419105529785, + "learning_rate": 4.443584443215121e-06, + "loss": 0.5223, + "step": 2826 + }, + { + "epoch": 1.3366430260047282, + "grad_norm": 2.887472152709961, + "learning_rate": 4.443192017499313e-06, + "loss": 0.5464, + "step": 2827 + }, + { + "epoch": 1.3371158392434987, + "grad_norm": 2.8867223262786865, + "learning_rate": 4.4427994707894585e-06, + "loss": 0.5748, + "step": 2828 + }, + { + "epoch": 1.3375886524822695, + "grad_norm": 2.407247543334961, + "learning_rate": 4.44240680311e-06, + "loss": 0.4727, + "step": 2829 + }, + { + "epoch": 1.3380614657210401, + "grad_norm": 2.578420877456665, + "learning_rate": 4.4420140144853865e-06, + "loss": 0.5129, + "step": 2830 + }, + { + "epoch": 1.338534278959811, + "grad_norm": 2.884373426437378, + "learning_rate": 4.441621104940077e-06, + "loss": 0.5366, + "step": 2831 + }, + { + "epoch": 1.3390070921985815, + "grad_norm": 2.8652374744415283, + "learning_rate": 4.441228074498534e-06, + "loss": 0.5045, + "step": 2832 + }, + { + "epoch": 1.3394799054373523, + "grad_norm": 2.5380210876464844, + "learning_rate": 4.440834923185231e-06, + "loss": 0.509, + "step": 2833 + }, + { + "epoch": 1.3399527186761229, + "grad_norm": 2.415734052658081, + "learning_rate": 4.440441651024648e-06, + "loss": 0.5066, + "step": 2834 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 2.503051996231079, + "learning_rate": 4.440048258041272e-06, + "loss": 0.5118, + "step": 2835 + }, + { + "epoch": 1.3408983451536642, + "grad_norm": 3.351001024246216, + "learning_rate": 4.439654744259598e-06, + "loss": 0.5758, + "step": 2836 + }, + { + "epoch": 1.341371158392435, + "grad_norm": 2.7368781566619873, + "learning_rate": 4.439261109704129e-06, + "loss": 0.5674, + "step": 2837 + }, + { + "epoch": 1.3418439716312056, + "grad_norm": 3.008199453353882, + "learning_rate": 4.438867354399372e-06, + "loss": 0.5891, + "step": 2838 + }, + { + "epoch": 1.3423167848699764, + "grad_norm": 2.538907766342163, + "learning_rate": 4.438473478369847e-06, + "loss": 0.5102, + "step": 2839 + }, + { + "epoch": 1.342789598108747, + "grad_norm": 2.7169063091278076, + "learning_rate": 4.438079481640079e-06, + "loss": 0.6131, + "step": 2840 + }, + { + "epoch": 1.3432624113475178, + "grad_norm": 2.7411608695983887, + "learning_rate": 4.437685364234601e-06, + "loss": 0.5337, + "step": 2841 + }, + { + "epoch": 1.3437352245862884, + "grad_norm": 3.2374939918518066, + "learning_rate": 4.43729112617795e-06, + "loss": 0.5401, + "step": 2842 + }, + { + "epoch": 1.3442080378250592, + "grad_norm": 2.4712226390838623, + "learning_rate": 4.436896767494676e-06, + "loss": 0.5365, + "step": 2843 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 2.661619186401367, + "learning_rate": 4.436502288209334e-06, + "loss": 0.4919, + "step": 2844 + }, + { + "epoch": 1.3451536643026005, + "grad_norm": 2.5943779945373535, + "learning_rate": 4.4361076883464845e-06, + "loss": 0.5253, + "step": 2845 + }, + { + "epoch": 1.345626477541371, + "grad_norm": 2.672297477722168, + "learning_rate": 4.4357129679307e-06, + "loss": 0.541, + "step": 2846 + }, + { + "epoch": 1.346099290780142, + "grad_norm": 2.6830925941467285, + "learning_rate": 4.435318126986557e-06, + "loss": 0.5641, + "step": 2847 
+ }, + { + "epoch": 1.3465721040189125, + "grad_norm": 2.7394626140594482, + "learning_rate": 4.434923165538639e-06, + "loss": 0.5591, + "step": 2848 + }, + { + "epoch": 1.3470449172576833, + "grad_norm": 2.9656317234039307, + "learning_rate": 4.434528083611541e-06, + "loss": 0.515, + "step": 2849 + }, + { + "epoch": 1.3475177304964538, + "grad_norm": 3.30155086517334, + "learning_rate": 4.434132881229861e-06, + "loss": 0.5871, + "step": 2850 + }, + { + "epoch": 1.3479905437352246, + "grad_norm": 2.6222476959228516, + "learning_rate": 4.433737558418209e-06, + "loss": 0.5143, + "step": 2851 + }, + { + "epoch": 1.3484633569739952, + "grad_norm": 2.903158187866211, + "learning_rate": 4.4333421152011965e-06, + "loss": 0.4484, + "step": 2852 + }, + { + "epoch": 1.348936170212766, + "grad_norm": 2.863116979598999, + "learning_rate": 4.432946551603449e-06, + "loss": 0.5213, + "step": 2853 + }, + { + "epoch": 1.3494089834515366, + "grad_norm": 2.8253962993621826, + "learning_rate": 4.432550867649596e-06, + "loss": 0.5713, + "step": 2854 + }, + { + "epoch": 1.3498817966903074, + "grad_norm": 2.652493953704834, + "learning_rate": 4.432155063364273e-06, + "loss": 0.5559, + "step": 2855 + }, + { + "epoch": 1.350354609929078, + "grad_norm": 2.4289376735687256, + "learning_rate": 4.431759138772127e-06, + "loss": 0.5122, + "step": 2856 + }, + { + "epoch": 1.3508274231678488, + "grad_norm": 2.6329853534698486, + "learning_rate": 4.43136309389781e-06, + "loss": 0.5332, + "step": 2857 + }, + { + "epoch": 1.3513002364066193, + "grad_norm": 2.431103229522705, + "learning_rate": 4.430966928765982e-06, + "loss": 0.4863, + "step": 2858 + }, + { + "epoch": 1.3517730496453901, + "grad_norm": 2.7529025077819824, + "learning_rate": 4.4305706434013106e-06, + "loss": 0.5263, + "step": 2859 + }, + { + "epoch": 1.3522458628841607, + "grad_norm": 2.884605646133423, + "learning_rate": 4.43017423782847e-06, + "loss": 0.564, + "step": 2860 + }, + { + "epoch": 1.3527186761229315, + "grad_norm": 3.027771234512329, + "learning_rate": 4.4297777120721435e-06, + "loss": 0.5846, + "step": 2861 + }, + { + "epoch": 1.353191489361702, + "grad_norm": 3.0140626430511475, + "learning_rate": 4.4293810661570205e-06, + "loss": 0.6621, + "step": 2862 + }, + { + "epoch": 1.3536643026004729, + "grad_norm": 2.721799612045288, + "learning_rate": 4.428984300107799e-06, + "loss": 0.5566, + "step": 2863 + }, + { + "epoch": 1.3541371158392435, + "grad_norm": 3.0016496181488037, + "learning_rate": 4.428587413949183e-06, + "loss": 0.5525, + "step": 2864 + }, + { + "epoch": 1.3546099290780143, + "grad_norm": 2.77138614654541, + "learning_rate": 4.428190407705886e-06, + "loss": 0.6016, + "step": 2865 + }, + { + "epoch": 1.3550827423167848, + "grad_norm": 2.9783477783203125, + "learning_rate": 4.427793281402627e-06, + "loss": 0.5556, + "step": 2866 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 2.2490382194519043, + "learning_rate": 4.427396035064132e-06, + "loss": 0.5138, + "step": 2867 + }, + { + "epoch": 1.3560283687943262, + "grad_norm": 2.442225217819214, + "learning_rate": 4.426998668715139e-06, + "loss": 0.4843, + "step": 2868 + }, + { + "epoch": 1.356501182033097, + "grad_norm": 2.74040150642395, + "learning_rate": 4.426601182380388e-06, + "loss": 0.54, + "step": 2869 + }, + { + "epoch": 1.3569739952718676, + "grad_norm": 2.4434332847595215, + "learning_rate": 4.426203576084629e-06, + "loss": 0.5199, + "step": 2870 + }, + { + "epoch": 1.3574468085106384, + "grad_norm": 2.6380388736724854, + "learning_rate": 4.42580584985262e-06, + 
"loss": 0.5049, + "step": 2871 + }, + { + "epoch": 1.357919621749409, + "grad_norm": 2.7324254512786865, + "learning_rate": 4.425408003709125e-06, + "loss": 0.5036, + "step": 2872 + }, + { + "epoch": 1.3583924349881797, + "grad_norm": 2.661012649536133, + "learning_rate": 4.425010037678916e-06, + "loss": 0.4965, + "step": 2873 + }, + { + "epoch": 1.3588652482269503, + "grad_norm": 2.5380208492279053, + "learning_rate": 4.424611951786773e-06, + "loss": 0.4293, + "step": 2874 + }, + { + "epoch": 1.3593380614657211, + "grad_norm": 2.6060714721679688, + "learning_rate": 4.424213746057483e-06, + "loss": 0.5335, + "step": 2875 + }, + { + "epoch": 1.3598108747044917, + "grad_norm": 2.98282527923584, + "learning_rate": 4.423815420515841e-06, + "loss": 0.5626, + "step": 2876 + }, + { + "epoch": 1.3602836879432625, + "grad_norm": 2.779371500015259, + "learning_rate": 4.423416975186647e-06, + "loss": 0.5353, + "step": 2877 + }, + { + "epoch": 1.360756501182033, + "grad_norm": 2.8033530712127686, + "learning_rate": 4.423018410094713e-06, + "loss": 0.538, + "step": 2878 + }, + { + "epoch": 1.3612293144208039, + "grad_norm": 3.225177764892578, + "learning_rate": 4.422619725264855e-06, + "loss": 0.5441, + "step": 2879 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 2.959135055541992, + "learning_rate": 4.422220920721896e-06, + "loss": 0.5293, + "step": 2880 + }, + { + "epoch": 1.3621749408983452, + "grad_norm": 2.5558884143829346, + "learning_rate": 4.4218219964906704e-06, + "loss": 0.442, + "step": 2881 + }, + { + "epoch": 1.3626477541371158, + "grad_norm": 2.694899797439575, + "learning_rate": 4.421422952596015e-06, + "loss": 0.5318, + "step": 2882 + }, + { + "epoch": 1.3631205673758866, + "grad_norm": 2.7909531593322754, + "learning_rate": 4.421023789062777e-06, + "loss": 0.6648, + "step": 2883 + }, + { + "epoch": 1.3635933806146572, + "grad_norm": 2.421995162963867, + "learning_rate": 4.420624505915813e-06, + "loss": 0.4644, + "step": 2884 + }, + { + "epoch": 1.364066193853428, + "grad_norm": 2.5876688957214355, + "learning_rate": 4.420225103179981e-06, + "loss": 0.5743, + "step": 2885 + }, + { + "epoch": 1.3645390070921986, + "grad_norm": 2.89341139793396, + "learning_rate": 4.419825580880152e-06, + "loss": 0.5454, + "step": 2886 + }, + { + "epoch": 1.3650118203309693, + "grad_norm": 2.534708261489868, + "learning_rate": 4.419425939041203e-06, + "loss": 0.5572, + "step": 2887 + }, + { + "epoch": 1.36548463356974, + "grad_norm": 2.6052141189575195, + "learning_rate": 4.419026177688017e-06, + "loss": 0.4763, + "step": 2888 + }, + { + "epoch": 1.3659574468085105, + "grad_norm": 2.723720073699951, + "learning_rate": 4.4186262968454854e-06, + "loss": 0.5659, + "step": 2889 + }, + { + "epoch": 1.3664302600472813, + "grad_norm": 2.8909599781036377, + "learning_rate": 4.418226296538507e-06, + "loss": 0.4996, + "step": 2890 + }, + { + "epoch": 1.366903073286052, + "grad_norm": 2.551375389099121, + "learning_rate": 4.417826176791988e-06, + "loss": 0.5259, + "step": 2891 + }, + { + "epoch": 1.3673758865248227, + "grad_norm": 3.360267162322998, + "learning_rate": 4.417425937630843e-06, + "loss": 0.5381, + "step": 2892 + }, + { + "epoch": 1.3678486997635932, + "grad_norm": 2.7611942291259766, + "learning_rate": 4.417025579079992e-06, + "loss": 0.6022, + "step": 2893 + }, + { + "epoch": 1.368321513002364, + "grad_norm": 2.5931224822998047, + "learning_rate": 4.416625101164365e-06, + "loss": 0.5102, + "step": 2894 + }, + { + "epoch": 1.3687943262411348, + "grad_norm": 2.5888102054595947, + "learning_rate": 
4.416224503908897e-06, + "loss": 0.4955, + "step": 2895 + }, + { + "epoch": 1.3692671394799054, + "grad_norm": 2.6262896060943604, + "learning_rate": 4.41582378733853e-06, + "loss": 0.5101, + "step": 2896 + }, + { + "epoch": 1.369739952718676, + "grad_norm": 3.339170217514038, + "learning_rate": 4.415422951478218e-06, + "loss": 0.4939, + "step": 2897 + }, + { + "epoch": 1.3702127659574468, + "grad_norm": 2.940866708755493, + "learning_rate": 4.415021996352917e-06, + "loss": 0.5157, + "step": 2898 + }, + { + "epoch": 1.3706855791962176, + "grad_norm": 2.7423818111419678, + "learning_rate": 4.414620921987594e-06, + "loss": 0.5308, + "step": 2899 + }, + { + "epoch": 1.3711583924349882, + "grad_norm": 2.7177040576934814, + "learning_rate": 4.414219728407221e-06, + "loss": 0.5429, + "step": 2900 + }, + { + "epoch": 1.3716312056737587, + "grad_norm": 2.560774087905884, + "learning_rate": 4.4138184156367794e-06, + "loss": 0.5266, + "step": 2901 + }, + { + "epoch": 1.3721040189125295, + "grad_norm": 2.5649116039276123, + "learning_rate": 4.413416983701256e-06, + "loss": 0.4718, + "step": 2902 + }, + { + "epoch": 1.3725768321513003, + "grad_norm": 2.8547167778015137, + "learning_rate": 4.413015432625648e-06, + "loss": 0.5129, + "step": 2903 + }, + { + "epoch": 1.373049645390071, + "grad_norm": 2.5413618087768555, + "learning_rate": 4.412613762434958e-06, + "loss": 0.5738, + "step": 2904 + }, + { + "epoch": 1.3735224586288415, + "grad_norm": 3.3252241611480713, + "learning_rate": 4.412211973154195e-06, + "loss": 0.5639, + "step": 2905 + }, + { + "epoch": 1.3739952718676123, + "grad_norm": 2.869102954864502, + "learning_rate": 4.411810064808376e-06, + "loss": 0.5384, + "step": 2906 + }, + { + "epoch": 1.374468085106383, + "grad_norm": 2.703199863433838, + "learning_rate": 4.411408037422529e-06, + "loss": 0.5742, + "step": 2907 + }, + { + "epoch": 1.3749408983451537, + "grad_norm": 2.685450792312622, + "learning_rate": 4.411005891021684e-06, + "loss": 0.5121, + "step": 2908 + }, + { + "epoch": 1.3754137115839242, + "grad_norm": 2.9572203159332275, + "learning_rate": 4.410603625630882e-06, + "loss": 0.5444, + "step": 2909 + }, + { + "epoch": 1.375886524822695, + "grad_norm": 2.707002878189087, + "learning_rate": 4.410201241275169e-06, + "loss": 0.5125, + "step": 2910 + }, + { + "epoch": 1.3763593380614658, + "grad_norm": 3.0158939361572266, + "learning_rate": 4.409798737979602e-06, + "loss": 0.5299, + "step": 2911 + }, + { + "epoch": 1.3768321513002364, + "grad_norm": 2.7932698726654053, + "learning_rate": 4.4093961157692415e-06, + "loss": 0.5437, + "step": 2912 + }, + { + "epoch": 1.377304964539007, + "grad_norm": 2.459510326385498, + "learning_rate": 4.408993374669156e-06, + "loss": 0.5548, + "step": 2913 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 2.7500696182250977, + "learning_rate": 4.408590514704425e-06, + "loss": 0.5186, + "step": 2914 + }, + { + "epoch": 1.3782505910165486, + "grad_norm": 2.7824268341064453, + "learning_rate": 4.4081875359001315e-06, + "loss": 0.4762, + "step": 2915 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 2.4202158451080322, + "learning_rate": 4.4077844382813675e-06, + "loss": 0.5005, + "step": 2916 + }, + { + "epoch": 1.3791962174940897, + "grad_norm": 2.5566670894622803, + "learning_rate": 4.4073812218732316e-06, + "loss": 0.5377, + "step": 2917 + }, + { + "epoch": 1.3796690307328605, + "grad_norm": 3.400874376296997, + "learning_rate": 4.406977886700831e-06, + "loss": 0.6637, + "step": 2918 + }, + { + "epoch": 1.3801418439716313, + "grad_norm": 
2.8187878131866455, + "learning_rate": 4.406574432789278e-06, + "loss": 0.5033, + "step": 2919 + }, + { + "epoch": 1.3806146572104019, + "grad_norm": 2.5578041076660156, + "learning_rate": 4.406170860163697e-06, + "loss": 0.5293, + "step": 2920 + }, + { + "epoch": 1.3810874704491725, + "grad_norm": 2.6709718704223633, + "learning_rate": 4.405767168849213e-06, + "loss": 0.5144, + "step": 2921 + }, + { + "epoch": 1.3815602836879433, + "grad_norm": 3.049365997314453, + "learning_rate": 4.405363358870965e-06, + "loss": 0.4894, + "step": 2922 + }, + { + "epoch": 1.382033096926714, + "grad_norm": 2.5569891929626465, + "learning_rate": 4.404959430254095e-06, + "loss": 0.4929, + "step": 2923 + }, + { + "epoch": 1.3825059101654846, + "grad_norm": 2.8288230895996094, + "learning_rate": 4.404555383023754e-06, + "loss": 0.5438, + "step": 2924 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 2.8363358974456787, + "learning_rate": 4.404151217205102e-06, + "loss": 0.545, + "step": 2925 + }, + { + "epoch": 1.383451536643026, + "grad_norm": 2.720972776412964, + "learning_rate": 4.403746932823302e-06, + "loss": 0.5732, + "step": 2926 + }, + { + "epoch": 1.3839243498817968, + "grad_norm": 2.728043794631958, + "learning_rate": 4.403342529903528e-06, + "loss": 0.4944, + "step": 2927 + }, + { + "epoch": 1.3843971631205674, + "grad_norm": 2.4366135597229004, + "learning_rate": 4.402938008470961e-06, + "loss": 0.4441, + "step": 2928 + }, + { + "epoch": 1.384869976359338, + "grad_norm": 2.858454704284668, + "learning_rate": 4.402533368550788e-06, + "loss": 0.5359, + "step": 2929 + }, + { + "epoch": 1.3853427895981087, + "grad_norm": 2.805795907974243, + "learning_rate": 4.402128610168205e-06, + "loss": 0.4954, + "step": 2930 + }, + { + "epoch": 1.3858156028368795, + "grad_norm": 3.3514177799224854, + "learning_rate": 4.401723733348413e-06, + "loss": 0.579, + "step": 2931 + }, + { + "epoch": 1.3862884160756501, + "grad_norm": 2.6255125999450684, + "learning_rate": 4.401318738116624e-06, + "loss": 0.5002, + "step": 2932 + }, + { + "epoch": 1.3867612293144207, + "grad_norm": 2.3480796813964844, + "learning_rate": 4.400913624498054e-06, + "loss": 0.4688, + "step": 2933 + }, + { + "epoch": 1.3872340425531915, + "grad_norm": 2.710165023803711, + "learning_rate": 4.400508392517927e-06, + "loss": 0.5099, + "step": 2934 + }, + { + "epoch": 1.3877068557919623, + "grad_norm": 2.5820295810699463, + "learning_rate": 4.400103042201477e-06, + "loss": 0.512, + "step": 2935 + }, + { + "epoch": 1.3881796690307329, + "grad_norm": 2.750596523284912, + "learning_rate": 4.399697573573942e-06, + "loss": 0.463, + "step": 2936 + }, + { + "epoch": 1.3886524822695034, + "grad_norm": 3.497537612915039, + "learning_rate": 4.399291986660569e-06, + "loss": 0.5676, + "step": 2937 + }, + { + "epoch": 1.3891252955082742, + "grad_norm": 2.4046003818511963, + "learning_rate": 4.398886281486612e-06, + "loss": 0.5408, + "step": 2938 + }, + { + "epoch": 1.389598108747045, + "grad_norm": 2.941606283187866, + "learning_rate": 4.398480458077332e-06, + "loss": 0.5734, + "step": 2939 + }, + { + "epoch": 1.3900709219858156, + "grad_norm": 3.030214309692383, + "learning_rate": 4.398074516458e-06, + "loss": 0.5353, + "step": 2940 + }, + { + "epoch": 1.3905437352245862, + "grad_norm": 2.9991626739501953, + "learning_rate": 4.397668456653889e-06, + "loss": 0.5989, + "step": 2941 + }, + { + "epoch": 1.391016548463357, + "grad_norm": 4.163141250610352, + "learning_rate": 4.397262278690285e-06, + "loss": 0.5436, + "step": 2942 + }, + { + "epoch": 
1.3914893617021278, + "grad_norm": 2.6576037406921387, + "learning_rate": 4.396855982592478e-06, + "loss": 0.5206, + "step": 2943 + }, + { + "epoch": 1.3919621749408984, + "grad_norm": 2.7729203701019287, + "learning_rate": 4.396449568385768e-06, + "loss": 0.5403, + "step": 2944 + }, + { + "epoch": 1.392434988179669, + "grad_norm": 2.4560446739196777, + "learning_rate": 4.396043036095457e-06, + "loss": 0.4924, + "step": 2945 + }, + { + "epoch": 1.3929078014184397, + "grad_norm": 2.6370556354522705, + "learning_rate": 4.39563638574686e-06, + "loss": 0.5543, + "step": 2946 + }, + { + "epoch": 1.3933806146572105, + "grad_norm": 2.593914270401001, + "learning_rate": 4.395229617365298e-06, + "loss": 0.5133, + "step": 2947 + }, + { + "epoch": 1.393853427895981, + "grad_norm": 2.3583998680114746, + "learning_rate": 4.394822730976099e-06, + "loss": 0.4436, + "step": 2948 + }, + { + "epoch": 1.3943262411347517, + "grad_norm": 3.2768537998199463, + "learning_rate": 4.394415726604596e-06, + "loss": 0.5489, + "step": 2949 + }, + { + "epoch": 1.3947990543735225, + "grad_norm": 2.88662052154541, + "learning_rate": 4.394008604276133e-06, + "loss": 0.5194, + "step": 2950 + }, + { + "epoch": 1.3952718676122933, + "grad_norm": 2.46610426902771, + "learning_rate": 4.393601364016059e-06, + "loss": 0.5255, + "step": 2951 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 3.122509241104126, + "learning_rate": 4.393194005849731e-06, + "loss": 0.6046, + "step": 2952 + }, + { + "epoch": 1.3962174940898344, + "grad_norm": 2.724926471710205, + "learning_rate": 4.392786529802513e-06, + "loss": 0.4958, + "step": 2953 + }, + { + "epoch": 1.3966903073286052, + "grad_norm": 2.491485595703125, + "learning_rate": 4.3923789358997785e-06, + "loss": 0.5209, + "step": 2954 + }, + { + "epoch": 1.397163120567376, + "grad_norm": 2.61110520362854, + "learning_rate": 4.3919712241669056e-06, + "loss": 0.5202, + "step": 2955 + }, + { + "epoch": 1.3976359338061466, + "grad_norm": 2.3814501762390137, + "learning_rate": 4.39156339462928e-06, + "loss": 0.4966, + "step": 2956 + }, + { + "epoch": 1.3981087470449172, + "grad_norm": 2.762498617172241, + "learning_rate": 4.391155447312296e-06, + "loss": 0.6025, + "step": 2957 + }, + { + "epoch": 1.398581560283688, + "grad_norm": 2.964975595474243, + "learning_rate": 4.390747382241355e-06, + "loss": 0.4845, + "step": 2958 + }, + { + "epoch": 1.3990543735224588, + "grad_norm": 3.0117249488830566, + "learning_rate": 4.3903391994418655e-06, + "loss": 0.5326, + "step": 2959 + }, + { + "epoch": 1.3995271867612293, + "grad_norm": 2.578626871109009, + "learning_rate": 4.389930898939243e-06, + "loss": 0.5271, + "step": 2960 + }, + { + "epoch": 1.4, + "grad_norm": 2.747441053390503, + "learning_rate": 4.38952248075891e-06, + "loss": 0.5553, + "step": 2961 + }, + { + "epoch": 1.4004728132387707, + "grad_norm": 2.8273086547851562, + "learning_rate": 4.389113944926297e-06, + "loss": 0.5475, + "step": 2962 + }, + { + "epoch": 1.4009456264775415, + "grad_norm": 2.55238676071167, + "learning_rate": 4.388705291466843e-06, + "loss": 0.4864, + "step": 2963 + }, + { + "epoch": 1.401418439716312, + "grad_norm": 2.597214460372925, + "learning_rate": 4.388296520405992e-06, + "loss": 0.4845, + "step": 2964 + }, + { + "epoch": 1.4018912529550827, + "grad_norm": 2.608962297439575, + "learning_rate": 4.387887631769196e-06, + "loss": 0.5544, + "step": 2965 + }, + { + "epoch": 1.4023640661938535, + "grad_norm": 2.2754876613616943, + "learning_rate": 4.3874786255819165e-06, + "loss": 0.5045, + "step": 2966 + }, + { + 
"epoch": 1.4028368794326243, + "grad_norm": 2.9900264739990234, + "learning_rate": 4.387069501869618e-06, + "loss": 0.562, + "step": 2967 + }, + { + "epoch": 1.4033096926713948, + "grad_norm": 2.8069417476654053, + "learning_rate": 4.386660260657778e-06, + "loss": 0.5284, + "step": 2968 + }, + { + "epoch": 1.4037825059101654, + "grad_norm": 2.68894624710083, + "learning_rate": 4.386250901971875e-06, + "loss": 0.5879, + "step": 2969 + }, + { + "epoch": 1.4042553191489362, + "grad_norm": 2.614485025405884, + "learning_rate": 4.385841425837399e-06, + "loss": 0.4771, + "step": 2970 + }, + { + "epoch": 1.4047281323877068, + "grad_norm": 2.487950325012207, + "learning_rate": 4.385431832279848e-06, + "loss": 0.5552, + "step": 2971 + }, + { + "epoch": 1.4052009456264776, + "grad_norm": 2.5098392963409424, + "learning_rate": 4.385022121324723e-06, + "loss": 0.5267, + "step": 2972 + }, + { + "epoch": 1.4056737588652481, + "grad_norm": 2.825838565826416, + "learning_rate": 4.384612292997537e-06, + "loss": 0.5336, + "step": 2973 + }, + { + "epoch": 1.406146572104019, + "grad_norm": 2.898188829421997, + "learning_rate": 4.384202347323806e-06, + "loss": 0.5685, + "step": 2974 + }, + { + "epoch": 1.4066193853427895, + "grad_norm": 2.8722569942474365, + "learning_rate": 4.383792284329057e-06, + "loss": 0.5977, + "step": 2975 + }, + { + "epoch": 1.4070921985815603, + "grad_norm": 2.832951307296753, + "learning_rate": 4.3833821040388235e-06, + "loss": 0.5766, + "step": 2976 + }, + { + "epoch": 1.407565011820331, + "grad_norm": 2.7353670597076416, + "learning_rate": 4.3829718064786446e-06, + "loss": 0.5461, + "step": 2977 + }, + { + "epoch": 1.4080378250591017, + "grad_norm": 2.6050429344177246, + "learning_rate": 4.3825613916740675e-06, + "loss": 0.5501, + "step": 2978 + }, + { + "epoch": 1.4085106382978723, + "grad_norm": 2.79719877243042, + "learning_rate": 4.382150859650647e-06, + "loss": 0.502, + "step": 2979 + }, + { + "epoch": 1.408983451536643, + "grad_norm": 2.5538079738616943, + "learning_rate": 4.381740210433946e-06, + "loss": 0.4762, + "step": 2980 + }, + { + "epoch": 1.4094562647754136, + "grad_norm": 2.7256062030792236, + "learning_rate": 4.381329444049533e-06, + "loss": 0.4692, + "step": 2981 + }, + { + "epoch": 1.4099290780141844, + "grad_norm": 2.7778146266937256, + "learning_rate": 4.3809185605229855e-06, + "loss": 0.5366, + "step": 2982 + }, + { + "epoch": 1.410401891252955, + "grad_norm": 2.6289451122283936, + "learning_rate": 4.380507559879887e-06, + "loss": 0.5412, + "step": 2983 + }, + { + "epoch": 1.4108747044917258, + "grad_norm": 2.697204828262329, + "learning_rate": 4.380096442145827e-06, + "loss": 0.5065, + "step": 2984 + }, + { + "epoch": 1.4113475177304964, + "grad_norm": 2.4709219932556152, + "learning_rate": 4.379685207346407e-06, + "loss": 0.568, + "step": 2985 + }, + { + "epoch": 1.4118203309692672, + "grad_norm": 2.9740655422210693, + "learning_rate": 4.379273855507231e-06, + "loss": 0.5512, + "step": 2986 + }, + { + "epoch": 1.4122931442080378, + "grad_norm": 3.0090627670288086, + "learning_rate": 4.378862386653911e-06, + "loss": 0.5459, + "step": 2987 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 2.8835368156433105, + "learning_rate": 4.378450800812071e-06, + "loss": 0.5357, + "step": 2988 + }, + { + "epoch": 1.4132387706855791, + "grad_norm": 2.558824062347412, + "learning_rate": 4.378039098007335e-06, + "loss": 0.536, + "step": 2989 + }, + { + "epoch": 1.41371158392435, + "grad_norm": 2.5572092533111572, + "learning_rate": 4.377627278265339e-06, + "loss": 
0.5183, + "step": 2990 + }, + { + "epoch": 1.4141843971631205, + "grad_norm": 2.7356579303741455, + "learning_rate": 4.377215341611727e-06, + "loss": 0.5087, + "step": 2991 + }, + { + "epoch": 1.4146572104018913, + "grad_norm": 2.7541024684906006, + "learning_rate": 4.376803288072146e-06, + "loss": 0.4509, + "step": 2992 + }, + { + "epoch": 1.4151300236406619, + "grad_norm": 2.7548446655273438, + "learning_rate": 4.376391117672254e-06, + "loss": 0.5532, + "step": 2993 + }, + { + "epoch": 1.4156028368794327, + "grad_norm": 2.9107465744018555, + "learning_rate": 4.375978830437715e-06, + "loss": 0.5719, + "step": 2994 + }, + { + "epoch": 1.4160756501182032, + "grad_norm": 2.7077393531799316, + "learning_rate": 4.3755664263942e-06, + "loss": 0.5084, + "step": 2995 + }, + { + "epoch": 1.416548463356974, + "grad_norm": 2.764209270477295, + "learning_rate": 4.375153905567388e-06, + "loss": 0.5976, + "step": 2996 + }, + { + "epoch": 1.4170212765957446, + "grad_norm": 2.7792932987213135, + "learning_rate": 4.374741267982964e-06, + "loss": 0.5358, + "step": 2997 + }, + { + "epoch": 1.4174940898345154, + "grad_norm": 2.459212064743042, + "learning_rate": 4.374328513666622e-06, + "loss": 0.5181, + "step": 2998 + }, + { + "epoch": 1.417966903073286, + "grad_norm": 2.548546552658081, + "learning_rate": 4.373915642644062e-06, + "loss": 0.528, + "step": 2999 + }, + { + "epoch": 1.4184397163120568, + "grad_norm": 2.998138189315796, + "learning_rate": 4.373502654940992e-06, + "loss": 0.5233, + "step": 3000 + }, + { + "epoch": 1.4189125295508274, + "grad_norm": 2.604341983795166, + "learning_rate": 4.373089550583126e-06, + "loss": 0.5274, + "step": 3001 + }, + { + "epoch": 1.4193853427895982, + "grad_norm": 2.6792588233947754, + "learning_rate": 4.372676329596188e-06, + "loss": 0.5061, + "step": 3002 + }, + { + "epoch": 1.4198581560283687, + "grad_norm": 2.5182368755340576, + "learning_rate": 4.372262992005906e-06, + "loss": 0.541, + "step": 3003 + }, + { + "epoch": 1.4203309692671395, + "grad_norm": 2.690718173980713, + "learning_rate": 4.371849537838018e-06, + "loss": 0.5308, + "step": 3004 + }, + { + "epoch": 1.42080378250591, + "grad_norm": 2.6797590255737305, + "learning_rate": 4.371435967118266e-06, + "loss": 0.5728, + "step": 3005 + }, + { + "epoch": 1.421276595744681, + "grad_norm": 2.847900152206421, + "learning_rate": 4.371022279872403e-06, + "loss": 0.5053, + "step": 3006 + }, + { + "epoch": 1.4217494089834515, + "grad_norm": 2.497810125350952, + "learning_rate": 4.370608476126186e-06, + "loss": 0.5057, + "step": 3007 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 2.5259225368499756, + "learning_rate": 4.370194555905382e-06, + "loss": 0.5508, + "step": 3008 + }, + { + "epoch": 1.4226950354609929, + "grad_norm": 2.774118423461914, + "learning_rate": 4.369780519235763e-06, + "loss": 0.5419, + "step": 3009 + }, + { + "epoch": 1.4231678486997636, + "grad_norm": 2.2764663696289062, + "learning_rate": 4.369366366143111e-06, + "loss": 0.5032, + "step": 3010 + }, + { + "epoch": 1.4236406619385342, + "grad_norm": 2.736347198486328, + "learning_rate": 4.368952096653211e-06, + "loss": 0.5184, + "step": 3011 + }, + { + "epoch": 1.424113475177305, + "grad_norm": 2.476762056350708, + "learning_rate": 4.36853771079186e-06, + "loss": 0.5331, + "step": 3012 + }, + { + "epoch": 1.4245862884160756, + "grad_norm": 2.8006162643432617, + "learning_rate": 4.3681232085848585e-06, + "loss": 0.5331, + "step": 3013 + }, + { + "epoch": 1.4250591016548464, + "grad_norm": 2.509143590927124, + "learning_rate": 
4.367708590058016e-06, + "loss": 0.5127, + "step": 3014 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 3.030137538909912, + "learning_rate": 4.3672938552371505e-06, + "loss": 0.5555, + "step": 3015 + }, + { + "epoch": 1.4260047281323878, + "grad_norm": 3.0536904335021973, + "learning_rate": 4.3668790041480835e-06, + "loss": 0.5241, + "step": 3016 + }, + { + "epoch": 1.4264775413711583, + "grad_norm": 2.6400439739227295, + "learning_rate": 4.366464036816647e-06, + "loss": 0.4946, + "step": 3017 + }, + { + "epoch": 1.4269503546099291, + "grad_norm": 2.7302589416503906, + "learning_rate": 4.366048953268679e-06, + "loss": 0.5105, + "step": 3018 + }, + { + "epoch": 1.4274231678486997, + "grad_norm": 2.504549264907837, + "learning_rate": 4.365633753530026e-06, + "loss": 0.4844, + "step": 3019 + }, + { + "epoch": 1.4278959810874705, + "grad_norm": 2.3872320652008057, + "learning_rate": 4.365218437626539e-06, + "loss": 0.4402, + "step": 3020 + }, + { + "epoch": 1.428368794326241, + "grad_norm": 2.531649351119995, + "learning_rate": 4.364803005584078e-06, + "loss": 0.4913, + "step": 3021 + }, + { + "epoch": 1.4288416075650119, + "grad_norm": 2.4683783054351807, + "learning_rate": 4.364387457428512e-06, + "loss": 0.515, + "step": 3022 + }, + { + "epoch": 1.4293144208037825, + "grad_norm": 2.632336378097534, + "learning_rate": 4.363971793185713e-06, + "loss": 0.5398, + "step": 3023 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 2.7456719875335693, + "learning_rate": 4.363556012881565e-06, + "loss": 0.5254, + "step": 3024 + }, + { + "epoch": 1.4302600472813238, + "grad_norm": 2.607177972793579, + "learning_rate": 4.363140116541955e-06, + "loss": 0.5266, + "step": 3025 + }, + { + "epoch": 1.4307328605200946, + "grad_norm": 2.640127420425415, + "learning_rate": 4.3627241041927796e-06, + "loss": 0.5157, + "step": 3026 + }, + { + "epoch": 1.4312056737588652, + "grad_norm": 2.4210736751556396, + "learning_rate": 4.362307975859941e-06, + "loss": 0.4599, + "step": 3027 + }, + { + "epoch": 1.431678486997636, + "grad_norm": 2.6007790565490723, + "learning_rate": 4.361891731569352e-06, + "loss": 0.5298, + "step": 3028 + }, + { + "epoch": 1.4321513002364066, + "grad_norm": 2.5352046489715576, + "learning_rate": 4.361475371346928e-06, + "loss": 0.5128, + "step": 3029 + }, + { + "epoch": 1.4326241134751774, + "grad_norm": 2.4204049110412598, + "learning_rate": 4.361058895218596e-06, + "loss": 0.4669, + "step": 3030 + }, + { + "epoch": 1.433096926713948, + "grad_norm": 2.525240182876587, + "learning_rate": 4.360642303210286e-06, + "loss": 0.4925, + "step": 3031 + }, + { + "epoch": 1.4335697399527187, + "grad_norm": 2.839646339416504, + "learning_rate": 4.360225595347939e-06, + "loss": 0.5868, + "step": 3032 + }, + { + "epoch": 1.4340425531914893, + "grad_norm": 2.5043296813964844, + "learning_rate": 4.359808771657501e-06, + "loss": 0.4951, + "step": 3033 + }, + { + "epoch": 1.4345153664302601, + "grad_norm": 2.9082300662994385, + "learning_rate": 4.359391832164927e-06, + "loss": 0.5259, + "step": 3034 + }, + { + "epoch": 1.4349881796690307, + "grad_norm": 2.6651999950408936, + "learning_rate": 4.3589747768961745e-06, + "loss": 0.537, + "step": 3035 + }, + { + "epoch": 1.4354609929078015, + "grad_norm": 2.577077865600586, + "learning_rate": 4.358557605877216e-06, + "loss": 0.5186, + "step": 3036 + }, + { + "epoch": 1.435933806146572, + "grad_norm": 2.7445287704467773, + "learning_rate": 4.3581403191340236e-06, + "loss": 0.5573, + "step": 3037 + }, + { + "epoch": 1.4364066193853429, + "grad_norm": 
2.502086639404297, + "learning_rate": 4.357722916692582e-06, + "loss": 0.5039, + "step": 3038 + }, + { + "epoch": 1.4368794326241134, + "grad_norm": 2.4476163387298584, + "learning_rate": 4.357305398578879e-06, + "loss": 0.5638, + "step": 3039 + }, + { + "epoch": 1.4373522458628842, + "grad_norm": 2.7705588340759277, + "learning_rate": 4.356887764818915e-06, + "loss": 0.5485, + "step": 3040 + }, + { + "epoch": 1.4378250591016548, + "grad_norm": 2.498225450515747, + "learning_rate": 4.356470015438691e-06, + "loss": 0.5486, + "step": 3041 + }, + { + "epoch": 1.4382978723404256, + "grad_norm": 2.394320011138916, + "learning_rate": 4.356052150464219e-06, + "loss": 0.512, + "step": 3042 + }, + { + "epoch": 1.4387706855791962, + "grad_norm": 2.8725767135620117, + "learning_rate": 4.3556341699215185e-06, + "loss": 0.5202, + "step": 3043 + }, + { + "epoch": 1.439243498817967, + "grad_norm": 3.1707918643951416, + "learning_rate": 4.355216073836615e-06, + "loss": 0.5229, + "step": 3044 + }, + { + "epoch": 1.4397163120567376, + "grad_norm": 2.532578468322754, + "learning_rate": 4.3547978622355415e-06, + "loss": 0.4569, + "step": 3045 + }, + { + "epoch": 1.4401891252955084, + "grad_norm": 3.0111029148101807, + "learning_rate": 4.354379535144338e-06, + "loss": 0.5801, + "step": 3046 + }, + { + "epoch": 1.440661938534279, + "grad_norm": 2.9554224014282227, + "learning_rate": 4.353961092589052e-06, + "loss": 0.5968, + "step": 3047 + }, + { + "epoch": 1.4411347517730497, + "grad_norm": 2.7562637329101562, + "learning_rate": 4.353542534595738e-06, + "loss": 0.5005, + "step": 3048 + }, + { + "epoch": 1.4416075650118203, + "grad_norm": 3.083254337310791, + "learning_rate": 4.3531238611904595e-06, + "loss": 0.5389, + "step": 3049 + }, + { + "epoch": 1.442080378250591, + "grad_norm": 2.7778005599975586, + "learning_rate": 4.352705072399282e-06, + "loss": 0.5342, + "step": 3050 + }, + { + "epoch": 1.4425531914893617, + "grad_norm": 2.6673996448516846, + "learning_rate": 4.3522861682482845e-06, + "loss": 0.5213, + "step": 3051 + }, + { + "epoch": 1.4430260047281322, + "grad_norm": 2.637605905532837, + "learning_rate": 4.351867148763548e-06, + "loss": 0.4893, + "step": 3052 + }, + { + "epoch": 1.443498817966903, + "grad_norm": 2.834469795227051, + "learning_rate": 4.351448013971166e-06, + "loss": 0.5391, + "step": 3053 + }, + { + "epoch": 1.4439716312056738, + "grad_norm": 2.824153184890747, + "learning_rate": 4.351028763897234e-06, + "loss": 0.6403, + "step": 3054 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 2.558966875076294, + "learning_rate": 4.350609398567857e-06, + "loss": 0.4912, + "step": 3055 + }, + { + "epoch": 1.444917257683215, + "grad_norm": 2.281726360321045, + "learning_rate": 4.3501899180091475e-06, + "loss": 0.4655, + "step": 3056 + }, + { + "epoch": 1.4453900709219858, + "grad_norm": 2.499472141265869, + "learning_rate": 4.349770322247225e-06, + "loss": 0.4878, + "step": 3057 + }, + { + "epoch": 1.4458628841607566, + "grad_norm": 2.578615188598633, + "learning_rate": 4.349350611308215e-06, + "loss": 0.4855, + "step": 3058 + }, + { + "epoch": 1.4463356973995272, + "grad_norm": 2.7111165523529053, + "learning_rate": 4.348930785218252e-06, + "loss": 0.5415, + "step": 3059 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 2.8081610202789307, + "learning_rate": 4.348510844003476e-06, + "loss": 0.4881, + "step": 3060 + }, + { + "epoch": 1.4472813238770685, + "grad_norm": 2.9439868927001953, + "learning_rate": 4.348090787690036e-06, + "loss": 0.5485, + "step": 3061 + }, + { + "epoch": 
1.4477541371158393, + "grad_norm": 2.592532157897949, + "learning_rate": 4.347670616304085e-06, + "loss": 0.4912, + "step": 3062 + }, + { + "epoch": 1.44822695035461, + "grad_norm": 2.960592746734619, + "learning_rate": 4.347250329871787e-06, + "loss": 0.5473, + "step": 3063 + }, + { + "epoch": 1.4486997635933805, + "grad_norm": 2.5786688327789307, + "learning_rate": 4.3468299284193116e-06, + "loss": 0.5348, + "step": 3064 + }, + { + "epoch": 1.4491725768321513, + "grad_norm": 2.6084046363830566, + "learning_rate": 4.346409411972834e-06, + "loss": 0.527, + "step": 3065 + }, + { + "epoch": 1.449645390070922, + "grad_norm": 2.489748239517212, + "learning_rate": 4.3459887805585385e-06, + "loss": 0.4943, + "step": 3066 + }, + { + "epoch": 1.4501182033096927, + "grad_norm": 2.452131986618042, + "learning_rate": 4.345568034202617e-06, + "loss": 0.4886, + "step": 3067 + }, + { + "epoch": 1.4505910165484632, + "grad_norm": 2.4034671783447266, + "learning_rate": 4.345147172931266e-06, + "loss": 0.4689, + "step": 3068 + }, + { + "epoch": 1.451063829787234, + "grad_norm": 2.6045448780059814, + "learning_rate": 4.344726196770691e-06, + "loss": 0.5842, + "step": 3069 + }, + { + "epoch": 1.4515366430260048, + "grad_norm": 2.697593927383423, + "learning_rate": 4.3443051057471045e-06, + "loss": 0.5358, + "step": 3070 + }, + { + "epoch": 1.4520094562647754, + "grad_norm": 2.6080820560455322, + "learning_rate": 4.343883899886727e-06, + "loss": 0.5361, + "step": 3071 + }, + { + "epoch": 1.452482269503546, + "grad_norm": 2.4605307579040527, + "learning_rate": 4.343462579215783e-06, + "loss": 0.4941, + "step": 3072 + }, + { + "epoch": 1.4529550827423168, + "grad_norm": 2.8025355339050293, + "learning_rate": 4.343041143760509e-06, + "loss": 0.5116, + "step": 3073 + }, + { + "epoch": 1.4534278959810876, + "grad_norm": 2.432515859603882, + "learning_rate": 4.3426195935471434e-06, + "loss": 0.4991, + "step": 3074 + }, + { + "epoch": 1.4539007092198581, + "grad_norm": 2.5838661193847656, + "learning_rate": 4.342197928601935e-06, + "loss": 0.4994, + "step": 3075 + }, + { + "epoch": 1.4543735224586287, + "grad_norm": 2.421692371368408, + "learning_rate": 4.341776148951141e-06, + "loss": 0.4945, + "step": 3076 + }, + { + "epoch": 1.4548463356973995, + "grad_norm": 2.5354676246643066, + "learning_rate": 4.341354254621021e-06, + "loss": 0.4859, + "step": 3077 + }, + { + "epoch": 1.4553191489361703, + "grad_norm": 2.7316789627075195, + "learning_rate": 4.340932245637846e-06, + "loss": 0.5136, + "step": 3078 + }, + { + "epoch": 1.455791962174941, + "grad_norm": 3.5903496742248535, + "learning_rate": 4.340510122027891e-06, + "loss": 0.6451, + "step": 3079 + }, + { + "epoch": 1.4562647754137115, + "grad_norm": 2.95190167427063, + "learning_rate": 4.340087883817442e-06, + "loss": 0.6354, + "step": 3080 + }, + { + "epoch": 1.4567375886524823, + "grad_norm": 2.8659214973449707, + "learning_rate": 4.339665531032789e-06, + "loss": 0.5514, + "step": 3081 + }, + { + "epoch": 1.457210401891253, + "grad_norm": 2.5681674480438232, + "learning_rate": 4.339243063700231e-06, + "loss": 0.5135, + "step": 3082 + }, + { + "epoch": 1.4576832151300236, + "grad_norm": 2.7353906631469727, + "learning_rate": 4.338820481846072e-06, + "loss": 0.4608, + "step": 3083 + }, + { + "epoch": 1.4581560283687942, + "grad_norm": 2.6116466522216797, + "learning_rate": 4.3383977854966245e-06, + "loss": 0.4924, + "step": 3084 + }, + { + "epoch": 1.458628841607565, + "grad_norm": 2.6676487922668457, + "learning_rate": 4.337974974678207e-06, + "loss": 0.5747, + 
"step": 3085 + }, + { + "epoch": 1.4591016548463358, + "grad_norm": 2.909031629562378, + "learning_rate": 4.337552049417147e-06, + "loss": 0.4618, + "step": 3086 + }, + { + "epoch": 1.4595744680851064, + "grad_norm": 2.7614190578460693, + "learning_rate": 4.33712900973978e-06, + "loss": 0.5154, + "step": 3087 + }, + { + "epoch": 1.460047281323877, + "grad_norm": 2.452188014984131, + "learning_rate": 4.336705855672444e-06, + "loss": 0.542, + "step": 3088 + }, + { + "epoch": 1.4605200945626478, + "grad_norm": 3.0004117488861084, + "learning_rate": 4.336282587241488e-06, + "loss": 0.5857, + "step": 3089 + }, + { + "epoch": 1.4609929078014185, + "grad_norm": 2.870783567428589, + "learning_rate": 4.335859204473268e-06, + "loss": 0.5506, + "step": 3090 + }, + { + "epoch": 1.4614657210401891, + "grad_norm": 3.1078689098358154, + "learning_rate": 4.335435707394145e-06, + "loss": 0.5138, + "step": 3091 + }, + { + "epoch": 1.4619385342789597, + "grad_norm": 2.8516197204589844, + "learning_rate": 4.335012096030488e-06, + "loss": 0.5842, + "step": 3092 + }, + { + "epoch": 1.4624113475177305, + "grad_norm": 2.615922212600708, + "learning_rate": 4.334588370408675e-06, + "loss": 0.4896, + "step": 3093 + }, + { + "epoch": 1.4628841607565013, + "grad_norm": 3.1911802291870117, + "learning_rate": 4.334164530555088e-06, + "loss": 0.4974, + "step": 3094 + }, + { + "epoch": 1.4633569739952719, + "grad_norm": 3.075051784515381, + "learning_rate": 4.3337405764961186e-06, + "loss": 0.567, + "step": 3095 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 2.550625801086426, + "learning_rate": 4.333316508258163e-06, + "loss": 0.4887, + "step": 3096 + }, + { + "epoch": 1.4643026004728132, + "grad_norm": 2.3986475467681885, + "learning_rate": 4.332892325867629e-06, + "loss": 0.5047, + "step": 3097 + }, + { + "epoch": 1.464775413711584, + "grad_norm": 2.5045125484466553, + "learning_rate": 4.332468029350926e-06, + "loss": 0.4721, + "step": 3098 + }, + { + "epoch": 1.4652482269503546, + "grad_norm": 2.347365617752075, + "learning_rate": 4.332043618734474e-06, + "loss": 0.4913, + "step": 3099 + }, + { + "epoch": 1.4657210401891252, + "grad_norm": 2.459928512573242, + "learning_rate": 4.331619094044699e-06, + "loss": 0.523, + "step": 3100 + }, + { + "epoch": 1.466193853427896, + "grad_norm": 2.5771310329437256, + "learning_rate": 4.331194455308035e-06, + "loss": 0.593, + "step": 3101 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 3.1351823806762695, + "learning_rate": 4.330769702550921e-06, + "loss": 0.5852, + "step": 3102 + }, + { + "epoch": 1.4671394799054374, + "grad_norm": 2.589817523956299, + "learning_rate": 4.330344835799806e-06, + "loss": 0.508, + "step": 3103 + }, + { + "epoch": 1.467612293144208, + "grad_norm": 3.1140341758728027, + "learning_rate": 4.329919855081144e-06, + "loss": 0.469, + "step": 3104 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 2.8186635971069336, + "learning_rate": 4.329494760421396e-06, + "loss": 0.5088, + "step": 3105 + }, + { + "epoch": 1.4685579196217495, + "grad_norm": 2.676077365875244, + "learning_rate": 4.329069551847031e-06, + "loss": 0.52, + "step": 3106 + }, + { + "epoch": 1.46903073286052, + "grad_norm": 2.5543313026428223, + "learning_rate": 4.328644229384526e-06, + "loss": 0.5066, + "step": 3107 + }, + { + "epoch": 1.4695035460992907, + "grad_norm": 2.8176217079162598, + "learning_rate": 4.328218793060362e-06, + "loss": 0.6404, + "step": 3108 + }, + { + "epoch": 1.4699763593380615, + "grad_norm": 2.485217332839966, + "learning_rate": 
4.3277932429010314e-06, + "loss": 0.4578, + "step": 3109 + }, + { + "epoch": 1.4704491725768323, + "grad_norm": 2.6741621494293213, + "learning_rate": 4.327367578933031e-06, + "loss": 0.5068, + "step": 3110 + }, + { + "epoch": 1.4709219858156029, + "grad_norm": 2.377242088317871, + "learning_rate": 4.326941801182863e-06, + "loss": 0.5249, + "step": 3111 + }, + { + "epoch": 1.4713947990543734, + "grad_norm": 2.790046215057373, + "learning_rate": 4.32651590967704e-06, + "loss": 0.5532, + "step": 3112 + }, + { + "epoch": 1.4718676122931442, + "grad_norm": 2.78019642829895, + "learning_rate": 4.326089904442081e-06, + "loss": 0.5362, + "step": 3113 + }, + { + "epoch": 1.472340425531915, + "grad_norm": 2.5661380290985107, + "learning_rate": 4.32566378550451e-06, + "loss": 0.5041, + "step": 3114 + }, + { + "epoch": 1.4728132387706856, + "grad_norm": 2.522153615951538, + "learning_rate": 4.3252375528908605e-06, + "loss": 0.5074, + "step": 3115 + }, + { + "epoch": 1.4732860520094562, + "grad_norm": 2.874688148498535, + "learning_rate": 4.3248112066276725e-06, + "loss": 0.59, + "step": 3116 + }, + { + "epoch": 1.473758865248227, + "grad_norm": 3.067866802215576, + "learning_rate": 4.324384746741492e-06, + "loss": 0.5924, + "step": 3117 + }, + { + "epoch": 1.4742316784869978, + "grad_norm": 3.359463930130005, + "learning_rate": 4.323958173258873e-06, + "loss": 0.6346, + "step": 3118 + }, + { + "epoch": 1.4747044917257683, + "grad_norm": 2.193024158477783, + "learning_rate": 4.323531486206376e-06, + "loss": 0.4594, + "step": 3119 + }, + { + "epoch": 1.475177304964539, + "grad_norm": 2.886889934539795, + "learning_rate": 4.323104685610569e-06, + "loss": 0.523, + "step": 3120 + }, + { + "epoch": 1.4756501182033097, + "grad_norm": 2.7558681964874268, + "learning_rate": 4.322677771498028e-06, + "loss": 0.5387, + "step": 3121 + }, + { + "epoch": 1.4761229314420805, + "grad_norm": 2.639277935028076, + "learning_rate": 4.322250743895335e-06, + "loss": 0.5599, + "step": 3122 + }, + { + "epoch": 1.476595744680851, + "grad_norm": 2.786198616027832, + "learning_rate": 4.321823602829078e-06, + "loss": 0.5405, + "step": 3123 + }, + { + "epoch": 1.4770685579196217, + "grad_norm": 2.582315683364868, + "learning_rate": 4.321396348325853e-06, + "loss": 0.4452, + "step": 3124 + }, + { + "epoch": 1.4775413711583925, + "grad_norm": 2.8574297428131104, + "learning_rate": 4.320968980412265e-06, + "loss": 0.4846, + "step": 3125 + }, + { + "epoch": 1.4780141843971633, + "grad_norm": 2.705281972885132, + "learning_rate": 4.320541499114922e-06, + "loss": 0.5548, + "step": 3126 + }, + { + "epoch": 1.4784869976359338, + "grad_norm": 2.3152754306793213, + "learning_rate": 4.320113904460444e-06, + "loss": 0.5216, + "step": 3127 + }, + { + "epoch": 1.4789598108747044, + "grad_norm": 3.230764150619507, + "learning_rate": 4.319686196475453e-06, + "loss": 0.6192, + "step": 3128 + }, + { + "epoch": 1.4794326241134752, + "grad_norm": 2.463380813598633, + "learning_rate": 4.319258375186583e-06, + "loss": 0.4872, + "step": 3129 + }, + { + "epoch": 1.479905437352246, + "grad_norm": 2.8477656841278076, + "learning_rate": 4.31883044062047e-06, + "loss": 0.5371, + "step": 3130 + }, + { + "epoch": 1.4803782505910166, + "grad_norm": 2.393911123275757, + "learning_rate": 4.318402392803762e-06, + "loss": 0.5334, + "step": 3131 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 2.6113736629486084, + "learning_rate": 4.317974231763109e-06, + "loss": 0.5572, + "step": 3132 + }, + { + "epoch": 1.481323877068558, + "grad_norm": 2.3941731452941895, 
+ "learning_rate": 4.317545957525173e-06, + "loss": 0.4849, + "step": 3133 + }, + { + "epoch": 1.4817966903073285, + "grad_norm": 2.9536755084991455, + "learning_rate": 4.317117570116619e-06, + "loss": 0.6058, + "step": 3134 + }, + { + "epoch": 1.4822695035460993, + "grad_norm": 2.595754623413086, + "learning_rate": 4.316689069564123e-06, + "loss": 0.5193, + "step": 3135 + }, + { + "epoch": 1.48274231678487, + "grad_norm": 2.569833993911743, + "learning_rate": 4.316260455894364e-06, + "loss": 0.543, + "step": 3136 + }, + { + "epoch": 1.4832151300236407, + "grad_norm": 2.5137455463409424, + "learning_rate": 4.315831729134031e-06, + "loss": 0.5415, + "step": 3137 + }, + { + "epoch": 1.4836879432624113, + "grad_norm": 2.5582292079925537, + "learning_rate": 4.3154028893098176e-06, + "loss": 0.5338, + "step": 3138 + }, + { + "epoch": 1.484160756501182, + "grad_norm": 2.666426181793213, + "learning_rate": 4.3149739364484265e-06, + "loss": 0.5435, + "step": 3139 + }, + { + "epoch": 1.4846335697399526, + "grad_norm": 2.790851354598999, + "learning_rate": 4.314544870576568e-06, + "loss": 0.5746, + "step": 3140 + }, + { + "epoch": 1.4851063829787234, + "grad_norm": 2.620326042175293, + "learning_rate": 4.314115691720956e-06, + "loss": 0.5076, + "step": 3141 + }, + { + "epoch": 1.485579196217494, + "grad_norm": 3.075674533843994, + "learning_rate": 4.313686399908314e-06, + "loss": 0.5486, + "step": 3142 + }, + { + "epoch": 1.4860520094562648, + "grad_norm": 3.1347315311431885, + "learning_rate": 4.3132569951653745e-06, + "loss": 0.531, + "step": 3143 + }, + { + "epoch": 1.4865248226950354, + "grad_norm": 2.5783653259277344, + "learning_rate": 4.312827477518871e-06, + "loss": 0.5818, + "step": 3144 + }, + { + "epoch": 1.4869976359338062, + "grad_norm": 3.0247137546539307, + "learning_rate": 4.3123978469955505e-06, + "loss": 0.5347, + "step": 3145 + }, + { + "epoch": 1.4874704491725768, + "grad_norm": 2.4789345264434814, + "learning_rate": 4.311968103622163e-06, + "loss": 0.5, + "step": 3146 + }, + { + "epoch": 1.4879432624113476, + "grad_norm": 2.663341522216797, + "learning_rate": 4.311538247425466e-06, + "loss": 0.4825, + "step": 3147 + }, + { + "epoch": 1.4884160756501181, + "grad_norm": 2.633711099624634, + "learning_rate": 4.311108278432226e-06, + "loss": 0.5244, + "step": 3148 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 2.51312518119812, + "learning_rate": 4.310678196669216e-06, + "loss": 0.513, + "step": 3149 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 2.5263755321502686, + "learning_rate": 4.310248002163214e-06, + "loss": 0.5236, + "step": 3150 + }, + { + "epoch": 1.4898345153664303, + "grad_norm": 2.559216260910034, + "learning_rate": 4.309817694941007e-06, + "loss": 0.5107, + "step": 3151 + }, + { + "epoch": 1.4903073286052009, + "grad_norm": 2.5023303031921387, + "learning_rate": 4.309387275029386e-06, + "loss": 0.4685, + "step": 3152 + }, + { + "epoch": 1.4907801418439717, + "grad_norm": 3.0314254760742188, + "learning_rate": 4.308956742455155e-06, + "loss": 0.5462, + "step": 3153 + }, + { + "epoch": 1.4912529550827422, + "grad_norm": 2.675295114517212, + "learning_rate": 4.308526097245119e-06, + "loss": 0.5398, + "step": 3154 + }, + { + "epoch": 1.491725768321513, + "grad_norm": 2.6613399982452393, + "learning_rate": 4.308095339426094e-06, + "loss": 0.5376, + "step": 3155 + }, + { + "epoch": 1.4921985815602836, + "grad_norm": 2.58937668800354, + "learning_rate": 4.307664469024899e-06, + "loss": 0.5385, + "step": 3156 + }, + { + "epoch": 1.4926713947990544, + 
"grad_norm": 2.583631992340088, + "learning_rate": 4.3072334860683655e-06, + "loss": 0.4927, + "step": 3157 + }, + { + "epoch": 1.493144208037825, + "grad_norm": 2.5889222621917725, + "learning_rate": 4.306802390583327e-06, + "loss": 0.47, + "step": 3158 + }, + { + "epoch": 1.4936170212765958, + "grad_norm": 2.9362716674804688, + "learning_rate": 4.3063711825966244e-06, + "loss": 0.4902, + "step": 3159 + }, + { + "epoch": 1.4940898345153664, + "grad_norm": 2.5385425090789795, + "learning_rate": 4.305939862135111e-06, + "loss": 0.5396, + "step": 3160 + }, + { + "epoch": 1.4945626477541372, + "grad_norm": 2.776326894760132, + "learning_rate": 4.305508429225641e-06, + "loss": 0.5169, + "step": 3161 + }, + { + "epoch": 1.4950354609929077, + "grad_norm": 2.575063467025757, + "learning_rate": 4.305076883895076e-06, + "loss": 0.4938, + "step": 3162 + }, + { + "epoch": 1.4955082742316785, + "grad_norm": 2.7552313804626465, + "learning_rate": 4.304645226170291e-06, + "loss": 0.6211, + "step": 3163 + }, + { + "epoch": 1.4959810874704491, + "grad_norm": 2.57149338722229, + "learning_rate": 4.30421345607816e-06, + "loss": 0.5241, + "step": 3164 + }, + { + "epoch": 1.49645390070922, + "grad_norm": 2.8142426013946533, + "learning_rate": 4.303781573645568e-06, + "loss": 0.5699, + "step": 3165 + }, + { + "epoch": 1.4969267139479905, + "grad_norm": 2.6344845294952393, + "learning_rate": 4.303349578899407e-06, + "loss": 0.5049, + "step": 3166 + }, + { + "epoch": 1.4973995271867613, + "grad_norm": 2.554410934448242, + "learning_rate": 4.302917471866575e-06, + "loss": 0.4404, + "step": 3167 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 2.896240711212158, + "learning_rate": 4.302485252573978e-06, + "loss": 0.602, + "step": 3168 + }, + { + "epoch": 1.4983451536643027, + "grad_norm": 2.4044477939605713, + "learning_rate": 4.302052921048527e-06, + "loss": 0.4857, + "step": 3169 + }, + { + "epoch": 1.4988179669030732, + "grad_norm": 2.7447879314422607, + "learning_rate": 4.301620477317144e-06, + "loss": 0.5438, + "step": 3170 + }, + { + "epoch": 1.499290780141844, + "grad_norm": 2.851820945739746, + "learning_rate": 4.301187921406752e-06, + "loss": 0.5245, + "step": 3171 + }, + { + "epoch": 1.4997635933806146, + "grad_norm": 3.247114419937134, + "learning_rate": 4.300755253344287e-06, + "loss": 0.504, + "step": 3172 + }, + { + "epoch": 1.5002364066193854, + "grad_norm": 3.117490291595459, + "learning_rate": 4.300322473156688e-06, + "loss": 0.4627, + "step": 3173 + }, + { + "epoch": 1.500709219858156, + "grad_norm": 2.558319330215454, + "learning_rate": 4.299889580870904e-06, + "loss": 0.5721, + "step": 3174 + }, + { + "epoch": 1.5011820330969265, + "grad_norm": 2.8983113765716553, + "learning_rate": 4.2994565765138865e-06, + "loss": 0.5257, + "step": 3175 + }, + { + "epoch": 1.5016548463356973, + "grad_norm": 2.744056463241577, + "learning_rate": 4.299023460112599e-06, + "loss": 0.4892, + "step": 3176 + }, + { + "epoch": 1.5021276595744681, + "grad_norm": 2.5506751537323, + "learning_rate": 4.29859023169401e-06, + "loss": 0.4933, + "step": 3177 + }, + { + "epoch": 1.5026004728132387, + "grad_norm": 2.842615842819214, + "learning_rate": 4.298156891285092e-06, + "loss": 0.6124, + "step": 3178 + }, + { + "epoch": 1.5030732860520093, + "grad_norm": 2.5355329513549805, + "learning_rate": 4.2977234389128305e-06, + "loss": 0.641, + "step": 3179 + }, + { + "epoch": 1.50354609929078, + "grad_norm": 2.674781084060669, + "learning_rate": 4.297289874604213e-06, + "loss": 0.475, + "step": 3180 + }, + { + "epoch": 
1.5040189125295509, + "grad_norm": 2.6845548152923584, + "learning_rate": 4.296856198386235e-06, + "loss": 0.5328, + "step": 3181 + }, + { + "epoch": 1.5044917257683215, + "grad_norm": 2.9686241149902344, + "learning_rate": 4.296422410285902e-06, + "loss": 0.6216, + "step": 3182 + }, + { + "epoch": 1.504964539007092, + "grad_norm": 2.5095980167388916, + "learning_rate": 4.295988510330222e-06, + "loss": 0.4993, + "step": 3183 + }, + { + "epoch": 1.5054373522458628, + "grad_norm": 2.4906392097473145, + "learning_rate": 4.2955544985462125e-06, + "loss": 0.4795, + "step": 3184 + }, + { + "epoch": 1.5059101654846336, + "grad_norm": 2.5593366622924805, + "learning_rate": 4.295120374960897e-06, + "loss": 0.5527, + "step": 3185 + }, + { + "epoch": 1.5063829787234042, + "grad_norm": 2.691495180130005, + "learning_rate": 4.294686139601308e-06, + "loss": 0.5646, + "step": 3186 + }, + { + "epoch": 1.5068557919621748, + "grad_norm": 2.74320387840271, + "learning_rate": 4.294251792494483e-06, + "loss": 0.6149, + "step": 3187 + }, + { + "epoch": 1.5073286052009456, + "grad_norm": 2.8827052116394043, + "learning_rate": 4.293817333667465e-06, + "loss": 0.5414, + "step": 3188 + }, + { + "epoch": 1.5078014184397164, + "grad_norm": 2.5652425289154053, + "learning_rate": 4.293382763147308e-06, + "loss": 0.5006, + "step": 3189 + }, + { + "epoch": 1.508274231678487, + "grad_norm": 2.729295253753662, + "learning_rate": 4.29294808096107e-06, + "loss": 0.522, + "step": 3190 + }, + { + "epoch": 1.5087470449172575, + "grad_norm": 2.348118305206299, + "learning_rate": 4.292513287135817e-06, + "loss": 0.4125, + "step": 3191 + }, + { + "epoch": 1.5092198581560283, + "grad_norm": 2.809551954269409, + "learning_rate": 4.292078381698621e-06, + "loss": 0.5577, + "step": 3192 + }, + { + "epoch": 1.5096926713947991, + "grad_norm": 2.6925361156463623, + "learning_rate": 4.291643364676563e-06, + "loss": 0.62, + "step": 3193 + }, + { + "epoch": 1.5101654846335697, + "grad_norm": 2.4200620651245117, + "learning_rate": 4.291208236096729e-06, + "loss": 0.5464, + "step": 3194 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 2.5659191608428955, + "learning_rate": 4.290772995986211e-06, + "loss": 0.5402, + "step": 3195 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 2.3877315521240234, + "learning_rate": 4.290337644372113e-06, + "loss": 0.463, + "step": 3196 + }, + { + "epoch": 1.5115839243498819, + "grad_norm": 2.7063233852386475, + "learning_rate": 4.289902181281538e-06, + "loss": 0.5253, + "step": 3197 + }, + { + "epoch": 1.5120567375886524, + "grad_norm": 2.56788969039917, + "learning_rate": 4.289466606741603e-06, + "loss": 0.5012, + "step": 3198 + }, + { + "epoch": 1.512529550827423, + "grad_norm": 2.637164831161499, + "learning_rate": 4.28903092077943e-06, + "loss": 0.5236, + "step": 3199 + }, + { + "epoch": 1.5130023640661938, + "grad_norm": 2.767526865005493, + "learning_rate": 4.288595123422146e-06, + "loss": 0.5832, + "step": 3200 + }, + { + "epoch": 1.5134751773049646, + "grad_norm": 2.33365535736084, + "learning_rate": 4.2881592146968866e-06, + "loss": 0.4548, + "step": 3201 + }, + { + "epoch": 1.5139479905437352, + "grad_norm": 2.544189453125, + "learning_rate": 4.287723194630793e-06, + "loss": 0.5115, + "step": 3202 + }, + { + "epoch": 1.5144208037825058, + "grad_norm": 2.588793992996216, + "learning_rate": 4.2872870632510155e-06, + "loss": 0.4766, + "step": 3203 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 2.5382184982299805, + "learning_rate": 4.286850820584709e-06, + "loss": 0.5401, + "step": 
3204 + }, + { + "epoch": 1.5153664302600474, + "grad_norm": 2.597930669784546, + "learning_rate": 4.286414466659038e-06, + "loss": 0.5346, + "step": 3205 + }, + { + "epoch": 1.515839243498818, + "grad_norm": 2.8522393703460693, + "learning_rate": 4.28597800150117e-06, + "loss": 0.486, + "step": 3206 + }, + { + "epoch": 1.5163120567375885, + "grad_norm": 2.4801454544067383, + "learning_rate": 4.285541425138285e-06, + "loss": 0.5162, + "step": 3207 + }, + { + "epoch": 1.5167848699763593, + "grad_norm": 2.353665351867676, + "learning_rate": 4.285104737597563e-06, + "loss": 0.5066, + "step": 3208 + }, + { + "epoch": 1.51725768321513, + "grad_norm": 2.767976760864258, + "learning_rate": 4.2846679389061975e-06, + "loss": 0.5331, + "step": 3209 + }, + { + "epoch": 1.5177304964539007, + "grad_norm": 2.9307682514190674, + "learning_rate": 4.284231029091385e-06, + "loss": 0.5291, + "step": 3210 + }, + { + "epoch": 1.5182033096926713, + "grad_norm": 2.39719820022583, + "learning_rate": 4.283794008180329e-06, + "loss": 0.4759, + "step": 3211 + }, + { + "epoch": 1.518676122931442, + "grad_norm": 2.452244758605957, + "learning_rate": 4.283356876200242e-06, + "loss": 0.4283, + "step": 3212 + }, + { + "epoch": 1.5191489361702128, + "grad_norm": 2.4911608695983887, + "learning_rate": 4.282919633178343e-06, + "loss": 0.4812, + "step": 3213 + }, + { + "epoch": 1.5196217494089834, + "grad_norm": 2.5813944339752197, + "learning_rate": 4.282482279141856e-06, + "loss": 0.4911, + "step": 3214 + }, + { + "epoch": 1.520094562647754, + "grad_norm": 2.503542184829712, + "learning_rate": 4.282044814118013e-06, + "loss": 0.4969, + "step": 3215 + }, + { + "epoch": 1.5205673758865248, + "grad_norm": 2.5090713500976562, + "learning_rate": 4.281607238134053e-06, + "loss": 0.5293, + "step": 3216 + }, + { + "epoch": 1.5210401891252956, + "grad_norm": 2.425994396209717, + "learning_rate": 4.281169551217223e-06, + "loss": 0.5365, + "step": 3217 + }, + { + "epoch": 1.5215130023640662, + "grad_norm": 2.637655258178711, + "learning_rate": 4.2807317533947765e-06, + "loss": 0.5589, + "step": 3218 + }, + { + "epoch": 1.5219858156028367, + "grad_norm": 2.9335296154022217, + "learning_rate": 4.28029384469397e-06, + "loss": 0.6071, + "step": 3219 + }, + { + "epoch": 1.5224586288416075, + "grad_norm": 2.898683547973633, + "learning_rate": 4.279855825142073e-06, + "loss": 0.5392, + "step": 3220 + }, + { + "epoch": 1.5229314420803783, + "grad_norm": 2.613914966583252, + "learning_rate": 4.279417694766359e-06, + "loss": 0.4968, + "step": 3221 + }, + { + "epoch": 1.523404255319149, + "grad_norm": 2.500682830810547, + "learning_rate": 4.278979453594106e-06, + "loss": 0.471, + "step": 3222 + }, + { + "epoch": 1.5238770685579195, + "grad_norm": 2.5269598960876465, + "learning_rate": 4.278541101652605e-06, + "loss": 0.471, + "step": 3223 + }, + { + "epoch": 1.5243498817966903, + "grad_norm": 2.8153114318847656, + "learning_rate": 4.2781026389691465e-06, + "loss": 0.5742, + "step": 3224 + }, + { + "epoch": 1.524822695035461, + "grad_norm": 2.5648019313812256, + "learning_rate": 4.277664065571034e-06, + "loss": 0.5315, + "step": 3225 + }, + { + "epoch": 1.5252955082742317, + "grad_norm": 2.778355836868286, + "learning_rate": 4.277225381485575e-06, + "loss": 0.5543, + "step": 3226 + }, + { + "epoch": 1.5257683215130022, + "grad_norm": 2.6736745834350586, + "learning_rate": 4.2767865867400846e-06, + "loss": 0.4947, + "step": 3227 + }, + { + "epoch": 1.526241134751773, + "grad_norm": 2.9560294151306152, + "learning_rate": 4.276347681361884e-06, + 
"loss": 0.5835, + "step": 3228 + }, + { + "epoch": 1.5267139479905438, + "grad_norm": 2.5580296516418457, + "learning_rate": 4.275908665378302e-06, + "loss": 0.4751, + "step": 3229 + }, + { + "epoch": 1.5271867612293144, + "grad_norm": 3.0705175399780273, + "learning_rate": 4.2754695388166755e-06, + "loss": 0.5327, + "step": 3230 + }, + { + "epoch": 1.527659574468085, + "grad_norm": 2.664652109146118, + "learning_rate": 4.275030301704346e-06, + "loss": 0.4934, + "step": 3231 + }, + { + "epoch": 1.5281323877068558, + "grad_norm": 2.308499813079834, + "learning_rate": 4.274590954068663e-06, + "loss": 0.4412, + "step": 3232 + }, + { + "epoch": 1.5286052009456266, + "grad_norm": 2.871189594268799, + "learning_rate": 4.2741514959369815e-06, + "loss": 0.5001, + "step": 3233 + }, + { + "epoch": 1.5290780141843971, + "grad_norm": 2.5274453163146973, + "learning_rate": 4.273711927336666e-06, + "loss": 0.4938, + "step": 3234 + }, + { + "epoch": 1.5295508274231677, + "grad_norm": 2.8848133087158203, + "learning_rate": 4.273272248295087e-06, + "loss": 0.5397, + "step": 3235 + }, + { + "epoch": 1.5300236406619385, + "grad_norm": 2.3927090167999268, + "learning_rate": 4.27283245883962e-06, + "loss": 0.5497, + "step": 3236 + }, + { + "epoch": 1.5304964539007093, + "grad_norm": 2.5413873195648193, + "learning_rate": 4.27239255899765e-06, + "loss": 0.5108, + "step": 3237 + }, + { + "epoch": 1.53096926713948, + "grad_norm": 2.7692389488220215, + "learning_rate": 4.271952548796567e-06, + "loss": 0.5768, + "step": 3238 + }, + { + "epoch": 1.5314420803782505, + "grad_norm": 2.4621126651763916, + "learning_rate": 4.271512428263768e-06, + "loss": 0.4698, + "step": 3239 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 2.6423375606536865, + "learning_rate": 4.271072197426659e-06, + "loss": 0.4929, + "step": 3240 + }, + { + "epoch": 1.532387706855792, + "grad_norm": 2.7097692489624023, + "learning_rate": 4.270631856312649e-06, + "loss": 0.4836, + "step": 3241 + }, + { + "epoch": 1.5328605200945626, + "grad_norm": 2.545706272125244, + "learning_rate": 4.270191404949158e-06, + "loss": 0.4636, + "step": 3242 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 3.138781785964966, + "learning_rate": 4.26975084336361e-06, + "loss": 0.5988, + "step": 3243 + }, + { + "epoch": 1.533806146572104, + "grad_norm": 2.492715835571289, + "learning_rate": 4.269310171583438e-06, + "loss": 0.5095, + "step": 3244 + }, + { + "epoch": 1.5342789598108748, + "grad_norm": 2.5705838203430176, + "learning_rate": 4.268869389636077e-06, + "loss": 0.4818, + "step": 3245 + }, + { + "epoch": 1.5347517730496454, + "grad_norm": 2.7633554935455322, + "learning_rate": 4.268428497548979e-06, + "loss": 0.547, + "step": 3246 + }, + { + "epoch": 1.535224586288416, + "grad_norm": 2.654528856277466, + "learning_rate": 4.2679874953495905e-06, + "loss": 0.5261, + "step": 3247 + }, + { + "epoch": 1.5356973995271868, + "grad_norm": 2.5039751529693604, + "learning_rate": 4.2675463830653744e-06, + "loss": 0.4941, + "step": 3248 + }, + { + "epoch": 1.5361702127659576, + "grad_norm": 2.897268295288086, + "learning_rate": 4.267105160723794e-06, + "loss": 0.5404, + "step": 3249 + }, + { + "epoch": 1.5366430260047281, + "grad_norm": 2.500732421875, + "learning_rate": 4.266663828352324e-06, + "loss": 0.5375, + "step": 3250 + }, + { + "epoch": 1.5371158392434987, + "grad_norm": 2.6310064792633057, + "learning_rate": 4.266222385978444e-06, + "loss": 0.5217, + "step": 3251 + }, + { + "epoch": 1.5375886524822695, + "grad_norm": 2.7440476417541504, + 
"learning_rate": 4.265780833629642e-06, + "loss": 0.5419, + "step": 3252 + }, + { + "epoch": 1.5380614657210403, + "grad_norm": 2.7037577629089355, + "learning_rate": 4.2653391713334095e-06, + "loss": 0.5634, + "step": 3253 + }, + { + "epoch": 1.5385342789598109, + "grad_norm": 2.548525810241699, + "learning_rate": 4.264897399117248e-06, + "loss": 0.535, + "step": 3254 + }, + { + "epoch": 1.5390070921985815, + "grad_norm": 2.6127355098724365, + "learning_rate": 4.264455517008663e-06, + "loss": 0.4619, + "step": 3255 + }, + { + "epoch": 1.5394799054373522, + "grad_norm": 2.5597004890441895, + "learning_rate": 4.264013525035171e-06, + "loss": 0.4477, + "step": 3256 + }, + { + "epoch": 1.539952718676123, + "grad_norm": 2.642432689666748, + "learning_rate": 4.263571423224292e-06, + "loss": 0.4749, + "step": 3257 + }, + { + "epoch": 1.5404255319148936, + "grad_norm": 2.5121877193450928, + "learning_rate": 4.2631292116035526e-06, + "loss": 0.4693, + "step": 3258 + }, + { + "epoch": 1.5408983451536642, + "grad_norm": 2.390292167663574, + "learning_rate": 4.262686890200489e-06, + "loss": 0.4872, + "step": 3259 + }, + { + "epoch": 1.541371158392435, + "grad_norm": 2.5898337364196777, + "learning_rate": 4.2622444590426405e-06, + "loss": 0.5193, + "step": 3260 + }, + { + "epoch": 1.5418439716312058, + "grad_norm": 2.508821487426758, + "learning_rate": 4.261801918157558e-06, + "loss": 0.511, + "step": 3261 + }, + { + "epoch": 1.5423167848699764, + "grad_norm": 2.6992101669311523, + "learning_rate": 4.261359267572795e-06, + "loss": 0.5069, + "step": 3262 + }, + { + "epoch": 1.542789598108747, + "grad_norm": 2.6011030673980713, + "learning_rate": 4.2609165073159145e-06, + "loss": 0.5887, + "step": 3263 + }, + { + "epoch": 1.5432624113475177, + "grad_norm": 2.887053966522217, + "learning_rate": 4.260473637414483e-06, + "loss": 0.5556, + "step": 3264 + }, + { + "epoch": 1.5437352245862885, + "grad_norm": 2.6433887481689453, + "learning_rate": 4.260030657896079e-06, + "loss": 0.4728, + "step": 3265 + }, + { + "epoch": 1.544208037825059, + "grad_norm": 2.6134607791900635, + "learning_rate": 4.259587568788282e-06, + "loss": 0.483, + "step": 3266 + }, + { + "epoch": 1.5446808510638297, + "grad_norm": 2.5308640003204346, + "learning_rate": 4.259144370118684e-06, + "loss": 0.5115, + "step": 3267 + }, + { + "epoch": 1.5451536643026005, + "grad_norm": 2.8256733417510986, + "learning_rate": 4.258701061914879e-06, + "loss": 0.5414, + "step": 3268 + }, + { + "epoch": 1.5456264775413713, + "grad_norm": 2.8648319244384766, + "learning_rate": 4.258257644204471e-06, + "loss": 0.5695, + "step": 3269 + }, + { + "epoch": 1.5460992907801419, + "grad_norm": 2.8568081855773926, + "learning_rate": 4.257814117015069e-06, + "loss": 0.5264, + "step": 3270 + }, + { + "epoch": 1.5465721040189124, + "grad_norm": 2.6065011024475098, + "learning_rate": 4.257370480374289e-06, + "loss": 0.5646, + "step": 3271 + }, + { + "epoch": 1.5470449172576832, + "grad_norm": 2.7840216159820557, + "learning_rate": 4.256926734309756e-06, + "loss": 0.5191, + "step": 3272 + }, + { + "epoch": 1.547517730496454, + "grad_norm": 2.85906982421875, + "learning_rate": 4.256482878849099e-06, + "loss": 0.5911, + "step": 3273 + }, + { + "epoch": 1.5479905437352246, + "grad_norm": 2.916029930114746, + "learning_rate": 4.256038914019954e-06, + "loss": 0.5589, + "step": 3274 + }, + { + "epoch": 1.5484633569739952, + "grad_norm": 2.6748716831207275, + "learning_rate": 4.255594839849967e-06, + "loss": 0.5323, + "step": 3275 + }, + { + "epoch": 1.548936170212766, + 
"grad_norm": 2.717212200164795, + "learning_rate": 4.255150656366787e-06, + "loss": 0.453, + "step": 3276 + }, + { + "epoch": 1.5494089834515368, + "grad_norm": 2.4974849224090576, + "learning_rate": 4.254706363598072e-06, + "loss": 0.4516, + "step": 3277 + }, + { + "epoch": 1.5498817966903073, + "grad_norm": 2.648151397705078, + "learning_rate": 4.254261961571485e-06, + "loss": 0.5452, + "step": 3278 + }, + { + "epoch": 1.550354609929078, + "grad_norm": 2.932905435562134, + "learning_rate": 4.253817450314699e-06, + "loss": 0.4813, + "step": 3279 + }, + { + "epoch": 1.5508274231678487, + "grad_norm": 2.862912178039551, + "learning_rate": 4.25337282985539e-06, + "loss": 0.5689, + "step": 3280 + }, + { + "epoch": 1.5513002364066195, + "grad_norm": 2.532156467437744, + "learning_rate": 4.2529281002212436e-06, + "loss": 0.485, + "step": 3281 + }, + { + "epoch": 1.55177304964539, + "grad_norm": 2.583299160003662, + "learning_rate": 4.25248326143995e-06, + "loss": 0.4661, + "step": 3282 + }, + { + "epoch": 1.5522458628841607, + "grad_norm": 2.5790653228759766, + "learning_rate": 4.252038313539209e-06, + "loss": 0.5455, + "step": 3283 + }, + { + "epoch": 1.5527186761229315, + "grad_norm": 2.872864007949829, + "learning_rate": 4.251593256546724e-06, + "loss": 0.5317, + "step": 3284 + }, + { + "epoch": 1.5531914893617023, + "grad_norm": 3.0382463932037354, + "learning_rate": 4.251148090490208e-06, + "loss": 0.5131, + "step": 3285 + }, + { + "epoch": 1.5536643026004728, + "grad_norm": 2.574399709701538, + "learning_rate": 4.250702815397379e-06, + "loss": 0.5399, + "step": 3286 + }, + { + "epoch": 1.5541371158392434, + "grad_norm": 2.9784770011901855, + "learning_rate": 4.250257431295962e-06, + "loss": 0.5209, + "step": 3287 + }, + { + "epoch": 1.5546099290780142, + "grad_norm": 2.6482062339782715, + "learning_rate": 4.249811938213689e-06, + "loss": 0.5416, + "step": 3288 + }, + { + "epoch": 1.555082742316785, + "grad_norm": 2.82142972946167, + "learning_rate": 4.2493663361783e-06, + "loss": 0.594, + "step": 3289 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 2.815595865249634, + "learning_rate": 4.24892062521754e-06, + "loss": 0.5381, + "step": 3290 + }, + { + "epoch": 1.5560283687943262, + "grad_norm": 2.689764976501465, + "learning_rate": 4.248474805359161e-06, + "loss": 0.5141, + "step": 3291 + }, + { + "epoch": 1.556501182033097, + "grad_norm": 2.7718515396118164, + "learning_rate": 4.248028876630922e-06, + "loss": 0.5324, + "step": 3292 + }, + { + "epoch": 1.5569739952718678, + "grad_norm": 3.0196774005889893, + "learning_rate": 4.247582839060591e-06, + "loss": 0.4971, + "step": 3293 + }, + { + "epoch": 1.5574468085106383, + "grad_norm": 2.608475923538208, + "learning_rate": 4.247136692675939e-06, + "loss": 0.5795, + "step": 3294 + }, + { + "epoch": 1.557919621749409, + "grad_norm": 2.4912326335906982, + "learning_rate": 4.246690437504746e-06, + "loss": 0.5348, + "step": 3295 + }, + { + "epoch": 1.5583924349881797, + "grad_norm": 2.519303560256958, + "learning_rate": 4.246244073574799e-06, + "loss": 0.4953, + "step": 3296 + }, + { + "epoch": 1.5588652482269505, + "grad_norm": 2.5667171478271484, + "learning_rate": 4.24579760091389e-06, + "loss": 0.5353, + "step": 3297 + }, + { + "epoch": 1.559338061465721, + "grad_norm": 2.8835761547088623, + "learning_rate": 4.24535101954982e-06, + "loss": 0.578, + "step": 3298 + }, + { + "epoch": 1.5598108747044916, + "grad_norm": 3.0506930351257324, + "learning_rate": 4.244904329510395e-06, + "loss": 0.6418, + "step": 3299 + }, + { + "epoch": 
1.5602836879432624, + "grad_norm": 2.579446315765381, + "learning_rate": 4.244457530823428e-06, + "loss": 0.5027, + "step": 3300 + }, + { + "epoch": 1.5607565011820332, + "grad_norm": 2.72012996673584, + "learning_rate": 4.24401062351674e-06, + "loss": 0.5438, + "step": 3301 + }, + { + "epoch": 1.5612293144208038, + "grad_norm": 2.527007818222046, + "learning_rate": 4.243563607618158e-06, + "loss": 0.5303, + "step": 3302 + }, + { + "epoch": 1.5617021276595744, + "grad_norm": 2.4415159225463867, + "learning_rate": 4.243116483155516e-06, + "loss": 0.4893, + "step": 3303 + }, + { + "epoch": 1.5621749408983452, + "grad_norm": 2.462256669998169, + "learning_rate": 4.242669250156653e-06, + "loss": 0.5671, + "step": 3304 + }, + { + "epoch": 1.562647754137116, + "grad_norm": 2.479865074157715, + "learning_rate": 4.242221908649418e-06, + "loss": 0.5038, + "step": 3305 + }, + { + "epoch": 1.5631205673758866, + "grad_norm": 2.74670672416687, + "learning_rate": 4.241774458661662e-06, + "loss": 0.5689, + "step": 3306 + }, + { + "epoch": 1.5635933806146571, + "grad_norm": 2.55938982963562, + "learning_rate": 4.24132690022125e-06, + "loss": 0.492, + "step": 3307 + }, + { + "epoch": 1.564066193853428, + "grad_norm": 2.634956121444702, + "learning_rate": 4.240879233356048e-06, + "loss": 0.503, + "step": 3308 + }, + { + "epoch": 1.5645390070921987, + "grad_norm": 2.381775140762329, + "learning_rate": 4.240431458093928e-06, + "loss": 0.4939, + "step": 3309 + }, + { + "epoch": 1.5650118203309693, + "grad_norm": 2.8176610469818115, + "learning_rate": 4.239983574462774e-06, + "loss": 0.5609, + "step": 3310 + }, + { + "epoch": 1.5654846335697399, + "grad_norm": 3.0268442630767822, + "learning_rate": 4.239535582490471e-06, + "loss": 0.5427, + "step": 3311 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 2.5881481170654297, + "learning_rate": 4.239087482204916e-06, + "loss": 0.5538, + "step": 3312 + }, + { + "epoch": 1.5664302600472815, + "grad_norm": 2.5317704677581787, + "learning_rate": 4.238639273634008e-06, + "loss": 0.4915, + "step": 3313 + }, + { + "epoch": 1.566903073286052, + "grad_norm": 2.9608731269836426, + "learning_rate": 4.238190956805658e-06, + "loss": 0.564, + "step": 3314 + }, + { + "epoch": 1.5673758865248226, + "grad_norm": 3.022686243057251, + "learning_rate": 4.237742531747777e-06, + "loss": 0.5503, + "step": 3315 + }, + { + "epoch": 1.5678486997635934, + "grad_norm": 2.763622283935547, + "learning_rate": 4.23729399848829e-06, + "loss": 0.5241, + "step": 3316 + }, + { + "epoch": 1.5683215130023642, + "grad_norm": 2.6112794876098633, + "learning_rate": 4.236845357055122e-06, + "loss": 0.4919, + "step": 3317 + }, + { + "epoch": 1.5687943262411348, + "grad_norm": 2.649829149246216, + "learning_rate": 4.23639660747621e-06, + "loss": 0.5472, + "step": 3318 + }, + { + "epoch": 1.5692671394799054, + "grad_norm": 2.8888115882873535, + "learning_rate": 4.2359477497794955e-06, + "loss": 0.5077, + "step": 3319 + }, + { + "epoch": 1.5697399527186762, + "grad_norm": 2.5666911602020264, + "learning_rate": 4.235498783992927e-06, + "loss": 0.5365, + "step": 3320 + }, + { + "epoch": 1.570212765957447, + "grad_norm": 2.448758363723755, + "learning_rate": 4.2350497101444575e-06, + "loss": 0.5043, + "step": 3321 + }, + { + "epoch": 1.5706855791962175, + "grad_norm": 2.595207691192627, + "learning_rate": 4.234600528262052e-06, + "loss": 0.5303, + "step": 3322 + }, + { + "epoch": 1.5711583924349881, + "grad_norm": 2.7814228534698486, + "learning_rate": 4.234151238373676e-06, + "loss": 0.4521, + "step": 3323 
+ }, + { + "epoch": 1.571631205673759, + "grad_norm": 2.781538724899292, + "learning_rate": 4.233701840507308e-06, + "loss": 0.5193, + "step": 3324 + }, + { + "epoch": 1.5721040189125297, + "grad_norm": 2.771907329559326, + "learning_rate": 4.233252334690928e-06, + "loss": 0.497, + "step": 3325 + }, + { + "epoch": 1.5725768321513003, + "grad_norm": 2.5557498931884766, + "learning_rate": 4.232802720952525e-06, + "loss": 0.4913, + "step": 3326 + }, + { + "epoch": 1.5730496453900709, + "grad_norm": 2.478267192840576, + "learning_rate": 4.232352999320094e-06, + "loss": 0.4967, + "step": 3327 + }, + { + "epoch": 1.5735224586288417, + "grad_norm": 3.1548502445220947, + "learning_rate": 4.231903169821639e-06, + "loss": 0.5009, + "step": 3328 + }, + { + "epoch": 1.5739952718676125, + "grad_norm": 2.634824275970459, + "learning_rate": 4.231453232485168e-06, + "loss": 0.5223, + "step": 3329 + }, + { + "epoch": 1.574468085106383, + "grad_norm": 2.579102039337158, + "learning_rate": 4.231003187338695e-06, + "loss": 0.5513, + "step": 3330 + }, + { + "epoch": 1.5749408983451536, + "grad_norm": 2.8477070331573486, + "learning_rate": 4.230553034410245e-06, + "loss": 0.561, + "step": 3331 + }, + { + "epoch": 1.5754137115839244, + "grad_norm": 2.6714725494384766, + "learning_rate": 4.2301027737278446e-06, + "loss": 0.4687, + "step": 3332 + }, + { + "epoch": 1.5758865248226952, + "grad_norm": 2.6562764644622803, + "learning_rate": 4.229652405319532e-06, + "loss": 0.5925, + "step": 3333 + }, + { + "epoch": 1.5763593380614658, + "grad_norm": 2.750946283340454, + "learning_rate": 4.229201929213348e-06, + "loss": 0.4748, + "step": 3334 + }, + { + "epoch": 1.5768321513002364, + "grad_norm": 2.760470151901245, + "learning_rate": 4.228751345437342e-06, + "loss": 0.5989, + "step": 3335 + }, + { + "epoch": 1.5773049645390071, + "grad_norm": 3.1451845169067383, + "learning_rate": 4.2283006540195706e-06, + "loss": 0.562, + "step": 3336 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 2.563011407852173, + "learning_rate": 4.227849854988095e-06, + "loss": 0.5473, + "step": 3337 + }, + { + "epoch": 1.5782505910165483, + "grad_norm": 2.310469388961792, + "learning_rate": 4.2273989483709856e-06, + "loss": 0.5033, + "step": 3338 + }, + { + "epoch": 1.578723404255319, + "grad_norm": 2.677978754043579, + "learning_rate": 4.226947934196318e-06, + "loss": 0.5291, + "step": 3339 + }, + { + "epoch": 1.57919621749409, + "grad_norm": 3.0423545837402344, + "learning_rate": 4.226496812492176e-06, + "loss": 0.5201, + "step": 3340 + }, + { + "epoch": 1.5796690307328605, + "grad_norm": 2.357513904571533, + "learning_rate": 4.226045583286647e-06, + "loss": 0.4421, + "step": 3341 + }, + { + "epoch": 1.580141843971631, + "grad_norm": 2.719860315322876, + "learning_rate": 4.225594246607828e-06, + "loss": 0.4855, + "step": 3342 + }, + { + "epoch": 1.5806146572104018, + "grad_norm": 3.2645058631896973, + "learning_rate": 4.2251428024838215e-06, + "loss": 0.6654, + "step": 3343 + }, + { + "epoch": 1.5810874704491726, + "grad_norm": 2.2997004985809326, + "learning_rate": 4.224691250942737e-06, + "loss": 0.4565, + "step": 3344 + }, + { + "epoch": 1.5815602836879432, + "grad_norm": 2.8103034496307373, + "learning_rate": 4.2242395920126926e-06, + "loss": 0.5543, + "step": 3345 + }, + { + "epoch": 1.5820330969267138, + "grad_norm": 2.720254898071289, + "learning_rate": 4.223787825721808e-06, + "loss": 0.5028, + "step": 3346 + }, + { + "epoch": 1.5825059101654846, + "grad_norm": 2.735544204711914, + "learning_rate": 4.223335952098214e-06, + 
"loss": 0.5169, + "step": 3347 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 2.784254550933838, + "learning_rate": 4.222883971170047e-06, + "loss": 0.4989, + "step": 3348 + }, + { + "epoch": 1.583451536643026, + "grad_norm": 2.7192094326019287, + "learning_rate": 4.22243188296545e-06, + "loss": 0.502, + "step": 3349 + }, + { + "epoch": 1.5839243498817965, + "grad_norm": 2.716501474380493, + "learning_rate": 4.221979687512573e-06, + "loss": 0.5687, + "step": 3350 + }, + { + "epoch": 1.5843971631205673, + "grad_norm": 2.8420114517211914, + "learning_rate": 4.22152738483957e-06, + "loss": 0.5903, + "step": 3351 + }, + { + "epoch": 1.5848699763593381, + "grad_norm": 2.734872579574585, + "learning_rate": 4.2210749749746065e-06, + "loss": 0.5397, + "step": 3352 + }, + { + "epoch": 1.5853427895981087, + "grad_norm": 2.4343836307525635, + "learning_rate": 4.220622457945851e-06, + "loss": 0.436, + "step": 3353 + }, + { + "epoch": 1.5858156028368793, + "grad_norm": 2.728177547454834, + "learning_rate": 4.2201698337814785e-06, + "loss": 0.5703, + "step": 3354 + }, + { + "epoch": 1.58628841607565, + "grad_norm": 2.502098560333252, + "learning_rate": 4.219717102509674e-06, + "loss": 0.5275, + "step": 3355 + }, + { + "epoch": 1.5867612293144209, + "grad_norm": 2.6595494747161865, + "learning_rate": 4.219264264158627e-06, + "loss": 0.4659, + "step": 3356 + }, + { + "epoch": 1.5872340425531914, + "grad_norm": 2.5307185649871826, + "learning_rate": 4.218811318756532e-06, + "loss": 0.5048, + "step": 3357 + }, + { + "epoch": 1.587706855791962, + "grad_norm": 2.9300129413604736, + "learning_rate": 4.218358266331593e-06, + "loss": 0.5137, + "step": 3358 + }, + { + "epoch": 1.5881796690307328, + "grad_norm": 2.686586618423462, + "learning_rate": 4.21790510691202e-06, + "loss": 0.4529, + "step": 3359 + }, + { + "epoch": 1.5886524822695036, + "grad_norm": 2.9981517791748047, + "learning_rate": 4.217451840526029e-06, + "loss": 0.6054, + "step": 3360 + }, + { + "epoch": 1.5891252955082742, + "grad_norm": 2.6943674087524414, + "learning_rate": 4.216998467201841e-06, + "loss": 0.5153, + "step": 3361 + }, + { + "epoch": 1.5895981087470448, + "grad_norm": 2.707084894180298, + "learning_rate": 4.216544986967689e-06, + "loss": 0.5235, + "step": 3362 + }, + { + "epoch": 1.5900709219858156, + "grad_norm": 2.6553728580474854, + "learning_rate": 4.216091399851808e-06, + "loss": 0.5275, + "step": 3363 + }, + { + "epoch": 1.5905437352245864, + "grad_norm": 2.9136953353881836, + "learning_rate": 4.215637705882439e-06, + "loss": 0.5834, + "step": 3364 + }, + { + "epoch": 1.591016548463357, + "grad_norm": 2.7647159099578857, + "learning_rate": 4.2151839050878325e-06, + "loss": 0.5641, + "step": 3365 + }, + { + "epoch": 1.5914893617021275, + "grad_norm": 2.4556827545166016, + "learning_rate": 4.214729997496246e-06, + "loss": 0.5636, + "step": 3366 + }, + { + "epoch": 1.5919621749408983, + "grad_norm": 2.6111652851104736, + "learning_rate": 4.2142759831359414e-06, + "loss": 0.5097, + "step": 3367 + }, + { + "epoch": 1.592434988179669, + "grad_norm": 2.4886903762817383, + "learning_rate": 4.213821862035189e-06, + "loss": 0.531, + "step": 3368 + }, + { + "epoch": 1.5929078014184397, + "grad_norm": 2.5245840549468994, + "learning_rate": 4.213367634222263e-06, + "loss": 0.5085, + "step": 3369 + }, + { + "epoch": 1.5933806146572103, + "grad_norm": 2.970214605331421, + "learning_rate": 4.212913299725447e-06, + "loss": 0.5851, + "step": 3370 + }, + { + "epoch": 1.593853427895981, + "grad_norm": 2.5433361530303955, + 
"learning_rate": 4.212458858573032e-06, + "loss": 0.48, + "step": 3371 + }, + { + "epoch": 1.5943262411347519, + "grad_norm": 2.3550102710723877, + "learning_rate": 4.212004310793312e-06, + "loss": 0.4405, + "step": 3372 + }, + { + "epoch": 1.5947990543735224, + "grad_norm": 2.4824719429016113, + "learning_rate": 4.2115496564145896e-06, + "loss": 0.4634, + "step": 3373 + }, + { + "epoch": 1.595271867612293, + "grad_norm": 2.4751930236816406, + "learning_rate": 4.211094895465176e-06, + "loss": 0.5662, + "step": 3374 + }, + { + "epoch": 1.5957446808510638, + "grad_norm": 2.4193356037139893, + "learning_rate": 4.210640027973386e-06, + "loss": 0.4441, + "step": 3375 + }, + { + "epoch": 1.5962174940898346, + "grad_norm": 2.4477498531341553, + "learning_rate": 4.210185053967543e-06, + "loss": 0.5205, + "step": 3376 + }, + { + "epoch": 1.5966903073286052, + "grad_norm": 2.7954161167144775, + "learning_rate": 4.209729973475976e-06, + "loss": 0.4951, + "step": 3377 + }, + { + "epoch": 1.5971631205673757, + "grad_norm": 3.1907570362091064, + "learning_rate": 4.209274786527019e-06, + "loss": 0.6024, + "step": 3378 + }, + { + "epoch": 1.5976359338061465, + "grad_norm": 2.485245704650879, + "learning_rate": 4.2088194931490165e-06, + "loss": 0.5652, + "step": 3379 + }, + { + "epoch": 1.5981087470449173, + "grad_norm": 2.589310884475708, + "learning_rate": 4.208364093370317e-06, + "loss": 0.5085, + "step": 3380 + }, + { + "epoch": 1.598581560283688, + "grad_norm": 2.8941214084625244, + "learning_rate": 4.207908587219276e-06, + "loss": 0.53, + "step": 3381 + }, + { + "epoch": 1.5990543735224585, + "grad_norm": 2.480509042739868, + "learning_rate": 4.207452974724258e-06, + "loss": 0.4543, + "step": 3382 + }, + { + "epoch": 1.5995271867612293, + "grad_norm": 2.7884905338287354, + "learning_rate": 4.206997255913629e-06, + "loss": 0.5483, + "step": 3383 + }, + { + "epoch": 1.6, + "grad_norm": 2.7976696491241455, + "learning_rate": 4.206541430815766e-06, + "loss": 0.4734, + "step": 3384 + }, + { + "epoch": 1.6004728132387707, + "grad_norm": 2.5463132858276367, + "learning_rate": 4.206085499459051e-06, + "loss": 0.4931, + "step": 3385 + }, + { + "epoch": 1.6009456264775412, + "grad_norm": 2.8384251594543457, + "learning_rate": 4.205629461871871e-06, + "loss": 0.5066, + "step": 3386 + }, + { + "epoch": 1.601418439716312, + "grad_norm": 2.8578574657440186, + "learning_rate": 4.205173318082626e-06, + "loss": 0.458, + "step": 3387 + }, + { + "epoch": 1.6018912529550828, + "grad_norm": 2.7779932022094727, + "learning_rate": 4.204717068119715e-06, + "loss": 0.5293, + "step": 3388 + }, + { + "epoch": 1.6023640661938534, + "grad_norm": 2.9123778343200684, + "learning_rate": 4.204260712011546e-06, + "loss": 0.4866, + "step": 3389 + }, + { + "epoch": 1.602836879432624, + "grad_norm": 2.757922887802124, + "learning_rate": 4.203804249786537e-06, + "loss": 0.4925, + "step": 3390 + }, + { + "epoch": 1.6033096926713948, + "grad_norm": 3.287733316421509, + "learning_rate": 4.203347681473107e-06, + "loss": 0.6694, + "step": 3391 + }, + { + "epoch": 1.6037825059101656, + "grad_norm": 3.2117912769317627, + "learning_rate": 4.202891007099687e-06, + "loss": 0.5269, + "step": 3392 + }, + { + "epoch": 1.6042553191489362, + "grad_norm": 2.8489456176757812, + "learning_rate": 4.20243422669471e-06, + "loss": 0.5073, + "step": 3393 + }, + { + "epoch": 1.6047281323877067, + "grad_norm": 2.7660224437713623, + "learning_rate": 4.201977340286619e-06, + "loss": 0.5014, + "step": 3394 + }, + { + "epoch": 1.6052009456264775, + "grad_norm": 
2.68182110786438, + "learning_rate": 4.201520347903862e-06, + "loss": 0.4542, + "step": 3395 + }, + { + "epoch": 1.6056737588652483, + "grad_norm": 2.7546045780181885, + "learning_rate": 4.2010632495748934e-06, + "loss": 0.516, + "step": 3396 + }, + { + "epoch": 1.606146572104019, + "grad_norm": 2.744668483734131, + "learning_rate": 4.200606045328176e-06, + "loss": 0.5243, + "step": 3397 + }, + { + "epoch": 1.6066193853427895, + "grad_norm": 2.935343027114868, + "learning_rate": 4.200148735192177e-06, + "loss": 0.5624, + "step": 3398 + }, + { + "epoch": 1.6070921985815603, + "grad_norm": 2.7392852306365967, + "learning_rate": 4.19969131919537e-06, + "loss": 0.5796, + "step": 3399 + }, + { + "epoch": 1.607565011820331, + "grad_norm": 2.864750385284424, + "learning_rate": 4.199233797366239e-06, + "loss": 0.549, + "step": 3400 + }, + { + "epoch": 1.6080378250591016, + "grad_norm": 2.684157371520996, + "learning_rate": 4.198776169733269e-06, + "loss": 0.5532, + "step": 3401 + }, + { + "epoch": 1.6085106382978722, + "grad_norm": 2.4717135429382324, + "learning_rate": 4.198318436324957e-06, + "loss": 0.5174, + "step": 3402 + }, + { + "epoch": 1.608983451536643, + "grad_norm": 2.640242338180542, + "learning_rate": 4.197860597169802e-06, + "loss": 0.5117, + "step": 3403 + }, + { + "epoch": 1.6094562647754138, + "grad_norm": 2.4957473278045654, + "learning_rate": 4.197402652296313e-06, + "loss": 0.474, + "step": 3404 + }, + { + "epoch": 1.6099290780141844, + "grad_norm": 2.416138172149658, + "learning_rate": 4.196944601733004e-06, + "loss": 0.4858, + "step": 3405 + }, + { + "epoch": 1.610401891252955, + "grad_norm": 2.4498109817504883, + "learning_rate": 4.196486445508395e-06, + "loss": 0.5048, + "step": 3406 + }, + { + "epoch": 1.6108747044917258, + "grad_norm": 2.415895938873291, + "learning_rate": 4.196028183651014e-06, + "loss": 0.4745, + "step": 3407 + }, + { + "epoch": 1.6113475177304966, + "grad_norm": 2.843665838241577, + "learning_rate": 4.195569816189395e-06, + "loss": 0.5219, + "step": 3408 + }, + { + "epoch": 1.6118203309692671, + "grad_norm": 2.608579158782959, + "learning_rate": 4.195111343152079e-06, + "loss": 0.4941, + "step": 3409 + }, + { + "epoch": 1.6122931442080377, + "grad_norm": 2.643789529800415, + "learning_rate": 4.194652764567611e-06, + "loss": 0.515, + "step": 3410 + }, + { + "epoch": 1.6127659574468085, + "grad_norm": 2.8099429607391357, + "learning_rate": 4.194194080464547e-06, + "loss": 0.4935, + "step": 3411 + }, + { + "epoch": 1.6132387706855793, + "grad_norm": 2.595628261566162, + "learning_rate": 4.193735290871446e-06, + "loss": 0.5571, + "step": 3412 + }, + { + "epoch": 1.6137115839243499, + "grad_norm": 2.7903778553009033, + "learning_rate": 4.193276395816876e-06, + "loss": 0.5228, + "step": 3413 + }, + { + "epoch": 1.6141843971631205, + "grad_norm": 2.83910870552063, + "learning_rate": 4.192817395329409e-06, + "loss": 0.6124, + "step": 3414 + }, + { + "epoch": 1.6146572104018913, + "grad_norm": 2.6155734062194824, + "learning_rate": 4.192358289437626e-06, + "loss": 0.552, + "step": 3415 + }, + { + "epoch": 1.615130023640662, + "grad_norm": 2.795832872390747, + "learning_rate": 4.191899078170113e-06, + "loss": 0.5561, + "step": 3416 + }, + { + "epoch": 1.6156028368794326, + "grad_norm": 2.3402161598205566, + "learning_rate": 4.191439761555464e-06, + "loss": 0.4889, + "step": 3417 + }, + { + "epoch": 1.6160756501182032, + "grad_norm": 3.1183433532714844, + "learning_rate": 4.190980339622276e-06, + "loss": 0.5337, + "step": 3418 + }, + { + "epoch": 
1.616548463356974, + "grad_norm": 2.6262872219085693, + "learning_rate": 4.190520812399158e-06, + "loss": 0.525, + "step": 3419 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 2.578340530395508, + "learning_rate": 4.190061179914722e-06, + "loss": 0.4975, + "step": 3420 + }, + { + "epoch": 1.6174940898345154, + "grad_norm": 3.19482159614563, + "learning_rate": 4.189601442197586e-06, + "loss": 0.5832, + "step": 3421 + }, + { + "epoch": 1.617966903073286, + "grad_norm": 2.6398792266845703, + "learning_rate": 4.189141599276378e-06, + "loss": 0.4676, + "step": 3422 + }, + { + "epoch": 1.6184397163120567, + "grad_norm": 2.624865770339966, + "learning_rate": 4.1886816511797275e-06, + "loss": 0.4507, + "step": 3423 + }, + { + "epoch": 1.6189125295508275, + "grad_norm": 2.4136857986450195, + "learning_rate": 4.1882215979362775e-06, + "loss": 0.4616, + "step": 3424 + }, + { + "epoch": 1.6193853427895981, + "grad_norm": 2.6906614303588867, + "learning_rate": 4.18776143957467e-06, + "loss": 0.5142, + "step": 3425 + }, + { + "epoch": 1.6198581560283687, + "grad_norm": 2.5149154663085938, + "learning_rate": 4.187301176123558e-06, + "loss": 0.5252, + "step": 3426 + }, + { + "epoch": 1.6203309692671395, + "grad_norm": 2.677405834197998, + "learning_rate": 4.186840807611602e-06, + "loss": 0.4635, + "step": 3427 + }, + { + "epoch": 1.6208037825059103, + "grad_norm": 2.7164649963378906, + "learning_rate": 4.186380334067464e-06, + "loss": 0.5634, + "step": 3428 + }, + { + "epoch": 1.6212765957446809, + "grad_norm": 2.8299832344055176, + "learning_rate": 4.185919755519817e-06, + "loss": 0.5166, + "step": 3429 + }, + { + "epoch": 1.6217494089834514, + "grad_norm": 2.465848207473755, + "learning_rate": 4.18545907199734e-06, + "loss": 0.4696, + "step": 3430 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 2.407616376876831, + "learning_rate": 4.1849982835287175e-06, + "loss": 0.5111, + "step": 3431 + }, + { + "epoch": 1.622695035460993, + "grad_norm": 2.452146291732788, + "learning_rate": 4.184537390142639e-06, + "loss": 0.4574, + "step": 3432 + }, + { + "epoch": 1.6231678486997636, + "grad_norm": 2.653071165084839, + "learning_rate": 4.1840763918678055e-06, + "loss": 0.5611, + "step": 3433 + }, + { + "epoch": 1.6236406619385342, + "grad_norm": 2.5920350551605225, + "learning_rate": 4.183615288732919e-06, + "loss": 0.5437, + "step": 3434 + }, + { + "epoch": 1.624113475177305, + "grad_norm": 2.782900810241699, + "learning_rate": 4.18315408076669e-06, + "loss": 0.5824, + "step": 3435 + }, + { + "epoch": 1.6245862884160758, + "grad_norm": 2.8769774436950684, + "learning_rate": 4.1826927679978365e-06, + "loss": 0.5271, + "step": 3436 + }, + { + "epoch": 1.6250591016548463, + "grad_norm": 2.488598585128784, + "learning_rate": 4.182231350455084e-06, + "loss": 0.4684, + "step": 3437 + }, + { + "epoch": 1.625531914893617, + "grad_norm": 2.6472036838531494, + "learning_rate": 4.181769828167161e-06, + "loss": 0.5372, + "step": 3438 + }, + { + "epoch": 1.6260047281323877, + "grad_norm": 2.6498794555664062, + "learning_rate": 4.1813082011628045e-06, + "loss": 0.4805, + "step": 3439 + }, + { + "epoch": 1.6264775413711585, + "grad_norm": 2.5386533737182617, + "learning_rate": 4.1808464694707595e-06, + "loss": 0.5015, + "step": 3440 + }, + { + "epoch": 1.626950354609929, + "grad_norm": 2.8812551498413086, + "learning_rate": 4.180384633119775e-06, + "loss": 0.5225, + "step": 3441 + }, + { + "epoch": 1.6274231678486997, + "grad_norm": 2.870124578475952, + "learning_rate": 4.179922692138609e-06, + "loss": 0.537, + 
"step": 3442 + }, + { + "epoch": 1.6278959810874705, + "grad_norm": 2.5759785175323486, + "learning_rate": 4.179460646556021e-06, + "loss": 0.5142, + "step": 3443 + }, + { + "epoch": 1.6283687943262413, + "grad_norm": 2.629347324371338, + "learning_rate": 4.1789984964007836e-06, + "loss": 0.5007, + "step": 3444 + }, + { + "epoch": 1.6288416075650118, + "grad_norm": 2.751128673553467, + "learning_rate": 4.178536241701672e-06, + "loss": 0.5677, + "step": 3445 + }, + { + "epoch": 1.6293144208037824, + "grad_norm": 2.7582364082336426, + "learning_rate": 4.178073882487469e-06, + "loss": 0.499, + "step": 3446 + }, + { + "epoch": 1.6297872340425532, + "grad_norm": 3.136711359024048, + "learning_rate": 4.177611418786963e-06, + "loss": 0.5294, + "step": 3447 + }, + { + "epoch": 1.630260047281324, + "grad_norm": 2.7363100051879883, + "learning_rate": 4.17714885062895e-06, + "loss": 0.5264, + "step": 3448 + }, + { + "epoch": 1.6307328605200946, + "grad_norm": 2.7305946350097656, + "learning_rate": 4.176686178042233e-06, + "loss": 0.5235, + "step": 3449 + }, + { + "epoch": 1.6312056737588652, + "grad_norm": 2.6500556468963623, + "learning_rate": 4.176223401055619e-06, + "loss": 0.5463, + "step": 3450 + }, + { + "epoch": 1.631678486997636, + "grad_norm": 2.756321907043457, + "learning_rate": 4.175760519697924e-06, + "loss": 0.545, + "step": 3451 + }, + { + "epoch": 1.6321513002364068, + "grad_norm": 2.6234960556030273, + "learning_rate": 4.17529753399797e-06, + "loss": 0.4927, + "step": 3452 + }, + { + "epoch": 1.6326241134751773, + "grad_norm": 2.6358842849731445, + "learning_rate": 4.174834443984584e-06, + "loss": 0.5445, + "step": 3453 + }, + { + "epoch": 1.633096926713948, + "grad_norm": 2.541147470474243, + "learning_rate": 4.174371249686601e-06, + "loss": 0.4691, + "step": 3454 + }, + { + "epoch": 1.6335697399527187, + "grad_norm": 2.566981077194214, + "learning_rate": 4.173907951132863e-06, + "loss": 0.4932, + "step": 3455 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 2.670940399169922, + "learning_rate": 4.173444548352216e-06, + "loss": 0.4979, + "step": 3456 + }, + { + "epoch": 1.63451536643026, + "grad_norm": 2.5440268516540527, + "learning_rate": 4.172981041373515e-06, + "loss": 0.4716, + "step": 3457 + }, + { + "epoch": 1.6349881796690307, + "grad_norm": 2.3801631927490234, + "learning_rate": 4.17251743022562e-06, + "loss": 0.5126, + "step": 3458 + }, + { + "epoch": 1.6354609929078014, + "grad_norm": 2.5051121711730957, + "learning_rate": 4.1720537149373985e-06, + "loss": 0.4964, + "step": 3459 + }, + { + "epoch": 1.6359338061465722, + "grad_norm": 3.5521697998046875, + "learning_rate": 4.171589895537724e-06, + "loss": 0.5447, + "step": 3460 + }, + { + "epoch": 1.6364066193853428, + "grad_norm": 2.6041572093963623, + "learning_rate": 4.171125972055477e-06, + "loss": 0.4637, + "step": 3461 + }, + { + "epoch": 1.6368794326241134, + "grad_norm": 2.2297258377075195, + "learning_rate": 4.170661944519543e-06, + "loss": 0.4702, + "step": 3462 + }, + { + "epoch": 1.6373522458628842, + "grad_norm": 2.6764535903930664, + "learning_rate": 4.170197812958815e-06, + "loss": 0.5111, + "step": 3463 + }, + { + "epoch": 1.637825059101655, + "grad_norm": 2.86892032623291, + "learning_rate": 4.169733577402193e-06, + "loss": 0.5437, + "step": 3464 + }, + { + "epoch": 1.6382978723404256, + "grad_norm": 2.9007070064544678, + "learning_rate": 4.1692692378785825e-06, + "loss": 0.5425, + "step": 3465 + }, + { + "epoch": 1.6387706855791961, + "grad_norm": 2.5902905464172363, + "learning_rate": 
4.168804794416896e-06, + "loss": 0.5252, + "step": 3466 + }, + { + "epoch": 1.639243498817967, + "grad_norm": 2.821183681488037, + "learning_rate": 4.168340247046053e-06, + "loss": 0.5265, + "step": 3467 + }, + { + "epoch": 1.6397163120567377, + "grad_norm": 2.7928314208984375, + "learning_rate": 4.167875595794978e-06, + "loss": 0.5151, + "step": 3468 + }, + { + "epoch": 1.6401891252955083, + "grad_norm": 2.3130412101745605, + "learning_rate": 4.167410840692603e-06, + "loss": 0.4941, + "step": 3469 + }, + { + "epoch": 1.6406619385342789, + "grad_norm": 2.6078619956970215, + "learning_rate": 4.1669459817678655e-06, + "loss": 0.493, + "step": 3470 + }, + { + "epoch": 1.6411347517730497, + "grad_norm": 2.5335731506347656, + "learning_rate": 4.166481019049712e-06, + "loss": 0.4969, + "step": 3471 + }, + { + "epoch": 1.6416075650118205, + "grad_norm": 2.8181469440460205, + "learning_rate": 4.166015952567093e-06, + "loss": 0.5062, + "step": 3472 + }, + { + "epoch": 1.642080378250591, + "grad_norm": 2.7256782054901123, + "learning_rate": 4.165550782348966e-06, + "loss": 0.5397, + "step": 3473 + }, + { + "epoch": 1.6425531914893616, + "grad_norm": 2.284345865249634, + "learning_rate": 4.1650855084242946e-06, + "loss": 0.4448, + "step": 3474 + }, + { + "epoch": 1.6430260047281324, + "grad_norm": 3.0383145809173584, + "learning_rate": 4.164620130822049e-06, + "loss": 0.5873, + "step": 3475 + }, + { + "epoch": 1.6434988179669032, + "grad_norm": 2.754448652267456, + "learning_rate": 4.1641546495712085e-06, + "loss": 0.4852, + "step": 3476 + }, + { + "epoch": 1.6439716312056738, + "grad_norm": 2.6820101737976074, + "learning_rate": 4.1636890647007535e-06, + "loss": 0.5325, + "step": 3477 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 2.6396398544311523, + "learning_rate": 4.163223376239676e-06, + "loss": 0.466, + "step": 3478 + }, + { + "epoch": 1.6449172576832152, + "grad_norm": 2.395049810409546, + "learning_rate": 4.162757584216972e-06, + "loss": 0.4531, + "step": 3479 + }, + { + "epoch": 1.645390070921986, + "grad_norm": 2.596670627593994, + "learning_rate": 4.162291688661645e-06, + "loss": 0.5207, + "step": 3480 + }, + { + "epoch": 1.6458628841607565, + "grad_norm": 2.4391872882843018, + "learning_rate": 4.161825689602703e-06, + "loss": 0.5133, + "step": 3481 + }, + { + "epoch": 1.6463356973995271, + "grad_norm": 2.6169841289520264, + "learning_rate": 4.161359587069162e-06, + "loss": 0.5096, + "step": 3482 + }, + { + "epoch": 1.646808510638298, + "grad_norm": 2.634089946746826, + "learning_rate": 4.1608933810900445e-06, + "loss": 0.4921, + "step": 3483 + }, + { + "epoch": 1.6472813238770687, + "grad_norm": 2.815877914428711, + "learning_rate": 4.160427071694379e-06, + "loss": 0.5045, + "step": 3484 + }, + { + "epoch": 1.6477541371158393, + "grad_norm": 2.417525053024292, + "learning_rate": 4.159960658911199e-06, + "loss": 0.4997, + "step": 3485 + }, + { + "epoch": 1.6482269503546099, + "grad_norm": 2.5713605880737305, + "learning_rate": 4.15949414276955e-06, + "loss": 0.5246, + "step": 3486 + }, + { + "epoch": 1.6486997635933807, + "grad_norm": 3.49833607673645, + "learning_rate": 4.159027523298475e-06, + "loss": 0.4901, + "step": 3487 + }, + { + "epoch": 1.6491725768321515, + "grad_norm": 2.985464334487915, + "learning_rate": 4.158560800527033e-06, + "loss": 0.5726, + "step": 3488 + }, + { + "epoch": 1.649645390070922, + "grad_norm": 2.72745680809021, + "learning_rate": 4.158093974484282e-06, + "loss": 0.5119, + "step": 3489 + }, + { + "epoch": 1.6501182033096926, + "grad_norm": 
2.4885571002960205, + "learning_rate": 4.157627045199289e-06, + "loss": 0.4838, + "step": 3490 + }, + { + "epoch": 1.6505910165484634, + "grad_norm": 2.7622628211975098, + "learning_rate": 4.157160012701128e-06, + "loss": 0.5269, + "step": 3491 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 2.615122079849243, + "learning_rate": 4.156692877018879e-06, + "loss": 0.5501, + "step": 3492 + }, + { + "epoch": 1.6515366430260048, + "grad_norm": 2.827753782272339, + "learning_rate": 4.156225638181631e-06, + "loss": 0.5452, + "step": 3493 + }, + { + "epoch": 1.6520094562647754, + "grad_norm": 2.724820137023926, + "learning_rate": 4.155758296218474e-06, + "loss": 0.5155, + "step": 3494 + }, + { + "epoch": 1.6524822695035462, + "grad_norm": 2.5806174278259277, + "learning_rate": 4.155290851158508e-06, + "loss": 0.5292, + "step": 3495 + }, + { + "epoch": 1.652955082742317, + "grad_norm": 2.5655179023742676, + "learning_rate": 4.154823303030838e-06, + "loss": 0.4959, + "step": 3496 + }, + { + "epoch": 1.6534278959810875, + "grad_norm": 2.656548261642456, + "learning_rate": 4.154355651864579e-06, + "loss": 0.5703, + "step": 3497 + }, + { + "epoch": 1.653900709219858, + "grad_norm": 2.9085004329681396, + "learning_rate": 4.153887897688847e-06, + "loss": 0.5061, + "step": 3498 + }, + { + "epoch": 1.654373522458629, + "grad_norm": 2.608010768890381, + "learning_rate": 4.1534200405327665e-06, + "loss": 0.5165, + "step": 3499 + }, + { + "epoch": 1.6548463356973995, + "grad_norm": 2.600463628768921, + "learning_rate": 4.152952080425471e-06, + "loss": 0.4946, + "step": 3500 + }, + { + "epoch": 1.65531914893617, + "grad_norm": 2.5561563968658447, + "learning_rate": 4.152484017396098e-06, + "loss": 0.4804, + "step": 3501 + }, + { + "epoch": 1.6557919621749408, + "grad_norm": 2.788594961166382, + "learning_rate": 4.152015851473791e-06, + "loss": 0.5635, + "step": 3502 + }, + { + "epoch": 1.6562647754137116, + "grad_norm": 2.693302631378174, + "learning_rate": 4.151547582687699e-06, + "loss": 0.5139, + "step": 3503 + }, + { + "epoch": 1.6567375886524822, + "grad_norm": 2.7887485027313232, + "learning_rate": 4.1510792110669825e-06, + "loss": 0.4952, + "step": 3504 + }, + { + "epoch": 1.6572104018912528, + "grad_norm": 2.8982298374176025, + "learning_rate": 4.150610736640803e-06, + "loss": 0.4136, + "step": 3505 + }, + { + "epoch": 1.6576832151300236, + "grad_norm": 2.7569408416748047, + "learning_rate": 4.150142159438331e-06, + "loss": 0.5272, + "step": 3506 + }, + { + "epoch": 1.6581560283687944, + "grad_norm": 2.531648874282837, + "learning_rate": 4.149673479488742e-06, + "loss": 0.5016, + "step": 3507 + }, + { + "epoch": 1.658628841607565, + "grad_norm": 2.7706353664398193, + "learning_rate": 4.149204696821219e-06, + "loss": 0.5512, + "step": 3508 + }, + { + "epoch": 1.6591016548463355, + "grad_norm": 2.7307450771331787, + "learning_rate": 4.148735811464951e-06, + "loss": 0.4968, + "step": 3509 + }, + { + "epoch": 1.6595744680851063, + "grad_norm": 3.0097429752349854, + "learning_rate": 4.1482668234491335e-06, + "loss": 0.4797, + "step": 3510 + }, + { + "epoch": 1.6600472813238771, + "grad_norm": 2.6045308113098145, + "learning_rate": 4.147797732802969e-06, + "loss": 0.5496, + "step": 3511 + }, + { + "epoch": 1.6605200945626477, + "grad_norm": 2.702061176300049, + "learning_rate": 4.147328539555664e-06, + "loss": 0.5302, + "step": 3512 + }, + { + "epoch": 1.6609929078014183, + "grad_norm": 3.3724892139434814, + "learning_rate": 4.1468592437364356e-06, + "loss": 0.5124, + "step": 3513 + }, + { + "epoch": 
1.661465721040189, + "grad_norm": 2.5117242336273193, + "learning_rate": 4.146389845374502e-06, + "loss": 0.4953, + "step": 3514 + }, + { + "epoch": 1.6619385342789599, + "grad_norm": 2.86547589302063, + "learning_rate": 4.145920344499092e-06, + "loss": 0.5337, + "step": 3515 + }, + { + "epoch": 1.6624113475177305, + "grad_norm": 2.745149850845337, + "learning_rate": 4.14545074113944e-06, + "loss": 0.5187, + "step": 3516 + }, + { + "epoch": 1.662884160756501, + "grad_norm": 2.5560994148254395, + "learning_rate": 4.1449810353247855e-06, + "loss": 0.5183, + "step": 3517 + }, + { + "epoch": 1.6633569739952718, + "grad_norm": 2.2318122386932373, + "learning_rate": 4.144511227084374e-06, + "loss": 0.4452, + "step": 3518 + }, + { + "epoch": 1.6638297872340426, + "grad_norm": 2.6980903148651123, + "learning_rate": 4.14404131644746e-06, + "loss": 0.4974, + "step": 3519 + }, + { + "epoch": 1.6643026004728132, + "grad_norm": 2.6875357627868652, + "learning_rate": 4.1435713034433025e-06, + "loss": 0.4582, + "step": 3520 + }, + { + "epoch": 1.6647754137115838, + "grad_norm": 2.9430019855499268, + "learning_rate": 4.143101188101166e-06, + "loss": 0.5004, + "step": 3521 + }, + { + "epoch": 1.6652482269503546, + "grad_norm": 2.4447221755981445, + "learning_rate": 4.142630970450323e-06, + "loss": 0.5436, + "step": 3522 + }, + { + "epoch": 1.6657210401891254, + "grad_norm": 2.571023941040039, + "learning_rate": 4.142160650520053e-06, + "loss": 0.5307, + "step": 3523 + }, + { + "epoch": 1.666193853427896, + "grad_norm": 2.9725306034088135, + "learning_rate": 4.14169022833964e-06, + "loss": 0.5918, + "step": 3524 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.5958926677703857, + "learning_rate": 4.141219703938375e-06, + "loss": 0.5036, + "step": 3525 + }, + { + "epoch": 1.6671394799054373, + "grad_norm": 2.935788631439209, + "learning_rate": 4.140749077345556e-06, + "loss": 0.5773, + "step": 3526 + }, + { + "epoch": 1.6676122931442081, + "grad_norm": 2.5460526943206787, + "learning_rate": 4.140278348590485e-06, + "loss": 0.4762, + "step": 3527 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 2.5729143619537354, + "learning_rate": 4.139807517702475e-06, + "loss": 0.5515, + "step": 3528 + }, + { + "epoch": 1.6685579196217493, + "grad_norm": 2.4377381801605225, + "learning_rate": 4.13933658471084e-06, + "loss": 0.5383, + "step": 3529 + }, + { + "epoch": 1.66903073286052, + "grad_norm": 2.6284425258636475, + "learning_rate": 4.138865549644905e-06, + "loss": 0.5396, + "step": 3530 + }, + { + "epoch": 1.6695035460992909, + "grad_norm": 2.857250928878784, + "learning_rate": 4.138394412533998e-06, + "loss": 0.5861, + "step": 3531 + }, + { + "epoch": 1.6699763593380614, + "grad_norm": 2.9226012229919434, + "learning_rate": 4.137923173407456e-06, + "loss": 0.5262, + "step": 3532 + }, + { + "epoch": 1.670449172576832, + "grad_norm": 4.839131832122803, + "learning_rate": 4.137451832294619e-06, + "loss": 0.651, + "step": 3533 + }, + { + "epoch": 1.6709219858156028, + "grad_norm": 2.4727771282196045, + "learning_rate": 4.1369803892248375e-06, + "loss": 0.5149, + "step": 3534 + }, + { + "epoch": 1.6713947990543736, + "grad_norm": 2.5391688346862793, + "learning_rate": 4.1365088442274635e-06, + "loss": 0.4907, + "step": 3535 + }, + { + "epoch": 1.6718676122931442, + "grad_norm": 2.5168209075927734, + "learning_rate": 4.136037197331862e-06, + "loss": 0.5091, + "step": 3536 + }, + { + "epoch": 1.6723404255319148, + "grad_norm": 2.6278600692749023, + "learning_rate": 4.135565448567396e-06, + "loss": 0.4357, + 
"step": 3537 + }, + { + "epoch": 1.6728132387706856, + "grad_norm": 2.835184097290039, + "learning_rate": 4.135093597963441e-06, + "loss": 0.4786, + "step": 3538 + }, + { + "epoch": 1.6732860520094563, + "grad_norm": 2.385328531265259, + "learning_rate": 4.134621645549379e-06, + "loss": 0.4849, + "step": 3539 + }, + { + "epoch": 1.673758865248227, + "grad_norm": 2.6504149436950684, + "learning_rate": 4.134149591354593e-06, + "loss": 0.6037, + "step": 3540 + }, + { + "epoch": 1.6742316784869975, + "grad_norm": 2.945634126663208, + "learning_rate": 4.1336774354084786e-06, + "loss": 0.532, + "step": 3541 + }, + { + "epoch": 1.6747044917257683, + "grad_norm": 2.8373215198516846, + "learning_rate": 4.133205177740434e-06, + "loss": 0.5138, + "step": 3542 + }, + { + "epoch": 1.675177304964539, + "grad_norm": 2.6616621017456055, + "learning_rate": 4.1327328183798634e-06, + "loss": 0.5543, + "step": 3543 + }, + { + "epoch": 1.6756501182033097, + "grad_norm": 3.0843071937561035, + "learning_rate": 4.13226035735618e-06, + "loss": 0.6585, + "step": 3544 + }, + { + "epoch": 1.6761229314420802, + "grad_norm": 2.2214272022247314, + "learning_rate": 4.131787794698802e-06, + "loss": 0.5413, + "step": 3545 + }, + { + "epoch": 1.676595744680851, + "grad_norm": 2.4515018463134766, + "learning_rate": 4.131315130437152e-06, + "loss": 0.4966, + "step": 3546 + }, + { + "epoch": 1.6770685579196218, + "grad_norm": 2.647414207458496, + "learning_rate": 4.130842364600663e-06, + "loss": 0.5401, + "step": 3547 + }, + { + "epoch": 1.6775413711583924, + "grad_norm": 2.648941993713379, + "learning_rate": 4.13036949721877e-06, + "loss": 0.4796, + "step": 3548 + }, + { + "epoch": 1.678014184397163, + "grad_norm": 2.7835679054260254, + "learning_rate": 4.129896528320919e-06, + "loss": 0.5653, + "step": 3549 + }, + { + "epoch": 1.6784869976359338, + "grad_norm": 2.995964288711548, + "learning_rate": 4.129423457936556e-06, + "loss": 0.4999, + "step": 3550 + }, + { + "epoch": 1.6789598108747046, + "grad_norm": 2.5980007648468018, + "learning_rate": 4.1289502860951405e-06, + "loss": 0.5177, + "step": 3551 + }, + { + "epoch": 1.6794326241134752, + "grad_norm": 2.442254066467285, + "learning_rate": 4.128477012826133e-06, + "loss": 0.5062, + "step": 3552 + }, + { + "epoch": 1.6799054373522457, + "grad_norm": 2.3007538318634033, + "learning_rate": 4.1280036381590025e-06, + "loss": 0.5029, + "step": 3553 + }, + { + "epoch": 1.6803782505910165, + "grad_norm": 2.4169347286224365, + "learning_rate": 4.1275301621232245e-06, + "loss": 0.515, + "step": 3554 + }, + { + "epoch": 1.6808510638297873, + "grad_norm": 2.6456379890441895, + "learning_rate": 4.127056584748279e-06, + "loss": 0.5343, + "step": 3555 + }, + { + "epoch": 1.681323877068558, + "grad_norm": 2.6406595706939697, + "learning_rate": 4.1265829060636546e-06, + "loss": 0.5047, + "step": 3556 + }, + { + "epoch": 1.6817966903073285, + "grad_norm": 2.9344475269317627, + "learning_rate": 4.126109126098846e-06, + "loss": 0.5501, + "step": 3557 + }, + { + "epoch": 1.6822695035460993, + "grad_norm": 2.3292455673217773, + "learning_rate": 4.125635244883351e-06, + "loss": 0.463, + "step": 3558 + }, + { + "epoch": 1.68274231678487, + "grad_norm": 2.4150657653808594, + "learning_rate": 4.125161262446677e-06, + "loss": 0.4802, + "step": 3559 + }, + { + "epoch": 1.6832151300236406, + "grad_norm": 2.604292392730713, + "learning_rate": 4.124687178818339e-06, + "loss": 0.5683, + "step": 3560 + }, + { + "epoch": 1.6836879432624112, + "grad_norm": 2.5676791667938232, + "learning_rate": 
4.1242129940278544e-06, + "loss": 0.5519, + "step": 3561 + }, + { + "epoch": 1.684160756501182, + "grad_norm": 3.078514814376831, + "learning_rate": 4.123738708104748e-06, + "loss": 0.5194, + "step": 3562 + }, + { + "epoch": 1.6846335697399528, + "grad_norm": 2.893577814102173, + "learning_rate": 4.123264321078552e-06, + "loss": 0.5107, + "step": 3563 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 2.772413730621338, + "learning_rate": 4.122789832978804e-06, + "loss": 0.6147, + "step": 3564 + }, + { + "epoch": 1.685579196217494, + "grad_norm": 2.5804643630981445, + "learning_rate": 4.12231524383505e-06, + "loss": 0.5057, + "step": 3565 + }, + { + "epoch": 1.6860520094562648, + "grad_norm": 2.599571466445923, + "learning_rate": 4.121840553676839e-06, + "loss": 0.5591, + "step": 3566 + }, + { + "epoch": 1.6865248226950356, + "grad_norm": 2.9124577045440674, + "learning_rate": 4.1213657625337275e-06, + "loss": 0.565, + "step": 3567 + }, + { + "epoch": 1.6869976359338061, + "grad_norm": 2.6582155227661133, + "learning_rate": 4.120890870435281e-06, + "loss": 0.4607, + "step": 3568 + }, + { + "epoch": 1.6874704491725767, + "grad_norm": 2.929227590560913, + "learning_rate": 4.120415877411066e-06, + "loss": 0.5705, + "step": 3569 + }, + { + "epoch": 1.6879432624113475, + "grad_norm": 2.4443247318267822, + "learning_rate": 4.11994078349066e-06, + "loss": 0.4592, + "step": 3570 + }, + { + "epoch": 1.6884160756501183, + "grad_norm": 2.4799163341522217, + "learning_rate": 4.119465588703645e-06, + "loss": 0.5361, + "step": 3571 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 2.9408936500549316, + "learning_rate": 4.1189902930796085e-06, + "loss": 0.5347, + "step": 3572 + }, + { + "epoch": 1.6893617021276595, + "grad_norm": 3.3348076343536377, + "learning_rate": 4.118514896648146e-06, + "loss": 0.5612, + "step": 3573 + }, + { + "epoch": 1.6898345153664303, + "grad_norm": 2.764889717102051, + "learning_rate": 4.118039399438857e-06, + "loss": 0.4745, + "step": 3574 + }, + { + "epoch": 1.690307328605201, + "grad_norm": 2.7023751735687256, + "learning_rate": 4.11756380148135e-06, + "loss": 0.5106, + "step": 3575 + }, + { + "epoch": 1.6907801418439716, + "grad_norm": 2.8816208839416504, + "learning_rate": 4.117088102805238e-06, + "loss": 0.6016, + "step": 3576 + }, + { + "epoch": 1.6912529550827422, + "grad_norm": 2.215733289718628, + "learning_rate": 4.11661230344014e-06, + "loss": 0.4404, + "step": 3577 + }, + { + "epoch": 1.691725768321513, + "grad_norm": 2.8190999031066895, + "learning_rate": 4.116136403415683e-06, + "loss": 0.5038, + "step": 3578 + }, + { + "epoch": 1.6921985815602838, + "grad_norm": 2.616424083709717, + "learning_rate": 4.115660402761499e-06, + "loss": 0.5493, + "step": 3579 + }, + { + "epoch": 1.6926713947990544, + "grad_norm": 2.7738113403320312, + "learning_rate": 4.115184301507226e-06, + "loss": 0.5416, + "step": 3580 + }, + { + "epoch": 1.693144208037825, + "grad_norm": 2.4793593883514404, + "learning_rate": 4.114708099682509e-06, + "loss": 0.4526, + "step": 3581 + }, + { + "epoch": 1.6936170212765957, + "grad_norm": 2.390652894973755, + "learning_rate": 4.114231797316999e-06, + "loss": 0.4908, + "step": 3582 + }, + { + "epoch": 1.6940898345153665, + "grad_norm": 2.513197660446167, + "learning_rate": 4.113755394440352e-06, + "loss": 0.4738, + "step": 3583 + }, + { + "epoch": 1.6945626477541371, + "grad_norm": 2.504497766494751, + "learning_rate": 4.113278891082234e-06, + "loss": 0.4661, + "step": 3584 + }, + { + "epoch": 1.6950354609929077, + "grad_norm": 
2.4966917037963867, + "learning_rate": 4.112802287272314e-06, + "loss": 0.4979, + "step": 3585 + }, + { + "epoch": 1.6955082742316785, + "grad_norm": 2.3129689693450928, + "learning_rate": 4.112325583040265e-06, + "loss": 0.4933, + "step": 3586 + }, + { + "epoch": 1.6959810874704493, + "grad_norm": 2.822136878967285, + "learning_rate": 4.111848778415774e-06, + "loss": 0.5087, + "step": 3587 + }, + { + "epoch": 1.6964539007092199, + "grad_norm": 2.5181210041046143, + "learning_rate": 4.111371873428527e-06, + "loss": 0.4836, + "step": 3588 + }, + { + "epoch": 1.6969267139479904, + "grad_norm": 2.7564687728881836, + "learning_rate": 4.110894868108218e-06, + "loss": 0.5224, + "step": 3589 + }, + { + "epoch": 1.6973995271867612, + "grad_norm": 2.424421787261963, + "learning_rate": 4.11041776248455e-06, + "loss": 0.4552, + "step": 3590 + }, + { + "epoch": 1.697872340425532, + "grad_norm": 2.7013823986053467, + "learning_rate": 4.10994055658723e-06, + "loss": 0.5535, + "step": 3591 + }, + { + "epoch": 1.6983451536643026, + "grad_norm": 2.5660946369171143, + "learning_rate": 4.10946325044597e-06, + "loss": 0.5351, + "step": 3592 + }, + { + "epoch": 1.6988179669030732, + "grad_norm": 2.5598108768463135, + "learning_rate": 4.10898584409049e-06, + "loss": 0.5246, + "step": 3593 + }, + { + "epoch": 1.699290780141844, + "grad_norm": 2.6318907737731934, + "learning_rate": 4.108508337550518e-06, + "loss": 0.5002, + "step": 3594 + }, + { + "epoch": 1.6997635933806148, + "grad_norm": 2.527099132537842, + "learning_rate": 4.108030730855784e-06, + "loss": 0.5366, + "step": 3595 + }, + { + "epoch": 1.7002364066193854, + "grad_norm": 2.8629603385925293, + "learning_rate": 4.107553024036029e-06, + "loss": 0.5742, + "step": 3596 + }, + { + "epoch": 1.700709219858156, + "grad_norm": 2.8084018230438232, + "learning_rate": 4.107075217120994e-06, + "loss": 0.5618, + "step": 3597 + }, + { + "epoch": 1.7011820330969267, + "grad_norm": 3.6470065116882324, + "learning_rate": 4.1065973101404325e-06, + "loss": 0.508, + "step": 3598 + }, + { + "epoch": 1.7016548463356975, + "grad_norm": 3.0332422256469727, + "learning_rate": 4.106119303124102e-06, + "loss": 0.51, + "step": 3599 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 2.4887590408325195, + "learning_rate": 4.105641196101765e-06, + "loss": 0.5109, + "step": 3600 + }, + { + "epoch": 1.7026004728132387, + "grad_norm": 2.6102066040039062, + "learning_rate": 4.105162989103191e-06, + "loss": 0.5278, + "step": 3601 + }, + { + "epoch": 1.7030732860520095, + "grad_norm": 2.771578073501587, + "learning_rate": 4.104684682158156e-06, + "loss": 0.498, + "step": 3602 + }, + { + "epoch": 1.7035460992907803, + "grad_norm": 2.5452702045440674, + "learning_rate": 4.1042062752964425e-06, + "loss": 0.4939, + "step": 3603 + }, + { + "epoch": 1.7040189125295508, + "grad_norm": 2.4287021160125732, + "learning_rate": 4.103727768547838e-06, + "loss": 0.4819, + "step": 3604 + }, + { + "epoch": 1.7044917257683214, + "grad_norm": 2.412280321121216, + "learning_rate": 4.103249161942138e-06, + "loss": 0.5196, + "step": 3605 + }, + { + "epoch": 1.7049645390070922, + "grad_norm": 2.8850717544555664, + "learning_rate": 4.102770455509142e-06, + "loss": 0.5724, + "step": 3606 + }, + { + "epoch": 1.705437352245863, + "grad_norm": 2.7979609966278076, + "learning_rate": 4.102291649278659e-06, + "loss": 0.5295, + "step": 3607 + }, + { + "epoch": 1.7059101654846336, + "grad_norm": 2.762238025665283, + "learning_rate": 4.1018127432805e-06, + "loss": 0.5166, + "step": 3608 + }, + { + "epoch": 
1.7063829787234042, + "grad_norm": 2.921586513519287, + "learning_rate": 4.101333737544485e-06, + "loss": 0.5607, + "step": 3609 + }, + { + "epoch": 1.706855791962175, + "grad_norm": 3.001929998397827, + "learning_rate": 4.100854632100439e-06, + "loss": 0.6255, + "step": 3610 + }, + { + "epoch": 1.7073286052009458, + "grad_norm": 2.752713918685913, + "learning_rate": 4.100375426978196e-06, + "loss": 0.5732, + "step": 3611 + }, + { + "epoch": 1.7078014184397163, + "grad_norm": 2.6496472358703613, + "learning_rate": 4.099896122207593e-06, + "loss": 0.5138, + "step": 3612 + }, + { + "epoch": 1.708274231678487, + "grad_norm": 3.0079452991485596, + "learning_rate": 4.099416717818473e-06, + "loss": 0.5746, + "step": 3613 + }, + { + "epoch": 1.7087470449172577, + "grad_norm": 2.5762360095977783, + "learning_rate": 4.098937213840687e-06, + "loss": 0.5308, + "step": 3614 + }, + { + "epoch": 1.7092198581560285, + "grad_norm": 2.6026158332824707, + "learning_rate": 4.098457610304092e-06, + "loss": 0.4857, + "step": 3615 + }, + { + "epoch": 1.709692671394799, + "grad_norm": 2.587583541870117, + "learning_rate": 4.097977907238551e-06, + "loss": 0.4591, + "step": 3616 + }, + { + "epoch": 1.7101654846335697, + "grad_norm": 2.6996991634368896, + "learning_rate": 4.097498104673932e-06, + "loss": 0.5298, + "step": 3617 + }, + { + "epoch": 1.7106382978723405, + "grad_norm": 2.600029945373535, + "learning_rate": 4.097018202640111e-06, + "loss": 0.4726, + "step": 3618 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 2.8261220455169678, + "learning_rate": 4.096538201166969e-06, + "loss": 0.5242, + "step": 3619 + }, + { + "epoch": 1.7115839243498818, + "grad_norm": 3.053027629852295, + "learning_rate": 4.096058100284394e-06, + "loss": 0.5568, + "step": 3620 + }, + { + "epoch": 1.7120567375886524, + "grad_norm": 2.9638442993164062, + "learning_rate": 4.0955779000222805e-06, + "loss": 0.5325, + "step": 3621 + }, + { + "epoch": 1.7125295508274232, + "grad_norm": 2.731095790863037, + "learning_rate": 4.095097600410527e-06, + "loss": 0.4733, + "step": 3622 + }, + { + "epoch": 1.713002364066194, + "grad_norm": 2.632490873336792, + "learning_rate": 4.09461720147904e-06, + "loss": 0.5253, + "step": 3623 + }, + { + "epoch": 1.7134751773049646, + "grad_norm": 2.847689390182495, + "learning_rate": 4.094136703257732e-06, + "loss": 0.57, + "step": 3624 + }, + { + "epoch": 1.7139479905437351, + "grad_norm": 3.1078696250915527, + "learning_rate": 4.0936561057765215e-06, + "loss": 0.5368, + "step": 3625 + }, + { + "epoch": 1.714420803782506, + "grad_norm": 2.696349620819092, + "learning_rate": 4.0931754090653334e-06, + "loss": 0.491, + "step": 3626 + }, + { + "epoch": 1.7148936170212767, + "grad_norm": 2.712958812713623, + "learning_rate": 4.092694613154099e-06, + "loss": 0.5768, + "step": 3627 + }, + { + "epoch": 1.7153664302600473, + "grad_norm": 2.5421478748321533, + "learning_rate": 4.092213718072754e-06, + "loss": 0.4839, + "step": 3628 + }, + { + "epoch": 1.715839243498818, + "grad_norm": 2.5176162719726562, + "learning_rate": 4.091732723851243e-06, + "loss": 0.5049, + "step": 3629 + }, + { + "epoch": 1.7163120567375887, + "grad_norm": 2.642185926437378, + "learning_rate": 4.091251630519514e-06, + "loss": 0.589, + "step": 3630 + }, + { + "epoch": 1.7167848699763595, + "grad_norm": 2.587348461151123, + "learning_rate": 4.0907704381075245e-06, + "loss": 0.5281, + "step": 3631 + }, + { + "epoch": 1.71725768321513, + "grad_norm": 2.4628195762634277, + "learning_rate": 4.090289146645234e-06, + "loss": 0.5592, + "step": 
3632 + }, + { + "epoch": 1.7177304964539006, + "grad_norm": 2.2751028537750244, + "learning_rate": 4.0898077561626125e-06, + "loss": 0.502, + "step": 3633 + }, + { + "epoch": 1.7182033096926714, + "grad_norm": 2.7712769508361816, + "learning_rate": 4.089326266689632e-06, + "loss": 0.5143, + "step": 3634 + }, + { + "epoch": 1.7186761229314422, + "grad_norm": 2.5297727584838867, + "learning_rate": 4.088844678256275e-06, + "loss": 0.5035, + "step": 3635 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 2.739130735397339, + "learning_rate": 4.088362990892527e-06, + "loss": 0.5959, + "step": 3636 + }, + { + "epoch": 1.7196217494089834, + "grad_norm": 2.3708314895629883, + "learning_rate": 4.08788120462838e-06, + "loss": 0.4796, + "step": 3637 + }, + { + "epoch": 1.7200945626477542, + "grad_norm": 2.7664241790771484, + "learning_rate": 4.087399319493832e-06, + "loss": 0.6052, + "step": 3638 + }, + { + "epoch": 1.720567375886525, + "grad_norm": 2.5900204181671143, + "learning_rate": 4.0869173355188895e-06, + "loss": 0.4955, + "step": 3639 + }, + { + "epoch": 1.7210401891252955, + "grad_norm": 2.6771862506866455, + "learning_rate": 4.0864352527335635e-06, + "loss": 0.4889, + "step": 3640 + }, + { + "epoch": 1.7215130023640661, + "grad_norm": 2.888479471206665, + "learning_rate": 4.085953071167871e-06, + "loss": 0.5719, + "step": 3641 + }, + { + "epoch": 1.721985815602837, + "grad_norm": 2.5967187881469727, + "learning_rate": 4.085470790851833e-06, + "loss": 0.4959, + "step": 3642 + }, + { + "epoch": 1.7224586288416077, + "grad_norm": 2.5317695140838623, + "learning_rate": 4.084988411815483e-06, + "loss": 0.4596, + "step": 3643 + }, + { + "epoch": 1.7229314420803783, + "grad_norm": 2.6531455516815186, + "learning_rate": 4.084505934088853e-06, + "loss": 0.5346, + "step": 3644 + }, + { + "epoch": 1.7234042553191489, + "grad_norm": 2.6525208950042725, + "learning_rate": 4.084023357701987e-06, + "loss": 0.5178, + "step": 3645 + }, + { + "epoch": 1.7238770685579197, + "grad_norm": 2.461954116821289, + "learning_rate": 4.083540682684932e-06, + "loss": 0.4802, + "step": 3646 + }, + { + "epoch": 1.7243498817966905, + "grad_norm": 2.794696807861328, + "learning_rate": 4.083057909067743e-06, + "loss": 0.5148, + "step": 3647 + }, + { + "epoch": 1.724822695035461, + "grad_norm": 2.867572546005249, + "learning_rate": 4.082575036880479e-06, + "loss": 0.5352, + "step": 3648 + }, + { + "epoch": 1.7252955082742316, + "grad_norm": 2.642820358276367, + "learning_rate": 4.082092066153207e-06, + "loss": 0.4652, + "step": 3649 + }, + { + "epoch": 1.7257683215130024, + "grad_norm": 2.782142400741577, + "learning_rate": 4.081608996915999e-06, + "loss": 0.5591, + "step": 3650 + }, + { + "epoch": 1.7262411347517732, + "grad_norm": 2.327331304550171, + "learning_rate": 4.081125829198934e-06, + "loss": 0.4339, + "step": 3651 + }, + { + "epoch": 1.7267139479905438, + "grad_norm": 2.7959988117218018, + "learning_rate": 4.0806425630320965e-06, + "loss": 0.5783, + "step": 3652 + }, + { + "epoch": 1.7271867612293144, + "grad_norm": 2.595053195953369, + "learning_rate": 4.080159198445578e-06, + "loss": 0.4602, + "step": 3653 + }, + { + "epoch": 1.7276595744680852, + "grad_norm": 3.0968129634857178, + "learning_rate": 4.079675735469475e-06, + "loss": 0.5775, + "step": 3654 + }, + { + "epoch": 1.728132387706856, + "grad_norm": 2.628044605255127, + "learning_rate": 4.07919217413389e-06, + "loss": 0.486, + "step": 3655 + }, + { + "epoch": 1.7286052009456265, + "grad_norm": 2.782799005508423, + "learning_rate": 
4.078708514468933e-06, + "loss": 0.5282, + "step": 3656 + }, + { + "epoch": 1.729078014184397, + "grad_norm": 2.655365467071533, + "learning_rate": 4.0782247565047205e-06, + "loss": 0.4873, + "step": 3657 + }, + { + "epoch": 1.729550827423168, + "grad_norm": 2.9461584091186523, + "learning_rate": 4.077740900271371e-06, + "loss": 0.548, + "step": 3658 + }, + { + "epoch": 1.7300236406619387, + "grad_norm": 2.5094761848449707, + "learning_rate": 4.077256945799015e-06, + "loss": 0.5437, + "step": 3659 + }, + { + "epoch": 1.7304964539007093, + "grad_norm": 2.555793285369873, + "learning_rate": 4.0767728931177845e-06, + "loss": 0.5268, + "step": 3660 + }, + { + "epoch": 1.7309692671394799, + "grad_norm": 2.4433486461639404, + "learning_rate": 4.07628874225782e-06, + "loss": 0.5211, + "step": 3661 + }, + { + "epoch": 1.7314420803782506, + "grad_norm": 2.365206003189087, + "learning_rate": 4.075804493249267e-06, + "loss": 0.5084, + "step": 3662 + }, + { + "epoch": 1.7319148936170212, + "grad_norm": 2.514305830001831, + "learning_rate": 4.075320146122278e-06, + "loss": 0.4693, + "step": 3663 + }, + { + "epoch": 1.7323877068557918, + "grad_norm": 2.9270083904266357, + "learning_rate": 4.074835700907012e-06, + "loss": 0.5724, + "step": 3664 + }, + { + "epoch": 1.7328605200945626, + "grad_norm": 2.938692569732666, + "learning_rate": 4.0743511576336315e-06, + "loss": 0.5361, + "step": 3665 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 3.1978867053985596, + "learning_rate": 4.073866516332307e-06, + "loss": 0.6277, + "step": 3666 + }, + { + "epoch": 1.733806146572104, + "grad_norm": 2.3477370738983154, + "learning_rate": 4.073381777033217e-06, + "loss": 0.5139, + "step": 3667 + }, + { + "epoch": 1.7342789598108745, + "grad_norm": 2.5954184532165527, + "learning_rate": 4.072896939766543e-06, + "loss": 0.537, + "step": 3668 + }, + { + "epoch": 1.7347517730496453, + "grad_norm": 2.8999998569488525, + "learning_rate": 4.072412004562472e-06, + "loss": 0.5486, + "step": 3669 + }, + { + "epoch": 1.7352245862884161, + "grad_norm": 2.7320556640625, + "learning_rate": 4.071926971451201e-06, + "loss": 0.6025, + "step": 3670 + }, + { + "epoch": 1.7356973995271867, + "grad_norm": 2.499234676361084, + "learning_rate": 4.0714418404629304e-06, + "loss": 0.456, + "step": 3671 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 2.485924243927002, + "learning_rate": 4.070956611627867e-06, + "loss": 0.5097, + "step": 3672 + }, + { + "epoch": 1.736643026004728, + "grad_norm": 2.513723373413086, + "learning_rate": 4.070471284976225e-06, + "loss": 0.4744, + "step": 3673 + }, + { + "epoch": 1.7371158392434989, + "grad_norm": 2.281977653503418, + "learning_rate": 4.06998586053822e-06, + "loss": 0.5124, + "step": 3674 + }, + { + "epoch": 1.7375886524822695, + "grad_norm": 2.3683905601501465, + "learning_rate": 4.069500338344081e-06, + "loss": 0.4816, + "step": 3675 + }, + { + "epoch": 1.73806146572104, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.069014718424038e-06, + "loss": 0.5665, + "step": 3676 + }, + { + "epoch": 1.7385342789598108, + "grad_norm": 2.7308456897735596, + "learning_rate": 4.068529000808328e-06, + "loss": 0.534, + "step": 3677 + }, + { + "epoch": 1.7390070921985816, + "grad_norm": 2.788452625274658, + "learning_rate": 4.068043185527196e-06, + "loss": 0.5609, + "step": 3678 + }, + { + "epoch": 1.7394799054373522, + "grad_norm": 2.832368850708008, + "learning_rate": 4.067557272610889e-06, + "loss": 0.553, + "step": 3679 + }, + { + "epoch": 1.7399527186761228, + "grad_norm": 
2.9987435340881348, + "learning_rate": 4.067071262089665e-06, + "loss": 0.5, + "step": 3680 + }, + { + "epoch": 1.7404255319148936, + "grad_norm": 3.04913067817688, + "learning_rate": 4.066585153993785e-06, + "loss": 0.5158, + "step": 3681 + }, + { + "epoch": 1.7408983451536644, + "grad_norm": 2.5177130699157715, + "learning_rate": 4.066098948353516e-06, + "loss": 0.4508, + "step": 3682 + }, + { + "epoch": 1.741371158392435, + "grad_norm": 2.8991222381591797, + "learning_rate": 4.065612645199133e-06, + "loss": 0.5268, + "step": 3683 + }, + { + "epoch": 1.7418439716312055, + "grad_norm": 2.4928159713745117, + "learning_rate": 4.0651262445609156e-06, + "loss": 0.5024, + "step": 3684 + }, + { + "epoch": 1.7423167848699763, + "grad_norm": 2.9737319946289062, + "learning_rate": 4.06463974646915e-06, + "loss": 0.5429, + "step": 3685 + }, + { + "epoch": 1.7427895981087471, + "grad_norm": 2.6485493183135986, + "learning_rate": 4.064153150954128e-06, + "loss": 0.5619, + "step": 3686 + }, + { + "epoch": 1.7432624113475177, + "grad_norm": 2.564861297607422, + "learning_rate": 4.063666458046148e-06, + "loss": 0.4878, + "step": 3687 + }, + { + "epoch": 1.7437352245862883, + "grad_norm": 2.6048383712768555, + "learning_rate": 4.063179667775514e-06, + "loss": 0.4836, + "step": 3688 + }, + { + "epoch": 1.744208037825059, + "grad_norm": 2.751638650894165, + "learning_rate": 4.062692780172536e-06, + "loss": 0.5558, + "step": 3689 + }, + { + "epoch": 1.7446808510638299, + "grad_norm": 3.3866634368896484, + "learning_rate": 4.062205795267531e-06, + "loss": 0.4825, + "step": 3690 + }, + { + "epoch": 1.7451536643026004, + "grad_norm": 3.0112249851226807, + "learning_rate": 4.061718713090822e-06, + "loss": 0.5732, + "step": 3691 + }, + { + "epoch": 1.745626477541371, + "grad_norm": 2.5889365673065186, + "learning_rate": 4.061231533672736e-06, + "loss": 0.483, + "step": 3692 + }, + { + "epoch": 1.7460992907801418, + "grad_norm": 2.624598979949951, + "learning_rate": 4.0607442570436085e-06, + "loss": 0.5706, + "step": 3693 + }, + { + "epoch": 1.7465721040189126, + "grad_norm": 2.9219250679016113, + "learning_rate": 4.060256883233779e-06, + "loss": 0.5153, + "step": 3694 + }, + { + "epoch": 1.7470449172576832, + "grad_norm": 3.2219252586364746, + "learning_rate": 4.059769412273595e-06, + "loss": 0.5184, + "step": 3695 + }, + { + "epoch": 1.7475177304964538, + "grad_norm": 2.890697956085205, + "learning_rate": 4.05928184419341e-06, + "loss": 0.5312, + "step": 3696 + }, + { + "epoch": 1.7479905437352246, + "grad_norm": 2.673809289932251, + "learning_rate": 4.0587941790235816e-06, + "loss": 0.4893, + "step": 3697 + }, + { + "epoch": 1.7484633569739954, + "grad_norm": 2.5339348316192627, + "learning_rate": 4.058306416794474e-06, + "loss": 0.5115, + "step": 3698 + }, + { + "epoch": 1.748936170212766, + "grad_norm": 2.6525840759277344, + "learning_rate": 4.05781855753646e-06, + "loss": 0.5256, + "step": 3699 + }, + { + "epoch": 1.7494089834515365, + "grad_norm": 2.7868754863739014, + "learning_rate": 4.057330601279914e-06, + "loss": 0.5227, + "step": 3700 + }, + { + "epoch": 1.7498817966903073, + "grad_norm": 3.1629884243011475, + "learning_rate": 4.056842548055221e-06, + "loss": 0.5617, + "step": 3701 + }, + { + "epoch": 1.750354609929078, + "grad_norm": 2.9350688457489014, + "learning_rate": 4.056354397892769e-06, + "loss": 0.4753, + "step": 3702 + }, + { + "epoch": 1.7508274231678487, + "grad_norm": 2.9688615798950195, + "learning_rate": 4.0558661508229525e-06, + "loss": 0.596, + "step": 3703 + }, + { + "epoch": 
1.7513002364066192, + "grad_norm": 2.802205801010132, + "learning_rate": 4.055377806876174e-06, + "loss": 0.5793, + "step": 3704 + }, + { + "epoch": 1.75177304964539, + "grad_norm": 2.4933416843414307, + "learning_rate": 4.054889366082839e-06, + "loss": 0.4824, + "step": 3705 + }, + { + "epoch": 1.7522458628841608, + "grad_norm": 3.7904608249664307, + "learning_rate": 4.054400828473361e-06, + "loss": 0.5124, + "step": 3706 + }, + { + "epoch": 1.7527186761229314, + "grad_norm": 2.694838762283325, + "learning_rate": 4.053912194078159e-06, + "loss": 0.5604, + "step": 3707 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 2.3721256256103516, + "learning_rate": 4.053423462927659e-06, + "loss": 0.4978, + "step": 3708 + }, + { + "epoch": 1.7536643026004728, + "grad_norm": 2.718512773513794, + "learning_rate": 4.052934635052292e-06, + "loss": 0.5029, + "step": 3709 + }, + { + "epoch": 1.7541371158392436, + "grad_norm": 3.061558246612549, + "learning_rate": 4.052445710482493e-06, + "loss": 0.4886, + "step": 3710 + }, + { + "epoch": 1.7546099290780142, + "grad_norm": 3.0490729808807373, + "learning_rate": 4.051956689248709e-06, + "loss": 0.5363, + "step": 3711 + }, + { + "epoch": 1.7550827423167847, + "grad_norm": 2.611661672592163, + "learning_rate": 4.051467571381385e-06, + "loss": 0.5397, + "step": 3712 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 2.7829177379608154, + "learning_rate": 4.050978356910979e-06, + "loss": 0.4973, + "step": 3713 + }, + { + "epoch": 1.7560283687943263, + "grad_norm": 2.6228256225585938, + "learning_rate": 4.0504890458679525e-06, + "loss": 0.4551, + "step": 3714 + }, + { + "epoch": 1.756501182033097, + "grad_norm": 2.6801326274871826, + "learning_rate": 4.049999638282771e-06, + "loss": 0.5581, + "step": 3715 + }, + { + "epoch": 1.7569739952718675, + "grad_norm": 2.4476819038391113, + "learning_rate": 4.049510134185908e-06, + "loss": 0.5226, + "step": 3716 + }, + { + "epoch": 1.7574468085106383, + "grad_norm": 2.5661075115203857, + "learning_rate": 4.049020533607844e-06, + "loss": 0.5163, + "step": 3717 + }, + { + "epoch": 1.757919621749409, + "grad_norm": 2.3923349380493164, + "learning_rate": 4.048530836579065e-06, + "loss": 0.5076, + "step": 3718 + }, + { + "epoch": 1.7583924349881797, + "grad_norm": 2.8204405307769775, + "learning_rate": 4.0480410431300585e-06, + "loss": 0.5883, + "step": 3719 + }, + { + "epoch": 1.7588652482269502, + "grad_norm": 2.323107957839966, + "learning_rate": 4.047551153291325e-06, + "loss": 0.5116, + "step": 3720 + }, + { + "epoch": 1.759338061465721, + "grad_norm": 2.8306009769439697, + "learning_rate": 4.047061167093368e-06, + "loss": 0.5094, + "step": 3721 + }, + { + "epoch": 1.7598108747044918, + "grad_norm": 2.568765640258789, + "learning_rate": 4.046571084566695e-06, + "loss": 0.4725, + "step": 3722 + }, + { + "epoch": 1.7602836879432624, + "grad_norm": 2.7212061882019043, + "learning_rate": 4.046080905741822e-06, + "loss": 0.4741, + "step": 3723 + }, + { + "epoch": 1.760756501182033, + "grad_norm": 2.802917003631592, + "learning_rate": 4.04559063064927e-06, + "loss": 0.5691, + "step": 3724 + }, + { + "epoch": 1.7612293144208038, + "grad_norm": 3.1044139862060547, + "learning_rate": 4.0451002593195675e-06, + "loss": 0.5472, + "step": 3725 + }, + { + "epoch": 1.7617021276595746, + "grad_norm": 2.5855562686920166, + "learning_rate": 4.044609791783246e-06, + "loss": 0.4852, + "step": 3726 + }, + { + "epoch": 1.7621749408983451, + "grad_norm": 2.6235129833221436, + "learning_rate": 4.0441192280708465e-06, + "loss": 0.5269, + 
"step": 3727 + }, + { + "epoch": 1.7626477541371157, + "grad_norm": 3.535630464553833, + "learning_rate": 4.043628568212914e-06, + "loss": 0.5266, + "step": 3728 + }, + { + "epoch": 1.7631205673758865, + "grad_norm": 2.7783355712890625, + "learning_rate": 4.043137812239998e-06, + "loss": 0.5609, + "step": 3729 + }, + { + "epoch": 1.7635933806146573, + "grad_norm": 2.9344944953918457, + "learning_rate": 4.042646960182657e-06, + "loss": 0.5056, + "step": 3730 + }, + { + "epoch": 1.7640661938534279, + "grad_norm": 2.6205739974975586, + "learning_rate": 4.042156012071453e-06, + "loss": 0.4914, + "step": 3731 + }, + { + "epoch": 1.7645390070921985, + "grad_norm": 2.8004493713378906, + "learning_rate": 4.041664967936958e-06, + "loss": 0.4901, + "step": 3732 + }, + { + "epoch": 1.7650118203309693, + "grad_norm": 2.944589138031006, + "learning_rate": 4.041173827809745e-06, + "loss": 0.5572, + "step": 3733 + }, + { + "epoch": 1.76548463356974, + "grad_norm": 2.5021605491638184, + "learning_rate": 4.040682591720397e-06, + "loss": 0.4637, + "step": 3734 + }, + { + "epoch": 1.7659574468085106, + "grad_norm": 2.448030948638916, + "learning_rate": 4.040191259699497e-06, + "loss": 0.4785, + "step": 3735 + }, + { + "epoch": 1.7664302600472812, + "grad_norm": 2.7171032428741455, + "learning_rate": 4.039699831777643e-06, + "loss": 0.4919, + "step": 3736 + }, + { + "epoch": 1.766903073286052, + "grad_norm": 2.453118324279785, + "learning_rate": 4.03920830798543e-06, + "loss": 0.4326, + "step": 3737 + }, + { + "epoch": 1.7673758865248228, + "grad_norm": 3.112877368927002, + "learning_rate": 4.038716688353466e-06, + "loss": 0.5375, + "step": 3738 + }, + { + "epoch": 1.7678486997635934, + "grad_norm": 2.742239236831665, + "learning_rate": 4.038224972912361e-06, + "loss": 0.5267, + "step": 3739 + }, + { + "epoch": 1.768321513002364, + "grad_norm": 2.544785737991333, + "learning_rate": 4.037733161692731e-06, + "loss": 0.5032, + "step": 3740 + }, + { + "epoch": 1.7687943262411348, + "grad_norm": 2.4639062881469727, + "learning_rate": 4.037241254725201e-06, + "loss": 0.5532, + "step": 3741 + }, + { + "epoch": 1.7692671394799055, + "grad_norm": 2.866290330886841, + "learning_rate": 4.036749252040398e-06, + "loss": 0.5503, + "step": 3742 + }, + { + "epoch": 1.7697399527186761, + "grad_norm": 2.3466262817382812, + "learning_rate": 4.0362571536689575e-06, + "loss": 0.5286, + "step": 3743 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 2.246464967727661, + "learning_rate": 4.03576495964152e-06, + "loss": 0.4656, + "step": 3744 + }, + { + "epoch": 1.7706855791962175, + "grad_norm": 2.667558431625366, + "learning_rate": 4.035272669988733e-06, + "loss": 0.5205, + "step": 3745 + }, + { + "epoch": 1.7711583924349883, + "grad_norm": 2.974666118621826, + "learning_rate": 4.034780284741249e-06, + "loss": 0.6007, + "step": 3746 + }, + { + "epoch": 1.7716312056737589, + "grad_norm": 2.7164433002471924, + "learning_rate": 4.034287803929726e-06, + "loss": 0.4913, + "step": 3747 + }, + { + "epoch": 1.7721040189125294, + "grad_norm": 2.5923962593078613, + "learning_rate": 4.033795227584829e-06, + "loss": 0.5275, + "step": 3748 + }, + { + "epoch": 1.7725768321513002, + "grad_norm": 2.606027126312256, + "learning_rate": 4.033302555737229e-06, + "loss": 0.4869, + "step": 3749 + }, + { + "epoch": 1.773049645390071, + "grad_norm": 3.0110089778900146, + "learning_rate": 4.032809788417602e-06, + "loss": 0.4956, + "step": 3750 + }, + { + "epoch": 1.7735224586288416, + "grad_norm": 3.004598617553711, + "learning_rate": 
4.032316925656632e-06, + "loss": 0.5159, + "step": 3751 + }, + { + "epoch": 1.7739952718676122, + "grad_norm": 2.731539249420166, + "learning_rate": 4.031823967485005e-06, + "loss": 0.5237, + "step": 3752 + }, + { + "epoch": 1.774468085106383, + "grad_norm": 2.7466373443603516, + "learning_rate": 4.0313309139334155e-06, + "loss": 0.4948, + "step": 3753 + }, + { + "epoch": 1.7749408983451538, + "grad_norm": 2.8596460819244385, + "learning_rate": 4.030837765032565e-06, + "loss": 0.5016, + "step": 3754 + }, + { + "epoch": 1.7754137115839244, + "grad_norm": 3.2886788845062256, + "learning_rate": 4.03034452081316e-06, + "loss": 0.5377, + "step": 3755 + }, + { + "epoch": 1.775886524822695, + "grad_norm": 2.5629258155822754, + "learning_rate": 4.029851181305912e-06, + "loss": 0.519, + "step": 3756 + }, + { + "epoch": 1.7763593380614657, + "grad_norm": 2.5988714694976807, + "learning_rate": 4.029357746541539e-06, + "loss": 0.5521, + "step": 3757 + }, + { + "epoch": 1.7768321513002365, + "grad_norm": 2.987884759902954, + "learning_rate": 4.028864216550765e-06, + "loss": 0.6225, + "step": 3758 + }, + { + "epoch": 1.777304964539007, + "grad_norm": 2.6875851154327393, + "learning_rate": 4.02837059136432e-06, + "loss": 0.5321, + "step": 3759 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.6414570808410645, + "learning_rate": 4.02787687101294e-06, + "loss": 0.4831, + "step": 3760 + }, + { + "epoch": 1.7782505910165485, + "grad_norm": 2.581475019454956, + "learning_rate": 4.027383055527368e-06, + "loss": 0.5204, + "step": 3761 + }, + { + "epoch": 1.7787234042553193, + "grad_norm": 2.811298131942749, + "learning_rate": 4.026889144938349e-06, + "loss": 0.5486, + "step": 3762 + }, + { + "epoch": 1.7791962174940898, + "grad_norm": 3.1589081287384033, + "learning_rate": 4.026395139276639e-06, + "loss": 0.4979, + "step": 3763 + }, + { + "epoch": 1.7796690307328604, + "grad_norm": 2.3773093223571777, + "learning_rate": 4.025901038572996e-06, + "loss": 0.503, + "step": 3764 + }, + { + "epoch": 1.7801418439716312, + "grad_norm": 2.962541341781616, + "learning_rate": 4.025406842858187e-06, + "loss": 0.4613, + "step": 3765 + }, + { + "epoch": 1.780614657210402, + "grad_norm": 2.603092908859253, + "learning_rate": 4.024912552162982e-06, + "loss": 0.5142, + "step": 3766 + }, + { + "epoch": 1.7810874704491726, + "grad_norm": 2.648927927017212, + "learning_rate": 4.024418166518159e-06, + "loss": 0.4491, + "step": 3767 + }, + { + "epoch": 1.7815602836879432, + "grad_norm": 3.3239917755126953, + "learning_rate": 4.023923685954502e-06, + "loss": 0.6272, + "step": 3768 + }, + { + "epoch": 1.782033096926714, + "grad_norm": 2.672821283340454, + "learning_rate": 4.023429110502798e-06, + "loss": 0.5171, + "step": 3769 + }, + { + "epoch": 1.7825059101654848, + "grad_norm": 2.364332437515259, + "learning_rate": 4.022934440193844e-06, + "loss": 0.4513, + "step": 3770 + }, + { + "epoch": 1.7829787234042553, + "grad_norm": 3.03108549118042, + "learning_rate": 4.022439675058441e-06, + "loss": 0.4324, + "step": 3771 + }, + { + "epoch": 1.783451536643026, + "grad_norm": 2.647557020187378, + "learning_rate": 4.021944815127393e-06, + "loss": 0.5162, + "step": 3772 + }, + { + "epoch": 1.7839243498817967, + "grad_norm": 2.4111907482147217, + "learning_rate": 4.021449860431517e-06, + "loss": 0.4712, + "step": 3773 + }, + { + "epoch": 1.7843971631205675, + "grad_norm": 2.796175718307495, + "learning_rate": 4.020954811001629e-06, + "loss": 0.5131, + "step": 3774 + }, + { + "epoch": 1.784869976359338, + "grad_norm": 
2.4594924449920654, + "learning_rate": 4.020459666868553e-06, + "loss": 0.4739, + "step": 3775 + }, + { + "epoch": 1.7853427895981087, + "grad_norm": 2.5735671520233154, + "learning_rate": 4.0199644280631215e-06, + "loss": 0.4716, + "step": 3776 + }, + { + "epoch": 1.7858156028368795, + "grad_norm": 2.419990062713623, + "learning_rate": 4.01946909461617e-06, + "loss": 0.4866, + "step": 3777 + }, + { + "epoch": 1.7862884160756503, + "grad_norm": 2.5597951412200928, + "learning_rate": 4.01897366655854e-06, + "loss": 0.5569, + "step": 3778 + }, + { + "epoch": 1.7867612293144208, + "grad_norm": 2.462383985519409, + "learning_rate": 4.018478143921081e-06, + "loss": 0.4588, + "step": 3779 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 2.536701202392578, + "learning_rate": 4.017982526734646e-06, + "loss": 0.5278, + "step": 3780 + }, + { + "epoch": 1.7877068557919622, + "grad_norm": 2.691077470779419, + "learning_rate": 4.017486815030095e-06, + "loss": 0.4815, + "step": 3781 + }, + { + "epoch": 1.788179669030733, + "grad_norm": 2.4277288913726807, + "learning_rate": 4.016991008838294e-06, + "loss": 0.4877, + "step": 3782 + }, + { + "epoch": 1.7886524822695036, + "grad_norm": 2.6740009784698486, + "learning_rate": 4.016495108190115e-06, + "loss": 0.572, + "step": 3783 + }, + { + "epoch": 1.7891252955082741, + "grad_norm": 3.179232120513916, + "learning_rate": 4.0159991131164355e-06, + "loss": 0.4821, + "step": 3784 + }, + { + "epoch": 1.789598108747045, + "grad_norm": 3.2747793197631836, + "learning_rate": 4.015503023648138e-06, + "loss": 0.5517, + "step": 3785 + }, + { + "epoch": 1.7900709219858157, + "grad_norm": 2.671367645263672, + "learning_rate": 4.015006839816113e-06, + "loss": 0.5158, + "step": 3786 + }, + { + "epoch": 1.7905437352245863, + "grad_norm": 2.6600193977355957, + "learning_rate": 4.014510561651256e-06, + "loss": 0.535, + "step": 3787 + }, + { + "epoch": 1.791016548463357, + "grad_norm": 2.481509208679199, + "learning_rate": 4.014014189184466e-06, + "loss": 0.5596, + "step": 3788 + }, + { + "epoch": 1.7914893617021277, + "grad_norm": 2.759816884994507, + "learning_rate": 4.013517722446652e-06, + "loss": 0.5201, + "step": 3789 + }, + { + "epoch": 1.7919621749408985, + "grad_norm": 2.6913561820983887, + "learning_rate": 4.013021161468724e-06, + "loss": 0.5758, + "step": 3790 + }, + { + "epoch": 1.792434988179669, + "grad_norm": 2.775087594985962, + "learning_rate": 4.0125245062816044e-06, + "loss": 0.499, + "step": 3791 + }, + { + "epoch": 1.7929078014184396, + "grad_norm": 2.6134777069091797, + "learning_rate": 4.012027756916216e-06, + "loss": 0.5659, + "step": 3792 + }, + { + "epoch": 1.7933806146572104, + "grad_norm": 2.7109756469726562, + "learning_rate": 4.0115309134034895e-06, + "loss": 0.5337, + "step": 3793 + }, + { + "epoch": 1.7938534278959812, + "grad_norm": 2.5389950275421143, + "learning_rate": 4.0110339757743595e-06, + "loss": 0.4501, + "step": 3794 + }, + { + "epoch": 1.7943262411347518, + "grad_norm": 2.634648561477661, + "learning_rate": 4.010536944059771e-06, + "loss": 0.4411, + "step": 3795 + }, + { + "epoch": 1.7947990543735224, + "grad_norm": 2.527070999145508, + "learning_rate": 4.0100398182906695e-06, + "loss": 0.5145, + "step": 3796 + }, + { + "epoch": 1.7952718676122932, + "grad_norm": 2.62988543510437, + "learning_rate": 4.0095425984980105e-06, + "loss": 0.4981, + "step": 3797 + }, + { + "epoch": 1.795744680851064, + "grad_norm": 2.6032519340515137, + "learning_rate": 4.009045284712752e-06, + "loss": 0.453, + "step": 3798 + }, + { + "epoch": 
1.7962174940898346, + "grad_norm": 2.735173463821411, + "learning_rate": 4.008547876965863e-06, + "loss": 0.5925, + "step": 3799 + }, + { + "epoch": 1.7966903073286051, + "grad_norm": 2.6296730041503906, + "learning_rate": 4.00805037528831e-06, + "loss": 0.5651, + "step": 3800 + }, + { + "epoch": 1.797163120567376, + "grad_norm": 2.641214370727539, + "learning_rate": 4.0075527797110735e-06, + "loss": 0.4973, + "step": 3801 + }, + { + "epoch": 1.7976359338061467, + "grad_norm": 2.6104819774627686, + "learning_rate": 4.007055090265136e-06, + "loss": 0.4432, + "step": 3802 + }, + { + "epoch": 1.7981087470449173, + "grad_norm": 2.8200619220733643, + "learning_rate": 4.0065573069814865e-06, + "loss": 0.4899, + "step": 3803 + }, + { + "epoch": 1.7985815602836879, + "grad_norm": 2.982354164123535, + "learning_rate": 4.006059429891119e-06, + "loss": 0.5488, + "step": 3804 + }, + { + "epoch": 1.7990543735224587, + "grad_norm": 2.7561678886413574, + "learning_rate": 4.005561459025034e-06, + "loss": 0.5637, + "step": 3805 + }, + { + "epoch": 1.7995271867612295, + "grad_norm": 2.702212333679199, + "learning_rate": 4.005063394414241e-06, + "loss": 0.4804, + "step": 3806 + }, + { + "epoch": 1.8, + "grad_norm": 2.8655319213867188, + "learning_rate": 4.004565236089748e-06, + "loss": 0.5759, + "step": 3807 + }, + { + "epoch": 1.8004728132387706, + "grad_norm": 2.703676223754883, + "learning_rate": 4.0040669840825756e-06, + "loss": 0.4728, + "step": 3808 + }, + { + "epoch": 1.8009456264775414, + "grad_norm": 2.802645683288574, + "learning_rate": 4.003568638423747e-06, + "loss": 0.5421, + "step": 3809 + }, + { + "epoch": 1.8014184397163122, + "grad_norm": 2.4723124504089355, + "learning_rate": 4.003070199144292e-06, + "loss": 0.4944, + "step": 3810 + }, + { + "epoch": 1.8018912529550828, + "grad_norm": 2.4889068603515625, + "learning_rate": 4.0025716662752475e-06, + "loss": 0.4774, + "step": 3811 + }, + { + "epoch": 1.8023640661938534, + "grad_norm": 2.5408077239990234, + "learning_rate": 4.002073039847653e-06, + "loss": 0.5233, + "step": 3812 + }, + { + "epoch": 1.8028368794326242, + "grad_norm": 2.734602689743042, + "learning_rate": 4.001574319892557e-06, + "loss": 0.5403, + "step": 3813 + }, + { + "epoch": 1.803309692671395, + "grad_norm": 3.3786163330078125, + "learning_rate": 4.001075506441012e-06, + "loss": 0.6969, + "step": 3814 + }, + { + "epoch": 1.8037825059101655, + "grad_norm": 2.7375378608703613, + "learning_rate": 4.000576599524078e-06, + "loss": 0.4907, + "step": 3815 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 3.041804075241089, + "learning_rate": 4.000077599172818e-06, + "loss": 0.6021, + "step": 3816 + }, + { + "epoch": 1.804728132387707, + "grad_norm": 2.697599411010742, + "learning_rate": 3.999578505418305e-06, + "loss": 0.4743, + "step": 3817 + }, + { + "epoch": 1.8052009456264777, + "grad_norm": 2.276921272277832, + "learning_rate": 3.999079318291612e-06, + "loss": 0.4885, + "step": 3818 + }, + { + "epoch": 1.8056737588652483, + "grad_norm": 2.4896953105926514, + "learning_rate": 3.998580037823825e-06, + "loss": 0.503, + "step": 3819 + }, + { + "epoch": 1.8061465721040189, + "grad_norm": 2.6232175827026367, + "learning_rate": 3.998080664046029e-06, + "loss": 0.5058, + "step": 3820 + }, + { + "epoch": 1.8066193853427897, + "grad_norm": 2.695861339569092, + "learning_rate": 3.997581196989319e-06, + "loss": 0.4949, + "step": 3821 + }, + { + "epoch": 1.8070921985815604, + "grad_norm": 2.912886142730713, + "learning_rate": 3.997081636684795e-06, + "loss": 0.4971, + "step": 3822 + 
}, + { + "epoch": 1.807565011820331, + "grad_norm": 2.876500368118286, + "learning_rate": 3.996581983163561e-06, + "loss": 0.5584, + "step": 3823 + }, + { + "epoch": 1.8080378250591016, + "grad_norm": 2.857069730758667, + "learning_rate": 3.99608223645673e-06, + "loss": 0.5457, + "step": 3824 + }, + { + "epoch": 1.8085106382978724, + "grad_norm": 2.486743211746216, + "learning_rate": 3.995582396595419e-06, + "loss": 0.5291, + "step": 3825 + }, + { + "epoch": 1.808983451536643, + "grad_norm": 2.509441375732422, + "learning_rate": 3.9950824636107486e-06, + "loss": 0.4747, + "step": 3826 + }, + { + "epoch": 1.8094562647754135, + "grad_norm": 2.931394100189209, + "learning_rate": 3.99458243753385e-06, + "loss": 0.5116, + "step": 3827 + }, + { + "epoch": 1.8099290780141843, + "grad_norm": 2.4868650436401367, + "learning_rate": 3.994082318395856e-06, + "loss": 0.4671, + "step": 3828 + }, + { + "epoch": 1.8104018912529551, + "grad_norm": 2.5554752349853516, + "learning_rate": 3.993582106227907e-06, + "loss": 0.4969, + "step": 3829 + }, + { + "epoch": 1.8108747044917257, + "grad_norm": 2.8367133140563965, + "learning_rate": 3.99308180106115e-06, + "loss": 0.5507, + "step": 3830 + }, + { + "epoch": 1.8113475177304963, + "grad_norm": 2.68245792388916, + "learning_rate": 3.992581402926737e-06, + "loss": 0.5115, + "step": 3831 + }, + { + "epoch": 1.811820330969267, + "grad_norm": 2.406674385070801, + "learning_rate": 3.992080911855824e-06, + "loss": 0.545, + "step": 3832 + }, + { + "epoch": 1.8122931442080379, + "grad_norm": 2.5003464221954346, + "learning_rate": 3.991580327879575e-06, + "loss": 0.4331, + "step": 3833 + }, + { + "epoch": 1.8127659574468085, + "grad_norm": 2.49320912361145, + "learning_rate": 3.99107965102916e-06, + "loss": 0.5118, + "step": 3834 + }, + { + "epoch": 1.813238770685579, + "grad_norm": 2.6183295249938965, + "learning_rate": 3.990578881335752e-06, + "loss": 0.5286, + "step": 3835 + }, + { + "epoch": 1.8137115839243498, + "grad_norm": 3.1999518871307373, + "learning_rate": 3.990078018830534e-06, + "loss": 0.5048, + "step": 3836 + }, + { + "epoch": 1.8141843971631206, + "grad_norm": 2.4351117610931396, + "learning_rate": 3.9895770635446915e-06, + "loss": 0.514, + "step": 3837 + }, + { + "epoch": 1.8146572104018912, + "grad_norm": 2.6859259605407715, + "learning_rate": 3.989076015509416e-06, + "loss": 0.5575, + "step": 3838 + }, + { + "epoch": 1.8151300236406618, + "grad_norm": 2.790421962738037, + "learning_rate": 3.988574874755909e-06, + "loss": 0.5467, + "step": 3839 + }, + { + "epoch": 1.8156028368794326, + "grad_norm": 2.5202765464782715, + "learning_rate": 3.988073641315369e-06, + "loss": 0.5229, + "step": 3840 + }, + { + "epoch": 1.8160756501182034, + "grad_norm": 2.623652219772339, + "learning_rate": 3.987572315219009e-06, + "loss": 0.509, + "step": 3841 + }, + { + "epoch": 1.816548463356974, + "grad_norm": 2.6038360595703125, + "learning_rate": 3.987070896498044e-06, + "loss": 0.5304, + "step": 3842 + }, + { + "epoch": 1.8170212765957445, + "grad_norm": 2.9378011226654053, + "learning_rate": 3.9865693851836955e-06, + "loss": 0.5845, + "step": 3843 + }, + { + "epoch": 1.8174940898345153, + "grad_norm": 2.4061124324798584, + "learning_rate": 3.98606778130719e-06, + "loss": 0.4333, + "step": 3844 + }, + { + "epoch": 1.8179669030732861, + "grad_norm": 2.483489751815796, + "learning_rate": 3.985566084899759e-06, + "loss": 0.4827, + "step": 3845 + }, + { + "epoch": 1.8184397163120567, + "grad_norm": 2.7774932384490967, + "learning_rate": 3.985064295992642e-06, + "loss": 
0.5016, + "step": 3846 + }, + { + "epoch": 1.8189125295508273, + "grad_norm": 2.5936765670776367, + "learning_rate": 3.984562414617083e-06, + "loss": 0.4448, + "step": 3847 + }, + { + "epoch": 1.819385342789598, + "grad_norm": 2.8608627319335938, + "learning_rate": 3.9840604408043325e-06, + "loss": 0.5735, + "step": 3848 + }, + { + "epoch": 1.8198581560283689, + "grad_norm": 2.6212472915649414, + "learning_rate": 3.983558374585646e-06, + "loss": 0.5091, + "step": 3849 + }, + { + "epoch": 1.8203309692671394, + "grad_norm": 2.832460641860962, + "learning_rate": 3.983056215992284e-06, + "loss": 0.5169, + "step": 3850 + }, + { + "epoch": 1.82080378250591, + "grad_norm": 2.5293610095977783, + "learning_rate": 3.982553965055514e-06, + "loss": 0.4708, + "step": 3851 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 2.9362871646881104, + "learning_rate": 3.982051621806611e-06, + "loss": 0.575, + "step": 3852 + }, + { + "epoch": 1.8217494089834516, + "grad_norm": 2.69073486328125, + "learning_rate": 3.98154918627685e-06, + "loss": 0.5278, + "step": 3853 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 2.6711034774780273, + "learning_rate": 3.98104665849752e-06, + "loss": 0.4918, + "step": 3854 + }, + { + "epoch": 1.8226950354609928, + "grad_norm": 2.571110963821411, + "learning_rate": 3.980544038499907e-06, + "loss": 0.5234, + "step": 3855 + }, + { + "epoch": 1.8231678486997636, + "grad_norm": 3.2603371143341064, + "learning_rate": 3.980041326315309e-06, + "loss": 0.5996, + "step": 3856 + }, + { + "epoch": 1.8236406619385344, + "grad_norm": 2.8472323417663574, + "learning_rate": 3.979538521975028e-06, + "loss": 0.4769, + "step": 3857 + }, + { + "epoch": 1.824113475177305, + "grad_norm": 2.6714751720428467, + "learning_rate": 3.979035625510371e-06, + "loss": 0.4826, + "step": 3858 + }, + { + "epoch": 1.8245862884160755, + "grad_norm": 2.6816468238830566, + "learning_rate": 3.97853263695265e-06, + "loss": 0.5127, + "step": 3859 + }, + { + "epoch": 1.8250591016548463, + "grad_norm": 2.6464123725891113, + "learning_rate": 3.978029556333185e-06, + "loss": 0.4925, + "step": 3860 + }, + { + "epoch": 1.825531914893617, + "grad_norm": 2.5317227840423584, + "learning_rate": 3.977526383683301e-06, + "loss": 0.4765, + "step": 3861 + }, + { + "epoch": 1.8260047281323877, + "grad_norm": 2.5052425861358643, + "learning_rate": 3.977023119034328e-06, + "loss": 0.4804, + "step": 3862 + }, + { + "epoch": 1.8264775413711583, + "grad_norm": 2.7022836208343506, + "learning_rate": 3.976519762417602e-06, + "loss": 0.4824, + "step": 3863 + }, + { + "epoch": 1.826950354609929, + "grad_norm": 2.7445900440216064, + "learning_rate": 3.976016313864464e-06, + "loss": 0.5698, + "step": 3864 + }, + { + "epoch": 1.8274231678486998, + "grad_norm": 2.442518711090088, + "learning_rate": 3.975512773406262e-06, + "loss": 0.5133, + "step": 3865 + }, + { + "epoch": 1.8278959810874704, + "grad_norm": 2.4100050926208496, + "learning_rate": 3.975009141074351e-06, + "loss": 0.5044, + "step": 3866 + }, + { + "epoch": 1.828368794326241, + "grad_norm": 2.9507648944854736, + "learning_rate": 3.974505416900088e-06, + "loss": 0.5367, + "step": 3867 + }, + { + "epoch": 1.8288416075650118, + "grad_norm": 2.5662600994110107, + "learning_rate": 3.974001600914837e-06, + "loss": 0.5878, + "step": 3868 + }, + { + "epoch": 1.8293144208037826, + "grad_norm": 2.4306657314300537, + "learning_rate": 3.973497693149971e-06, + "loss": 0.4647, + "step": 3869 + }, + { + "epoch": 1.8297872340425532, + "grad_norm": 2.974686622619629, + "learning_rate": 
3.972993693636864e-06, + "loss": 0.4911, + "step": 3870 + }, + { + "epoch": 1.8302600472813237, + "grad_norm": 2.5711987018585205, + "learning_rate": 3.972489602406899e-06, + "loss": 0.5089, + "step": 3871 + }, + { + "epoch": 1.8307328605200945, + "grad_norm": 3.259617328643799, + "learning_rate": 3.971985419491463e-06, + "loss": 0.5966, + "step": 3872 + }, + { + "epoch": 1.8312056737588653, + "grad_norm": 2.7437000274658203, + "learning_rate": 3.971481144921949e-06, + "loss": 0.5097, + "step": 3873 + }, + { + "epoch": 1.831678486997636, + "grad_norm": 2.9597461223602295, + "learning_rate": 3.970976778729757e-06, + "loss": 0.5672, + "step": 3874 + }, + { + "epoch": 1.8321513002364065, + "grad_norm": 2.5775723457336426, + "learning_rate": 3.970472320946291e-06, + "loss": 0.4749, + "step": 3875 + }, + { + "epoch": 1.8326241134751773, + "grad_norm": 2.7381200790405273, + "learning_rate": 3.969967771602961e-06, + "loss": 0.5255, + "step": 3876 + }, + { + "epoch": 1.833096926713948, + "grad_norm": 2.651698350906372, + "learning_rate": 3.969463130731183e-06, + "loss": 0.5098, + "step": 3877 + }, + { + "epoch": 1.8335697399527187, + "grad_norm": 2.7277021408081055, + "learning_rate": 3.968958398362381e-06, + "loss": 0.5251, + "step": 3878 + }, + { + "epoch": 1.8340425531914892, + "grad_norm": 2.5184953212738037, + "learning_rate": 3.968453574527978e-06, + "loss": 0.5086, + "step": 3879 + }, + { + "epoch": 1.83451536643026, + "grad_norm": 2.8227882385253906, + "learning_rate": 3.967948659259412e-06, + "loss": 0.5742, + "step": 3880 + }, + { + "epoch": 1.8349881796690308, + "grad_norm": 2.547922134399414, + "learning_rate": 3.967443652588119e-06, + "loss": 0.5411, + "step": 3881 + }, + { + "epoch": 1.8354609929078014, + "grad_norm": 2.6572835445404053, + "learning_rate": 3.966938554545545e-06, + "loss": 0.4854, + "step": 3882 + }, + { + "epoch": 1.835933806146572, + "grad_norm": 2.9416658878326416, + "learning_rate": 3.966433365163139e-06, + "loss": 0.5236, + "step": 3883 + }, + { + "epoch": 1.8364066193853428, + "grad_norm": 2.344325304031372, + "learning_rate": 3.965928084472357e-06, + "loss": 0.4916, + "step": 3884 + }, + { + "epoch": 1.8368794326241136, + "grad_norm": 2.890418291091919, + "learning_rate": 3.965422712504662e-06, + "loss": 0.5287, + "step": 3885 + }, + { + "epoch": 1.8373522458628841, + "grad_norm": 2.6063363552093506, + "learning_rate": 3.96491724929152e-06, + "loss": 0.4842, + "step": 3886 + }, + { + "epoch": 1.8378250591016547, + "grad_norm": 2.5582427978515625, + "learning_rate": 3.964411694864404e-06, + "loss": 0.4768, + "step": 3887 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 2.84356951713562, + "learning_rate": 3.963906049254793e-06, + "loss": 0.5284, + "step": 3888 + }, + { + "epoch": 1.8387706855791963, + "grad_norm": 2.7048516273498535, + "learning_rate": 3.963400312494172e-06, + "loss": 0.5271, + "step": 3889 + }, + { + "epoch": 1.839243498817967, + "grad_norm": 2.5401699542999268, + "learning_rate": 3.962894484614031e-06, + "loss": 0.4734, + "step": 3890 + }, + { + "epoch": 1.8397163120567375, + "grad_norm": 2.208256244659424, + "learning_rate": 3.962388565645864e-06, + "loss": 0.4113, + "step": 3891 + }, + { + "epoch": 1.8401891252955083, + "grad_norm": 2.775139331817627, + "learning_rate": 3.961882555621173e-06, + "loss": 0.5172, + "step": 3892 + }, + { + "epoch": 1.840661938534279, + "grad_norm": 2.7540855407714844, + "learning_rate": 3.961376454571466e-06, + "loss": 0.5252, + "step": 3893 + }, + { + "epoch": 1.8411347517730496, + "grad_norm": 
2.6731574535369873, + "learning_rate": 3.960870262528255e-06, + "loss": 0.4495, + "step": 3894 + }, + { + "epoch": 1.8416075650118202, + "grad_norm": 2.791492223739624, + "learning_rate": 3.960363979523058e-06, + "loss": 0.5457, + "step": 3895 + }, + { + "epoch": 1.842080378250591, + "grad_norm": 2.9280290603637695, + "learning_rate": 3.959857605587401e-06, + "loss": 0.5373, + "step": 3896 + }, + { + "epoch": 1.8425531914893618, + "grad_norm": 2.5652217864990234, + "learning_rate": 3.95935114075281e-06, + "loss": 0.5191, + "step": 3897 + }, + { + "epoch": 1.8430260047281324, + "grad_norm": 2.7297749519348145, + "learning_rate": 3.958844585050824e-06, + "loss": 0.5366, + "step": 3898 + }, + { + "epoch": 1.843498817966903, + "grad_norm": 2.5302982330322266, + "learning_rate": 3.958337938512983e-06, + "loss": 0.569, + "step": 3899 + }, + { + "epoch": 1.8439716312056738, + "grad_norm": 2.644777297973633, + "learning_rate": 3.957831201170832e-06, + "loss": 0.521, + "step": 3900 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 2.8375515937805176, + "learning_rate": 3.957324373055925e-06, + "loss": 0.573, + "step": 3901 + }, + { + "epoch": 1.8449172576832151, + "grad_norm": 2.512296676635742, + "learning_rate": 3.956817454199819e-06, + "loss": 0.5081, + "step": 3902 + }, + { + "epoch": 1.8453900709219857, + "grad_norm": 2.3662109375, + "learning_rate": 3.956310444634079e-06, + "loss": 0.4989, + "step": 3903 + }, + { + "epoch": 1.8458628841607565, + "grad_norm": 2.6849682331085205, + "learning_rate": 3.955803344390272e-06, + "loss": 0.5459, + "step": 3904 + }, + { + "epoch": 1.8463356973995273, + "grad_norm": 2.8364317417144775, + "learning_rate": 3.9552961534999756e-06, + "loss": 0.5704, + "step": 3905 + }, + { + "epoch": 1.8468085106382979, + "grad_norm": 2.6006948947906494, + "learning_rate": 3.954788871994768e-06, + "loss": 0.5696, + "step": 3906 + }, + { + "epoch": 1.8472813238770684, + "grad_norm": 2.558300018310547, + "learning_rate": 3.9542814999062375e-06, + "loss": 0.5047, + "step": 3907 + }, + { + "epoch": 1.8477541371158392, + "grad_norm": 2.6343321800231934, + "learning_rate": 3.953774037265974e-06, + "loss": 0.525, + "step": 3908 + }, + { + "epoch": 1.84822695035461, + "grad_norm": 2.5050008296966553, + "learning_rate": 3.953266484105576e-06, + "loss": 0.4867, + "step": 3909 + }, + { + "epoch": 1.8486997635933806, + "grad_norm": 2.3775103092193604, + "learning_rate": 3.952758840456647e-06, + "loss": 0.4349, + "step": 3910 + }, + { + "epoch": 1.8491725768321512, + "grad_norm": 2.508376359939575, + "learning_rate": 3.952251106350794e-06, + "loss": 0.539, + "step": 3911 + }, + { + "epoch": 1.849645390070922, + "grad_norm": 2.7403106689453125, + "learning_rate": 3.951743281819633e-06, + "loss": 0.4478, + "step": 3912 + }, + { + "epoch": 1.8501182033096928, + "grad_norm": 2.5332062244415283, + "learning_rate": 3.951235366894784e-06, + "loss": 0.4658, + "step": 3913 + }, + { + "epoch": 1.8505910165484634, + "grad_norm": 3.0137248039245605, + "learning_rate": 3.950727361607872e-06, + "loss": 0.5047, + "step": 3914 + }, + { + "epoch": 1.851063829787234, + "grad_norm": 2.5820653438568115, + "learning_rate": 3.950219265990528e-06, + "loss": 0.542, + "step": 3915 + }, + { + "epoch": 1.8515366430260047, + "grad_norm": 2.555133819580078, + "learning_rate": 3.949711080074389e-06, + "loss": 0.5253, + "step": 3916 + }, + { + "epoch": 1.8520094562647755, + "grad_norm": 2.876882791519165, + "learning_rate": 3.949202803891099e-06, + "loss": 0.5242, + "step": 3917 + }, + { + "epoch": 
1.852482269503546, + "grad_norm": 2.5929203033447266, + "learning_rate": 3.948694437472305e-06, + "loss": 0.5358, + "step": 3918 + }, + { + "epoch": 1.8529550827423167, + "grad_norm": 2.468513250350952, + "learning_rate": 3.948185980849659e-06, + "loss": 0.5119, + "step": 3919 + }, + { + "epoch": 1.8534278959810875, + "grad_norm": 2.9259560108184814, + "learning_rate": 3.947677434054824e-06, + "loss": 0.4756, + "step": 3920 + }, + { + "epoch": 1.8539007092198583, + "grad_norm": 2.5247011184692383, + "learning_rate": 3.947168797119462e-06, + "loss": 0.4627, + "step": 3921 + }, + { + "epoch": 1.8543735224586289, + "grad_norm": 2.7396671772003174, + "learning_rate": 3.946660070075245e-06, + "loss": 0.5013, + "step": 3922 + }, + { + "epoch": 1.8548463356973994, + "grad_norm": 2.7059738636016846, + "learning_rate": 3.946151252953849e-06, + "loss": 0.5875, + "step": 3923 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 2.5638437271118164, + "learning_rate": 3.945642345786955e-06, + "loss": 0.5063, + "step": 3924 + }, + { + "epoch": 1.855791962174941, + "grad_norm": 2.6647839546203613, + "learning_rate": 3.945133348606251e-06, + "loss": 0.5421, + "step": 3925 + }, + { + "epoch": 1.8562647754137116, + "grad_norm": 3.7235286235809326, + "learning_rate": 3.944624261443431e-06, + "loss": 0.5958, + "step": 3926 + }, + { + "epoch": 1.8567375886524822, + "grad_norm": 2.769984245300293, + "learning_rate": 3.944115084330192e-06, + "loss": 0.5678, + "step": 3927 + }, + { + "epoch": 1.857210401891253, + "grad_norm": 2.567249059677124, + "learning_rate": 3.9436058172982395e-06, + "loss": 0.4767, + "step": 3928 + }, + { + "epoch": 1.8576832151300238, + "grad_norm": 2.6196048259735107, + "learning_rate": 3.943096460379283e-06, + "loss": 0.5345, + "step": 3929 + }, + { + "epoch": 1.8581560283687943, + "grad_norm": 2.5999555587768555, + "learning_rate": 3.942587013605037e-06, + "loss": 0.5482, + "step": 3930 + }, + { + "epoch": 1.858628841607565, + "grad_norm": 2.630387783050537, + "learning_rate": 3.942077477007224e-06, + "loss": 0.6023, + "step": 3931 + }, + { + "epoch": 1.8591016548463357, + "grad_norm": 2.543503761291504, + "learning_rate": 3.941567850617569e-06, + "loss": 0.5157, + "step": 3932 + }, + { + "epoch": 1.8595744680851065, + "grad_norm": 2.5109236240386963, + "learning_rate": 3.941058134467805e-06, + "loss": 0.4774, + "step": 3933 + }, + { + "epoch": 1.860047281323877, + "grad_norm": 2.5110230445861816, + "learning_rate": 3.94054832858967e-06, + "loss": 0.5064, + "step": 3934 + }, + { + "epoch": 1.8605200945626477, + "grad_norm": 2.4780776500701904, + "learning_rate": 3.940038433014908e-06, + "loss": 0.5216, + "step": 3935 + }, + { + "epoch": 1.8609929078014185, + "grad_norm": 2.4398856163024902, + "learning_rate": 3.939528447775266e-06, + "loss": 0.4958, + "step": 3936 + }, + { + "epoch": 1.8614657210401893, + "grad_norm": 2.449498176574707, + "learning_rate": 3.9390183729025e-06, + "loss": 0.5165, + "step": 3937 + }, + { + "epoch": 1.8619385342789598, + "grad_norm": 2.982544422149658, + "learning_rate": 3.938508208428371e-06, + "loss": 0.4803, + "step": 3938 + }, + { + "epoch": 1.8624113475177304, + "grad_norm": 2.6574015617370605, + "learning_rate": 3.937997954384641e-06, + "loss": 0.4797, + "step": 3939 + }, + { + "epoch": 1.8628841607565012, + "grad_norm": 2.7773542404174805, + "learning_rate": 3.937487610803086e-06, + "loss": 0.4843, + "step": 3940 + }, + { + "epoch": 1.863356973995272, + "grad_norm": 2.588937759399414, + "learning_rate": 3.9369771777154805e-06, + "loss": 0.5426, + 
"step": 3941 + }, + { + "epoch": 1.8638297872340426, + "grad_norm": 2.855442523956299, + "learning_rate": 3.936466655153607e-06, + "loss": 0.5443, + "step": 3942 + }, + { + "epoch": 1.8643026004728132, + "grad_norm": 2.554676055908203, + "learning_rate": 3.935956043149253e-06, + "loss": 0.5334, + "step": 3943 + }, + { + "epoch": 1.864775413711584, + "grad_norm": 2.901599884033203, + "learning_rate": 3.935445341734212e-06, + "loss": 0.5842, + "step": 3944 + }, + { + "epoch": 1.8652482269503547, + "grad_norm": 2.554485321044922, + "learning_rate": 3.934934550940285e-06, + "loss": 0.4941, + "step": 3945 + }, + { + "epoch": 1.8657210401891253, + "grad_norm": 2.357203245162964, + "learning_rate": 3.934423670799275e-06, + "loss": 0.4402, + "step": 3946 + }, + { + "epoch": 1.866193853427896, + "grad_norm": 2.7036049365997314, + "learning_rate": 3.933912701342993e-06, + "loss": 0.4966, + "step": 3947 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 2.7817211151123047, + "learning_rate": 3.933401642603255e-06, + "loss": 0.4908, + "step": 3948 + }, + { + "epoch": 1.8671394799054375, + "grad_norm": 2.439490795135498, + "learning_rate": 3.932890494611882e-06, + "loss": 0.4322, + "step": 3949 + }, + { + "epoch": 1.867612293144208, + "grad_norm": 3.187152147293091, + "learning_rate": 3.9323792574007e-06, + "loss": 0.501, + "step": 3950 + }, + { + "epoch": 1.8680851063829786, + "grad_norm": 2.405773401260376, + "learning_rate": 3.931867931001543e-06, + "loss": 0.4477, + "step": 3951 + }, + { + "epoch": 1.8685579196217494, + "grad_norm": 2.4922525882720947, + "learning_rate": 3.931356515446248e-06, + "loss": 0.5098, + "step": 3952 + }, + { + "epoch": 1.8690307328605202, + "grad_norm": 2.7781267166137695, + "learning_rate": 3.93084501076666e-06, + "loss": 0.5815, + "step": 3953 + }, + { + "epoch": 1.8695035460992908, + "grad_norm": 2.74621844291687, + "learning_rate": 3.930333416994626e-06, + "loss": 0.5605, + "step": 3954 + }, + { + "epoch": 1.8699763593380614, + "grad_norm": 2.5527689456939697, + "learning_rate": 3.929821734162004e-06, + "loss": 0.5141, + "step": 3955 + }, + { + "epoch": 1.8704491725768322, + "grad_norm": 2.5730628967285156, + "learning_rate": 3.92930996230065e-06, + "loss": 0.5446, + "step": 3956 + }, + { + "epoch": 1.870921985815603, + "grad_norm": 2.7053353786468506, + "learning_rate": 3.9287981014424334e-06, + "loss": 0.4722, + "step": 3957 + }, + { + "epoch": 1.8713947990543736, + "grad_norm": 2.7591893672943115, + "learning_rate": 3.928286151619224e-06, + "loss": 0.509, + "step": 3958 + }, + { + "epoch": 1.8718676122931441, + "grad_norm": 2.6233739852905273, + "learning_rate": 3.927774112862898e-06, + "loss": 0.5266, + "step": 3959 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 2.7715370655059814, + "learning_rate": 3.9272619852053396e-06, + "loss": 0.5612, + "step": 3960 + }, + { + "epoch": 1.8728132387706857, + "grad_norm": 2.4815211296081543, + "learning_rate": 3.926749768678435e-06, + "loss": 0.5498, + "step": 3961 + }, + { + "epoch": 1.8732860520094563, + "grad_norm": 2.6819605827331543, + "learning_rate": 3.926237463314078e-06, + "loss": 0.5499, + "step": 3962 + }, + { + "epoch": 1.8737588652482269, + "grad_norm": 2.638664722442627, + "learning_rate": 3.925725069144168e-06, + "loss": 0.5429, + "step": 3963 + }, + { + "epoch": 1.8742316784869977, + "grad_norm": 2.527294874191284, + "learning_rate": 3.925212586200611e-06, + "loss": 0.5451, + "step": 3964 + }, + { + "epoch": 1.8747044917257685, + "grad_norm": 2.831638813018799, + "learning_rate": 
3.924700014515315e-06, + "loss": 0.5276, + "step": 3965 + }, + { + "epoch": 1.875177304964539, + "grad_norm": 2.5906996726989746, + "learning_rate": 3.924187354120196e-06, + "loss": 0.5323, + "step": 3966 + }, + { + "epoch": 1.8756501182033096, + "grad_norm": 2.5482442378997803, + "learning_rate": 3.923674605047175e-06, + "loss": 0.4882, + "step": 3967 + }, + { + "epoch": 1.8761229314420804, + "grad_norm": 2.56402850151062, + "learning_rate": 3.923161767328179e-06, + "loss": 0.5111, + "step": 3968 + }, + { + "epoch": 1.8765957446808512, + "grad_norm": 3.223782539367676, + "learning_rate": 3.9226488409951405e-06, + "loss": 0.5829, + "step": 3969 + }, + { + "epoch": 1.8770685579196218, + "grad_norm": 2.665964365005493, + "learning_rate": 3.922135826079997e-06, + "loss": 0.4739, + "step": 3970 + }, + { + "epoch": 1.8775413711583924, + "grad_norm": 2.602696418762207, + "learning_rate": 3.921622722614691e-06, + "loss": 0.5199, + "step": 3971 + }, + { + "epoch": 1.8780141843971632, + "grad_norm": 2.5384418964385986, + "learning_rate": 3.921109530631172e-06, + "loss": 0.5086, + "step": 3972 + }, + { + "epoch": 1.878486997635934, + "grad_norm": 2.7961080074310303, + "learning_rate": 3.920596250161394e-06, + "loss": 0.5454, + "step": 3973 + }, + { + "epoch": 1.8789598108747045, + "grad_norm": 3.022007465362549, + "learning_rate": 3.920082881237317e-06, + "loss": 0.5537, + "step": 3974 + }, + { + "epoch": 1.8794326241134751, + "grad_norm": 2.699885129928589, + "learning_rate": 3.9195694238909045e-06, + "loss": 0.5274, + "step": 3975 + }, + { + "epoch": 1.879905437352246, + "grad_norm": 2.3994593620300293, + "learning_rate": 3.919055878154129e-06, + "loss": 0.4134, + "step": 3976 + }, + { + "epoch": 1.8803782505910167, + "grad_norm": 4.093045711517334, + "learning_rate": 3.918542244058967e-06, + "loss": 0.5305, + "step": 3977 + }, + { + "epoch": 1.8808510638297873, + "grad_norm": 3.011643171310425, + "learning_rate": 3.9180285216374e-06, + "loss": 0.5481, + "step": 3978 + }, + { + "epoch": 1.8813238770685579, + "grad_norm": 2.6426854133605957, + "learning_rate": 3.917514710921414e-06, + "loss": 0.5415, + "step": 3979 + }, + { + "epoch": 1.8817966903073287, + "grad_norm": 2.4379019737243652, + "learning_rate": 3.917000811943002e-06, + "loss": 0.4566, + "step": 3980 + }, + { + "epoch": 1.8822695035460995, + "grad_norm": 3.18522047996521, + "learning_rate": 3.9164868247341634e-06, + "loss": 0.6079, + "step": 3981 + }, + { + "epoch": 1.88274231678487, + "grad_norm": 2.6451141834259033, + "learning_rate": 3.915972749326903e-06, + "loss": 0.515, + "step": 3982 + }, + { + "epoch": 1.8832151300236406, + "grad_norm": 2.565598726272583, + "learning_rate": 3.915458585753226e-06, + "loss": 0.4799, + "step": 3983 + }, + { + "epoch": 1.8836879432624114, + "grad_norm": 2.711651563644409, + "learning_rate": 3.91494433404515e-06, + "loss": 0.5595, + "step": 3984 + }, + { + "epoch": 1.8841607565011822, + "grad_norm": 2.749328851699829, + "learning_rate": 3.914429994234695e-06, + "loss": 0.495, + "step": 3985 + }, + { + "epoch": 1.8846335697399526, + "grad_norm": 2.9492287635803223, + "learning_rate": 3.913915566353886e-06, + "loss": 0.5683, + "step": 3986 + }, + { + "epoch": 1.8851063829787233, + "grad_norm": 3.07747745513916, + "learning_rate": 3.913401050434756e-06, + "loss": 0.4953, + "step": 3987 + }, + { + "epoch": 1.8855791962174941, + "grad_norm": 2.8746345043182373, + "learning_rate": 3.912886446509338e-06, + "loss": 0.4752, + "step": 3988 + }, + { + "epoch": 1.8860520094562647, + "grad_norm": 
2.772954225540161, + "learning_rate": 3.912371754609677e-06, + "loss": 0.5473, + "step": 3989 + }, + { + "epoch": 1.8865248226950353, + "grad_norm": 2.8906044960021973, + "learning_rate": 3.911856974767821e-06, + "loss": 0.5285, + "step": 3990 + }, + { + "epoch": 1.886997635933806, + "grad_norm": 2.8992726802825928, + "learning_rate": 3.9113421070158206e-06, + "loss": 0.571, + "step": 3991 + }, + { + "epoch": 1.887470449172577, + "grad_norm": 2.624662160873413, + "learning_rate": 3.910827151385737e-06, + "loss": 0.5183, + "step": 3992 + }, + { + "epoch": 1.8879432624113475, + "grad_norm": 2.4491732120513916, + "learning_rate": 3.910312107909632e-06, + "loss": 0.4205, + "step": 3993 + }, + { + "epoch": 1.888416075650118, + "grad_norm": 2.278259515762329, + "learning_rate": 3.909796976619575e-06, + "loss": 0.4464, + "step": 3994 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 2.6481523513793945, + "learning_rate": 3.909281757547644e-06, + "loss": 0.5023, + "step": 3995 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 2.6687493324279785, + "learning_rate": 3.908766450725917e-06, + "loss": 0.495, + "step": 3996 + }, + { + "epoch": 1.8898345153664302, + "grad_norm": 2.507525682449341, + "learning_rate": 3.908251056186481e-06, + "loss": 0.4155, + "step": 3997 + }, + { + "epoch": 1.8903073286052008, + "grad_norm": 2.7048323154449463, + "learning_rate": 3.907735573961426e-06, + "loss": 0.4601, + "step": 3998 + }, + { + "epoch": 1.8907801418439716, + "grad_norm": 2.6825389862060547, + "learning_rate": 3.907220004082848e-06, + "loss": 0.5067, + "step": 3999 + }, + { + "epoch": 1.8912529550827424, + "grad_norm": 2.775696039199829, + "learning_rate": 3.906704346582852e-06, + "loss": 0.5411, + "step": 4000 + }, + { + "epoch": 1.891725768321513, + "grad_norm": 2.4492077827453613, + "learning_rate": 3.906188601493545e-06, + "loss": 0.4931, + "step": 4001 + }, + { + "epoch": 1.8921985815602835, + "grad_norm": 2.320810556411743, + "learning_rate": 3.905672768847041e-06, + "loss": 0.4908, + "step": 4002 + }, + { + "epoch": 1.8926713947990543, + "grad_norm": 2.455162525177002, + "learning_rate": 3.905156848675455e-06, + "loss": 0.508, + "step": 4003 + }, + { + "epoch": 1.8931442080378251, + "grad_norm": 2.515921115875244, + "learning_rate": 3.904640841010915e-06, + "loss": 0.5318, + "step": 4004 + }, + { + "epoch": 1.8936170212765957, + "grad_norm": 2.7230770587921143, + "learning_rate": 3.904124745885548e-06, + "loss": 0.4793, + "step": 4005 + }, + { + "epoch": 1.8940898345153663, + "grad_norm": 2.519934892654419, + "learning_rate": 3.903608563331491e-06, + "loss": 0.5013, + "step": 4006 + }, + { + "epoch": 1.894562647754137, + "grad_norm": 2.719674587249756, + "learning_rate": 3.903092293380883e-06, + "loss": 0.516, + "step": 4007 + }, + { + "epoch": 1.8950354609929079, + "grad_norm": 3.2107343673706055, + "learning_rate": 3.902575936065869e-06, + "loss": 0.6297, + "step": 4008 + }, + { + "epoch": 1.8955082742316784, + "grad_norm": 2.9773149490356445, + "learning_rate": 3.902059491418603e-06, + "loss": 0.566, + "step": 4009 + }, + { + "epoch": 1.895981087470449, + "grad_norm": 2.6754770278930664, + "learning_rate": 3.90154295947124e-06, + "loss": 0.5187, + "step": 4010 + }, + { + "epoch": 1.8964539007092198, + "grad_norm": 2.457303762435913, + "learning_rate": 3.901026340255943e-06, + "loss": 0.5757, + "step": 4011 + }, + { + "epoch": 1.8969267139479906, + "grad_norm": 2.5944161415100098, + "learning_rate": 3.900509633804878e-06, + "loss": 0.5049, + "step": 4012 + }, + { + "epoch": 
1.8973995271867612, + "grad_norm": 2.610445022583008, + "learning_rate": 3.89999284015022e-06, + "loss": 0.521, + "step": 4013 + }, + { + "epoch": 1.8978723404255318, + "grad_norm": 2.6949338912963867, + "learning_rate": 3.899475959324146e-06, + "loss": 0.5619, + "step": 4014 + }, + { + "epoch": 1.8983451536643026, + "grad_norm": 2.7889559268951416, + "learning_rate": 3.898958991358841e-06, + "loss": 0.5223, + "step": 4015 + }, + { + "epoch": 1.8988179669030734, + "grad_norm": 2.569265842437744, + "learning_rate": 3.898441936286493e-06, + "loss": 0.5724, + "step": 4016 + }, + { + "epoch": 1.899290780141844, + "grad_norm": 2.3567774295806885, + "learning_rate": 3.897924794139299e-06, + "loss": 0.4784, + "step": 4017 + }, + { + "epoch": 1.8997635933806145, + "grad_norm": 2.9176526069641113, + "learning_rate": 3.897407564949457e-06, + "loss": 0.646, + "step": 4018 + }, + { + "epoch": 1.9002364066193853, + "grad_norm": 2.7870090007781982, + "learning_rate": 3.896890248749174e-06, + "loss": 0.4922, + "step": 4019 + }, + { + "epoch": 1.900709219858156, + "grad_norm": 2.8310980796813965, + "learning_rate": 3.89637284557066e-06, + "loss": 0.4746, + "step": 4020 + }, + { + "epoch": 1.9011820330969267, + "grad_norm": 2.434915542602539, + "learning_rate": 3.895855355446131e-06, + "loss": 0.4537, + "step": 4021 + }, + { + "epoch": 1.9016548463356973, + "grad_norm": 3.0547034740448, + "learning_rate": 3.89533777840781e-06, + "loss": 0.6161, + "step": 4022 + }, + { + "epoch": 1.902127659574468, + "grad_norm": 3.416774272918701, + "learning_rate": 3.894820114487925e-06, + "loss": 0.5448, + "step": 4023 + }, + { + "epoch": 1.9026004728132389, + "grad_norm": 2.606951951980591, + "learning_rate": 3.894302363718707e-06, + "loss": 0.5501, + "step": 4024 + }, + { + "epoch": 1.9030732860520094, + "grad_norm": 3.082165002822876, + "learning_rate": 3.8937845261323945e-06, + "loss": 0.6035, + "step": 4025 + }, + { + "epoch": 1.90354609929078, + "grad_norm": 2.616093397140503, + "learning_rate": 3.893266601761231e-06, + "loss": 0.5294, + "step": 4026 + }, + { + "epoch": 1.9040189125295508, + "grad_norm": 2.7141637802124023, + "learning_rate": 3.8927485906374654e-06, + "loss": 0.5481, + "step": 4027 + }, + { + "epoch": 1.9044917257683216, + "grad_norm": 2.5129404067993164, + "learning_rate": 3.892230492793352e-06, + "loss": 0.4958, + "step": 4028 + }, + { + "epoch": 1.9049645390070922, + "grad_norm": 2.703403949737549, + "learning_rate": 3.891712308261151e-06, + "loss": 0.4852, + "step": 4029 + }, + { + "epoch": 1.9054373522458627, + "grad_norm": 2.881058931350708, + "learning_rate": 3.891194037073127e-06, + "loss": 0.4662, + "step": 4030 + }, + { + "epoch": 1.9059101654846335, + "grad_norm": 3.216769218444824, + "learning_rate": 3.8906756792615505e-06, + "loss": 0.5076, + "step": 4031 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 2.442265748977661, + "learning_rate": 3.890157234858697e-06, + "loss": 0.4748, + "step": 4032 + }, + { + "epoch": 1.906855791962175, + "grad_norm": 3.088672399520874, + "learning_rate": 3.889638703896849e-06, + "loss": 0.5729, + "step": 4033 + }, + { + "epoch": 1.9073286052009455, + "grad_norm": 2.9304986000061035, + "learning_rate": 3.889120086408291e-06, + "loss": 0.603, + "step": 4034 + }, + { + "epoch": 1.9078014184397163, + "grad_norm": 2.686093807220459, + "learning_rate": 3.888601382425318e-06, + "loss": 0.4978, + "step": 4035 + }, + { + "epoch": 1.908274231678487, + "grad_norm": 2.5668389797210693, + "learning_rate": 3.888082591980225e-06, + "loss": 0.5086, + "step": 4036 
+ }, + { + "epoch": 1.9087470449172577, + "grad_norm": 2.530996561050415, + "learning_rate": 3.887563715105315e-06, + "loss": 0.4678, + "step": 4037 + }, + { + "epoch": 1.9092198581560282, + "grad_norm": 3.043342351913452, + "learning_rate": 3.887044751832897e-06, + "loss": 0.5452, + "step": 4038 + }, + { + "epoch": 1.909692671394799, + "grad_norm": 2.799734115600586, + "learning_rate": 3.886525702195284e-06, + "loss": 0.5265, + "step": 4039 + }, + { + "epoch": 1.9101654846335698, + "grad_norm": 2.890022039413452, + "learning_rate": 3.886006566224796e-06, + "loss": 0.4634, + "step": 4040 + }, + { + "epoch": 1.9106382978723404, + "grad_norm": 2.6804237365722656, + "learning_rate": 3.8854873439537555e-06, + "loss": 0.5031, + "step": 4041 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 2.43038272857666, + "learning_rate": 3.884968035414495e-06, + "loss": 0.5098, + "step": 4042 + }, + { + "epoch": 1.9115839243498818, + "grad_norm": 2.589583396911621, + "learning_rate": 3.884448640639346e-06, + "loss": 0.498, + "step": 4043 + }, + { + "epoch": 1.9120567375886526, + "grad_norm": 2.4565231800079346, + "learning_rate": 3.8839291596606524e-06, + "loss": 0.4318, + "step": 4044 + }, + { + "epoch": 1.9125295508274232, + "grad_norm": 2.66762638092041, + "learning_rate": 3.8834095925107575e-06, + "loss": 0.5441, + "step": 4045 + }, + { + "epoch": 1.9130023640661937, + "grad_norm": 2.7334461212158203, + "learning_rate": 3.882889939222013e-06, + "loss": 0.5209, + "step": 4046 + }, + { + "epoch": 1.9134751773049645, + "grad_norm": 2.6398537158966064, + "learning_rate": 3.8823701998267765e-06, + "loss": 0.4874, + "step": 4047 + }, + { + "epoch": 1.9139479905437353, + "grad_norm": 2.82405161857605, + "learning_rate": 3.881850374357409e-06, + "loss": 0.4519, + "step": 4048 + }, + { + "epoch": 1.914420803782506, + "grad_norm": 2.7552523612976074, + "learning_rate": 3.8813304628462776e-06, + "loss": 0.547, + "step": 4049 + }, + { + "epoch": 1.9148936170212765, + "grad_norm": 2.5287928581237793, + "learning_rate": 3.880810465325755e-06, + "loss": 0.5226, + "step": 4050 + }, + { + "epoch": 1.9153664302600473, + "grad_norm": 2.7597358226776123, + "learning_rate": 3.88029038182822e-06, + "loss": 0.5171, + "step": 4051 + }, + { + "epoch": 1.915839243498818, + "grad_norm": 2.563899278640747, + "learning_rate": 3.879770212386055e-06, + "loss": 0.4911, + "step": 4052 + }, + { + "epoch": 1.9163120567375886, + "grad_norm": 2.499404191970825, + "learning_rate": 3.879249957031649e-06, + "loss": 0.5072, + "step": 4053 + }, + { + "epoch": 1.9167848699763592, + "grad_norm": 2.817713499069214, + "learning_rate": 3.878729615797396e-06, + "loss": 0.5452, + "step": 4054 + }, + { + "epoch": 1.91725768321513, + "grad_norm": 2.7152490615844727, + "learning_rate": 3.878209188715696e-06, + "loss": 0.4917, + "step": 4055 + }, + { + "epoch": 1.9177304964539008, + "grad_norm": 2.384265661239624, + "learning_rate": 3.877688675818953e-06, + "loss": 0.4823, + "step": 4056 + }, + { + "epoch": 1.9182033096926714, + "grad_norm": 2.61059308052063, + "learning_rate": 3.877168077139577e-06, + "loss": 0.478, + "step": 4057 + }, + { + "epoch": 1.918676122931442, + "grad_norm": 2.6107938289642334, + "learning_rate": 3.8766473927099824e-06, + "loss": 0.5202, + "step": 4058 + }, + { + "epoch": 1.9191489361702128, + "grad_norm": 2.2339766025543213, + "learning_rate": 3.876126622562592e-06, + "loss": 0.547, + "step": 4059 + }, + { + "epoch": 1.9196217494089836, + "grad_norm": 2.4324610233306885, + "learning_rate": 3.8756057667298304e-06, + "loss": 
0.5333, + "step": 4060 + }, + { + "epoch": 1.9200945626477541, + "grad_norm": 2.5521230697631836, + "learning_rate": 3.875084825244131e-06, + "loss": 0.5503, + "step": 4061 + }, + { + "epoch": 1.9205673758865247, + "grad_norm": 2.6985747814178467, + "learning_rate": 3.874563798137928e-06, + "loss": 0.4944, + "step": 4062 + }, + { + "epoch": 1.9210401891252955, + "grad_norm": 2.422332525253296, + "learning_rate": 3.874042685443664e-06, + "loss": 0.4807, + "step": 4063 + }, + { + "epoch": 1.9215130023640663, + "grad_norm": 2.914553165435791, + "learning_rate": 3.873521487193788e-06, + "loss": 0.4439, + "step": 4064 + }, + { + "epoch": 1.9219858156028369, + "grad_norm": 2.8098697662353516, + "learning_rate": 3.873000203420752e-06, + "loss": 0.5433, + "step": 4065 + }, + { + "epoch": 1.9224586288416075, + "grad_norm": 2.6124703884124756, + "learning_rate": 3.872478834157013e-06, + "loss": 0.4812, + "step": 4066 + }, + { + "epoch": 1.9229314420803783, + "grad_norm": 2.511059522628784, + "learning_rate": 3.871957379435035e-06, + "loss": 0.4666, + "step": 4067 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 2.950542688369751, + "learning_rate": 3.871435839287287e-06, + "loss": 0.5687, + "step": 4068 + }, + { + "epoch": 1.9238770685579196, + "grad_norm": 2.4969422817230225, + "learning_rate": 3.870914213746243e-06, + "loss": 0.5235, + "step": 4069 + }, + { + "epoch": 1.9243498817966902, + "grad_norm": 2.512152910232544, + "learning_rate": 3.870392502844382e-06, + "loss": 0.4524, + "step": 4070 + }, + { + "epoch": 1.924822695035461, + "grad_norm": 3.0212557315826416, + "learning_rate": 3.86987070661419e-06, + "loss": 0.4868, + "step": 4071 + }, + { + "epoch": 1.9252955082742318, + "grad_norm": 2.8949966430664062, + "learning_rate": 3.869348825088154e-06, + "loss": 0.5556, + "step": 4072 + }, + { + "epoch": 1.9257683215130024, + "grad_norm": 2.402043581008911, + "learning_rate": 3.868826858298772e-06, + "loss": 0.5307, + "step": 4073 + }, + { + "epoch": 1.926241134751773, + "grad_norm": 2.980992078781128, + "learning_rate": 3.868304806278543e-06, + "loss": 0.6313, + "step": 4074 + }, + { + "epoch": 1.9267139479905437, + "grad_norm": 2.7140514850616455, + "learning_rate": 3.867782669059975e-06, + "loss": 0.5359, + "step": 4075 + }, + { + "epoch": 1.9271867612293145, + "grad_norm": 2.499631643295288, + "learning_rate": 3.867260446675577e-06, + "loss": 0.4873, + "step": 4076 + }, + { + "epoch": 1.9276595744680851, + "grad_norm": 2.915583610534668, + "learning_rate": 3.866738139157866e-06, + "loss": 0.5736, + "step": 4077 + }, + { + "epoch": 1.9281323877068557, + "grad_norm": 2.4231131076812744, + "learning_rate": 3.866215746539363e-06, + "loss": 0.5096, + "step": 4078 + }, + { + "epoch": 1.9286052009456265, + "grad_norm": 2.360074996948242, + "learning_rate": 3.865693268852599e-06, + "loss": 0.4907, + "step": 4079 + }, + { + "epoch": 1.9290780141843973, + "grad_norm": 2.5410032272338867, + "learning_rate": 3.865170706130101e-06, + "loss": 0.473, + "step": 4080 + }, + { + "epoch": 1.9295508274231679, + "grad_norm": 2.780090808868408, + "learning_rate": 3.86464805840441e-06, + "loss": 0.5213, + "step": 4081 + }, + { + "epoch": 1.9300236406619384, + "grad_norm": 2.7318382263183594, + "learning_rate": 3.864125325708068e-06, + "loss": 0.5617, + "step": 4082 + }, + { + "epoch": 1.9304964539007092, + "grad_norm": 2.76509165763855, + "learning_rate": 3.863602508073623e-06, + "loss": 0.52, + "step": 4083 + }, + { + "epoch": 1.93096926713948, + "grad_norm": 2.8041110038757324, + "learning_rate": 
3.863079605533631e-06, + "loss": 0.5343, + "step": 4084 + }, + { + "epoch": 1.9314420803782506, + "grad_norm": 2.4462404251098633, + "learning_rate": 3.862556618120647e-06, + "loss": 0.4657, + "step": 4085 + }, + { + "epoch": 1.9319148936170212, + "grad_norm": 2.460864305496216, + "learning_rate": 3.862033545867238e-06, + "loss": 0.517, + "step": 4086 + }, + { + "epoch": 1.932387706855792, + "grad_norm": 2.6480276584625244, + "learning_rate": 3.8615103888059715e-06, + "loss": 0.4702, + "step": 4087 + }, + { + "epoch": 1.9328605200945628, + "grad_norm": 2.7175381183624268, + "learning_rate": 3.860987146969424e-06, + "loss": 0.5073, + "step": 4088 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 2.4963486194610596, + "learning_rate": 3.860463820390175e-06, + "loss": 0.4491, + "step": 4089 + }, + { + "epoch": 1.933806146572104, + "grad_norm": 2.548135757446289, + "learning_rate": 3.8599404091008075e-06, + "loss": 0.5134, + "step": 4090 + }, + { + "epoch": 1.9342789598108747, + "grad_norm": 2.8693668842315674, + "learning_rate": 3.859416913133916e-06, + "loss": 0.5467, + "step": 4091 + }, + { + "epoch": 1.9347517730496455, + "grad_norm": 2.711273670196533, + "learning_rate": 3.858893332522092e-06, + "loss": 0.6287, + "step": 4092 + }, + { + "epoch": 1.935224586288416, + "grad_norm": 2.8604533672332764, + "learning_rate": 3.858369667297941e-06, + "loss": 0.5661, + "step": 4093 + }, + { + "epoch": 1.9356973995271867, + "grad_norm": 2.936988353729248, + "learning_rate": 3.857845917494066e-06, + "loss": 0.5311, + "step": 4094 + }, + { + "epoch": 1.9361702127659575, + "grad_norm": 2.414093494415283, + "learning_rate": 3.857322083143079e-06, + "loss": 0.505, + "step": 4095 + }, + { + "epoch": 1.9366430260047283, + "grad_norm": 2.5528934001922607, + "learning_rate": 3.856798164277599e-06, + "loss": 0.4759, + "step": 4096 + }, + { + "epoch": 1.9371158392434988, + "grad_norm": 2.592893600463867, + "learning_rate": 3.8562741609302456e-06, + "loss": 0.4932, + "step": 4097 + }, + { + "epoch": 1.9375886524822694, + "grad_norm": 2.9619107246398926, + "learning_rate": 3.855750073133648e-06, + "loss": 0.5563, + "step": 4098 + }, + { + "epoch": 1.9380614657210402, + "grad_norm": 2.864889621734619, + "learning_rate": 3.855225900920438e-06, + "loss": 0.5069, + "step": 4099 + }, + { + "epoch": 1.938534278959811, + "grad_norm": 2.3951032161712646, + "learning_rate": 3.854701644323253e-06, + "loss": 0.4883, + "step": 4100 + }, + { + "epoch": 1.9390070921985816, + "grad_norm": 2.6339633464813232, + "learning_rate": 3.854177303374737e-06, + "loss": 0.5207, + "step": 4101 + }, + { + "epoch": 1.9394799054373522, + "grad_norm": 2.6435508728027344, + "learning_rate": 3.853652878107539e-06, + "loss": 0.4679, + "step": 4102 + }, + { + "epoch": 1.939952718676123, + "grad_norm": 2.4635629653930664, + "learning_rate": 3.853128368554311e-06, + "loss": 0.5639, + "step": 4103 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 2.664635419845581, + "learning_rate": 3.852603774747714e-06, + "loss": 0.5697, + "step": 4104 + }, + { + "epoch": 1.9408983451536643, + "grad_norm": 2.7020363807678223, + "learning_rate": 3.8520790967204095e-06, + "loss": 0.5462, + "step": 4105 + }, + { + "epoch": 1.941371158392435, + "grad_norm": 3.529282331466675, + "learning_rate": 3.851554334505069e-06, + "loss": 0.54, + "step": 4106 + }, + { + "epoch": 1.9418439716312057, + "grad_norm": 2.7125768661499023, + "learning_rate": 3.851029488134367e-06, + "loss": 0.5355, + "step": 4107 + }, + { + "epoch": 1.9423167848699765, + "grad_norm": 
2.5226643085479736, + "learning_rate": 3.850504557640981e-06, + "loss": 0.5106, + "step": 4108 + }, + { + "epoch": 1.942789598108747, + "grad_norm": 2.834352731704712, + "learning_rate": 3.8499795430575995e-06, + "loss": 0.6069, + "step": 4109 + }, + { + "epoch": 1.9432624113475176, + "grad_norm": 2.8484177589416504, + "learning_rate": 3.849454444416911e-06, + "loss": 0.5542, + "step": 4110 + }, + { + "epoch": 1.9437352245862884, + "grad_norm": 2.402539014816284, + "learning_rate": 3.848929261751612e-06, + "loss": 0.47, + "step": 4111 + }, + { + "epoch": 1.9442080378250592, + "grad_norm": 2.7010042667388916, + "learning_rate": 3.848403995094402e-06, + "loss": 0.5263, + "step": 4112 + }, + { + "epoch": 1.9446808510638298, + "grad_norm": 2.441689968109131, + "learning_rate": 3.847878644477988e-06, + "loss": 0.5607, + "step": 4113 + }, + { + "epoch": 1.9451536643026004, + "grad_norm": 2.5994722843170166, + "learning_rate": 3.847353209935081e-06, + "loss": 0.5103, + "step": 4114 + }, + { + "epoch": 1.9456264775413712, + "grad_norm": 2.452242136001587, + "learning_rate": 3.8468276914983975e-06, + "loss": 0.4409, + "step": 4115 + }, + { + "epoch": 1.946099290780142, + "grad_norm": 2.421023368835449, + "learning_rate": 3.84630208920066e-06, + "loss": 0.4429, + "step": 4116 + }, + { + "epoch": 1.9465721040189126, + "grad_norm": 2.696399688720703, + "learning_rate": 3.8457764030745945e-06, + "loss": 0.5352, + "step": 4117 + }, + { + "epoch": 1.9470449172576831, + "grad_norm": 2.3963489532470703, + "learning_rate": 3.845250633152933e-06, + "loss": 0.4505, + "step": 4118 + }, + { + "epoch": 1.947517730496454, + "grad_norm": 2.610649585723877, + "learning_rate": 3.8447247794684135e-06, + "loss": 0.501, + "step": 4119 + }, + { + "epoch": 1.9479905437352247, + "grad_norm": 2.740412712097168, + "learning_rate": 3.8441988420537775e-06, + "loss": 0.5362, + "step": 4120 + }, + { + "epoch": 1.9484633569739953, + "grad_norm": 2.2614004611968994, + "learning_rate": 3.8436728209417755e-06, + "loss": 0.4199, + "step": 4121 + }, + { + "epoch": 1.9489361702127659, + "grad_norm": 3.0683481693267822, + "learning_rate": 3.843146716165158e-06, + "loss": 0.5248, + "step": 4122 + }, + { + "epoch": 1.9494089834515367, + "grad_norm": 3.005174398422241, + "learning_rate": 3.842620527756684e-06, + "loss": 0.5246, + "step": 4123 + }, + { + "epoch": 1.9498817966903075, + "grad_norm": 2.672896385192871, + "learning_rate": 3.842094255749117e-06, + "loss": 0.5586, + "step": 4124 + }, + { + "epoch": 1.950354609929078, + "grad_norm": 2.5481197834014893, + "learning_rate": 3.8415679001752255e-06, + "loss": 0.5061, + "step": 4125 + }, + { + "epoch": 1.9508274231678486, + "grad_norm": 2.515789270401001, + "learning_rate": 3.8410414610677835e-06, + "loss": 0.4645, + "step": 4126 + }, + { + "epoch": 1.9513002364066194, + "grad_norm": 2.7236077785491943, + "learning_rate": 3.84051493845957e-06, + "loss": 0.5623, + "step": 4127 + }, + { + "epoch": 1.9517730496453902, + "grad_norm": 2.6252009868621826, + "learning_rate": 3.839988332383369e-06, + "loss": 0.5078, + "step": 4128 + }, + { + "epoch": 1.9522458628841608, + "grad_norm": 2.719196081161499, + "learning_rate": 3.83946164287197e-06, + "loss": 0.5481, + "step": 4129 + }, + { + "epoch": 1.9527186761229314, + "grad_norm": 2.484163284301758, + "learning_rate": 3.838934869958169e-06, + "loss": 0.5332, + "step": 4130 + }, + { + "epoch": 1.9531914893617022, + "grad_norm": 2.615382671356201, + "learning_rate": 3.838408013674764e-06, + "loss": 0.4742, + "step": 4131 + }, + { + "epoch": 
1.953664302600473, + "grad_norm": 2.735321044921875, + "learning_rate": 3.83788107405456e-06, + "loss": 0.421, + "step": 4132 + }, + { + "epoch": 1.9541371158392435, + "grad_norm": 2.892652750015259, + "learning_rate": 3.837354051130369e-06, + "loss": 0.5326, + "step": 4133 + }, + { + "epoch": 1.9546099290780141, + "grad_norm": 2.6800546646118164, + "learning_rate": 3.8368269449350055e-06, + "loss": 0.5041, + "step": 4134 + }, + { + "epoch": 1.955082742316785, + "grad_norm": 2.362470865249634, + "learning_rate": 3.836299755501289e-06, + "loss": 0.4697, + "step": 4135 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 2.3855135440826416, + "learning_rate": 3.835772482862047e-06, + "loss": 0.5148, + "step": 4136 + }, + { + "epoch": 1.9560283687943263, + "grad_norm": 2.3338418006896973, + "learning_rate": 3.83524512705011e-06, + "loss": 0.4643, + "step": 4137 + }, + { + "epoch": 1.9565011820330969, + "grad_norm": 2.261355400085449, + "learning_rate": 3.834717688098313e-06, + "loss": 0.5573, + "step": 4138 + }, + { + "epoch": 1.9569739952718677, + "grad_norm": 2.8166391849517822, + "learning_rate": 3.834190166039498e-06, + "loss": 0.4868, + "step": 4139 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 2.4155869483947754, + "learning_rate": 3.833662560906512e-06, + "loss": 0.4923, + "step": 4140 + }, + { + "epoch": 1.957919621749409, + "grad_norm": 2.3977696895599365, + "learning_rate": 3.833134872732206e-06, + "loss": 0.5106, + "step": 4141 + }, + { + "epoch": 1.9583924349881796, + "grad_norm": 2.9541378021240234, + "learning_rate": 3.832607101549438e-06, + "loss": 0.4683, + "step": 4142 + }, + { + "epoch": 1.9588652482269504, + "grad_norm": 2.5862700939178467, + "learning_rate": 3.832079247391068e-06, + "loss": 0.4453, + "step": 4143 + }, + { + "epoch": 1.9593380614657212, + "grad_norm": 2.7459371089935303, + "learning_rate": 3.8315513102899644e-06, + "loss": 0.5511, + "step": 4144 + }, + { + "epoch": 1.9598108747044918, + "grad_norm": 2.904869556427002, + "learning_rate": 3.831023290279e-06, + "loss": 0.5348, + "step": 4145 + }, + { + "epoch": 1.9602836879432624, + "grad_norm": 3.092846632003784, + "learning_rate": 3.830495187391051e-06, + "loss": 0.5664, + "step": 4146 + }, + { + "epoch": 1.9607565011820332, + "grad_norm": 3.2838528156280518, + "learning_rate": 3.829967001659001e-06, + "loss": 0.5115, + "step": 4147 + }, + { + "epoch": 1.961229314420804, + "grad_norm": 2.7799549102783203, + "learning_rate": 3.829438733115738e-06, + "loss": 0.5145, + "step": 4148 + }, + { + "epoch": 1.9617021276595743, + "grad_norm": 2.436084270477295, + "learning_rate": 3.828910381794154e-06, + "loss": 0.4718, + "step": 4149 + }, + { + "epoch": 1.962174940898345, + "grad_norm": 2.6662371158599854, + "learning_rate": 3.828381947727148e-06, + "loss": 0.6129, + "step": 4150 + }, + { + "epoch": 1.962647754137116, + "grad_norm": 2.937000036239624, + "learning_rate": 3.827853430947622e-06, + "loss": 0.522, + "step": 4151 + }, + { + "epoch": 1.9631205673758865, + "grad_norm": 2.5737369060516357, + "learning_rate": 3.827324831488486e-06, + "loss": 0.4916, + "step": 4152 + }, + { + "epoch": 1.963593380614657, + "grad_norm": 2.70232892036438, + "learning_rate": 3.826796149382653e-06, + "loss": 0.4726, + "step": 4153 + }, + { + "epoch": 1.9640661938534278, + "grad_norm": 2.6899707317352295, + "learning_rate": 3.826267384663042e-06, + "loss": 0.529, + "step": 4154 + }, + { + "epoch": 1.9645390070921986, + "grad_norm": 2.6142728328704834, + "learning_rate": 3.825738537362575e-06, + "loss": 0.4999, + "step": 
4155 + }, + { + "epoch": 1.9650118203309692, + "grad_norm": 2.43949818611145, + "learning_rate": 3.825209607514183e-06, + "loss": 0.5035, + "step": 4156 + }, + { + "epoch": 1.9654846335697398, + "grad_norm": 2.3735458850860596, + "learning_rate": 3.824680595150801e-06, + "loss": 0.4779, + "step": 4157 + }, + { + "epoch": 1.9659574468085106, + "grad_norm": 2.444307565689087, + "learning_rate": 3.824151500305365e-06, + "loss": 0.4825, + "step": 4158 + }, + { + "epoch": 1.9664302600472814, + "grad_norm": 2.8219668865203857, + "learning_rate": 3.8236223230108224e-06, + "loss": 0.5354, + "step": 4159 + }, + { + "epoch": 1.966903073286052, + "grad_norm": 2.720721483230591, + "learning_rate": 3.823093063300121e-06, + "loss": 0.5064, + "step": 4160 + }, + { + "epoch": 1.9673758865248225, + "grad_norm": 2.324190616607666, + "learning_rate": 3.822563721206217e-06, + "loss": 0.5348, + "step": 4161 + }, + { + "epoch": 1.9678486997635933, + "grad_norm": 2.702155351638794, + "learning_rate": 3.8220342967620695e-06, + "loss": 0.5388, + "step": 4162 + }, + { + "epoch": 1.9683215130023641, + "grad_norm": 2.4956369400024414, + "learning_rate": 3.821504790000642e-06, + "loss": 0.5071, + "step": 4163 + }, + { + "epoch": 1.9687943262411347, + "grad_norm": 2.568039655685425, + "learning_rate": 3.820975200954906e-06, + "loss": 0.5133, + "step": 4164 + }, + { + "epoch": 1.9692671394799053, + "grad_norm": 2.810868978500366, + "learning_rate": 3.820445529657837e-06, + "loss": 0.4856, + "step": 4165 + }, + { + "epoch": 1.969739952718676, + "grad_norm": 2.66365647315979, + "learning_rate": 3.819915776142415e-06, + "loss": 0.5235, + "step": 4166 + }, + { + "epoch": 1.9702127659574469, + "grad_norm": 2.2982139587402344, + "learning_rate": 3.8193859404416265e-06, + "loss": 0.4361, + "step": 4167 + }, + { + "epoch": 1.9706855791962175, + "grad_norm": 2.585672378540039, + "learning_rate": 3.818856022588458e-06, + "loss": 0.4842, + "step": 4168 + }, + { + "epoch": 1.971158392434988, + "grad_norm": 2.57857346534729, + "learning_rate": 3.81832602261591e-06, + "loss": 0.5249, + "step": 4169 + }, + { + "epoch": 1.9716312056737588, + "grad_norm": 2.6947224140167236, + "learning_rate": 3.817795940556981e-06, + "loss": 0.5234, + "step": 4170 + }, + { + "epoch": 1.9721040189125296, + "grad_norm": 2.7453415393829346, + "learning_rate": 3.8172657764446764e-06, + "loss": 0.5219, + "step": 4171 + }, + { + "epoch": 1.9725768321513002, + "grad_norm": 8.424073219299316, + "learning_rate": 3.816735530312009e-06, + "loss": 0.5162, + "step": 4172 + }, + { + "epoch": 1.9730496453900708, + "grad_norm": 2.8229739665985107, + "learning_rate": 3.816205202191993e-06, + "loss": 0.4621, + "step": 4173 + }, + { + "epoch": 1.9735224586288416, + "grad_norm": 2.5969009399414062, + "learning_rate": 3.815674792117651e-06, + "loss": 0.5044, + "step": 4174 + }, + { + "epoch": 1.9739952718676124, + "grad_norm": 2.646024227142334, + "learning_rate": 3.815144300122009e-06, + "loss": 0.5094, + "step": 4175 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 2.4950616359710693, + "learning_rate": 3.814613726238097e-06, + "loss": 0.4827, + "step": 4176 + }, + { + "epoch": 1.9749408983451535, + "grad_norm": 2.5636119842529297, + "learning_rate": 3.8140830704989535e-06, + "loss": 0.5241, + "step": 4177 + }, + { + "epoch": 1.9754137115839243, + "grad_norm": 2.7936553955078125, + "learning_rate": 3.813552332937619e-06, + "loss": 0.5344, + "step": 4178 + }, + { + "epoch": 1.9758865248226951, + "grad_norm": 2.8085341453552246, + "learning_rate": 
3.8130215135871405e-06, + "loss": 0.5647, + "step": 4179 + }, + { + "epoch": 1.9763593380614657, + "grad_norm": 2.4776322841644287, + "learning_rate": 3.8124906124805694e-06, + "loss": 0.542, + "step": 4180 + }, + { + "epoch": 1.9768321513002363, + "grad_norm": 2.3227856159210205, + "learning_rate": 3.8119596296509635e-06, + "loss": 0.4618, + "step": 4181 + }, + { + "epoch": 1.977304964539007, + "grad_norm": 2.5157814025878906, + "learning_rate": 3.8114285651313848e-06, + "loss": 0.538, + "step": 4182 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 2.5630218982696533, + "learning_rate": 3.8108974189548987e-06, + "loss": 0.5254, + "step": 4183 + }, + { + "epoch": 1.9782505910165484, + "grad_norm": 2.703237533569336, + "learning_rate": 3.8103661911545787e-06, + "loss": 0.4859, + "step": 4184 + }, + { + "epoch": 1.978723404255319, + "grad_norm": 2.8808000087738037, + "learning_rate": 3.809834881763502e-06, + "loss": 0.5585, + "step": 4185 + }, + { + "epoch": 1.9791962174940898, + "grad_norm": 2.9047577381134033, + "learning_rate": 3.8093034908147507e-06, + "loss": 0.5022, + "step": 4186 + }, + { + "epoch": 1.9796690307328606, + "grad_norm": 2.7417640686035156, + "learning_rate": 3.8087720183414125e-06, + "loss": 0.5275, + "step": 4187 + }, + { + "epoch": 1.9801418439716312, + "grad_norm": 2.952012062072754, + "learning_rate": 3.8082404643765786e-06, + "loss": 0.543, + "step": 4188 + }, + { + "epoch": 1.9806146572104018, + "grad_norm": 2.538376569747925, + "learning_rate": 3.807708828953348e-06, + "loss": 0.4969, + "step": 4189 + }, + { + "epoch": 1.9810874704491725, + "grad_norm": 2.3476181030273438, + "learning_rate": 3.807177112104823e-06, + "loss": 0.4979, + "step": 4190 + }, + { + "epoch": 1.9815602836879433, + "grad_norm": 2.6480464935302734, + "learning_rate": 3.80664531386411e-06, + "loss": 0.4894, + "step": 4191 + }, + { + "epoch": 1.982033096926714, + "grad_norm": 2.792916774749756, + "learning_rate": 3.8061134342643235e-06, + "loss": 0.5468, + "step": 4192 + }, + { + "epoch": 1.9825059101654845, + "grad_norm": 2.368736743927002, + "learning_rate": 3.805581473338581e-06, + "loss": 0.4672, + "step": 4193 + }, + { + "epoch": 1.9829787234042553, + "grad_norm": 2.379084348678589, + "learning_rate": 3.8050494311200037e-06, + "loss": 0.4577, + "step": 4194 + }, + { + "epoch": 1.983451536643026, + "grad_norm": 2.722471237182617, + "learning_rate": 3.804517307641722e-06, + "loss": 0.4988, + "step": 4195 + }, + { + "epoch": 1.9839243498817967, + "grad_norm": 2.356649875640869, + "learning_rate": 3.8039851029368674e-06, + "loss": 0.4933, + "step": 4196 + }, + { + "epoch": 1.9843971631205672, + "grad_norm": 2.9182281494140625, + "learning_rate": 3.8034528170385776e-06, + "loss": 0.4873, + "step": 4197 + }, + { + "epoch": 1.984869976359338, + "grad_norm": 2.6232199668884277, + "learning_rate": 3.8029204499799976e-06, + "loss": 0.4425, + "step": 4198 + }, + { + "epoch": 1.9853427895981088, + "grad_norm": 2.667541980743408, + "learning_rate": 3.802388001794274e-06, + "loss": 0.5022, + "step": 4199 + }, + { + "epoch": 1.9858156028368794, + "grad_norm": 3.168470621109009, + "learning_rate": 3.8018554725145596e-06, + "loss": 0.5505, + "step": 4200 + }, + { + "epoch": 1.98628841607565, + "grad_norm": 2.716625452041626, + "learning_rate": 3.8013228621740132e-06, + "loss": 0.4937, + "step": 4201 + }, + { + "epoch": 1.9867612293144208, + "grad_norm": 2.3014442920684814, + "learning_rate": 3.800790170805799e-06, + "loss": 0.4734, + "step": 4202 + }, + { + "epoch": 1.9872340425531916, + "grad_norm": 
2.9426841735839844, + "learning_rate": 3.8002573984430847e-06, + "loss": 0.4983, + "step": 4203 + }, + { + "epoch": 1.9877068557919622, + "grad_norm": 2.5598278045654297, + "learning_rate": 3.7997245451190435e-06, + "loss": 0.4834, + "step": 4204 + }, + { + "epoch": 1.9881796690307327, + "grad_norm": 2.86458420753479, + "learning_rate": 3.7991916108668538e-06, + "loss": 0.5613, + "step": 4205 + }, + { + "epoch": 1.9886524822695035, + "grad_norm": 2.842914342880249, + "learning_rate": 3.7986585957196997e-06, + "loss": 0.4951, + "step": 4206 + }, + { + "epoch": 1.9891252955082743, + "grad_norm": 3.1828150749206543, + "learning_rate": 3.7981254997107686e-06, + "loss": 0.5913, + "step": 4207 + }, + { + "epoch": 1.989598108747045, + "grad_norm": 2.5765931606292725, + "learning_rate": 3.7975923228732547e-06, + "loss": 0.5544, + "step": 4208 + }, + { + "epoch": 1.9900709219858155, + "grad_norm": 2.492234945297241, + "learning_rate": 3.797059065240357e-06, + "loss": 0.5046, + "step": 4209 + }, + { + "epoch": 1.9905437352245863, + "grad_norm": 2.870346784591675, + "learning_rate": 3.7965257268452795e-06, + "loss": 0.5354, + "step": 4210 + }, + { + "epoch": 1.991016548463357, + "grad_norm": 2.4989993572235107, + "learning_rate": 3.795992307721229e-06, + "loss": 0.4677, + "step": 4211 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 2.931114673614502, + "learning_rate": 3.7954588079014206e-06, + "loss": 0.5504, + "step": 4212 + }, + { + "epoch": 1.9919621749408982, + "grad_norm": 2.5247652530670166, + "learning_rate": 3.794925227419073e-06, + "loss": 0.4736, + "step": 4213 + }, + { + "epoch": 1.992434988179669, + "grad_norm": 2.6238436698913574, + "learning_rate": 3.794391566307409e-06, + "loss": 0.4591, + "step": 4214 + }, + { + "epoch": 1.9929078014184398, + "grad_norm": 2.654886245727539, + "learning_rate": 3.7938578245996584e-06, + "loss": 0.5149, + "step": 4215 + }, + { + "epoch": 1.9933806146572104, + "grad_norm": 2.509164810180664, + "learning_rate": 3.793324002329054e-06, + "loss": 0.4951, + "step": 4216 + }, + { + "epoch": 1.993853427895981, + "grad_norm": 2.909632921218872, + "learning_rate": 3.7927900995288345e-06, + "loss": 0.5131, + "step": 4217 + }, + { + "epoch": 1.9943262411347518, + "grad_norm": 2.4354615211486816, + "learning_rate": 3.7922561162322456e-06, + "loss": 0.4716, + "step": 4218 + }, + { + "epoch": 1.9947990543735226, + "grad_norm": 2.6514649391174316, + "learning_rate": 3.791722052472534e-06, + "loss": 0.5714, + "step": 4219 + }, + { + "epoch": 1.9952718676122931, + "grad_norm": 2.77089262008667, + "learning_rate": 3.791187908282954e-06, + "loss": 0.5736, + "step": 4220 + }, + { + "epoch": 1.9957446808510637, + "grad_norm": 2.7651021480560303, + "learning_rate": 3.7906536836967657e-06, + "loss": 0.4948, + "step": 4221 + }, + { + "epoch": 1.9962174940898345, + "grad_norm": 2.7536795139312744, + "learning_rate": 3.7901193787472306e-06, + "loss": 0.512, + "step": 4222 + }, + { + "epoch": 1.9966903073286053, + "grad_norm": 2.684893846511841, + "learning_rate": 3.78958499346762e-06, + "loss": 0.5118, + "step": 4223 + }, + { + "epoch": 1.9971631205673759, + "grad_norm": 2.7616753578186035, + "learning_rate": 3.7890505278912054e-06, + "loss": 0.4516, + "step": 4224 + }, + { + "epoch": 1.9976359338061465, + "grad_norm": 2.4731967449188232, + "learning_rate": 3.7885159820512666e-06, + "loss": 0.4736, + "step": 4225 + }, + { + "epoch": 1.9981087470449173, + "grad_norm": 2.366631031036377, + "learning_rate": 3.7879813559810884e-06, + "loss": 0.4999, + "step": 4226 + }, + { + 
"epoch": 1.998581560283688, + "grad_norm": 2.994624137878418, + "learning_rate": 3.7874466497139582e-06, + "loss": 0.5273, + "step": 4227 + }, + { + "epoch": 1.9990543735224586, + "grad_norm": 2.4499242305755615, + "learning_rate": 3.7869118632831712e-06, + "loss": 0.5761, + "step": 4228 + }, + { + "epoch": 1.9995271867612292, + "grad_norm": 2.3370113372802734, + "learning_rate": 3.7863769967220243e-06, + "loss": 0.4673, + "step": 4229 + }, + { + "epoch": 2.0, + "grad_norm": 3.1131203174591064, + "learning_rate": 3.7858420500638236e-06, + "loss": 0.5118, + "step": 4230 + }, + { + "epoch": 2.000472813238771, + "grad_norm": 2.2747561931610107, + "learning_rate": 3.785307023341876e-06, + "loss": 0.4166, + "step": 4231 + }, + { + "epoch": 2.000945626477541, + "grad_norm": 2.4347424507141113, + "learning_rate": 3.7847719165894963e-06, + "loss": 0.4161, + "step": 4232 + }, + { + "epoch": 2.001418439716312, + "grad_norm": 2.398805618286133, + "learning_rate": 3.784236729840003e-06, + "loss": 0.4652, + "step": 4233 + }, + { + "epoch": 2.0018912529550827, + "grad_norm": 2.1904916763305664, + "learning_rate": 3.783701463126719e-06, + "loss": 0.4554, + "step": 4234 + }, + { + "epoch": 2.0023640661938535, + "grad_norm": 2.237330913543701, + "learning_rate": 3.7831661164829735e-06, + "loss": 0.4471, + "step": 4235 + }, + { + "epoch": 2.002836879432624, + "grad_norm": 2.3656628131866455, + "learning_rate": 3.7826306899421016e-06, + "loss": 0.4052, + "step": 4236 + }, + { + "epoch": 2.0033096926713947, + "grad_norm": 2.615489959716797, + "learning_rate": 3.7820951835374405e-06, + "loss": 0.4847, + "step": 4237 + }, + { + "epoch": 2.0037825059101655, + "grad_norm": 2.453036308288574, + "learning_rate": 3.7815595973023347e-06, + "loss": 0.4672, + "step": 4238 + }, + { + "epoch": 2.0042553191489363, + "grad_norm": 2.537468671798706, + "learning_rate": 3.7810239312701306e-06, + "loss": 0.467, + "step": 4239 + }, + { + "epoch": 2.0047281323877066, + "grad_norm": 2.3321666717529297, + "learning_rate": 3.780488185474184e-06, + "loss": 0.3557, + "step": 4240 + }, + { + "epoch": 2.0052009456264774, + "grad_norm": 2.9051828384399414, + "learning_rate": 3.779952359947854e-06, + "loss": 0.5474, + "step": 4241 + }, + { + "epoch": 2.0056737588652482, + "grad_norm": 2.7458817958831787, + "learning_rate": 3.7794164547245015e-06, + "loss": 0.4659, + "step": 4242 + }, + { + "epoch": 2.006146572104019, + "grad_norm": 2.627046585083008, + "learning_rate": 3.778880469837497e-06, + "loss": 0.4179, + "step": 4243 + }, + { + "epoch": 2.0066193853427894, + "grad_norm": 2.4186174869537354, + "learning_rate": 3.7783444053202135e-06, + "loss": 0.3976, + "step": 4244 + }, + { + "epoch": 2.00709219858156, + "grad_norm": 3.109376907348633, + "learning_rate": 3.7778082612060296e-06, + "loss": 0.4095, + "step": 4245 + }, + { + "epoch": 2.007565011820331, + "grad_norm": 2.583376169204712, + "learning_rate": 3.7772720375283282e-06, + "loss": 0.4325, + "step": 4246 + }, + { + "epoch": 2.0080378250591018, + "grad_norm": 2.6199896335601807, + "learning_rate": 3.776735734320497e-06, + "loss": 0.4207, + "step": 4247 + }, + { + "epoch": 2.008510638297872, + "grad_norm": 2.545353651046753, + "learning_rate": 3.77619935161593e-06, + "loss": 0.4483, + "step": 4248 + }, + { + "epoch": 2.008983451536643, + "grad_norm": 2.770266056060791, + "learning_rate": 3.7756628894480263e-06, + "loss": 0.457, + "step": 4249 + }, + { + "epoch": 2.0094562647754137, + "grad_norm": 2.903254985809326, + "learning_rate": 3.7751263478501878e-06, + "loss": 0.4171, + 
"step": 4250 + }, + { + "epoch": 2.0099290780141845, + "grad_norm": 2.5576963424682617, + "learning_rate": 3.774589726855822e-06, + "loss": 0.3631, + "step": 4251 + }, + { + "epoch": 2.010401891252955, + "grad_norm": 3.7584285736083984, + "learning_rate": 3.7740530264983434e-06, + "loss": 0.4827, + "step": 4252 + }, + { + "epoch": 2.0108747044917257, + "grad_norm": 3.3116581439971924, + "learning_rate": 3.77351624681117e-06, + "loss": 0.5071, + "step": 4253 + }, + { + "epoch": 2.0113475177304965, + "grad_norm": 3.1370885372161865, + "learning_rate": 3.772979387827723e-06, + "loss": 0.4963, + "step": 4254 + }, + { + "epoch": 2.0118203309692673, + "grad_norm": 2.4832639694213867, + "learning_rate": 3.772442449581432e-06, + "loss": 0.4442, + "step": 4255 + }, + { + "epoch": 2.0122931442080376, + "grad_norm": 2.7645785808563232, + "learning_rate": 3.7719054321057293e-06, + "loss": 0.4572, + "step": 4256 + }, + { + "epoch": 2.0127659574468084, + "grad_norm": 2.7962236404418945, + "learning_rate": 3.7713683354340515e-06, + "loss": 0.4906, + "step": 4257 + }, + { + "epoch": 2.013238770685579, + "grad_norm": 2.647991895675659, + "learning_rate": 3.7708311595998425e-06, + "loss": 0.4027, + "step": 4258 + }, + { + "epoch": 2.01371158392435, + "grad_norm": 2.3780267238616943, + "learning_rate": 3.7702939046365504e-06, + "loss": 0.4285, + "step": 4259 + }, + { + "epoch": 2.0141843971631204, + "grad_norm": 2.5185933113098145, + "learning_rate": 3.7697565705776266e-06, + "loss": 0.4834, + "step": 4260 + }, + { + "epoch": 2.014657210401891, + "grad_norm": 2.432507276535034, + "learning_rate": 3.7692191574565294e-06, + "loss": 0.3695, + "step": 4261 + }, + { + "epoch": 2.015130023640662, + "grad_norm": 2.8010706901550293, + "learning_rate": 3.76868166530672e-06, + "loss": 0.478, + "step": 4262 + }, + { + "epoch": 2.0156028368794328, + "grad_norm": 2.32817006111145, + "learning_rate": 3.768144094161666e-06, + "loss": 0.4154, + "step": 4263 + }, + { + "epoch": 2.016075650118203, + "grad_norm": 3.062812328338623, + "learning_rate": 3.7676064440548405e-06, + "loss": 0.5015, + "step": 4264 + }, + { + "epoch": 2.016548463356974, + "grad_norm": 2.6129536628723145, + "learning_rate": 3.7670687150197194e-06, + "loss": 0.3843, + "step": 4265 + }, + { + "epoch": 2.0170212765957447, + "grad_norm": 2.838259696960449, + "learning_rate": 3.766530907089786e-06, + "loss": 0.4937, + "step": 4266 + }, + { + "epoch": 2.0174940898345155, + "grad_norm": 2.601203680038452, + "learning_rate": 3.7659930202985263e-06, + "loss": 0.4644, + "step": 4267 + }, + { + "epoch": 2.017966903073286, + "grad_norm": 2.5964133739471436, + "learning_rate": 3.7654550546794322e-06, + "loss": 0.4365, + "step": 4268 + }, + { + "epoch": 2.0184397163120567, + "grad_norm": 3.0028915405273438, + "learning_rate": 3.764917010266001e-06, + "loss": 0.434, + "step": 4269 + }, + { + "epoch": 2.0189125295508275, + "grad_norm": 2.719252586364746, + "learning_rate": 3.764378887091734e-06, + "loss": 0.4401, + "step": 4270 + }, + { + "epoch": 2.0193853427895982, + "grad_norm": 2.400254011154175, + "learning_rate": 3.7638406851901377e-06, + "loss": 0.4904, + "step": 4271 + }, + { + "epoch": 2.0198581560283686, + "grad_norm": 2.8015363216400146, + "learning_rate": 3.763302404594724e-06, + "loss": 0.4569, + "step": 4272 + }, + { + "epoch": 2.0203309692671394, + "grad_norm": 2.718416452407837, + "learning_rate": 3.762764045339009e-06, + "loss": 0.5124, + "step": 4273 + }, + { + "epoch": 2.02080378250591, + "grad_norm": 2.484049081802368, + "learning_rate": 
3.762225607456514e-06, + "loss": 0.4255, + "step": 4274 + }, + { + "epoch": 2.021276595744681, + "grad_norm": 2.6377930641174316, + "learning_rate": 3.7616870909807645e-06, + "loss": 0.5044, + "step": 4275 + }, + { + "epoch": 2.0217494089834513, + "grad_norm": 2.8845038414001465, + "learning_rate": 3.7611484959452927e-06, + "loss": 0.4924, + "step": 4276 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 2.5939974784851074, + "learning_rate": 3.7606098223836342e-06, + "loss": 0.4873, + "step": 4277 + }, + { + "epoch": 2.022695035460993, + "grad_norm": 2.499826431274414, + "learning_rate": 3.76007107032933e-06, + "loss": 0.4515, + "step": 4278 + }, + { + "epoch": 2.0231678486997637, + "grad_norm": 3.0318663120269775, + "learning_rate": 3.759532239815924e-06, + "loss": 0.4901, + "step": 4279 + }, + { + "epoch": 2.023640661938534, + "grad_norm": 2.857977867126465, + "learning_rate": 3.758993330876969e-06, + "loss": 0.4659, + "step": 4280 + }, + { + "epoch": 2.024113475177305, + "grad_norm": 2.47918438911438, + "learning_rate": 3.7584543435460196e-06, + "loss": 0.4323, + "step": 4281 + }, + { + "epoch": 2.0245862884160757, + "grad_norm": 2.6033785343170166, + "learning_rate": 3.757915277856637e-06, + "loss": 0.4437, + "step": 4282 + }, + { + "epoch": 2.0250591016548465, + "grad_norm": 2.799781322479248, + "learning_rate": 3.757376133842386e-06, + "loss": 0.4523, + "step": 4283 + }, + { + "epoch": 2.025531914893617, + "grad_norm": 2.6092529296875, + "learning_rate": 3.756836911536836e-06, + "loss": 0.3898, + "step": 4284 + }, + { + "epoch": 2.0260047281323876, + "grad_norm": 2.66229248046875, + "learning_rate": 3.7562976109735627e-06, + "loss": 0.4731, + "step": 4285 + }, + { + "epoch": 2.0264775413711584, + "grad_norm": 2.90142822265625, + "learning_rate": 3.7557582321861463e-06, + "loss": 0.4285, + "step": 4286 + }, + { + "epoch": 2.0269503546099292, + "grad_norm": 2.5138802528381348, + "learning_rate": 3.7552187752081707e-06, + "loss": 0.4467, + "step": 4287 + }, + { + "epoch": 2.0274231678486996, + "grad_norm": 3.0656235218048096, + "learning_rate": 3.754679240073226e-06, + "loss": 0.4718, + "step": 4288 + }, + { + "epoch": 2.0278959810874704, + "grad_norm": 2.9633383750915527, + "learning_rate": 3.754139626814907e-06, + "loss": 0.4741, + "step": 4289 + }, + { + "epoch": 2.028368794326241, + "grad_norm": 2.5925145149230957, + "learning_rate": 3.753599935466812e-06, + "loss": 0.4281, + "step": 4290 + }, + { + "epoch": 2.028841607565012, + "grad_norm": 2.837740659713745, + "learning_rate": 3.7530601660625456e-06, + "loss": 0.4757, + "step": 4291 + }, + { + "epoch": 2.0293144208037823, + "grad_norm": 2.3995790481567383, + "learning_rate": 3.752520318635718e-06, + "loss": 0.4148, + "step": 4292 + }, + { + "epoch": 2.029787234042553, + "grad_norm": 2.572601795196533, + "learning_rate": 3.7519803932199424e-06, + "loss": 0.4051, + "step": 4293 + }, + { + "epoch": 2.030260047281324, + "grad_norm": 2.6780295372009277, + "learning_rate": 3.751440389848837e-06, + "loss": 0.4626, + "step": 4294 + }, + { + "epoch": 2.0307328605200947, + "grad_norm": 2.8666839599609375, + "learning_rate": 3.7509003085560257e-06, + "loss": 0.4255, + "step": 4295 + }, + { + "epoch": 2.031205673758865, + "grad_norm": 2.4398207664489746, + "learning_rate": 3.750360149375138e-06, + "loss": 0.4235, + "step": 4296 + }, + { + "epoch": 2.031678486997636, + "grad_norm": 2.436840534210205, + "learning_rate": 3.7498199123398062e-06, + "loss": 0.3907, + "step": 4297 + }, + { + "epoch": 2.0321513002364067, + "grad_norm": 
3.3945820331573486, + "learning_rate": 3.7492795974836683e-06, + "loss": 0.465, + "step": 4298 + }, + { + "epoch": 2.0326241134751775, + "grad_norm": 2.6693103313446045, + "learning_rate": 3.7487392048403678e-06, + "loss": 0.4948, + "step": 4299 + }, + { + "epoch": 2.033096926713948, + "grad_norm": 2.7642734050750732, + "learning_rate": 3.748198734443553e-06, + "loss": 0.4538, + "step": 4300 + }, + { + "epoch": 2.0335697399527186, + "grad_norm": 3.1436543464660645, + "learning_rate": 3.747658186326876e-06, + "loss": 0.5137, + "step": 4301 + }, + { + "epoch": 2.0340425531914894, + "grad_norm": 3.482678174972534, + "learning_rate": 3.7471175605239947e-06, + "loss": 0.4982, + "step": 4302 + }, + { + "epoch": 2.03451536643026, + "grad_norm": 2.712557077407837, + "learning_rate": 3.746576857068571e-06, + "loss": 0.4459, + "step": 4303 + }, + { + "epoch": 2.0349881796690306, + "grad_norm": 3.147440195083618, + "learning_rate": 3.7460360759942726e-06, + "loss": 0.5063, + "step": 4304 + }, + { + "epoch": 2.0354609929078014, + "grad_norm": 2.840672492980957, + "learning_rate": 3.7454952173347714e-06, + "loss": 0.5041, + "step": 4305 + }, + { + "epoch": 2.035933806146572, + "grad_norm": 2.584122657775879, + "learning_rate": 3.744954281123745e-06, + "loss": 0.4487, + "step": 4306 + }, + { + "epoch": 2.036406619385343, + "grad_norm": 2.9869542121887207, + "learning_rate": 3.7444132673948737e-06, + "loss": 0.479, + "step": 4307 + }, + { + "epoch": 2.0368794326241133, + "grad_norm": 2.478459358215332, + "learning_rate": 3.7438721761818446e-06, + "loss": 0.4636, + "step": 4308 + }, + { + "epoch": 2.037352245862884, + "grad_norm": 2.5524215698242188, + "learning_rate": 3.7433310075183504e-06, + "loss": 0.4601, + "step": 4309 + }, + { + "epoch": 2.037825059101655, + "grad_norm": 2.3709988594055176, + "learning_rate": 3.742789761438086e-06, + "loss": 0.4163, + "step": 4310 + }, + { + "epoch": 2.0382978723404257, + "grad_norm": 3.140355348587036, + "learning_rate": 3.742248437974752e-06, + "loss": 0.4433, + "step": 4311 + }, + { + "epoch": 2.038770685579196, + "grad_norm": 2.940948486328125, + "learning_rate": 3.741707037162055e-06, + "loss": 0.4299, + "step": 4312 + }, + { + "epoch": 2.039243498817967, + "grad_norm": 3.009157419204712, + "learning_rate": 3.7411655590337055e-06, + "loss": 0.463, + "step": 4313 + }, + { + "epoch": 2.0397163120567376, + "grad_norm": 2.672945737838745, + "learning_rate": 3.7406240036234185e-06, + "loss": 0.4696, + "step": 4314 + }, + { + "epoch": 2.0401891252955084, + "grad_norm": 2.745962142944336, + "learning_rate": 3.740082370964916e-06, + "loss": 0.4931, + "step": 4315 + }, + { + "epoch": 2.040661938534279, + "grad_norm": 2.3939316272735596, + "learning_rate": 3.7395406610919217e-06, + "loss": 0.4396, + "step": 4316 + }, + { + "epoch": 2.0411347517730496, + "grad_norm": 2.4364447593688965, + "learning_rate": 3.738998874038165e-06, + "loss": 0.4807, + "step": 4317 + }, + { + "epoch": 2.0416075650118204, + "grad_norm": 2.360489845275879, + "learning_rate": 3.738457009837381e-06, + "loss": 0.4426, + "step": 4318 + }, + { + "epoch": 2.042080378250591, + "grad_norm": 2.5494935512542725, + "learning_rate": 3.7379150685233108e-06, + "loss": 0.4189, + "step": 4319 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 2.635472059249878, + "learning_rate": 3.7373730501296963e-06, + "loss": 0.5014, + "step": 4320 + }, + { + "epoch": 2.0430260047281323, + "grad_norm": 2.4982943534851074, + "learning_rate": 3.7368309546902876e-06, + "loss": 0.4658, + "step": 4321 + }, + { + "epoch": 
2.043498817966903, + "grad_norm": 2.692742109298706, + "learning_rate": 3.736288782238839e-06, + "loss": 0.4454, + "step": 4322 + }, + { + "epoch": 2.043971631205674, + "grad_norm": 2.6774091720581055, + "learning_rate": 3.7357465328091086e-06, + "loss": 0.5002, + "step": 4323 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 2.695138692855835, + "learning_rate": 3.735204206434861e-06, + "loss": 0.448, + "step": 4324 + }, + { + "epoch": 2.044917257683215, + "grad_norm": 2.5383570194244385, + "learning_rate": 3.7346618031498635e-06, + "loss": 0.4352, + "step": 4325 + }, + { + "epoch": 2.045390070921986, + "grad_norm": 2.267277240753174, + "learning_rate": 3.7341193229878886e-06, + "loss": 0.4162, + "step": 4326 + }, + { + "epoch": 2.0458628841607567, + "grad_norm": 2.6037328243255615, + "learning_rate": 3.733576765982715e-06, + "loss": 0.4471, + "step": 4327 + }, + { + "epoch": 2.046335697399527, + "grad_norm": 3.261385440826416, + "learning_rate": 3.7330341321681253e-06, + "loss": 0.4618, + "step": 4328 + }, + { + "epoch": 2.046808510638298, + "grad_norm": 2.440650463104248, + "learning_rate": 3.7324914215779072e-06, + "loss": 0.4476, + "step": 4329 + }, + { + "epoch": 2.0472813238770686, + "grad_norm": 2.5940682888031006, + "learning_rate": 3.731948634245853e-06, + "loss": 0.4389, + "step": 4330 + }, + { + "epoch": 2.0477541371158394, + "grad_norm": 2.7428150177001953, + "learning_rate": 3.7314057702057582e-06, + "loss": 0.4477, + "step": 4331 + }, + { + "epoch": 2.0482269503546098, + "grad_norm": 2.3546223640441895, + "learning_rate": 3.730862829491427e-06, + "loss": 0.4047, + "step": 4332 + }, + { + "epoch": 2.0486997635933806, + "grad_norm": 2.552422523498535, + "learning_rate": 3.7303198121366637e-06, + "loss": 0.4438, + "step": 4333 + }, + { + "epoch": 2.0491725768321514, + "grad_norm": 2.99226713180542, + "learning_rate": 3.729776718175281e-06, + "loss": 0.491, + "step": 4334 + }, + { + "epoch": 2.049645390070922, + "grad_norm": 3.2003321647644043, + "learning_rate": 3.7292335476410935e-06, + "loss": 0.5458, + "step": 4335 + }, + { + "epoch": 2.0501182033096925, + "grad_norm": 2.739847183227539, + "learning_rate": 3.7286903005679237e-06, + "loss": 0.4499, + "step": 4336 + }, + { + "epoch": 2.0505910165484633, + "grad_norm": 2.5917470455169678, + "learning_rate": 3.7281469769895963e-06, + "loss": 0.4714, + "step": 4337 + }, + { + "epoch": 2.051063829787234, + "grad_norm": 2.8029327392578125, + "learning_rate": 3.7276035769399422e-06, + "loss": 0.42, + "step": 4338 + }, + { + "epoch": 2.051536643026005, + "grad_norm": 2.484879493713379, + "learning_rate": 3.727060100452796e-06, + "loss": 0.4163, + "step": 4339 + }, + { + "epoch": 2.0520094562647753, + "grad_norm": 2.7126030921936035, + "learning_rate": 3.7265165475619973e-06, + "loss": 0.4112, + "step": 4340 + }, + { + "epoch": 2.052482269503546, + "grad_norm": 2.618267774581909, + "learning_rate": 3.7259729183013927e-06, + "loss": 0.4281, + "step": 4341 + }, + { + "epoch": 2.052955082742317, + "grad_norm": 2.703270673751831, + "learning_rate": 3.7254292127048293e-06, + "loss": 0.4437, + "step": 4342 + }, + { + "epoch": 2.0534278959810877, + "grad_norm": 2.429150104522705, + "learning_rate": 3.7248854308061623e-06, + "loss": 0.3971, + "step": 4343 + }, + { + "epoch": 2.053900709219858, + "grad_norm": 2.54354190826416, + "learning_rate": 3.7243415726392508e-06, + "loss": 0.4485, + "step": 4344 + }, + { + "epoch": 2.054373522458629, + "grad_norm": 2.9515016078948975, + "learning_rate": 3.723797638237957e-06, + "loss": 0.4386, + 
"step": 4345 + }, + { + "epoch": 2.0548463356973996, + "grad_norm": 2.9129958152770996, + "learning_rate": 3.7232536276361514e-06, + "loss": 0.4595, + "step": 4346 + }, + { + "epoch": 2.0553191489361704, + "grad_norm": 2.5397512912750244, + "learning_rate": 3.722709540867706e-06, + "loss": 0.3681, + "step": 4347 + }, + { + "epoch": 2.0557919621749408, + "grad_norm": 2.79884672164917, + "learning_rate": 3.722165377966499e-06, + "loss": 0.4576, + "step": 4348 + }, + { + "epoch": 2.0562647754137116, + "grad_norm": 2.669936180114746, + "learning_rate": 3.7216211389664137e-06, + "loss": 0.3692, + "step": 4349 + }, + { + "epoch": 2.0567375886524824, + "grad_norm": 2.512326240539551, + "learning_rate": 3.7210768239013355e-06, + "loss": 0.4554, + "step": 4350 + }, + { + "epoch": 2.057210401891253, + "grad_norm": 2.913693904876709, + "learning_rate": 3.7205324328051583e-06, + "loss": 0.5282, + "step": 4351 + }, + { + "epoch": 2.0576832151300235, + "grad_norm": 3.040891170501709, + "learning_rate": 3.719987965711778e-06, + "loss": 0.4778, + "step": 4352 + }, + { + "epoch": 2.0581560283687943, + "grad_norm": 2.7504117488861084, + "learning_rate": 3.7194434226550966e-06, + "loss": 0.4217, + "step": 4353 + }, + { + "epoch": 2.058628841607565, + "grad_norm": 2.5522971153259277, + "learning_rate": 3.718898803669021e-06, + "loss": 0.437, + "step": 4354 + }, + { + "epoch": 2.059101654846336, + "grad_norm": 2.8531908988952637, + "learning_rate": 3.718354108787461e-06, + "loss": 0.4251, + "step": 4355 + }, + { + "epoch": 2.0595744680851062, + "grad_norm": 2.5812065601348877, + "learning_rate": 3.7178093380443337e-06, + "loss": 0.4374, + "step": 4356 + }, + { + "epoch": 2.060047281323877, + "grad_norm": 2.627871513366699, + "learning_rate": 3.7172644914735583e-06, + "loss": 0.436, + "step": 4357 + }, + { + "epoch": 2.060520094562648, + "grad_norm": 2.7146239280700684, + "learning_rate": 3.7167195691090607e-06, + "loss": 0.4204, + "step": 4358 + }, + { + "epoch": 2.0609929078014186, + "grad_norm": 2.486483573913574, + "learning_rate": 3.7161745709847706e-06, + "loss": 0.4015, + "step": 4359 + }, + { + "epoch": 2.061465721040189, + "grad_norm": 2.866049289703369, + "learning_rate": 3.7156294971346226e-06, + "loss": 0.4087, + "step": 4360 + }, + { + "epoch": 2.06193853427896, + "grad_norm": 2.9345552921295166, + "learning_rate": 3.715084347592556e-06, + "loss": 0.5074, + "step": 4361 + }, + { + "epoch": 2.0624113475177306, + "grad_norm": 2.502455711364746, + "learning_rate": 3.7145391223925155e-06, + "loss": 0.469, + "step": 4362 + }, + { + "epoch": 2.0628841607565014, + "grad_norm": 2.6419875621795654, + "learning_rate": 3.713993821568449e-06, + "loss": 0.4493, + "step": 4363 + }, + { + "epoch": 2.0633569739952717, + "grad_norm": 3.812079429626465, + "learning_rate": 3.7134484451543114e-06, + "loss": 0.4764, + "step": 4364 + }, + { + "epoch": 2.0638297872340425, + "grad_norm": 2.581780195236206, + "learning_rate": 3.712902993184059e-06, + "loss": 0.3994, + "step": 4365 + }, + { + "epoch": 2.0643026004728133, + "grad_norm": 2.282508134841919, + "learning_rate": 3.712357465691656e-06, + "loss": 0.4252, + "step": 4366 + }, + { + "epoch": 2.064775413711584, + "grad_norm": 2.4727818965911865, + "learning_rate": 3.71181186271107e-06, + "loss": 0.4558, + "step": 4367 + }, + { + "epoch": 2.0652482269503545, + "grad_norm": 2.7661173343658447, + "learning_rate": 3.711266184276272e-06, + "loss": 0.505, + "step": 4368 + }, + { + "epoch": 2.0657210401891253, + "grad_norm": 2.6264543533325195, + "learning_rate": 
3.71072043042124e-06, + "loss": 0.4297, + "step": 4369 + }, + { + "epoch": 2.066193853427896, + "grad_norm": 2.773699998855591, + "learning_rate": 3.7101746011799565e-06, + "loss": 0.4267, + "step": 4370 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 2.686955213546753, + "learning_rate": 3.709628696586407e-06, + "loss": 0.4099, + "step": 4371 + }, + { + "epoch": 2.0671394799054372, + "grad_norm": 2.6066620349884033, + "learning_rate": 3.709082716674582e-06, + "loss": 0.4146, + "step": 4372 + }, + { + "epoch": 2.067612293144208, + "grad_norm": 2.7769250869750977, + "learning_rate": 3.7085366614784784e-06, + "loss": 0.4047, + "step": 4373 + }, + { + "epoch": 2.068085106382979, + "grad_norm": 2.4986939430236816, + "learning_rate": 3.7079905310320957e-06, + "loss": 0.4021, + "step": 4374 + }, + { + "epoch": 2.0685579196217496, + "grad_norm": 2.5456206798553467, + "learning_rate": 3.7074443253694402e-06, + "loss": 0.3569, + "step": 4375 + }, + { + "epoch": 2.06903073286052, + "grad_norm": 2.4079296588897705, + "learning_rate": 3.70689804452452e-06, + "loss": 0.4308, + "step": 4376 + }, + { + "epoch": 2.0695035460992908, + "grad_norm": 2.86014723777771, + "learning_rate": 3.7063516885313513e-06, + "loss": 0.4577, + "step": 4377 + }, + { + "epoch": 2.0699763593380616, + "grad_norm": 2.8025779724121094, + "learning_rate": 3.7058052574239523e-06, + "loss": 0.4615, + "step": 4378 + }, + { + "epoch": 2.0704491725768324, + "grad_norm": 2.902676820755005, + "learning_rate": 3.7052587512363475e-06, + "loss": 0.4765, + "step": 4379 + }, + { + "epoch": 2.0709219858156027, + "grad_norm": 2.814509391784668, + "learning_rate": 3.704712170002566e-06, + "loss": 0.434, + "step": 4380 + }, + { + "epoch": 2.0713947990543735, + "grad_norm": 2.7923502922058105, + "learning_rate": 3.704165513756639e-06, + "loss": 0.4626, + "step": 4381 + }, + { + "epoch": 2.0718676122931443, + "grad_norm": 2.6802031993865967, + "learning_rate": 3.703618782532606e-06, + "loss": 0.4835, + "step": 4382 + }, + { + "epoch": 2.072340425531915, + "grad_norm": 3.0963687896728516, + "learning_rate": 3.7030719763645085e-06, + "loss": 0.4813, + "step": 4383 + }, + { + "epoch": 2.0728132387706855, + "grad_norm": 2.5658695697784424, + "learning_rate": 3.7025250952863956e-06, + "loss": 0.4428, + "step": 4384 + }, + { + "epoch": 2.0732860520094563, + "grad_norm": 2.7738289833068848, + "learning_rate": 3.7019781393323167e-06, + "loss": 0.4376, + "step": 4385 + }, + { + "epoch": 2.073758865248227, + "grad_norm": 2.6446938514709473, + "learning_rate": 3.7014311085363303e-06, + "loss": 0.4208, + "step": 4386 + }, + { + "epoch": 2.0742316784869974, + "grad_norm": 2.7556118965148926, + "learning_rate": 3.7008840029324967e-06, + "loss": 0.3831, + "step": 4387 + }, + { + "epoch": 2.074704491725768, + "grad_norm": 2.573141574859619, + "learning_rate": 3.700336822554882e-06, + "loss": 0.4396, + "step": 4388 + }, + { + "epoch": 2.075177304964539, + "grad_norm": 2.762319803237915, + "learning_rate": 3.6997895674375566e-06, + "loss": 0.4579, + "step": 4389 + }, + { + "epoch": 2.07565011820331, + "grad_norm": 2.729780435562134, + "learning_rate": 3.699242237614596e-06, + "loss": 0.4262, + "step": 4390 + }, + { + "epoch": 2.0761229314420806, + "grad_norm": 2.657480001449585, + "learning_rate": 3.698694833120079e-06, + "loss": 0.4176, + "step": 4391 + }, + { + "epoch": 2.076595744680851, + "grad_norm": 2.8433303833007812, + "learning_rate": 3.6981473539880914e-06, + "loss": 0.457, + "step": 4392 + }, + { + "epoch": 2.0770685579196217, + "grad_norm": 
2.819047212600708, + "learning_rate": 3.6975998002527225e-06, + "loss": 0.4244, + "step": 4393 + }, + { + "epoch": 2.0775413711583925, + "grad_norm": 2.6565003395080566, + "learning_rate": 3.697052171948064e-06, + "loss": 0.4384, + "step": 4394 + }, + { + "epoch": 2.078014184397163, + "grad_norm": 2.5795063972473145, + "learning_rate": 3.696504469108216e-06, + "loss": 0.4958, + "step": 4395 + }, + { + "epoch": 2.0784869976359337, + "grad_norm": 2.455730676651001, + "learning_rate": 3.6959566917672822e-06, + "loss": 0.4191, + "step": 4396 + }, + { + "epoch": 2.0789598108747045, + "grad_norm": 2.6706607341766357, + "learning_rate": 3.6954088399593684e-06, + "loss": 0.4709, + "step": 4397 + }, + { + "epoch": 2.0794326241134753, + "grad_norm": 2.3758466243743896, + "learning_rate": 3.694860913718589e-06, + "loss": 0.4231, + "step": 4398 + }, + { + "epoch": 2.079905437352246, + "grad_norm": 2.3488340377807617, + "learning_rate": 3.6943129130790583e-06, + "loss": 0.4321, + "step": 4399 + }, + { + "epoch": 2.0803782505910164, + "grad_norm": 2.6438148021698, + "learning_rate": 3.6937648380748996e-06, + "loss": 0.4907, + "step": 4400 + }, + { + "epoch": 2.0808510638297872, + "grad_norm": 2.9826784133911133, + "learning_rate": 3.6932166887402395e-06, + "loss": 0.4404, + "step": 4401 + }, + { + "epoch": 2.081323877068558, + "grad_norm": 2.5203495025634766, + "learning_rate": 3.6926684651092076e-06, + "loss": 0.4337, + "step": 4402 + }, + { + "epoch": 2.0817966903073284, + "grad_norm": 2.7704148292541504, + "learning_rate": 3.692120167215941e-06, + "loss": 0.4195, + "step": 4403 + }, + { + "epoch": 2.082269503546099, + "grad_norm": 2.879430055618286, + "learning_rate": 3.6915717950945782e-06, + "loss": 0.4498, + "step": 4404 + }, + { + "epoch": 2.08274231678487, + "grad_norm": 2.7659497261047363, + "learning_rate": 3.6910233487792655e-06, + "loss": 0.4017, + "step": 4405 + }, + { + "epoch": 2.083215130023641, + "grad_norm": 3.4017205238342285, + "learning_rate": 3.6904748283041503e-06, + "loss": 0.4733, + "step": 4406 + }, + { + "epoch": 2.083687943262411, + "grad_norm": 2.706223249435425, + "learning_rate": 3.6899262337033887e-06, + "loss": 0.4926, + "step": 4407 + }, + { + "epoch": 2.084160756501182, + "grad_norm": 2.644932508468628, + "learning_rate": 3.6893775650111372e-06, + "loss": 0.3904, + "step": 4408 + }, + { + "epoch": 2.0846335697399527, + "grad_norm": 2.666585683822632, + "learning_rate": 3.6888288222615603e-06, + "loss": 0.4698, + "step": 4409 + }, + { + "epoch": 2.0851063829787235, + "grad_norm": 3.0058486461639404, + "learning_rate": 3.688280005488826e-06, + "loss": 0.5291, + "step": 4410 + }, + { + "epoch": 2.085579196217494, + "grad_norm": 2.533088445663452, + "learning_rate": 3.687731114727105e-06, + "loss": 0.393, + "step": 4411 + }, + { + "epoch": 2.0860520094562647, + "grad_norm": 2.921687364578247, + "learning_rate": 3.6871821500105763e-06, + "loss": 0.4719, + "step": 4412 + }, + { + "epoch": 2.0865248226950355, + "grad_norm": 2.291804313659668, + "learning_rate": 3.686633111373421e-06, + "loss": 0.4105, + "step": 4413 + }, + { + "epoch": 2.0869976359338063, + "grad_norm": 2.496333122253418, + "learning_rate": 3.6860839988498255e-06, + "loss": 0.4704, + "step": 4414 + }, + { + "epoch": 2.0874704491725766, + "grad_norm": 2.8059427738189697, + "learning_rate": 3.6855348124739787e-06, + "loss": 0.4961, + "step": 4415 + }, + { + "epoch": 2.0879432624113474, + "grad_norm": 2.683922290802002, + "learning_rate": 3.6849855522800795e-06, + "loss": 0.4838, + "step": 4416 + }, + { + 
"epoch": 2.088416075650118, + "grad_norm": 2.694148540496826, + "learning_rate": 3.684436218302324e-06, + "loss": 0.4812, + "step": 4417 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 2.724531888961792, + "learning_rate": 3.683886810574919e-06, + "loss": 0.4495, + "step": 4418 + }, + { + "epoch": 2.0893617021276594, + "grad_norm": 2.6176564693450928, + "learning_rate": 3.6833373291320746e-06, + "loss": 0.4698, + "step": 4419 + }, + { + "epoch": 2.08983451536643, + "grad_norm": 2.534116268157959, + "learning_rate": 3.6827877740080032e-06, + "loss": 0.3912, + "step": 4420 + }, + { + "epoch": 2.090307328605201, + "grad_norm": 2.5747432708740234, + "learning_rate": 3.682238145236924e-06, + "loss": 0.4072, + "step": 4421 + }, + { + "epoch": 2.0907801418439718, + "grad_norm": 2.5947659015655518, + "learning_rate": 3.6816884428530588e-06, + "loss": 0.4638, + "step": 4422 + }, + { + "epoch": 2.091252955082742, + "grad_norm": 2.811992883682251, + "learning_rate": 3.6811386668906353e-06, + "loss": 0.4345, + "step": 4423 + }, + { + "epoch": 2.091725768321513, + "grad_norm": 2.7482287883758545, + "learning_rate": 3.680588817383886e-06, + "loss": 0.4541, + "step": 4424 + }, + { + "epoch": 2.0921985815602837, + "grad_norm": 2.987131357192993, + "learning_rate": 3.6800388943670484e-06, + "loss": 0.4571, + "step": 4425 + }, + { + "epoch": 2.0926713947990545, + "grad_norm": 3.1918671131134033, + "learning_rate": 3.6794888978743637e-06, + "loss": 0.5722, + "step": 4426 + }, + { + "epoch": 2.093144208037825, + "grad_norm": 2.5654571056365967, + "learning_rate": 3.678938827940076e-06, + "loss": 0.4686, + "step": 4427 + }, + { + "epoch": 2.0936170212765957, + "grad_norm": 2.942084789276123, + "learning_rate": 3.6783886845984383e-06, + "loss": 0.4512, + "step": 4428 + }, + { + "epoch": 2.0940898345153665, + "grad_norm": 2.74847674369812, + "learning_rate": 3.677838467883703e-06, + "loss": 0.4506, + "step": 4429 + }, + { + "epoch": 2.0945626477541373, + "grad_norm": 2.7569334506988525, + "learning_rate": 3.6772881778301322e-06, + "loss": 0.502, + "step": 4430 + }, + { + "epoch": 2.0950354609929076, + "grad_norm": 2.969966173171997, + "learning_rate": 3.6767378144719884e-06, + "loss": 0.4772, + "step": 4431 + }, + { + "epoch": 2.0955082742316784, + "grad_norm": 2.773524522781372, + "learning_rate": 3.67618737784354e-06, + "loss": 0.5183, + "step": 4432 + }, + { + "epoch": 2.095981087470449, + "grad_norm": 2.6760106086730957, + "learning_rate": 3.6756368679790617e-06, + "loss": 0.4787, + "step": 4433 + }, + { + "epoch": 2.09645390070922, + "grad_norm": 2.8758978843688965, + "learning_rate": 3.6750862849128304e-06, + "loss": 0.4275, + "step": 4434 + }, + { + "epoch": 2.0969267139479904, + "grad_norm": 2.670509099960327, + "learning_rate": 3.6745356286791288e-06, + "loss": 0.4401, + "step": 4435 + }, + { + "epoch": 2.097399527186761, + "grad_norm": 2.8453969955444336, + "learning_rate": 3.673984899312244e-06, + "loss": 0.4303, + "step": 4436 + }, + { + "epoch": 2.097872340425532, + "grad_norm": 2.6212339401245117, + "learning_rate": 3.673434096846468e-06, + "loss": 0.4675, + "step": 4437 + }, + { + "epoch": 2.0983451536643027, + "grad_norm": 2.8211941719055176, + "learning_rate": 3.672883221316095e-06, + "loss": 0.4678, + "step": 4438 + }, + { + "epoch": 2.098817966903073, + "grad_norm": 2.4838058948516846, + "learning_rate": 3.672332272755427e-06, + "loss": 0.4128, + "step": 4439 + }, + { + "epoch": 2.099290780141844, + "grad_norm": 2.596660852432251, + "learning_rate": 3.671781251198769e-06, + "loss": 
0.423, + "step": 4440 + }, + { + "epoch": 2.0997635933806147, + "grad_norm": 2.9979989528656006, + "learning_rate": 3.67123015668043e-06, + "loss": 0.4493, + "step": 4441 + }, + { + "epoch": 2.1002364066193855, + "grad_norm": 2.6232850551605225, + "learning_rate": 3.670678989234725e-06, + "loss": 0.4237, + "step": 4442 + }, + { + "epoch": 2.100709219858156, + "grad_norm": 2.575039863586426, + "learning_rate": 3.670127748895973e-06, + "loss": 0.4464, + "step": 4443 + }, + { + "epoch": 2.1011820330969266, + "grad_norm": 2.3381190299987793, + "learning_rate": 3.669576435698497e-06, + "loss": 0.4208, + "step": 4444 + }, + { + "epoch": 2.1016548463356974, + "grad_norm": 2.9645180702209473, + "learning_rate": 3.669025049676625e-06, + "loss": 0.5272, + "step": 4445 + }, + { + "epoch": 2.1021276595744682, + "grad_norm": 2.719320297241211, + "learning_rate": 3.668473590864689e-06, + "loss": 0.4485, + "step": 4446 + }, + { + "epoch": 2.1026004728132386, + "grad_norm": 2.8665547370910645, + "learning_rate": 3.6679220592970254e-06, + "loss": 0.4433, + "step": 4447 + }, + { + "epoch": 2.1030732860520094, + "grad_norm": 2.6922879219055176, + "learning_rate": 3.667370455007977e-06, + "loss": 0.502, + "step": 4448 + }, + { + "epoch": 2.10354609929078, + "grad_norm": 3.018228530883789, + "learning_rate": 3.6668187780318894e-06, + "loss": 0.4939, + "step": 4449 + }, + { + "epoch": 2.104018912529551, + "grad_norm": 3.187901735305786, + "learning_rate": 3.666267028403112e-06, + "loss": 0.4151, + "step": 4450 + }, + { + "epoch": 2.1044917257683213, + "grad_norm": 2.9521446228027344, + "learning_rate": 3.6657152061560012e-06, + "loss": 0.4343, + "step": 4451 + }, + { + "epoch": 2.104964539007092, + "grad_norm": 2.5125739574432373, + "learning_rate": 3.6651633113249164e-06, + "loss": 0.4071, + "step": 4452 + }, + { + "epoch": 2.105437352245863, + "grad_norm": 2.9164133071899414, + "learning_rate": 3.664611343944221e-06, + "loss": 0.4173, + "step": 4453 + }, + { + "epoch": 2.1059101654846337, + "grad_norm": 2.680893898010254, + "learning_rate": 3.6640593040482834e-06, + "loss": 0.4917, + "step": 4454 + }, + { + "epoch": 2.106382978723404, + "grad_norm": 2.6823534965515137, + "learning_rate": 3.6635071916714774e-06, + "loss": 0.4668, + "step": 4455 + }, + { + "epoch": 2.106855791962175, + "grad_norm": 2.6221907138824463, + "learning_rate": 3.6629550068481806e-06, + "loss": 0.4956, + "step": 4456 + }, + { + "epoch": 2.1073286052009457, + "grad_norm": 3.096370220184326, + "learning_rate": 3.6624027496127745e-06, + "loss": 0.3995, + "step": 4457 + }, + { + "epoch": 2.1078014184397165, + "grad_norm": 2.752885341644287, + "learning_rate": 3.661850419999647e-06, + "loss": 0.4838, + "step": 4458 + }, + { + "epoch": 2.108274231678487, + "grad_norm": 2.6806766986846924, + "learning_rate": 3.661298018043188e-06, + "loss": 0.4817, + "step": 4459 + }, + { + "epoch": 2.1087470449172576, + "grad_norm": 2.6317873001098633, + "learning_rate": 3.660745543777794e-06, + "loss": 0.4777, + "step": 4460 + }, + { + "epoch": 2.1092198581560284, + "grad_norm": 2.4939377307891846, + "learning_rate": 3.6601929972378634e-06, + "loss": 0.4525, + "step": 4461 + }, + { + "epoch": 2.109692671394799, + "grad_norm": 2.4902873039245605, + "learning_rate": 3.659640378457803e-06, + "loss": 0.4392, + "step": 4462 + }, + { + "epoch": 2.1101654846335696, + "grad_norm": 2.5082345008850098, + "learning_rate": 3.6590876874720216e-06, + "loss": 0.4224, + "step": 4463 + }, + { + "epoch": 2.1106382978723404, + "grad_norm": 2.658407211303711, + 
"learning_rate": 3.6585349243149313e-06, + "loss": 0.4316, + "step": 4464 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 2.562883138656616, + "learning_rate": 3.6579820890209515e-06, + "loss": 0.4491, + "step": 4465 + }, + { + "epoch": 2.111583924349882, + "grad_norm": 2.5719261169433594, + "learning_rate": 3.657429181624505e-06, + "loss": 0.4406, + "step": 4466 + }, + { + "epoch": 2.1120567375886523, + "grad_norm": 2.8840596675872803, + "learning_rate": 3.6568762021600184e-06, + "loss": 0.4267, + "step": 4467 + }, + { + "epoch": 2.112529550827423, + "grad_norm": 2.660304546356201, + "learning_rate": 3.656323150661924e-06, + "loss": 0.4502, + "step": 4468 + }, + { + "epoch": 2.113002364066194, + "grad_norm": 2.610996961593628, + "learning_rate": 3.655770027164657e-06, + "loss": 0.3934, + "step": 4469 + }, + { + "epoch": 2.1134751773049647, + "grad_norm": 2.6000053882598877, + "learning_rate": 3.655216831702658e-06, + "loss": 0.4582, + "step": 4470 + }, + { + "epoch": 2.113947990543735, + "grad_norm": 2.73124098777771, + "learning_rate": 3.654663564310372e-06, + "loss": 0.4748, + "step": 4471 + }, + { + "epoch": 2.114420803782506, + "grad_norm": 2.711091995239258, + "learning_rate": 3.6541102250222495e-06, + "loss": 0.4145, + "step": 4472 + }, + { + "epoch": 2.1148936170212767, + "grad_norm": 2.655996561050415, + "learning_rate": 3.6535568138727438e-06, + "loss": 0.4407, + "step": 4473 + }, + { + "epoch": 2.1153664302600474, + "grad_norm": 2.7630865573883057, + "learning_rate": 3.653003330896313e-06, + "loss": 0.4298, + "step": 4474 + }, + { + "epoch": 2.115839243498818, + "grad_norm": 2.554415464401245, + "learning_rate": 3.6524497761274214e-06, + "loss": 0.44, + "step": 4475 + }, + { + "epoch": 2.1163120567375886, + "grad_norm": 2.790328025817871, + "learning_rate": 3.651896149600535e-06, + "loss": 0.5061, + "step": 4476 + }, + { + "epoch": 2.1167848699763594, + "grad_norm": 2.755267381668091, + "learning_rate": 3.651342451350127e-06, + "loss": 0.4588, + "step": 4477 + }, + { + "epoch": 2.11725768321513, + "grad_norm": 2.8936638832092285, + "learning_rate": 3.6507886814106722e-06, + "loss": 0.468, + "step": 4478 + }, + { + "epoch": 2.1177304964539005, + "grad_norm": 2.7394332885742188, + "learning_rate": 3.6502348398166525e-06, + "loss": 0.383, + "step": 4479 + }, + { + "epoch": 2.1182033096926713, + "grad_norm": 2.3359546661376953, + "learning_rate": 3.649680926602553e-06, + "loss": 0.3903, + "step": 4480 + }, + { + "epoch": 2.118676122931442, + "grad_norm": 3.102202892303467, + "learning_rate": 3.6491269418028637e-06, + "loss": 0.4525, + "step": 4481 + }, + { + "epoch": 2.119148936170213, + "grad_norm": 2.467970848083496, + "learning_rate": 3.648572885452078e-06, + "loss": 0.414, + "step": 4482 + }, + { + "epoch": 2.1196217494089833, + "grad_norm": 2.8984131813049316, + "learning_rate": 3.6480187575846952e-06, + "loss": 0.4571, + "step": 4483 + }, + { + "epoch": 2.120094562647754, + "grad_norm": 2.674834966659546, + "learning_rate": 3.6474645582352187e-06, + "loss": 0.455, + "step": 4484 + }, + { + "epoch": 2.120567375886525, + "grad_norm": 2.8713369369506836, + "learning_rate": 3.6469102874381552e-06, + "loss": 0.4567, + "step": 4485 + }, + { + "epoch": 2.1210401891252957, + "grad_norm": 3.174814462661743, + "learning_rate": 3.646355945228017e-06, + "loss": 0.5295, + "step": 4486 + }, + { + "epoch": 2.121513002364066, + "grad_norm": 2.6409823894500732, + "learning_rate": 3.6458015316393215e-06, + "loss": 0.4308, + "step": 4487 + }, + { + "epoch": 2.121985815602837, + 
"grad_norm": 2.4228954315185547, + "learning_rate": 3.645247046706588e-06, + "loss": 0.4042, + "step": 4488 + }, + { + "epoch": 2.1224586288416076, + "grad_norm": 2.553551435470581, + "learning_rate": 3.6446924904643427e-06, + "loss": 0.3925, + "step": 4489 + }, + { + "epoch": 2.1229314420803784, + "grad_norm": 2.8019237518310547, + "learning_rate": 3.6441378629471157e-06, + "loss": 0.4079, + "step": 4490 + }, + { + "epoch": 2.123404255319149, + "grad_norm": 2.993251085281372, + "learning_rate": 3.643583164189441e-06, + "loss": 0.4558, + "step": 4491 + }, + { + "epoch": 2.1238770685579196, + "grad_norm": 2.4531471729278564, + "learning_rate": 3.643028394225857e-06, + "loss": 0.4167, + "step": 4492 + }, + { + "epoch": 2.1243498817966904, + "grad_norm": 2.6827852725982666, + "learning_rate": 3.6424735530909065e-06, + "loss": 0.4311, + "step": 4493 + }, + { + "epoch": 2.124822695035461, + "grad_norm": 3.1232128143310547, + "learning_rate": 3.6419186408191377e-06, + "loss": 0.4537, + "step": 4494 + }, + { + "epoch": 2.1252955082742315, + "grad_norm": 2.816348075866699, + "learning_rate": 3.641363657445103e-06, + "loss": 0.4869, + "step": 4495 + }, + { + "epoch": 2.1257683215130023, + "grad_norm": 2.6269683837890625, + "learning_rate": 3.6408086030033575e-06, + "loss": 0.4066, + "step": 4496 + }, + { + "epoch": 2.126241134751773, + "grad_norm": 4.6375956535339355, + "learning_rate": 3.640253477528462e-06, + "loss": 0.4488, + "step": 4497 + }, + { + "epoch": 2.126713947990544, + "grad_norm": 3.020970582962036, + "learning_rate": 3.639698281054983e-06, + "loss": 0.4197, + "step": 4498 + }, + { + "epoch": 2.1271867612293143, + "grad_norm": 2.87904691696167, + "learning_rate": 3.6391430136174892e-06, + "loss": 0.4743, + "step": 4499 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 2.719892978668213, + "learning_rate": 3.6385876752505554e-06, + "loss": 0.388, + "step": 4500 + }, + { + "epoch": 2.128132387706856, + "grad_norm": 2.7321808338165283, + "learning_rate": 3.638032265988759e-06, + "loss": 0.4857, + "step": 4501 + }, + { + "epoch": 2.1286052009456267, + "grad_norm": 2.700814723968506, + "learning_rate": 3.6374767858666836e-06, + "loss": 0.4819, + "step": 4502 + }, + { + "epoch": 2.129078014184397, + "grad_norm": 2.658423662185669, + "learning_rate": 3.6369212349189164e-06, + "loss": 0.4113, + "step": 4503 + }, + { + "epoch": 2.129550827423168, + "grad_norm": 2.673877716064453, + "learning_rate": 3.63636561318005e-06, + "loss": 0.3745, + "step": 4504 + }, + { + "epoch": 2.1300236406619386, + "grad_norm": 2.607758045196533, + "learning_rate": 3.6358099206846787e-06, + "loss": 0.4409, + "step": 4505 + }, + { + "epoch": 2.1304964539007094, + "grad_norm": 2.8117682933807373, + "learning_rate": 3.6352541574674044e-06, + "loss": 0.426, + "step": 4506 + }, + { + "epoch": 2.1309692671394798, + "grad_norm": 2.6970250606536865, + "learning_rate": 3.634698323562832e-06, + "loss": 0.4295, + "step": 4507 + }, + { + "epoch": 2.1314420803782506, + "grad_norm": 2.7133560180664062, + "learning_rate": 3.6341424190055696e-06, + "loss": 0.4443, + "step": 4508 + }, + { + "epoch": 2.1319148936170214, + "grad_norm": 2.57181715965271, + "learning_rate": 3.6335864438302328e-06, + "loss": 0.3995, + "step": 4509 + }, + { + "epoch": 2.132387706855792, + "grad_norm": 2.8618004322052, + "learning_rate": 3.633030398071438e-06, + "loss": 0.5075, + "step": 4510 + }, + { + "epoch": 2.1328605200945625, + "grad_norm": 2.7586729526519775, + "learning_rate": 3.6324742817638087e-06, + "loss": 0.4322, + "step": 4511 + }, + { + 
"epoch": 2.1333333333333333, + "grad_norm": 2.913256883621216, + "learning_rate": 3.631918094941972e-06, + "loss": 0.4708, + "step": 4512 + }, + { + "epoch": 2.133806146572104, + "grad_norm": 2.7715728282928467, + "learning_rate": 3.6313618376405585e-06, + "loss": 0.5194, + "step": 4513 + }, + { + "epoch": 2.134278959810875, + "grad_norm": 2.7986366748809814, + "learning_rate": 3.6308055098942042e-06, + "loss": 0.4419, + "step": 4514 + }, + { + "epoch": 2.1347517730496453, + "grad_norm": 3.043549060821533, + "learning_rate": 3.6302491117375492e-06, + "loss": 0.4441, + "step": 4515 + }, + { + "epoch": 2.135224586288416, + "grad_norm": 2.771761417388916, + "learning_rate": 3.629692643205238e-06, + "loss": 0.4752, + "step": 4516 + }, + { + "epoch": 2.135697399527187, + "grad_norm": 2.804941415786743, + "learning_rate": 3.6291361043319202e-06, + "loss": 0.4089, + "step": 4517 + }, + { + "epoch": 2.1361702127659576, + "grad_norm": 2.9897940158843994, + "learning_rate": 3.628579495152248e-06, + "loss": 0.4829, + "step": 4518 + }, + { + "epoch": 2.136643026004728, + "grad_norm": 2.9273486137390137, + "learning_rate": 3.6280228157008784e-06, + "loss": 0.4469, + "step": 4519 + }, + { + "epoch": 2.137115839243499, + "grad_norm": 2.584373950958252, + "learning_rate": 3.627466066012475e-06, + "loss": 0.4277, + "step": 4520 + }, + { + "epoch": 2.1375886524822696, + "grad_norm": 3.009333848953247, + "learning_rate": 3.626909246121703e-06, + "loss": 0.4025, + "step": 4521 + }, + { + "epoch": 2.1380614657210404, + "grad_norm": 2.634615659713745, + "learning_rate": 3.626352356063234e-06, + "loss": 0.4046, + "step": 4522 + }, + { + "epoch": 2.1385342789598107, + "grad_norm": 2.87310528755188, + "learning_rate": 3.625795395871743e-06, + "loss": 0.4426, + "step": 4523 + }, + { + "epoch": 2.1390070921985815, + "grad_norm": 2.94985032081604, + "learning_rate": 3.625238365581909e-06, + "loss": 0.445, + "step": 4524 + }, + { + "epoch": 2.1394799054373523, + "grad_norm": 2.470189332962036, + "learning_rate": 3.624681265228416e-06, + "loss": 0.4082, + "step": 4525 + }, + { + "epoch": 2.139952718676123, + "grad_norm": 2.5304040908813477, + "learning_rate": 3.624124094845952e-06, + "loss": 0.403, + "step": 4526 + }, + { + "epoch": 2.1404255319148935, + "grad_norm": 2.6148900985717773, + "learning_rate": 3.62356685446921e-06, + "loss": 0.3867, + "step": 4527 + }, + { + "epoch": 2.1408983451536643, + "grad_norm": 2.885549783706665, + "learning_rate": 3.623009544132886e-06, + "loss": 0.4706, + "step": 4528 + }, + { + "epoch": 2.141371158392435, + "grad_norm": 3.00490665435791, + "learning_rate": 3.6224521638716827e-06, + "loss": 0.4733, + "step": 4529 + }, + { + "epoch": 2.141843971631206, + "grad_norm": 2.925879716873169, + "learning_rate": 3.6218947137203043e-06, + "loss": 0.4581, + "step": 4530 + }, + { + "epoch": 2.1423167848699762, + "grad_norm": 3.10861873626709, + "learning_rate": 3.621337193713462e-06, + "loss": 0.4579, + "step": 4531 + }, + { + "epoch": 2.142789598108747, + "grad_norm": 2.7386577129364014, + "learning_rate": 3.6207796038858693e-06, + "loss": 0.4248, + "step": 4532 + }, + { + "epoch": 2.143262411347518, + "grad_norm": 2.601836681365967, + "learning_rate": 3.6202219442722453e-06, + "loss": 0.4928, + "step": 4533 + }, + { + "epoch": 2.1437352245862886, + "grad_norm": 2.598778247833252, + "learning_rate": 3.6196642149073123e-06, + "loss": 0.4415, + "step": 4534 + }, + { + "epoch": 2.144208037825059, + "grad_norm": 2.443995714187622, + "learning_rate": 3.619106415825798e-06, + "loss": 0.3917, + 
"step": 4535 + }, + { + "epoch": 2.1446808510638298, + "grad_norm": 2.84643816947937, + "learning_rate": 3.6185485470624354e-06, + "loss": 0.4162, + "step": 4536 + }, + { + "epoch": 2.1451536643026006, + "grad_norm": 2.4568188190460205, + "learning_rate": 3.617990608651959e-06, + "loss": 0.4298, + "step": 4537 + }, + { + "epoch": 2.145626477541371, + "grad_norm": 2.968804359436035, + "learning_rate": 3.61743260062911e-06, + "loss": 0.4696, + "step": 4538 + }, + { + "epoch": 2.1460992907801417, + "grad_norm": 2.629075288772583, + "learning_rate": 3.6168745230286327e-06, + "loss": 0.4234, + "step": 4539 + }, + { + "epoch": 2.1465721040189125, + "grad_norm": 2.7680578231811523, + "learning_rate": 3.6163163758852754e-06, + "loss": 0.4669, + "step": 4540 + }, + { + "epoch": 2.1470449172576833, + "grad_norm": 2.782825469970703, + "learning_rate": 3.615758159233793e-06, + "loss": 0.4552, + "step": 4541 + }, + { + "epoch": 2.147517730496454, + "grad_norm": 2.653047561645508, + "learning_rate": 3.615199873108942e-06, + "loss": 0.4393, + "step": 4542 + }, + { + "epoch": 2.1479905437352245, + "grad_norm": 2.4175806045532227, + "learning_rate": 3.6146415175454852e-06, + "loss": 0.4114, + "step": 4543 + }, + { + "epoch": 2.1484633569739953, + "grad_norm": 2.627943515777588, + "learning_rate": 3.614083092578189e-06, + "loss": 0.4215, + "step": 4544 + }, + { + "epoch": 2.148936170212766, + "grad_norm": 2.8934123516082764, + "learning_rate": 3.6135245982418227e-06, + "loss": 0.4815, + "step": 4545 + }, + { + "epoch": 2.1494089834515364, + "grad_norm": 2.8535244464874268, + "learning_rate": 3.612966034571164e-06, + "loss": 0.4683, + "step": 4546 + }, + { + "epoch": 2.149881796690307, + "grad_norm": 2.7826647758483887, + "learning_rate": 3.6124074016009893e-06, + "loss": 0.4351, + "step": 4547 + }, + { + "epoch": 2.150354609929078, + "grad_norm": 2.6906018257141113, + "learning_rate": 3.6118486993660834e-06, + "loss": 0.4585, + "step": 4548 + }, + { + "epoch": 2.150827423167849, + "grad_norm": 2.726766586303711, + "learning_rate": 3.6112899279012346e-06, + "loss": 0.4753, + "step": 4549 + }, + { + "epoch": 2.1513002364066196, + "grad_norm": 3.0193991661071777, + "learning_rate": 3.6107310872412348e-06, + "loss": 0.4827, + "step": 4550 + }, + { + "epoch": 2.15177304964539, + "grad_norm": 2.6788697242736816, + "learning_rate": 3.610172177420881e-06, + "loss": 0.4333, + "step": 4551 + }, + { + "epoch": 2.1522458628841608, + "grad_norm": 2.865410327911377, + "learning_rate": 3.609613198474973e-06, + "loss": 0.4569, + "step": 4552 + }, + { + "epoch": 2.1527186761229316, + "grad_norm": 2.9199366569519043, + "learning_rate": 3.609054150438317e-06, + "loss": 0.5097, + "step": 4553 + }, + { + "epoch": 2.153191489361702, + "grad_norm": 2.761035203933716, + "learning_rate": 3.6084950333457215e-06, + "loss": 0.5002, + "step": 4554 + }, + { + "epoch": 2.1536643026004727, + "grad_norm": 2.514223337173462, + "learning_rate": 3.607935847232002e-06, + "loss": 0.4171, + "step": 4555 + }, + { + "epoch": 2.1541371158392435, + "grad_norm": 2.5167524814605713, + "learning_rate": 3.6073765921319747e-06, + "loss": 0.4494, + "step": 4556 + }, + { + "epoch": 2.1546099290780143, + "grad_norm": 2.7540643215179443, + "learning_rate": 3.606817268080463e-06, + "loss": 0.4472, + "step": 4557 + }, + { + "epoch": 2.155082742316785, + "grad_norm": 2.7728664875030518, + "learning_rate": 3.6062578751122936e-06, + "loss": 0.4669, + "step": 4558 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 2.7788400650024414, + "learning_rate": 
3.605698413262296e-06, + "loss": 0.4613, + "step": 4559 + }, + { + "epoch": 2.1560283687943262, + "grad_norm": 2.7811810970306396, + "learning_rate": 3.605138882565308e-06, + "loss": 0.4242, + "step": 4560 + }, + { + "epoch": 2.156501182033097, + "grad_norm": 2.7819995880126953, + "learning_rate": 3.6045792830561664e-06, + "loss": 0.443, + "step": 4561 + }, + { + "epoch": 2.1569739952718674, + "grad_norm": 2.671259641647339, + "learning_rate": 3.6040196147697166e-06, + "loss": 0.4336, + "step": 4562 + }, + { + "epoch": 2.157446808510638, + "grad_norm": 2.9296300411224365, + "learning_rate": 3.603459877740807e-06, + "loss": 0.479, + "step": 4563 + }, + { + "epoch": 2.157919621749409, + "grad_norm": 2.834937334060669, + "learning_rate": 3.602900072004289e-06, + "loss": 0.4603, + "step": 4564 + }, + { + "epoch": 2.15839243498818, + "grad_norm": 2.8434760570526123, + "learning_rate": 3.602340197595019e-06, + "loss": 0.4288, + "step": 4565 + }, + { + "epoch": 2.1588652482269506, + "grad_norm": 2.7245426177978516, + "learning_rate": 3.6017802545478593e-06, + "loss": 0.4194, + "step": 4566 + }, + { + "epoch": 2.159338061465721, + "grad_norm": 2.7795023918151855, + "learning_rate": 3.6012202428976735e-06, + "loss": 0.4481, + "step": 4567 + }, + { + "epoch": 2.1598108747044917, + "grad_norm": 2.9482083320617676, + "learning_rate": 3.6006601626793325e-06, + "loss": 0.468, + "step": 4568 + }, + { + "epoch": 2.1602836879432625, + "grad_norm": 2.9563326835632324, + "learning_rate": 3.6001000139277094e-06, + "loss": 0.4427, + "step": 4569 + }, + { + "epoch": 2.160756501182033, + "grad_norm": 2.7755916118621826, + "learning_rate": 3.599539796677682e-06, + "loss": 0.4258, + "step": 4570 + }, + { + "epoch": 2.1612293144208037, + "grad_norm": 2.961045265197754, + "learning_rate": 3.5989795109641333e-06, + "loss": 0.4645, + "step": 4571 + }, + { + "epoch": 2.1617021276595745, + "grad_norm": 3.0184407234191895, + "learning_rate": 3.5984191568219482e-06, + "loss": 0.4192, + "step": 4572 + }, + { + "epoch": 2.1621749408983453, + "grad_norm": 2.9811131954193115, + "learning_rate": 3.5978587342860192e-06, + "loss": 0.408, + "step": 4573 + }, + { + "epoch": 2.162647754137116, + "grad_norm": 2.9172329902648926, + "learning_rate": 3.597298243391242e-06, + "loss": 0.4528, + "step": 4574 + }, + { + "epoch": 2.1631205673758864, + "grad_norm": 2.7798452377319336, + "learning_rate": 3.596737684172513e-06, + "loss": 0.391, + "step": 4575 + }, + { + "epoch": 2.1635933806146572, + "grad_norm": 2.526277542114258, + "learning_rate": 3.596177056664738e-06, + "loss": 0.3699, + "step": 4576 + }, + { + "epoch": 2.164066193853428, + "grad_norm": 2.856269121170044, + "learning_rate": 3.5956163609028244e-06, + "loss": 0.4082, + "step": 4577 + }, + { + "epoch": 2.1645390070921984, + "grad_norm": 2.7681572437286377, + "learning_rate": 3.5950555969216845e-06, + "loss": 0.4064, + "step": 4578 + }, + { + "epoch": 2.165011820330969, + "grad_norm": 2.2924954891204834, + "learning_rate": 3.5944947647562333e-06, + "loss": 0.416, + "step": 4579 + }, + { + "epoch": 2.16548463356974, + "grad_norm": 2.439929485321045, + "learning_rate": 3.5939338644413936e-06, + "loss": 0.4476, + "step": 4580 + }, + { + "epoch": 2.1659574468085108, + "grad_norm": 2.786442518234253, + "learning_rate": 3.5933728960120877e-06, + "loss": 0.4525, + "step": 4581 + }, + { + "epoch": 2.166430260047281, + "grad_norm": 2.5910253524780273, + "learning_rate": 3.5928118595032465e-06, + "loss": 0.4441, + "step": 4582 + }, + { + "epoch": 2.166903073286052, + "grad_norm": 
2.8144876956939697, + "learning_rate": 3.5922507549498024e-06, + "loss": 0.497, + "step": 4583 + }, + { + "epoch": 2.1673758865248227, + "grad_norm": 2.5714170932769775, + "learning_rate": 3.591689582386694e-06, + "loss": 0.4625, + "step": 4584 + }, + { + "epoch": 2.1678486997635935, + "grad_norm": 2.878187894821167, + "learning_rate": 3.591128341848861e-06, + "loss": 0.4835, + "step": 4585 + }, + { + "epoch": 2.168321513002364, + "grad_norm": 2.4946508407592773, + "learning_rate": 3.5905670333712504e-06, + "loss": 0.4278, + "step": 4586 + }, + { + "epoch": 2.1687943262411347, + "grad_norm": 2.9186196327209473, + "learning_rate": 3.590005656988814e-06, + "loss": 0.465, + "step": 4587 + }, + { + "epoch": 2.1692671394799055, + "grad_norm": 3.136807441711426, + "learning_rate": 3.5894442127365046e-06, + "loss": 0.4146, + "step": 4588 + }, + { + "epoch": 2.1697399527186763, + "grad_norm": 2.8106343746185303, + "learning_rate": 3.5888827006492804e-06, + "loss": 0.4737, + "step": 4589 + }, + { + "epoch": 2.1702127659574466, + "grad_norm": 2.874553680419922, + "learning_rate": 3.5883211207621047e-06, + "loss": 0.3962, + "step": 4590 + }, + { + "epoch": 2.1706855791962174, + "grad_norm": 2.7914116382598877, + "learning_rate": 3.587759473109946e-06, + "loss": 0.4705, + "step": 4591 + }, + { + "epoch": 2.171158392434988, + "grad_norm": 2.7273290157318115, + "learning_rate": 3.5871977577277745e-06, + "loss": 0.4827, + "step": 4592 + }, + { + "epoch": 2.171631205673759, + "grad_norm": 2.4167256355285645, + "learning_rate": 3.5866359746505653e-06, + "loss": 0.4181, + "step": 4593 + }, + { + "epoch": 2.1721040189125294, + "grad_norm": 2.8929779529571533, + "learning_rate": 3.586074123913299e-06, + "loss": 0.4006, + "step": 4594 + }, + { + "epoch": 2.1725768321513, + "grad_norm": 2.6996190547943115, + "learning_rate": 3.5855122055509593e-06, + "loss": 0.4792, + "step": 4595 + }, + { + "epoch": 2.173049645390071, + "grad_norm": 2.9341464042663574, + "learning_rate": 3.584950219598534e-06, + "loss": 0.3903, + "step": 4596 + }, + { + "epoch": 2.1735224586288417, + "grad_norm": 2.799330234527588, + "learning_rate": 3.5843881660910166e-06, + "loss": 0.4717, + "step": 4597 + }, + { + "epoch": 2.173995271867612, + "grad_norm": 2.5028693675994873, + "learning_rate": 3.5838260450634028e-06, + "loss": 0.4462, + "step": 4598 + }, + { + "epoch": 2.174468085106383, + "grad_norm": 2.5845541954040527, + "learning_rate": 3.583263856550693e-06, + "loss": 0.4327, + "step": 4599 + }, + { + "epoch": 2.1749408983451537, + "grad_norm": 2.4804906845092773, + "learning_rate": 3.5827016005878933e-06, + "loss": 0.4555, + "step": 4600 + }, + { + "epoch": 2.1754137115839245, + "grad_norm": 2.625746011734009, + "learning_rate": 3.5821392772100125e-06, + "loss": 0.455, + "step": 4601 + }, + { + "epoch": 2.175886524822695, + "grad_norm": 2.6230757236480713, + "learning_rate": 3.581576886452064e-06, + "loss": 0.4422, + "step": 4602 + }, + { + "epoch": 2.1763593380614656, + "grad_norm": 3.3104100227355957, + "learning_rate": 3.5810144283490656e-06, + "loss": 0.4212, + "step": 4603 + }, + { + "epoch": 2.1768321513002364, + "grad_norm": 2.6799755096435547, + "learning_rate": 3.5804519029360384e-06, + "loss": 0.4575, + "step": 4604 + }, + { + "epoch": 2.1773049645390072, + "grad_norm": 2.462216854095459, + "learning_rate": 3.5798893102480085e-06, + "loss": 0.4096, + "step": 4605 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 2.8600878715515137, + "learning_rate": 3.5793266503200074e-06, + "loss": 0.4798, + "step": 4606 + }, + { + 
"epoch": 2.1782505910165484, + "grad_norm": 2.935746431350708, + "learning_rate": 3.5787639231870673e-06, + "loss": 0.4021, + "step": 4607 + }, + { + "epoch": 2.178723404255319, + "grad_norm": 2.8655526638031006, + "learning_rate": 3.578201128884229e-06, + "loss": 0.4553, + "step": 4608 + }, + { + "epoch": 2.17919621749409, + "grad_norm": 3.219498634338379, + "learning_rate": 3.577638267446533e-06, + "loss": 0.4692, + "step": 4609 + }, + { + "epoch": 2.1796690307328603, + "grad_norm": 3.0449860095977783, + "learning_rate": 3.5770753389090283e-06, + "loss": 0.4675, + "step": 4610 + }, + { + "epoch": 2.180141843971631, + "grad_norm": 2.7045507431030273, + "learning_rate": 3.576512343306765e-06, + "loss": 0.4773, + "step": 4611 + }, + { + "epoch": 2.180614657210402, + "grad_norm": 2.601499557495117, + "learning_rate": 3.5759492806747985e-06, + "loss": 0.4112, + "step": 4612 + }, + { + "epoch": 2.1810874704491727, + "grad_norm": 2.987741470336914, + "learning_rate": 3.575386151048188e-06, + "loss": 0.4651, + "step": 4613 + }, + { + "epoch": 2.181560283687943, + "grad_norm": 2.961228847503662, + "learning_rate": 3.5748229544619973e-06, + "loss": 0.5116, + "step": 4614 + }, + { + "epoch": 2.182033096926714, + "grad_norm": 2.8008430004119873, + "learning_rate": 3.574259690951295e-06, + "loss": 0.4152, + "step": 4615 + }, + { + "epoch": 2.1825059101654847, + "grad_norm": 2.5429348945617676, + "learning_rate": 3.573696360551151e-06, + "loss": 0.4188, + "step": 4616 + }, + { + "epoch": 2.1829787234042555, + "grad_norm": 2.9566478729248047, + "learning_rate": 3.5731329632966428e-06, + "loss": 0.5156, + "step": 4617 + }, + { + "epoch": 2.183451536643026, + "grad_norm": 2.5302467346191406, + "learning_rate": 3.572569499222851e-06, + "loss": 0.4361, + "step": 4618 + }, + { + "epoch": 2.1839243498817966, + "grad_norm": 3.206803560256958, + "learning_rate": 3.5720059683648593e-06, + "loss": 0.5149, + "step": 4619 + }, + { + "epoch": 2.1843971631205674, + "grad_norm": 2.9432034492492676, + "learning_rate": 3.5714423707577573e-06, + "loss": 0.4411, + "step": 4620 + }, + { + "epoch": 2.184869976359338, + "grad_norm": 2.9412078857421875, + "learning_rate": 3.5708787064366358e-06, + "loss": 0.4372, + "step": 4621 + }, + { + "epoch": 2.1853427895981086, + "grad_norm": 3.1702330112457275, + "learning_rate": 3.5703149754365935e-06, + "loss": 0.4761, + "step": 4622 + }, + { + "epoch": 2.1858156028368794, + "grad_norm": 3.1240456104278564, + "learning_rate": 3.569751177792731e-06, + "loss": 0.4854, + "step": 4623 + }, + { + "epoch": 2.18628841607565, + "grad_norm": 2.7221994400024414, + "learning_rate": 3.5691873135401534e-06, + "loss": 0.4048, + "step": 4624 + }, + { + "epoch": 2.186761229314421, + "grad_norm": 2.74397873878479, + "learning_rate": 3.5686233827139695e-06, + "loss": 0.4747, + "step": 4625 + }, + { + "epoch": 2.1872340425531913, + "grad_norm": 2.7379889488220215, + "learning_rate": 3.5680593853492932e-06, + "loss": 0.4963, + "step": 4626 + }, + { + "epoch": 2.187706855791962, + "grad_norm": 3.040205478668213, + "learning_rate": 3.5674953214812435e-06, + "loss": 0.4917, + "step": 4627 + }, + { + "epoch": 2.188179669030733, + "grad_norm": 2.95302677154541, + "learning_rate": 3.56693119114494e-06, + "loss": 0.4758, + "step": 4628 + }, + { + "epoch": 2.1886524822695037, + "grad_norm": 2.5488312244415283, + "learning_rate": 3.56636699437551e-06, + "loss": 0.4057, + "step": 4629 + }, + { + "epoch": 2.189125295508274, + "grad_norm": 2.8379666805267334, + "learning_rate": 3.565802731208083e-06, + "loss": 
0.4755, + "step": 4630 + }, + { + "epoch": 2.189598108747045, + "grad_norm": 2.8765869140625, + "learning_rate": 3.565238401677793e-06, + "loss": 0.4232, + "step": 4631 + }, + { + "epoch": 2.1900709219858157, + "grad_norm": 2.9091262817382812, + "learning_rate": 3.5646740058197784e-06, + "loss": 0.3874, + "step": 4632 + }, + { + "epoch": 2.1905437352245865, + "grad_norm": 2.7067387104034424, + "learning_rate": 3.5641095436691826e-06, + "loss": 0.4771, + "step": 4633 + }, + { + "epoch": 2.191016548463357, + "grad_norm": 2.403043508529663, + "learning_rate": 3.563545015261151e-06, + "loss": 0.4062, + "step": 4634 + }, + { + "epoch": 2.1914893617021276, + "grad_norm": 2.8059732913970947, + "learning_rate": 3.562980420630836e-06, + "loss": 0.4635, + "step": 4635 + }, + { + "epoch": 2.1919621749408984, + "grad_norm": 2.5467724800109863, + "learning_rate": 3.56241575981339e-06, + "loss": 0.4552, + "step": 4636 + }, + { + "epoch": 2.192434988179669, + "grad_norm": 2.651024103164673, + "learning_rate": 3.561851032843973e-06, + "loss": 0.38, + "step": 4637 + }, + { + "epoch": 2.1929078014184396, + "grad_norm": 2.5529849529266357, + "learning_rate": 3.5612862397577496e-06, + "loss": 0.4106, + "step": 4638 + }, + { + "epoch": 2.1933806146572103, + "grad_norm": 3.069258451461792, + "learning_rate": 3.5607213805898844e-06, + "loss": 0.461, + "step": 4639 + }, + { + "epoch": 2.193853427895981, + "grad_norm": 2.5652637481689453, + "learning_rate": 3.56015645537555e-06, + "loss": 0.4497, + "step": 4640 + }, + { + "epoch": 2.194326241134752, + "grad_norm": 2.699101209640503, + "learning_rate": 3.5595914641499224e-06, + "loss": 0.4887, + "step": 4641 + }, + { + "epoch": 2.1947990543735223, + "grad_norm": 2.9292235374450684, + "learning_rate": 3.5590264069481805e-06, + "loss": 0.4462, + "step": 4642 + }, + { + "epoch": 2.195271867612293, + "grad_norm": 2.6151106357574463, + "learning_rate": 3.5584612838055077e-06, + "loss": 0.4334, + "step": 4643 + }, + { + "epoch": 2.195744680851064, + "grad_norm": 2.895798444747925, + "learning_rate": 3.5578960947570923e-06, + "loss": 0.4448, + "step": 4644 + }, + { + "epoch": 2.1962174940898347, + "grad_norm": 2.627631425857544, + "learning_rate": 3.557330839838125e-06, + "loss": 0.436, + "step": 4645 + }, + { + "epoch": 2.196690307328605, + "grad_norm": 2.8803584575653076, + "learning_rate": 3.556765519083803e-06, + "loss": 0.4698, + "step": 4646 + }, + { + "epoch": 2.197163120567376, + "grad_norm": 2.436609983444214, + "learning_rate": 3.5562001325293265e-06, + "loss": 0.4043, + "step": 4647 + }, + { + "epoch": 2.1976359338061466, + "grad_norm": 2.5090718269348145, + "learning_rate": 3.5556346802098985e-06, + "loss": 0.4505, + "step": 4648 + }, + { + "epoch": 2.1981087470449174, + "grad_norm": 2.792783737182617, + "learning_rate": 3.5550691621607277e-06, + "loss": 0.43, + "step": 4649 + }, + { + "epoch": 2.198581560283688, + "grad_norm": 2.74153470993042, + "learning_rate": 3.554503578417026e-06, + "loss": 0.4496, + "step": 4650 + }, + { + "epoch": 2.1990543735224586, + "grad_norm": 3.0262627601623535, + "learning_rate": 3.5539379290140114e-06, + "loss": 0.4503, + "step": 4651 + }, + { + "epoch": 2.1995271867612294, + "grad_norm": 2.783811330795288, + "learning_rate": 3.553372213986903e-06, + "loss": 0.432, + "step": 4652 + }, + { + "epoch": 2.2, + "grad_norm": 3.091191053390503, + "learning_rate": 3.5528064333709255e-06, + "loss": 0.4658, + "step": 4653 + }, + { + "epoch": 2.2004728132387705, + "grad_norm": 2.814634084701538, + "learning_rate": 3.5522405872013076e-06, 
+ "loss": 0.4473, + "step": 4654 + }, + { + "epoch": 2.2009456264775413, + "grad_norm": 2.6918299198150635, + "learning_rate": 3.5516746755132824e-06, + "loss": 0.5323, + "step": 4655 + }, + { + "epoch": 2.201418439716312, + "grad_norm": 2.9902455806732178, + "learning_rate": 3.5511086983420867e-06, + "loss": 0.5166, + "step": 4656 + }, + { + "epoch": 2.201891252955083, + "grad_norm": 2.932699203491211, + "learning_rate": 3.5505426557229616e-06, + "loss": 0.5197, + "step": 4657 + }, + { + "epoch": 2.2023640661938533, + "grad_norm": 2.585712432861328, + "learning_rate": 3.549976547691152e-06, + "loss": 0.425, + "step": 4658 + }, + { + "epoch": 2.202836879432624, + "grad_norm": 3.1019949913024902, + "learning_rate": 3.5494103742819065e-06, + "loss": 0.485, + "step": 4659 + }, + { + "epoch": 2.203309692671395, + "grad_norm": 2.3169195652008057, + "learning_rate": 3.548844135530478e-06, + "loss": 0.4064, + "step": 4660 + }, + { + "epoch": 2.2037825059101657, + "grad_norm": 2.779240846633911, + "learning_rate": 3.5482778314721257e-06, + "loss": 0.427, + "step": 4661 + }, + { + "epoch": 2.204255319148936, + "grad_norm": 2.765423059463501, + "learning_rate": 3.5477114621421078e-06, + "loss": 0.5125, + "step": 4662 + }, + { + "epoch": 2.204728132387707, + "grad_norm": 2.5590033531188965, + "learning_rate": 3.5471450275756913e-06, + "loss": 0.4009, + "step": 4663 + }, + { + "epoch": 2.2052009456264776, + "grad_norm": 2.706068515777588, + "learning_rate": 3.546578527808146e-06, + "loss": 0.4604, + "step": 4664 + }, + { + "epoch": 2.2056737588652484, + "grad_norm": 2.7995102405548096, + "learning_rate": 3.546011962874745e-06, + "loss": 0.4088, + "step": 4665 + }, + { + "epoch": 2.2061465721040188, + "grad_norm": 2.6369729042053223, + "learning_rate": 3.5454453328107656e-06, + "loss": 0.4634, + "step": 4666 + }, + { + "epoch": 2.2066193853427896, + "grad_norm": 3.1426475048065186, + "learning_rate": 3.54487863765149e-06, + "loss": 0.4761, + "step": 4667 + }, + { + "epoch": 2.2070921985815604, + "grad_norm": 2.7739460468292236, + "learning_rate": 3.5443118774322027e-06, + "loss": 0.467, + "step": 4668 + }, + { + "epoch": 2.207565011820331, + "grad_norm": 2.559105157852173, + "learning_rate": 3.5437450521881934e-06, + "loss": 0.4268, + "step": 4669 + }, + { + "epoch": 2.2080378250591015, + "grad_norm": 2.726593017578125, + "learning_rate": 3.543178161954758e-06, + "loss": 0.462, + "step": 4670 + }, + { + "epoch": 2.2085106382978723, + "grad_norm": 2.796109199523926, + "learning_rate": 3.5426112067671907e-06, + "loss": 0.4571, + "step": 4671 + }, + { + "epoch": 2.208983451536643, + "grad_norm": 2.7989072799682617, + "learning_rate": 3.5420441866607964e-06, + "loss": 0.4648, + "step": 4672 + }, + { + "epoch": 2.209456264775414, + "grad_norm": 2.6750967502593994, + "learning_rate": 3.5414771016708795e-06, + "loss": 0.4717, + "step": 4673 + }, + { + "epoch": 2.2099290780141843, + "grad_norm": 2.705659866333008, + "learning_rate": 3.5409099518327507e-06, + "loss": 0.4738, + "step": 4674 + }, + { + "epoch": 2.210401891252955, + "grad_norm": 2.79276442527771, + "learning_rate": 3.5403427371817234e-06, + "loss": 0.4625, + "step": 4675 + }, + { + "epoch": 2.210874704491726, + "grad_norm": 2.781339406967163, + "learning_rate": 3.539775457753115e-06, + "loss": 0.438, + "step": 4676 + }, + { + "epoch": 2.2113475177304966, + "grad_norm": 3.0088918209075928, + "learning_rate": 3.5392081135822488e-06, + "loss": 0.4776, + "step": 4677 + }, + { + "epoch": 2.211820330969267, + "grad_norm": 3.0291390419006348, + 
"learning_rate": 3.538640704704449e-06, + "loss": 0.4634, + "step": 4678 + }, + { + "epoch": 2.212293144208038, + "grad_norm": 2.967867374420166, + "learning_rate": 3.5380732311550477e-06, + "loss": 0.4776, + "step": 4679 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 2.6268832683563232, + "learning_rate": 3.5375056929693787e-06, + "loss": 0.4646, + "step": 4680 + }, + { + "epoch": 2.2132387706855794, + "grad_norm": 2.6688554286956787, + "learning_rate": 3.536938090182778e-06, + "loss": 0.3975, + "step": 4681 + }, + { + "epoch": 2.2137115839243497, + "grad_norm": 3.0079736709594727, + "learning_rate": 3.5363704228305906e-06, + "loss": 0.4724, + "step": 4682 + }, + { + "epoch": 2.2141843971631205, + "grad_norm": 2.4287586212158203, + "learning_rate": 3.535802690948161e-06, + "loss": 0.4371, + "step": 4683 + }, + { + "epoch": 2.2146572104018913, + "grad_norm": 2.960679531097412, + "learning_rate": 3.53523489457084e-06, + "loss": 0.4347, + "step": 4684 + }, + { + "epoch": 2.215130023640662, + "grad_norm": 2.9646008014678955, + "learning_rate": 3.5346670337339807e-06, + "loss": 0.4803, + "step": 4685 + }, + { + "epoch": 2.2156028368794325, + "grad_norm": 3.0518898963928223, + "learning_rate": 3.534099108472942e-06, + "loss": 0.4712, + "step": 4686 + }, + { + "epoch": 2.2160756501182033, + "grad_norm": 2.776681900024414, + "learning_rate": 3.533531118823086e-06, + "loss": 0.4347, + "step": 4687 + }, + { + "epoch": 2.216548463356974, + "grad_norm": 2.18019437789917, + "learning_rate": 3.53296306481978e-06, + "loss": 0.3551, + "step": 4688 + }, + { + "epoch": 2.217021276595745, + "grad_norm": 2.9400811195373535, + "learning_rate": 3.5323949464983937e-06, + "loss": 0.4912, + "step": 4689 + }, + { + "epoch": 2.2174940898345152, + "grad_norm": 2.798386812210083, + "learning_rate": 3.5318267638943e-06, + "loss": 0.3967, + "step": 4690 + }, + { + "epoch": 2.217966903073286, + "grad_norm": 2.5452775955200195, + "learning_rate": 3.531258517042879e-06, + "loss": 0.3773, + "step": 4691 + }, + { + "epoch": 2.218439716312057, + "grad_norm": 2.711137294769287, + "learning_rate": 3.5306902059795113e-06, + "loss": 0.4123, + "step": 4692 + }, + { + "epoch": 2.2189125295508276, + "grad_norm": 3.0022387504577637, + "learning_rate": 3.530121830739584e-06, + "loss": 0.4898, + "step": 4693 + }, + { + "epoch": 2.219385342789598, + "grad_norm": 2.871814250946045, + "learning_rate": 3.5295533913584877e-06, + "loss": 0.4497, + "step": 4694 + }, + { + "epoch": 2.219858156028369, + "grad_norm": 2.9782521724700928, + "learning_rate": 3.528984887871616e-06, + "loss": 0.4797, + "step": 4695 + }, + { + "epoch": 2.2203309692671396, + "grad_norm": 2.6896398067474365, + "learning_rate": 3.5284163203143673e-06, + "loss": 0.439, + "step": 4696 + }, + { + "epoch": 2.2208037825059104, + "grad_norm": 2.7898833751678467, + "learning_rate": 3.5278476887221436e-06, + "loss": 0.4656, + "step": 4697 + }, + { + "epoch": 2.2212765957446807, + "grad_norm": 2.800416946411133, + "learning_rate": 3.527278993130352e-06, + "loss": 0.4452, + "step": 4698 + }, + { + "epoch": 2.2217494089834515, + "grad_norm": 3.653228998184204, + "learning_rate": 3.526710233574401e-06, + "loss": 0.4189, + "step": 4699 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 2.856956958770752, + "learning_rate": 3.5261414100897064e-06, + "loss": 0.4298, + "step": 4700 + }, + { + "epoch": 2.222695035460993, + "grad_norm": 2.8576223850250244, + "learning_rate": 3.5255725227116854e-06, + "loss": 0.4425, + "step": 4701 + }, + { + "epoch": 2.2231678486997635, + 
"grad_norm": 3.1161351203918457, + "learning_rate": 3.5250035714757603e-06, + "loss": 0.4609, + "step": 4702 + }, + { + "epoch": 2.2236406619385343, + "grad_norm": 2.843379259109497, + "learning_rate": 3.5244345564173578e-06, + "loss": 0.3589, + "step": 4703 + }, + { + "epoch": 2.224113475177305, + "grad_norm": 2.877157211303711, + "learning_rate": 3.5238654775719068e-06, + "loss": 0.4591, + "step": 4704 + }, + { + "epoch": 2.2245862884160754, + "grad_norm": 3.488954782485962, + "learning_rate": 3.5232963349748424e-06, + "loss": 0.4836, + "step": 4705 + }, + { + "epoch": 2.225059101654846, + "grad_norm": 2.929037570953369, + "learning_rate": 3.5227271286616025e-06, + "loss": 0.5293, + "step": 4706 + }, + { + "epoch": 2.225531914893617, + "grad_norm": 2.6230576038360596, + "learning_rate": 3.5221578586676286e-06, + "loss": 0.4235, + "step": 4707 + }, + { + "epoch": 2.226004728132388, + "grad_norm": 2.529998302459717, + "learning_rate": 3.5215885250283664e-06, + "loss": 0.4369, + "step": 4708 + }, + { + "epoch": 2.2264775413711586, + "grad_norm": 2.817279577255249, + "learning_rate": 3.521019127779267e-06, + "loss": 0.481, + "step": 4709 + }, + { + "epoch": 2.226950354609929, + "grad_norm": 3.1513843536376953, + "learning_rate": 3.5204496669557833e-06, + "loss": 0.463, + "step": 4710 + }, + { + "epoch": 2.2274231678486998, + "grad_norm": 2.9403610229492188, + "learning_rate": 3.5198801425933725e-06, + "loss": 0.455, + "step": 4711 + }, + { + "epoch": 2.2278959810874706, + "grad_norm": 2.648346424102783, + "learning_rate": 3.5193105547274987e-06, + "loss": 0.4441, + "step": 4712 + }, + { + "epoch": 2.228368794326241, + "grad_norm": 2.791898727416992, + "learning_rate": 3.5187409033936252e-06, + "loss": 0.4682, + "step": 4713 + }, + { + "epoch": 2.2288416075650117, + "grad_norm": 2.8157432079315186, + "learning_rate": 3.5181711886272242e-06, + "loss": 0.4572, + "step": 4714 + }, + { + "epoch": 2.2293144208037825, + "grad_norm": 3.250319480895996, + "learning_rate": 3.5176014104637665e-06, + "loss": 0.4599, + "step": 4715 + }, + { + "epoch": 2.2297872340425533, + "grad_norm": 2.6747050285339355, + "learning_rate": 3.5170315689387307e-06, + "loss": 0.4328, + "step": 4716 + }, + { + "epoch": 2.230260047281324, + "grad_norm": 2.584094762802124, + "learning_rate": 3.5164616640875993e-06, + "loss": 0.4268, + "step": 4717 + }, + { + "epoch": 2.2307328605200945, + "grad_norm": 2.480710506439209, + "learning_rate": 3.5158916959458573e-06, + "loss": 0.438, + "step": 4718 + }, + { + "epoch": 2.2312056737588652, + "grad_norm": 2.9338483810424805, + "learning_rate": 3.515321664548993e-06, + "loss": 0.4937, + "step": 4719 + }, + { + "epoch": 2.231678486997636, + "grad_norm": 2.7880783081054688, + "learning_rate": 3.5147515699325013e-06, + "loss": 0.4624, + "step": 4720 + }, + { + "epoch": 2.2321513002364064, + "grad_norm": 2.740841865539551, + "learning_rate": 3.5141814121318797e-06, + "loss": 0.3689, + "step": 4721 + }, + { + "epoch": 2.232624113475177, + "grad_norm": 2.9541244506835938, + "learning_rate": 3.5136111911826277e-06, + "loss": 0.4092, + "step": 4722 + }, + { + "epoch": 2.233096926713948, + "grad_norm": 2.7205398082733154, + "learning_rate": 3.5130409071202515e-06, + "loss": 0.445, + "step": 4723 + }, + { + "epoch": 2.233569739952719, + "grad_norm": 2.563406229019165, + "learning_rate": 3.51247055998026e-06, + "loss": 0.4335, + "step": 4724 + }, + { + "epoch": 2.2340425531914896, + "grad_norm": 2.4249489307403564, + "learning_rate": 3.5119001497981666e-06, + "loss": 0.4671, + "step": 4725 + }, 
+ { + "epoch": 2.23451536643026, + "grad_norm": 2.711630344390869, + "learning_rate": 3.5113296766094875e-06, + "loss": 0.4177, + "step": 4726 + }, + { + "epoch": 2.2349881796690307, + "grad_norm": 3.0257632732391357, + "learning_rate": 3.5107591404497443e-06, + "loss": 0.4976, + "step": 4727 + }, + { + "epoch": 2.2354609929078015, + "grad_norm": 2.717303991317749, + "learning_rate": 3.5101885413544614e-06, + "loss": 0.4621, + "step": 4728 + }, + { + "epoch": 2.235933806146572, + "grad_norm": 3.2846004962921143, + "learning_rate": 3.509617879359167e-06, + "loss": 0.4284, + "step": 4729 + }, + { + "epoch": 2.2364066193853427, + "grad_norm": 2.7217819690704346, + "learning_rate": 3.5090471544993953e-06, + "loss": 0.4247, + "step": 4730 + }, + { + "epoch": 2.2368794326241135, + "grad_norm": 2.5003223419189453, + "learning_rate": 3.5084763668106812e-06, + "loss": 0.4096, + "step": 4731 + }, + { + "epoch": 2.2373522458628843, + "grad_norm": 2.7312731742858887, + "learning_rate": 3.5079055163285658e-06, + "loss": 0.4741, + "step": 4732 + }, + { + "epoch": 2.237825059101655, + "grad_norm": 2.84940767288208, + "learning_rate": 3.5073346030885934e-06, + "loss": 0.4887, + "step": 4733 + }, + { + "epoch": 2.2382978723404254, + "grad_norm": 3.1188511848449707, + "learning_rate": 3.506763627126313e-06, + "loss": 0.5335, + "step": 4734 + }, + { + "epoch": 2.2387706855791962, + "grad_norm": 2.6741397380828857, + "learning_rate": 3.5061925884772753e-06, + "loss": 0.4137, + "step": 4735 + }, + { + "epoch": 2.239243498817967, + "grad_norm": 3.1542465686798096, + "learning_rate": 3.505621487177037e-06, + "loss": 0.5303, + "step": 4736 + }, + { + "epoch": 2.2397163120567374, + "grad_norm": 5.448268890380859, + "learning_rate": 3.505050323261159e-06, + "loss": 0.4995, + "step": 4737 + }, + { + "epoch": 2.240189125295508, + "grad_norm": 2.7317898273468018, + "learning_rate": 3.5044790967652037e-06, + "loss": 0.4595, + "step": 4738 + }, + { + "epoch": 2.240661938534279, + "grad_norm": 2.8135695457458496, + "learning_rate": 3.50390780772474e-06, + "loss": 0.4593, + "step": 4739 + }, + { + "epoch": 2.2411347517730498, + "grad_norm": 3.1391844749450684, + "learning_rate": 3.5033364561753393e-06, + "loss": 0.4902, + "step": 4740 + }, + { + "epoch": 2.24160756501182, + "grad_norm": 2.6383132934570312, + "learning_rate": 3.5027650421525762e-06, + "loss": 0.3832, + "step": 4741 + }, + { + "epoch": 2.242080378250591, + "grad_norm": 2.742546558380127, + "learning_rate": 3.5021935656920314e-06, + "loss": 0.4012, + "step": 4742 + }, + { + "epoch": 2.2425531914893617, + "grad_norm": 3.1243674755096436, + "learning_rate": 3.5016220268292873e-06, + "loss": 0.4271, + "step": 4743 + }, + { + "epoch": 2.2430260047281325, + "grad_norm": 2.794717788696289, + "learning_rate": 3.501050425599932e-06, + "loss": 0.4604, + "step": 4744 + }, + { + "epoch": 2.243498817966903, + "grad_norm": 2.8481621742248535, + "learning_rate": 3.5004787620395565e-06, + "loss": 0.4814, + "step": 4745 + }, + { + "epoch": 2.2439716312056737, + "grad_norm": 2.8842051029205322, + "learning_rate": 3.499907036183755e-06, + "loss": 0.4987, + "step": 4746 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 3.074805974960327, + "learning_rate": 3.4993352480681265e-06, + "loss": 0.4966, + "step": 4747 + }, + { + "epoch": 2.2449172576832153, + "grad_norm": 2.7204246520996094, + "learning_rate": 3.4987633977282742e-06, + "loss": 0.4, + "step": 4748 + }, + { + "epoch": 2.2453900709219856, + "grad_norm": 2.685884952545166, + "learning_rate": 3.4981914851998055e-06, 
+ "loss": 0.4285, + "step": 4749 + }, + { + "epoch": 2.2458628841607564, + "grad_norm": 2.1666336059570312, + "learning_rate": 3.4976195105183287e-06, + "loss": 0.3756, + "step": 4750 + }, + { + "epoch": 2.246335697399527, + "grad_norm": 2.863006353378296, + "learning_rate": 3.49704747371946e-06, + "loss": 0.4535, + "step": 4751 + }, + { + "epoch": 2.246808510638298, + "grad_norm": 2.5558736324310303, + "learning_rate": 3.496475374838817e-06, + "loss": 0.4129, + "step": 4752 + }, + { + "epoch": 2.2472813238770684, + "grad_norm": 2.9780309200286865, + "learning_rate": 3.495903213912022e-06, + "loss": 0.4871, + "step": 4753 + }, + { + "epoch": 2.247754137115839, + "grad_norm": 2.951779365539551, + "learning_rate": 3.4953309909747e-06, + "loss": 0.5162, + "step": 4754 + }, + { + "epoch": 2.24822695035461, + "grad_norm": 2.7654693126678467, + "learning_rate": 3.4947587060624834e-06, + "loss": 0.4662, + "step": 4755 + }, + { + "epoch": 2.2486997635933808, + "grad_norm": 2.708247184753418, + "learning_rate": 3.494186359211002e-06, + "loss": 0.4279, + "step": 4756 + }, + { + "epoch": 2.249172576832151, + "grad_norm": 3.09916615486145, + "learning_rate": 3.4936139504558963e-06, + "loss": 0.4085, + "step": 4757 + }, + { + "epoch": 2.249645390070922, + "grad_norm": 2.913806200027466, + "learning_rate": 3.493041479832807e-06, + "loss": 0.4653, + "step": 4758 + }, + { + "epoch": 2.2501182033096927, + "grad_norm": 3.2903928756713867, + "learning_rate": 3.4924689473773787e-06, + "loss": 0.5167, + "step": 4759 + }, + { + "epoch": 2.2505910165484635, + "grad_norm": 3.1302902698516846, + "learning_rate": 3.4918963531252607e-06, + "loss": 0.5398, + "step": 4760 + }, + { + "epoch": 2.251063829787234, + "grad_norm": 2.8858273029327393, + "learning_rate": 3.4913236971121063e-06, + "loss": 0.4395, + "step": 4761 + }, + { + "epoch": 2.2515366430260046, + "grad_norm": 3.194521903991699, + "learning_rate": 3.4907509793735727e-06, + "loss": 0.5258, + "step": 4762 + }, + { + "epoch": 2.2520094562647754, + "grad_norm": 2.8640544414520264, + "learning_rate": 3.49017819994532e-06, + "loss": 0.4073, + "step": 4763 + }, + { + "epoch": 2.2524822695035462, + "grad_norm": 3.139995813369751, + "learning_rate": 3.489605358863011e-06, + "loss": 0.4653, + "step": 4764 + }, + { + "epoch": 2.2529550827423166, + "grad_norm": 2.6228537559509277, + "learning_rate": 3.489032456162317e-06, + "loss": 0.4546, + "step": 4765 + }, + { + "epoch": 2.2534278959810874, + "grad_norm": 2.8197672367095947, + "learning_rate": 3.4884594918789083e-06, + "loss": 0.479, + "step": 4766 + }, + { + "epoch": 2.253900709219858, + "grad_norm": 2.7839298248291016, + "learning_rate": 3.4878864660484612e-06, + "loss": 0.5081, + "step": 4767 + }, + { + "epoch": 2.254373522458629, + "grad_norm": 2.8630709648132324, + "learning_rate": 3.487313378706656e-06, + "loss": 0.4345, + "step": 4768 + }, + { + "epoch": 2.2548463356973993, + "grad_norm": 2.5661563873291016, + "learning_rate": 3.4867402298891755e-06, + "loss": 0.4266, + "step": 4769 + }, + { + "epoch": 2.25531914893617, + "grad_norm": 2.6274025440216064, + "learning_rate": 3.4861670196317084e-06, + "loss": 0.4645, + "step": 4770 + }, + { + "epoch": 2.255791962174941, + "grad_norm": 2.578702449798584, + "learning_rate": 3.485593747969944e-06, + "loss": 0.4242, + "step": 4771 + }, + { + "epoch": 2.2562647754137117, + "grad_norm": 2.322476625442505, + "learning_rate": 3.48502041493958e-06, + "loss": 0.3975, + "step": 4772 + }, + { + "epoch": 2.256737588652482, + "grad_norm": 2.8412630558013916, + 
"learning_rate": 3.484447020576313e-06, + "loss": 0.4276, + "step": 4773 + }, + { + "epoch": 2.257210401891253, + "grad_norm": 2.6090497970581055, + "learning_rate": 3.483873564915847e-06, + "loss": 0.429, + "step": 4774 + }, + { + "epoch": 2.2576832151300237, + "grad_norm": 2.692458152770996, + "learning_rate": 3.4833000479938877e-06, + "loss": 0.4211, + "step": 4775 + }, + { + "epoch": 2.2581560283687945, + "grad_norm": 2.5546815395355225, + "learning_rate": 3.482726469846146e-06, + "loss": 0.4751, + "step": 4776 + }, + { + "epoch": 2.258628841607565, + "grad_norm": 2.8409626483917236, + "learning_rate": 3.4821528305083376e-06, + "loss": 0.4821, + "step": 4777 + }, + { + "epoch": 2.2591016548463356, + "grad_norm": 2.722966432571411, + "learning_rate": 3.4815791300161785e-06, + "loss": 0.5029, + "step": 4778 + }, + { + "epoch": 2.2595744680851064, + "grad_norm": 2.691603899002075, + "learning_rate": 3.48100536840539e-06, + "loss": 0.4242, + "step": 4779 + }, + { + "epoch": 2.260047281323877, + "grad_norm": 2.64035964012146, + "learning_rate": 3.4804315457116992e-06, + "loss": 0.4033, + "step": 4780 + }, + { + "epoch": 2.2605200945626476, + "grad_norm": 2.758819580078125, + "learning_rate": 3.4798576619708357e-06, + "loss": 0.4321, + "step": 4781 + }, + { + "epoch": 2.2609929078014184, + "grad_norm": 2.8204405307769775, + "learning_rate": 3.4792837172185324e-06, + "loss": 0.4309, + "step": 4782 + }, + { + "epoch": 2.261465721040189, + "grad_norm": 2.529771327972412, + "learning_rate": 3.478709711490525e-06, + "loss": 0.4398, + "step": 4783 + }, + { + "epoch": 2.26193853427896, + "grad_norm": 2.8156251907348633, + "learning_rate": 3.4781356448225557e-06, + "loss": 0.447, + "step": 4784 + }, + { + "epoch": 2.2624113475177303, + "grad_norm": 2.689528703689575, + "learning_rate": 3.477561517250369e-06, + "loss": 0.3907, + "step": 4785 + }, + { + "epoch": 2.262884160756501, + "grad_norm": 2.9148027896881104, + "learning_rate": 3.476987328809713e-06, + "loss": 0.4287, + "step": 4786 + }, + { + "epoch": 2.263356973995272, + "grad_norm": 2.933021306991577, + "learning_rate": 3.4764130795363404e-06, + "loss": 0.4847, + "step": 4787 + }, + { + "epoch": 2.2638297872340427, + "grad_norm": 2.8559257984161377, + "learning_rate": 3.4758387694660064e-06, + "loss": 0.4554, + "step": 4788 + }, + { + "epoch": 2.264302600472813, + "grad_norm": 3.0355522632598877, + "learning_rate": 3.4752643986344707e-06, + "loss": 0.4286, + "step": 4789 + }, + { + "epoch": 2.264775413711584, + "grad_norm": 2.9768362045288086, + "learning_rate": 3.474689967077498e-06, + "loss": 0.4917, + "step": 4790 + }, + { + "epoch": 2.2652482269503547, + "grad_norm": 2.827971935272217, + "learning_rate": 3.474115474830855e-06, + "loss": 0.4542, + "step": 4791 + }, + { + "epoch": 2.2657210401891255, + "grad_norm": 2.559659719467163, + "learning_rate": 3.4735409219303123e-06, + "loss": 0.4168, + "step": 4792 + }, + { + "epoch": 2.266193853427896, + "grad_norm": 2.3172824382781982, + "learning_rate": 3.472966308411645e-06, + "loss": 0.3535, + "step": 4793 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 2.6779656410217285, + "learning_rate": 3.4723916343106327e-06, + "loss": 0.4599, + "step": 4794 + }, + { + "epoch": 2.2671394799054374, + "grad_norm": 2.55780291557312, + "learning_rate": 3.4718168996630573e-06, + "loss": 0.4185, + "step": 4795 + }, + { + "epoch": 2.267612293144208, + "grad_norm": 2.4929800033569336, + "learning_rate": 3.471242104504704e-06, + "loss": 0.4008, + "step": 4796 + }, + { + "epoch": 2.2680851063829786, + 
"grad_norm": 2.849475145339966, + "learning_rate": 3.4706672488713642e-06, + "loss": 0.396, + "step": 4797 + }, + { + "epoch": 2.2685579196217494, + "grad_norm": 2.4830739498138428, + "learning_rate": 3.4700923327988306e-06, + "loss": 0.4087, + "step": 4798 + }, + { + "epoch": 2.26903073286052, + "grad_norm": 3.2748119831085205, + "learning_rate": 3.469517356322901e-06, + "loss": 0.4496, + "step": 4799 + }, + { + "epoch": 2.269503546099291, + "grad_norm": 3.0440170764923096, + "learning_rate": 3.468942319479378e-06, + "loss": 0.4903, + "step": 4800 + }, + { + "epoch": 2.2699763593380613, + "grad_norm": 2.8200504779815674, + "learning_rate": 3.4683672223040645e-06, + "loss": 0.4588, + "step": 4801 + }, + { + "epoch": 2.270449172576832, + "grad_norm": 2.675206184387207, + "learning_rate": 3.4677920648327707e-06, + "loss": 0.4257, + "step": 4802 + }, + { + "epoch": 2.270921985815603, + "grad_norm": 2.862675905227661, + "learning_rate": 3.4672168471013084e-06, + "loss": 0.466, + "step": 4803 + }, + { + "epoch": 2.2713947990543737, + "grad_norm": 2.65663743019104, + "learning_rate": 3.4666415691454947e-06, + "loss": 0.4784, + "step": 4804 + }, + { + "epoch": 2.271867612293144, + "grad_norm": 2.5610506534576416, + "learning_rate": 3.4660662310011483e-06, + "loss": 0.4429, + "step": 4805 + }, + { + "epoch": 2.272340425531915, + "grad_norm": 2.6459643840789795, + "learning_rate": 3.465490832704094e-06, + "loss": 0.4345, + "step": 4806 + }, + { + "epoch": 2.2728132387706856, + "grad_norm": 2.426013469696045, + "learning_rate": 3.4649153742901585e-06, + "loss": 0.4533, + "step": 4807 + }, + { + "epoch": 2.2732860520094564, + "grad_norm": 2.6714842319488525, + "learning_rate": 3.4643398557951745e-06, + "loss": 0.4409, + "step": 4808 + }, + { + "epoch": 2.273758865248227, + "grad_norm": 2.703629493713379, + "learning_rate": 3.463764277254976e-06, + "loss": 0.3656, + "step": 4809 + }, + { + "epoch": 2.2742316784869976, + "grad_norm": 2.811753988265991, + "learning_rate": 3.4631886387054025e-06, + "loss": 0.4957, + "step": 4810 + }, + { + "epoch": 2.2747044917257684, + "grad_norm": 2.9469289779663086, + "learning_rate": 3.462612940182295e-06, + "loss": 0.4582, + "step": 4811 + }, + { + "epoch": 2.275177304964539, + "grad_norm": 2.6287801265716553, + "learning_rate": 3.462037181721501e-06, + "loss": 0.4072, + "step": 4812 + }, + { + "epoch": 2.2756501182033095, + "grad_norm": 2.7104952335357666, + "learning_rate": 3.46146136335887e-06, + "loss": 0.4998, + "step": 4813 + }, + { + "epoch": 2.2761229314420803, + "grad_norm": 3.170363187789917, + "learning_rate": 3.460885485130256e-06, + "loss": 0.4722, + "step": 4814 + }, + { + "epoch": 2.276595744680851, + "grad_norm": 2.7315151691436768, + "learning_rate": 3.460309547071516e-06, + "loss": 0.4482, + "step": 4815 + }, + { + "epoch": 2.277068557919622, + "grad_norm": 2.685988187789917, + "learning_rate": 3.4597335492185113e-06, + "loss": 0.4419, + "step": 4816 + }, + { + "epoch": 2.2775413711583923, + "grad_norm": 2.532790184020996, + "learning_rate": 3.459157491607107e-06, + "loss": 0.3961, + "step": 4817 + }, + { + "epoch": 2.278014184397163, + "grad_norm": 2.920729875564575, + "learning_rate": 3.458581374273171e-06, + "loss": 0.4767, + "step": 4818 + }, + { + "epoch": 2.278486997635934, + "grad_norm": 3.2481250762939453, + "learning_rate": 3.458005197252577e-06, + "loss": 0.4985, + "step": 4819 + }, + { + "epoch": 2.2789598108747047, + "grad_norm": 2.373809814453125, + "learning_rate": 3.4574289605811994e-06, + "loss": 0.4259, + "step": 4820 + }, + { + 
"epoch": 2.279432624113475, + "grad_norm": 2.7851033210754395, + "learning_rate": 3.4568526642949184e-06, + "loss": 0.4829, + "step": 4821 + }, + { + "epoch": 2.279905437352246, + "grad_norm": 2.9777133464813232, + "learning_rate": 3.456276308429618e-06, + "loss": 0.4896, + "step": 4822 + }, + { + "epoch": 2.2803782505910166, + "grad_norm": 2.7922022342681885, + "learning_rate": 3.4556998930211853e-06, + "loss": 0.4908, + "step": 4823 + }, + { + "epoch": 2.2808510638297874, + "grad_norm": 2.699180841445923, + "learning_rate": 3.4551234181055104e-06, + "loss": 0.4518, + "step": 4824 + }, + { + "epoch": 2.2813238770685578, + "grad_norm": 3.1200520992279053, + "learning_rate": 3.4545468837184885e-06, + "loss": 0.4877, + "step": 4825 + }, + { + "epoch": 2.2817966903073286, + "grad_norm": 2.56782603263855, + "learning_rate": 3.453970289896018e-06, + "loss": 0.4281, + "step": 4826 + }, + { + "epoch": 2.2822695035460994, + "grad_norm": 3.241356372833252, + "learning_rate": 3.4533936366740007e-06, + "loss": 0.4338, + "step": 4827 + }, + { + "epoch": 2.28274231678487, + "grad_norm": 3.560295343399048, + "learning_rate": 3.452816924088342e-06, + "loss": 0.4121, + "step": 4828 + }, + { + "epoch": 2.2832151300236405, + "grad_norm": 2.8512449264526367, + "learning_rate": 3.452240152174951e-06, + "loss": 0.4357, + "step": 4829 + }, + { + "epoch": 2.2836879432624113, + "grad_norm": 3.0332651138305664, + "learning_rate": 3.4516633209697408e-06, + "loss": 0.4985, + "step": 4830 + }, + { + "epoch": 2.284160756501182, + "grad_norm": 2.520930528640747, + "learning_rate": 3.451086430508629e-06, + "loss": 0.4021, + "step": 4831 + }, + { + "epoch": 2.284633569739953, + "grad_norm": 2.508227825164795, + "learning_rate": 3.4505094808275363e-06, + "loss": 0.3935, + "step": 4832 + }, + { + "epoch": 2.2851063829787233, + "grad_norm": 2.56752610206604, + "learning_rate": 3.449932471962385e-06, + "loss": 0.4689, + "step": 4833 + }, + { + "epoch": 2.285579196217494, + "grad_norm": 2.7757534980773926, + "learning_rate": 3.449355403949105e-06, + "loss": 0.4565, + "step": 4834 + }, + { + "epoch": 2.286052009456265, + "grad_norm": 3.364821195602417, + "learning_rate": 3.448778276823626e-06, + "loss": 0.4729, + "step": 4835 + }, + { + "epoch": 2.2865248226950357, + "grad_norm": 3.0045557022094727, + "learning_rate": 3.448201090621884e-06, + "loss": 0.4834, + "step": 4836 + }, + { + "epoch": 2.286997635933806, + "grad_norm": 2.9451794624328613, + "learning_rate": 3.4476238453798183e-06, + "loss": 0.489, + "step": 4837 + }, + { + "epoch": 2.287470449172577, + "grad_norm": 2.8307435512542725, + "learning_rate": 3.4470465411333708e-06, + "loss": 0.5079, + "step": 4838 + }, + { + "epoch": 2.2879432624113476, + "grad_norm": 2.7118136882781982, + "learning_rate": 3.4464691779184876e-06, + "loss": 0.4794, + "step": 4839 + }, + { + "epoch": 2.2884160756501184, + "grad_norm": 2.6724441051483154, + "learning_rate": 3.445891755771119e-06, + "loss": 0.4619, + "step": 4840 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 2.8161258697509766, + "learning_rate": 3.445314274727218e-06, + "loss": 0.4287, + "step": 4841 + }, + { + "epoch": 2.2893617021276595, + "grad_norm": 2.5681750774383545, + "learning_rate": 3.4447367348227433e-06, + "loss": 0.4167, + "step": 4842 + }, + { + "epoch": 2.2898345153664303, + "grad_norm": 2.8136284351348877, + "learning_rate": 3.444159136093654e-06, + "loss": 0.4195, + "step": 4843 + }, + { + "epoch": 2.290307328605201, + "grad_norm": 3.153651714324951, + "learning_rate": 3.443581478575915e-06, + "loss": 
0.4821, + "step": 4844 + }, + { + "epoch": 2.2907801418439715, + "grad_norm": 2.980883836746216, + "learning_rate": 3.4430037623054953e-06, + "loss": 0.4627, + "step": 4845 + }, + { + "epoch": 2.2912529550827423, + "grad_norm": 2.786182403564453, + "learning_rate": 3.4424259873183664e-06, + "loss": 0.4342, + "step": 4846 + }, + { + "epoch": 2.291725768321513, + "grad_norm": 2.8938279151916504, + "learning_rate": 3.4418481536505026e-06, + "loss": 0.3997, + "step": 4847 + }, + { + "epoch": 2.2921985815602834, + "grad_norm": 2.5534510612487793, + "learning_rate": 3.4412702613378844e-06, + "loss": 0.3982, + "step": 4848 + }, + { + "epoch": 2.2926713947990542, + "grad_norm": 2.7907063961029053, + "learning_rate": 3.4406923104164956e-06, + "loss": 0.4484, + "step": 4849 + }, + { + "epoch": 2.293144208037825, + "grad_norm": 3.162702798843384, + "learning_rate": 3.4401143009223203e-06, + "loss": 0.4528, + "step": 4850 + }, + { + "epoch": 2.293617021276596, + "grad_norm": 2.4647393226623535, + "learning_rate": 3.4395362328913505e-06, + "loss": 0.3759, + "step": 4851 + }, + { + "epoch": 2.2940898345153666, + "grad_norm": 2.8219876289367676, + "learning_rate": 3.438958106359579e-06, + "loss": 0.4903, + "step": 4852 + }, + { + "epoch": 2.294562647754137, + "grad_norm": 2.827073097229004, + "learning_rate": 3.438379921363003e-06, + "loss": 0.4315, + "step": 4853 + }, + { + "epoch": 2.295035460992908, + "grad_norm": 2.472470283508301, + "learning_rate": 3.4378016779376244e-06, + "loss": 0.4478, + "step": 4854 + }, + { + "epoch": 2.2955082742316786, + "grad_norm": 3.3994734287261963, + "learning_rate": 3.4372233761194473e-06, + "loss": 0.5086, + "step": 4855 + }, + { + "epoch": 2.295981087470449, + "grad_norm": 3.030465602874756, + "learning_rate": 3.4366450159444796e-06, + "loss": 0.4159, + "step": 4856 + }, + { + "epoch": 2.2964539007092197, + "grad_norm": 2.5460705757141113, + "learning_rate": 3.4360665974487346e-06, + "loss": 0.4097, + "step": 4857 + }, + { + "epoch": 2.2969267139479905, + "grad_norm": 2.884469509124756, + "learning_rate": 3.4354881206682273e-06, + "loss": 0.4478, + "step": 4858 + }, + { + "epoch": 2.2973995271867613, + "grad_norm": 2.5139710903167725, + "learning_rate": 3.4349095856389765e-06, + "loss": 0.4286, + "step": 4859 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 3.1628260612487793, + "learning_rate": 3.4343309923970053e-06, + "loss": 0.4617, + "step": 4860 + }, + { + "epoch": 2.2983451536643025, + "grad_norm": 2.6141695976257324, + "learning_rate": 3.4337523409783395e-06, + "loss": 0.3841, + "step": 4861 + }, + { + "epoch": 2.2988179669030733, + "grad_norm": 2.766834259033203, + "learning_rate": 3.43317363141901e-06, + "loss": 0.4484, + "step": 4862 + }, + { + "epoch": 2.299290780141844, + "grad_norm": 2.785491943359375, + "learning_rate": 3.4325948637550503e-06, + "loss": 0.4363, + "step": 4863 + }, + { + "epoch": 2.2997635933806144, + "grad_norm": 2.624929189682007, + "learning_rate": 3.4320160380224988e-06, + "loss": 0.4518, + "step": 4864 + }, + { + "epoch": 2.300236406619385, + "grad_norm": 2.895413398742676, + "learning_rate": 3.4314371542573944e-06, + "loss": 0.4745, + "step": 4865 + }, + { + "epoch": 2.300709219858156, + "grad_norm": 2.603816270828247, + "learning_rate": 3.430858212495783e-06, + "loss": 0.4444, + "step": 4866 + }, + { + "epoch": 2.301182033096927, + "grad_norm": 3.387360095977783, + "learning_rate": 3.4302792127737116e-06, + "loss": 0.4169, + "step": 4867 + }, + { + "epoch": 2.3016548463356976, + "grad_norm": 2.894054651260376, + 
"learning_rate": 3.4297001551272334e-06, + "loss": 0.4493, + "step": 4868 + }, + { + "epoch": 2.302127659574468, + "grad_norm": 3.0432028770446777, + "learning_rate": 3.4291210395924035e-06, + "loss": 0.4854, + "step": 4869 + }, + { + "epoch": 2.3026004728132388, + "grad_norm": 2.5144734382629395, + "learning_rate": 3.42854186620528e-06, + "loss": 0.4556, + "step": 4870 + }, + { + "epoch": 2.3030732860520096, + "grad_norm": 2.964812755584717, + "learning_rate": 3.427962635001926e-06, + "loss": 0.495, + "step": 4871 + }, + { + "epoch": 2.30354609929078, + "grad_norm": 2.9991118907928467, + "learning_rate": 3.4273833460184077e-06, + "loss": 0.4787, + "step": 4872 + }, + { + "epoch": 2.3040189125295507, + "grad_norm": 2.9424328804016113, + "learning_rate": 3.4268039992907955e-06, + "loss": 0.5006, + "step": 4873 + }, + { + "epoch": 2.3044917257683215, + "grad_norm": 2.792880058288574, + "learning_rate": 3.426224594855162e-06, + "loss": 0.4399, + "step": 4874 + }, + { + "epoch": 2.3049645390070923, + "grad_norm": 2.5308053493499756, + "learning_rate": 3.4256451327475838e-06, + "loss": 0.4843, + "step": 4875 + }, + { + "epoch": 2.305437352245863, + "grad_norm": 2.7937564849853516, + "learning_rate": 3.425065613004142e-06, + "loss": 0.4428, + "step": 4876 + }, + { + "epoch": 2.3059101654846335, + "grad_norm": 2.4231557846069336, + "learning_rate": 3.424486035660921e-06, + "loss": 0.4054, + "step": 4877 + }, + { + "epoch": 2.3063829787234043, + "grad_norm": 3.0622596740722656, + "learning_rate": 3.423906400754009e-06, + "loss": 0.4623, + "step": 4878 + }, + { + "epoch": 2.306855791962175, + "grad_norm": 2.6532933712005615, + "learning_rate": 3.4233267083194955e-06, + "loss": 0.4387, + "step": 4879 + }, + { + "epoch": 2.3073286052009454, + "grad_norm": 2.793325185775757, + "learning_rate": 3.422746958393477e-06, + "loss": 0.4047, + "step": 4880 + }, + { + "epoch": 2.307801418439716, + "grad_norm": 2.9178314208984375, + "learning_rate": 3.422167151012052e-06, + "loss": 0.4397, + "step": 4881 + }, + { + "epoch": 2.308274231678487, + "grad_norm": 3.463913917541504, + "learning_rate": 3.4215872862113214e-06, + "loss": 0.4347, + "step": 4882 + }, + { + "epoch": 2.308747044917258, + "grad_norm": 3.228403091430664, + "learning_rate": 3.421007364027392e-06, + "loss": 0.4405, + "step": 4883 + }, + { + "epoch": 2.3092198581560286, + "grad_norm": 2.896933078765869, + "learning_rate": 3.420427384496372e-06, + "loss": 0.4429, + "step": 4884 + }, + { + "epoch": 2.309692671394799, + "grad_norm": 2.5559937953948975, + "learning_rate": 3.4198473476543755e-06, + "loss": 0.4281, + "step": 4885 + }, + { + "epoch": 2.3101654846335697, + "grad_norm": 3.457918167114258, + "learning_rate": 3.419267253537517e-06, + "loss": 0.4495, + "step": 4886 + }, + { + "epoch": 2.3106382978723405, + "grad_norm": 2.6554839611053467, + "learning_rate": 3.418687102181918e-06, + "loss": 0.4682, + "step": 4887 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 2.8171639442443848, + "learning_rate": 3.4181068936237024e-06, + "loss": 0.4184, + "step": 4888 + }, + { + "epoch": 2.3115839243498817, + "grad_norm": 2.9272499084472656, + "learning_rate": 3.4175266278989955e-06, + "loss": 0.5445, + "step": 4889 + }, + { + "epoch": 2.3120567375886525, + "grad_norm": 2.5928499698638916, + "learning_rate": 3.4169463050439284e-06, + "loss": 0.3808, + "step": 4890 + }, + { + "epoch": 2.3125295508274233, + "grad_norm": 2.6624577045440674, + "learning_rate": 3.4163659250946356e-06, + "loss": 0.4678, + "step": 4891 + }, + { + "epoch": 2.313002364066194, 
+ "grad_norm": 2.666555643081665, + "learning_rate": 3.4157854880872553e-06, + "loss": 0.457, + "step": 4892 + }, + { + "epoch": 2.3134751773049644, + "grad_norm": 3.2987406253814697, + "learning_rate": 3.4152049940579278e-06, + "loss": 0.551, + "step": 4893 + }, + { + "epoch": 2.3139479905437352, + "grad_norm": 2.728119134902954, + "learning_rate": 3.414624443042799e-06, + "loss": 0.3935, + "step": 4894 + }, + { + "epoch": 2.314420803782506, + "grad_norm": 3.133005380630493, + "learning_rate": 3.4140438350780157e-06, + "loss": 0.4981, + "step": 4895 + }, + { + "epoch": 2.3148936170212764, + "grad_norm": 2.591252565383911, + "learning_rate": 3.4134631701997312e-06, + "loss": 0.4251, + "step": 4896 + }, + { + "epoch": 2.315366430260047, + "grad_norm": 3.007136344909668, + "learning_rate": 3.412882448444101e-06, + "loss": 0.4492, + "step": 4897 + }, + { + "epoch": 2.315839243498818, + "grad_norm": 2.6391026973724365, + "learning_rate": 3.412301669847284e-06, + "loss": 0.5151, + "step": 4898 + }, + { + "epoch": 2.3163120567375888, + "grad_norm": 7.453699111938477, + "learning_rate": 3.411720834445441e-06, + "loss": 0.4983, + "step": 4899 + }, + { + "epoch": 2.3167848699763596, + "grad_norm": 2.667712688446045, + "learning_rate": 3.41113994227474e-06, + "loss": 0.4581, + "step": 4900 + }, + { + "epoch": 2.31725768321513, + "grad_norm": 2.7727627754211426, + "learning_rate": 3.41055899337135e-06, + "loss": 0.4731, + "step": 4901 + }, + { + "epoch": 2.3177304964539007, + "grad_norm": 3.0096890926361084, + "learning_rate": 3.409977987771444e-06, + "loss": 0.4996, + "step": 4902 + }, + { + "epoch": 2.3182033096926715, + "grad_norm": 2.725830078125, + "learning_rate": 3.4093969255111993e-06, + "loss": 0.4544, + "step": 4903 + }, + { + "epoch": 2.318676122931442, + "grad_norm": 2.7596993446350098, + "learning_rate": 3.4088158066267945e-06, + "loss": 0.4846, + "step": 4904 + }, + { + "epoch": 2.3191489361702127, + "grad_norm": 2.702620029449463, + "learning_rate": 3.4082346311544156e-06, + "loss": 0.4849, + "step": 4905 + }, + { + "epoch": 2.3196217494089835, + "grad_norm": 2.725374460220337, + "learning_rate": 3.407653399130249e-06, + "loss": 0.4116, + "step": 4906 + }, + { + "epoch": 2.3200945626477543, + "grad_norm": 2.6770219802856445, + "learning_rate": 3.4070721105904847e-06, + "loss": 0.4606, + "step": 4907 + }, + { + "epoch": 2.320567375886525, + "grad_norm": 2.9249117374420166, + "learning_rate": 3.406490765571317e-06, + "loss": 0.461, + "step": 4908 + }, + { + "epoch": 2.3210401891252954, + "grad_norm": 2.7568278312683105, + "learning_rate": 3.405909364108944e-06, + "loss": 0.4065, + "step": 4909 + }, + { + "epoch": 2.321513002364066, + "grad_norm": 2.7231340408325195, + "learning_rate": 3.4053279062395676e-06, + "loss": 0.4173, + "step": 4910 + }, + { + "epoch": 2.321985815602837, + "grad_norm": 3.1401100158691406, + "learning_rate": 3.404746391999393e-06, + "loss": 0.4287, + "step": 4911 + }, + { + "epoch": 2.3224586288416074, + "grad_norm": 2.714853525161743, + "learning_rate": 3.404164821424627e-06, + "loss": 0.4552, + "step": 4912 + }, + { + "epoch": 2.322931442080378, + "grad_norm": 3.1509978771209717, + "learning_rate": 3.4035831945514825e-06, + "loss": 0.5296, + "step": 4913 + }, + { + "epoch": 2.323404255319149, + "grad_norm": 2.567194938659668, + "learning_rate": 3.403001511416174e-06, + "loss": 0.4306, + "step": 4914 + }, + { + "epoch": 2.3238770685579198, + "grad_norm": 2.7473888397216797, + "learning_rate": 3.402419772054922e-06, + "loss": 0.4009, + "step": 4915 + }, + { + 
"epoch": 2.3243498817966906, + "grad_norm": 2.8617780208587646, + "learning_rate": 3.401837976503947e-06, + "loss": 0.4545, + "step": 4916 + }, + { + "epoch": 2.324822695035461, + "grad_norm": 2.3650572299957275, + "learning_rate": 3.401256124799475e-06, + "loss": 0.4046, + "step": 4917 + }, + { + "epoch": 2.3252955082742317, + "grad_norm": 2.418407678604126, + "learning_rate": 3.4006742169777364e-06, + "loss": 0.4222, + "step": 4918 + }, + { + "epoch": 2.3257683215130025, + "grad_norm": 2.7232494354248047, + "learning_rate": 3.400092253074964e-06, + "loss": 0.4373, + "step": 4919 + }, + { + "epoch": 2.326241134751773, + "grad_norm": 2.702965497970581, + "learning_rate": 3.399510233127394e-06, + "loss": 0.437, + "step": 4920 + }, + { + "epoch": 2.3267139479905437, + "grad_norm": 2.8381760120391846, + "learning_rate": 3.3989281571712664e-06, + "loss": 0.4294, + "step": 4921 + }, + { + "epoch": 2.3271867612293144, + "grad_norm": 2.767131805419922, + "learning_rate": 3.398346025242823e-06, + "loss": 0.4673, + "step": 4922 + }, + { + "epoch": 2.3276595744680852, + "grad_norm": 2.5261805057525635, + "learning_rate": 3.3977638373783123e-06, + "loss": 0.4147, + "step": 4923 + }, + { + "epoch": 2.3281323877068556, + "grad_norm": 2.7176897525787354, + "learning_rate": 3.3971815936139836e-06, + "loss": 0.3885, + "step": 4924 + }, + { + "epoch": 2.3286052009456264, + "grad_norm": 2.849043130874634, + "learning_rate": 3.396599293986092e-06, + "loss": 0.4842, + "step": 4925 + }, + { + "epoch": 2.329078014184397, + "grad_norm": 2.550673484802246, + "learning_rate": 3.3960169385308927e-06, + "loss": 0.4049, + "step": 4926 + }, + { + "epoch": 2.329550827423168, + "grad_norm": 3.0821585655212402, + "learning_rate": 3.3954345272846477e-06, + "loss": 0.53, + "step": 4927 + }, + { + "epoch": 2.3300236406619383, + "grad_norm": 2.68658185005188, + "learning_rate": 3.3948520602836223e-06, + "loss": 0.4592, + "step": 4928 + }, + { + "epoch": 2.330496453900709, + "grad_norm": 2.7391903400421143, + "learning_rate": 3.394269537564082e-06, + "loss": 0.4773, + "step": 4929 + }, + { + "epoch": 2.33096926713948, + "grad_norm": 2.665114164352417, + "learning_rate": 3.393686959162299e-06, + "loss": 0.4671, + "step": 4930 + }, + { + "epoch": 2.3314420803782507, + "grad_norm": 2.6827399730682373, + "learning_rate": 3.3931043251145477e-06, + "loss": 0.4669, + "step": 4931 + }, + { + "epoch": 2.331914893617021, + "grad_norm": 3.1760666370391846, + "learning_rate": 3.392521635457106e-06, + "loss": 0.4729, + "step": 4932 + }, + { + "epoch": 2.332387706855792, + "grad_norm": 2.9686226844787598, + "learning_rate": 3.3919388902262555e-06, + "loss": 0.5017, + "step": 4933 + }, + { + "epoch": 2.3328605200945627, + "grad_norm": 2.471325397491455, + "learning_rate": 3.3913560894582818e-06, + "loss": 0.4195, + "step": 4934 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 2.4062955379486084, + "learning_rate": 3.3907732331894732e-06, + "loss": 0.3666, + "step": 4935 + }, + { + "epoch": 2.333806146572104, + "grad_norm": 2.6800320148468018, + "learning_rate": 3.3901903214561206e-06, + "loss": 0.4774, + "step": 4936 + }, + { + "epoch": 2.3342789598108746, + "grad_norm": 2.923741102218628, + "learning_rate": 3.389607354294521e-06, + "loss": 0.4546, + "step": 4937 + }, + { + "epoch": 2.3347517730496454, + "grad_norm": 3.0034096240997314, + "learning_rate": 3.3890243317409716e-06, + "loss": 0.5373, + "step": 4938 + }, + { + "epoch": 2.3352245862884162, + "grad_norm": 3.0757339000701904, + "learning_rate": 3.388441253831775e-06, + 
"loss": 0.4655, + "step": 4939 + }, + { + "epoch": 2.3356973995271866, + "grad_norm": 2.5352041721343994, + "learning_rate": 3.3878581206032373e-06, + "loss": 0.4391, + "step": 4940 + }, + { + "epoch": 2.3361702127659574, + "grad_norm": 2.9332237243652344, + "learning_rate": 3.3872749320916675e-06, + "loss": 0.4685, + "step": 4941 + }, + { + "epoch": 2.336643026004728, + "grad_norm": 2.4871222972869873, + "learning_rate": 3.386691688333379e-06, + "loss": 0.3952, + "step": 4942 + }, + { + "epoch": 2.337115839243499, + "grad_norm": 2.6384918689727783, + "learning_rate": 3.386108389364687e-06, + "loss": 0.4044, + "step": 4943 + }, + { + "epoch": 2.3375886524822693, + "grad_norm": 2.3545165061950684, + "learning_rate": 3.3855250352219102e-06, + "loss": 0.426, + "step": 4944 + }, + { + "epoch": 2.33806146572104, + "grad_norm": 2.972242593765259, + "learning_rate": 3.3849416259413735e-06, + "loss": 0.5033, + "step": 4945 + }, + { + "epoch": 2.338534278959811, + "grad_norm": 3.117351770401001, + "learning_rate": 3.384358161559401e-06, + "loss": 0.4695, + "step": 4946 + }, + { + "epoch": 2.3390070921985817, + "grad_norm": 2.888916492462158, + "learning_rate": 3.383774642112324e-06, + "loss": 0.437, + "step": 4947 + }, + { + "epoch": 2.339479905437352, + "grad_norm": 3.0677435398101807, + "learning_rate": 3.3831910676364753e-06, + "loss": 0.4293, + "step": 4948 + }, + { + "epoch": 2.339952718676123, + "grad_norm": 2.8571784496307373, + "learning_rate": 3.3826074381681916e-06, + "loss": 0.4574, + "step": 4949 + }, + { + "epoch": 2.3404255319148937, + "grad_norm": 2.907276153564453, + "learning_rate": 3.3820237537438127e-06, + "loss": 0.4731, + "step": 4950 + }, + { + "epoch": 2.3408983451536645, + "grad_norm": 2.923762559890747, + "learning_rate": 3.3814400143996823e-06, + "loss": 0.4648, + "step": 4951 + }, + { + "epoch": 2.341371158392435, + "grad_norm": 2.6206982135772705, + "learning_rate": 3.3808562201721473e-06, + "loss": 0.436, + "step": 4952 + }, + { + "epoch": 2.3418439716312056, + "grad_norm": 6.279088973999023, + "learning_rate": 3.380272371097558e-06, + "loss": 0.4461, + "step": 4953 + }, + { + "epoch": 2.3423167848699764, + "grad_norm": 2.785297155380249, + "learning_rate": 3.3796884672122684e-06, + "loss": 0.4619, + "step": 4954 + }, + { + "epoch": 2.342789598108747, + "grad_norm": 2.6241793632507324, + "learning_rate": 3.379104508552634e-06, + "loss": 0.4323, + "step": 4955 + }, + { + "epoch": 2.3432624113475176, + "grad_norm": 2.6052167415618896, + "learning_rate": 3.378520495155017e-06, + "loss": 0.3943, + "step": 4956 + }, + { + "epoch": 2.3437352245862884, + "grad_norm": 2.8247411251068115, + "learning_rate": 3.3779364270557818e-06, + "loss": 0.4689, + "step": 4957 + }, + { + "epoch": 2.344208037825059, + "grad_norm": 2.5348927974700928, + "learning_rate": 3.377352304291294e-06, + "loss": 0.4619, + "step": 4958 + }, + { + "epoch": 2.34468085106383, + "grad_norm": 2.906648874282837, + "learning_rate": 3.376768126897926e-06, + "loss": 0.5191, + "step": 4959 + }, + { + "epoch": 2.3451536643026003, + "grad_norm": 2.796870470046997, + "learning_rate": 3.3761838949120514e-06, + "loss": 0.4227, + "step": 4960 + }, + { + "epoch": 2.345626477541371, + "grad_norm": 2.789635419845581, + "learning_rate": 3.3755996083700464e-06, + "loss": 0.3927, + "step": 4961 + }, + { + "epoch": 2.346099290780142, + "grad_norm": 2.86641263961792, + "learning_rate": 3.375015267308295e-06, + "loss": 0.4097, + "step": 4962 + }, + { + "epoch": 2.3465721040189127, + "grad_norm": 2.8374414443969727, + 
"learning_rate": 3.374430871763178e-06, + "loss": 0.4566, + "step": 4963 + }, + { + "epoch": 2.347044917257683, + "grad_norm": 2.71951961517334, + "learning_rate": 3.3738464217710854e-06, + "loss": 0.4748, + "step": 4964 + }, + { + "epoch": 2.347517730496454, + "grad_norm": 2.6939785480499268, + "learning_rate": 3.373261917368408e-06, + "loss": 0.4499, + "step": 4965 + }, + { + "epoch": 2.3479905437352246, + "grad_norm": 2.862661600112915, + "learning_rate": 3.37267735859154e-06, + "loss": 0.415, + "step": 4966 + }, + { + "epoch": 2.3484633569739954, + "grad_norm": 2.3657119274139404, + "learning_rate": 3.3720927454768793e-06, + "loss": 0.4112, + "step": 4967 + }, + { + "epoch": 2.348936170212766, + "grad_norm": 3.701571464538574, + "learning_rate": 3.3715080780608277e-06, + "loss": 0.4735, + "step": 4968 + }, + { + "epoch": 2.3494089834515366, + "grad_norm": 2.894350528717041, + "learning_rate": 3.3709233563797895e-06, + "loss": 0.4278, + "step": 4969 + }, + { + "epoch": 2.3498817966903074, + "grad_norm": 3.0072877407073975, + "learning_rate": 3.3703385804701727e-06, + "loss": 0.4718, + "step": 4970 + }, + { + "epoch": 2.350354609929078, + "grad_norm": 2.9920408725738525, + "learning_rate": 3.369753750368389e-06, + "loss": 0.4636, + "step": 4971 + }, + { + "epoch": 2.3508274231678485, + "grad_norm": 2.381770372390747, + "learning_rate": 3.369168866110853e-06, + "loss": 0.3841, + "step": 4972 + }, + { + "epoch": 2.3513002364066193, + "grad_norm": 2.6195342540740967, + "learning_rate": 3.3685839277339825e-06, + "loss": 0.4422, + "step": 4973 + }, + { + "epoch": 2.35177304964539, + "grad_norm": 2.885852575302124, + "learning_rate": 3.3679989352741992e-06, + "loss": 0.4798, + "step": 4974 + }, + { + "epoch": 2.352245862884161, + "grad_norm": 2.820004940032959, + "learning_rate": 3.367413888767929e-06, + "loss": 0.4498, + "step": 4975 + }, + { + "epoch": 2.3527186761229313, + "grad_norm": 2.579680919647217, + "learning_rate": 3.366828788251599e-06, + "loss": 0.4894, + "step": 4976 + }, + { + "epoch": 2.353191489361702, + "grad_norm": 2.7509915828704834, + "learning_rate": 3.366243633761642e-06, + "loss": 0.4354, + "step": 4977 + }, + { + "epoch": 2.353664302600473, + "grad_norm": 3.061767339706421, + "learning_rate": 3.3656584253344917e-06, + "loss": 0.4651, + "step": 4978 + }, + { + "epoch": 2.3541371158392437, + "grad_norm": 2.6109485626220703, + "learning_rate": 3.365073163006587e-06, + "loss": 0.44, + "step": 4979 + }, + { + "epoch": 2.354609929078014, + "grad_norm": 3.4247376918792725, + "learning_rate": 3.36448784681437e-06, + "loss": 0.3993, + "step": 4980 + }, + { + "epoch": 2.355082742316785, + "grad_norm": 2.953695297241211, + "learning_rate": 3.363902476794285e-06, + "loss": 0.4763, + "step": 4981 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 2.836543083190918, + "learning_rate": 3.3633170529827806e-06, + "loss": 0.4755, + "step": 4982 + }, + { + "epoch": 2.3560283687943264, + "grad_norm": 2.944082021713257, + "learning_rate": 3.36273157541631e-06, + "loss": 0.472, + "step": 4983 + }, + { + "epoch": 2.3565011820330968, + "grad_norm": 2.891716957092285, + "learning_rate": 3.3621460441313262e-06, + "loss": 0.5259, + "step": 4984 + }, + { + "epoch": 2.3569739952718676, + "grad_norm": 2.8448829650878906, + "learning_rate": 3.3615604591642896e-06, + "loss": 0.4587, + "step": 4985 + }, + { + "epoch": 2.3574468085106384, + "grad_norm": 3.114393711090088, + "learning_rate": 3.36097482055166e-06, + "loss": 0.4352, + "step": 4986 + }, + { + "epoch": 2.357919621749409, + "grad_norm": 
2.964851140975952, + "learning_rate": 3.360389128329904e-06, + "loss": 0.5015, + "step": 4987 + }, + { + "epoch": 2.3583924349881795, + "grad_norm": 2.4819815158843994, + "learning_rate": 3.3598033825354893e-06, + "loss": 0.3459, + "step": 4988 + }, + { + "epoch": 2.3588652482269503, + "grad_norm": 2.635754346847534, + "learning_rate": 3.359217583204889e-06, + "loss": 0.4367, + "step": 4989 + }, + { + "epoch": 2.359338061465721, + "grad_norm": 2.542482376098633, + "learning_rate": 3.358631730374576e-06, + "loss": 0.3978, + "step": 4990 + }, + { + "epoch": 2.359810874704492, + "grad_norm": 2.614018678665161, + "learning_rate": 3.358045824081031e-06, + "loss": 0.424, + "step": 4991 + }, + { + "epoch": 2.3602836879432623, + "grad_norm": 2.775373697280884, + "learning_rate": 3.3574598643607354e-06, + "loss": 0.4901, + "step": 4992 + }, + { + "epoch": 2.360756501182033, + "grad_norm": 3.091381311416626, + "learning_rate": 3.356873851250173e-06, + "loss": 0.4954, + "step": 4993 + }, + { + "epoch": 2.361229314420804, + "grad_norm": 2.440023422241211, + "learning_rate": 3.3562877847858337e-06, + "loss": 0.4053, + "step": 4994 + }, + { + "epoch": 2.3617021276595747, + "grad_norm": 2.8879518508911133, + "learning_rate": 3.3557016650042084e-06, + "loss": 0.4766, + "step": 4995 + }, + { + "epoch": 2.362174940898345, + "grad_norm": 3.1298391819000244, + "learning_rate": 3.355115491941793e-06, + "loss": 0.4743, + "step": 4996 + }, + { + "epoch": 2.362647754137116, + "grad_norm": 3.3325259685516357, + "learning_rate": 3.3545292656350845e-06, + "loss": 0.4703, + "step": 4997 + }, + { + "epoch": 2.3631205673758866, + "grad_norm": 2.7935359477996826, + "learning_rate": 3.353942986120587e-06, + "loss": 0.432, + "step": 4998 + }, + { + "epoch": 2.3635933806146574, + "grad_norm": 2.623624324798584, + "learning_rate": 3.3533566534348033e-06, + "loss": 0.4302, + "step": 4999 + }, + { + "epoch": 2.3640661938534278, + "grad_norm": 3.1467108726501465, + "learning_rate": 3.3527702676142426e-06, + "loss": 0.4661, + "step": 5000 + }, + { + "epoch": 2.3645390070921986, + "grad_norm": 2.5364840030670166, + "learning_rate": 3.352183828695418e-06, + "loss": 0.4134, + "step": 5001 + }, + { + "epoch": 2.3650118203309693, + "grad_norm": 3.002777338027954, + "learning_rate": 3.3515973367148415e-06, + "loss": 0.3771, + "step": 5002 + }, + { + "epoch": 2.36548463356974, + "grad_norm": 2.660043954849243, + "learning_rate": 3.3510107917090335e-06, + "loss": 0.4254, + "step": 5003 + }, + { + "epoch": 2.3659574468085105, + "grad_norm": 2.7041075229644775, + "learning_rate": 3.3504241937145148e-06, + "loss": 0.4651, + "step": 5004 + }, + { + "epoch": 2.3664302600472813, + "grad_norm": 2.7387280464172363, + "learning_rate": 3.349837542767811e-06, + "loss": 0.3874, + "step": 5005 + }, + { + "epoch": 2.366903073286052, + "grad_norm": 3.012188196182251, + "learning_rate": 3.349250838905449e-06, + "loss": 0.4508, + "step": 5006 + }, + { + "epoch": 2.3673758865248224, + "grad_norm": 2.3108484745025635, + "learning_rate": 3.3486640821639616e-06, + "loss": 0.3783, + "step": 5007 + }, + { + "epoch": 2.3678486997635932, + "grad_norm": 3.2188332080841064, + "learning_rate": 3.3480772725798837e-06, + "loss": 0.4879, + "step": 5008 + }, + { + "epoch": 2.368321513002364, + "grad_norm": 2.566087484359741, + "learning_rate": 3.3474904101897526e-06, + "loss": 0.3847, + "step": 5009 + }, + { + "epoch": 2.368794326241135, + "grad_norm": 2.5581698417663574, + "learning_rate": 3.3469034950301092e-06, + "loss": 0.4201, + "step": 5010 + }, + { + "epoch": 
2.3692671394799056, + "grad_norm": 2.900296926498413, + "learning_rate": 3.3463165271374992e-06, + "loss": 0.4568, + "step": 5011 + }, + { + "epoch": 2.369739952718676, + "grad_norm": 2.8239312171936035, + "learning_rate": 3.34572950654847e-06, + "loss": 0.4583, + "step": 5012 + }, + { + "epoch": 2.370212765957447, + "grad_norm": 3.219465970993042, + "learning_rate": 3.3451424332995723e-06, + "loss": 0.5435, + "step": 5013 + }, + { + "epoch": 2.3706855791962176, + "grad_norm": 3.3111915588378906, + "learning_rate": 3.344555307427362e-06, + "loss": 0.435, + "step": 5014 + }, + { + "epoch": 2.371158392434988, + "grad_norm": 3.296668529510498, + "learning_rate": 3.3439681289683946e-06, + "loss": 0.4738, + "step": 5015 + }, + { + "epoch": 2.3716312056737587, + "grad_norm": 3.005722761154175, + "learning_rate": 3.343380897959234e-06, + "loss": 0.4267, + "step": 5016 + }, + { + "epoch": 2.3721040189125295, + "grad_norm": 2.7844085693359375, + "learning_rate": 3.3427936144364425e-06, + "loss": 0.4558, + "step": 5017 + }, + { + "epoch": 2.3725768321513003, + "grad_norm": 2.7532076835632324, + "learning_rate": 3.3422062784365884e-06, + "loss": 0.4144, + "step": 5018 + }, + { + "epoch": 2.373049645390071, + "grad_norm": 2.835764169692993, + "learning_rate": 3.3416188899962413e-06, + "loss": 0.4945, + "step": 5019 + }, + { + "epoch": 2.3735224586288415, + "grad_norm": 3.1513726711273193, + "learning_rate": 3.3410314491519767e-06, + "loss": 0.4971, + "step": 5020 + }, + { + "epoch": 2.3739952718676123, + "grad_norm": 3.0162220001220703, + "learning_rate": 3.3404439559403723e-06, + "loss": 0.4477, + "step": 5021 + }, + { + "epoch": 2.374468085106383, + "grad_norm": 2.676391363143921, + "learning_rate": 3.3398564103980073e-06, + "loss": 0.432, + "step": 5022 + }, + { + "epoch": 2.3749408983451534, + "grad_norm": 2.7806248664855957, + "learning_rate": 3.3392688125614663e-06, + "loss": 0.4818, + "step": 5023 + }, + { + "epoch": 2.3754137115839242, + "grad_norm": 2.968806505203247, + "learning_rate": 3.3386811624673373e-06, + "loss": 0.4893, + "step": 5024 + }, + { + "epoch": 2.375886524822695, + "grad_norm": 2.992684841156006, + "learning_rate": 3.3380934601522087e-06, + "loss": 0.4423, + "step": 5025 + }, + { + "epoch": 2.376359338061466, + "grad_norm": 2.578420639038086, + "learning_rate": 3.3375057056526762e-06, + "loss": 0.3682, + "step": 5026 + }, + { + "epoch": 2.3768321513002366, + "grad_norm": 2.7683115005493164, + "learning_rate": 3.336917899005335e-06, + "loss": 0.4038, + "step": 5027 + }, + { + "epoch": 2.377304964539007, + "grad_norm": 2.838812828063965, + "learning_rate": 3.336330040246786e-06, + "loss": 0.442, + "step": 5028 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 2.766136646270752, + "learning_rate": 3.335742129413633e-06, + "loss": 0.4745, + "step": 5029 + }, + { + "epoch": 2.3782505910165486, + "grad_norm": 2.862656593322754, + "learning_rate": 3.3351541665424812e-06, + "loss": 0.4324, + "step": 5030 + }, + { + "epoch": 2.378723404255319, + "grad_norm": 2.71425199508667, + "learning_rate": 3.3345661516699433e-06, + "loss": 0.4013, + "step": 5031 + }, + { + "epoch": 2.3791962174940897, + "grad_norm": 2.8404030799865723, + "learning_rate": 3.333978084832629e-06, + "loss": 0.5038, + "step": 5032 + }, + { + "epoch": 2.3796690307328605, + "grad_norm": 2.965851068496704, + "learning_rate": 3.3333899660671574e-06, + "loss": 0.4668, + "step": 5033 + }, + { + "epoch": 2.3801418439716313, + "grad_norm": 2.686452627182007, + "learning_rate": 3.3328017954101464e-06, + "loss": 0.4167, + 
"step": 5034 + }, + { + "epoch": 2.380614657210402, + "grad_norm": 2.8676156997680664, + "learning_rate": 3.3322135728982197e-06, + "loss": 0.4531, + "step": 5035 + }, + { + "epoch": 2.3810874704491725, + "grad_norm": 2.4456300735473633, + "learning_rate": 3.3316252985680026e-06, + "loss": 0.4173, + "step": 5036 + }, + { + "epoch": 2.3815602836879433, + "grad_norm": 2.5472559928894043, + "learning_rate": 3.331036972456124e-06, + "loss": 0.3926, + "step": 5037 + }, + { + "epoch": 2.382033096926714, + "grad_norm": 2.81900954246521, + "learning_rate": 3.330448594599218e-06, + "loss": 0.4785, + "step": 5038 + }, + { + "epoch": 2.3825059101654844, + "grad_norm": 3.0930590629577637, + "learning_rate": 3.329860165033919e-06, + "loss": 0.4587, + "step": 5039 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 3.0553040504455566, + "learning_rate": 3.3292716837968673e-06, + "loss": 0.5285, + "step": 5040 + }, + { + "epoch": 2.383451536643026, + "grad_norm": 2.577580690383911, + "learning_rate": 3.328683150924704e-06, + "loss": 0.4184, + "step": 5041 + }, + { + "epoch": 2.383924349881797, + "grad_norm": 2.6430366039276123, + "learning_rate": 3.3280945664540735e-06, + "loss": 0.4636, + "step": 5042 + }, + { + "epoch": 2.3843971631205676, + "grad_norm": 3.228360891342163, + "learning_rate": 3.3275059304216255e-06, + "loss": 0.455, + "step": 5043 + }, + { + "epoch": 2.384869976359338, + "grad_norm": 2.776142120361328, + "learning_rate": 3.3269172428640125e-06, + "loss": 0.4785, + "step": 5044 + }, + { + "epoch": 2.3853427895981087, + "grad_norm": 2.755671739578247, + "learning_rate": 3.3263285038178882e-06, + "loss": 0.4625, + "step": 5045 + }, + { + "epoch": 2.3858156028368795, + "grad_norm": 3.061004400253296, + "learning_rate": 3.3257397133199114e-06, + "loss": 0.4641, + "step": 5046 + }, + { + "epoch": 2.38628841607565, + "grad_norm": 2.8391458988189697, + "learning_rate": 3.3251508714067432e-06, + "loss": 0.5003, + "step": 5047 + }, + { + "epoch": 2.3867612293144207, + "grad_norm": 2.390810966491699, + "learning_rate": 3.324561978115049e-06, + "loss": 0.4446, + "step": 5048 + }, + { + "epoch": 2.3872340425531915, + "grad_norm": 2.7760825157165527, + "learning_rate": 3.323973033481496e-06, + "loss": 0.4443, + "step": 5049 + }, + { + "epoch": 2.3877068557919623, + "grad_norm": 3.157893419265747, + "learning_rate": 3.3233840375427552e-06, + "loss": 0.4934, + "step": 5050 + }, + { + "epoch": 2.388179669030733, + "grad_norm": 2.7245349884033203, + "learning_rate": 3.3227949903355e-06, + "loss": 0.4254, + "step": 5051 + }, + { + "epoch": 2.3886524822695034, + "grad_norm": 2.6674044132232666, + "learning_rate": 3.322205891896409e-06, + "loss": 0.4116, + "step": 5052 + }, + { + "epoch": 2.3891252955082742, + "grad_norm": 3.1490554809570312, + "learning_rate": 3.3216167422621627e-06, + "loss": 0.4604, + "step": 5053 + }, + { + "epoch": 2.389598108747045, + "grad_norm": 2.725731134414673, + "learning_rate": 3.321027541469444e-06, + "loss": 0.4836, + "step": 5054 + }, + { + "epoch": 2.3900709219858154, + "grad_norm": 2.5378828048706055, + "learning_rate": 3.3204382895549407e-06, + "loss": 0.4228, + "step": 5055 + }, + { + "epoch": 2.390543735224586, + "grad_norm": 2.8191192150115967, + "learning_rate": 3.3198489865553427e-06, + "loss": 0.4371, + "step": 5056 + }, + { + "epoch": 2.391016548463357, + "grad_norm": 2.5676498413085938, + "learning_rate": 3.3192596325073433e-06, + "loss": 0.4463, + "step": 5057 + }, + { + "epoch": 2.391489361702128, + "grad_norm": 3.0846121311187744, + "learning_rate": 
3.3186702274476397e-06, + "loss": 0.5049, + "step": 5058 + }, + { + "epoch": 2.3919621749408986, + "grad_norm": 2.6085152626037598, + "learning_rate": 3.3180807714129293e-06, + "loss": 0.4376, + "step": 5059 + }, + { + "epoch": 2.392434988179669, + "grad_norm": 3.0218591690063477, + "learning_rate": 3.3174912644399172e-06, + "loss": 0.4734, + "step": 5060 + }, + { + "epoch": 2.3929078014184397, + "grad_norm": 2.5904781818389893, + "learning_rate": 3.316901706565308e-06, + "loss": 0.4924, + "step": 5061 + }, + { + "epoch": 2.3933806146572105, + "grad_norm": 2.675478458404541, + "learning_rate": 3.3163120978258123e-06, + "loss": 0.4072, + "step": 5062 + }, + { + "epoch": 2.393853427895981, + "grad_norm": 2.7944445610046387, + "learning_rate": 3.3157224382581415e-06, + "loss": 0.4328, + "step": 5063 + }, + { + "epoch": 2.3943262411347517, + "grad_norm": 2.846224546432495, + "learning_rate": 3.315132727899012e-06, + "loss": 0.4447, + "step": 5064 + }, + { + "epoch": 2.3947990543735225, + "grad_norm": 2.6825828552246094, + "learning_rate": 3.3145429667851402e-06, + "loss": 0.4528, + "step": 5065 + }, + { + "epoch": 2.3952718676122933, + "grad_norm": 3.0305285453796387, + "learning_rate": 3.3139531549532505e-06, + "loss": 0.4538, + "step": 5066 + }, + { + "epoch": 2.395744680851064, + "grad_norm": 2.707540988922119, + "learning_rate": 3.313363292440067e-06, + "loss": 0.4412, + "step": 5067 + }, + { + "epoch": 2.3962174940898344, + "grad_norm": 3.0458385944366455, + "learning_rate": 3.3127733792823173e-06, + "loss": 0.4587, + "step": 5068 + }, + { + "epoch": 2.396690307328605, + "grad_norm": 2.7711992263793945, + "learning_rate": 3.312183415516733e-06, + "loss": 0.4157, + "step": 5069 + }, + { + "epoch": 2.397163120567376, + "grad_norm": 2.6953988075256348, + "learning_rate": 3.3115934011800494e-06, + "loss": 0.3828, + "step": 5070 + }, + { + "epoch": 2.3976359338061464, + "grad_norm": 3.033721923828125, + "learning_rate": 3.311003336309003e-06, + "loss": 0.5204, + "step": 5071 + }, + { + "epoch": 2.398108747044917, + "grad_norm": 2.6134517192840576, + "learning_rate": 3.3104132209403355e-06, + "loss": 0.4181, + "step": 5072 + }, + { + "epoch": 2.398581560283688, + "grad_norm": 2.8800251483917236, + "learning_rate": 3.30982305511079e-06, + "loss": 0.466, + "step": 5073 + }, + { + "epoch": 2.3990543735224588, + "grad_norm": 2.5043210983276367, + "learning_rate": 3.309232838857114e-06, + "loss": 0.4161, + "step": 5074 + }, + { + "epoch": 2.3995271867612296, + "grad_norm": 2.6577322483062744, + "learning_rate": 3.308642572216057e-06, + "loss": 0.465, + "step": 5075 + }, + { + "epoch": 2.4, + "grad_norm": 2.549098253250122, + "learning_rate": 3.3080522552243734e-06, + "loss": 0.4571, + "step": 5076 + }, + { + "epoch": 2.4004728132387707, + "grad_norm": 2.881958246231079, + "learning_rate": 3.3074618879188186e-06, + "loss": 0.4443, + "step": 5077 + }, + { + "epoch": 2.4009456264775415, + "grad_norm": 2.608397960662842, + "learning_rate": 3.3068714703361528e-06, + "loss": 0.3843, + "step": 5078 + }, + { + "epoch": 2.401418439716312, + "grad_norm": 2.8666789531707764, + "learning_rate": 3.306281002513139e-06, + "loss": 0.4857, + "step": 5079 + }, + { + "epoch": 2.4018912529550827, + "grad_norm": 2.9008588790893555, + "learning_rate": 3.3056904844865422e-06, + "loss": 0.4454, + "step": 5080 + }, + { + "epoch": 2.4023640661938535, + "grad_norm": 2.7446060180664062, + "learning_rate": 3.3050999162931315e-06, + "loss": 0.4522, + "step": 5081 + }, + { + "epoch": 2.4028368794326243, + "grad_norm": 
2.787116765975952, + "learning_rate": 3.3045092979696804e-06, + "loss": 0.4714, + "step": 5082 + }, + { + "epoch": 2.403309692671395, + "grad_norm": 2.7494192123413086, + "learning_rate": 3.3039186295529613e-06, + "loss": 0.4107, + "step": 5083 + }, + { + "epoch": 2.4037825059101654, + "grad_norm": 2.733794927597046, + "learning_rate": 3.303327911079755e-06, + "loss": 0.4169, + "step": 5084 + }, + { + "epoch": 2.404255319148936, + "grad_norm": 2.7313334941864014, + "learning_rate": 3.3027371425868422e-06, + "loss": 0.4287, + "step": 5085 + }, + { + "epoch": 2.404728132387707, + "grad_norm": 2.7832977771759033, + "learning_rate": 3.3021463241110075e-06, + "loss": 0.5307, + "step": 5086 + }, + { + "epoch": 2.4052009456264773, + "grad_norm": 2.6615281105041504, + "learning_rate": 3.301555455689038e-06, + "loss": 0.4519, + "step": 5087 + }, + { + "epoch": 2.405673758865248, + "grad_norm": 2.343921422958374, + "learning_rate": 3.3009645373577264e-06, + "loss": 0.46, + "step": 5088 + }, + { + "epoch": 2.406146572104019, + "grad_norm": 2.6115355491638184, + "learning_rate": 3.300373569153864e-06, + "loss": 0.4782, + "step": 5089 + }, + { + "epoch": 2.4066193853427897, + "grad_norm": 2.730625629425049, + "learning_rate": 3.299782551114249e-06, + "loss": 0.4632, + "step": 5090 + }, + { + "epoch": 2.40709219858156, + "grad_norm": 2.4495043754577637, + "learning_rate": 3.2991914832756824e-06, + "loss": 0.4243, + "step": 5091 + }, + { + "epoch": 2.407565011820331, + "grad_norm": 2.8731648921966553, + "learning_rate": 3.2986003656749654e-06, + "loss": 0.4262, + "step": 5092 + }, + { + "epoch": 2.4080378250591017, + "grad_norm": 2.870342969894409, + "learning_rate": 3.2980091983489053e-06, + "loss": 0.4735, + "step": 5093 + }, + { + "epoch": 2.4085106382978725, + "grad_norm": 2.500786542892456, + "learning_rate": 3.297417981334312e-06, + "loss": 0.4007, + "step": 5094 + }, + { + "epoch": 2.408983451536643, + "grad_norm": 2.7787322998046875, + "learning_rate": 3.2968267146679978e-06, + "loss": 0.493, + "step": 5095 + }, + { + "epoch": 2.4094562647754136, + "grad_norm": 2.5229599475860596, + "learning_rate": 3.2962353983867783e-06, + "loss": 0.3676, + "step": 5096 + }, + { + "epoch": 2.4099290780141844, + "grad_norm": 3.1955904960632324, + "learning_rate": 3.2956440325274715e-06, + "loss": 0.4888, + "step": 5097 + }, + { + "epoch": 2.4104018912529552, + "grad_norm": 2.8580288887023926, + "learning_rate": 3.2950526171268995e-06, + "loss": 0.4892, + "step": 5098 + }, + { + "epoch": 2.4108747044917256, + "grad_norm": 2.6321749687194824, + "learning_rate": 3.294461152221887e-06, + "loss": 0.3823, + "step": 5099 + }, + { + "epoch": 2.4113475177304964, + "grad_norm": 2.881127119064331, + "learning_rate": 3.293869637849263e-06, + "loss": 0.4569, + "step": 5100 + }, + { + "epoch": 2.411820330969267, + "grad_norm": 2.7742316722869873, + "learning_rate": 3.293278074045857e-06, + "loss": 0.4445, + "step": 5101 + }, + { + "epoch": 2.412293144208038, + "grad_norm": 2.546701431274414, + "learning_rate": 3.2926864608485037e-06, + "loss": 0.3995, + "step": 5102 + }, + { + "epoch": 2.4127659574468083, + "grad_norm": 2.588226318359375, + "learning_rate": 3.292094798294041e-06, + "loss": 0.4081, + "step": 5103 + }, + { + "epoch": 2.413238770685579, + "grad_norm": 2.968689441680908, + "learning_rate": 3.2915030864193077e-06, + "loss": 0.4475, + "step": 5104 + }, + { + "epoch": 2.41371158392435, + "grad_norm": 2.9249184131622314, + "learning_rate": 3.290911325261148e-06, + "loss": 0.4763, + "step": 5105 + }, + { + "epoch": 
2.4141843971631207, + "grad_norm": 2.817596673965454, + "learning_rate": 3.2903195148564083e-06, + "loss": 0.4451, + "step": 5106 + }, + { + "epoch": 2.414657210401891, + "grad_norm": 2.6465954780578613, + "learning_rate": 3.2897276552419377e-06, + "loss": 0.4665, + "step": 5107 + }, + { + "epoch": 2.415130023640662, + "grad_norm": 2.8613853454589844, + "learning_rate": 3.2891357464545885e-06, + "loss": 0.4398, + "step": 5108 + }, + { + "epoch": 2.4156028368794327, + "grad_norm": 2.756321907043457, + "learning_rate": 3.2885437885312175e-06, + "loss": 0.4634, + "step": 5109 + }, + { + "epoch": 2.4160756501182035, + "grad_norm": 2.8965282440185547, + "learning_rate": 3.287951781508682e-06, + "loss": 0.4319, + "step": 5110 + }, + { + "epoch": 2.416548463356974, + "grad_norm": 2.896756172180176, + "learning_rate": 3.287359725423844e-06, + "loss": 0.4771, + "step": 5111 + }, + { + "epoch": 2.4170212765957446, + "grad_norm": 2.952911376953125, + "learning_rate": 3.286767620313569e-06, + "loss": 0.5026, + "step": 5112 + }, + { + "epoch": 2.4174940898345154, + "grad_norm": 3.850515604019165, + "learning_rate": 3.2861754662147234e-06, + "loss": 0.4387, + "step": 5113 + }, + { + "epoch": 2.417966903073286, + "grad_norm": 3.0072689056396484, + "learning_rate": 3.2855832631641794e-06, + "loss": 0.4586, + "step": 5114 + }, + { + "epoch": 2.4184397163120566, + "grad_norm": 3.166790246963501, + "learning_rate": 3.2849910111988092e-06, + "loss": 0.4842, + "step": 5115 + }, + { + "epoch": 2.4189125295508274, + "grad_norm": 3.5397679805755615, + "learning_rate": 3.284398710355492e-06, + "loss": 0.5138, + "step": 5116 + }, + { + "epoch": 2.419385342789598, + "grad_norm": 2.779609441757202, + "learning_rate": 3.283806360671106e-06, + "loss": 0.4049, + "step": 5117 + }, + { + "epoch": 2.419858156028369, + "grad_norm": 2.5924575328826904, + "learning_rate": 3.283213962182535e-06, + "loss": 0.433, + "step": 5118 + }, + { + "epoch": 2.4203309692671393, + "grad_norm": 2.7429699897766113, + "learning_rate": 3.282621514926665e-06, + "loss": 0.4674, + "step": 5119 + }, + { + "epoch": 2.42080378250591, + "grad_norm": 2.8113889694213867, + "learning_rate": 3.2820290189403846e-06, + "loss": 0.3898, + "step": 5120 + }, + { + "epoch": 2.421276595744681, + "grad_norm": 2.867105722427368, + "learning_rate": 3.2814364742605863e-06, + "loss": 0.4439, + "step": 5121 + }, + { + "epoch": 2.4217494089834517, + "grad_norm": 2.428597927093506, + "learning_rate": 3.2808438809241654e-06, + "loss": 0.4339, + "step": 5122 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 3.071735143661499, + "learning_rate": 3.2802512389680203e-06, + "loss": 0.4583, + "step": 5123 + }, + { + "epoch": 2.422695035460993, + "grad_norm": 3.046313762664795, + "learning_rate": 3.279658548429051e-06, + "loss": 0.5351, + "step": 5124 + }, + { + "epoch": 2.4231678486997636, + "grad_norm": 2.8412697315216064, + "learning_rate": 3.279065809344163e-06, + "loss": 0.5258, + "step": 5125 + }, + { + "epoch": 2.4236406619385344, + "grad_norm": 2.887169122695923, + "learning_rate": 3.278473021750263e-06, + "loss": 0.4568, + "step": 5126 + }, + { + "epoch": 2.424113475177305, + "grad_norm": 2.8316574096679688, + "learning_rate": 3.2778801856842624e-06, + "loss": 0.46, + "step": 5127 + }, + { + "epoch": 2.4245862884160756, + "grad_norm": 2.7660772800445557, + "learning_rate": 3.277287301183073e-06, + "loss": 0.4323, + "step": 5128 + }, + { + "epoch": 2.4250591016548464, + "grad_norm": 2.737682819366455, + "learning_rate": 3.276694368283611e-06, + "loss": 0.4296, + 
"step": 5129 + }, + { + "epoch": 2.425531914893617, + "grad_norm": 2.8807425498962402, + "learning_rate": 3.276101387022797e-06, + "loss": 0.4673, + "step": 5130 + }, + { + "epoch": 2.4260047281323875, + "grad_norm": 2.530526876449585, + "learning_rate": 3.275508357437552e-06, + "loss": 0.416, + "step": 5131 + }, + { + "epoch": 2.4264775413711583, + "grad_norm": 3.1189746856689453, + "learning_rate": 3.274915279564803e-06, + "loss": 0.4171, + "step": 5132 + }, + { + "epoch": 2.426950354609929, + "grad_norm": 2.6612462997436523, + "learning_rate": 3.274322153441477e-06, + "loss": 0.4104, + "step": 5133 + }, + { + "epoch": 2.4274231678487, + "grad_norm": 2.717973470687866, + "learning_rate": 3.2737289791045064e-06, + "loss": 0.479, + "step": 5134 + }, + { + "epoch": 2.4278959810874703, + "grad_norm": 2.764216661453247, + "learning_rate": 3.2731357565908247e-06, + "loss": 0.481, + "step": 5135 + }, + { + "epoch": 2.428368794326241, + "grad_norm": 2.5081393718719482, + "learning_rate": 3.272542485937369e-06, + "loss": 0.4592, + "step": 5136 + }, + { + "epoch": 2.428841607565012, + "grad_norm": 3.1380364894866943, + "learning_rate": 3.271949167181081e-06, + "loss": 0.4179, + "step": 5137 + }, + { + "epoch": 2.4293144208037827, + "grad_norm": 2.9275963306427, + "learning_rate": 3.2713558003589026e-06, + "loss": 0.5196, + "step": 5138 + }, + { + "epoch": 2.429787234042553, + "grad_norm": 2.8215506076812744, + "learning_rate": 3.270762385507781e-06, + "loss": 0.4081, + "step": 5139 + }, + { + "epoch": 2.430260047281324, + "grad_norm": 2.9185614585876465, + "learning_rate": 3.270168922664665e-06, + "loss": 0.4936, + "step": 5140 + }, + { + "epoch": 2.4307328605200946, + "grad_norm": 2.6507248878479004, + "learning_rate": 3.269575411866507e-06, + "loss": 0.4834, + "step": 5141 + }, + { + "epoch": 2.4312056737588654, + "grad_norm": 2.864741563796997, + "learning_rate": 3.2689818531502637e-06, + "loss": 0.4562, + "step": 5142 + }, + { + "epoch": 2.431678486997636, + "grad_norm": 2.806919813156128, + "learning_rate": 3.2683882465528917e-06, + "loss": 0.4645, + "step": 5143 + }, + { + "epoch": 2.4321513002364066, + "grad_norm": 2.733372211456299, + "learning_rate": 3.267794592111353e-06, + "loss": 0.4123, + "step": 5144 + }, + { + "epoch": 2.4326241134751774, + "grad_norm": 2.8005833625793457, + "learning_rate": 3.2672008898626116e-06, + "loss": 0.4343, + "step": 5145 + }, + { + "epoch": 2.433096926713948, + "grad_norm": 3.2339670658111572, + "learning_rate": 3.2666071398436354e-06, + "loss": 0.4017, + "step": 5146 + }, + { + "epoch": 2.4335697399527185, + "grad_norm": 2.510251760482788, + "learning_rate": 3.2660133420913932e-06, + "loss": 0.3882, + "step": 5147 + }, + { + "epoch": 2.4340425531914893, + "grad_norm": 3.5633628368377686, + "learning_rate": 3.26541949664286e-06, + "loss": 0.4766, + "step": 5148 + }, + { + "epoch": 2.43451536643026, + "grad_norm": 2.8246724605560303, + "learning_rate": 3.26482560353501e-06, + "loss": 0.3728, + "step": 5149 + }, + { + "epoch": 2.434988179669031, + "grad_norm": 2.4923641681671143, + "learning_rate": 3.264231662804823e-06, + "loss": 0.4346, + "step": 5150 + }, + { + "epoch": 2.4354609929078013, + "grad_norm": 3.180874824523926, + "learning_rate": 3.2636376744892827e-06, + "loss": 0.4351, + "step": 5151 + }, + { + "epoch": 2.435933806146572, + "grad_norm": 2.6933515071868896, + "learning_rate": 3.263043638625373e-06, + "loss": 0.4293, + "step": 5152 + }, + { + "epoch": 2.436406619385343, + "grad_norm": 2.584132194519043, + "learning_rate": 
3.262449555250081e-06, + "loss": 0.4589, + "step": 5153 + }, + { + "epoch": 2.4368794326241137, + "grad_norm": 2.8103036880493164, + "learning_rate": 3.2618554244003985e-06, + "loss": 0.463, + "step": 5154 + }, + { + "epoch": 2.437352245862884, + "grad_norm": 2.809070587158203, + "learning_rate": 3.2612612461133197e-06, + "loss": 0.4629, + "step": 5155 + }, + { + "epoch": 2.437825059101655, + "grad_norm": 2.98148512840271, + "learning_rate": 3.2606670204258405e-06, + "loss": 0.451, + "step": 5156 + }, + { + "epoch": 2.4382978723404256, + "grad_norm": 2.691047191619873, + "learning_rate": 3.2600727473749614e-06, + "loss": 0.3878, + "step": 5157 + }, + { + "epoch": 2.4387706855791964, + "grad_norm": 2.900360345840454, + "learning_rate": 3.2594784269976856e-06, + "loss": 0.4216, + "step": 5158 + }, + { + "epoch": 2.4392434988179668, + "grad_norm": 2.8449952602386475, + "learning_rate": 3.258884059331019e-06, + "loss": 0.4268, + "step": 5159 + }, + { + "epoch": 2.4397163120567376, + "grad_norm": 2.7226388454437256, + "learning_rate": 3.258289644411969e-06, + "loss": 0.4381, + "step": 5160 + }, + { + "epoch": 2.4401891252955084, + "grad_norm": 2.513946056365967, + "learning_rate": 3.257695182277547e-06, + "loss": 0.4566, + "step": 5161 + }, + { + "epoch": 2.440661938534279, + "grad_norm": 2.9941394329071045, + "learning_rate": 3.2571006729647693e-06, + "loss": 0.4395, + "step": 5162 + }, + { + "epoch": 2.4411347517730495, + "grad_norm": 2.699094533920288, + "learning_rate": 3.2565061165106523e-06, + "loss": 0.4274, + "step": 5163 + }, + { + "epoch": 2.4416075650118203, + "grad_norm": 2.574193000793457, + "learning_rate": 3.255911512952216e-06, + "loss": 0.4187, + "step": 5164 + }, + { + "epoch": 2.442080378250591, + "grad_norm": 2.920766592025757, + "learning_rate": 3.2553168623264854e-06, + "loss": 0.4911, + "step": 5165 + }, + { + "epoch": 2.4425531914893615, + "grad_norm": 2.728421926498413, + "learning_rate": 3.2547221646704853e-06, + "loss": 0.4466, + "step": 5166 + }, + { + "epoch": 2.4430260047281322, + "grad_norm": 2.8171417713165283, + "learning_rate": 3.254127420021246e-06, + "loss": 0.4331, + "step": 5167 + }, + { + "epoch": 2.443498817966903, + "grad_norm": 2.4069135189056396, + "learning_rate": 3.2535326284157975e-06, + "loss": 0.389, + "step": 5168 + }, + { + "epoch": 2.443971631205674, + "grad_norm": 2.912405490875244, + "learning_rate": 3.2529377898911777e-06, + "loss": 0.4681, + "step": 5169 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 2.987558126449585, + "learning_rate": 3.2523429044844228e-06, + "loss": 0.4715, + "step": 5170 + }, + { + "epoch": 2.444917257683215, + "grad_norm": 2.5117199420928955, + "learning_rate": 3.251747972232574e-06, + "loss": 0.4531, + "step": 5171 + }, + { + "epoch": 2.445390070921986, + "grad_norm": 2.5405385494232178, + "learning_rate": 3.2511529931726752e-06, + "loss": 0.4323, + "step": 5172 + }, + { + "epoch": 2.4458628841607566, + "grad_norm": 2.989932060241699, + "learning_rate": 3.250557967341773e-06, + "loss": 0.4039, + "step": 5173 + }, + { + "epoch": 2.446335697399527, + "grad_norm": 2.6331627368927, + "learning_rate": 3.2499628947769186e-06, + "loss": 0.5147, + "step": 5174 + }, + { + "epoch": 2.4468085106382977, + "grad_norm": 2.71699857711792, + "learning_rate": 3.249367775515162e-06, + "loss": 0.3748, + "step": 5175 + }, + { + "epoch": 2.4472813238770685, + "grad_norm": 2.9508471488952637, + "learning_rate": 3.2487726095935606e-06, + "loss": 0.5145, + "step": 5176 + }, + { + "epoch": 2.4477541371158393, + "grad_norm": 
2.8276431560516357, + "learning_rate": 3.2481773970491713e-06, + "loss": 0.4295, + "step": 5177 + }, + { + "epoch": 2.44822695035461, + "grad_norm": 2.5500540733337402, + "learning_rate": 3.2475821379190565e-06, + "loss": 0.4246, + "step": 5178 + }, + { + "epoch": 2.4486997635933805, + "grad_norm": 2.845641613006592, + "learning_rate": 3.246986832240281e-06, + "loss": 0.4211, + "step": 5179 + }, + { + "epoch": 2.4491725768321513, + "grad_norm": 3.1215856075286865, + "learning_rate": 3.2463914800499097e-06, + "loss": 0.4378, + "step": 5180 + }, + { + "epoch": 2.449645390070922, + "grad_norm": 2.4685606956481934, + "learning_rate": 3.2457960813850137e-06, + "loss": 0.4836, + "step": 5181 + }, + { + "epoch": 2.4501182033096924, + "grad_norm": 2.508028268814087, + "learning_rate": 3.245200636282666e-06, + "loss": 0.4377, + "step": 5182 + }, + { + "epoch": 2.4505910165484632, + "grad_norm": 2.899949312210083, + "learning_rate": 3.244605144779943e-06, + "loss": 0.501, + "step": 5183 + }, + { + "epoch": 2.451063829787234, + "grad_norm": 2.6494483947753906, + "learning_rate": 3.244009606913923e-06, + "loss": 0.4255, + "step": 5184 + }, + { + "epoch": 2.451536643026005, + "grad_norm": 2.4363760948181152, + "learning_rate": 3.243414022721686e-06, + "loss": 0.4402, + "step": 5185 + }, + { + "epoch": 2.4520094562647756, + "grad_norm": 2.4725022315979004, + "learning_rate": 3.242818392240317e-06, + "loss": 0.4388, + "step": 5186 + }, + { + "epoch": 2.452482269503546, + "grad_norm": 2.7010514736175537, + "learning_rate": 3.242222715506905e-06, + "loss": 0.4388, + "step": 5187 + }, + { + "epoch": 2.4529550827423168, + "grad_norm": 2.811464548110962, + "learning_rate": 3.241626992558539e-06, + "loss": 0.4634, + "step": 5188 + }, + { + "epoch": 2.4534278959810876, + "grad_norm": 2.6473052501678467, + "learning_rate": 3.2410312234323123e-06, + "loss": 0.4752, + "step": 5189 + }, + { + "epoch": 2.453900709219858, + "grad_norm": 2.5587213039398193, + "learning_rate": 3.24043540816532e-06, + "loss": 0.4458, + "step": 5190 + }, + { + "epoch": 2.4543735224586287, + "grad_norm": 2.6306557655334473, + "learning_rate": 3.239839546794662e-06, + "loss": 0.4081, + "step": 5191 + }, + { + "epoch": 2.4548463356973995, + "grad_norm": 2.4613633155822754, + "learning_rate": 3.23924363935744e-06, + "loss": 0.4165, + "step": 5192 + }, + { + "epoch": 2.4553191489361703, + "grad_norm": 2.7189204692840576, + "learning_rate": 3.238647685890757e-06, + "loss": 0.4822, + "step": 5193 + }, + { + "epoch": 2.455791962174941, + "grad_norm": 3.015977382659912, + "learning_rate": 3.238051686431722e-06, + "loss": 0.4964, + "step": 5194 + }, + { + "epoch": 2.4562647754137115, + "grad_norm": 2.8868937492370605, + "learning_rate": 3.2374556410174445e-06, + "loss": 0.4514, + "step": 5195 + }, + { + "epoch": 2.4567375886524823, + "grad_norm": 2.7959537506103516, + "learning_rate": 3.2368595496850375e-06, + "loss": 0.475, + "step": 5196 + }, + { + "epoch": 2.457210401891253, + "grad_norm": 3.0086777210235596, + "learning_rate": 3.2362634124716187e-06, + "loss": 0.4913, + "step": 5197 + }, + { + "epoch": 2.4576832151300234, + "grad_norm": 2.621335506439209, + "learning_rate": 3.2356672294143044e-06, + "loss": 0.4259, + "step": 5198 + }, + { + "epoch": 2.458156028368794, + "grad_norm": 3.1620380878448486, + "learning_rate": 3.235071000550218e-06, + "loss": 0.451, + "step": 5199 + }, + { + "epoch": 2.458628841607565, + "grad_norm": 2.7663278579711914, + "learning_rate": 3.234474725916484e-06, + "loss": 0.3854, + "step": 5200 + }, + { + "epoch": 
2.459101654846336, + "grad_norm": 2.5187132358551025, + "learning_rate": 3.2338784055502288e-06, + "loss": 0.4068, + "step": 5201 + }, + { + "epoch": 2.4595744680851066, + "grad_norm": 2.6022701263427734, + "learning_rate": 3.233282039488583e-06, + "loss": 0.4484, + "step": 5202 + }, + { + "epoch": 2.460047281323877, + "grad_norm": 2.874750852584839, + "learning_rate": 3.2326856277686807e-06, + "loss": 0.45, + "step": 5203 + }, + { + "epoch": 2.4605200945626478, + "grad_norm": 2.671008586883545, + "learning_rate": 3.232089170427656e-06, + "loss": 0.4446, + "step": 5204 + }, + { + "epoch": 2.4609929078014185, + "grad_norm": 2.7365503311157227, + "learning_rate": 3.2314926675026498e-06, + "loss": 0.4402, + "step": 5205 + }, + { + "epoch": 2.461465721040189, + "grad_norm": 2.8163657188415527, + "learning_rate": 3.230896119030803e-06, + "loss": 0.3881, + "step": 5206 + }, + { + "epoch": 2.4619385342789597, + "grad_norm": 2.812433958053589, + "learning_rate": 3.2302995250492584e-06, + "loss": 0.4897, + "step": 5207 + }, + { + "epoch": 2.4624113475177305, + "grad_norm": 2.786033868789673, + "learning_rate": 3.2297028855951664e-06, + "loss": 0.4069, + "step": 5208 + }, + { + "epoch": 2.4628841607565013, + "grad_norm": 3.0247974395751953, + "learning_rate": 3.229106200705674e-06, + "loss": 0.4048, + "step": 5209 + }, + { + "epoch": 2.463356973995272, + "grad_norm": 3.3280487060546875, + "learning_rate": 3.2285094704179353e-06, + "loss": 0.5613, + "step": 5210 + }, + { + "epoch": 2.4638297872340424, + "grad_norm": 2.603219985961914, + "learning_rate": 3.2279126947691073e-06, + "loss": 0.432, + "step": 5211 + }, + { + "epoch": 2.4643026004728132, + "grad_norm": 3.1532180309295654, + "learning_rate": 3.2273158737963472e-06, + "loss": 0.4602, + "step": 5212 + }, + { + "epoch": 2.464775413711584, + "grad_norm": 2.7512969970703125, + "learning_rate": 3.2267190075368164e-06, + "loss": 0.5064, + "step": 5213 + }, + { + "epoch": 2.4652482269503544, + "grad_norm": 2.926992177963257, + "learning_rate": 3.22612209602768e-06, + "loss": 0.4753, + "step": 5214 + }, + { + "epoch": 2.465721040189125, + "grad_norm": 4.052840709686279, + "learning_rate": 3.2255251393061047e-06, + "loss": 0.5235, + "step": 5215 + }, + { + "epoch": 2.466193853427896, + "grad_norm": 2.8266959190368652, + "learning_rate": 3.2249281374092606e-06, + "loss": 0.3931, + "step": 5216 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 2.564359426498413, + "learning_rate": 3.2243310903743196e-06, + "loss": 0.4146, + "step": 5217 + }, + { + "epoch": 2.4671394799054376, + "grad_norm": 2.387925148010254, + "learning_rate": 3.2237339982384576e-06, + "loss": 0.4142, + "step": 5218 + }, + { + "epoch": 2.467612293144208, + "grad_norm": 2.7045164108276367, + "learning_rate": 3.223136861038853e-06, + "loss": 0.4345, + "step": 5219 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 2.6963284015655518, + "learning_rate": 3.2225396788126872e-06, + "loss": 0.4243, + "step": 5220 + }, + { + "epoch": 2.4685579196217495, + "grad_norm": 2.8247268199920654, + "learning_rate": 3.221942451597144e-06, + "loss": 0.3919, + "step": 5221 + }, + { + "epoch": 2.46903073286052, + "grad_norm": 3.843836784362793, + "learning_rate": 3.2213451794294093e-06, + "loss": 0.4183, + "step": 5222 + }, + { + "epoch": 2.4695035460992907, + "grad_norm": 2.8579909801483154, + "learning_rate": 3.220747862346674e-06, + "loss": 0.4844, + "step": 5223 + }, + { + "epoch": 2.4699763593380615, + "grad_norm": 3.744027853012085, + "learning_rate": 3.2201505003861294e-06, + "loss": 0.4563, 
+ "step": 5224 + }, + { + "epoch": 2.4704491725768323, + "grad_norm": 2.835108995437622, + "learning_rate": 3.219553093584971e-06, + "loss": 0.4394, + "step": 5225 + }, + { + "epoch": 2.470921985815603, + "grad_norm": 2.5681865215301514, + "learning_rate": 3.218955641980397e-06, + "loss": 0.3907, + "step": 5226 + }, + { + "epoch": 2.4713947990543734, + "grad_norm": 2.963172674179077, + "learning_rate": 3.2183581456096067e-06, + "loss": 0.5163, + "step": 5227 + }, + { + "epoch": 2.4718676122931442, + "grad_norm": 2.7840685844421387, + "learning_rate": 3.2177606045098047e-06, + "loss": 0.411, + "step": 5228 + }, + { + "epoch": 2.472340425531915, + "grad_norm": 2.7849979400634766, + "learning_rate": 3.2171630187181977e-06, + "loss": 0.4671, + "step": 5229 + }, + { + "epoch": 2.4728132387706854, + "grad_norm": 2.736406087875366, + "learning_rate": 3.216565388271994e-06, + "loss": 0.5225, + "step": 5230 + }, + { + "epoch": 2.473286052009456, + "grad_norm": 2.978271007537842, + "learning_rate": 3.215967713208406e-06, + "loss": 0.4668, + "step": 5231 + }, + { + "epoch": 2.473758865248227, + "grad_norm": 2.687560796737671, + "learning_rate": 3.2153699935646475e-06, + "loss": 0.4683, + "step": 5232 + }, + { + "epoch": 2.4742316784869978, + "grad_norm": 2.7096521854400635, + "learning_rate": 3.214772229377936e-06, + "loss": 0.4999, + "step": 5233 + }, + { + "epoch": 2.4747044917257686, + "grad_norm": 3.1861157417297363, + "learning_rate": 3.214174420685493e-06, + "loss": 0.4365, + "step": 5234 + }, + { + "epoch": 2.475177304964539, + "grad_norm": 2.623061418533325, + "learning_rate": 3.2135765675245394e-06, + "loss": 0.3717, + "step": 5235 + }, + { + "epoch": 2.4756501182033097, + "grad_norm": 2.680921792984009, + "learning_rate": 3.2129786699323016e-06, + "loss": 0.4688, + "step": 5236 + }, + { + "epoch": 2.4761229314420805, + "grad_norm": 2.80426025390625, + "learning_rate": 3.2123807279460096e-06, + "loss": 0.5043, + "step": 5237 + }, + { + "epoch": 2.476595744680851, + "grad_norm": 2.676156997680664, + "learning_rate": 3.211782741602893e-06, + "loss": 0.4486, + "step": 5238 + }, + { + "epoch": 2.4770685579196217, + "grad_norm": 2.700822591781616, + "learning_rate": 3.2111847109401855e-06, + "loss": 0.4097, + "step": 5239 + }, + { + "epoch": 2.4775413711583925, + "grad_norm": 2.735387086868286, + "learning_rate": 3.2105866359951254e-06, + "loss": 0.4357, + "step": 5240 + }, + { + "epoch": 2.4780141843971633, + "grad_norm": 2.961874485015869, + "learning_rate": 3.2099885168049507e-06, + "loss": 0.4942, + "step": 5241 + }, + { + "epoch": 2.478486997635934, + "grad_norm": 2.546588659286499, + "learning_rate": 3.209390353406904e-06, + "loss": 0.3852, + "step": 5242 + }, + { + "epoch": 2.4789598108747044, + "grad_norm": 2.6269772052764893, + "learning_rate": 3.208792145838231e-06, + "loss": 0.3935, + "step": 5243 + }, + { + "epoch": 2.479432624113475, + "grad_norm": 2.9009883403778076, + "learning_rate": 3.208193894136179e-06, + "loss": 0.4003, + "step": 5244 + }, + { + "epoch": 2.479905437352246, + "grad_norm": 2.772834300994873, + "learning_rate": 3.2075955983379982e-06, + "loss": 0.4742, + "step": 5245 + }, + { + "epoch": 2.4803782505910164, + "grad_norm": 2.728703737258911, + "learning_rate": 3.2069972584809423e-06, + "loss": 0.4405, + "step": 5246 + }, + { + "epoch": 2.480851063829787, + "grad_norm": 2.72868275642395, + "learning_rate": 3.206398874602268e-06, + "loss": 0.4714, + "step": 5247 + }, + { + "epoch": 2.481323877068558, + "grad_norm": 2.6804213523864746, + "learning_rate": 
3.2058004467392323e-06, + "loss": 0.4106, + "step": 5248 + }, + { + "epoch": 2.4817966903073287, + "grad_norm": 2.6740739345550537, + "learning_rate": 3.205201974929098e-06, + "loss": 0.3855, + "step": 5249 + }, + { + "epoch": 2.482269503546099, + "grad_norm": 2.8131754398345947, + "learning_rate": 3.204603459209129e-06, + "loss": 0.418, + "step": 5250 + }, + { + "epoch": 2.48274231678487, + "grad_norm": 2.5242888927459717, + "learning_rate": 3.204004899616592e-06, + "loss": 0.4914, + "step": 5251 + }, + { + "epoch": 2.4832151300236407, + "grad_norm": 2.969191551208496, + "learning_rate": 3.2034062961887567e-06, + "loss": 0.4634, + "step": 5252 + }, + { + "epoch": 2.4836879432624115, + "grad_norm": 2.967968463897705, + "learning_rate": 3.2028076489628963e-06, + "loss": 0.456, + "step": 5253 + }, + { + "epoch": 2.484160756501182, + "grad_norm": 2.9006540775299072, + "learning_rate": 3.2022089579762845e-06, + "loss": 0.4203, + "step": 5254 + }, + { + "epoch": 2.4846335697399526, + "grad_norm": 2.6377336978912354, + "learning_rate": 3.2016102232662003e-06, + "loss": 0.4518, + "step": 5255 + }, + { + "epoch": 2.4851063829787234, + "grad_norm": 2.757749319076538, + "learning_rate": 3.201011444869925e-06, + "loss": 0.4314, + "step": 5256 + }, + { + "epoch": 2.4855791962174942, + "grad_norm": 2.571560859680176, + "learning_rate": 3.20041262282474e-06, + "loss": 0.427, + "step": 5257 + }, + { + "epoch": 2.4860520094562646, + "grad_norm": 3.1367194652557373, + "learning_rate": 3.1998137571679316e-06, + "loss": 0.4901, + "step": 5258 + }, + { + "epoch": 2.4865248226950354, + "grad_norm": 3.194042205810547, + "learning_rate": 3.1992148479367896e-06, + "loss": 0.466, + "step": 5259 + }, + { + "epoch": 2.486997635933806, + "grad_norm": 2.5546324253082275, + "learning_rate": 3.1986158951686052e-06, + "loss": 0.4182, + "step": 5260 + }, + { + "epoch": 2.487470449172577, + "grad_norm": 2.919783115386963, + "learning_rate": 3.198016898900672e-06, + "loss": 0.4234, + "step": 5261 + }, + { + "epoch": 2.4879432624113473, + "grad_norm": 2.865248918533325, + "learning_rate": 3.1974178591702877e-06, + "loss": 0.4291, + "step": 5262 + }, + { + "epoch": 2.488416075650118, + "grad_norm": 2.685737133026123, + "learning_rate": 3.196818776014752e-06, + "loss": 0.4548, + "step": 5263 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 2.826974630355835, + "learning_rate": 3.196219649471365e-06, + "loss": 0.4152, + "step": 5264 + }, + { + "epoch": 2.4893617021276597, + "grad_norm": 2.764975070953369, + "learning_rate": 3.1956204795774336e-06, + "loss": 0.5209, + "step": 5265 + }, + { + "epoch": 2.48983451536643, + "grad_norm": 2.4184255599975586, + "learning_rate": 3.1950212663702662e-06, + "loss": 0.3969, + "step": 5266 + }, + { + "epoch": 2.490307328605201, + "grad_norm": 2.9361133575439453, + "learning_rate": 3.1944220098871713e-06, + "loss": 0.4589, + "step": 5267 + }, + { + "epoch": 2.4907801418439717, + "grad_norm": 2.377051830291748, + "learning_rate": 3.193822710165463e-06, + "loss": 0.4328, + "step": 5268 + }, + { + "epoch": 2.4912529550827425, + "grad_norm": 3.1302497386932373, + "learning_rate": 3.1932233672424563e-06, + "loss": 0.3918, + "step": 5269 + }, + { + "epoch": 2.491725768321513, + "grad_norm": 2.89577579498291, + "learning_rate": 3.192623981155471e-06, + "loss": 0.5004, + "step": 5270 + }, + { + "epoch": 2.4921985815602836, + "grad_norm": 2.7735235691070557, + "learning_rate": 3.1920245519418273e-06, + "loss": 0.4206, + "step": 5271 + }, + { + "epoch": 2.4926713947990544, + "grad_norm": 
2.5424516201019287, + "learning_rate": 3.1914250796388493e-06, + "loss": 0.4419, + "step": 5272 + }, + { + "epoch": 2.493144208037825, + "grad_norm": 3.1216981410980225, + "learning_rate": 3.1908255642838628e-06, + "loss": 0.4552, + "step": 5273 + }, + { + "epoch": 2.4936170212765956, + "grad_norm": 3.044045925140381, + "learning_rate": 3.1902260059141978e-06, + "loss": 0.4967, + "step": 5274 + }, + { + "epoch": 2.4940898345153664, + "grad_norm": 2.5630741119384766, + "learning_rate": 3.189626404567186e-06, + "loss": 0.3908, + "step": 5275 + }, + { + "epoch": 2.494562647754137, + "grad_norm": 2.7177648544311523, + "learning_rate": 3.189026760280162e-06, + "loss": 0.4915, + "step": 5276 + }, + { + "epoch": 2.495035460992908, + "grad_norm": 2.653416395187378, + "learning_rate": 3.1884270730904632e-06, + "loss": 0.4633, + "step": 5277 + }, + { + "epoch": 2.4955082742316783, + "grad_norm": 3.7212321758270264, + "learning_rate": 3.1878273430354284e-06, + "loss": 0.4549, + "step": 5278 + }, + { + "epoch": 2.495981087470449, + "grad_norm": 2.4152729511260986, + "learning_rate": 3.187227570152402e-06, + "loss": 0.4674, + "step": 5279 + }, + { + "epoch": 2.49645390070922, + "grad_norm": 2.5354862213134766, + "learning_rate": 3.1866277544787284e-06, + "loss": 0.4135, + "step": 5280 + }, + { + "epoch": 2.4969267139479907, + "grad_norm": 3.1766583919525146, + "learning_rate": 3.186027896051754e-06, + "loss": 0.5656, + "step": 5281 + }, + { + "epoch": 2.497399527186761, + "grad_norm": 2.5636754035949707, + "learning_rate": 3.1854279949088313e-06, + "loss": 0.4138, + "step": 5282 + }, + { + "epoch": 2.497872340425532, + "grad_norm": 2.7615602016448975, + "learning_rate": 3.1848280510873124e-06, + "loss": 0.4936, + "step": 5283 + }, + { + "epoch": 2.4983451536643027, + "grad_norm": 2.964721918106079, + "learning_rate": 3.1842280646245543e-06, + "loss": 0.4865, + "step": 5284 + }, + { + "epoch": 2.4988179669030735, + "grad_norm": 2.6915178298950195, + "learning_rate": 3.1836280355579152e-06, + "loss": 0.4179, + "step": 5285 + }, + { + "epoch": 2.499290780141844, + "grad_norm": 2.820451259613037, + "learning_rate": 3.183027963924755e-06, + "loss": 0.4785, + "step": 5286 + }, + { + "epoch": 2.4997635933806146, + "grad_norm": 2.841719627380371, + "learning_rate": 3.1824278497624393e-06, + "loss": 0.4535, + "step": 5287 + }, + { + "epoch": 2.5002364066193854, + "grad_norm": 2.459167957305908, + "learning_rate": 3.181827693108333e-06, + "loss": 0.4353, + "step": 5288 + }, + { + "epoch": 2.500709219858156, + "grad_norm": 3.2538363933563232, + "learning_rate": 3.1812274939998066e-06, + "loss": 0.4037, + "step": 5289 + }, + { + "epoch": 2.5011820330969265, + "grad_norm": 2.6980504989624023, + "learning_rate": 3.180627252474231e-06, + "loss": 0.4181, + "step": 5290 + }, + { + "epoch": 2.5016548463356973, + "grad_norm": 2.9400012493133545, + "learning_rate": 3.1800269685689804e-06, + "loss": 0.4642, + "step": 5291 + }, + { + "epoch": 2.502127659574468, + "grad_norm": 2.7832958698272705, + "learning_rate": 3.1794266423214328e-06, + "loss": 0.3936, + "step": 5292 + }, + { + "epoch": 2.5026004728132385, + "grad_norm": 2.4017868041992188, + "learning_rate": 3.178826273768967e-06, + "loss": 0.3984, + "step": 5293 + }, + { + "epoch": 2.5030732860520093, + "grad_norm": 2.398120641708374, + "learning_rate": 3.1782258629489665e-06, + "loss": 0.4219, + "step": 5294 + }, + { + "epoch": 2.50354609929078, + "grad_norm": 2.973947763442993, + "learning_rate": 3.177625409898815e-06, + "loss": 0.4192, + "step": 5295 + }, + { + 
"epoch": 2.504018912529551, + "grad_norm": 3.1169888973236084, + "learning_rate": 3.1770249146559006e-06, + "loss": 0.5098, + "step": 5296 + }, + { + "epoch": 2.5044917257683217, + "grad_norm": 2.816964864730835, + "learning_rate": 3.1764243772576132e-06, + "loss": 0.4228, + "step": 5297 + }, + { + "epoch": 2.504964539007092, + "grad_norm": 2.5624163150787354, + "learning_rate": 3.1758237977413452e-06, + "loss": 0.4389, + "step": 5298 + }, + { + "epoch": 2.505437352245863, + "grad_norm": 2.7477777004241943, + "learning_rate": 3.175223176144494e-06, + "loss": 0.4564, + "step": 5299 + }, + { + "epoch": 2.5059101654846336, + "grad_norm": 3.1478309631347656, + "learning_rate": 3.174622512504456e-06, + "loss": 0.4859, + "step": 5300 + }, + { + "epoch": 2.506382978723404, + "grad_norm": 2.8400418758392334, + "learning_rate": 3.1740218068586315e-06, + "loss": 0.4476, + "step": 5301 + }, + { + "epoch": 2.506855791962175, + "grad_norm": 2.7097036838531494, + "learning_rate": 3.173421059244426e-06, + "loss": 0.4559, + "step": 5302 + }, + { + "epoch": 2.5073286052009456, + "grad_norm": 2.864760637283325, + "learning_rate": 3.172820269699243e-06, + "loss": 0.5124, + "step": 5303 + }, + { + "epoch": 2.5078014184397164, + "grad_norm": 2.877110004425049, + "learning_rate": 3.1722194382604926e-06, + "loss": 0.5083, + "step": 5304 + }, + { + "epoch": 2.508274231678487, + "grad_norm": 3.2369656562805176, + "learning_rate": 3.1716185649655844e-06, + "loss": 0.4894, + "step": 5305 + }, + { + "epoch": 2.5087470449172575, + "grad_norm": 2.7377753257751465, + "learning_rate": 3.171017649851934e-06, + "loss": 0.4324, + "step": 5306 + }, + { + "epoch": 2.5092198581560283, + "grad_norm": 2.883364200592041, + "learning_rate": 3.1704166929569564e-06, + "loss": 0.3731, + "step": 5307 + }, + { + "epoch": 2.509692671394799, + "grad_norm": 2.5724737644195557, + "learning_rate": 3.1698156943180716e-06, + "loss": 0.4768, + "step": 5308 + }, + { + "epoch": 2.5101654846335695, + "grad_norm": 2.7532460689544678, + "learning_rate": 3.1692146539727e-06, + "loss": 0.4385, + "step": 5309 + }, + { + "epoch": 2.5106382978723403, + "grad_norm": 2.786505699157715, + "learning_rate": 3.168613571958267e-06, + "loss": 0.4241, + "step": 5310 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 3.1674118041992188, + "learning_rate": 3.1680124483121975e-06, + "loss": 0.4445, + "step": 5311 + }, + { + "epoch": 2.511583924349882, + "grad_norm": 2.7861545085906982, + "learning_rate": 3.167411283071923e-06, + "loss": 0.4264, + "step": 5312 + }, + { + "epoch": 2.5120567375886527, + "grad_norm": 2.7412493228912354, + "learning_rate": 3.1668100762748745e-06, + "loss": 0.4725, + "step": 5313 + }, + { + "epoch": 2.512529550827423, + "grad_norm": 2.710019588470459, + "learning_rate": 3.1662088279584858e-06, + "loss": 0.5207, + "step": 5314 + }, + { + "epoch": 2.513002364066194, + "grad_norm": 2.694812297821045, + "learning_rate": 3.165607538160194e-06, + "loss": 0.3666, + "step": 5315 + }, + { + "epoch": 2.5134751773049646, + "grad_norm": 2.4390623569488525, + "learning_rate": 3.1650062069174405e-06, + "loss": 0.4025, + "step": 5316 + }, + { + "epoch": 2.513947990543735, + "grad_norm": 3.055738925933838, + "learning_rate": 3.1644048342676663e-06, + "loss": 0.4288, + "step": 5317 + }, + { + "epoch": 2.5144208037825058, + "grad_norm": 3.065824508666992, + "learning_rate": 3.163803420248316e-06, + "loss": 0.4592, + "step": 5318 + }, + { + "epoch": 2.5148936170212766, + "grad_norm": 2.6011085510253906, + "learning_rate": 3.163201964896838e-06, + 
"loss": 0.4081, + "step": 5319 + }, + { + "epoch": 2.5153664302600474, + "grad_norm": 2.4833033084869385, + "learning_rate": 3.162600468250681e-06, + "loss": 0.4343, + "step": 5320 + }, + { + "epoch": 2.515839243498818, + "grad_norm": 2.9035534858703613, + "learning_rate": 3.161998930347299e-06, + "loss": 0.4972, + "step": 5321 + }, + { + "epoch": 2.5163120567375885, + "grad_norm": 2.788752317428589, + "learning_rate": 3.161397351224146e-06, + "loss": 0.4597, + "step": 5322 + }, + { + "epoch": 2.5167848699763593, + "grad_norm": 2.4344491958618164, + "learning_rate": 3.16079573091868e-06, + "loss": 0.359, + "step": 5323 + }, + { + "epoch": 2.51725768321513, + "grad_norm": 2.750150680541992, + "learning_rate": 3.160194069468361e-06, + "loss": 0.4596, + "step": 5324 + }, + { + "epoch": 2.5177304964539005, + "grad_norm": 2.826902389526367, + "learning_rate": 3.1595923669106526e-06, + "loss": 0.4377, + "step": 5325 + }, + { + "epoch": 2.5182033096926713, + "grad_norm": 2.554439067840576, + "learning_rate": 3.15899062328302e-06, + "loss": 0.4517, + "step": 5326 + }, + { + "epoch": 2.518676122931442, + "grad_norm": 3.0882742404937744, + "learning_rate": 3.158388838622931e-06, + "loss": 0.47, + "step": 5327 + }, + { + "epoch": 2.519148936170213, + "grad_norm": 2.918947696685791, + "learning_rate": 3.157787012967856e-06, + "loss": 0.522, + "step": 5328 + }, + { + "epoch": 2.5196217494089836, + "grad_norm": 2.8057637214660645, + "learning_rate": 3.1571851463552674e-06, + "loss": 0.4837, + "step": 5329 + }, + { + "epoch": 2.520094562647754, + "grad_norm": 2.66241455078125, + "learning_rate": 3.156583238822641e-06, + "loss": 0.3988, + "step": 5330 + }, + { + "epoch": 2.520567375886525, + "grad_norm": 2.9793803691864014, + "learning_rate": 3.155981290407456e-06, + "loss": 0.4737, + "step": 5331 + }, + { + "epoch": 2.5210401891252956, + "grad_norm": 2.847522258758545, + "learning_rate": 3.1553793011471924e-06, + "loss": 0.4394, + "step": 5332 + }, + { + "epoch": 2.521513002364066, + "grad_norm": 2.9561474323272705, + "learning_rate": 3.154777271079333e-06, + "loss": 0.47, + "step": 5333 + }, + { + "epoch": 2.5219858156028367, + "grad_norm": 2.8353018760681152, + "learning_rate": 3.154175200241365e-06, + "loss": 0.4015, + "step": 5334 + }, + { + "epoch": 2.5224586288416075, + "grad_norm": 2.609049081802368, + "learning_rate": 3.153573088670775e-06, + "loss": 0.4723, + "step": 5335 + }, + { + "epoch": 2.5229314420803783, + "grad_norm": 2.8538455963134766, + "learning_rate": 3.1529709364050556e-06, + "loss": 0.4665, + "step": 5336 + }, + { + "epoch": 2.523404255319149, + "grad_norm": 2.768310785293579, + "learning_rate": 3.1523687434816978e-06, + "loss": 0.4933, + "step": 5337 + }, + { + "epoch": 2.5238770685579195, + "grad_norm": 2.9300906658172607, + "learning_rate": 3.1517665099382e-06, + "loss": 0.4651, + "step": 5338 + }, + { + "epoch": 2.5243498817966903, + "grad_norm": 2.6984703540802, + "learning_rate": 3.1511642358120585e-06, + "loss": 0.4442, + "step": 5339 + }, + { + "epoch": 2.524822695035461, + "grad_norm": 2.8148467540740967, + "learning_rate": 3.1505619211407762e-06, + "loss": 0.4611, + "step": 5340 + }, + { + "epoch": 2.5252955082742314, + "grad_norm": 2.816436290740967, + "learning_rate": 3.1499595659618556e-06, + "loss": 0.5291, + "step": 5341 + }, + { + "epoch": 2.5257683215130022, + "grad_norm": 2.902805805206299, + "learning_rate": 3.149357170312802e-06, + "loss": 0.4394, + "step": 5342 + }, + { + "epoch": 2.526241134751773, + "grad_norm": 2.6443474292755127, + "learning_rate": 
3.148754734231126e-06, + "loss": 0.4444, + "step": 5343 + }, + { + "epoch": 2.526713947990544, + "grad_norm": 2.6818583011627197, + "learning_rate": 3.148152257754336e-06, + "loss": 0.4256, + "step": 5344 + }, + { + "epoch": 2.5271867612293146, + "grad_norm": 2.5266945362091064, + "learning_rate": 3.1475497409199485e-06, + "loss": 0.4087, + "step": 5345 + }, + { + "epoch": 2.527659574468085, + "grad_norm": 2.6326711177825928, + "learning_rate": 3.146947183765477e-06, + "loss": 0.3842, + "step": 5346 + }, + { + "epoch": 2.5281323877068558, + "grad_norm": 3.122880697250366, + "learning_rate": 3.1463445863284413e-06, + "loss": 0.482, + "step": 5347 + }, + { + "epoch": 2.5286052009456266, + "grad_norm": 2.819258213043213, + "learning_rate": 3.145741948646362e-06, + "loss": 0.4628, + "step": 5348 + }, + { + "epoch": 2.529078014184397, + "grad_norm": 2.5842230319976807, + "learning_rate": 3.145139270756764e-06, + "loss": 0.4479, + "step": 5349 + }, + { + "epoch": 2.5295508274231677, + "grad_norm": 2.7257237434387207, + "learning_rate": 3.144536552697172e-06, + "loss": 0.473, + "step": 5350 + }, + { + "epoch": 2.5300236406619385, + "grad_norm": 2.6876981258392334, + "learning_rate": 3.143933794505115e-06, + "loss": 0.4615, + "step": 5351 + }, + { + "epoch": 2.5304964539007093, + "grad_norm": 2.7942895889282227, + "learning_rate": 3.143330996218124e-06, + "loss": 0.4982, + "step": 5352 + }, + { + "epoch": 2.53096926713948, + "grad_norm": 2.3150579929351807, + "learning_rate": 3.1427281578737327e-06, + "loss": 0.3905, + "step": 5353 + }, + { + "epoch": 2.5314420803782505, + "grad_norm": 2.7326138019561768, + "learning_rate": 3.142125279509478e-06, + "loss": 0.4076, + "step": 5354 + }, + { + "epoch": 2.5319148936170213, + "grad_norm": 2.46362566947937, + "learning_rate": 3.1415223611628976e-06, + "loss": 0.4043, + "step": 5355 + }, + { + "epoch": 2.532387706855792, + "grad_norm": 2.6670427322387695, + "learning_rate": 3.1409194028715323e-06, + "loss": 0.484, + "step": 5356 + }, + { + "epoch": 2.5328605200945624, + "grad_norm": 2.917771100997925, + "learning_rate": 3.140316404672926e-06, + "loss": 0.4539, + "step": 5357 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 2.7964110374450684, + "learning_rate": 3.1397133666046254e-06, + "loss": 0.4706, + "step": 5358 + }, + { + "epoch": 2.533806146572104, + "grad_norm": 2.6481330394744873, + "learning_rate": 3.139110288704179e-06, + "loss": 0.4101, + "step": 5359 + }, + { + "epoch": 2.534278959810875, + "grad_norm": 2.859452962875366, + "learning_rate": 3.1385071710091365e-06, + "loss": 0.4842, + "step": 5360 + }, + { + "epoch": 2.5347517730496456, + "grad_norm": 2.686077356338501, + "learning_rate": 3.137904013557052e-06, + "loss": 0.4073, + "step": 5361 + }, + { + "epoch": 2.535224586288416, + "grad_norm": 3.7147045135498047, + "learning_rate": 3.137300816385482e-06, + "loss": 0.4536, + "step": 5362 + }, + { + "epoch": 2.5356973995271868, + "grad_norm": 2.51054048538208, + "learning_rate": 3.1366975795319856e-06, + "loss": 0.4171, + "step": 5363 + }, + { + "epoch": 2.5361702127659576, + "grad_norm": 3.043149471282959, + "learning_rate": 3.136094303034121e-06, + "loss": 0.5179, + "step": 5364 + }, + { + "epoch": 2.536643026004728, + "grad_norm": 2.398878812789917, + "learning_rate": 3.1354909869294548e-06, + "loss": 0.4144, + "step": 5365 + }, + { + "epoch": 2.5371158392434987, + "grad_norm": 2.969712257385254, + "learning_rate": 3.134887631255551e-06, + "loss": 0.3983, + "step": 5366 + }, + { + "epoch": 2.5375886524822695, + "grad_norm": 
2.7707982063293457, + "learning_rate": 3.134284236049978e-06, + "loss": 0.4405, + "step": 5367 + }, + { + "epoch": 2.5380614657210403, + "grad_norm": 2.579742193222046, + "learning_rate": 3.1336808013503073e-06, + "loss": 0.4402, + "step": 5368 + }, + { + "epoch": 2.538534278959811, + "grad_norm": 2.6041927337646484, + "learning_rate": 3.1330773271941113e-06, + "loss": 0.396, + "step": 5369 + }, + { + "epoch": 2.5390070921985815, + "grad_norm": 2.7383856773376465, + "learning_rate": 3.1324738136189658e-06, + "loss": 0.4424, + "step": 5370 + }, + { + "epoch": 2.5394799054373522, + "grad_norm": 3.053644895553589, + "learning_rate": 3.13187026066245e-06, + "loss": 0.473, + "step": 5371 + }, + { + "epoch": 2.539952718676123, + "grad_norm": 2.684244155883789, + "learning_rate": 3.1312666683621428e-06, + "loss": 0.3963, + "step": 5372 + }, + { + "epoch": 2.5404255319148934, + "grad_norm": 2.6505017280578613, + "learning_rate": 3.130663036755629e-06, + "loss": 0.4292, + "step": 5373 + }, + { + "epoch": 2.540898345153664, + "grad_norm": 3.025965929031372, + "learning_rate": 3.1300593658804935e-06, + "loss": 0.4539, + "step": 5374 + }, + { + "epoch": 2.541371158392435, + "grad_norm": 2.72106671333313, + "learning_rate": 3.1294556557743237e-06, + "loss": 0.4519, + "step": 5375 + }, + { + "epoch": 2.541843971631206, + "grad_norm": 2.759995222091675, + "learning_rate": 3.12885190647471e-06, + "loss": 0.451, + "step": 5376 + }, + { + "epoch": 2.5423167848699766, + "grad_norm": 2.697950601577759, + "learning_rate": 3.1282481180192457e-06, + "loss": 0.4328, + "step": 5377 + }, + { + "epoch": 2.542789598108747, + "grad_norm": 2.6970415115356445, + "learning_rate": 3.127644290445526e-06, + "loss": 0.4489, + "step": 5378 + }, + { + "epoch": 2.5432624113475177, + "grad_norm": 2.5856997966766357, + "learning_rate": 3.127040423791148e-06, + "loss": 0.3848, + "step": 5379 + }, + { + "epoch": 2.5437352245862885, + "grad_norm": 2.9798166751861572, + "learning_rate": 3.1264365180937127e-06, + "loss": 0.5038, + "step": 5380 + }, + { + "epoch": 2.544208037825059, + "grad_norm": 3.413175106048584, + "learning_rate": 3.1258325733908224e-06, + "loss": 0.5247, + "step": 5381 + }, + { + "epoch": 2.5446808510638297, + "grad_norm": 2.838517904281616, + "learning_rate": 3.1252285897200818e-06, + "loss": 0.4652, + "step": 5382 + }, + { + "epoch": 2.5451536643026005, + "grad_norm": 2.8342528343200684, + "learning_rate": 3.1246245671190983e-06, + "loss": 0.4245, + "step": 5383 + }, + { + "epoch": 2.5456264775413713, + "grad_norm": 3.06026029586792, + "learning_rate": 3.124020505625482e-06, + "loss": 0.469, + "step": 5384 + }, + { + "epoch": 2.546099290780142, + "grad_norm": 2.633894681930542, + "learning_rate": 3.1234164052768452e-06, + "loss": 0.4509, + "step": 5385 + }, + { + "epoch": 2.5465721040189124, + "grad_norm": 2.634819984436035, + "learning_rate": 3.1228122661108023e-06, + "loss": 0.4879, + "step": 5386 + }, + { + "epoch": 2.5470449172576832, + "grad_norm": 3.9843504428863525, + "learning_rate": 3.1222080881649707e-06, + "loss": 0.4472, + "step": 5387 + }, + { + "epoch": 2.547517730496454, + "grad_norm": 2.5480258464813232, + "learning_rate": 3.1216038714769694e-06, + "loss": 0.4396, + "step": 5388 + }, + { + "epoch": 2.5479905437352244, + "grad_norm": 2.7461917400360107, + "learning_rate": 3.12099961608442e-06, + "loss": 0.4735, + "step": 5389 + }, + { + "epoch": 2.548463356973995, + "grad_norm": 3.167769193649292, + "learning_rate": 3.1203953220249493e-06, + "loss": 0.4196, + "step": 5390 + }, + { + "epoch": 
2.548936170212766, + "grad_norm": 2.721696615219116, + "learning_rate": 3.1197909893361814e-06, + "loss": 0.4571, + "step": 5391 + }, + { + "epoch": 2.5494089834515368, + "grad_norm": 2.726668119430542, + "learning_rate": 3.1191866180557463e-06, + "loss": 0.4856, + "step": 5392 + }, + { + "epoch": 2.5498817966903076, + "grad_norm": 2.602205276489258, + "learning_rate": 3.1185822082212754e-06, + "loss": 0.4631, + "step": 5393 + }, + { + "epoch": 2.550354609929078, + "grad_norm": 2.7715859413146973, + "learning_rate": 3.1179777598704025e-06, + "loss": 0.4136, + "step": 5394 + }, + { + "epoch": 2.5508274231678487, + "grad_norm": 2.8081955909729004, + "learning_rate": 3.1173732730407647e-06, + "loss": 0.4963, + "step": 5395 + }, + { + "epoch": 2.5513002364066195, + "grad_norm": 2.946772336959839, + "learning_rate": 3.1167687477700006e-06, + "loss": 0.4443, + "step": 5396 + }, + { + "epoch": 2.55177304964539, + "grad_norm": 2.89345383644104, + "learning_rate": 3.1161641840957503e-06, + "loss": 0.4377, + "step": 5397 + }, + { + "epoch": 2.5522458628841607, + "grad_norm": 2.908317804336548, + "learning_rate": 3.115559582055659e-06, + "loss": 0.4702, + "step": 5398 + }, + { + "epoch": 2.5527186761229315, + "grad_norm": 2.554417848587036, + "learning_rate": 3.1149549416873704e-06, + "loss": 0.3738, + "step": 5399 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 2.3132457733154297, + "learning_rate": 3.1143502630285356e-06, + "loss": 0.4074, + "step": 5400 + }, + { + "epoch": 2.553664302600473, + "grad_norm": 2.751666784286499, + "learning_rate": 3.1137455461168026e-06, + "loss": 0.4697, + "step": 5401 + }, + { + "epoch": 2.5541371158392434, + "grad_norm": 2.7088871002197266, + "learning_rate": 3.113140790989826e-06, + "loss": 0.4754, + "step": 5402 + }, + { + "epoch": 2.554609929078014, + "grad_norm": 3.0633046627044678, + "learning_rate": 3.1125359976852605e-06, + "loss": 0.4874, + "step": 5403 + }, + { + "epoch": 2.555082742316785, + "grad_norm": 3.399456024169922, + "learning_rate": 3.111931166240764e-06, + "loss": 0.5529, + "step": 5404 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 2.7729690074920654, + "learning_rate": 3.1113262966939985e-06, + "loss": 0.4677, + "step": 5405 + }, + { + "epoch": 2.556028368794326, + "grad_norm": 2.81025767326355, + "learning_rate": 3.1107213890826244e-06, + "loss": 0.4954, + "step": 5406 + }, + { + "epoch": 2.556501182033097, + "grad_norm": 2.4837241172790527, + "learning_rate": 3.110116443444307e-06, + "loss": 0.3681, + "step": 5407 + }, + { + "epoch": 2.5569739952718678, + "grad_norm": 2.6406874656677246, + "learning_rate": 3.109511459816714e-06, + "loss": 0.4569, + "step": 5408 + }, + { + "epoch": 2.5574468085106385, + "grad_norm": 2.6093738079071045, + "learning_rate": 3.1089064382375155e-06, + "loss": 0.413, + "step": 5409 + }, + { + "epoch": 2.557919621749409, + "grad_norm": 2.6629011631011963, + "learning_rate": 3.108301378744383e-06, + "loss": 0.4286, + "step": 5410 + }, + { + "epoch": 2.5583924349881797, + "grad_norm": 2.694796323776245, + "learning_rate": 3.10769628137499e-06, + "loss": 0.4316, + "step": 5411 + }, + { + "epoch": 2.5588652482269505, + "grad_norm": 2.88023042678833, + "learning_rate": 3.107091146167015e-06, + "loss": 0.4378, + "step": 5412 + }, + { + "epoch": 2.559338061465721, + "grad_norm": 2.8804919719696045, + "learning_rate": 3.1064859731581365e-06, + "loss": 0.4971, + "step": 5413 + }, + { + "epoch": 2.5598108747044916, + "grad_norm": 2.850468397140503, + "learning_rate": 3.1058807623860353e-06, + "loss": 0.4686, + 
"step": 5414 + }, + { + "epoch": 2.5602836879432624, + "grad_norm": 3.0548019409179688, + "learning_rate": 3.1052755138883963e-06, + "loss": 0.4497, + "step": 5415 + }, + { + "epoch": 2.5607565011820332, + "grad_norm": 3.10168719291687, + "learning_rate": 3.1046702277029046e-06, + "loss": 0.569, + "step": 5416 + }, + { + "epoch": 2.561229314420804, + "grad_norm": 2.5887374877929688, + "learning_rate": 3.1040649038672494e-06, + "loss": 0.3812, + "step": 5417 + }, + { + "epoch": 2.5617021276595744, + "grad_norm": 2.9928438663482666, + "learning_rate": 3.1034595424191212e-06, + "loss": 0.4308, + "step": 5418 + }, + { + "epoch": 2.562174940898345, + "grad_norm": 2.7003073692321777, + "learning_rate": 3.102854143396214e-06, + "loss": 0.4967, + "step": 5419 + }, + { + "epoch": 2.562647754137116, + "grad_norm": 3.172868490219116, + "learning_rate": 3.102248706836222e-06, + "loss": 0.5311, + "step": 5420 + }, + { + "epoch": 2.5631205673758863, + "grad_norm": 3.0146191120147705, + "learning_rate": 3.101643232776844e-06, + "loss": 0.4714, + "step": 5421 + }, + { + "epoch": 2.563593380614657, + "grad_norm": 3.0683791637420654, + "learning_rate": 3.1010377212557806e-06, + "loss": 0.4047, + "step": 5422 + }, + { + "epoch": 2.564066193853428, + "grad_norm": 2.8260676860809326, + "learning_rate": 3.1004321723107334e-06, + "loss": 0.5282, + "step": 5423 + }, + { + "epoch": 2.5645390070921987, + "grad_norm": 3.0792388916015625, + "learning_rate": 3.0998265859794074e-06, + "loss": 0.5323, + "step": 5424 + }, + { + "epoch": 2.5650118203309695, + "grad_norm": 2.7332866191864014, + "learning_rate": 3.09922096229951e-06, + "loss": 0.4401, + "step": 5425 + }, + { + "epoch": 2.56548463356974, + "grad_norm": 2.9366047382354736, + "learning_rate": 3.098615301308751e-06, + "loss": 0.4495, + "step": 5426 + }, + { + "epoch": 2.5659574468085107, + "grad_norm": 2.982088565826416, + "learning_rate": 3.098009603044842e-06, + "loss": 0.495, + "step": 5427 + }, + { + "epoch": 2.5664302600472815, + "grad_norm": 3.1204755306243896, + "learning_rate": 3.0974038675454976e-06, + "loss": 0.4354, + "step": 5428 + }, + { + "epoch": 2.566903073286052, + "grad_norm": 2.835238218307495, + "learning_rate": 3.0967980948484333e-06, + "loss": 0.4161, + "step": 5429 + }, + { + "epoch": 2.5673758865248226, + "grad_norm": 2.8104958534240723, + "learning_rate": 3.096192284991369e-06, + "loss": 0.5045, + "step": 5430 + }, + { + "epoch": 2.5678486997635934, + "grad_norm": 3.1636080741882324, + "learning_rate": 3.0955864380120247e-06, + "loss": 0.4533, + "step": 5431 + }, + { + "epoch": 2.568321513002364, + "grad_norm": 2.980112314224243, + "learning_rate": 3.0949805539481247e-06, + "loss": 0.3998, + "step": 5432 + }, + { + "epoch": 2.568794326241135, + "grad_norm": 2.6379945278167725, + "learning_rate": 3.0943746328373953e-06, + "loss": 0.3785, + "step": 5433 + }, + { + "epoch": 2.5692671394799054, + "grad_norm": 2.780930757522583, + "learning_rate": 3.0937686747175627e-06, + "loss": 0.4801, + "step": 5434 + }, + { + "epoch": 2.569739952718676, + "grad_norm": 2.6608550548553467, + "learning_rate": 3.0931626796263585e-06, + "loss": 0.4047, + "step": 5435 + }, + { + "epoch": 2.570212765957447, + "grad_norm": 3.130584716796875, + "learning_rate": 3.0925566476015156e-06, + "loss": 0.5049, + "step": 5436 + }, + { + "epoch": 2.5706855791962173, + "grad_norm": 2.9699313640594482, + "learning_rate": 3.0919505786807687e-06, + "loss": 0.3847, + "step": 5437 + }, + { + "epoch": 2.571158392434988, + "grad_norm": 2.919260025024414, + "learning_rate": 
3.091344472901855e-06, + "loss": 0.4631, + "step": 5438 + }, + { + "epoch": 2.571631205673759, + "grad_norm": 2.956587553024292, + "learning_rate": 3.0907383303025134e-06, + "loss": 0.4974, + "step": 5439 + }, + { + "epoch": 2.5721040189125297, + "grad_norm": 2.758542776107788, + "learning_rate": 3.090132150920486e-06, + "loss": 0.4785, + "step": 5440 + }, + { + "epoch": 2.5725768321513005, + "grad_norm": 2.678469657897949, + "learning_rate": 3.0895259347935175e-06, + "loss": 0.4453, + "step": 5441 + }, + { + "epoch": 2.573049645390071, + "grad_norm": 2.6508545875549316, + "learning_rate": 3.088919681959355e-06, + "loss": 0.4426, + "step": 5442 + }, + { + "epoch": 2.5735224586288417, + "grad_norm": 2.6156187057495117, + "learning_rate": 3.0883133924557453e-06, + "loss": 0.4445, + "step": 5443 + }, + { + "epoch": 2.5739952718676125, + "grad_norm": 2.484374761581421, + "learning_rate": 3.08770706632044e-06, + "loss": 0.4155, + "step": 5444 + }, + { + "epoch": 2.574468085106383, + "grad_norm": 2.7465295791625977, + "learning_rate": 3.087100703591193e-06, + "loss": 0.4085, + "step": 5445 + }, + { + "epoch": 2.5749408983451536, + "grad_norm": 2.771740198135376, + "learning_rate": 3.08649430430576e-06, + "loss": 0.4313, + "step": 5446 + }, + { + "epoch": 2.5754137115839244, + "grad_norm": 2.7480874061584473, + "learning_rate": 3.0858878685018984e-06, + "loss": 0.3471, + "step": 5447 + }, + { + "epoch": 2.575886524822695, + "grad_norm": 2.894913673400879, + "learning_rate": 3.085281396217368e-06, + "loss": 0.4888, + "step": 5448 + }, + { + "epoch": 2.576359338061466, + "grad_norm": 3.037628173828125, + "learning_rate": 3.0846748874899306e-06, + "loss": 0.3976, + "step": 5449 + }, + { + "epoch": 2.5768321513002364, + "grad_norm": 2.4811434745788574, + "learning_rate": 3.0840683423573526e-06, + "loss": 0.4822, + "step": 5450 + }, + { + "epoch": 2.577304964539007, + "grad_norm": 3.0078725814819336, + "learning_rate": 3.0834617608573998e-06, + "loss": 0.4999, + "step": 5451 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 3.174154043197632, + "learning_rate": 3.0828551430278413e-06, + "loss": 0.4626, + "step": 5452 + }, + { + "epoch": 2.5782505910165483, + "grad_norm": 2.8277535438537598, + "learning_rate": 3.082248488906449e-06, + "loss": 0.4633, + "step": 5453 + }, + { + "epoch": 2.578723404255319, + "grad_norm": 2.731767416000366, + "learning_rate": 3.0816417985309966e-06, + "loss": 0.4148, + "step": 5454 + }, + { + "epoch": 2.57919621749409, + "grad_norm": 2.5480549335479736, + "learning_rate": 3.0810350719392597e-06, + "loss": 0.4773, + "step": 5455 + }, + { + "epoch": 2.5796690307328607, + "grad_norm": 2.9755172729492188, + "learning_rate": 3.080428309169017e-06, + "loss": 0.5107, + "step": 5456 + }, + { + "epoch": 2.580141843971631, + "grad_norm": 2.6499290466308594, + "learning_rate": 3.079821510258048e-06, + "loss": 0.3982, + "step": 5457 + }, + { + "epoch": 2.580614657210402, + "grad_norm": 2.663214921951294, + "learning_rate": 3.079214675244136e-06, + "loss": 0.4419, + "step": 5458 + }, + { + "epoch": 2.5810874704491726, + "grad_norm": 2.595489263534546, + "learning_rate": 3.078607804165066e-06, + "loss": 0.3958, + "step": 5459 + }, + { + "epoch": 2.581560283687943, + "grad_norm": 3.031458854675293, + "learning_rate": 3.0780008970586255e-06, + "loss": 0.518, + "step": 5460 + }, + { + "epoch": 2.582033096926714, + "grad_norm": 2.827071189880371, + "learning_rate": 3.077393953962603e-06, + "loss": 0.4397, + "step": 5461 + }, + { + "epoch": 2.5825059101654846, + "grad_norm": 
2.656111240386963, + "learning_rate": 3.0767869749147917e-06, + "loss": 0.4912, + "step": 5462 + }, + { + "epoch": 2.5829787234042554, + "grad_norm": 2.545365333557129, + "learning_rate": 3.076179959952984e-06, + "loss": 0.3991, + "step": 5463 + }, + { + "epoch": 2.583451536643026, + "grad_norm": 2.5794365406036377, + "learning_rate": 3.075572909114977e-06, + "loss": 0.4499, + "step": 5464 + }, + { + "epoch": 2.5839243498817965, + "grad_norm": 2.787140369415283, + "learning_rate": 3.074965822438568e-06, + "loss": 0.386, + "step": 5465 + }, + { + "epoch": 2.5843971631205673, + "grad_norm": 2.6406853199005127, + "learning_rate": 3.0743586999615594e-06, + "loss": 0.4853, + "step": 5466 + }, + { + "epoch": 2.584869976359338, + "grad_norm": 2.8082082271575928, + "learning_rate": 3.073751541721752e-06, + "loss": 0.4669, + "step": 5467 + }, + { + "epoch": 2.5853427895981085, + "grad_norm": 2.8808975219726562, + "learning_rate": 3.073144347756952e-06, + "loss": 0.4193, + "step": 5468 + }, + { + "epoch": 2.5858156028368793, + "grad_norm": 2.823352813720703, + "learning_rate": 3.072537118104968e-06, + "loss": 0.482, + "step": 5469 + }, + { + "epoch": 2.58628841607565, + "grad_norm": 2.6454555988311768, + "learning_rate": 3.0719298528036073e-06, + "loss": 0.4667, + "step": 5470 + }, + { + "epoch": 2.586761229314421, + "grad_norm": 2.871145486831665, + "learning_rate": 3.0713225518906826e-06, + "loss": 0.5125, + "step": 5471 + }, + { + "epoch": 2.5872340425531917, + "grad_norm": 3.1301417350769043, + "learning_rate": 3.070715215404007e-06, + "loss": 0.4827, + "step": 5472 + }, + { + "epoch": 2.587706855791962, + "grad_norm": 2.31062912940979, + "learning_rate": 3.070107843381398e-06, + "loss": 0.3954, + "step": 5473 + }, + { + "epoch": 2.588179669030733, + "grad_norm": 2.8366353511810303, + "learning_rate": 3.069500435860674e-06, + "loss": 0.4597, + "step": 5474 + }, + { + "epoch": 2.5886524822695036, + "grad_norm": 2.900143623352051, + "learning_rate": 3.068892992879654e-06, + "loss": 0.4294, + "step": 5475 + }, + { + "epoch": 2.589125295508274, + "grad_norm": 2.923313617706299, + "learning_rate": 3.0682855144761626e-06, + "loss": 0.505, + "step": 5476 + }, + { + "epoch": 2.5895981087470448, + "grad_norm": 2.726475954055786, + "learning_rate": 3.0676780006880242e-06, + "loss": 0.4208, + "step": 5477 + }, + { + "epoch": 2.5900709219858156, + "grad_norm": 4.115052223205566, + "learning_rate": 3.0670704515530654e-06, + "loss": 0.466, + "step": 5478 + }, + { + "epoch": 2.5905437352245864, + "grad_norm": 2.6018717288970947, + "learning_rate": 3.0664628671091163e-06, + "loss": 0.4697, + "step": 5479 + }, + { + "epoch": 2.591016548463357, + "grad_norm": 2.7393722534179688, + "learning_rate": 3.0658552473940085e-06, + "loss": 0.4618, + "step": 5480 + }, + { + "epoch": 2.5914893617021275, + "grad_norm": 2.8406929969787598, + "learning_rate": 3.065247592445575e-06, + "loss": 0.4806, + "step": 5481 + }, + { + "epoch": 2.5919621749408983, + "grad_norm": 2.9773001670837402, + "learning_rate": 3.0646399023016525e-06, + "loss": 0.4764, + "step": 5482 + }, + { + "epoch": 2.592434988179669, + "grad_norm": 3.374643325805664, + "learning_rate": 3.0640321770000804e-06, + "loss": 0.4481, + "step": 5483 + }, + { + "epoch": 2.5929078014184395, + "grad_norm": 2.5742013454437256, + "learning_rate": 3.0634244165786965e-06, + "loss": 0.432, + "step": 5484 + }, + { + "epoch": 2.5933806146572103, + "grad_norm": 2.9390289783477783, + "learning_rate": 3.062816621075346e-06, + "loss": 0.3941, + "step": 5485 + }, + { + "epoch": 
2.593853427895981, + "grad_norm": 2.683414936065674, + "learning_rate": 3.062208790527871e-06, + "loss": 0.4268, + "step": 5486 + }, + { + "epoch": 2.594326241134752, + "grad_norm": 2.689647674560547, + "learning_rate": 3.06160092497412e-06, + "loss": 0.4569, + "step": 5487 + }, + { + "epoch": 2.5947990543735227, + "grad_norm": 3.1170310974121094, + "learning_rate": 3.060993024451943e-06, + "loss": 0.4387, + "step": 5488 + }, + { + "epoch": 2.595271867612293, + "grad_norm": 2.8732447624206543, + "learning_rate": 3.0603850889991894e-06, + "loss": 0.451, + "step": 5489 + }, + { + "epoch": 2.595744680851064, + "grad_norm": 3.0444157123565674, + "learning_rate": 3.0597771186537135e-06, + "loss": 0.4691, + "step": 5490 + }, + { + "epoch": 2.5962174940898346, + "grad_norm": 2.3791720867156982, + "learning_rate": 3.0591691134533714e-06, + "loss": 0.4771, + "step": 5491 + }, + { + "epoch": 2.596690307328605, + "grad_norm": 3.0677225589752197, + "learning_rate": 3.05856107343602e-06, + "loss": 0.459, + "step": 5492 + }, + { + "epoch": 2.5971631205673757, + "grad_norm": 3.1702635288238525, + "learning_rate": 3.05795299863952e-06, + "loss": 0.4816, + "step": 5493 + }, + { + "epoch": 2.5976359338061465, + "grad_norm": 2.964869499206543, + "learning_rate": 3.057344889101734e-06, + "loss": 0.4369, + "step": 5494 + }, + { + "epoch": 2.5981087470449173, + "grad_norm": 3.1333882808685303, + "learning_rate": 3.056736744860525e-06, + "loss": 0.4178, + "step": 5495 + }, + { + "epoch": 2.598581560283688, + "grad_norm": 2.4340405464172363, + "learning_rate": 3.05612856595376e-06, + "loss": 0.4359, + "step": 5496 + }, + { + "epoch": 2.5990543735224585, + "grad_norm": 2.638620615005493, + "learning_rate": 3.0555203524193083e-06, + "loss": 0.3915, + "step": 5497 + }, + { + "epoch": 2.5995271867612293, + "grad_norm": 2.8218815326690674, + "learning_rate": 3.054912104295039e-06, + "loss": 0.4684, + "step": 5498 + }, + { + "epoch": 2.6, + "grad_norm": 2.6696009635925293, + "learning_rate": 3.054303821618827e-06, + "loss": 0.4073, + "step": 5499 + }, + { + "epoch": 2.6004728132387704, + "grad_norm": 2.3880512714385986, + "learning_rate": 3.0536955044285465e-06, + "loss": 0.3576, + "step": 5500 + }, + { + "epoch": 2.6009456264775412, + "grad_norm": 2.762890100479126, + "learning_rate": 3.053087152762075e-06, + "loss": 0.3857, + "step": 5501 + }, + { + "epoch": 2.601418439716312, + "grad_norm": 2.729033946990967, + "learning_rate": 3.052478766657292e-06, + "loss": 0.3935, + "step": 5502 + }, + { + "epoch": 2.601891252955083, + "grad_norm": 2.630490303039551, + "learning_rate": 3.051870346152078e-06, + "loss": 0.3932, + "step": 5503 + }, + { + "epoch": 2.6023640661938536, + "grad_norm": 3.0335981845855713, + "learning_rate": 3.051261891284318e-06, + "loss": 0.4313, + "step": 5504 + }, + { + "epoch": 2.602836879432624, + "grad_norm": 2.969888687133789, + "learning_rate": 3.0506534020918963e-06, + "loss": 0.4698, + "step": 5505 + }, + { + "epoch": 2.603309692671395, + "grad_norm": 3.093996524810791, + "learning_rate": 3.050044878612703e-06, + "loss": 0.5338, + "step": 5506 + }, + { + "epoch": 2.6037825059101656, + "grad_norm": 2.759993314743042, + "learning_rate": 3.049436320884626e-06, + "loss": 0.4429, + "step": 5507 + }, + { + "epoch": 2.604255319148936, + "grad_norm": 2.979422092437744, + "learning_rate": 3.0488277289455587e-06, + "loss": 0.4489, + "step": 5508 + }, + { + "epoch": 2.6047281323877067, + "grad_norm": 2.8266701698303223, + "learning_rate": 3.048219102833396e-06, + "loss": 0.489, + "step": 5509 + }, + { + 
"epoch": 2.6052009456264775, + "grad_norm": 2.2582461833953857, + "learning_rate": 3.047610442586033e-06, + "loss": 0.3759, + "step": 5510 + }, + { + "epoch": 2.6056737588652483, + "grad_norm": 3.078152894973755, + "learning_rate": 3.0470017482413694e-06, + "loss": 0.5059, + "step": 5511 + }, + { + "epoch": 2.606146572104019, + "grad_norm": 2.7895498275756836, + "learning_rate": 3.0463930198373047e-06, + "loss": 0.4752, + "step": 5512 + }, + { + "epoch": 2.6066193853427895, + "grad_norm": 3.2307958602905273, + "learning_rate": 3.045784257411743e-06, + "loss": 0.4847, + "step": 5513 + }, + { + "epoch": 2.6070921985815603, + "grad_norm": 2.793661594390869, + "learning_rate": 3.0451754610025884e-06, + "loss": 0.4492, + "step": 5514 + }, + { + "epoch": 2.607565011820331, + "grad_norm": 2.4443132877349854, + "learning_rate": 3.0445666306477484e-06, + "loss": 0.4174, + "step": 5515 + }, + { + "epoch": 2.6080378250591014, + "grad_norm": 2.628769636154175, + "learning_rate": 3.0439577663851326e-06, + "loss": 0.3889, + "step": 5516 + }, + { + "epoch": 2.608510638297872, + "grad_norm": 2.9367563724517822, + "learning_rate": 3.0433488682526525e-06, + "loss": 0.437, + "step": 5517 + }, + { + "epoch": 2.608983451536643, + "grad_norm": 3.171353340148926, + "learning_rate": 3.04273993628822e-06, + "loss": 0.47, + "step": 5518 + }, + { + "epoch": 2.609456264775414, + "grad_norm": 2.856576442718506, + "learning_rate": 3.0421309705297513e-06, + "loss": 0.4797, + "step": 5519 + }, + { + "epoch": 2.6099290780141846, + "grad_norm": 2.4926068782806396, + "learning_rate": 3.041521971015165e-06, + "loss": 0.4294, + "step": 5520 + }, + { + "epoch": 2.610401891252955, + "grad_norm": 2.7897613048553467, + "learning_rate": 3.040912937782379e-06, + "loss": 0.4388, + "step": 5521 + }, + { + "epoch": 2.6108747044917258, + "grad_norm": 3.588188886642456, + "learning_rate": 3.0403038708693173e-06, + "loss": 0.4027, + "step": 5522 + }, + { + "epoch": 2.6113475177304966, + "grad_norm": 3.5394980907440186, + "learning_rate": 3.0396947703139017e-06, + "loss": 0.4866, + "step": 5523 + }, + { + "epoch": 2.611820330969267, + "grad_norm": 3.086865186691284, + "learning_rate": 3.03908563615406e-06, + "loss": 0.4344, + "step": 5524 + }, + { + "epoch": 2.6122931442080377, + "grad_norm": 2.649564504623413, + "learning_rate": 3.0384764684277194e-06, + "loss": 0.4571, + "step": 5525 + }, + { + "epoch": 2.6127659574468085, + "grad_norm": 2.945234775543213, + "learning_rate": 3.0378672671728105e-06, + "loss": 0.4885, + "step": 5526 + }, + { + "epoch": 2.6132387706855793, + "grad_norm": 2.625424861907959, + "learning_rate": 3.037258032427265e-06, + "loss": 0.4095, + "step": 5527 + }, + { + "epoch": 2.61371158392435, + "grad_norm": 2.7597248554229736, + "learning_rate": 3.0366487642290175e-06, + "loss": 0.4393, + "step": 5528 + }, + { + "epoch": 2.6141843971631205, + "grad_norm": 2.721189260482788, + "learning_rate": 3.0360394626160043e-06, + "loss": 0.3865, + "step": 5529 + }, + { + "epoch": 2.6146572104018913, + "grad_norm": 2.624056339263916, + "learning_rate": 3.0354301276261656e-06, + "loss": 0.4273, + "step": 5530 + }, + { + "epoch": 2.615130023640662, + "grad_norm": 2.7764177322387695, + "learning_rate": 3.034820759297439e-06, + "loss": 0.4756, + "step": 5531 + }, + { + "epoch": 2.6156028368794324, + "grad_norm": 3.0841729640960693, + "learning_rate": 3.0342113576677696e-06, + "loss": 0.4907, + "step": 5532 + }, + { + "epoch": 2.616075650118203, + "grad_norm": 2.678715705871582, + "learning_rate": 3.0336019227751017e-06, + "loss": 
0.4478, + "step": 5533 + }, + { + "epoch": 2.616548463356974, + "grad_norm": 2.378679037094116, + "learning_rate": 3.032992454657382e-06, + "loss": 0.3678, + "step": 5534 + }, + { + "epoch": 2.617021276595745, + "grad_norm": 2.792079210281372, + "learning_rate": 3.0323829533525583e-06, + "loss": 0.4115, + "step": 5535 + }, + { + "epoch": 2.6174940898345156, + "grad_norm": 2.738133192062378, + "learning_rate": 3.0317734188985832e-06, + "loss": 0.4152, + "step": 5536 + }, + { + "epoch": 2.617966903073286, + "grad_norm": 2.6963796615600586, + "learning_rate": 3.0311638513334084e-06, + "loss": 0.4096, + "step": 5537 + }, + { + "epoch": 2.6184397163120567, + "grad_norm": 2.694145679473877, + "learning_rate": 3.03055425069499e-06, + "loss": 0.3793, + "step": 5538 + }, + { + "epoch": 2.6189125295508275, + "grad_norm": 2.762403964996338, + "learning_rate": 3.0299446170212855e-06, + "loss": 0.459, + "step": 5539 + }, + { + "epoch": 2.619385342789598, + "grad_norm": 2.804382562637329, + "learning_rate": 3.0293349503502522e-06, + "loss": 0.4853, + "step": 5540 + }, + { + "epoch": 2.6198581560283687, + "grad_norm": 2.7768518924713135, + "learning_rate": 3.0287252507198537e-06, + "loss": 0.4496, + "step": 5541 + }, + { + "epoch": 2.6203309692671395, + "grad_norm": 2.9075138568878174, + "learning_rate": 3.028115518168052e-06, + "loss": 0.4498, + "step": 5542 + }, + { + "epoch": 2.6208037825059103, + "grad_norm": 2.8966822624206543, + "learning_rate": 3.0275057527328126e-06, + "loss": 0.4434, + "step": 5543 + }, + { + "epoch": 2.621276595744681, + "grad_norm": 2.8140156269073486, + "learning_rate": 3.0268959544521027e-06, + "loss": 0.3935, + "step": 5544 + }, + { + "epoch": 2.6217494089834514, + "grad_norm": 2.8606276512145996, + "learning_rate": 3.0262861233638924e-06, + "loss": 0.4222, + "step": 5545 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 3.003610134124756, + "learning_rate": 3.0256762595061522e-06, + "loss": 0.428, + "step": 5546 + }, + { + "epoch": 2.622695035460993, + "grad_norm": 2.725907802581787, + "learning_rate": 3.025066362916857e-06, + "loss": 0.3975, + "step": 5547 + }, + { + "epoch": 2.6231678486997634, + "grad_norm": 2.5247902870178223, + "learning_rate": 3.024456433633982e-06, + "loss": 0.4584, + "step": 5548 + }, + { + "epoch": 2.623640661938534, + "grad_norm": 2.932798147201538, + "learning_rate": 3.0238464716955045e-06, + "loss": 0.4991, + "step": 5549 + }, + { + "epoch": 2.624113475177305, + "grad_norm": 2.693547010421753, + "learning_rate": 3.023236477139404e-06, + "loss": 0.4405, + "step": 5550 + }, + { + "epoch": 2.6245862884160758, + "grad_norm": 3.2600035667419434, + "learning_rate": 3.022626450003662e-06, + "loss": 0.4904, + "step": 5551 + }, + { + "epoch": 2.6250591016548466, + "grad_norm": 2.9471960067749023, + "learning_rate": 3.0220163903262627e-06, + "loss": 0.4487, + "step": 5552 + }, + { + "epoch": 2.625531914893617, + "grad_norm": 2.583944082260132, + "learning_rate": 3.0214062981451926e-06, + "loss": 0.3552, + "step": 5553 + }, + { + "epoch": 2.6260047281323877, + "grad_norm": 2.675062656402588, + "learning_rate": 3.0207961734984377e-06, + "loss": 0.4524, + "step": 5554 + }, + { + "epoch": 2.6264775413711585, + "grad_norm": 3.0126802921295166, + "learning_rate": 3.0201860164239887e-06, + "loss": 0.4124, + "step": 5555 + }, + { + "epoch": 2.626950354609929, + "grad_norm": 2.490734577178955, + "learning_rate": 3.019575826959838e-06, + "loss": 0.4095, + "step": 5556 + }, + { + "epoch": 2.6274231678486997, + "grad_norm": 2.72817063331604, + "learning_rate": 
3.018965605143978e-06, + "loss": 0.4298, + "step": 5557 + }, + { + "epoch": 2.6278959810874705, + "grad_norm": 3.1298327445983887, + "learning_rate": 3.0183553510144064e-06, + "loss": 0.4961, + "step": 5558 + }, + { + "epoch": 2.6283687943262413, + "grad_norm": 3.2379956245422363, + "learning_rate": 3.0177450646091195e-06, + "loss": 0.4943, + "step": 5559 + }, + { + "epoch": 2.628841607565012, + "grad_norm": 2.5040571689605713, + "learning_rate": 3.017134745966117e-06, + "loss": 0.3701, + "step": 5560 + }, + { + "epoch": 2.6293144208037824, + "grad_norm": 3.047184944152832, + "learning_rate": 3.0165243951234025e-06, + "loss": 0.4587, + "step": 5561 + }, + { + "epoch": 2.629787234042553, + "grad_norm": 2.4926774501800537, + "learning_rate": 3.0159140121189783e-06, + "loss": 0.3723, + "step": 5562 + }, + { + "epoch": 2.630260047281324, + "grad_norm": 2.5434961318969727, + "learning_rate": 3.015303596990851e-06, + "loss": 0.4176, + "step": 5563 + }, + { + "epoch": 2.6307328605200944, + "grad_norm": 2.5117976665496826, + "learning_rate": 3.0146931497770284e-06, + "loss": 0.4218, + "step": 5564 + }, + { + "epoch": 2.631205673758865, + "grad_norm": 2.9408798217773438, + "learning_rate": 3.0140826705155196e-06, + "loss": 0.4473, + "step": 5565 + }, + { + "epoch": 2.631678486997636, + "grad_norm": 2.996422052383423, + "learning_rate": 3.0134721592443385e-06, + "loss": 0.4513, + "step": 5566 + }, + { + "epoch": 2.6321513002364068, + "grad_norm": 2.984356164932251, + "learning_rate": 3.0128616160014955e-06, + "loss": 0.4749, + "step": 5567 + }, + { + "epoch": 2.6326241134751776, + "grad_norm": 2.6075069904327393, + "learning_rate": 3.0122510408250095e-06, + "loss": 0.4707, + "step": 5568 + }, + { + "epoch": 2.633096926713948, + "grad_norm": 2.9463071823120117, + "learning_rate": 3.0116404337528972e-06, + "loss": 0.5125, + "step": 5569 + }, + { + "epoch": 2.6335697399527187, + "grad_norm": 2.98574161529541, + "learning_rate": 3.0110297948231787e-06, + "loss": 0.4487, + "step": 5570 + }, + { + "epoch": 2.6340425531914895, + "grad_norm": 2.6039397716522217, + "learning_rate": 3.010419124073876e-06, + "loss": 0.4516, + "step": 5571 + }, + { + "epoch": 2.63451536643026, + "grad_norm": 2.8480236530303955, + "learning_rate": 3.0098084215430124e-06, + "loss": 0.4962, + "step": 5572 + }, + { + "epoch": 2.6349881796690307, + "grad_norm": 2.527597427368164, + "learning_rate": 3.0091976872686133e-06, + "loss": 0.435, + "step": 5573 + }, + { + "epoch": 2.6354609929078014, + "grad_norm": 2.898303508758545, + "learning_rate": 3.0085869212887076e-06, + "loss": 0.4473, + "step": 5574 + }, + { + "epoch": 2.6359338061465722, + "grad_norm": 2.981414318084717, + "learning_rate": 3.007976123641324e-06, + "loss": 0.4203, + "step": 5575 + }, + { + "epoch": 2.636406619385343, + "grad_norm": 3.219064474105835, + "learning_rate": 3.0073652943644947e-06, + "loss": 0.4596, + "step": 5576 + }, + { + "epoch": 2.6368794326241134, + "grad_norm": 2.7287049293518066, + "learning_rate": 3.0067544334962532e-06, + "loss": 0.433, + "step": 5577 + }, + { + "epoch": 2.637352245862884, + "grad_norm": 2.6232664585113525, + "learning_rate": 3.0061435410746352e-06, + "loss": 0.4254, + "step": 5578 + }, + { + "epoch": 2.637825059101655, + "grad_norm": 2.908311605453491, + "learning_rate": 3.0055326171376788e-06, + "loss": 0.4349, + "step": 5579 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 2.8369064331054688, + "learning_rate": 3.0049216617234224e-06, + "loss": 0.4675, + "step": 5580 + }, + { + "epoch": 2.638770685579196, + 
"grad_norm": 2.659499406814575, + "learning_rate": 3.0043106748699085e-06, + "loss": 0.4073, + "step": 5581 + }, + { + "epoch": 2.639243498817967, + "grad_norm": 2.579765558242798, + "learning_rate": 3.00369965661518e-06, + "loss": 0.4536, + "step": 5582 + }, + { + "epoch": 2.6397163120567377, + "grad_norm": 3.572861909866333, + "learning_rate": 3.0030886069972827e-06, + "loss": 0.5227, + "step": 5583 + }, + { + "epoch": 2.6401891252955085, + "grad_norm": 2.6523196697235107, + "learning_rate": 3.002477526054263e-06, + "loss": 0.3846, + "step": 5584 + }, + { + "epoch": 2.640661938534279, + "grad_norm": 3.072181463241577, + "learning_rate": 3.001866413824173e-06, + "loss": 0.5399, + "step": 5585 + }, + { + "epoch": 2.6411347517730497, + "grad_norm": 2.7304325103759766, + "learning_rate": 3.0012552703450597e-06, + "loss": 0.4048, + "step": 5586 + }, + { + "epoch": 2.6416075650118205, + "grad_norm": 3.039491891860962, + "learning_rate": 3.0006440956549798e-06, + "loss": 0.5035, + "step": 5587 + }, + { + "epoch": 2.642080378250591, + "grad_norm": 2.7623798847198486, + "learning_rate": 3.000032889791988e-06, + "loss": 0.4369, + "step": 5588 + }, + { + "epoch": 2.6425531914893616, + "grad_norm": 3.391052722930908, + "learning_rate": 2.9994216527941394e-06, + "loss": 0.5308, + "step": 5589 + }, + { + "epoch": 2.6430260047281324, + "grad_norm": 3.0263915061950684, + "learning_rate": 2.9988103846994954e-06, + "loss": 0.4319, + "step": 5590 + }, + { + "epoch": 2.6434988179669032, + "grad_norm": 2.786607027053833, + "learning_rate": 2.998199085546115e-06, + "loss": 0.4695, + "step": 5591 + }, + { + "epoch": 2.643971631205674, + "grad_norm": 2.884674310684204, + "learning_rate": 2.9975877553720627e-06, + "loss": 0.4615, + "step": 5592 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 2.6100499629974365, + "learning_rate": 2.996976394215402e-06, + "loss": 0.4784, + "step": 5593 + }, + { + "epoch": 2.644917257683215, + "grad_norm": 2.6978676319122314, + "learning_rate": 2.9963650021142018e-06, + "loss": 0.3911, + "step": 5594 + }, + { + "epoch": 2.645390070921986, + "grad_norm": 2.8080835342407227, + "learning_rate": 2.9957535791065284e-06, + "loss": 0.4997, + "step": 5595 + }, + { + "epoch": 2.6458628841607563, + "grad_norm": 2.6639578342437744, + "learning_rate": 2.9951421252304537e-06, + "loss": 0.4066, + "step": 5596 + }, + { + "epoch": 2.646335697399527, + "grad_norm": 3.102456569671631, + "learning_rate": 2.9945306405240505e-06, + "loss": 0.5554, + "step": 5597 + }, + { + "epoch": 2.646808510638298, + "grad_norm": 2.6524150371551514, + "learning_rate": 2.993919125025392e-06, + "loss": 0.3881, + "step": 5598 + }, + { + "epoch": 2.6472813238770687, + "grad_norm": 2.926316499710083, + "learning_rate": 2.993307578772556e-06, + "loss": 0.4845, + "step": 5599 + }, + { + "epoch": 2.6477541371158395, + "grad_norm": 3.346550703048706, + "learning_rate": 2.9926960018036195e-06, + "loss": 0.4481, + "step": 5600 + }, + { + "epoch": 2.64822695035461, + "grad_norm": 2.6211020946502686, + "learning_rate": 2.9920843941566634e-06, + "loss": 0.4355, + "step": 5601 + }, + { + "epoch": 2.6486997635933807, + "grad_norm": 2.7479333877563477, + "learning_rate": 2.99147275586977e-06, + "loss": 0.4373, + "step": 5602 + }, + { + "epoch": 2.6491725768321515, + "grad_norm": 2.523385524749756, + "learning_rate": 2.9908610869810235e-06, + "loss": 0.4467, + "step": 5603 + }, + { + "epoch": 2.649645390070922, + "grad_norm": 2.93886137008667, + "learning_rate": 2.9902493875285086e-06, + "loss": 0.4956, + "step": 5604 + }, + { 
+ "epoch": 2.6501182033096926, + "grad_norm": 2.7630443572998047, + "learning_rate": 2.989637657550315e-06, + "loss": 0.5012, + "step": 5605 + }, + { + "epoch": 2.6505910165484634, + "grad_norm": 2.6733906269073486, + "learning_rate": 2.989025897084531e-06, + "loss": 0.446, + "step": 5606 + }, + { + "epoch": 2.651063829787234, + "grad_norm": 2.8411107063293457, + "learning_rate": 2.9884141061692484e-06, + "loss": 0.4817, + "step": 5607 + }, + { + "epoch": 2.651536643026005, + "grad_norm": 2.8667192459106445, + "learning_rate": 2.987802284842562e-06, + "loss": 0.3909, + "step": 5608 + }, + { + "epoch": 2.6520094562647754, + "grad_norm": 3.4640755653381348, + "learning_rate": 2.987190433142565e-06, + "loss": 0.4379, + "step": 5609 + }, + { + "epoch": 2.652482269503546, + "grad_norm": 2.675121307373047, + "learning_rate": 2.9865785511073565e-06, + "loss": 0.4833, + "step": 5610 + }, + { + "epoch": 2.652955082742317, + "grad_norm": 2.4375529289245605, + "learning_rate": 2.9859666387750353e-06, + "loss": 0.3949, + "step": 5611 + }, + { + "epoch": 2.6534278959810873, + "grad_norm": 2.7312581539154053, + "learning_rate": 2.9853546961837026e-06, + "loss": 0.4546, + "step": 5612 + }, + { + "epoch": 2.653900709219858, + "grad_norm": 2.7695999145507812, + "learning_rate": 2.9847427233714617e-06, + "loss": 0.4696, + "step": 5613 + }, + { + "epoch": 2.654373522458629, + "grad_norm": 2.6313109397888184, + "learning_rate": 2.984130720376416e-06, + "loss": 0.4733, + "step": 5614 + }, + { + "epoch": 2.6548463356973997, + "grad_norm": 2.656864881515503, + "learning_rate": 2.9835186872366733e-06, + "loss": 0.3806, + "step": 5615 + }, + { + "epoch": 2.65531914893617, + "grad_norm": 2.720075845718384, + "learning_rate": 2.982906623990342e-06, + "loss": 0.4041, + "step": 5616 + }, + { + "epoch": 2.655791962174941, + "grad_norm": 2.6684951782226562, + "learning_rate": 2.9822945306755334e-06, + "loss": 0.4552, + "step": 5617 + }, + { + "epoch": 2.6562647754137116, + "grad_norm": 2.567751884460449, + "learning_rate": 2.9816824073303585e-06, + "loss": 0.465, + "step": 5618 + }, + { + "epoch": 2.656737588652482, + "grad_norm": 2.7490367889404297, + "learning_rate": 2.981070253992933e-06, + "loss": 0.4647, + "step": 5619 + }, + { + "epoch": 2.657210401891253, + "grad_norm": 2.548656463623047, + "learning_rate": 2.9804580707013715e-06, + "loss": 0.4226, + "step": 5620 + }, + { + "epoch": 2.6576832151300236, + "grad_norm": 2.5484731197357178, + "learning_rate": 2.9798458574937927e-06, + "loss": 0.382, + "step": 5621 + }, + { + "epoch": 2.6581560283687944, + "grad_norm": 2.7293949127197266, + "learning_rate": 2.979233614408317e-06, + "loss": 0.4418, + "step": 5622 + }, + { + "epoch": 2.658628841607565, + "grad_norm": 2.645036458969116, + "learning_rate": 2.9786213414830646e-06, + "loss": 0.414, + "step": 5623 + }, + { + "epoch": 2.6591016548463355, + "grad_norm": 2.5287609100341797, + "learning_rate": 2.9780090387561604e-06, + "loss": 0.3914, + "step": 5624 + }, + { + "epoch": 2.6595744680851063, + "grad_norm": 2.5570411682128906, + "learning_rate": 2.9773967062657293e-06, + "loss": 0.4431, + "step": 5625 + }, + { + "epoch": 2.660047281323877, + "grad_norm": 2.681749105453491, + "learning_rate": 2.9767843440498983e-06, + "loss": 0.4245, + "step": 5626 + }, + { + "epoch": 2.6605200945626475, + "grad_norm": 2.8629777431488037, + "learning_rate": 2.976171952146798e-06, + "loss": 0.4643, + "step": 5627 + }, + { + "epoch": 2.6609929078014183, + "grad_norm": 2.577148199081421, + "learning_rate": 2.9755595305945573e-06, + 
"loss": 0.43, + "step": 5628 + }, + { + "epoch": 2.661465721040189, + "grad_norm": 2.747218370437622, + "learning_rate": 2.97494707943131e-06, + "loss": 0.5194, + "step": 5629 + }, + { + "epoch": 2.66193853427896, + "grad_norm": 2.535604953765869, + "learning_rate": 2.9743345986951904e-06, + "loss": 0.4401, + "step": 5630 + }, + { + "epoch": 2.6624113475177307, + "grad_norm": 3.3341166973114014, + "learning_rate": 2.973722088424336e-06, + "loss": 0.4925, + "step": 5631 + }, + { + "epoch": 2.662884160756501, + "grad_norm": 2.9264349937438965, + "learning_rate": 2.973109548656884e-06, + "loss": 0.4787, + "step": 5632 + }, + { + "epoch": 2.663356973995272, + "grad_norm": 2.7132506370544434, + "learning_rate": 2.9724969794309742e-06, + "loss": 0.4138, + "step": 5633 + }, + { + "epoch": 2.6638297872340426, + "grad_norm": 2.7970192432403564, + "learning_rate": 2.9718843807847497e-06, + "loss": 0.4896, + "step": 5634 + }, + { + "epoch": 2.664302600472813, + "grad_norm": 2.610208749771118, + "learning_rate": 2.9712717527563545e-06, + "loss": 0.3997, + "step": 5635 + }, + { + "epoch": 2.6647754137115838, + "grad_norm": 3.5483577251434326, + "learning_rate": 2.9706590953839335e-06, + "loss": 0.5109, + "step": 5636 + }, + { + "epoch": 2.6652482269503546, + "grad_norm": 2.746933698654175, + "learning_rate": 2.9700464087056345e-06, + "loss": 0.4672, + "step": 5637 + }, + { + "epoch": 2.6657210401891254, + "grad_norm": 2.704436779022217, + "learning_rate": 2.969433692759607e-06, + "loss": 0.4402, + "step": 5638 + }, + { + "epoch": 2.666193853427896, + "grad_norm": 2.859520196914673, + "learning_rate": 2.9688209475840005e-06, + "loss": 0.4679, + "step": 5639 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 2.518580436706543, + "learning_rate": 2.968208173216971e-06, + "loss": 0.3772, + "step": 5640 + }, + { + "epoch": 2.6671394799054373, + "grad_norm": 2.7624926567077637, + "learning_rate": 2.967595369696671e-06, + "loss": 0.4753, + "step": 5641 + }, + { + "epoch": 2.667612293144208, + "grad_norm": 2.654003620147705, + "learning_rate": 2.966982537061257e-06, + "loss": 0.4583, + "step": 5642 + }, + { + "epoch": 2.6680851063829785, + "grad_norm": 2.8473968505859375, + "learning_rate": 2.966369675348888e-06, + "loss": 0.4623, + "step": 5643 + }, + { + "epoch": 2.6685579196217493, + "grad_norm": 2.5587947368621826, + "learning_rate": 2.9657567845977253e-06, + "loss": 0.4014, + "step": 5644 + }, + { + "epoch": 2.66903073286052, + "grad_norm": 2.572220802307129, + "learning_rate": 2.96514386484593e-06, + "loss": 0.4249, + "step": 5645 + }, + { + "epoch": 2.669503546099291, + "grad_norm": 2.7995707988739014, + "learning_rate": 2.964530916131665e-06, + "loss": 0.4575, + "step": 5646 + }, + { + "epoch": 2.6699763593380617, + "grad_norm": 2.8712687492370605, + "learning_rate": 2.963917938493097e-06, + "loss": 0.4353, + "step": 5647 + }, + { + "epoch": 2.670449172576832, + "grad_norm": 2.856473207473755, + "learning_rate": 2.963304931968393e-06, + "loss": 0.4345, + "step": 5648 + }, + { + "epoch": 2.670921985815603, + "grad_norm": 2.709198474884033, + "learning_rate": 2.9626918965957224e-06, + "loss": 0.4116, + "step": 5649 + }, + { + "epoch": 2.6713947990543736, + "grad_norm": 2.8144607543945312, + "learning_rate": 2.962078832413257e-06, + "loss": 0.4575, + "step": 5650 + }, + { + "epoch": 2.671867612293144, + "grad_norm": 3.131911039352417, + "learning_rate": 2.961465739459168e-06, + "loss": 0.4743, + "step": 5651 + }, + { + "epoch": 2.6723404255319148, + "grad_norm": 2.8487515449523926, + "learning_rate": 
2.9608526177716316e-06, + "loss": 0.4314, + "step": 5652 + }, + { + "epoch": 2.6728132387706856, + "grad_norm": 2.613229751586914, + "learning_rate": 2.960239467388823e-06, + "loss": 0.4807, + "step": 5653 + }, + { + "epoch": 2.6732860520094563, + "grad_norm": 2.5049116611480713, + "learning_rate": 2.9596262883489213e-06, + "loss": 0.4708, + "step": 5654 + }, + { + "epoch": 2.673758865248227, + "grad_norm": 2.6347460746765137, + "learning_rate": 2.9590130806901052e-06, + "loss": 0.3689, + "step": 5655 + }, + { + "epoch": 2.6742316784869975, + "grad_norm": 3.3290371894836426, + "learning_rate": 2.9583998444505578e-06, + "loss": 0.4674, + "step": 5656 + }, + { + "epoch": 2.6747044917257683, + "grad_norm": 2.748403549194336, + "learning_rate": 2.957786579668462e-06, + "loss": 0.3852, + "step": 5657 + }, + { + "epoch": 2.675177304964539, + "grad_norm": 2.837573766708374, + "learning_rate": 2.957173286382003e-06, + "loss": 0.4541, + "step": 5658 + }, + { + "epoch": 2.6756501182033094, + "grad_norm": 3.0976510047912598, + "learning_rate": 2.9565599646293686e-06, + "loss": 0.4669, + "step": 5659 + }, + { + "epoch": 2.6761229314420802, + "grad_norm": 2.7059597969055176, + "learning_rate": 2.955946614448747e-06, + "loss": 0.3935, + "step": 5660 + }, + { + "epoch": 2.676595744680851, + "grad_norm": 2.6700541973114014, + "learning_rate": 2.9553332358783294e-06, + "loss": 0.4322, + "step": 5661 + }, + { + "epoch": 2.677068557919622, + "grad_norm": 2.9782698154449463, + "learning_rate": 2.9547198289563068e-06, + "loss": 0.4338, + "step": 5662 + }, + { + "epoch": 2.6775413711583926, + "grad_norm": 2.637876510620117, + "learning_rate": 2.9541063937208755e-06, + "loss": 0.4289, + "step": 5663 + }, + { + "epoch": 2.678014184397163, + "grad_norm": 3.421949863433838, + "learning_rate": 2.953492930210229e-06, + "loss": 0.5458, + "step": 5664 + }, + { + "epoch": 2.678486997635934, + "grad_norm": 2.8273842334747314, + "learning_rate": 2.952879438462567e-06, + "loss": 0.4529, + "step": 5665 + }, + { + "epoch": 2.6789598108747046, + "grad_norm": 2.9090168476104736, + "learning_rate": 2.9522659185160873e-06, + "loss": 0.444, + "step": 5666 + }, + { + "epoch": 2.679432624113475, + "grad_norm": 2.646710157394409, + "learning_rate": 2.9516523704089927e-06, + "loss": 0.4226, + "step": 5667 + }, + { + "epoch": 2.6799054373522457, + "grad_norm": 2.65915584564209, + "learning_rate": 2.951038794179486e-06, + "loss": 0.4307, + "step": 5668 + }, + { + "epoch": 2.6803782505910165, + "grad_norm": 3.004507303237915, + "learning_rate": 2.950425189865771e-06, + "loss": 0.4799, + "step": 5669 + }, + { + "epoch": 2.6808510638297873, + "grad_norm": 2.5210134983062744, + "learning_rate": 2.949811557506054e-06, + "loss": 0.3842, + "step": 5670 + }, + { + "epoch": 2.681323877068558, + "grad_norm": 2.8072893619537354, + "learning_rate": 2.9491978971385436e-06, + "loss": 0.435, + "step": 5671 + }, + { + "epoch": 2.6817966903073285, + "grad_norm": 2.5701990127563477, + "learning_rate": 2.9485842088014498e-06, + "loss": 0.4932, + "step": 5672 + }, + { + "epoch": 2.6822695035460993, + "grad_norm": 2.9368457794189453, + "learning_rate": 2.9479704925329854e-06, + "loss": 0.455, + "step": 5673 + }, + { + "epoch": 2.68274231678487, + "grad_norm": 2.8576247692108154, + "learning_rate": 2.947356748371362e-06, + "loss": 0.4254, + "step": 5674 + }, + { + "epoch": 2.6832151300236404, + "grad_norm": 2.8999195098876953, + "learning_rate": 2.946742976354795e-06, + "loss": 0.4159, + "step": 5675 + }, + { + "epoch": 2.6836879432624112, + "grad_norm": 
2.8439736366271973, + "learning_rate": 2.946129176521502e-06, + "loss": 0.4035, + "step": 5676 + }, + { + "epoch": 2.684160756501182, + "grad_norm": 2.8525729179382324, + "learning_rate": 2.945515348909702e-06, + "loss": 0.4137, + "step": 5677 + }, + { + "epoch": 2.684633569739953, + "grad_norm": 2.6573562622070312, + "learning_rate": 2.9449014935576147e-06, + "loss": 0.4203, + "step": 5678 + }, + { + "epoch": 2.6851063829787236, + "grad_norm": 2.765794277191162, + "learning_rate": 2.9442876105034616e-06, + "loss": 0.5184, + "step": 5679 + }, + { + "epoch": 2.685579196217494, + "grad_norm": 2.694617748260498, + "learning_rate": 2.943673699785467e-06, + "loss": 0.417, + "step": 5680 + }, + { + "epoch": 2.6860520094562648, + "grad_norm": 2.740774393081665, + "learning_rate": 2.943059761441857e-06, + "loss": 0.4431, + "step": 5681 + }, + { + "epoch": 2.6865248226950356, + "grad_norm": 2.670642614364624, + "learning_rate": 2.942445795510859e-06, + "loss": 0.4298, + "step": 5682 + }, + { + "epoch": 2.686997635933806, + "grad_norm": 2.838907241821289, + "learning_rate": 2.9418318020307e-06, + "loss": 0.4529, + "step": 5683 + }, + { + "epoch": 2.6874704491725767, + "grad_norm": 2.562317371368408, + "learning_rate": 2.9412177810396135e-06, + "loss": 0.4251, + "step": 5684 + }, + { + "epoch": 2.6879432624113475, + "grad_norm": 2.5805928707122803, + "learning_rate": 2.9406037325758298e-06, + "loss": 0.4405, + "step": 5685 + }, + { + "epoch": 2.6884160756501183, + "grad_norm": 2.5701205730438232, + "learning_rate": 2.939989656677583e-06, + "loss": 0.4184, + "step": 5686 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 2.7990400791168213, + "learning_rate": 2.939375553383111e-06, + "loss": 0.4866, + "step": 5687 + }, + { + "epoch": 2.6893617021276595, + "grad_norm": 3.063319206237793, + "learning_rate": 2.9387614227306487e-06, + "loss": 0.4202, + "step": 5688 + }, + { + "epoch": 2.6898345153664303, + "grad_norm": 3.0891315937042236, + "learning_rate": 2.938147264758437e-06, + "loss": 0.4344, + "step": 5689 + }, + { + "epoch": 2.690307328605201, + "grad_norm": 2.8982670307159424, + "learning_rate": 2.9375330795047165e-06, + "loss": 0.4548, + "step": 5690 + }, + { + "epoch": 2.6907801418439714, + "grad_norm": 2.7947235107421875, + "learning_rate": 2.9369188670077293e-06, + "loss": 0.5028, + "step": 5691 + }, + { + "epoch": 2.691252955082742, + "grad_norm": 3.1615960597991943, + "learning_rate": 2.9363046273057206e-06, + "loss": 0.4855, + "step": 5692 + }, + { + "epoch": 2.691725768321513, + "grad_norm": 2.669516086578369, + "learning_rate": 2.935690360436935e-06, + "loss": 0.3813, + "step": 5693 + }, + { + "epoch": 2.692198581560284, + "grad_norm": 2.8743274211883545, + "learning_rate": 2.935076066439622e-06, + "loss": 0.4302, + "step": 5694 + }, + { + "epoch": 2.6926713947990546, + "grad_norm": 2.6829612255096436, + "learning_rate": 2.9344617453520295e-06, + "loss": 0.4063, + "step": 5695 + }, + { + "epoch": 2.693144208037825, + "grad_norm": 2.776447057723999, + "learning_rate": 2.9338473972124097e-06, + "loss": 0.4921, + "step": 5696 + }, + { + "epoch": 2.6936170212765957, + "grad_norm": 2.7865772247314453, + "learning_rate": 2.9332330220590143e-06, + "loss": 0.4939, + "step": 5697 + }, + { + "epoch": 2.6940898345153665, + "grad_norm": 3.020526170730591, + "learning_rate": 2.932618619930098e-06, + "loss": 0.4839, + "step": 5698 + }, + { + "epoch": 2.694562647754137, + "grad_norm": 2.637057065963745, + "learning_rate": 2.932004190863918e-06, + "loss": 0.4343, + "step": 5699 + }, + { + "epoch": 
2.6950354609929077, + "grad_norm": 2.7426512241363525, + "learning_rate": 2.9313897348987314e-06, + "loss": 0.3609, + "step": 5700 + }, + { + "epoch": 2.6955082742316785, + "grad_norm": 2.767186164855957, + "learning_rate": 2.9307752520727974e-06, + "loss": 0.3793, + "step": 5701 + }, + { + "epoch": 2.6959810874704493, + "grad_norm": 2.4791622161865234, + "learning_rate": 2.930160742424377e-06, + "loss": 0.4192, + "step": 5702 + }, + { + "epoch": 2.69645390070922, + "grad_norm": 2.661461591720581, + "learning_rate": 2.9295462059917336e-06, + "loss": 0.4758, + "step": 5703 + }, + { + "epoch": 2.6969267139479904, + "grad_norm": 2.896242380142212, + "learning_rate": 2.928931642813131e-06, + "loss": 0.42, + "step": 5704 + }, + { + "epoch": 2.6973995271867612, + "grad_norm": 2.783813238143921, + "learning_rate": 2.9283170529268366e-06, + "loss": 0.4726, + "step": 5705 + }, + { + "epoch": 2.697872340425532, + "grad_norm": 2.4347333908081055, + "learning_rate": 2.927702436371117e-06, + "loss": 0.4199, + "step": 5706 + }, + { + "epoch": 2.6983451536643024, + "grad_norm": 2.4643805027008057, + "learning_rate": 2.927087793184242e-06, + "loss": 0.3578, + "step": 5707 + }, + { + "epoch": 2.698817966903073, + "grad_norm": 2.6396660804748535, + "learning_rate": 2.9264731234044835e-06, + "loss": 0.4509, + "step": 5708 + }, + { + "epoch": 2.699290780141844, + "grad_norm": 2.7341182231903076, + "learning_rate": 2.925858427070113e-06, + "loss": 0.4331, + "step": 5709 + }, + { + "epoch": 2.699763593380615, + "grad_norm": 2.7578938007354736, + "learning_rate": 2.9252437042194058e-06, + "loss": 0.4508, + "step": 5710 + }, + { + "epoch": 2.7002364066193856, + "grad_norm": 2.557788133621216, + "learning_rate": 2.9246289548906375e-06, + "loss": 0.3775, + "step": 5711 + }, + { + "epoch": 2.700709219858156, + "grad_norm": 2.802851676940918, + "learning_rate": 2.924014179122086e-06, + "loss": 0.4518, + "step": 5712 + }, + { + "epoch": 2.7011820330969267, + "grad_norm": 2.4773001670837402, + "learning_rate": 2.9233993769520313e-06, + "loss": 0.4019, + "step": 5713 + }, + { + "epoch": 2.7016548463356975, + "grad_norm": 3.108971357345581, + "learning_rate": 2.922784548418754e-06, + "loss": 0.4715, + "step": 5714 + }, + { + "epoch": 2.702127659574468, + "grad_norm": 2.8596770763397217, + "learning_rate": 2.9221696935605366e-06, + "loss": 0.4361, + "step": 5715 + }, + { + "epoch": 2.7026004728132387, + "grad_norm": 2.570604085922241, + "learning_rate": 2.9215548124156633e-06, + "loss": 0.3982, + "step": 5716 + }, + { + "epoch": 2.7030732860520095, + "grad_norm": 2.3157799243927, + "learning_rate": 2.9209399050224206e-06, + "loss": 0.456, + "step": 5717 + }, + { + "epoch": 2.7035460992907803, + "grad_norm": 2.6865758895874023, + "learning_rate": 2.9203249714190952e-06, + "loss": 0.4441, + "step": 5718 + }, + { + "epoch": 2.704018912529551, + "grad_norm": 2.76723313331604, + "learning_rate": 2.919710011643978e-06, + "loss": 0.464, + "step": 5719 + }, + { + "epoch": 2.7044917257683214, + "grad_norm": 2.648792028427124, + "learning_rate": 2.9190950257353578e-06, + "loss": 0.3426, + "step": 5720 + }, + { + "epoch": 2.704964539007092, + "grad_norm": 2.878739833831787, + "learning_rate": 2.9184800137315276e-06, + "loss": 0.4431, + "step": 5721 + }, + { + "epoch": 2.705437352245863, + "grad_norm": 2.670567274093628, + "learning_rate": 2.917864975670783e-06, + "loss": 0.4347, + "step": 5722 + }, + { + "epoch": 2.7059101654846334, + "grad_norm": 2.7031569480895996, + "learning_rate": 2.9172499115914184e-06, + "loss": 0.4557, + 
"step": 5723 + }, + { + "epoch": 2.706382978723404, + "grad_norm": 2.5225696563720703, + "learning_rate": 2.9166348215317314e-06, + "loss": 0.4159, + "step": 5724 + }, + { + "epoch": 2.706855791962175, + "grad_norm": 2.8676085472106934, + "learning_rate": 2.916019705530021e-06, + "loss": 0.5018, + "step": 5725 + }, + { + "epoch": 2.7073286052009458, + "grad_norm": 2.576463460922241, + "learning_rate": 2.915404563624587e-06, + "loss": 0.4317, + "step": 5726 + }, + { + "epoch": 2.7078014184397166, + "grad_norm": 3.155565023422241, + "learning_rate": 2.9147893958537328e-06, + "loss": 0.5029, + "step": 5727 + }, + { + "epoch": 2.708274231678487, + "grad_norm": 2.604079008102417, + "learning_rate": 2.9141742022557622e-06, + "loss": 0.4324, + "step": 5728 + }, + { + "epoch": 2.7087470449172577, + "grad_norm": 2.6597228050231934, + "learning_rate": 2.913558982868979e-06, + "loss": 0.4335, + "step": 5729 + }, + { + "epoch": 2.7092198581560285, + "grad_norm": 2.811384439468384, + "learning_rate": 2.9129437377316923e-06, + "loss": 0.4031, + "step": 5730 + }, + { + "epoch": 2.709692671394799, + "grad_norm": 3.1041207313537598, + "learning_rate": 2.91232846688221e-06, + "loss": 0.481, + "step": 5731 + }, + { + "epoch": 2.7101654846335697, + "grad_norm": 2.5992188453674316, + "learning_rate": 2.9117131703588414e-06, + "loss": 0.4266, + "step": 5732 + }, + { + "epoch": 2.7106382978723405, + "grad_norm": 2.7726242542266846, + "learning_rate": 2.911097848199899e-06, + "loss": 0.4464, + "step": 5733 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 2.8683483600616455, + "learning_rate": 2.9104825004436966e-06, + "loss": 0.4248, + "step": 5734 + }, + { + "epoch": 2.711583924349882, + "grad_norm": 2.776386022567749, + "learning_rate": 2.9098671271285484e-06, + "loss": 0.4556, + "step": 5735 + }, + { + "epoch": 2.7120567375886524, + "grad_norm": 2.7612528800964355, + "learning_rate": 2.909251728292771e-06, + "loss": 0.455, + "step": 5736 + }, + { + "epoch": 2.712529550827423, + "grad_norm": 2.9223551750183105, + "learning_rate": 2.908636303974684e-06, + "loss": 0.4302, + "step": 5737 + }, + { + "epoch": 2.713002364066194, + "grad_norm": 2.898226022720337, + "learning_rate": 2.908020854212606e-06, + "loss": 0.4827, + "step": 5738 + }, + { + "epoch": 2.7134751773049643, + "grad_norm": 2.706361770629883, + "learning_rate": 2.9074053790448576e-06, + "loss": 0.4444, + "step": 5739 + }, + { + "epoch": 2.713947990543735, + "grad_norm": 2.8227248191833496, + "learning_rate": 2.9067898785097637e-06, + "loss": 0.4661, + "step": 5740 + }, + { + "epoch": 2.714420803782506, + "grad_norm": 2.597837448120117, + "learning_rate": 2.9061743526456474e-06, + "loss": 0.4646, + "step": 5741 + }, + { + "epoch": 2.7148936170212767, + "grad_norm": 2.5525131225585938, + "learning_rate": 2.9055588014908354e-06, + "loss": 0.4172, + "step": 5742 + }, + { + "epoch": 2.7153664302600475, + "grad_norm": 2.713071823120117, + "learning_rate": 2.904943225083655e-06, + "loss": 0.4893, + "step": 5743 + }, + { + "epoch": 2.715839243498818, + "grad_norm": 2.538623571395874, + "learning_rate": 2.9043276234624353e-06, + "loss": 0.3905, + "step": 5744 + }, + { + "epoch": 2.7163120567375887, + "grad_norm": 2.5190389156341553, + "learning_rate": 2.9037119966655076e-06, + "loss": 0.4318, + "step": 5745 + }, + { + "epoch": 2.7167848699763595, + "grad_norm": 2.6587612628936768, + "learning_rate": 2.903096344731204e-06, + "loss": 0.4153, + "step": 5746 + }, + { + "epoch": 2.71725768321513, + "grad_norm": 2.836731433868408, + "learning_rate": 
2.902480667697859e-06, + "loss": 0.4779, + "step": 5747 + }, + { + "epoch": 2.7177304964539006, + "grad_norm": 2.8076045513153076, + "learning_rate": 2.9018649656038074e-06, + "loss": 0.5126, + "step": 5748 + }, + { + "epoch": 2.7182033096926714, + "grad_norm": 2.8930516242980957, + "learning_rate": 2.9012492384873865e-06, + "loss": 0.4561, + "step": 5749 + }, + { + "epoch": 2.7186761229314422, + "grad_norm": 2.7000370025634766, + "learning_rate": 2.9006334863869343e-06, + "loss": 0.4659, + "step": 5750 + }, + { + "epoch": 2.719148936170213, + "grad_norm": 2.927011251449585, + "learning_rate": 2.9000177093407926e-06, + "loss": 0.5123, + "step": 5751 + }, + { + "epoch": 2.7196217494089834, + "grad_norm": 3.0102779865264893, + "learning_rate": 2.8994019073873015e-06, + "loss": 0.3972, + "step": 5752 + }, + { + "epoch": 2.720094562647754, + "grad_norm": 2.778838634490967, + "learning_rate": 2.8987860805648054e-06, + "loss": 0.4922, + "step": 5753 + }, + { + "epoch": 2.720567375886525, + "grad_norm": 2.6150314807891846, + "learning_rate": 2.898170228911648e-06, + "loss": 0.4425, + "step": 5754 + }, + { + "epoch": 2.7210401891252953, + "grad_norm": 2.9329984188079834, + "learning_rate": 2.8975543524661777e-06, + "loss": 0.4872, + "step": 5755 + }, + { + "epoch": 2.721513002364066, + "grad_norm": 2.756803512573242, + "learning_rate": 2.8969384512667404e-06, + "loss": 0.4362, + "step": 5756 + }, + { + "epoch": 2.721985815602837, + "grad_norm": 2.600877285003662, + "learning_rate": 2.896322525351686e-06, + "loss": 0.4802, + "step": 5757 + }, + { + "epoch": 2.7224586288416077, + "grad_norm": 2.647069215774536, + "learning_rate": 2.8957065747593655e-06, + "loss": 0.4649, + "step": 5758 + }, + { + "epoch": 2.7229314420803785, + "grad_norm": 2.845388174057007, + "learning_rate": 2.895090599528132e-06, + "loss": 0.4533, + "step": 5759 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 2.973881721496582, + "learning_rate": 2.8944745996963397e-06, + "loss": 0.4959, + "step": 5760 + }, + { + "epoch": 2.7238770685579197, + "grad_norm": 2.8995487689971924, + "learning_rate": 2.8938585753023435e-06, + "loss": 0.4597, + "step": 5761 + }, + { + "epoch": 2.7243498817966905, + "grad_norm": 2.903693437576294, + "learning_rate": 2.8932425263845004e-06, + "loss": 0.4521, + "step": 5762 + }, + { + "epoch": 2.724822695035461, + "grad_norm": 2.7609009742736816, + "learning_rate": 2.8926264529811702e-06, + "loss": 0.4399, + "step": 5763 + }, + { + "epoch": 2.7252955082742316, + "grad_norm": 2.788787603378296, + "learning_rate": 2.892010355130712e-06, + "loss": 0.4614, + "step": 5764 + }, + { + "epoch": 2.7257683215130024, + "grad_norm": 2.786498785018921, + "learning_rate": 2.8913942328714887e-06, + "loss": 0.4798, + "step": 5765 + }, + { + "epoch": 2.726241134751773, + "grad_norm": 2.9809393882751465, + "learning_rate": 2.8907780862418616e-06, + "loss": 0.5108, + "step": 5766 + }, + { + "epoch": 2.726713947990544, + "grad_norm": 2.6621177196502686, + "learning_rate": 2.8901619152801967e-06, + "loss": 0.4031, + "step": 5767 + }, + { + "epoch": 2.7271867612293144, + "grad_norm": 3.3092098236083984, + "learning_rate": 2.8895457200248607e-06, + "loss": 0.4671, + "step": 5768 + }, + { + "epoch": 2.727659574468085, + "grad_norm": 2.866306781768799, + "learning_rate": 2.8889295005142204e-06, + "loss": 0.4434, + "step": 5769 + }, + { + "epoch": 2.728132387706856, + "grad_norm": 2.6861231327056885, + "learning_rate": 2.888313256786646e-06, + "loss": 0.429, + "step": 5770 + }, + { + "epoch": 2.7286052009456263, + 
"grad_norm": 2.873180389404297, + "learning_rate": 2.8876969888805072e-06, + "loss": 0.4412, + "step": 5771 + }, + { + "epoch": 2.729078014184397, + "grad_norm": 2.511678695678711, + "learning_rate": 2.887080696834178e-06, + "loss": 0.4024, + "step": 5772 + }, + { + "epoch": 2.729550827423168, + "grad_norm": 2.6502726078033447, + "learning_rate": 2.88646438068603e-06, + "loss": 0.4357, + "step": 5773 + }, + { + "epoch": 2.7300236406619387, + "grad_norm": 2.7156145572662354, + "learning_rate": 2.8858480404744403e-06, + "loss": 0.4511, + "step": 5774 + }, + { + "epoch": 2.7304964539007095, + "grad_norm": 2.882582187652588, + "learning_rate": 2.8852316762377842e-06, + "loss": 0.4822, + "step": 5775 + }, + { + "epoch": 2.73096926713948, + "grad_norm": 2.7139666080474854, + "learning_rate": 2.8846152880144413e-06, + "loss": 0.4666, + "step": 5776 + }, + { + "epoch": 2.7314420803782506, + "grad_norm": 2.7453949451446533, + "learning_rate": 2.8839988758427907e-06, + "loss": 0.3927, + "step": 5777 + }, + { + "epoch": 2.731914893617021, + "grad_norm": 2.7859580516815186, + "learning_rate": 2.883382439761214e-06, + "loss": 0.4466, + "step": 5778 + }, + { + "epoch": 2.732387706855792, + "grad_norm": 2.695234537124634, + "learning_rate": 2.882765979808094e-06, + "loss": 0.4227, + "step": 5779 + }, + { + "epoch": 2.7328605200945626, + "grad_norm": 2.8081552982330322, + "learning_rate": 2.8821494960218148e-06, + "loss": 0.447, + "step": 5780 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 2.887643337249756, + "learning_rate": 2.881532988440762e-06, + "loss": 0.5018, + "step": 5781 + }, + { + "epoch": 2.733806146572104, + "grad_norm": 3.108212471008301, + "learning_rate": 2.8809164571033233e-06, + "loss": 0.4132, + "step": 5782 + }, + { + "epoch": 2.7342789598108745, + "grad_norm": 2.874328374862671, + "learning_rate": 2.880299902047886e-06, + "loss": 0.4618, + "step": 5783 + }, + { + "epoch": 2.7347517730496453, + "grad_norm": 3.089132308959961, + "learning_rate": 2.879683323312843e-06, + "loss": 0.4956, + "step": 5784 + }, + { + "epoch": 2.735224586288416, + "grad_norm": 2.5173206329345703, + "learning_rate": 2.879066720936583e-06, + "loss": 0.4087, + "step": 5785 + }, + { + "epoch": 2.7356973995271865, + "grad_norm": 2.6401286125183105, + "learning_rate": 2.8784500949575014e-06, + "loss": 0.3995, + "step": 5786 + }, + { + "epoch": 2.7361702127659573, + "grad_norm": 2.9371910095214844, + "learning_rate": 2.877833445413991e-06, + "loss": 0.5209, + "step": 5787 + }, + { + "epoch": 2.736643026004728, + "grad_norm": 3.218158006668091, + "learning_rate": 2.8772167723444498e-06, + "loss": 0.4275, + "step": 5788 + }, + { + "epoch": 2.737115839243499, + "grad_norm": 2.9072160720825195, + "learning_rate": 2.8766000757872736e-06, + "loss": 0.4244, + "step": 5789 + }, + { + "epoch": 2.7375886524822697, + "grad_norm": 3.0378096103668213, + "learning_rate": 2.8759833557808614e-06, + "loss": 0.507, + "step": 5790 + }, + { + "epoch": 2.73806146572104, + "grad_norm": 2.728353977203369, + "learning_rate": 2.8753666123636148e-06, + "loss": 0.413, + "step": 5791 + }, + { + "epoch": 2.738534278959811, + "grad_norm": 2.6869957447052, + "learning_rate": 2.874749845573935e-06, + "loss": 0.44, + "step": 5792 + }, + { + "epoch": 2.7390070921985816, + "grad_norm": 2.6381702423095703, + "learning_rate": 2.8741330554502263e-06, + "loss": 0.4708, + "step": 5793 + }, + { + "epoch": 2.739479905437352, + "grad_norm": 2.6944689750671387, + "learning_rate": 2.873516242030892e-06, + "loss": 0.4555, + "step": 5794 + }, + { + 
"epoch": 2.739952718676123, + "grad_norm": 3.168473243713379, + "learning_rate": 2.8728994053543396e-06, + "loss": 0.4538, + "step": 5795 + }, + { + "epoch": 2.7404255319148936, + "grad_norm": 2.7504515647888184, + "learning_rate": 2.872282545458976e-06, + "loss": 0.4628, + "step": 5796 + }, + { + "epoch": 2.7408983451536644, + "grad_norm": 2.896462917327881, + "learning_rate": 2.8716656623832114e-06, + "loss": 0.4946, + "step": 5797 + }, + { + "epoch": 2.741371158392435, + "grad_norm": 2.8053417205810547, + "learning_rate": 2.8710487561654547e-06, + "loss": 0.4893, + "step": 5798 + }, + { + "epoch": 2.7418439716312055, + "grad_norm": 2.63171124458313, + "learning_rate": 2.870431826844119e-06, + "loss": 0.4257, + "step": 5799 + }, + { + "epoch": 2.7423167848699763, + "grad_norm": 3.0963807106018066, + "learning_rate": 2.869814874457618e-06, + "loss": 0.5404, + "step": 5800 + }, + { + "epoch": 2.742789598108747, + "grad_norm": 2.591132164001465, + "learning_rate": 2.8691978990443664e-06, + "loss": 0.4015, + "step": 5801 + }, + { + "epoch": 2.7432624113475175, + "grad_norm": 3.0319552421569824, + "learning_rate": 2.8685809006427812e-06, + "loss": 0.4411, + "step": 5802 + }, + { + "epoch": 2.7437352245862883, + "grad_norm": 2.7791874408721924, + "learning_rate": 2.8679638792912784e-06, + "loss": 0.43, + "step": 5803 + }, + { + "epoch": 2.744208037825059, + "grad_norm": 3.530632495880127, + "learning_rate": 2.867346835028279e-06, + "loss": 0.4581, + "step": 5804 + }, + { + "epoch": 2.74468085106383, + "grad_norm": 3.2043099403381348, + "learning_rate": 2.8667297678922024e-06, + "loss": 0.4375, + "step": 5805 + }, + { + "epoch": 2.7451536643026007, + "grad_norm": 2.8442344665527344, + "learning_rate": 2.8661126779214716e-06, + "loss": 0.4059, + "step": 5806 + }, + { + "epoch": 2.745626477541371, + "grad_norm": 2.7561380863189697, + "learning_rate": 2.86549556515451e-06, + "loss": 0.4391, + "step": 5807 + }, + { + "epoch": 2.746099290780142, + "grad_norm": 3.229663848876953, + "learning_rate": 2.8648784296297418e-06, + "loss": 0.4579, + "step": 5808 + }, + { + "epoch": 2.7465721040189126, + "grad_norm": 2.8375027179718018, + "learning_rate": 2.864261271385593e-06, + "loss": 0.4566, + "step": 5809 + }, + { + "epoch": 2.747044917257683, + "grad_norm": 2.392998695373535, + "learning_rate": 2.863644090460493e-06, + "loss": 0.4123, + "step": 5810 + }, + { + "epoch": 2.7475177304964538, + "grad_norm": 2.707610607147217, + "learning_rate": 2.86302688689287e-06, + "loss": 0.4299, + "step": 5811 + }, + { + "epoch": 2.7479905437352246, + "grad_norm": 2.824042797088623, + "learning_rate": 2.8624096607211547e-06, + "loss": 0.3799, + "step": 5812 + }, + { + "epoch": 2.7484633569739954, + "grad_norm": 2.933102607727051, + "learning_rate": 2.861792411983779e-06, + "loss": 0.4416, + "step": 5813 + }, + { + "epoch": 2.748936170212766, + "grad_norm": 2.7000277042388916, + "learning_rate": 2.8611751407191757e-06, + "loss": 0.4346, + "step": 5814 + }, + { + "epoch": 2.7494089834515365, + "grad_norm": 2.6757142543792725, + "learning_rate": 2.86055784696578e-06, + "loss": 0.4559, + "step": 5815 + }, + { + "epoch": 2.7498817966903073, + "grad_norm": 2.5791053771972656, + "learning_rate": 2.8599405307620287e-06, + "loss": 0.4619, + "step": 5816 + }, + { + "epoch": 2.750354609929078, + "grad_norm": 3.0327374935150146, + "learning_rate": 2.859323192146359e-06, + "loss": 0.4163, + "step": 5817 + }, + { + "epoch": 2.7508274231678485, + "grad_norm": 2.5580220222473145, + "learning_rate": 2.8587058311572084e-06, + "loss": 
0.4005, + "step": 5818 + }, + { + "epoch": 2.7513002364066192, + "grad_norm": 2.592179536819458, + "learning_rate": 2.85808844783302e-06, + "loss": 0.4404, + "step": 5819 + }, + { + "epoch": 2.75177304964539, + "grad_norm": 3.2779927253723145, + "learning_rate": 2.8574710422122342e-06, + "loss": 0.54, + "step": 5820 + }, + { + "epoch": 2.752245862884161, + "grad_norm": 2.4804370403289795, + "learning_rate": 2.8568536143332933e-06, + "loss": 0.4476, + "step": 5821 + }, + { + "epoch": 2.7527186761229316, + "grad_norm": 2.649477481842041, + "learning_rate": 2.8562361642346427e-06, + "loss": 0.4336, + "step": 5822 + }, + { + "epoch": 2.753191489361702, + "grad_norm": 3.138587474822998, + "learning_rate": 2.855618691954728e-06, + "loss": 0.5042, + "step": 5823 + }, + { + "epoch": 2.753664302600473, + "grad_norm": 2.75093412399292, + "learning_rate": 2.855001197531997e-06, + "loss": 0.4327, + "step": 5824 + }, + { + "epoch": 2.7541371158392436, + "grad_norm": 2.678809642791748, + "learning_rate": 2.854383681004898e-06, + "loss": 0.4409, + "step": 5825 + }, + { + "epoch": 2.754609929078014, + "grad_norm": 2.965386390686035, + "learning_rate": 2.853766142411881e-06, + "loss": 0.4716, + "step": 5826 + }, + { + "epoch": 2.7550827423167847, + "grad_norm": 2.6419436931610107, + "learning_rate": 2.853148581791398e-06, + "loss": 0.4367, + "step": 5827 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 3.205794095993042, + "learning_rate": 2.8525309991819004e-06, + "loss": 0.4869, + "step": 5828 + }, + { + "epoch": 2.7560283687943263, + "grad_norm": 3.041008472442627, + "learning_rate": 2.851913394621844e-06, + "loss": 0.5087, + "step": 5829 + }, + { + "epoch": 2.756501182033097, + "grad_norm": 2.6525566577911377, + "learning_rate": 2.851295768149684e-06, + "loss": 0.3951, + "step": 5830 + }, + { + "epoch": 2.7569739952718675, + "grad_norm": 2.732220411300659, + "learning_rate": 2.850678119803876e-06, + "loss": 0.4797, + "step": 5831 + }, + { + "epoch": 2.7574468085106383, + "grad_norm": 2.8965251445770264, + "learning_rate": 2.8500604496228797e-06, + "loss": 0.4938, + "step": 5832 + }, + { + "epoch": 2.757919621749409, + "grad_norm": 2.48020076751709, + "learning_rate": 2.849442757645154e-06, + "loss": 0.4172, + "step": 5833 + }, + { + "epoch": 2.7583924349881794, + "grad_norm": 2.4764912128448486, + "learning_rate": 2.8488250439091603e-06, + "loss": 0.4123, + "step": 5834 + }, + { + "epoch": 2.7588652482269502, + "grad_norm": 2.4547016620635986, + "learning_rate": 2.84820730845336e-06, + "loss": 0.4116, + "step": 5835 + }, + { + "epoch": 2.759338061465721, + "grad_norm": 2.55476975440979, + "learning_rate": 2.847589551316218e-06, + "loss": 0.4744, + "step": 5836 + }, + { + "epoch": 2.759810874704492, + "grad_norm": 2.3866238594055176, + "learning_rate": 2.846971772536199e-06, + "loss": 0.4406, + "step": 5837 + }, + { + "epoch": 2.7602836879432626, + "grad_norm": 2.855318784713745, + "learning_rate": 2.8463539721517687e-06, + "loss": 0.4517, + "step": 5838 + }, + { + "epoch": 2.760756501182033, + "grad_norm": 2.527198314666748, + "learning_rate": 2.8457361502013954e-06, + "loss": 0.3588, + "step": 5839 + }, + { + "epoch": 2.7612293144208038, + "grad_norm": 2.6761462688446045, + "learning_rate": 2.8451183067235476e-06, + "loss": 0.4192, + "step": 5840 + }, + { + "epoch": 2.7617021276595746, + "grad_norm": 2.5692319869995117, + "learning_rate": 2.8445004417566967e-06, + "loss": 0.4108, + "step": 5841 + }, + { + "epoch": 2.762174940898345, + "grad_norm": 2.5721096992492676, + "learning_rate": 
2.8438825553393133e-06, + "loss": 0.3941, + "step": 5842 + }, + { + "epoch": 2.7626477541371157, + "grad_norm": 2.699430227279663, + "learning_rate": 2.843264647509872e-06, + "loss": 0.4418, + "step": 5843 + }, + { + "epoch": 2.7631205673758865, + "grad_norm": 2.6943318843841553, + "learning_rate": 2.842646718306846e-06, + "loss": 0.4505, + "step": 5844 + }, + { + "epoch": 2.7635933806146573, + "grad_norm": 2.661656379699707, + "learning_rate": 2.8420287677687107e-06, + "loss": 0.4413, + "step": 5845 + }, + { + "epoch": 2.764066193853428, + "grad_norm": 2.830467939376831, + "learning_rate": 2.8414107959339444e-06, + "loss": 0.5095, + "step": 5846 + }, + { + "epoch": 2.7645390070921985, + "grad_norm": 2.598053455352783, + "learning_rate": 2.840792802841024e-06, + "loss": 0.4029, + "step": 5847 + }, + { + "epoch": 2.7650118203309693, + "grad_norm": 2.641700029373169, + "learning_rate": 2.8401747885284316e-06, + "loss": 0.4237, + "step": 5848 + }, + { + "epoch": 2.76548463356974, + "grad_norm": 2.6672768592834473, + "learning_rate": 2.8395567530346454e-06, + "loss": 0.4181, + "step": 5849 + }, + { + "epoch": 2.7659574468085104, + "grad_norm": 2.5851705074310303, + "learning_rate": 2.838938696398149e-06, + "loss": 0.4165, + "step": 5850 + }, + { + "epoch": 2.766430260047281, + "grad_norm": 2.318120002746582, + "learning_rate": 2.8383206186574276e-06, + "loss": 0.3578, + "step": 5851 + }, + { + "epoch": 2.766903073286052, + "grad_norm": 2.6199793815612793, + "learning_rate": 2.8377025198509635e-06, + "loss": 0.4719, + "step": 5852 + }, + { + "epoch": 2.767375886524823, + "grad_norm": 2.7186086177825928, + "learning_rate": 2.837084400017245e-06, + "loss": 0.41, + "step": 5853 + }, + { + "epoch": 2.7678486997635936, + "grad_norm": 2.702514886856079, + "learning_rate": 2.8364662591947583e-06, + "loss": 0.4659, + "step": 5854 + }, + { + "epoch": 2.768321513002364, + "grad_norm": 2.612375259399414, + "learning_rate": 2.835848097421993e-06, + "loss": 0.4252, + "step": 5855 + }, + { + "epoch": 2.7687943262411348, + "grad_norm": 3.0127978324890137, + "learning_rate": 2.8352299147374394e-06, + "loss": 0.4084, + "step": 5856 + }, + { + "epoch": 2.7692671394799055, + "grad_norm": 2.6460049152374268, + "learning_rate": 2.83461171117959e-06, + "loss": 0.4035, + "step": 5857 + }, + { + "epoch": 2.769739952718676, + "grad_norm": 2.9844725131988525, + "learning_rate": 2.8339934867869357e-06, + "loss": 0.4912, + "step": 5858 + }, + { + "epoch": 2.7702127659574467, + "grad_norm": 2.731217861175537, + "learning_rate": 2.833375241597972e-06, + "loss": 0.4112, + "step": 5859 + }, + { + "epoch": 2.7706855791962175, + "grad_norm": 2.731194496154785, + "learning_rate": 2.832756975651193e-06, + "loss": 0.4516, + "step": 5860 + }, + { + "epoch": 2.7711583924349883, + "grad_norm": 3.0532076358795166, + "learning_rate": 2.8321386889850965e-06, + "loss": 0.3959, + "step": 5861 + }, + { + "epoch": 2.771631205673759, + "grad_norm": 3.5437800884246826, + "learning_rate": 2.831520381638181e-06, + "loss": 0.6055, + "step": 5862 + }, + { + "epoch": 2.7721040189125294, + "grad_norm": 2.4297714233398438, + "learning_rate": 2.830902053648944e-06, + "loss": 0.4038, + "step": 5863 + }, + { + "epoch": 2.7725768321513002, + "grad_norm": 2.696768045425415, + "learning_rate": 2.8302837050558876e-06, + "loss": 0.3983, + "step": 5864 + }, + { + "epoch": 2.773049645390071, + "grad_norm": 2.6574649810791016, + "learning_rate": 2.8296653358975122e-06, + "loss": 0.4937, + "step": 5865 + }, + { + "epoch": 2.7735224586288414, + "grad_norm": 
2.9393341541290283, + "learning_rate": 2.8290469462123234e-06, + "loss": 0.4603, + "step": 5866 + }, + { + "epoch": 2.773995271867612, + "grad_norm": 2.7630696296691895, + "learning_rate": 2.828428536038824e-06, + "loss": 0.4663, + "step": 5867 + }, + { + "epoch": 2.774468085106383, + "grad_norm": 2.7354233264923096, + "learning_rate": 2.8278101054155183e-06, + "loss": 0.4444, + "step": 5868 + }, + { + "epoch": 2.774940898345154, + "grad_norm": 3.0489425659179688, + "learning_rate": 2.827191654380915e-06, + "loss": 0.4684, + "step": 5869 + }, + { + "epoch": 2.7754137115839246, + "grad_norm": 2.9602572917938232, + "learning_rate": 2.8265731829735226e-06, + "loss": 0.4571, + "step": 5870 + }, + { + "epoch": 2.775886524822695, + "grad_norm": 2.774132013320923, + "learning_rate": 2.825954691231851e-06, + "loss": 0.4458, + "step": 5871 + }, + { + "epoch": 2.7763593380614657, + "grad_norm": 2.696622133255005, + "learning_rate": 2.825336179194409e-06, + "loss": 0.4933, + "step": 5872 + }, + { + "epoch": 2.7768321513002365, + "grad_norm": 2.742184638977051, + "learning_rate": 2.8247176468997096e-06, + "loss": 0.4464, + "step": 5873 + }, + { + "epoch": 2.777304964539007, + "grad_norm": 2.7033183574676514, + "learning_rate": 2.824099094386266e-06, + "loss": 0.4369, + "step": 5874 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 2.7264044284820557, + "learning_rate": 2.8234805216925935e-06, + "loss": 0.4621, + "step": 5875 + }, + { + "epoch": 2.7782505910165485, + "grad_norm": 2.6417739391326904, + "learning_rate": 2.822861928857208e-06, + "loss": 0.4254, + "step": 5876 + }, + { + "epoch": 2.7787234042553193, + "grad_norm": 3.17209529876709, + "learning_rate": 2.8222433159186245e-06, + "loss": 0.5011, + "step": 5877 + }, + { + "epoch": 2.77919621749409, + "grad_norm": 3.1434381008148193, + "learning_rate": 2.8216246829153633e-06, + "loss": 0.4567, + "step": 5878 + }, + { + "epoch": 2.7796690307328604, + "grad_norm": 2.781608819961548, + "learning_rate": 2.821006029885943e-06, + "loss": 0.4723, + "step": 5879 + }, + { + "epoch": 2.780141843971631, + "grad_norm": 3.00079345703125, + "learning_rate": 2.820387356868885e-06, + "loss": 0.4796, + "step": 5880 + }, + { + "epoch": 2.780614657210402, + "grad_norm": 2.703555107116699, + "learning_rate": 2.819768663902712e-06, + "loss": 0.4577, + "step": 5881 + }, + { + "epoch": 2.7810874704491724, + "grad_norm": 2.5741801261901855, + "learning_rate": 2.8191499510259453e-06, + "loss": 0.4255, + "step": 5882 + }, + { + "epoch": 2.781560283687943, + "grad_norm": 2.9871208667755127, + "learning_rate": 2.8185312182771112e-06, + "loss": 0.4495, + "step": 5883 + }, + { + "epoch": 2.782033096926714, + "grad_norm": 2.525317668914795, + "learning_rate": 2.8179124656947343e-06, + "loss": 0.4428, + "step": 5884 + }, + { + "epoch": 2.7825059101654848, + "grad_norm": 2.525092840194702, + "learning_rate": 2.817293693317343e-06, + "loss": 0.4348, + "step": 5885 + }, + { + "epoch": 2.7829787234042556, + "grad_norm": 2.8485171794891357, + "learning_rate": 2.816674901183464e-06, + "loss": 0.4206, + "step": 5886 + }, + { + "epoch": 2.783451536643026, + "grad_norm": 2.6612746715545654, + "learning_rate": 2.8160560893316272e-06, + "loss": 0.396, + "step": 5887 + }, + { + "epoch": 2.7839243498817967, + "grad_norm": 2.7093865871429443, + "learning_rate": 2.815437257800364e-06, + "loss": 0.4468, + "step": 5888 + }, + { + "epoch": 2.7843971631205675, + "grad_norm": 2.6130900382995605, + "learning_rate": 2.814818406628206e-06, + "loss": 0.443, + "step": 5889 + }, + { + "epoch": 
2.784869976359338, + "grad_norm": 2.8147552013397217, + "learning_rate": 2.8141995358536866e-06, + "loss": 0.4454, + "step": 5890 + }, + { + "epoch": 2.7853427895981087, + "grad_norm": 2.5621275901794434, + "learning_rate": 2.8135806455153395e-06, + "loss": 0.439, + "step": 5891 + }, + { + "epoch": 2.7858156028368795, + "grad_norm": 2.880228281021118, + "learning_rate": 2.812961735651701e-06, + "loss": 0.3895, + "step": 5892 + }, + { + "epoch": 2.7862884160756503, + "grad_norm": 2.5861377716064453, + "learning_rate": 2.8123428063013068e-06, + "loss": 0.4402, + "step": 5893 + }, + { + "epoch": 2.786761229314421, + "grad_norm": 2.9707765579223633, + "learning_rate": 2.811723857502696e-06, + "loss": 0.4461, + "step": 5894 + }, + { + "epoch": 2.7872340425531914, + "grad_norm": 2.923999309539795, + "learning_rate": 2.811104889294408e-06, + "loss": 0.4395, + "step": 5895 + }, + { + "epoch": 2.787706855791962, + "grad_norm": 2.846933603286743, + "learning_rate": 2.810485901714981e-06, + "loss": 0.5168, + "step": 5896 + }, + { + "epoch": 2.788179669030733, + "grad_norm": 4.1052350997924805, + "learning_rate": 2.8098668948029597e-06, + "loss": 0.5152, + "step": 5897 + }, + { + "epoch": 2.7886524822695034, + "grad_norm": 2.7391018867492676, + "learning_rate": 2.8092478685968856e-06, + "loss": 0.4515, + "step": 5898 + }, + { + "epoch": 2.789125295508274, + "grad_norm": 2.976088285446167, + "learning_rate": 2.8086288231353027e-06, + "loss": 0.5156, + "step": 5899 + }, + { + "epoch": 2.789598108747045, + "grad_norm": 2.6139633655548096, + "learning_rate": 2.8080097584567562e-06, + "loss": 0.4237, + "step": 5900 + }, + { + "epoch": 2.7900709219858157, + "grad_norm": 2.501654624938965, + "learning_rate": 2.807390674599792e-06, + "loss": 0.4349, + "step": 5901 + }, + { + "epoch": 2.7905437352245865, + "grad_norm": 2.8814525604248047, + "learning_rate": 2.8067715716029586e-06, + "loss": 0.4866, + "step": 5902 + }, + { + "epoch": 2.791016548463357, + "grad_norm": 2.7953200340270996, + "learning_rate": 2.8061524495048046e-06, + "loss": 0.3964, + "step": 5903 + }, + { + "epoch": 2.7914893617021277, + "grad_norm": 2.7362849712371826, + "learning_rate": 2.8055333083438808e-06, + "loss": 0.4181, + "step": 5904 + }, + { + "epoch": 2.7919621749408985, + "grad_norm": 2.9740512371063232, + "learning_rate": 2.8049141481587366e-06, + "loss": 0.4784, + "step": 5905 + }, + { + "epoch": 2.792434988179669, + "grad_norm": 2.595813274383545, + "learning_rate": 2.8042949689879262e-06, + "loss": 0.4421, + "step": 5906 + }, + { + "epoch": 2.7929078014184396, + "grad_norm": 2.886899948120117, + "learning_rate": 2.803675770870002e-06, + "loss": 0.4435, + "step": 5907 + }, + { + "epoch": 2.7933806146572104, + "grad_norm": 2.6057486534118652, + "learning_rate": 2.8030565538435196e-06, + "loss": 0.4472, + "step": 5908 + }, + { + "epoch": 2.7938534278959812, + "grad_norm": 2.7422802448272705, + "learning_rate": 2.802437317947034e-06, + "loss": 0.4799, + "step": 5909 + }, + { + "epoch": 2.794326241134752, + "grad_norm": 2.3904244899749756, + "learning_rate": 2.801818063219102e-06, + "loss": 0.4508, + "step": 5910 + }, + { + "epoch": 2.7947990543735224, + "grad_norm": 2.8434207439422607, + "learning_rate": 2.8011987896982835e-06, + "loss": 0.4473, + "step": 5911 + }, + { + "epoch": 2.795271867612293, + "grad_norm": 2.916088819503784, + "learning_rate": 2.8005794974231366e-06, + "loss": 0.464, + "step": 5912 + }, + { + "epoch": 2.795744680851064, + "grad_norm": 2.6483397483825684, + "learning_rate": 2.7999601864322236e-06, + "loss": 
0.441, + "step": 5913 + }, + { + "epoch": 2.7962174940898343, + "grad_norm": 2.9287428855895996, + "learning_rate": 2.7993408567641033e-06, + "loss": 0.4551, + "step": 5914 + }, + { + "epoch": 2.796690307328605, + "grad_norm": 2.575024127960205, + "learning_rate": 2.798721508457342e-06, + "loss": 0.4494, + "step": 5915 + }, + { + "epoch": 2.797163120567376, + "grad_norm": 2.7156829833984375, + "learning_rate": 2.7981021415505015e-06, + "loss": 0.419, + "step": 5916 + }, + { + "epoch": 2.7976359338061467, + "grad_norm": 2.850553035736084, + "learning_rate": 2.7974827560821482e-06, + "loss": 0.4709, + "step": 5917 + }, + { + "epoch": 2.7981087470449175, + "grad_norm": 2.673846483230591, + "learning_rate": 2.796863352090847e-06, + "loss": 0.4224, + "step": 5918 + }, + { + "epoch": 2.798581560283688, + "grad_norm": 2.9093217849731445, + "learning_rate": 2.796243929615168e-06, + "loss": 0.468, + "step": 5919 + }, + { + "epoch": 2.7990543735224587, + "grad_norm": 2.4853813648223877, + "learning_rate": 2.7956244886936775e-06, + "loss": 0.4723, + "step": 5920 + }, + { + "epoch": 2.7995271867612295, + "grad_norm": 3.026428461074829, + "learning_rate": 2.795005029364946e-06, + "loss": 0.4721, + "step": 5921 + }, + { + "epoch": 2.8, + "grad_norm": 2.886295795440674, + "learning_rate": 2.794385551667546e-06, + "loss": 0.456, + "step": 5922 + }, + { + "epoch": 2.8004728132387706, + "grad_norm": 3.2260656356811523, + "learning_rate": 2.7937660556400486e-06, + "loss": 0.4499, + "step": 5923 + }, + { + "epoch": 2.8009456264775414, + "grad_norm": 2.7971982955932617, + "learning_rate": 2.793146541321027e-06, + "loss": 0.3982, + "step": 5924 + }, + { + "epoch": 2.801418439716312, + "grad_norm": 2.85461163520813, + "learning_rate": 2.7925270087490546e-06, + "loss": 0.4841, + "step": 5925 + }, + { + "epoch": 2.801891252955083, + "grad_norm": 3.0642316341400146, + "learning_rate": 2.7919074579627086e-06, + "loss": 0.4538, + "step": 5926 + }, + { + "epoch": 2.8023640661938534, + "grad_norm": 2.9053616523742676, + "learning_rate": 2.7912878890005657e-06, + "loss": 0.434, + "step": 5927 + }, + { + "epoch": 2.802836879432624, + "grad_norm": 2.7649240493774414, + "learning_rate": 2.7906683019012027e-06, + "loss": 0.414, + "step": 5928 + }, + { + "epoch": 2.803309692671395, + "grad_norm": 2.8717660903930664, + "learning_rate": 2.7900486967031987e-06, + "loss": 0.4337, + "step": 5929 + }, + { + "epoch": 2.8037825059101653, + "grad_norm": 2.6860995292663574, + "learning_rate": 2.789429073445135e-06, + "loss": 0.447, + "step": 5930 + }, + { + "epoch": 2.804255319148936, + "grad_norm": 2.67509126663208, + "learning_rate": 2.7888094321655918e-06, + "loss": 0.4955, + "step": 5931 + }, + { + "epoch": 2.804728132387707, + "grad_norm": 2.7426326274871826, + "learning_rate": 2.7881897729031514e-06, + "loss": 0.4564, + "step": 5932 + }, + { + "epoch": 2.8052009456264777, + "grad_norm": 2.7087252140045166, + "learning_rate": 2.7875700956963973e-06, + "loss": 0.4571, + "step": 5933 + }, + { + "epoch": 2.8056737588652485, + "grad_norm": 2.513526439666748, + "learning_rate": 2.7869504005839147e-06, + "loss": 0.4361, + "step": 5934 + }, + { + "epoch": 2.806146572104019, + "grad_norm": 3.2246084213256836, + "learning_rate": 2.7863306876042885e-06, + "loss": 0.4612, + "step": 5935 + }, + { + "epoch": 2.8066193853427897, + "grad_norm": 3.226325511932373, + "learning_rate": 2.7857109567961066e-06, + "loss": 0.4528, + "step": 5936 + }, + { + "epoch": 2.8070921985815604, + "grad_norm": 2.8861422538757324, + "learning_rate": 
2.785091208197956e-06, + "loss": 0.5049, + "step": 5937 + }, + { + "epoch": 2.807565011820331, + "grad_norm": 2.76279616355896, + "learning_rate": 2.7844714418484257e-06, + "loss": 0.4714, + "step": 5938 + }, + { + "epoch": 2.8080378250591016, + "grad_norm": 2.9591920375823975, + "learning_rate": 2.7838516577861063e-06, + "loss": 0.4633, + "step": 5939 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 2.536916971206665, + "learning_rate": 2.7832318560495885e-06, + "loss": 0.4108, + "step": 5940 + }, + { + "epoch": 2.808983451536643, + "grad_norm": 3.2484991550445557, + "learning_rate": 2.7826120366774657e-06, + "loss": 0.4888, + "step": 5941 + }, + { + "epoch": 2.8094562647754135, + "grad_norm": 2.7129359245300293, + "learning_rate": 2.781992199708329e-06, + "loss": 0.4008, + "step": 5942 + }, + { + "epoch": 2.8099290780141843, + "grad_norm": 2.4176113605499268, + "learning_rate": 2.781372345180776e-06, + "loss": 0.3864, + "step": 5943 + }, + { + "epoch": 2.810401891252955, + "grad_norm": 2.6557252407073975, + "learning_rate": 2.7807524731334e-06, + "loss": 0.4295, + "step": 5944 + }, + { + "epoch": 2.8108747044917255, + "grad_norm": 2.9191324710845947, + "learning_rate": 2.7801325836047993e-06, + "loss": 0.4854, + "step": 5945 + }, + { + "epoch": 2.8113475177304963, + "grad_norm": 2.6325371265411377, + "learning_rate": 2.7795126766335705e-06, + "loss": 0.4332, + "step": 5946 + }, + { + "epoch": 2.811820330969267, + "grad_norm": 2.658337116241455, + "learning_rate": 2.778892752258314e-06, + "loss": 0.4276, + "step": 5947 + }, + { + "epoch": 2.812293144208038, + "grad_norm": 2.763782262802124, + "learning_rate": 2.778272810517627e-06, + "loss": 0.4246, + "step": 5948 + }, + { + "epoch": 2.8127659574468087, + "grad_norm": 2.407607078552246, + "learning_rate": 2.777652851450113e-06, + "loss": 0.3788, + "step": 5949 + }, + { + "epoch": 2.813238770685579, + "grad_norm": 3.0339951515197754, + "learning_rate": 2.7770328750943736e-06, + "loss": 0.477, + "step": 5950 + }, + { + "epoch": 2.81371158392435, + "grad_norm": 2.3475773334503174, + "learning_rate": 2.776412881489012e-06, + "loss": 0.4206, + "step": 5951 + }, + { + "epoch": 2.8141843971631206, + "grad_norm": 3.0455260276794434, + "learning_rate": 2.7757928706726318e-06, + "loss": 0.4301, + "step": 5952 + }, + { + "epoch": 2.814657210401891, + "grad_norm": 2.803920030593872, + "learning_rate": 2.7751728426838386e-06, + "loss": 0.3738, + "step": 5953 + }, + { + "epoch": 2.815130023640662, + "grad_norm": 3.1083319187164307, + "learning_rate": 2.77455279756124e-06, + "loss": 0.5365, + "step": 5954 + }, + { + "epoch": 2.8156028368794326, + "grad_norm": 3.180809497833252, + "learning_rate": 2.7739327353434427e-06, + "loss": 0.4789, + "step": 5955 + }, + { + "epoch": 2.8160756501182034, + "grad_norm": 2.975043773651123, + "learning_rate": 2.7733126560690543e-06, + "loss": 0.4798, + "step": 5956 + }, + { + "epoch": 2.816548463356974, + "grad_norm": 2.765475034713745, + "learning_rate": 2.772692559776685e-06, + "loss": 0.4206, + "step": 5957 + }, + { + "epoch": 2.8170212765957445, + "grad_norm": 2.48612380027771, + "learning_rate": 2.7720724465049463e-06, + "loss": 0.4234, + "step": 5958 + }, + { + "epoch": 2.8174940898345153, + "grad_norm": 2.7145729064941406, + "learning_rate": 2.77145231629245e-06, + "loss": 0.4713, + "step": 5959 + }, + { + "epoch": 2.817966903073286, + "grad_norm": 2.5993762016296387, + "learning_rate": 2.7708321691778074e-06, + "loss": 0.4144, + "step": 5960 + }, + { + "epoch": 2.8184397163120565, + "grad_norm": 
3.0902538299560547, + "learning_rate": 2.770212005199633e-06, + "loss": 0.4822, + "step": 5961 + }, + { + "epoch": 2.8189125295508273, + "grad_norm": 2.849757671356201, + "learning_rate": 2.7695918243965424e-06, + "loss": 0.4449, + "step": 5962 + }, + { + "epoch": 2.819385342789598, + "grad_norm": 2.77148699760437, + "learning_rate": 2.768971626807151e-06, + "loss": 0.4448, + "step": 5963 + }, + { + "epoch": 2.819858156028369, + "grad_norm": 2.7865898609161377, + "learning_rate": 2.7683514124700757e-06, + "loss": 0.4944, + "step": 5964 + }, + { + "epoch": 2.8203309692671397, + "grad_norm": 2.9057955741882324, + "learning_rate": 2.767731181423934e-06, + "loss": 0.5074, + "step": 5965 + }, + { + "epoch": 2.82080378250591, + "grad_norm": 2.725837469100952, + "learning_rate": 2.7671109337073465e-06, + "loss": 0.4207, + "step": 5966 + }, + { + "epoch": 2.821276595744681, + "grad_norm": 3.078531265258789, + "learning_rate": 2.7664906693589315e-06, + "loss": 0.4835, + "step": 5967 + }, + { + "epoch": 2.8217494089834516, + "grad_norm": 2.8692002296447754, + "learning_rate": 2.765870388417312e-06, + "loss": 0.4284, + "step": 5968 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 2.8519723415374756, + "learning_rate": 2.765250090921109e-06, + "loss": 0.541, + "step": 5969 + }, + { + "epoch": 2.8226950354609928, + "grad_norm": 3.2037532329559326, + "learning_rate": 2.7646297769089457e-06, + "loss": 0.4276, + "step": 5970 + }, + { + "epoch": 2.8231678486997636, + "grad_norm": 2.8637137413024902, + "learning_rate": 2.7640094464194468e-06, + "loss": 0.4904, + "step": 5971 + }, + { + "epoch": 2.8236406619385344, + "grad_norm": 2.681516408920288, + "learning_rate": 2.7633890994912372e-06, + "loss": 0.4942, + "step": 5972 + }, + { + "epoch": 2.824113475177305, + "grad_norm": 3.0035219192504883, + "learning_rate": 2.7627687361629434e-06, + "loss": 0.4556, + "step": 5973 + }, + { + "epoch": 2.8245862884160755, + "grad_norm": 2.8107759952545166, + "learning_rate": 2.7621483564731923e-06, + "loss": 0.4225, + "step": 5974 + }, + { + "epoch": 2.8250591016548463, + "grad_norm": 2.87276029586792, + "learning_rate": 2.7615279604606126e-06, + "loss": 0.5045, + "step": 5975 + }, + { + "epoch": 2.825531914893617, + "grad_norm": 2.687953233718872, + "learning_rate": 2.760907548163833e-06, + "loss": 0.4018, + "step": 5976 + }, + { + "epoch": 2.8260047281323875, + "grad_norm": 2.587979555130005, + "learning_rate": 2.760287119621486e-06, + "loss": 0.4407, + "step": 5977 + }, + { + "epoch": 2.8264775413711583, + "grad_norm": 2.805602550506592, + "learning_rate": 2.7596666748722e-06, + "loss": 0.4559, + "step": 5978 + }, + { + "epoch": 2.826950354609929, + "grad_norm": 2.320763111114502, + "learning_rate": 2.759046213954609e-06, + "loss": 0.3847, + "step": 5979 + }, + { + "epoch": 2.8274231678487, + "grad_norm": 2.6876401901245117, + "learning_rate": 2.758425736907347e-06, + "loss": 0.4528, + "step": 5980 + }, + { + "epoch": 2.8278959810874706, + "grad_norm": 2.6852915287017822, + "learning_rate": 2.757805243769046e-06, + "loss": 0.395, + "step": 5981 + }, + { + "epoch": 2.828368794326241, + "grad_norm": 2.808326005935669, + "learning_rate": 2.7571847345783447e-06, + "loss": 0.4647, + "step": 5982 + }, + { + "epoch": 2.828841607565012, + "grad_norm": 2.641479015350342, + "learning_rate": 2.7565642093738766e-06, + "loss": 0.3798, + "step": 5983 + }, + { + "epoch": 2.8293144208037826, + "grad_norm": 2.8066110610961914, + "learning_rate": 2.7559436681942803e-06, + "loss": 0.5072, + "step": 5984 + }, + { + "epoch": 
2.829787234042553, + "grad_norm": 2.898375988006592, + "learning_rate": 2.7553231110781936e-06, + "loss": 0.5182, + "step": 5985 + }, + { + "epoch": 2.8302600472813237, + "grad_norm": 2.704890489578247, + "learning_rate": 2.7547025380642574e-06, + "loss": 0.3999, + "step": 5986 + }, + { + "epoch": 2.8307328605200945, + "grad_norm": 2.6024270057678223, + "learning_rate": 2.7540819491911106e-06, + "loss": 0.4302, + "step": 5987 + }, + { + "epoch": 2.8312056737588653, + "grad_norm": 2.8006081581115723, + "learning_rate": 2.7534613444973946e-06, + "loss": 0.4492, + "step": 5988 + }, + { + "epoch": 2.831678486997636, + "grad_norm": 2.9532058238983154, + "learning_rate": 2.752840724021752e-06, + "loss": 0.4552, + "step": 5989 + }, + { + "epoch": 2.8321513002364065, + "grad_norm": 3.1830217838287354, + "learning_rate": 2.7522200878028265e-06, + "loss": 0.5013, + "step": 5990 + }, + { + "epoch": 2.8326241134751773, + "grad_norm": 2.716176748275757, + "learning_rate": 2.7515994358792624e-06, + "loss": 0.4569, + "step": 5991 + }, + { + "epoch": 2.833096926713948, + "grad_norm": 2.6852715015411377, + "learning_rate": 2.7509787682897044e-06, + "loss": 0.4764, + "step": 5992 + }, + { + "epoch": 2.8335697399527184, + "grad_norm": 2.9383316040039062, + "learning_rate": 2.7503580850727985e-06, + "loss": 0.5205, + "step": 5993 + }, + { + "epoch": 2.8340425531914892, + "grad_norm": 2.703132152557373, + "learning_rate": 2.749737386267193e-06, + "loss": 0.4543, + "step": 5994 + }, + { + "epoch": 2.83451536643026, + "grad_norm": 2.4304885864257812, + "learning_rate": 2.7491166719115354e-06, + "loss": 0.4479, + "step": 5995 + }, + { + "epoch": 2.834988179669031, + "grad_norm": 2.975722551345825, + "learning_rate": 2.748495942044475e-06, + "loss": 0.4074, + "step": 5996 + }, + { + "epoch": 2.8354609929078016, + "grad_norm": 3.440208911895752, + "learning_rate": 2.7478751967046617e-06, + "loss": 0.4497, + "step": 5997 + }, + { + "epoch": 2.835933806146572, + "grad_norm": 2.734673261642456, + "learning_rate": 2.747254435930747e-06, + "loss": 0.437, + "step": 5998 + }, + { + "epoch": 2.8364066193853428, + "grad_norm": 3.1918959617614746, + "learning_rate": 2.7466336597613826e-06, + "loss": 0.4197, + "step": 5999 + }, + { + "epoch": 2.8368794326241136, + "grad_norm": 3.1440329551696777, + "learning_rate": 2.7460128682352216e-06, + "loss": 0.4425, + "step": 6000 + }, + { + "epoch": 2.837352245862884, + "grad_norm": 2.582993507385254, + "learning_rate": 2.7453920613909183e-06, + "loss": 0.4475, + "step": 6001 + }, + { + "epoch": 2.8378250591016547, + "grad_norm": 3.2682149410247803, + "learning_rate": 2.744771239267128e-06, + "loss": 0.4615, + "step": 6002 + }, + { + "epoch": 2.8382978723404255, + "grad_norm": 2.848477840423584, + "learning_rate": 2.7441504019025046e-06, + "loss": 0.4093, + "step": 6003 + }, + { + "epoch": 2.8387706855791963, + "grad_norm": 2.3582282066345215, + "learning_rate": 2.7435295493357067e-06, + "loss": 0.3911, + "step": 6004 + }, + { + "epoch": 2.839243498817967, + "grad_norm": 2.7707207202911377, + "learning_rate": 2.742908681605392e-06, + "loss": 0.4069, + "step": 6005 + }, + { + "epoch": 2.8397163120567375, + "grad_norm": 3.0763752460479736, + "learning_rate": 2.7422877987502183e-06, + "loss": 0.512, + "step": 6006 + }, + { + "epoch": 2.8401891252955083, + "grad_norm": 2.8027124404907227, + "learning_rate": 2.741666900808846e-06, + "loss": 0.4922, + "step": 6007 + }, + { + "epoch": 2.840661938534279, + "grad_norm": 2.487982988357544, + "learning_rate": 2.7410459878199353e-06, + "loss": 
0.4368, + "step": 6008 + }, + { + "epoch": 2.8411347517730494, + "grad_norm": 2.8727993965148926, + "learning_rate": 2.7404250598221484e-06, + "loss": 0.4639, + "step": 6009 + }, + { + "epoch": 2.84160756501182, + "grad_norm": 2.5556678771972656, + "learning_rate": 2.739804116854147e-06, + "loss": 0.4217, + "step": 6010 + }, + { + "epoch": 2.842080378250591, + "grad_norm": 2.6306912899017334, + "learning_rate": 2.7391831589545948e-06, + "loss": 0.4816, + "step": 6011 + }, + { + "epoch": 2.842553191489362, + "grad_norm": 2.7340946197509766, + "learning_rate": 2.7385621861621557e-06, + "loss": 0.4113, + "step": 6012 + }, + { + "epoch": 2.8430260047281326, + "grad_norm": 2.834190607070923, + "learning_rate": 2.737941198515495e-06, + "loss": 0.4691, + "step": 6013 + }, + { + "epoch": 2.843498817966903, + "grad_norm": 2.7139697074890137, + "learning_rate": 2.737320196053281e-06, + "loss": 0.3798, + "step": 6014 + }, + { + "epoch": 2.8439716312056738, + "grad_norm": 2.7934985160827637, + "learning_rate": 2.736699178814177e-06, + "loss": 0.446, + "step": 6015 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 2.6941518783569336, + "learning_rate": 2.7360781468368534e-06, + "loss": 0.4787, + "step": 6016 + }, + { + "epoch": 2.844917257683215, + "grad_norm": 3.1530468463897705, + "learning_rate": 2.7354571001599792e-06, + "loss": 0.474, + "step": 6017 + }, + { + "epoch": 2.8453900709219857, + "grad_norm": 2.613875389099121, + "learning_rate": 2.7348360388222243e-06, + "loss": 0.4297, + "step": 6018 + }, + { + "epoch": 2.8458628841607565, + "grad_norm": 2.5481486320495605, + "learning_rate": 2.7342149628622587e-06, + "loss": 0.3762, + "step": 6019 + }, + { + "epoch": 2.8463356973995273, + "grad_norm": 2.6425609588623047, + "learning_rate": 2.7335938723187544e-06, + "loss": 0.4077, + "step": 6020 + }, + { + "epoch": 2.846808510638298, + "grad_norm": 2.6281731128692627, + "learning_rate": 2.7329727672303836e-06, + "loss": 0.466, + "step": 6021 + }, + { + "epoch": 2.8472813238770684, + "grad_norm": 2.8862180709838867, + "learning_rate": 2.7323516476358197e-06, + "loss": 0.4191, + "step": 6022 + }, + { + "epoch": 2.8477541371158392, + "grad_norm": 2.907731533050537, + "learning_rate": 2.7317305135737383e-06, + "loss": 0.4867, + "step": 6023 + }, + { + "epoch": 2.84822695035461, + "grad_norm": 2.825593948364258, + "learning_rate": 2.731109365082814e-06, + "loss": 0.4888, + "step": 6024 + }, + { + "epoch": 2.8486997635933804, + "grad_norm": 2.478163003921509, + "learning_rate": 2.730488202201722e-06, + "loss": 0.4714, + "step": 6025 + }, + { + "epoch": 2.849172576832151, + "grad_norm": 2.928899049758911, + "learning_rate": 2.7298670249691418e-06, + "loss": 0.4671, + "step": 6026 + }, + { + "epoch": 2.849645390070922, + "grad_norm": 2.778256893157959, + "learning_rate": 2.7292458334237488e-06, + "loss": 0.429, + "step": 6027 + }, + { + "epoch": 2.850118203309693, + "grad_norm": 3.0689055919647217, + "learning_rate": 2.7286246276042234e-06, + "loss": 0.4727, + "step": 6028 + }, + { + "epoch": 2.8505910165484636, + "grad_norm": 2.582066774368286, + "learning_rate": 2.7280034075492447e-06, + "loss": 0.4025, + "step": 6029 + }, + { + "epoch": 2.851063829787234, + "grad_norm": 3.6679015159606934, + "learning_rate": 2.7273821732974936e-06, + "loss": 0.4856, + "step": 6030 + }, + { + "epoch": 2.8515366430260047, + "grad_norm": 2.7222588062286377, + "learning_rate": 2.7267609248876516e-06, + "loss": 0.4255, + "step": 6031 + }, + { + "epoch": 2.8520094562647755, + "grad_norm": 2.455038547515869, + 
"learning_rate": 2.726139662358401e-06, + "loss": 0.4234, + "step": 6032 + }, + { + "epoch": 2.852482269503546, + "grad_norm": 2.8277318477630615, + "learning_rate": 2.7255183857484253e-06, + "loss": 0.4146, + "step": 6033 + }, + { + "epoch": 2.8529550827423167, + "grad_norm": 2.523615837097168, + "learning_rate": 2.724897095096409e-06, + "loss": 0.4227, + "step": 6034 + }, + { + "epoch": 2.8534278959810875, + "grad_norm": 3.353646755218506, + "learning_rate": 2.724275790441036e-06, + "loss": 0.5041, + "step": 6035 + }, + { + "epoch": 2.8539007092198583, + "grad_norm": 2.753981828689575, + "learning_rate": 2.7236544718209934e-06, + "loss": 0.4646, + "step": 6036 + }, + { + "epoch": 2.854373522458629, + "grad_norm": 2.954744577407837, + "learning_rate": 2.723033139274967e-06, + "loss": 0.5182, + "step": 6037 + }, + { + "epoch": 2.8548463356973994, + "grad_norm": 2.4814131259918213, + "learning_rate": 2.7224117928416462e-06, + "loss": 0.4626, + "step": 6038 + }, + { + "epoch": 2.8553191489361702, + "grad_norm": 2.7414886951446533, + "learning_rate": 2.721790432559717e-06, + "loss": 0.4111, + "step": 6039 + }, + { + "epoch": 2.855791962174941, + "grad_norm": 2.8743896484375, + "learning_rate": 2.7211690584678706e-06, + "loss": 0.4986, + "step": 6040 + }, + { + "epoch": 2.8562647754137114, + "grad_norm": 3.0691921710968018, + "learning_rate": 2.720547670604797e-06, + "loss": 0.4743, + "step": 6041 + }, + { + "epoch": 2.856737588652482, + "grad_norm": 2.7273411750793457, + "learning_rate": 2.7199262690091872e-06, + "loss": 0.4403, + "step": 6042 + }, + { + "epoch": 2.857210401891253, + "grad_norm": 2.8022944927215576, + "learning_rate": 2.7193048537197325e-06, + "loss": 0.4413, + "step": 6043 + }, + { + "epoch": 2.8576832151300238, + "grad_norm": 2.4883248805999756, + "learning_rate": 2.718683424775126e-06, + "loss": 0.4485, + "step": 6044 + }, + { + "epoch": 2.8581560283687946, + "grad_norm": 2.457249879837036, + "learning_rate": 2.718061982214062e-06, + "loss": 0.4167, + "step": 6045 + }, + { + "epoch": 2.858628841607565, + "grad_norm": 2.7210328578948975, + "learning_rate": 2.717440526075234e-06, + "loss": 0.4419, + "step": 6046 + }, + { + "epoch": 2.8591016548463357, + "grad_norm": 2.684483766555786, + "learning_rate": 2.7168190563973386e-06, + "loss": 0.4449, + "step": 6047 + }, + { + "epoch": 2.8595744680851065, + "grad_norm": 2.5305230617523193, + "learning_rate": 2.7161975732190706e-06, + "loss": 0.3829, + "step": 6048 + }, + { + "epoch": 2.860047281323877, + "grad_norm": 3.0284602642059326, + "learning_rate": 2.7155760765791278e-06, + "loss": 0.5164, + "step": 6049 + }, + { + "epoch": 2.8605200945626477, + "grad_norm": 3.154599189758301, + "learning_rate": 2.7149545665162085e-06, + "loss": 0.527, + "step": 6050 + }, + { + "epoch": 2.8609929078014185, + "grad_norm": 2.6798126697540283, + "learning_rate": 2.7143330430690113e-06, + "loss": 0.4379, + "step": 6051 + }, + { + "epoch": 2.8614657210401893, + "grad_norm": 2.9531302452087402, + "learning_rate": 2.7137115062762344e-06, + "loss": 0.4549, + "step": 6052 + }, + { + "epoch": 2.86193853427896, + "grad_norm": 2.779531240463257, + "learning_rate": 2.7130899561765787e-06, + "loss": 0.4037, + "step": 6053 + }, + { + "epoch": 2.8624113475177304, + "grad_norm": 2.786763906478882, + "learning_rate": 2.7124683928087466e-06, + "loss": 0.3986, + "step": 6054 + }, + { + "epoch": 2.862884160756501, + "grad_norm": 2.430415630340576, + "learning_rate": 2.7118468162114385e-06, + "loss": 0.4402, + "step": 6055 + }, + { + "epoch": 2.863356973995272, + 
"grad_norm": 3.027268409729004, + "learning_rate": 2.7112252264233596e-06, + "loss": 0.4737, + "step": 6056 + }, + { + "epoch": 2.8638297872340424, + "grad_norm": 3.024935483932495, + "learning_rate": 2.710603623483211e-06, + "loss": 0.3997, + "step": 6057 + }, + { + "epoch": 2.864302600472813, + "grad_norm": 2.8862195014953613, + "learning_rate": 2.7099820074296985e-06, + "loss": 0.4896, + "step": 6058 + }, + { + "epoch": 2.864775413711584, + "grad_norm": 2.595579147338867, + "learning_rate": 2.709360378301527e-06, + "loss": 0.4387, + "step": 6059 + }, + { + "epoch": 2.8652482269503547, + "grad_norm": 2.8046188354492188, + "learning_rate": 2.708738736137403e-06, + "loss": 0.4726, + "step": 6060 + }, + { + "epoch": 2.8657210401891255, + "grad_norm": 3.040304660797119, + "learning_rate": 2.708117080976033e-06, + "loss": 0.4642, + "step": 6061 + }, + { + "epoch": 2.866193853427896, + "grad_norm": 2.618128538131714, + "learning_rate": 2.7074954128561248e-06, + "loss": 0.3171, + "step": 6062 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 2.7966055870056152, + "learning_rate": 2.706873731816387e-06, + "loss": 0.4893, + "step": 6063 + }, + { + "epoch": 2.8671394799054375, + "grad_norm": 2.9198038578033447, + "learning_rate": 2.706252037895529e-06, + "loss": 0.4428, + "step": 6064 + }, + { + "epoch": 2.867612293144208, + "grad_norm": 2.417705774307251, + "learning_rate": 2.7056303311322617e-06, + "loss": 0.3704, + "step": 6065 + }, + { + "epoch": 2.8680851063829786, + "grad_norm": 3.143918752670288, + "learning_rate": 2.7050086115652953e-06, + "loss": 0.5247, + "step": 6066 + }, + { + "epoch": 2.8685579196217494, + "grad_norm": 2.620781183242798, + "learning_rate": 2.704386879233341e-06, + "loss": 0.4131, + "step": 6067 + }, + { + "epoch": 2.8690307328605202, + "grad_norm": 2.6929845809936523, + "learning_rate": 2.703765134175112e-06, + "loss": 0.4833, + "step": 6068 + }, + { + "epoch": 2.869503546099291, + "grad_norm": 2.695920944213867, + "learning_rate": 2.7031433764293214e-06, + "loss": 0.435, + "step": 6069 + }, + { + "epoch": 2.8699763593380614, + "grad_norm": 2.6184475421905518, + "learning_rate": 2.702521606034684e-06, + "loss": 0.3898, + "step": 6070 + }, + { + "epoch": 2.870449172576832, + "grad_norm": 3.130624532699585, + "learning_rate": 2.7018998230299136e-06, + "loss": 0.4934, + "step": 6071 + }, + { + "epoch": 2.870921985815603, + "grad_norm": 2.947936534881592, + "learning_rate": 2.701278027453727e-06, + "loss": 0.4167, + "step": 6072 + }, + { + "epoch": 2.8713947990543733, + "grad_norm": 2.389263391494751, + "learning_rate": 2.7006562193448406e-06, + "loss": 0.3854, + "step": 6073 + }, + { + "epoch": 2.871867612293144, + "grad_norm": 2.9040684700012207, + "learning_rate": 2.700034398741971e-06, + "loss": 0.4656, + "step": 6074 + }, + { + "epoch": 2.872340425531915, + "grad_norm": 2.8671910762786865, + "learning_rate": 2.6994125656838365e-06, + "loss": 0.4642, + "step": 6075 + }, + { + "epoch": 2.8728132387706857, + "grad_norm": 2.6957180500030518, + "learning_rate": 2.698790720209156e-06, + "loss": 0.4894, + "step": 6076 + }, + { + "epoch": 2.8732860520094565, + "grad_norm": 2.748342514038086, + "learning_rate": 2.698168862356648e-06, + "loss": 0.4552, + "step": 6077 + }, + { + "epoch": 2.873758865248227, + "grad_norm": 2.7459912300109863, + "learning_rate": 2.6975469921650344e-06, + "loss": 0.4244, + "step": 6078 + }, + { + "epoch": 2.8742316784869977, + "grad_norm": 2.515650987625122, + "learning_rate": 2.6969251096730366e-06, + "loss": 0.4178, + "step": 6079 + }, + { + 
"epoch": 2.8747044917257685, + "grad_norm": 2.747373342514038, + "learning_rate": 2.696303214919375e-06, + "loss": 0.4623, + "step": 6080 + }, + { + "epoch": 2.875177304964539, + "grad_norm": 2.72092604637146, + "learning_rate": 2.695681307942773e-06, + "loss": 0.4227, + "step": 6081 + }, + { + "epoch": 2.8756501182033096, + "grad_norm": 2.6925108432769775, + "learning_rate": 2.695059388781955e-06, + "loss": 0.3807, + "step": 6082 + }, + { + "epoch": 2.8761229314420804, + "grad_norm": 2.673546314239502, + "learning_rate": 2.6944374574756427e-06, + "loss": 0.424, + "step": 6083 + }, + { + "epoch": 2.876595744680851, + "grad_norm": 2.7018187046051025, + "learning_rate": 2.6938155140625636e-06, + "loss": 0.4367, + "step": 6084 + }, + { + "epoch": 2.877068557919622, + "grad_norm": 2.9420957565307617, + "learning_rate": 2.6931935585814416e-06, + "loss": 0.4223, + "step": 6085 + }, + { + "epoch": 2.8775413711583924, + "grad_norm": 2.6523385047912598, + "learning_rate": 2.6925715910710036e-06, + "loss": 0.4074, + "step": 6086 + }, + { + "epoch": 2.878014184397163, + "grad_norm": 2.6104063987731934, + "learning_rate": 2.691949611569978e-06, + "loss": 0.423, + "step": 6087 + }, + { + "epoch": 2.878486997635934, + "grad_norm": 2.6463685035705566, + "learning_rate": 2.691327620117091e-06, + "loss": 0.4354, + "step": 6088 + }, + { + "epoch": 2.8789598108747043, + "grad_norm": 2.5863583087921143, + "learning_rate": 2.6907056167510725e-06, + "loss": 0.4177, + "step": 6089 + }, + { + "epoch": 2.879432624113475, + "grad_norm": 2.6946942806243896, + "learning_rate": 2.690083601510651e-06, + "loss": 0.4176, + "step": 6090 + }, + { + "epoch": 2.879905437352246, + "grad_norm": 3.0649454593658447, + "learning_rate": 2.6894615744345575e-06, + "loss": 0.4827, + "step": 6091 + }, + { + "epoch": 2.8803782505910167, + "grad_norm": 2.6454906463623047, + "learning_rate": 2.6888395355615226e-06, + "loss": 0.4757, + "step": 6092 + }, + { + "epoch": 2.8808510638297875, + "grad_norm": 3.251805067062378, + "learning_rate": 2.688217484930278e-06, + "loss": 0.5651, + "step": 6093 + }, + { + "epoch": 2.881323877068558, + "grad_norm": 2.543999433517456, + "learning_rate": 2.687595422579555e-06, + "loss": 0.4196, + "step": 6094 + }, + { + "epoch": 2.8817966903073287, + "grad_norm": 3.1502909660339355, + "learning_rate": 2.686973348548088e-06, + "loss": 0.4376, + "step": 6095 + }, + { + "epoch": 2.8822695035460995, + "grad_norm": 2.7800376415252686, + "learning_rate": 2.686351262874611e-06, + "loss": 0.444, + "step": 6096 + }, + { + "epoch": 2.88274231678487, + "grad_norm": 3.1529603004455566, + "learning_rate": 2.685729165597858e-06, + "loss": 0.5137, + "step": 6097 + }, + { + "epoch": 2.8832151300236406, + "grad_norm": 2.6079602241516113, + "learning_rate": 2.685107056756564e-06, + "loss": 0.4213, + "step": 6098 + }, + { + "epoch": 2.8836879432624114, + "grad_norm": 2.8969249725341797, + "learning_rate": 2.6844849363894648e-06, + "loss": 0.4679, + "step": 6099 + }, + { + "epoch": 2.884160756501182, + "grad_norm": 2.5882437229156494, + "learning_rate": 2.6838628045352977e-06, + "loss": 0.3891, + "step": 6100 + }, + { + "epoch": 2.8846335697399526, + "grad_norm": 2.9458062648773193, + "learning_rate": 2.6832406612328007e-06, + "loss": 0.4802, + "step": 6101 + }, + { + "epoch": 2.8851063829787233, + "grad_norm": 2.8463058471679688, + "learning_rate": 2.6826185065207105e-06, + "loss": 0.4332, + "step": 6102 + }, + { + "epoch": 2.885579196217494, + "grad_norm": 2.8799285888671875, + "learning_rate": 2.6819963404377667e-06, + 
"loss": 0.4474, + "step": 6103 + }, + { + "epoch": 2.8860520094562645, + "grad_norm": 2.846860408782959, + "learning_rate": 2.681374163022709e-06, + "loss": 0.4317, + "step": 6104 + }, + { + "epoch": 2.8865248226950353, + "grad_norm": 2.7918877601623535, + "learning_rate": 2.6807519743142775e-06, + "loss": 0.4243, + "step": 6105 + }, + { + "epoch": 2.886997635933806, + "grad_norm": 2.9351487159729004, + "learning_rate": 2.6801297743512127e-06, + "loss": 0.5253, + "step": 6106 + }, + { + "epoch": 2.887470449172577, + "grad_norm": 2.9422426223754883, + "learning_rate": 2.6795075631722576e-06, + "loss": 0.4887, + "step": 6107 + }, + { + "epoch": 2.8879432624113477, + "grad_norm": 2.6837220191955566, + "learning_rate": 2.678885340816153e-06, + "loss": 0.4761, + "step": 6108 + }, + { + "epoch": 2.888416075650118, + "grad_norm": 2.6800777912139893, + "learning_rate": 2.6782631073216425e-06, + "loss": 0.4248, + "step": 6109 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 2.9654436111450195, + "learning_rate": 2.6776408627274702e-06, + "loss": 0.487, + "step": 6110 + }, + { + "epoch": 2.8893617021276596, + "grad_norm": 2.7725181579589844, + "learning_rate": 2.6770186070723804e-06, + "loss": 0.4166, + "step": 6111 + }, + { + "epoch": 2.88983451536643, + "grad_norm": 2.6547815799713135, + "learning_rate": 2.676396340395118e-06, + "loss": 0.4039, + "step": 6112 + }, + { + "epoch": 2.890307328605201, + "grad_norm": 2.690997838973999, + "learning_rate": 2.6757740627344292e-06, + "loss": 0.4639, + "step": 6113 + }, + { + "epoch": 2.8907801418439716, + "grad_norm": 2.4693069458007812, + "learning_rate": 2.67515177412906e-06, + "loss": 0.4052, + "step": 6114 + }, + { + "epoch": 2.8912529550827424, + "grad_norm": 2.7137033939361572, + "learning_rate": 2.6745294746177576e-06, + "loss": 0.4442, + "step": 6115 + }, + { + "epoch": 2.891725768321513, + "grad_norm": 3.7417004108428955, + "learning_rate": 2.6739071642392712e-06, + "loss": 0.4809, + "step": 6116 + }, + { + "epoch": 2.8921985815602835, + "grad_norm": 2.707094669342041, + "learning_rate": 2.673284843032347e-06, + "loss": 0.411, + "step": 6117 + }, + { + "epoch": 2.8926713947990543, + "grad_norm": 2.7864158153533936, + "learning_rate": 2.672662511035736e-06, + "loss": 0.4939, + "step": 6118 + }, + { + "epoch": 2.893144208037825, + "grad_norm": 2.8753504753112793, + "learning_rate": 2.672040168288187e-06, + "loss": 0.4396, + "step": 6119 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 2.7581071853637695, + "learning_rate": 2.6714178148284516e-06, + "loss": 0.427, + "step": 6120 + }, + { + "epoch": 2.8940898345153663, + "grad_norm": 2.9754791259765625, + "learning_rate": 2.6707954506952803e-06, + "loss": 0.4255, + "step": 6121 + }, + { + "epoch": 2.894562647754137, + "grad_norm": 2.876939296722412, + "learning_rate": 2.670173075927426e-06, + "loss": 0.4699, + "step": 6122 + }, + { + "epoch": 2.895035460992908, + "grad_norm": 2.4875400066375732, + "learning_rate": 2.6695506905636397e-06, + "loss": 0.3568, + "step": 6123 + }, + { + "epoch": 2.8955082742316787, + "grad_norm": 2.703606128692627, + "learning_rate": 2.668928294642675e-06, + "loss": 0.3646, + "step": 6124 + }, + { + "epoch": 2.895981087470449, + "grad_norm": 2.8618338108062744, + "learning_rate": 2.6683058882032868e-06, + "loss": 0.378, + "step": 6125 + }, + { + "epoch": 2.89645390070922, + "grad_norm": 2.9756760597229004, + "learning_rate": 2.667683471284229e-06, + "loss": 0.4348, + "step": 6126 + }, + { + "epoch": 2.8969267139479906, + "grad_norm": 2.7861104011535645, + 
"learning_rate": 2.667061043924256e-06, + "loss": 0.4435, + "step": 6127 + }, + { + "epoch": 2.897399527186761, + "grad_norm": 2.7932238578796387, + "learning_rate": 2.6664386061621243e-06, + "loss": 0.4824, + "step": 6128 + }, + { + "epoch": 2.8978723404255318, + "grad_norm": 2.85483455657959, + "learning_rate": 2.6658161580365917e-06, + "loss": 0.4925, + "step": 6129 + }, + { + "epoch": 2.8983451536643026, + "grad_norm": 2.4242141246795654, + "learning_rate": 2.6651936995864136e-06, + "loss": 0.3466, + "step": 6130 + }, + { + "epoch": 2.8988179669030734, + "grad_norm": 3.385214328765869, + "learning_rate": 2.6645712308503473e-06, + "loss": 0.4751, + "step": 6131 + }, + { + "epoch": 2.899290780141844, + "grad_norm": 2.7109622955322266, + "learning_rate": 2.6639487518671525e-06, + "loss": 0.4469, + "step": 6132 + }, + { + "epoch": 2.8997635933806145, + "grad_norm": 2.6537814140319824, + "learning_rate": 2.6633262626755877e-06, + "loss": 0.4678, + "step": 6133 + }, + { + "epoch": 2.9002364066193853, + "grad_norm": 2.5992231369018555, + "learning_rate": 2.6627037633144124e-06, + "loss": 0.4206, + "step": 6134 + }, + { + "epoch": 2.900709219858156, + "grad_norm": 2.988940954208374, + "learning_rate": 2.6620812538223885e-06, + "loss": 0.4554, + "step": 6135 + }, + { + "epoch": 2.9011820330969265, + "grad_norm": 3.0678138732910156, + "learning_rate": 2.661458734238274e-06, + "loss": 0.4671, + "step": 6136 + }, + { + "epoch": 2.9016548463356973, + "grad_norm": 2.6902482509613037, + "learning_rate": 2.6608362046008335e-06, + "loss": 0.372, + "step": 6137 + }, + { + "epoch": 2.902127659574468, + "grad_norm": 3.031597375869751, + "learning_rate": 2.660213664948827e-06, + "loss": 0.4424, + "step": 6138 + }, + { + "epoch": 2.902600472813239, + "grad_norm": 2.8376755714416504, + "learning_rate": 2.6595911153210187e-06, + "loss": 0.4599, + "step": 6139 + }, + { + "epoch": 2.9030732860520096, + "grad_norm": 3.3164854049682617, + "learning_rate": 2.6589685557561707e-06, + "loss": 0.3897, + "step": 6140 + }, + { + "epoch": 2.90354609929078, + "grad_norm": 2.9535014629364014, + "learning_rate": 2.658345986293048e-06, + "loss": 0.4957, + "step": 6141 + }, + { + "epoch": 2.904018912529551, + "grad_norm": 2.821276903152466, + "learning_rate": 2.657723406970415e-06, + "loss": 0.4453, + "step": 6142 + }, + { + "epoch": 2.9044917257683216, + "grad_norm": 2.7314651012420654, + "learning_rate": 2.657100817827037e-06, + "loss": 0.4406, + "step": 6143 + }, + { + "epoch": 2.904964539007092, + "grad_norm": 2.9509520530700684, + "learning_rate": 2.6564782189016804e-06, + "loss": 0.4629, + "step": 6144 + }, + { + "epoch": 2.9054373522458627, + "grad_norm": 2.6234960556030273, + "learning_rate": 2.655855610233111e-06, + "loss": 0.4306, + "step": 6145 + }, + { + "epoch": 2.9059101654846335, + "grad_norm": 2.7209644317626953, + "learning_rate": 2.6552329918600962e-06, + "loss": 0.3643, + "step": 6146 + }, + { + "epoch": 2.9063829787234043, + "grad_norm": 2.9797747135162354, + "learning_rate": 2.654610363821404e-06, + "loss": 0.4616, + "step": 6147 + }, + { + "epoch": 2.906855791962175, + "grad_norm": 2.8179666996002197, + "learning_rate": 2.6539877261558016e-06, + "loss": 0.4526, + "step": 6148 + }, + { + "epoch": 2.9073286052009455, + "grad_norm": 2.7492244243621826, + "learning_rate": 2.653365078902059e-06, + "loss": 0.4862, + "step": 6149 + }, + { + "epoch": 2.9078014184397163, + "grad_norm": 3.0262451171875, + "learning_rate": 2.6527424220989457e-06, + "loss": 0.3728, + "step": 6150 + }, + { + "epoch": 
2.908274231678487, + "grad_norm": 2.8092808723449707, + "learning_rate": 2.6521197557852315e-06, + "loss": 0.4668, + "step": 6151 + }, + { + "epoch": 2.9087470449172574, + "grad_norm": 2.915719985961914, + "learning_rate": 2.651497079999687e-06, + "loss": 0.5124, + "step": 6152 + }, + { + "epoch": 2.9092198581560282, + "grad_norm": 2.9794204235076904, + "learning_rate": 2.6508743947810834e-06, + "loss": 0.5207, + "step": 6153 + }, + { + "epoch": 2.909692671394799, + "grad_norm": 2.882453680038452, + "learning_rate": 2.650251700168193e-06, + "loss": 0.4382, + "step": 6154 + }, + { + "epoch": 2.91016548463357, + "grad_norm": 3.183680534362793, + "learning_rate": 2.6496289961997886e-06, + "loss": 0.5134, + "step": 6155 + }, + { + "epoch": 2.9106382978723406, + "grad_norm": 2.9374759197235107, + "learning_rate": 2.649006282914642e-06, + "loss": 0.4748, + "step": 6156 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 2.8096041679382324, + "learning_rate": 2.648383560351527e-06, + "loss": 0.4672, + "step": 6157 + }, + { + "epoch": 2.911583924349882, + "grad_norm": 2.8799238204956055, + "learning_rate": 2.6477608285492196e-06, + "loss": 0.4679, + "step": 6158 + }, + { + "epoch": 2.9120567375886526, + "grad_norm": 2.689310073852539, + "learning_rate": 2.6471380875464923e-06, + "loss": 0.4069, + "step": 6159 + }, + { + "epoch": 2.912529550827423, + "grad_norm": 2.909323215484619, + "learning_rate": 2.6465153373821216e-06, + "loss": 0.4463, + "step": 6160 + }, + { + "epoch": 2.9130023640661937, + "grad_norm": 2.797724962234497, + "learning_rate": 2.6458925780948845e-06, + "loss": 0.4269, + "step": 6161 + }, + { + "epoch": 2.9134751773049645, + "grad_norm": 2.7533204555511475, + "learning_rate": 2.645269809723556e-06, + "loss": 0.453, + "step": 6162 + }, + { + "epoch": 2.9139479905437353, + "grad_norm": 2.6615989208221436, + "learning_rate": 2.6446470323069122e-06, + "loss": 0.3921, + "step": 6163 + }, + { + "epoch": 2.914420803782506, + "grad_norm": 3.0493314266204834, + "learning_rate": 2.644024245883733e-06, + "loss": 0.4779, + "step": 6164 + }, + { + "epoch": 2.9148936170212765, + "grad_norm": 2.649845600128174, + "learning_rate": 2.643401450492795e-06, + "loss": 0.454, + "step": 6165 + }, + { + "epoch": 2.9153664302600473, + "grad_norm": 2.7931838035583496, + "learning_rate": 2.642778646172877e-06, + "loss": 0.504, + "step": 6166 + }, + { + "epoch": 2.915839243498818, + "grad_norm": 2.9518136978149414, + "learning_rate": 2.64215583296276e-06, + "loss": 0.4767, + "step": 6167 + }, + { + "epoch": 2.9163120567375884, + "grad_norm": 2.6047427654266357, + "learning_rate": 2.6415330109012216e-06, + "loss": 0.4316, + "step": 6168 + }, + { + "epoch": 2.916784869976359, + "grad_norm": 2.7732112407684326, + "learning_rate": 2.640910180027044e-06, + "loss": 0.4213, + "step": 6169 + }, + { + "epoch": 2.91725768321513, + "grad_norm": 3.1157236099243164, + "learning_rate": 2.6402873403790068e-06, + "loss": 0.4559, + "step": 6170 + }, + { + "epoch": 2.917730496453901, + "grad_norm": 2.68424129486084, + "learning_rate": 2.6396644919958917e-06, + "loss": 0.3456, + "step": 6171 + }, + { + "epoch": 2.9182033096926716, + "grad_norm": 3.1093270778656006, + "learning_rate": 2.639041634916482e-06, + "loss": 0.4172, + "step": 6172 + }, + { + "epoch": 2.918676122931442, + "grad_norm": 2.9844655990600586, + "learning_rate": 2.6384187691795594e-06, + "loss": 0.4844, + "step": 6173 + }, + { + "epoch": 2.9191489361702128, + "grad_norm": 2.907151222229004, + "learning_rate": 2.637795894823906e-06, + "loss": 0.5126, + 
"step": 6174 + }, + { + "epoch": 2.9196217494089836, + "grad_norm": 2.804105520248413, + "learning_rate": 2.637173011888307e-06, + "loss": 0.3919, + "step": 6175 + }, + { + "epoch": 2.920094562647754, + "grad_norm": 2.8809266090393066, + "learning_rate": 2.636550120411547e-06, + "loss": 0.4468, + "step": 6176 + }, + { + "epoch": 2.9205673758865247, + "grad_norm": 2.686290979385376, + "learning_rate": 2.6359272204324087e-06, + "loss": 0.4352, + "step": 6177 + }, + { + "epoch": 2.9210401891252955, + "grad_norm": 2.448101758956909, + "learning_rate": 2.635304311989678e-06, + "loss": 0.4218, + "step": 6178 + }, + { + "epoch": 2.9215130023640663, + "grad_norm": 2.81024169921875, + "learning_rate": 2.6346813951221416e-06, + "loss": 0.5177, + "step": 6179 + }, + { + "epoch": 2.921985815602837, + "grad_norm": 2.7590086460113525, + "learning_rate": 2.6340584698685856e-06, + "loss": 0.3897, + "step": 6180 + }, + { + "epoch": 2.9224586288416075, + "grad_norm": 3.1226227283477783, + "learning_rate": 2.6334355362677965e-06, + "loss": 0.4595, + "step": 6181 + }, + { + "epoch": 2.9229314420803783, + "grad_norm": 2.673828125, + "learning_rate": 2.6328125943585607e-06, + "loss": 0.4932, + "step": 6182 + }, + { + "epoch": 2.923404255319149, + "grad_norm": 2.8297293186187744, + "learning_rate": 2.632189644179668e-06, + "loss": 0.3819, + "step": 6183 + }, + { + "epoch": 2.9238770685579194, + "grad_norm": 2.9661548137664795, + "learning_rate": 2.6315666857699056e-06, + "loss": 0.4419, + "step": 6184 + }, + { + "epoch": 2.92434988179669, + "grad_norm": 2.9745798110961914, + "learning_rate": 2.6309437191680627e-06, + "loss": 0.4423, + "step": 6185 + }, + { + "epoch": 2.924822695035461, + "grad_norm": 2.8351712226867676, + "learning_rate": 2.6303207444129285e-06, + "loss": 0.5043, + "step": 6186 + }, + { + "epoch": 2.925295508274232, + "grad_norm": 2.6442384719848633, + "learning_rate": 2.6296977615432927e-06, + "loss": 0.4431, + "step": 6187 + }, + { + "epoch": 2.9257683215130026, + "grad_norm": 2.4128029346466064, + "learning_rate": 2.6290747705979457e-06, + "loss": 0.3603, + "step": 6188 + }, + { + "epoch": 2.926241134751773, + "grad_norm": 2.730424642562866, + "learning_rate": 2.6284517716156786e-06, + "loss": 0.439, + "step": 6189 + }, + { + "epoch": 2.9267139479905437, + "grad_norm": 2.6215405464172363, + "learning_rate": 2.627828764635284e-06, + "loss": 0.4117, + "step": 6190 + }, + { + "epoch": 2.9271867612293145, + "grad_norm": 2.56585955619812, + "learning_rate": 2.627205749695552e-06, + "loss": 0.4404, + "step": 6191 + }, + { + "epoch": 2.927659574468085, + "grad_norm": 2.9587886333465576, + "learning_rate": 2.6265827268352763e-06, + "loss": 0.4295, + "step": 6192 + }, + { + "epoch": 2.9281323877068557, + "grad_norm": 2.6611828804016113, + "learning_rate": 2.625959696093249e-06, + "loss": 0.4441, + "step": 6193 + }, + { + "epoch": 2.9286052009456265, + "grad_norm": 2.4391369819641113, + "learning_rate": 2.6253366575082634e-06, + "loss": 0.4447, + "step": 6194 + }, + { + "epoch": 2.9290780141843973, + "grad_norm": 2.710763454437256, + "learning_rate": 2.6247136111191144e-06, + "loss": 0.4662, + "step": 6195 + }, + { + "epoch": 2.929550827423168, + "grad_norm": 2.770697593688965, + "learning_rate": 2.6240905569645952e-06, + "loss": 0.4263, + "step": 6196 + }, + { + "epoch": 2.9300236406619384, + "grad_norm": 2.5885732173919678, + "learning_rate": 2.623467495083501e-06, + "loss": 0.4303, + "step": 6197 + }, + { + "epoch": 2.9304964539007092, + "grad_norm": 2.5716748237609863, + "learning_rate": 
2.6228444255146274e-06, + "loss": 0.3714, + "step": 6198 + }, + { + "epoch": 2.93096926713948, + "grad_norm": 3.0437910556793213, + "learning_rate": 2.6222213482967703e-06, + "loss": 0.4077, + "step": 6199 + }, + { + "epoch": 2.9314420803782504, + "grad_norm": 2.7861344814300537, + "learning_rate": 2.6215982634687253e-06, + "loss": 0.4157, + "step": 6200 + }, + { + "epoch": 2.931914893617021, + "grad_norm": 2.5265355110168457, + "learning_rate": 2.6209751710692905e-06, + "loss": 0.4586, + "step": 6201 + }, + { + "epoch": 2.932387706855792, + "grad_norm": 2.940112590789795, + "learning_rate": 2.6203520711372615e-06, + "loss": 0.4208, + "step": 6202 + }, + { + "epoch": 2.9328605200945628, + "grad_norm": 2.7124581336975098, + "learning_rate": 2.6197289637114363e-06, + "loss": 0.4173, + "step": 6203 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 2.818523406982422, + "learning_rate": 2.619105848830615e-06, + "loss": 0.4349, + "step": 6204 + }, + { + "epoch": 2.933806146572104, + "grad_norm": 2.7630393505096436, + "learning_rate": 2.6184827265335937e-06, + "loss": 0.5078, + "step": 6205 + }, + { + "epoch": 2.9342789598108747, + "grad_norm": 3.0554699897766113, + "learning_rate": 2.6178595968591726e-06, + "loss": 0.4712, + "step": 6206 + }, + { + "epoch": 2.9347517730496455, + "grad_norm": 2.721992254257202, + "learning_rate": 2.6172364598461507e-06, + "loss": 0.4847, + "step": 6207 + }, + { + "epoch": 2.935224586288416, + "grad_norm": 2.809663772583008, + "learning_rate": 2.6166133155333303e-06, + "loss": 0.4447, + "step": 6208 + }, + { + "epoch": 2.9356973995271867, + "grad_norm": 2.568394660949707, + "learning_rate": 2.6159901639595088e-06, + "loss": 0.4543, + "step": 6209 + }, + { + "epoch": 2.9361702127659575, + "grad_norm": 3.3670637607574463, + "learning_rate": 2.6153670051634884e-06, + "loss": 0.4901, + "step": 6210 + }, + { + "epoch": 2.9366430260047283, + "grad_norm": 3.082508087158203, + "learning_rate": 2.614743839184071e-06, + "loss": 0.4862, + "step": 6211 + }, + { + "epoch": 2.937115839243499, + "grad_norm": 2.692139148712158, + "learning_rate": 2.6141206660600566e-06, + "loss": 0.5199, + "step": 6212 + }, + { + "epoch": 2.9375886524822694, + "grad_norm": 3.231433391571045, + "learning_rate": 2.6134974858302504e-06, + "loss": 0.464, + "step": 6213 + }, + { + "epoch": 2.93806146572104, + "grad_norm": 3.224238157272339, + "learning_rate": 2.612874298533452e-06, + "loss": 0.4507, + "step": 6214 + }, + { + "epoch": 2.938534278959811, + "grad_norm": 2.812755584716797, + "learning_rate": 2.6122511042084663e-06, + "loss": 0.4527, + "step": 6215 + }, + { + "epoch": 2.9390070921985814, + "grad_norm": 2.837811231613159, + "learning_rate": 2.611627902894098e-06, + "loss": 0.4782, + "step": 6216 + }, + { + "epoch": 2.939479905437352, + "grad_norm": 3.093817710876465, + "learning_rate": 2.6110046946291476e-06, + "loss": 0.4933, + "step": 6217 + }, + { + "epoch": 2.939952718676123, + "grad_norm": 2.950119733810425, + "learning_rate": 2.6103814794524235e-06, + "loss": 0.4884, + "step": 6218 + }, + { + "epoch": 2.9404255319148938, + "grad_norm": 2.469681978225708, + "learning_rate": 2.6097582574027274e-06, + "loss": 0.4135, + "step": 6219 + }, + { + "epoch": 2.9408983451536646, + "grad_norm": 2.779238224029541, + "learning_rate": 2.609135028518866e-06, + "loss": 0.5165, + "step": 6220 + }, + { + "epoch": 2.941371158392435, + "grad_norm": 2.807705879211426, + "learning_rate": 2.608511792839645e-06, + "loss": 0.4046, + "step": 6221 + }, + { + "epoch": 2.9418439716312057, + "grad_norm": 
2.6067750453948975, + "learning_rate": 2.607888550403871e-06, + "loss": 0.406, + "step": 6222 + }, + { + "epoch": 2.9423167848699765, + "grad_norm": 2.865766763687134, + "learning_rate": 2.607265301250349e-06, + "loss": 0.471, + "step": 6223 + }, + { + "epoch": 2.942789598108747, + "grad_norm": 2.977681875228882, + "learning_rate": 2.6066420454178876e-06, + "loss": 0.4666, + "step": 6224 + }, + { + "epoch": 2.9432624113475176, + "grad_norm": 2.870884418487549, + "learning_rate": 2.606018782945294e-06, + "loss": 0.4768, + "step": 6225 + }, + { + "epoch": 2.9437352245862884, + "grad_norm": 2.992851495742798, + "learning_rate": 2.6053955138713756e-06, + "loss": 0.4657, + "step": 6226 + }, + { + "epoch": 2.9442080378250592, + "grad_norm": 2.7279815673828125, + "learning_rate": 2.6047722382349406e-06, + "loss": 0.4087, + "step": 6227 + }, + { + "epoch": 2.94468085106383, + "grad_norm": 2.8587028980255127, + "learning_rate": 2.604148956074797e-06, + "loss": 0.4452, + "step": 6228 + }, + { + "epoch": 2.9451536643026004, + "grad_norm": 3.001694679260254, + "learning_rate": 2.6035256674297555e-06, + "loss": 0.4852, + "step": 6229 + }, + { + "epoch": 2.945626477541371, + "grad_norm": 2.858069896697998, + "learning_rate": 2.6029023723386237e-06, + "loss": 0.4281, + "step": 6230 + }, + { + "epoch": 2.946099290780142, + "grad_norm": 2.675856828689575, + "learning_rate": 2.602279070840213e-06, + "loss": 0.4545, + "step": 6231 + }, + { + "epoch": 2.9465721040189123, + "grad_norm": 2.530245065689087, + "learning_rate": 2.6016557629733334e-06, + "loss": 0.4619, + "step": 6232 + }, + { + "epoch": 2.947044917257683, + "grad_norm": 2.7533743381500244, + "learning_rate": 2.601032448776795e-06, + "loss": 0.4879, + "step": 6233 + }, + { + "epoch": 2.947517730496454, + "grad_norm": 3.130453109741211, + "learning_rate": 2.600409128289409e-06, + "loss": 0.4056, + "step": 6234 + }, + { + "epoch": 2.9479905437352247, + "grad_norm": 3.4736509323120117, + "learning_rate": 2.5997858015499867e-06, + "loss": 0.5063, + "step": 6235 + }, + { + "epoch": 2.9484633569739955, + "grad_norm": 2.871978282928467, + "learning_rate": 2.5991624685973406e-06, + "loss": 0.4562, + "step": 6236 + }, + { + "epoch": 2.948936170212766, + "grad_norm": 2.976503372192383, + "learning_rate": 2.5985391294702817e-06, + "loss": 0.5079, + "step": 6237 + }, + { + "epoch": 2.9494089834515367, + "grad_norm": 2.578122615814209, + "learning_rate": 2.597915784207623e-06, + "loss": 0.4069, + "step": 6238 + }, + { + "epoch": 2.9498817966903075, + "grad_norm": 2.885911226272583, + "learning_rate": 2.597292432848178e-06, + "loss": 0.4382, + "step": 6239 + }, + { + "epoch": 2.950354609929078, + "grad_norm": 2.9301681518554688, + "learning_rate": 2.5966690754307605e-06, + "loss": 0.4888, + "step": 6240 + }, + { + "epoch": 2.9508274231678486, + "grad_norm": 2.9912192821502686, + "learning_rate": 2.5960457119941834e-06, + "loss": 0.4699, + "step": 6241 + }, + { + "epoch": 2.9513002364066194, + "grad_norm": 2.6612601280212402, + "learning_rate": 2.5954223425772607e-06, + "loss": 0.3736, + "step": 6242 + }, + { + "epoch": 2.9517730496453902, + "grad_norm": 2.9325380325317383, + "learning_rate": 2.5947989672188067e-06, + "loss": 0.4771, + "step": 6243 + }, + { + "epoch": 2.952245862884161, + "grad_norm": 2.8143959045410156, + "learning_rate": 2.594175585957637e-06, + "loss": 0.5103, + "step": 6244 + }, + { + "epoch": 2.9527186761229314, + "grad_norm": 2.355078935623169, + "learning_rate": 2.5935521988325674e-06, + "loss": 0.44, + "step": 6245 + }, + { + "epoch": 
2.953191489361702, + "grad_norm": 2.733156442642212, + "learning_rate": 2.5929288058824114e-06, + "loss": 0.4306, + "step": 6246 + }, + { + "epoch": 2.953664302600473, + "grad_norm": 3.182563304901123, + "learning_rate": 2.5923054071459865e-06, + "loss": 0.417, + "step": 6247 + }, + { + "epoch": 2.9541371158392433, + "grad_norm": 2.4162323474884033, + "learning_rate": 2.5916820026621094e-06, + "loss": 0.3802, + "step": 6248 + }, + { + "epoch": 2.954609929078014, + "grad_norm": 2.772706985473633, + "learning_rate": 2.591058592469595e-06, + "loss": 0.4654, + "step": 6249 + }, + { + "epoch": 2.955082742316785, + "grad_norm": 2.6011102199554443, + "learning_rate": 2.5904351766072616e-06, + "loss": 0.4619, + "step": 6250 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 2.5700361728668213, + "learning_rate": 2.589811755113926e-06, + "loss": 0.3991, + "step": 6251 + }, + { + "epoch": 2.9560283687943265, + "grad_norm": 2.6444971561431885, + "learning_rate": 2.589188328028407e-06, + "loss": 0.4388, + "step": 6252 + }, + { + "epoch": 2.956501182033097, + "grad_norm": 2.739567279815674, + "learning_rate": 2.588564895389521e-06, + "loss": 0.4193, + "step": 6253 + }, + { + "epoch": 2.9569739952718677, + "grad_norm": 2.7070045471191406, + "learning_rate": 2.5879414572360877e-06, + "loss": 0.4347, + "step": 6254 + }, + { + "epoch": 2.9574468085106385, + "grad_norm": 2.7811532020568848, + "learning_rate": 2.587318013606926e-06, + "loss": 0.43, + "step": 6255 + }, + { + "epoch": 2.957919621749409, + "grad_norm": 3.0036091804504395, + "learning_rate": 2.5866945645408537e-06, + "loss": 0.4855, + "step": 6256 + }, + { + "epoch": 2.9583924349881796, + "grad_norm": 2.948573112487793, + "learning_rate": 2.5860711100766918e-06, + "loss": 0.4594, + "step": 6257 + }, + { + "epoch": 2.9588652482269504, + "grad_norm": 2.6371593475341797, + "learning_rate": 2.5854476502532583e-06, + "loss": 0.446, + "step": 6258 + }, + { + "epoch": 2.959338061465721, + "grad_norm": 2.668677806854248, + "learning_rate": 2.5848241851093754e-06, + "loss": 0.3991, + "step": 6259 + }, + { + "epoch": 2.959810874704492, + "grad_norm": 3.1640663146972656, + "learning_rate": 2.5842007146838614e-06, + "loss": 0.5146, + "step": 6260 + }, + { + "epoch": 2.9602836879432624, + "grad_norm": 2.9412102699279785, + "learning_rate": 2.5835772390155382e-06, + "loss": 0.4798, + "step": 6261 + }, + { + "epoch": 2.960756501182033, + "grad_norm": 2.7674343585968018, + "learning_rate": 2.582953758143227e-06, + "loss": 0.4262, + "step": 6262 + }, + { + "epoch": 2.961229314420804, + "grad_norm": 3.5219457149505615, + "learning_rate": 2.582330272105749e-06, + "loss": 0.4905, + "step": 6263 + }, + { + "epoch": 2.9617021276595743, + "grad_norm": 2.4274468421936035, + "learning_rate": 2.5817067809419267e-06, + "loss": 0.4048, + "step": 6264 + }, + { + "epoch": 2.962174940898345, + "grad_norm": 2.6907944679260254, + "learning_rate": 2.5810832846905814e-06, + "loss": 0.388, + "step": 6265 + }, + { + "epoch": 2.962647754137116, + "grad_norm": 2.603151321411133, + "learning_rate": 2.5804597833905347e-06, + "loss": 0.4377, + "step": 6266 + }, + { + "epoch": 2.9631205673758867, + "grad_norm": 2.685837507247925, + "learning_rate": 2.57983627708061e-06, + "loss": 0.4409, + "step": 6267 + }, + { + "epoch": 2.963593380614657, + "grad_norm": 2.8281500339508057, + "learning_rate": 2.579212765799631e-06, + "loss": 0.4567, + "step": 6268 + }, + { + "epoch": 2.964066193853428, + "grad_norm": 2.6387875080108643, + "learning_rate": 2.57858924958642e-06, + "loss": 0.4061, + 
"step": 6269 + }, + { + "epoch": 2.9645390070921986, + "grad_norm": 2.64139986038208, + "learning_rate": 2.5779657284798017e-06, + "loss": 0.4539, + "step": 6270 + }, + { + "epoch": 2.965011820330969, + "grad_norm": 2.7384836673736572, + "learning_rate": 2.5773422025185983e-06, + "loss": 0.408, + "step": 6271 + }, + { + "epoch": 2.96548463356974, + "grad_norm": 2.262514352798462, + "learning_rate": 2.576718671741636e-06, + "loss": 0.3726, + "step": 6272 + }, + { + "epoch": 2.9659574468085106, + "grad_norm": 2.53800106048584, + "learning_rate": 2.5760951361877384e-06, + "loss": 0.4716, + "step": 6273 + }, + { + "epoch": 2.9664302600472814, + "grad_norm": 3.256701707839966, + "learning_rate": 2.57547159589573e-06, + "loss": 0.518, + "step": 6274 + }, + { + "epoch": 2.966903073286052, + "grad_norm": 2.9427342414855957, + "learning_rate": 2.574848050904436e-06, + "loss": 0.4255, + "step": 6275 + }, + { + "epoch": 2.9673758865248225, + "grad_norm": 2.5794098377227783, + "learning_rate": 2.574224501252682e-06, + "loss": 0.4412, + "step": 6276 + }, + { + "epoch": 2.9678486997635933, + "grad_norm": 2.5894877910614014, + "learning_rate": 2.573600946979294e-06, + "loss": 0.4356, + "step": 6277 + }, + { + "epoch": 2.968321513002364, + "grad_norm": 2.9597361087799072, + "learning_rate": 2.572977388123098e-06, + "loss": 0.4376, + "step": 6278 + }, + { + "epoch": 2.9687943262411345, + "grad_norm": 2.779303550720215, + "learning_rate": 2.5723538247229197e-06, + "loss": 0.3985, + "step": 6279 + }, + { + "epoch": 2.9692671394799053, + "grad_norm": 2.9173855781555176, + "learning_rate": 2.5717302568175866e-06, + "loss": 0.4581, + "step": 6280 + }, + { + "epoch": 2.969739952718676, + "grad_norm": 2.703721284866333, + "learning_rate": 2.5711066844459242e-06, + "loss": 0.3705, + "step": 6281 + }, + { + "epoch": 2.970212765957447, + "grad_norm": 2.5415029525756836, + "learning_rate": 2.5704831076467613e-06, + "loss": 0.4089, + "step": 6282 + }, + { + "epoch": 2.9706855791962177, + "grad_norm": 2.791780948638916, + "learning_rate": 2.5698595264589234e-06, + "loss": 0.4357, + "step": 6283 + }, + { + "epoch": 2.971158392434988, + "grad_norm": 2.887662887573242, + "learning_rate": 2.5692359409212392e-06, + "loss": 0.4093, + "step": 6284 + }, + { + "epoch": 2.971631205673759, + "grad_norm": 3.0309557914733887, + "learning_rate": 2.5686123510725364e-06, + "loss": 0.4461, + "step": 6285 + }, + { + "epoch": 2.9721040189125296, + "grad_norm": 2.6861515045166016, + "learning_rate": 2.5679887569516437e-06, + "loss": 0.4199, + "step": 6286 + }, + { + "epoch": 2.9725768321513, + "grad_norm": 2.7014012336730957, + "learning_rate": 2.5673651585973897e-06, + "loss": 0.4373, + "step": 6287 + }, + { + "epoch": 2.9730496453900708, + "grad_norm": 2.951265811920166, + "learning_rate": 2.5667415560486026e-06, + "loss": 0.4426, + "step": 6288 + }, + { + "epoch": 2.9735224586288416, + "grad_norm": 2.7664504051208496, + "learning_rate": 2.5661179493441106e-06, + "loss": 0.474, + "step": 6289 + }, + { + "epoch": 2.9739952718676124, + "grad_norm": 2.6081087589263916, + "learning_rate": 2.5654943385227445e-06, + "loss": 0.4058, + "step": 6290 + }, + { + "epoch": 2.974468085106383, + "grad_norm": 2.9416966438293457, + "learning_rate": 2.564870723623333e-06, + "loss": 0.506, + "step": 6291 + }, + { + "epoch": 2.9749408983451535, + "grad_norm": 2.9441659450531006, + "learning_rate": 2.564247104684706e-06, + "loss": 0.4505, + "step": 6292 + }, + { + "epoch": 2.9754137115839243, + "grad_norm": 2.7110862731933594, + "learning_rate": 
2.563623481745693e-06, + "loss": 0.4493, + "step": 6293 + }, + { + "epoch": 2.975886524822695, + "grad_norm": 2.88459849357605, + "learning_rate": 2.562999854845125e-06, + "loss": 0.4462, + "step": 6294 + }, + { + "epoch": 2.9763593380614655, + "grad_norm": 3.0491793155670166, + "learning_rate": 2.5623762240218327e-06, + "loss": 0.4928, + "step": 6295 + }, + { + "epoch": 2.9768321513002363, + "grad_norm": 2.9475483894348145, + "learning_rate": 2.561752589314646e-06, + "loss": 0.4535, + "step": 6296 + }, + { + "epoch": 2.977304964539007, + "grad_norm": 2.879495859146118, + "learning_rate": 2.561128950762397e-06, + "loss": 0.4393, + "step": 6297 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 2.8478336334228516, + "learning_rate": 2.560505308403916e-06, + "loss": 0.4363, + "step": 6298 + }, + { + "epoch": 2.9782505910165487, + "grad_norm": 2.5475094318389893, + "learning_rate": 2.5598816622780343e-06, + "loss": 0.3825, + "step": 6299 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 2.85430908203125, + "learning_rate": 2.5592580124235838e-06, + "loss": 0.4226, + "step": 6300 + }, + { + "epoch": 2.97919621749409, + "grad_norm": 2.569775104522705, + "learning_rate": 2.5586343588793975e-06, + "loss": 0.4045, + "step": 6301 + }, + { + "epoch": 2.9796690307328606, + "grad_norm": 2.4482202529907227, + "learning_rate": 2.558010701684307e-06, + "loss": 0.4625, + "step": 6302 + }, + { + "epoch": 2.980141843971631, + "grad_norm": 2.9301230907440186, + "learning_rate": 2.5573870408771436e-06, + "loss": 0.4358, + "step": 6303 + }, + { + "epoch": 2.9806146572104018, + "grad_norm": 2.9865870475769043, + "learning_rate": 2.5567633764967416e-06, + "loss": 0.497, + "step": 6304 + }, + { + "epoch": 2.9810874704491725, + "grad_norm": 2.523524522781372, + "learning_rate": 2.556139708581933e-06, + "loss": 0.4141, + "step": 6305 + }, + { + "epoch": 2.9815602836879433, + "grad_norm": 2.8489344120025635, + "learning_rate": 2.5555160371715504e-06, + "loss": 0.4205, + "step": 6306 + }, + { + "epoch": 2.982033096926714, + "grad_norm": 2.417759895324707, + "learning_rate": 2.5548923623044274e-06, + "loss": 0.44, + "step": 6307 + }, + { + "epoch": 2.9825059101654845, + "grad_norm": 2.7626900672912598, + "learning_rate": 2.554268684019398e-06, + "loss": 0.4646, + "step": 6308 + }, + { + "epoch": 2.9829787234042553, + "grad_norm": 3.0916266441345215, + "learning_rate": 2.5536450023552956e-06, + "loss": 0.4443, + "step": 6309 + }, + { + "epoch": 2.983451536643026, + "grad_norm": 2.721992015838623, + "learning_rate": 2.5530213173509542e-06, + "loss": 0.4008, + "step": 6310 + }, + { + "epoch": 2.9839243498817964, + "grad_norm": 2.825334072113037, + "learning_rate": 2.552397629045208e-06, + "loss": 0.4513, + "step": 6311 + }, + { + "epoch": 2.9843971631205672, + "grad_norm": 2.912050485610962, + "learning_rate": 2.5517739374768915e-06, + "loss": 0.4104, + "step": 6312 + }, + { + "epoch": 2.984869976359338, + "grad_norm": 2.760650634765625, + "learning_rate": 2.551150242684838e-06, + "loss": 0.4372, + "step": 6313 + }, + { + "epoch": 2.985342789598109, + "grad_norm": 2.8926033973693848, + "learning_rate": 2.5505265447078838e-06, + "loss": 0.475, + "step": 6314 + }, + { + "epoch": 2.9858156028368796, + "grad_norm": 2.6279892921447754, + "learning_rate": 2.5499028435848633e-06, + "loss": 0.4589, + "step": 6315 + }, + { + "epoch": 2.98628841607565, + "grad_norm": 3.2147316932678223, + "learning_rate": 2.549279139354611e-06, + "loss": 0.4968, + "step": 6316 + }, + { + "epoch": 2.986761229314421, + "grad_norm": 
2.4510674476623535, + "learning_rate": 2.5486554320559626e-06, + "loss": 0.4291, + "step": 6317 + }, + { + "epoch": 2.9872340425531916, + "grad_norm": 2.6919643878936768, + "learning_rate": 2.5480317217277544e-06, + "loss": 0.4704, + "step": 6318 + }, + { + "epoch": 2.987706855791962, + "grad_norm": 2.9832234382629395, + "learning_rate": 2.5474080084088215e-06, + "loss": 0.4129, + "step": 6319 + }, + { + "epoch": 2.9881796690307327, + "grad_norm": 2.893209218978882, + "learning_rate": 2.5467842921380004e-06, + "loss": 0.5099, + "step": 6320 + }, + { + "epoch": 2.9886524822695035, + "grad_norm": 2.6734580993652344, + "learning_rate": 2.5461605729541254e-06, + "loss": 0.4588, + "step": 6321 + }, + { + "epoch": 2.9891252955082743, + "grad_norm": 2.5591681003570557, + "learning_rate": 2.5455368508960343e-06, + "loss": 0.4162, + "step": 6322 + }, + { + "epoch": 2.989598108747045, + "grad_norm": 3.2619881629943848, + "learning_rate": 2.5449131260025626e-06, + "loss": 0.4412, + "step": 6323 + }, + { + "epoch": 2.9900709219858155, + "grad_norm": 2.897914409637451, + "learning_rate": 2.544289398312549e-06, + "loss": 0.5079, + "step": 6324 + }, + { + "epoch": 2.9905437352245863, + "grad_norm": 2.7891685962677, + "learning_rate": 2.5436656678648274e-06, + "loss": 0.42, + "step": 6325 + }, + { + "epoch": 2.991016548463357, + "grad_norm": 3.022341728210449, + "learning_rate": 2.5430419346982367e-06, + "loss": 0.4739, + "step": 6326 + }, + { + "epoch": 2.9914893617021274, + "grad_norm": 3.395775556564331, + "learning_rate": 2.542418198851614e-06, + "loss": 0.4822, + "step": 6327 + }, + { + "epoch": 2.9919621749408982, + "grad_norm": 3.0200490951538086, + "learning_rate": 2.541794460363795e-06, + "loss": 0.4755, + "step": 6328 + }, + { + "epoch": 2.992434988179669, + "grad_norm": 3.302020311355591, + "learning_rate": 2.541170719273619e-06, + "loss": 0.4603, + "step": 6329 + }, + { + "epoch": 2.99290780141844, + "grad_norm": 2.5985910892486572, + "learning_rate": 2.5405469756199226e-06, + "loss": 0.4475, + "step": 6330 + }, + { + "epoch": 2.9933806146572106, + "grad_norm": 2.9413928985595703, + "learning_rate": 2.5399232294415434e-06, + "loss": 0.4695, + "step": 6331 + }, + { + "epoch": 2.993853427895981, + "grad_norm": 2.942777156829834, + "learning_rate": 2.53929948077732e-06, + "loss": 0.4462, + "step": 6332 + }, + { + "epoch": 2.9943262411347518, + "grad_norm": 2.971120595932007, + "learning_rate": 2.53867572966609e-06, + "loss": 0.4546, + "step": 6333 + }, + { + "epoch": 2.9947990543735226, + "grad_norm": 2.8248138427734375, + "learning_rate": 2.5380519761466927e-06, + "loss": 0.453, + "step": 6334 + }, + { + "epoch": 2.995271867612293, + "grad_norm": 3.0819008350372314, + "learning_rate": 2.5374282202579647e-06, + "loss": 0.4774, + "step": 6335 + }, + { + "epoch": 2.9957446808510637, + "grad_norm": 2.742570161819458, + "learning_rate": 2.5368044620387466e-06, + "loss": 0.5059, + "step": 6336 + }, + { + "epoch": 2.9962174940898345, + "grad_norm": 2.9087419509887695, + "learning_rate": 2.5361807015278757e-06, + "loss": 0.3606, + "step": 6337 + }, + { + "epoch": 2.9966903073286053, + "grad_norm": 2.6887354850769043, + "learning_rate": 2.5355569387641908e-06, + "loss": 0.4247, + "step": 6338 + }, + { + "epoch": 2.997163120567376, + "grad_norm": 2.8516008853912354, + "learning_rate": 2.534933173786531e-06, + "loss": 0.4502, + "step": 6339 + }, + { + "epoch": 2.9976359338061465, + "grad_norm": 2.4463164806365967, + "learning_rate": 2.5343094066337366e-06, + "loss": 0.3883, + "step": 6340 + }, + { + 
"epoch": 2.9981087470449173, + "grad_norm": 2.87025785446167, + "learning_rate": 2.533685637344645e-06, + "loss": 0.4534, + "step": 6341 + }, + { + "epoch": 2.998581560283688, + "grad_norm": 3.0706169605255127, + "learning_rate": 2.5330618659580967e-06, + "loss": 0.5426, + "step": 6342 + }, + { + "epoch": 2.9990543735224584, + "grad_norm": 2.7185773849487305, + "learning_rate": 2.532438092512931e-06, + "loss": 0.497, + "step": 6343 + }, + { + "epoch": 2.999527186761229, + "grad_norm": 2.840207815170288, + "learning_rate": 2.531814317047988e-06, + "loss": 0.4073, + "step": 6344 + }, + { + "epoch": 3.0, + "grad_norm": 3.1592655181884766, + "learning_rate": 2.5311905396021063e-06, + "loss": 0.4728, + "step": 6345 + }, + { + "epoch": 3.000472813238771, + "grad_norm": 2.190042495727539, + "learning_rate": 2.530566760214127e-06, + "loss": 0.3588, + "step": 6346 + }, + { + "epoch": 3.000945626477541, + "grad_norm": 2.749516248703003, + "learning_rate": 2.5299429789228898e-06, + "loss": 0.3495, + "step": 6347 + }, + { + "epoch": 3.001418439716312, + "grad_norm": 2.6181938648223877, + "learning_rate": 2.5293191957672335e-06, + "loss": 0.3611, + "step": 6348 + }, + { + "epoch": 3.0018912529550827, + "grad_norm": 2.7235212326049805, + "learning_rate": 2.528695410786e-06, + "loss": 0.4173, + "step": 6349 + }, + { + "epoch": 3.0023640661938535, + "grad_norm": 2.5408031940460205, + "learning_rate": 2.528071624018029e-06, + "loss": 0.3651, + "step": 6350 + }, + { + "epoch": 3.002836879432624, + "grad_norm": 2.7824409008026123, + "learning_rate": 2.5274478355021615e-06, + "loss": 0.378, + "step": 6351 + }, + { + "epoch": 3.0033096926713947, + "grad_norm": 2.7671427726745605, + "learning_rate": 2.526824045277238e-06, + "loss": 0.446, + "step": 6352 + }, + { + "epoch": 3.0037825059101655, + "grad_norm": 2.6746346950531006, + "learning_rate": 2.526200253382098e-06, + "loss": 0.3831, + "step": 6353 + }, + { + "epoch": 3.0042553191489363, + "grad_norm": 2.437439441680908, + "learning_rate": 2.525576459855583e-06, + "loss": 0.352, + "step": 6354 + }, + { + "epoch": 3.0047281323877066, + "grad_norm": 2.7632546424865723, + "learning_rate": 2.5249526647365343e-06, + "loss": 0.4636, + "step": 6355 + }, + { + "epoch": 3.0052009456264774, + "grad_norm": 2.681955099105835, + "learning_rate": 2.524328868063793e-06, + "loss": 0.3978, + "step": 6356 + }, + { + "epoch": 3.0056737588652482, + "grad_norm": 2.9575345516204834, + "learning_rate": 2.523705069876199e-06, + "loss": 0.3803, + "step": 6357 + }, + { + "epoch": 3.006146572104019, + "grad_norm": 2.7368216514587402, + "learning_rate": 2.523081270212594e-06, + "loss": 0.3968, + "step": 6358 + }, + { + "epoch": 3.0066193853427894, + "grad_norm": 2.637592077255249, + "learning_rate": 2.522457469111821e-06, + "loss": 0.3629, + "step": 6359 + }, + { + "epoch": 3.00709219858156, + "grad_norm": 2.579331398010254, + "learning_rate": 2.5218336666127187e-06, + "loss": 0.4044, + "step": 6360 + }, + { + "epoch": 3.007565011820331, + "grad_norm": 3.014544725418091, + "learning_rate": 2.5212098627541296e-06, + "loss": 0.3518, + "step": 6361 + }, + { + "epoch": 3.0080378250591018, + "grad_norm": 2.5261058807373047, + "learning_rate": 2.520586057574896e-06, + "loss": 0.3763, + "step": 6362 + }, + { + "epoch": 3.008510638297872, + "grad_norm": 3.234910249710083, + "learning_rate": 2.519962251113858e-06, + "loss": 0.3691, + "step": 6363 + }, + { + "epoch": 3.008983451536643, + "grad_norm": 3.2930967807769775, + "learning_rate": 2.519338443409859e-06, + "loss": 0.4363, + "step": 6364 + 
}, + { + "epoch": 3.0094562647754137, + "grad_norm": 2.807910442352295, + "learning_rate": 2.51871463450174e-06, + "loss": 0.3984, + "step": 6365 + }, + { + "epoch": 3.0099290780141845, + "grad_norm": 3.1555075645446777, + "learning_rate": 2.518090824428342e-06, + "loss": 0.4006, + "step": 6366 + }, + { + "epoch": 3.010401891252955, + "grad_norm": 3.1793272495269775, + "learning_rate": 2.5174670132285084e-06, + "loss": 0.4966, + "step": 6367 + }, + { + "epoch": 3.0108747044917257, + "grad_norm": 2.7007548809051514, + "learning_rate": 2.5168432009410805e-06, + "loss": 0.3755, + "step": 6368 + }, + { + "epoch": 3.0113475177304965, + "grad_norm": 2.914792537689209, + "learning_rate": 2.5162193876048995e-06, + "loss": 0.39, + "step": 6369 + }, + { + "epoch": 3.0118203309692673, + "grad_norm": 2.935516119003296, + "learning_rate": 2.5155955732588093e-06, + "loss": 0.4045, + "step": 6370 + }, + { + "epoch": 3.0122931442080376, + "grad_norm": 2.8817989826202393, + "learning_rate": 2.5149717579416503e-06, + "loss": 0.3751, + "step": 6371 + }, + { + "epoch": 3.0127659574468084, + "grad_norm": 2.9181740283966064, + "learning_rate": 2.514347941692266e-06, + "loss": 0.3689, + "step": 6372 + }, + { + "epoch": 3.013238770685579, + "grad_norm": 3.052060604095459, + "learning_rate": 2.5137241245494982e-06, + "loss": 0.3874, + "step": 6373 + }, + { + "epoch": 3.01371158392435, + "grad_norm": 2.6931657791137695, + "learning_rate": 2.513100306552189e-06, + "loss": 0.3673, + "step": 6374 + }, + { + "epoch": 3.0141843971631204, + "grad_norm": 2.3422248363494873, + "learning_rate": 2.5124764877391824e-06, + "loss": 0.3753, + "step": 6375 + }, + { + "epoch": 3.014657210401891, + "grad_norm": 2.5826265811920166, + "learning_rate": 2.5118526681493186e-06, + "loss": 0.3661, + "step": 6376 + }, + { + "epoch": 3.015130023640662, + "grad_norm": 2.7407493591308594, + "learning_rate": 2.5112288478214415e-06, + "loss": 0.3887, + "step": 6377 + }, + { + "epoch": 3.0156028368794328, + "grad_norm": 2.7378315925598145, + "learning_rate": 2.510605026794393e-06, + "loss": 0.3623, + "step": 6378 + }, + { + "epoch": 3.016075650118203, + "grad_norm": 2.59541654586792, + "learning_rate": 2.5099812051070167e-06, + "loss": 0.3804, + "step": 6379 + }, + { + "epoch": 3.016548463356974, + "grad_norm": 3.1022770404815674, + "learning_rate": 2.509357382798154e-06, + "loss": 0.4092, + "step": 6380 + }, + { + "epoch": 3.0170212765957447, + "grad_norm": 2.521545648574829, + "learning_rate": 2.5087335599066476e-06, + "loss": 0.3509, + "step": 6381 + }, + { + "epoch": 3.0174940898345155, + "grad_norm": 2.949395179748535, + "learning_rate": 2.5081097364713407e-06, + "loss": 0.387, + "step": 6382 + }, + { + "epoch": 3.017966903073286, + "grad_norm": 2.4806487560272217, + "learning_rate": 2.507485912531077e-06, + "loss": 0.4004, + "step": 6383 + }, + { + "epoch": 3.0184397163120567, + "grad_norm": 2.6480894088745117, + "learning_rate": 2.506862088124698e-06, + "loss": 0.3366, + "step": 6384 + }, + { + "epoch": 3.0189125295508275, + "grad_norm": 2.62559175491333, + "learning_rate": 2.5062382632910463e-06, + "loss": 0.3676, + "step": 6385 + }, + { + "epoch": 3.0193853427895982, + "grad_norm": 2.694767951965332, + "learning_rate": 2.5056144380689657e-06, + "loss": 0.3438, + "step": 6386 + }, + { + "epoch": 3.0198581560283686, + "grad_norm": 2.808107614517212, + "learning_rate": 2.504990612497299e-06, + "loss": 0.3831, + "step": 6387 + }, + { + "epoch": 3.0203309692671394, + "grad_norm": 3.2392303943634033, + "learning_rate": 2.504366786614888e-06, 
+ "loss": 0.3493, + "step": 6388 + }, + { + "epoch": 3.02080378250591, + "grad_norm": 2.6899030208587646, + "learning_rate": 2.5037429604605774e-06, + "loss": 0.3998, + "step": 6389 + }, + { + "epoch": 3.021276595744681, + "grad_norm": 2.5622799396514893, + "learning_rate": 2.503119134073208e-06, + "loss": 0.3443, + "step": 6390 + }, + { + "epoch": 3.0217494089834513, + "grad_norm": 2.716832399368286, + "learning_rate": 2.502495307491625e-06, + "loss": 0.4465, + "step": 6391 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 2.8117692470550537, + "learning_rate": 2.501871480754669e-06, + "loss": 0.3513, + "step": 6392 + }, + { + "epoch": 3.022695035460993, + "grad_norm": 3.1260762214660645, + "learning_rate": 2.501247653901185e-06, + "loss": 0.4336, + "step": 6393 + }, + { + "epoch": 3.0231678486997637, + "grad_norm": 2.5076897144317627, + "learning_rate": 2.5006238269700137e-06, + "loss": 0.3437, + "step": 6394 + }, + { + "epoch": 3.023640661938534, + "grad_norm": 2.781937837600708, + "learning_rate": 2.5e-06, + "loss": 0.3583, + "step": 6395 + }, + { + "epoch": 3.024113475177305, + "grad_norm": 3.084050178527832, + "learning_rate": 2.499376173029987e-06, + "loss": 0.3785, + "step": 6396 + }, + { + "epoch": 3.0245862884160757, + "grad_norm": 3.2292473316192627, + "learning_rate": 2.498752346098816e-06, + "loss": 0.3858, + "step": 6397 + }, + { + "epoch": 3.0250591016548465, + "grad_norm": 2.738614797592163, + "learning_rate": 2.498128519245332e-06, + "loss": 0.4166, + "step": 6398 + }, + { + "epoch": 3.025531914893617, + "grad_norm": 2.940103054046631, + "learning_rate": 2.4975046925083764e-06, + "loss": 0.4117, + "step": 6399 + }, + { + "epoch": 3.0260047281323876, + "grad_norm": 2.5177032947540283, + "learning_rate": 2.4968808659267927e-06, + "loss": 0.3704, + "step": 6400 + }, + { + "epoch": 3.0264775413711584, + "grad_norm": 2.6969990730285645, + "learning_rate": 2.4962570395394243e-06, + "loss": 0.3721, + "step": 6401 + }, + { + "epoch": 3.0269503546099292, + "grad_norm": 2.9696028232574463, + "learning_rate": 2.495633213385112e-06, + "loss": 0.3934, + "step": 6402 + }, + { + "epoch": 3.0274231678486996, + "grad_norm": 3.4032552242279053, + "learning_rate": 2.495009387502702e-06, + "loss": 0.3877, + "step": 6403 + }, + { + "epoch": 3.0278959810874704, + "grad_norm": 2.6801865100860596, + "learning_rate": 2.4943855619310343e-06, + "loss": 0.3421, + "step": 6404 + }, + { + "epoch": 3.028368794326241, + "grad_norm": 2.827056884765625, + "learning_rate": 2.493761736708954e-06, + "loss": 0.3791, + "step": 6405 + }, + { + "epoch": 3.028841607565012, + "grad_norm": 2.6393566131591797, + "learning_rate": 2.4931379118753034e-06, + "loss": 0.3729, + "step": 6406 + }, + { + "epoch": 3.0293144208037823, + "grad_norm": 2.833519458770752, + "learning_rate": 2.4925140874689236e-06, + "loss": 0.3836, + "step": 6407 + }, + { + "epoch": 3.029787234042553, + "grad_norm": 2.8852169513702393, + "learning_rate": 2.4918902635286597e-06, + "loss": 0.4307, + "step": 6408 + }, + { + "epoch": 3.030260047281324, + "grad_norm": 2.7166404724121094, + "learning_rate": 2.491266440093354e-06, + "loss": 0.3825, + "step": 6409 + }, + { + "epoch": 3.0307328605200947, + "grad_norm": 2.5828018188476562, + "learning_rate": 2.4906426172018474e-06, + "loss": 0.3579, + "step": 6410 + }, + { + "epoch": 3.031205673758865, + "grad_norm": 2.915632724761963, + "learning_rate": 2.490018794892985e-06, + "loss": 0.4099, + "step": 6411 + }, + { + "epoch": 3.031678486997636, + "grad_norm": 2.7117249965667725, + "learning_rate": 
2.489394973205607e-06, + "loss": 0.4063, + "step": 6412 + }, + { + "epoch": 3.0321513002364067, + "grad_norm": 2.3989102840423584, + "learning_rate": 2.488771152178559e-06, + "loss": 0.3377, + "step": 6413 + }, + { + "epoch": 3.0326241134751775, + "grad_norm": 2.6560115814208984, + "learning_rate": 2.488147331850682e-06, + "loss": 0.4072, + "step": 6414 + }, + { + "epoch": 3.033096926713948, + "grad_norm": 2.9466328620910645, + "learning_rate": 2.4875235122608184e-06, + "loss": 0.3559, + "step": 6415 + }, + { + "epoch": 3.0335697399527186, + "grad_norm": 2.765348196029663, + "learning_rate": 2.4868996934478114e-06, + "loss": 0.336, + "step": 6416 + }, + { + "epoch": 3.0340425531914894, + "grad_norm": 2.6021807193756104, + "learning_rate": 2.4862758754505017e-06, + "loss": 0.3861, + "step": 6417 + }, + { + "epoch": 3.03451536643026, + "grad_norm": 2.7293684482574463, + "learning_rate": 2.4856520583077344e-06, + "loss": 0.3926, + "step": 6418 + }, + { + "epoch": 3.0349881796690306, + "grad_norm": 2.9704763889312744, + "learning_rate": 2.485028242058351e-06, + "loss": 0.4303, + "step": 6419 + }, + { + "epoch": 3.0354609929078014, + "grad_norm": 3.385713815689087, + "learning_rate": 2.484404426741191e-06, + "loss": 0.44, + "step": 6420 + }, + { + "epoch": 3.035933806146572, + "grad_norm": 3.177983045578003, + "learning_rate": 2.4837806123951013e-06, + "loss": 0.4256, + "step": 6421 + }, + { + "epoch": 3.036406619385343, + "grad_norm": 2.6287200450897217, + "learning_rate": 2.4831567990589203e-06, + "loss": 0.3764, + "step": 6422 + }, + { + "epoch": 3.0368794326241133, + "grad_norm": 2.81823992729187, + "learning_rate": 2.4825329867714924e-06, + "loss": 0.3645, + "step": 6423 + }, + { + "epoch": 3.037352245862884, + "grad_norm": 3.1826934814453125, + "learning_rate": 2.4819091755716586e-06, + "loss": 0.3666, + "step": 6424 + }, + { + "epoch": 3.037825059101655, + "grad_norm": 3.0880346298217773, + "learning_rate": 2.481285365498261e-06, + "loss": 0.4339, + "step": 6425 + }, + { + "epoch": 3.0382978723404257, + "grad_norm": 3.1764965057373047, + "learning_rate": 2.480661556590142e-06, + "loss": 0.4804, + "step": 6426 + }, + { + "epoch": 3.038770685579196, + "grad_norm": 2.89469313621521, + "learning_rate": 2.480037748886142e-06, + "loss": 0.3875, + "step": 6427 + }, + { + "epoch": 3.039243498817967, + "grad_norm": 2.6043636798858643, + "learning_rate": 2.479413942425105e-06, + "loss": 0.3859, + "step": 6428 + }, + { + "epoch": 3.0397163120567376, + "grad_norm": 2.6570727825164795, + "learning_rate": 2.4787901372458712e-06, + "loss": 0.3508, + "step": 6429 + }, + { + "epoch": 3.0401891252955084, + "grad_norm": 2.914050579071045, + "learning_rate": 2.4781663333872825e-06, + "loss": 0.3904, + "step": 6430 + }, + { + "epoch": 3.040661938534279, + "grad_norm": 2.595606803894043, + "learning_rate": 2.47754253088818e-06, + "loss": 0.3753, + "step": 6431 + }, + { + "epoch": 3.0411347517730496, + "grad_norm": 2.68186616897583, + "learning_rate": 2.4769187297874065e-06, + "loss": 0.3545, + "step": 6432 + }, + { + "epoch": 3.0416075650118204, + "grad_norm": 2.956507921218872, + "learning_rate": 2.476294930123802e-06, + "loss": 0.3778, + "step": 6433 + }, + { + "epoch": 3.042080378250591, + "grad_norm": 2.8327226638793945, + "learning_rate": 2.475671131936209e-06, + "loss": 0.3205, + "step": 6434 + }, + { + "epoch": 3.0425531914893615, + "grad_norm": 2.594348430633545, + "learning_rate": 2.475047335263466e-06, + "loss": 0.3859, + "step": 6435 + }, + { + "epoch": 3.0430260047281323, + "grad_norm": 
3.5030717849731445, + "learning_rate": 2.4744235401444177e-06, + "loss": 0.3611, + "step": 6436 + }, + { + "epoch": 3.043498817966903, + "grad_norm": 2.8478317260742188, + "learning_rate": 2.4737997466179034e-06, + "loss": 0.3927, + "step": 6437 + }, + { + "epoch": 3.043971631205674, + "grad_norm": 2.677827835083008, + "learning_rate": 2.4731759547227627e-06, + "loss": 0.3784, + "step": 6438 + }, + { + "epoch": 3.0444444444444443, + "grad_norm": 3.0059866905212402, + "learning_rate": 2.4725521644978393e-06, + "loss": 0.4279, + "step": 6439 + }, + { + "epoch": 3.044917257683215, + "grad_norm": 3.012500047683716, + "learning_rate": 2.4719283759819713e-06, + "loss": 0.4007, + "step": 6440 + }, + { + "epoch": 3.045390070921986, + "grad_norm": 2.758204936981201, + "learning_rate": 2.4713045892140007e-06, + "loss": 0.3668, + "step": 6441 + }, + { + "epoch": 3.0458628841607567, + "grad_norm": 2.9551615715026855, + "learning_rate": 2.4706808042327678e-06, + "loss": 0.3524, + "step": 6442 + }, + { + "epoch": 3.046335697399527, + "grad_norm": 2.8639965057373047, + "learning_rate": 2.4700570210771115e-06, + "loss": 0.3886, + "step": 6443 + }, + { + "epoch": 3.046808510638298, + "grad_norm": 2.718219757080078, + "learning_rate": 2.4694332397858738e-06, + "loss": 0.3693, + "step": 6444 + }, + { + "epoch": 3.0472813238770686, + "grad_norm": 3.050135612487793, + "learning_rate": 2.4688094603978933e-06, + "loss": 0.3979, + "step": 6445 + }, + { + "epoch": 3.0477541371158394, + "grad_norm": 2.786186456680298, + "learning_rate": 2.468185682952013e-06, + "loss": 0.3809, + "step": 6446 + }, + { + "epoch": 3.0482269503546098, + "grad_norm": 2.6462252140045166, + "learning_rate": 2.4675619074870697e-06, + "loss": 0.3746, + "step": 6447 + }, + { + "epoch": 3.0486997635933806, + "grad_norm": 2.984783887863159, + "learning_rate": 2.4669381340419037e-06, + "loss": 0.4092, + "step": 6448 + }, + { + "epoch": 3.0491725768321514, + "grad_norm": 2.936380624771118, + "learning_rate": 2.466314362655356e-06, + "loss": 0.4335, + "step": 6449 + }, + { + "epoch": 3.049645390070922, + "grad_norm": 2.730738639831543, + "learning_rate": 2.465690593366264e-06, + "loss": 0.364, + "step": 6450 + }, + { + "epoch": 3.0501182033096925, + "grad_norm": 2.7273590564727783, + "learning_rate": 2.4650668262134693e-06, + "loss": 0.3905, + "step": 6451 + }, + { + "epoch": 3.0505910165484633, + "grad_norm": 2.9588208198547363, + "learning_rate": 2.4644430612358105e-06, + "loss": 0.3936, + "step": 6452 + }, + { + "epoch": 3.051063829787234, + "grad_norm": 2.8721611499786377, + "learning_rate": 2.4638192984721247e-06, + "loss": 0.4279, + "step": 6453 + }, + { + "epoch": 3.051536643026005, + "grad_norm": 3.7179651260375977, + "learning_rate": 2.463195537961254e-06, + "loss": 0.427, + "step": 6454 + }, + { + "epoch": 3.0520094562647753, + "grad_norm": 2.651731491088867, + "learning_rate": 2.4625717797420353e-06, + "loss": 0.3471, + "step": 6455 + }, + { + "epoch": 3.052482269503546, + "grad_norm": 3.898737668991089, + "learning_rate": 2.4619480238533085e-06, + "loss": 0.4574, + "step": 6456 + }, + { + "epoch": 3.052955082742317, + "grad_norm": 2.916252374649048, + "learning_rate": 2.4613242703339108e-06, + "loss": 0.3622, + "step": 6457 + }, + { + "epoch": 3.0534278959810877, + "grad_norm": 3.122565507888794, + "learning_rate": 2.4607005192226806e-06, + "loss": 0.3954, + "step": 6458 + }, + { + "epoch": 3.053900709219858, + "grad_norm": 3.2377424240112305, + "learning_rate": 2.4600767705584575e-06, + "loss": 0.4082, + "step": 6459 + }, + { + 
"epoch": 3.054373522458629, + "grad_norm": 2.941102981567383, + "learning_rate": 2.459453024380079e-06, + "loss": 0.4324, + "step": 6460 + }, + { + "epoch": 3.0548463356973996, + "grad_norm": 2.964313507080078, + "learning_rate": 2.4588292807263816e-06, + "loss": 0.3037, + "step": 6461 + }, + { + "epoch": 3.0553191489361704, + "grad_norm": 2.824669599533081, + "learning_rate": 2.4582055396362055e-06, + "loss": 0.4076, + "step": 6462 + }, + { + "epoch": 3.0557919621749408, + "grad_norm": 2.7739884853363037, + "learning_rate": 2.457581801148387e-06, + "loss": 0.3615, + "step": 6463 + }, + { + "epoch": 3.0562647754137116, + "grad_norm": 3.2974464893341064, + "learning_rate": 2.456958065301764e-06, + "loss": 0.426, + "step": 6464 + }, + { + "epoch": 3.0567375886524824, + "grad_norm": 3.0801217555999756, + "learning_rate": 2.456334332135174e-06, + "loss": 0.3737, + "step": 6465 + }, + { + "epoch": 3.057210401891253, + "grad_norm": 2.788851022720337, + "learning_rate": 2.455710601687452e-06, + "loss": 0.4367, + "step": 6466 + }, + { + "epoch": 3.0576832151300235, + "grad_norm": 2.8078136444091797, + "learning_rate": 2.4550868739974378e-06, + "loss": 0.3796, + "step": 6467 + }, + { + "epoch": 3.0581560283687943, + "grad_norm": 2.9871349334716797, + "learning_rate": 2.4544631491039657e-06, + "loss": 0.3869, + "step": 6468 + }, + { + "epoch": 3.058628841607565, + "grad_norm": 2.9170174598693848, + "learning_rate": 2.453839427045875e-06, + "loss": 0.4591, + "step": 6469 + }, + { + "epoch": 3.059101654846336, + "grad_norm": 2.7316131591796875, + "learning_rate": 2.4532157078620013e-06, + "loss": 0.3723, + "step": 6470 + }, + { + "epoch": 3.0595744680851062, + "grad_norm": 3.047921657562256, + "learning_rate": 2.4525919915911793e-06, + "loss": 0.3804, + "step": 6471 + }, + { + "epoch": 3.060047281323877, + "grad_norm": 3.047934055328369, + "learning_rate": 2.4519682782722465e-06, + "loss": 0.3949, + "step": 6472 + }, + { + "epoch": 3.060520094562648, + "grad_norm": 2.4911186695098877, + "learning_rate": 2.4513445679440374e-06, + "loss": 0.3629, + "step": 6473 + }, + { + "epoch": 3.0609929078014186, + "grad_norm": 2.5353519916534424, + "learning_rate": 2.4507208606453895e-06, + "loss": 0.3417, + "step": 6474 + }, + { + "epoch": 3.061465721040189, + "grad_norm": 2.474622964859009, + "learning_rate": 2.4500971564151384e-06, + "loss": 0.3468, + "step": 6475 + }, + { + "epoch": 3.06193853427896, + "grad_norm": 2.7016963958740234, + "learning_rate": 2.4494734552921166e-06, + "loss": 0.3872, + "step": 6476 + }, + { + "epoch": 3.0624113475177306, + "grad_norm": 2.912144184112549, + "learning_rate": 2.4488497573151625e-06, + "loss": 0.3727, + "step": 6477 + }, + { + "epoch": 3.0628841607565014, + "grad_norm": 2.8234877586364746, + "learning_rate": 2.4482260625231093e-06, + "loss": 0.3472, + "step": 6478 + }, + { + "epoch": 3.0633569739952717, + "grad_norm": 2.6554179191589355, + "learning_rate": 2.447602370954793e-06, + "loss": 0.343, + "step": 6479 + }, + { + "epoch": 3.0638297872340425, + "grad_norm": 2.666419744491577, + "learning_rate": 2.446978682649047e-06, + "loss": 0.3932, + "step": 6480 + }, + { + "epoch": 3.0643026004728133, + "grad_norm": 2.968574285507202, + "learning_rate": 2.446354997644705e-06, + "loss": 0.4418, + "step": 6481 + }, + { + "epoch": 3.064775413711584, + "grad_norm": 2.692253589630127, + "learning_rate": 2.4457313159806028e-06, + "loss": 0.3141, + "step": 6482 + }, + { + "epoch": 3.0652482269503545, + "grad_norm": 2.5857295989990234, + "learning_rate": 2.445107637695574e-06, + 
"loss": 0.3392, + "step": 6483 + }, + { + "epoch": 3.0657210401891253, + "grad_norm": 3.2332825660705566, + "learning_rate": 2.4444839628284504e-06, + "loss": 0.4694, + "step": 6484 + }, + { + "epoch": 3.066193853427896, + "grad_norm": 2.7391014099121094, + "learning_rate": 2.4438602914180684e-06, + "loss": 0.3966, + "step": 6485 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 2.7882139682769775, + "learning_rate": 2.4432366235032593e-06, + "loss": 0.3552, + "step": 6486 + }, + { + "epoch": 3.0671394799054372, + "grad_norm": 2.8907811641693115, + "learning_rate": 2.4426129591228573e-06, + "loss": 0.4478, + "step": 6487 + }, + { + "epoch": 3.067612293144208, + "grad_norm": 2.878929853439331, + "learning_rate": 2.4419892983156947e-06, + "loss": 0.3457, + "step": 6488 + }, + { + "epoch": 3.068085106382979, + "grad_norm": 2.7087442874908447, + "learning_rate": 2.441365641120603e-06, + "loss": 0.3491, + "step": 6489 + }, + { + "epoch": 3.0685579196217496, + "grad_norm": 3.2330431938171387, + "learning_rate": 2.4407419875764167e-06, + "loss": 0.3901, + "step": 6490 + }, + { + "epoch": 3.06903073286052, + "grad_norm": 3.0529370307922363, + "learning_rate": 2.440118337721966e-06, + "loss": 0.4059, + "step": 6491 + }, + { + "epoch": 3.0695035460992908, + "grad_norm": 2.4786794185638428, + "learning_rate": 2.439494691596085e-06, + "loss": 0.3153, + "step": 6492 + }, + { + "epoch": 3.0699763593380616, + "grad_norm": 2.956310510635376, + "learning_rate": 2.438871049237604e-06, + "loss": 0.3973, + "step": 6493 + }, + { + "epoch": 3.0704491725768324, + "grad_norm": 3.0816991329193115, + "learning_rate": 2.4382474106853543e-06, + "loss": 0.388, + "step": 6494 + }, + { + "epoch": 3.0709219858156027, + "grad_norm": 2.6103477478027344, + "learning_rate": 2.4376237759781686e-06, + "loss": 0.3656, + "step": 6495 + }, + { + "epoch": 3.0713947990543735, + "grad_norm": 2.974076271057129, + "learning_rate": 2.437000145154875e-06, + "loss": 0.3246, + "step": 6496 + }, + { + "epoch": 3.0718676122931443, + "grad_norm": 2.633605718612671, + "learning_rate": 2.4363765182543075e-06, + "loss": 0.3556, + "step": 6497 + }, + { + "epoch": 3.072340425531915, + "grad_norm": 2.49161434173584, + "learning_rate": 2.4357528953152953e-06, + "loss": 0.3506, + "step": 6498 + }, + { + "epoch": 3.0728132387706855, + "grad_norm": 2.6435935497283936, + "learning_rate": 2.4351292763766676e-06, + "loss": 0.3652, + "step": 6499 + }, + { + "epoch": 3.0732860520094563, + "grad_norm": 2.9710617065429688, + "learning_rate": 2.4345056614772563e-06, + "loss": 0.3713, + "step": 6500 + }, + { + "epoch": 3.073758865248227, + "grad_norm": 2.6947052478790283, + "learning_rate": 2.43388205065589e-06, + "loss": 0.378, + "step": 6501 + }, + { + "epoch": 3.0742316784869974, + "grad_norm": 2.9686238765716553, + "learning_rate": 2.433258443951398e-06, + "loss": 0.3936, + "step": 6502 + }, + { + "epoch": 3.074704491725768, + "grad_norm": 2.6008691787719727, + "learning_rate": 2.432634841402611e-06, + "loss": 0.3709, + "step": 6503 + }, + { + "epoch": 3.075177304964539, + "grad_norm": 2.595116376876831, + "learning_rate": 2.4320112430483563e-06, + "loss": 0.3884, + "step": 6504 + }, + { + "epoch": 3.07565011820331, + "grad_norm": 2.685241460800171, + "learning_rate": 2.431387648927464e-06, + "loss": 0.3751, + "step": 6505 + }, + { + "epoch": 3.0761229314420806, + "grad_norm": 2.8863797187805176, + "learning_rate": 2.430764059078762e-06, + "loss": 0.3765, + "step": 6506 + }, + { + "epoch": 3.076595744680851, + "grad_norm": 3.020766019821167, + 
"learning_rate": 2.430140473541077e-06, + "loss": 0.362, + "step": 6507 + }, + { + "epoch": 3.0770685579196217, + "grad_norm": 2.9521167278289795, + "learning_rate": 2.42951689235324e-06, + "loss": 0.41, + "step": 6508 + }, + { + "epoch": 3.0775413711583925, + "grad_norm": 2.5844924449920654, + "learning_rate": 2.4288933155540757e-06, + "loss": 0.3258, + "step": 6509 + }, + { + "epoch": 3.078014184397163, + "grad_norm": 3.052661657333374, + "learning_rate": 2.4282697431824138e-06, + "loss": 0.363, + "step": 6510 + }, + { + "epoch": 3.0784869976359337, + "grad_norm": 3.109342575073242, + "learning_rate": 2.427646175277081e-06, + "loss": 0.4105, + "step": 6511 + }, + { + "epoch": 3.0789598108747045, + "grad_norm": 3.3141326904296875, + "learning_rate": 2.427022611876903e-06, + "loss": 0.405, + "step": 6512 + }, + { + "epoch": 3.0794326241134753, + "grad_norm": 3.054673194885254, + "learning_rate": 2.426399053020707e-06, + "loss": 0.3532, + "step": 6513 + }, + { + "epoch": 3.079905437352246, + "grad_norm": 2.823489189147949, + "learning_rate": 2.425775498747318e-06, + "loss": 0.3762, + "step": 6514 + }, + { + "epoch": 3.0803782505910164, + "grad_norm": 2.6739792823791504, + "learning_rate": 2.425151949095565e-06, + "loss": 0.4044, + "step": 6515 + }, + { + "epoch": 3.0808510638297872, + "grad_norm": 2.7313177585601807, + "learning_rate": 2.4245284041042714e-06, + "loss": 0.3136, + "step": 6516 + }, + { + "epoch": 3.081323877068558, + "grad_norm": 3.1661181449890137, + "learning_rate": 2.4239048638122624e-06, + "loss": 0.44, + "step": 6517 + }, + { + "epoch": 3.0817966903073284, + "grad_norm": 3.326542377471924, + "learning_rate": 2.4232813282583647e-06, + "loss": 0.3798, + "step": 6518 + }, + { + "epoch": 3.082269503546099, + "grad_norm": 3.0194952487945557, + "learning_rate": 2.422657797481402e-06, + "loss": 0.423, + "step": 6519 + }, + { + "epoch": 3.08274231678487, + "grad_norm": 2.6704318523406982, + "learning_rate": 2.4220342715201995e-06, + "loss": 0.41, + "step": 6520 + }, + { + "epoch": 3.083215130023641, + "grad_norm": 3.057990312576294, + "learning_rate": 2.421410750413581e-06, + "loss": 0.4096, + "step": 6521 + }, + { + "epoch": 3.083687943262411, + "grad_norm": 2.6242079734802246, + "learning_rate": 2.4207872342003693e-06, + "loss": 0.3673, + "step": 6522 + }, + { + "epoch": 3.084160756501182, + "grad_norm": 2.933910846710205, + "learning_rate": 2.4201637229193904e-06, + "loss": 0.4018, + "step": 6523 + }, + { + "epoch": 3.0846335697399527, + "grad_norm": 2.6973681449890137, + "learning_rate": 2.4195402166094657e-06, + "loss": 0.3533, + "step": 6524 + }, + { + "epoch": 3.0851063829787235, + "grad_norm": 3.096013307571411, + "learning_rate": 2.4189167153094194e-06, + "loss": 0.3872, + "step": 6525 + }, + { + "epoch": 3.085579196217494, + "grad_norm": 3.0707414150238037, + "learning_rate": 2.4182932190580737e-06, + "loss": 0.3775, + "step": 6526 + }, + { + "epoch": 3.0860520094562647, + "grad_norm": 2.873190402984619, + "learning_rate": 2.417669727894251e-06, + "loss": 0.3144, + "step": 6527 + }, + { + "epoch": 3.0865248226950355, + "grad_norm": 2.316431999206543, + "learning_rate": 2.4170462418567732e-06, + "loss": 0.3238, + "step": 6528 + }, + { + "epoch": 3.0869976359338063, + "grad_norm": 2.3672494888305664, + "learning_rate": 2.4164227609844626e-06, + "loss": 0.3585, + "step": 6529 + }, + { + "epoch": 3.0874704491725766, + "grad_norm": 2.904538154602051, + "learning_rate": 2.415799285316139e-06, + "loss": 0.366, + "step": 6530 + }, + { + "epoch": 3.0879432624113474, + 
"grad_norm": 2.914602279663086, + "learning_rate": 2.415175814890626e-06, + "loss": 0.3793, + "step": 6531 + }, + { + "epoch": 3.088416075650118, + "grad_norm": 2.652005672454834, + "learning_rate": 2.4145523497467417e-06, + "loss": 0.362, + "step": 6532 + }, + { + "epoch": 3.088888888888889, + "grad_norm": 2.5137813091278076, + "learning_rate": 2.413928889923309e-06, + "loss": 0.2974, + "step": 6533 + }, + { + "epoch": 3.0893617021276594, + "grad_norm": 3.2166645526885986, + "learning_rate": 2.413305435459147e-06, + "loss": 0.4151, + "step": 6534 + }, + { + "epoch": 3.08983451536643, + "grad_norm": 3.0506820678710938, + "learning_rate": 2.412681986393075e-06, + "loss": 0.4223, + "step": 6535 + }, + { + "epoch": 3.090307328605201, + "grad_norm": 3.035275936126709, + "learning_rate": 2.412058542763913e-06, + "loss": 0.4841, + "step": 6536 + }, + { + "epoch": 3.0907801418439718, + "grad_norm": 3.3195009231567383, + "learning_rate": 2.4114351046104793e-06, + "loss": 0.4205, + "step": 6537 + }, + { + "epoch": 3.091252955082742, + "grad_norm": 2.8700361251831055, + "learning_rate": 2.410811671971594e-06, + "loss": 0.3704, + "step": 6538 + }, + { + "epoch": 3.091725768321513, + "grad_norm": 2.900595188140869, + "learning_rate": 2.410188244886075e-06, + "loss": 0.4184, + "step": 6539 + }, + { + "epoch": 3.0921985815602837, + "grad_norm": 2.88179349899292, + "learning_rate": 2.409564823392739e-06, + "loss": 0.4156, + "step": 6540 + }, + { + "epoch": 3.0926713947990545, + "grad_norm": 2.677568197250366, + "learning_rate": 2.408941407530406e-06, + "loss": 0.4084, + "step": 6541 + }, + { + "epoch": 3.093144208037825, + "grad_norm": 3.0236027240753174, + "learning_rate": 2.408317997337892e-06, + "loss": 0.4384, + "step": 6542 + }, + { + "epoch": 3.0936170212765957, + "grad_norm": 3.1708545684814453, + "learning_rate": 2.4076945928540143e-06, + "loss": 0.3876, + "step": 6543 + }, + { + "epoch": 3.0940898345153665, + "grad_norm": 3.248821973800659, + "learning_rate": 2.40707119411759e-06, + "loss": 0.3865, + "step": 6544 + }, + { + "epoch": 3.0945626477541373, + "grad_norm": 3.0961649417877197, + "learning_rate": 2.4064478011674334e-06, + "loss": 0.3982, + "step": 6545 + }, + { + "epoch": 3.0950354609929076, + "grad_norm": 3.1989805698394775, + "learning_rate": 2.4058244140423637e-06, + "loss": 0.4777, + "step": 6546 + }, + { + "epoch": 3.0955082742316784, + "grad_norm": 2.805640459060669, + "learning_rate": 2.4052010327811933e-06, + "loss": 0.3764, + "step": 6547 + }, + { + "epoch": 3.095981087470449, + "grad_norm": 2.7225050926208496, + "learning_rate": 2.40457765742274e-06, + "loss": 0.3286, + "step": 6548 + }, + { + "epoch": 3.09645390070922, + "grad_norm": 3.119915008544922, + "learning_rate": 2.4039542880058174e-06, + "loss": 0.4463, + "step": 6549 + }, + { + "epoch": 3.0969267139479904, + "grad_norm": 2.8503530025482178, + "learning_rate": 2.4033309245692403e-06, + "loss": 0.395, + "step": 6550 + }, + { + "epoch": 3.097399527186761, + "grad_norm": 2.947504758834839, + "learning_rate": 2.4027075671518225e-06, + "loss": 0.4024, + "step": 6551 + }, + { + "epoch": 3.097872340425532, + "grad_norm": 3.170905113220215, + "learning_rate": 2.402084215792377e-06, + "loss": 0.4302, + "step": 6552 + }, + { + "epoch": 3.0983451536643027, + "grad_norm": 2.910475492477417, + "learning_rate": 2.4014608705297195e-06, + "loss": 0.4037, + "step": 6553 + }, + { + "epoch": 3.098817966903073, + "grad_norm": 2.627511978149414, + "learning_rate": 2.400837531402661e-06, + "loss": 0.3972, + "step": 6554 + }, + { + 
"epoch": 3.099290780141844, + "grad_norm": 2.6485681533813477, + "learning_rate": 2.4002141984500133e-06, + "loss": 0.4044, + "step": 6555 + }, + { + "epoch": 3.0997635933806147, + "grad_norm": 2.930954694747925, + "learning_rate": 2.399590871710592e-06, + "loss": 0.4214, + "step": 6556 + }, + { + "epoch": 3.1002364066193855, + "grad_norm": 2.6014554500579834, + "learning_rate": 2.3989675512232063e-06, + "loss": 0.3493, + "step": 6557 + }, + { + "epoch": 3.100709219858156, + "grad_norm": 2.899001121520996, + "learning_rate": 2.398344237026667e-06, + "loss": 0.382, + "step": 6558 + }, + { + "epoch": 3.1011820330969266, + "grad_norm": 2.4698870182037354, + "learning_rate": 2.3977209291597876e-06, + "loss": 0.3558, + "step": 6559 + }, + { + "epoch": 3.1016548463356974, + "grad_norm": 3.2926251888275146, + "learning_rate": 2.3970976276613763e-06, + "loss": 0.4078, + "step": 6560 + }, + { + "epoch": 3.1021276595744682, + "grad_norm": 2.5306150913238525, + "learning_rate": 2.3964743325702454e-06, + "loss": 0.3657, + "step": 6561 + }, + { + "epoch": 3.1026004728132386, + "grad_norm": 2.727583408355713, + "learning_rate": 2.395851043925204e-06, + "loss": 0.3791, + "step": 6562 + }, + { + "epoch": 3.1030732860520094, + "grad_norm": 3.1403541564941406, + "learning_rate": 2.3952277617650602e-06, + "loss": 0.3934, + "step": 6563 + }, + { + "epoch": 3.10354609929078, + "grad_norm": 2.5816383361816406, + "learning_rate": 2.3946044861286256e-06, + "loss": 0.3703, + "step": 6564 + }, + { + "epoch": 3.104018912529551, + "grad_norm": 2.5742220878601074, + "learning_rate": 2.3939812170547067e-06, + "loss": 0.3628, + "step": 6565 + }, + { + "epoch": 3.1044917257683213, + "grad_norm": 2.7276530265808105, + "learning_rate": 2.393357954582113e-06, + "loss": 0.3789, + "step": 6566 + }, + { + "epoch": 3.104964539007092, + "grad_norm": 3.05595064163208, + "learning_rate": 2.3927346987496515e-06, + "loss": 0.3766, + "step": 6567 + }, + { + "epoch": 3.105437352245863, + "grad_norm": 2.786970615386963, + "learning_rate": 2.39211144959613e-06, + "loss": 0.3329, + "step": 6568 + }, + { + "epoch": 3.1059101654846337, + "grad_norm": 3.499018430709839, + "learning_rate": 2.391488207160356e-06, + "loss": 0.4175, + "step": 6569 + }, + { + "epoch": 3.106382978723404, + "grad_norm": 2.969735860824585, + "learning_rate": 2.3908649714811346e-06, + "loss": 0.3893, + "step": 6570 + }, + { + "epoch": 3.106855791962175, + "grad_norm": 3.1494929790496826, + "learning_rate": 2.3902417425972734e-06, + "loss": 0.4048, + "step": 6571 + }, + { + "epoch": 3.1073286052009457, + "grad_norm": 2.6393489837646484, + "learning_rate": 2.3896185205475782e-06, + "loss": 0.3216, + "step": 6572 + }, + { + "epoch": 3.1078014184397165, + "grad_norm": 3.6984152793884277, + "learning_rate": 2.3889953053708528e-06, + "loss": 0.3646, + "step": 6573 + }, + { + "epoch": 3.108274231678487, + "grad_norm": 3.518547534942627, + "learning_rate": 2.388372097105903e-06, + "loss": 0.3627, + "step": 6574 + }, + { + "epoch": 3.1087470449172576, + "grad_norm": 3.422043800354004, + "learning_rate": 2.3877488957915333e-06, + "loss": 0.4116, + "step": 6575 + }, + { + "epoch": 3.1092198581560284, + "grad_norm": 2.8088064193725586, + "learning_rate": 2.3871257014665486e-06, + "loss": 0.3477, + "step": 6576 + }, + { + "epoch": 3.109692671394799, + "grad_norm": 2.7877607345581055, + "learning_rate": 2.3865025141697513e-06, + "loss": 0.351, + "step": 6577 + }, + { + "epoch": 3.1101654846335696, + "grad_norm": 2.9446799755096436, + "learning_rate": 2.3858793339399433e-06, + 
"loss": 0.4025, + "step": 6578 + }, + { + "epoch": 3.1106382978723404, + "grad_norm": 2.886584758758545, + "learning_rate": 2.3852561608159304e-06, + "loss": 0.3765, + "step": 6579 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 3.45711088180542, + "learning_rate": 2.384632994836513e-06, + "loss": 0.3744, + "step": 6580 + }, + { + "epoch": 3.111583924349882, + "grad_norm": 2.737441301345825, + "learning_rate": 2.3840098360404916e-06, + "loss": 0.4048, + "step": 6581 + }, + { + "epoch": 3.1120567375886523, + "grad_norm": 2.742567300796509, + "learning_rate": 2.383386684466671e-06, + "loss": 0.3717, + "step": 6582 + }, + { + "epoch": 3.112529550827423, + "grad_norm": 3.017970561981201, + "learning_rate": 2.382763540153849e-06, + "loss": 0.3922, + "step": 6583 + }, + { + "epoch": 3.113002364066194, + "grad_norm": 3.132004499435425, + "learning_rate": 2.3821404031408283e-06, + "loss": 0.3969, + "step": 6584 + }, + { + "epoch": 3.1134751773049647, + "grad_norm": 2.910820245742798, + "learning_rate": 2.3815172734664075e-06, + "loss": 0.4241, + "step": 6585 + }, + { + "epoch": 3.113947990543735, + "grad_norm": 3.0029842853546143, + "learning_rate": 2.380894151169386e-06, + "loss": 0.4007, + "step": 6586 + }, + { + "epoch": 3.114420803782506, + "grad_norm": 3.0309178829193115, + "learning_rate": 2.380271036288564e-06, + "loss": 0.3876, + "step": 6587 + }, + { + "epoch": 3.1148936170212767, + "grad_norm": 2.963204860687256, + "learning_rate": 2.379647928862739e-06, + "loss": 0.4017, + "step": 6588 + }, + { + "epoch": 3.1153664302600474, + "grad_norm": 3.0127944946289062, + "learning_rate": 2.3790248289307103e-06, + "loss": 0.3651, + "step": 6589 + }, + { + "epoch": 3.115839243498818, + "grad_norm": 2.557485580444336, + "learning_rate": 2.3784017365312755e-06, + "loss": 0.3419, + "step": 6590 + }, + { + "epoch": 3.1163120567375886, + "grad_norm": 2.8577969074249268, + "learning_rate": 2.3777786517032306e-06, + "loss": 0.372, + "step": 6591 + }, + { + "epoch": 3.1167848699763594, + "grad_norm": 2.450324058532715, + "learning_rate": 2.3771555744853735e-06, + "loss": 0.3442, + "step": 6592 + }, + { + "epoch": 3.11725768321513, + "grad_norm": 2.7939295768737793, + "learning_rate": 2.3765325049164996e-06, + "loss": 0.401, + "step": 6593 + }, + { + "epoch": 3.1177304964539005, + "grad_norm": 2.9690325260162354, + "learning_rate": 2.3759094430354056e-06, + "loss": 0.3962, + "step": 6594 + }, + { + "epoch": 3.1182033096926713, + "grad_norm": 2.7630631923675537, + "learning_rate": 2.375286388880887e-06, + "loss": 0.4126, + "step": 6595 + }, + { + "epoch": 3.118676122931442, + "grad_norm": 2.6259944438934326, + "learning_rate": 2.3746633424917366e-06, + "loss": 0.3285, + "step": 6596 + }, + { + "epoch": 3.119148936170213, + "grad_norm": 2.7107701301574707, + "learning_rate": 2.3740403039067516e-06, + "loss": 0.3636, + "step": 6597 + }, + { + "epoch": 3.1196217494089833, + "grad_norm": 2.985301971435547, + "learning_rate": 2.373417273164724e-06, + "loss": 0.3928, + "step": 6598 + }, + { + "epoch": 3.120094562647754, + "grad_norm": 3.2578976154327393, + "learning_rate": 2.3727942503044483e-06, + "loss": 0.3379, + "step": 6599 + }, + { + "epoch": 3.120567375886525, + "grad_norm": 3.1681406497955322, + "learning_rate": 2.372171235364717e-06, + "loss": 0.4023, + "step": 6600 + }, + { + "epoch": 3.1210401891252957, + "grad_norm": 3.120147705078125, + "learning_rate": 2.371548228384321e-06, + "loss": 0.4228, + "step": 6601 + }, + { + "epoch": 3.121513002364066, + "grad_norm": 2.7786099910736084, + 
"learning_rate": 2.3709252294020547e-06, + "loss": 0.4386, + "step": 6602 + }, + { + "epoch": 3.121985815602837, + "grad_norm": 2.698849678039551, + "learning_rate": 2.3703022384567086e-06, + "loss": 0.3861, + "step": 6603 + }, + { + "epoch": 3.1224586288416076, + "grad_norm": 2.7917959690093994, + "learning_rate": 2.3696792555870724e-06, + "loss": 0.3535, + "step": 6604 + }, + { + "epoch": 3.1229314420803784, + "grad_norm": 2.8249263763427734, + "learning_rate": 2.3690562808319385e-06, + "loss": 0.3415, + "step": 6605 + }, + { + "epoch": 3.123404255319149, + "grad_norm": 2.567458391189575, + "learning_rate": 2.368433314230095e-06, + "loss": 0.3827, + "step": 6606 + }, + { + "epoch": 3.1238770685579196, + "grad_norm": 2.9670443534851074, + "learning_rate": 2.3678103558203328e-06, + "loss": 0.4238, + "step": 6607 + }, + { + "epoch": 3.1243498817966904, + "grad_norm": 2.6893439292907715, + "learning_rate": 2.36718740564144e-06, + "loss": 0.3461, + "step": 6608 + }, + { + "epoch": 3.124822695035461, + "grad_norm": 3.2669708728790283, + "learning_rate": 2.3665644637322044e-06, + "loss": 0.3992, + "step": 6609 + }, + { + "epoch": 3.1252955082742315, + "grad_norm": 2.889340400695801, + "learning_rate": 2.3659415301314152e-06, + "loss": 0.3829, + "step": 6610 + }, + { + "epoch": 3.1257683215130023, + "grad_norm": 2.625603199005127, + "learning_rate": 2.3653186048778584e-06, + "loss": 0.3559, + "step": 6611 + }, + { + "epoch": 3.126241134751773, + "grad_norm": 2.8128650188446045, + "learning_rate": 2.3646956880103224e-06, + "loss": 0.4035, + "step": 6612 + }, + { + "epoch": 3.126713947990544, + "grad_norm": 3.1887412071228027, + "learning_rate": 2.3640727795675925e-06, + "loss": 0.3938, + "step": 6613 + }, + { + "epoch": 3.1271867612293143, + "grad_norm": 2.886514186859131, + "learning_rate": 2.363449879588454e-06, + "loss": 0.3504, + "step": 6614 + }, + { + "epoch": 3.127659574468085, + "grad_norm": 3.2149860858917236, + "learning_rate": 2.3628269881116937e-06, + "loss": 0.4137, + "step": 6615 + }, + { + "epoch": 3.128132387706856, + "grad_norm": 3.3155312538146973, + "learning_rate": 2.362204105176094e-06, + "loss": 0.3811, + "step": 6616 + }, + { + "epoch": 3.1286052009456267, + "grad_norm": 2.6228792667388916, + "learning_rate": 2.3615812308204415e-06, + "loss": 0.3511, + "step": 6617 + }, + { + "epoch": 3.129078014184397, + "grad_norm": 2.7686524391174316, + "learning_rate": 2.3609583650835187e-06, + "loss": 0.3722, + "step": 6618 + }, + { + "epoch": 3.129550827423168, + "grad_norm": 3.396368980407715, + "learning_rate": 2.3603355080041083e-06, + "loss": 0.4678, + "step": 6619 + }, + { + "epoch": 3.1300236406619386, + "grad_norm": 2.7329437732696533, + "learning_rate": 2.359712659620994e-06, + "loss": 0.3775, + "step": 6620 + }, + { + "epoch": 3.1304964539007094, + "grad_norm": 2.7633914947509766, + "learning_rate": 2.3590898199729567e-06, + "loss": 0.3306, + "step": 6621 + }, + { + "epoch": 3.1309692671394798, + "grad_norm": 3.020887613296509, + "learning_rate": 2.3584669890987792e-06, + "loss": 0.4121, + "step": 6622 + }, + { + "epoch": 3.1314420803782506, + "grad_norm": 2.8912103176116943, + "learning_rate": 2.3578441670372414e-06, + "loss": 0.4297, + "step": 6623 + }, + { + "epoch": 3.1319148936170214, + "grad_norm": 3.0654027462005615, + "learning_rate": 2.3572213538271234e-06, + "loss": 0.3856, + "step": 6624 + }, + { + "epoch": 3.132387706855792, + "grad_norm": 3.1126575469970703, + "learning_rate": 2.356598549507206e-06, + "loss": 0.3886, + "step": 6625 + }, + { + "epoch": 
3.1328605200945625, + "grad_norm": 2.7066447734832764, + "learning_rate": 2.3559757541162687e-06, + "loss": 0.4212, + "step": 6626 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 2.876338243484497, + "learning_rate": 2.355352967693088e-06, + "loss": 0.3607, + "step": 6627 + }, + { + "epoch": 3.133806146572104, + "grad_norm": 2.9011716842651367, + "learning_rate": 2.3547301902764454e-06, + "loss": 0.428, + "step": 6628 + }, + { + "epoch": 3.134278959810875, + "grad_norm": 2.805656909942627, + "learning_rate": 2.3541074219051163e-06, + "loss": 0.4038, + "step": 6629 + }, + { + "epoch": 3.1347517730496453, + "grad_norm": 2.89546275138855, + "learning_rate": 2.353484662617879e-06, + "loss": 0.3798, + "step": 6630 + }, + { + "epoch": 3.135224586288416, + "grad_norm": 3.0290539264678955, + "learning_rate": 2.352861912453508e-06, + "loss": 0.3916, + "step": 6631 + }, + { + "epoch": 3.135697399527187, + "grad_norm": 2.848393440246582, + "learning_rate": 2.352239171450781e-06, + "loss": 0.3423, + "step": 6632 + }, + { + "epoch": 3.1361702127659576, + "grad_norm": 2.871372938156128, + "learning_rate": 2.3516164396484737e-06, + "loss": 0.3872, + "step": 6633 + }, + { + "epoch": 3.136643026004728, + "grad_norm": 3.120682716369629, + "learning_rate": 2.3509937170853585e-06, + "loss": 0.3952, + "step": 6634 + }, + { + "epoch": 3.137115839243499, + "grad_norm": 2.6936683654785156, + "learning_rate": 2.3503710038002127e-06, + "loss": 0.3643, + "step": 6635 + }, + { + "epoch": 3.1375886524822696, + "grad_norm": 3.749519109725952, + "learning_rate": 2.349748299831808e-06, + "loss": 0.4519, + "step": 6636 + }, + { + "epoch": 3.1380614657210404, + "grad_norm": 2.8034276962280273, + "learning_rate": 2.3491256052189175e-06, + "loss": 0.401, + "step": 6637 + }, + { + "epoch": 3.1385342789598107, + "grad_norm": 2.6201975345611572, + "learning_rate": 2.348502920000314e-06, + "loss": 0.3491, + "step": 6638 + }, + { + "epoch": 3.1390070921985815, + "grad_norm": 2.890552043914795, + "learning_rate": 2.347880244214769e-06, + "loss": 0.3439, + "step": 6639 + }, + { + "epoch": 3.1394799054373523, + "grad_norm": 2.899594306945801, + "learning_rate": 2.347257577901055e-06, + "loss": 0.3707, + "step": 6640 + }, + { + "epoch": 3.139952718676123, + "grad_norm": 2.8660130500793457, + "learning_rate": 2.346634921097942e-06, + "loss": 0.3582, + "step": 6641 + }, + { + "epoch": 3.1404255319148935, + "grad_norm": 2.9805452823638916, + "learning_rate": 2.346012273844199e-06, + "loss": 0.3466, + "step": 6642 + }, + { + "epoch": 3.1408983451536643, + "grad_norm": 3.162977457046509, + "learning_rate": 2.345389636178597e-06, + "loss": 0.3657, + "step": 6643 + }, + { + "epoch": 3.141371158392435, + "grad_norm": 2.838988780975342, + "learning_rate": 2.344767008139904e-06, + "loss": 0.3826, + "step": 6644 + }, + { + "epoch": 3.141843971631206, + "grad_norm": 3.8427252769470215, + "learning_rate": 2.3441443897668893e-06, + "loss": 0.3697, + "step": 6645 + }, + { + "epoch": 3.1423167848699762, + "grad_norm": 2.9233880043029785, + "learning_rate": 2.34352178109832e-06, + "loss": 0.3481, + "step": 6646 + }, + { + "epoch": 3.142789598108747, + "grad_norm": 2.5840606689453125, + "learning_rate": 2.342899182172963e-06, + "loss": 0.3746, + "step": 6647 + }, + { + "epoch": 3.143262411347518, + "grad_norm": 2.806793451309204, + "learning_rate": 2.3422765930295857e-06, + "loss": 0.419, + "step": 6648 + }, + { + "epoch": 3.1437352245862886, + "grad_norm": 2.803952693939209, + "learning_rate": 2.3416540137069522e-06, + "loss": 0.3965, + 
"step": 6649 + }, + { + "epoch": 3.144208037825059, + "grad_norm": 2.8416364192962646, + "learning_rate": 2.3410314442438297e-06, + "loss": 0.4317, + "step": 6650 + }, + { + "epoch": 3.1446808510638298, + "grad_norm": 2.9956440925598145, + "learning_rate": 2.3404088846789826e-06, + "loss": 0.4268, + "step": 6651 + }, + { + "epoch": 3.1451536643026006, + "grad_norm": 3.1649162769317627, + "learning_rate": 2.339786335051173e-06, + "loss": 0.4149, + "step": 6652 + }, + { + "epoch": 3.145626477541371, + "grad_norm": 2.909107208251953, + "learning_rate": 2.3391637953991673e-06, + "loss": 0.4085, + "step": 6653 + }, + { + "epoch": 3.1460992907801417, + "grad_norm": 2.416755199432373, + "learning_rate": 2.3385412657617264e-06, + "loss": 0.3585, + "step": 6654 + }, + { + "epoch": 3.1465721040189125, + "grad_norm": 3.1122629642486572, + "learning_rate": 2.3379187461776123e-06, + "loss": 0.3876, + "step": 6655 + }, + { + "epoch": 3.1470449172576833, + "grad_norm": 2.6854658126831055, + "learning_rate": 2.337296236685588e-06, + "loss": 0.3125, + "step": 6656 + }, + { + "epoch": 3.147517730496454, + "grad_norm": 2.779876708984375, + "learning_rate": 2.3366737373244127e-06, + "loss": 0.3688, + "step": 6657 + }, + { + "epoch": 3.1479905437352245, + "grad_norm": 3.1444761753082275, + "learning_rate": 2.3360512481328484e-06, + "loss": 0.4089, + "step": 6658 + }, + { + "epoch": 3.1484633569739953, + "grad_norm": 2.71445894241333, + "learning_rate": 2.335428769149654e-06, + "loss": 0.3532, + "step": 6659 + }, + { + "epoch": 3.148936170212766, + "grad_norm": 2.9788241386413574, + "learning_rate": 2.334806300413587e-06, + "loss": 0.4238, + "step": 6660 + }, + { + "epoch": 3.1494089834515364, + "grad_norm": 3.0118865966796875, + "learning_rate": 2.334183841963409e-06, + "loss": 0.4437, + "step": 6661 + }, + { + "epoch": 3.149881796690307, + "grad_norm": 3.2229537963867188, + "learning_rate": 2.3335613938378753e-06, + "loss": 0.3582, + "step": 6662 + }, + { + "epoch": 3.150354609929078, + "grad_norm": 2.734997034072876, + "learning_rate": 2.3329389560757447e-06, + "loss": 0.3737, + "step": 6663 + }, + { + "epoch": 3.150827423167849, + "grad_norm": 3.4746382236480713, + "learning_rate": 2.3323165287157724e-06, + "loss": 0.3516, + "step": 6664 + }, + { + "epoch": 3.1513002364066196, + "grad_norm": 2.9428153038024902, + "learning_rate": 2.3316941117967137e-06, + "loss": 0.3985, + "step": 6665 + }, + { + "epoch": 3.15177304964539, + "grad_norm": 2.6840944290161133, + "learning_rate": 2.3310717053573257e-06, + "loss": 0.3274, + "step": 6666 + }, + { + "epoch": 3.1522458628841608, + "grad_norm": 3.048335552215576, + "learning_rate": 2.3304493094363607e-06, + "loss": 0.4262, + "step": 6667 + }, + { + "epoch": 3.1527186761229316, + "grad_norm": 2.87381911277771, + "learning_rate": 2.329826924072575e-06, + "loss": 0.3867, + "step": 6668 + }, + { + "epoch": 3.153191489361702, + "grad_norm": 2.6236355304718018, + "learning_rate": 2.32920454930472e-06, + "loss": 0.3649, + "step": 6669 + }, + { + "epoch": 3.1536643026004727, + "grad_norm": 3.1326401233673096, + "learning_rate": 2.328582185171549e-06, + "loss": 0.3451, + "step": 6670 + }, + { + "epoch": 3.1541371158392435, + "grad_norm": 3.011826992034912, + "learning_rate": 2.327959831711814e-06, + "loss": 0.4118, + "step": 6671 + }, + { + "epoch": 3.1546099290780143, + "grad_norm": 2.834933280944824, + "learning_rate": 2.3273374889642646e-06, + "loss": 0.4378, + "step": 6672 + }, + { + "epoch": 3.155082742316785, + "grad_norm": 3.085756778717041, + "learning_rate": 
2.326715156967654e-06, + "loss": 0.4389, + "step": 6673 + }, + { + "epoch": 3.1555555555555554, + "grad_norm": 2.7912232875823975, + "learning_rate": 2.3260928357607305e-06, + "loss": 0.3352, + "step": 6674 + }, + { + "epoch": 3.1560283687943262, + "grad_norm": 2.7643113136291504, + "learning_rate": 2.3254705253822424e-06, + "loss": 0.3449, + "step": 6675 + }, + { + "epoch": 3.156501182033097, + "grad_norm": 2.8984663486480713, + "learning_rate": 2.3248482258709405e-06, + "loss": 0.4231, + "step": 6676 + }, + { + "epoch": 3.1569739952718674, + "grad_norm": 3.214996814727783, + "learning_rate": 2.324225937265572e-06, + "loss": 0.4616, + "step": 6677 + }, + { + "epoch": 3.157446808510638, + "grad_norm": 2.58534836769104, + "learning_rate": 2.3236036596048827e-06, + "loss": 0.3264, + "step": 6678 + }, + { + "epoch": 3.157919621749409, + "grad_norm": 2.790714740753174, + "learning_rate": 2.322981392927621e-06, + "loss": 0.4086, + "step": 6679 + }, + { + "epoch": 3.15839243498818, + "grad_norm": 2.726029872894287, + "learning_rate": 2.32235913727253e-06, + "loss": 0.3344, + "step": 6680 + }, + { + "epoch": 3.1588652482269506, + "grad_norm": 2.8392906188964844, + "learning_rate": 2.3217368926783583e-06, + "loss": 0.3468, + "step": 6681 + }, + { + "epoch": 3.159338061465721, + "grad_norm": 2.9796900749206543, + "learning_rate": 2.321114659183848e-06, + "loss": 0.4051, + "step": 6682 + }, + { + "epoch": 3.1598108747044917, + "grad_norm": 3.0399303436279297, + "learning_rate": 2.320492436827743e-06, + "loss": 0.402, + "step": 6683 + }, + { + "epoch": 3.1602836879432625, + "grad_norm": 2.9295334815979004, + "learning_rate": 2.3198702256487877e-06, + "loss": 0.3975, + "step": 6684 + }, + { + "epoch": 3.160756501182033, + "grad_norm": 2.881552219390869, + "learning_rate": 2.319248025685723e-06, + "loss": 0.4342, + "step": 6685 + }, + { + "epoch": 3.1612293144208037, + "grad_norm": 3.0711705684661865, + "learning_rate": 2.3186258369772916e-06, + "loss": 0.3829, + "step": 6686 + }, + { + "epoch": 3.1617021276595745, + "grad_norm": 2.6614468097686768, + "learning_rate": 2.3180036595622345e-06, + "loss": 0.3473, + "step": 6687 + }, + { + "epoch": 3.1621749408983453, + "grad_norm": 3.0084400177001953, + "learning_rate": 2.3173814934792903e-06, + "loss": 0.4363, + "step": 6688 + }, + { + "epoch": 3.162647754137116, + "grad_norm": 2.9340786933898926, + "learning_rate": 2.3167593387672006e-06, + "loss": 0.4235, + "step": 6689 + }, + { + "epoch": 3.1631205673758864, + "grad_norm": 3.0765340328216553, + "learning_rate": 2.3161371954647023e-06, + "loss": 0.4601, + "step": 6690 + }, + { + "epoch": 3.1635933806146572, + "grad_norm": 2.816096067428589, + "learning_rate": 2.3155150636105356e-06, + "loss": 0.3764, + "step": 6691 + }, + { + "epoch": 3.164066193853428, + "grad_norm": 3.0476551055908203, + "learning_rate": 2.3148929432434372e-06, + "loss": 0.3956, + "step": 6692 + }, + { + "epoch": 3.1645390070921984, + "grad_norm": 2.628934860229492, + "learning_rate": 2.314270834402143e-06, + "loss": 0.3551, + "step": 6693 + }, + { + "epoch": 3.165011820330969, + "grad_norm": 3.3933539390563965, + "learning_rate": 2.31364873712539e-06, + "loss": 0.4523, + "step": 6694 + }, + { + "epoch": 3.16548463356974, + "grad_norm": 3.256176233291626, + "learning_rate": 2.313026651451912e-06, + "loss": 0.417, + "step": 6695 + }, + { + "epoch": 3.1659574468085108, + "grad_norm": 2.92926025390625, + "learning_rate": 2.312404577420445e-06, + "loss": 0.4365, + "step": 6696 + }, + { + "epoch": 3.166430260047281, + "grad_norm": 
2.9514732360839844, + "learning_rate": 2.3117825150697233e-06, + "loss": 0.4632, + "step": 6697 + }, + { + "epoch": 3.166903073286052, + "grad_norm": 2.8635852336883545, + "learning_rate": 2.3111604644384778e-06, + "loss": 0.4018, + "step": 6698 + }, + { + "epoch": 3.1673758865248227, + "grad_norm": 2.5937020778656006, + "learning_rate": 2.3105384255654433e-06, + "loss": 0.3682, + "step": 6699 + }, + { + "epoch": 3.1678486997635935, + "grad_norm": 2.857851266860962, + "learning_rate": 2.3099163984893497e-06, + "loss": 0.3293, + "step": 6700 + }, + { + "epoch": 3.168321513002364, + "grad_norm": 2.5903947353363037, + "learning_rate": 2.3092943832489283e-06, + "loss": 0.3543, + "step": 6701 + }, + { + "epoch": 3.1687943262411347, + "grad_norm": 2.9783661365509033, + "learning_rate": 2.30867237988291e-06, + "loss": 0.3707, + "step": 6702 + }, + { + "epoch": 3.1692671394799055, + "grad_norm": 3.0133306980133057, + "learning_rate": 2.3080503884300225e-06, + "loss": 0.439, + "step": 6703 + }, + { + "epoch": 3.1697399527186763, + "grad_norm": 2.7119483947753906, + "learning_rate": 2.3074284089289968e-06, + "loss": 0.3956, + "step": 6704 + }, + { + "epoch": 3.1702127659574466, + "grad_norm": 3.0499672889709473, + "learning_rate": 2.3068064414185597e-06, + "loss": 0.434, + "step": 6705 + }, + { + "epoch": 3.1706855791962174, + "grad_norm": 2.862807512283325, + "learning_rate": 2.306184485937437e-06, + "loss": 0.3644, + "step": 6706 + }, + { + "epoch": 3.171158392434988, + "grad_norm": 2.9445149898529053, + "learning_rate": 2.305562542524358e-06, + "loss": 0.3894, + "step": 6707 + }, + { + "epoch": 3.171631205673759, + "grad_norm": 3.0442428588867188, + "learning_rate": 2.304940611218046e-06, + "loss": 0.3816, + "step": 6708 + }, + { + "epoch": 3.1721040189125294, + "grad_norm": 2.7101798057556152, + "learning_rate": 2.304318692057228e-06, + "loss": 0.3708, + "step": 6709 + }, + { + "epoch": 3.1725768321513, + "grad_norm": 2.7874515056610107, + "learning_rate": 2.303696785080626e-06, + "loss": 0.404, + "step": 6710 + }, + { + "epoch": 3.173049645390071, + "grad_norm": 3.0438833236694336, + "learning_rate": 2.303074890326964e-06, + "loss": 0.4342, + "step": 6711 + }, + { + "epoch": 3.1735224586288417, + "grad_norm": 2.6079208850860596, + "learning_rate": 2.302453007834966e-06, + "loss": 0.3725, + "step": 6712 + }, + { + "epoch": 3.173995271867612, + "grad_norm": 3.3353021144866943, + "learning_rate": 2.3018311376433523e-06, + "loss": 0.4372, + "step": 6713 + }, + { + "epoch": 3.174468085106383, + "grad_norm": 2.840771436691284, + "learning_rate": 2.3012092797908454e-06, + "loss": 0.3979, + "step": 6714 + }, + { + "epoch": 3.1749408983451537, + "grad_norm": 3.0474867820739746, + "learning_rate": 2.3005874343161648e-06, + "loss": 0.4077, + "step": 6715 + }, + { + "epoch": 3.1754137115839245, + "grad_norm": 2.849835157394409, + "learning_rate": 2.2999656012580296e-06, + "loss": 0.393, + "step": 6716 + }, + { + "epoch": 3.175886524822695, + "grad_norm": 2.6361217498779297, + "learning_rate": 2.29934378065516e-06, + "loss": 0.3894, + "step": 6717 + }, + { + "epoch": 3.1763593380614656, + "grad_norm": 3.139700174331665, + "learning_rate": 2.298721972546273e-06, + "loss": 0.36, + "step": 6718 + }, + { + "epoch": 3.1768321513002364, + "grad_norm": 2.987861156463623, + "learning_rate": 2.298100176970087e-06, + "loss": 0.4306, + "step": 6719 + }, + { + "epoch": 3.1773049645390072, + "grad_norm": 2.6403157711029053, + "learning_rate": 2.297478393965317e-06, + "loss": 0.3978, + "step": 6720 + }, + { + "epoch": 
3.1777777777777776, + "grad_norm": 2.819519281387329, + "learning_rate": 2.296856623570679e-06, + "loss": 0.3467, + "step": 6721 + }, + { + "epoch": 3.1782505910165484, + "grad_norm": 2.7195916175842285, + "learning_rate": 2.296234865824889e-06, + "loss": 0.3685, + "step": 6722 + }, + { + "epoch": 3.178723404255319, + "grad_norm": 3.015488624572754, + "learning_rate": 2.2956131207666604e-06, + "loss": 0.3751, + "step": 6723 + }, + { + "epoch": 3.17919621749409, + "grad_norm": 2.9283792972564697, + "learning_rate": 2.2949913884347055e-06, + "loss": 0.3261, + "step": 6724 + }, + { + "epoch": 3.1796690307328603, + "grad_norm": 3.358991861343384, + "learning_rate": 2.294369668867739e-06, + "loss": 0.4505, + "step": 6725 + }, + { + "epoch": 3.180141843971631, + "grad_norm": 2.9143471717834473, + "learning_rate": 2.2937479621044712e-06, + "loss": 0.3612, + "step": 6726 + }, + { + "epoch": 3.180614657210402, + "grad_norm": 3.020519495010376, + "learning_rate": 2.2931262681836136e-06, + "loss": 0.4241, + "step": 6727 + }, + { + "epoch": 3.1810874704491727, + "grad_norm": 2.693737745285034, + "learning_rate": 2.2925045871438765e-06, + "loss": 0.366, + "step": 6728 + }, + { + "epoch": 3.181560283687943, + "grad_norm": 2.9427194595336914, + "learning_rate": 2.2918829190239677e-06, + "loss": 0.3741, + "step": 6729 + }, + { + "epoch": 3.182033096926714, + "grad_norm": 2.529383659362793, + "learning_rate": 2.291261263862598e-06, + "loss": 0.4469, + "step": 6730 + }, + { + "epoch": 3.1825059101654847, + "grad_norm": 3.0097804069519043, + "learning_rate": 2.290639621698473e-06, + "loss": 0.4167, + "step": 6731 + }, + { + "epoch": 3.1829787234042555, + "grad_norm": 2.7047014236450195, + "learning_rate": 2.290017992570302e-06, + "loss": 0.3615, + "step": 6732 + }, + { + "epoch": 3.183451536643026, + "grad_norm": 2.676964282989502, + "learning_rate": 2.2893963765167897e-06, + "loss": 0.3722, + "step": 6733 + }, + { + "epoch": 3.1839243498817966, + "grad_norm": 3.0529778003692627, + "learning_rate": 2.2887747735766413e-06, + "loss": 0.395, + "step": 6734 + }, + { + "epoch": 3.1843971631205674, + "grad_norm": 2.826725721359253, + "learning_rate": 2.288153183788562e-06, + "loss": 0.3713, + "step": 6735 + }, + { + "epoch": 3.184869976359338, + "grad_norm": 2.8689587116241455, + "learning_rate": 2.287531607191254e-06, + "loss": 0.4383, + "step": 6736 + }, + { + "epoch": 3.1853427895981086, + "grad_norm": 3.1835694313049316, + "learning_rate": 2.2869100438234217e-06, + "loss": 0.3908, + "step": 6737 + }, + { + "epoch": 3.1858156028368794, + "grad_norm": 3.227262020111084, + "learning_rate": 2.286288493723767e-06, + "loss": 0.3549, + "step": 6738 + }, + { + "epoch": 3.18628841607565, + "grad_norm": 2.7543468475341797, + "learning_rate": 2.2856669569309896e-06, + "loss": 0.351, + "step": 6739 + }, + { + "epoch": 3.186761229314421, + "grad_norm": 2.5381555557250977, + "learning_rate": 2.2850454334837923e-06, + "loss": 0.3473, + "step": 6740 + }, + { + "epoch": 3.1872340425531913, + "grad_norm": 2.785923957824707, + "learning_rate": 2.284423923420872e-06, + "loss": 0.4144, + "step": 6741 + }, + { + "epoch": 3.187706855791962, + "grad_norm": 2.583853006362915, + "learning_rate": 2.28380242678093e-06, + "loss": 0.3088, + "step": 6742 + }, + { + "epoch": 3.188179669030733, + "grad_norm": 2.604647159576416, + "learning_rate": 2.2831809436026627e-06, + "loss": 0.3474, + "step": 6743 + }, + { + "epoch": 3.1886524822695037, + "grad_norm": 6.13611364364624, + "learning_rate": 2.2825594739247662e-06, + "loss": 0.4089, + 
"step": 6744 + }, + { + "epoch": 3.189125295508274, + "grad_norm": 3.034011125564575, + "learning_rate": 2.281938017785939e-06, + "loss": 0.4569, + "step": 6745 + }, + { + "epoch": 3.189598108747045, + "grad_norm": 2.9352638721466064, + "learning_rate": 2.281316575224874e-06, + "loss": 0.4293, + "step": 6746 + }, + { + "epoch": 3.1900709219858157, + "grad_norm": 3.860957384109497, + "learning_rate": 2.280695146280268e-06, + "loss": 0.4082, + "step": 6747 + }, + { + "epoch": 3.1905437352245865, + "grad_norm": 2.8131468296051025, + "learning_rate": 2.280073730990814e-06, + "loss": 0.3194, + "step": 6748 + }, + { + "epoch": 3.191016548463357, + "grad_norm": 3.1310737133026123, + "learning_rate": 2.2794523293952033e-06, + "loss": 0.4454, + "step": 6749 + }, + { + "epoch": 3.1914893617021276, + "grad_norm": 3.065091133117676, + "learning_rate": 2.27883094153213e-06, + "loss": 0.3789, + "step": 6750 + }, + { + "epoch": 3.1919621749408984, + "grad_norm": 3.315216541290283, + "learning_rate": 2.278209567440284e-06, + "loss": 0.4037, + "step": 6751 + }, + { + "epoch": 3.192434988179669, + "grad_norm": 3.0228476524353027, + "learning_rate": 2.2775882071583546e-06, + "loss": 0.3652, + "step": 6752 + }, + { + "epoch": 3.1929078014184396, + "grad_norm": 3.703540802001953, + "learning_rate": 2.2769668607250336e-06, + "loss": 0.3477, + "step": 6753 + }, + { + "epoch": 3.1933806146572103, + "grad_norm": 2.952481508255005, + "learning_rate": 2.2763455281790065e-06, + "loss": 0.4026, + "step": 6754 + }, + { + "epoch": 3.193853427895981, + "grad_norm": 2.5798189640045166, + "learning_rate": 2.275724209558965e-06, + "loss": 0.3475, + "step": 6755 + }, + { + "epoch": 3.194326241134752, + "grad_norm": 2.599669933319092, + "learning_rate": 2.2751029049035923e-06, + "loss": 0.3499, + "step": 6756 + }, + { + "epoch": 3.1947990543735223, + "grad_norm": 3.0463781356811523, + "learning_rate": 2.2744816142515756e-06, + "loss": 0.3927, + "step": 6757 + }, + { + "epoch": 3.195271867612293, + "grad_norm": 3.134199380874634, + "learning_rate": 2.2738603376416003e-06, + "loss": 0.3957, + "step": 6758 + }, + { + "epoch": 3.195744680851064, + "grad_norm": 3.1326372623443604, + "learning_rate": 2.273239075112349e-06, + "loss": 0.4305, + "step": 6759 + }, + { + "epoch": 3.1962174940898347, + "grad_norm": 2.847128391265869, + "learning_rate": 2.2726178267025072e-06, + "loss": 0.3825, + "step": 6760 + }, + { + "epoch": 3.196690307328605, + "grad_norm": 2.697584629058838, + "learning_rate": 2.2719965924507566e-06, + "loss": 0.3517, + "step": 6761 + }, + { + "epoch": 3.197163120567376, + "grad_norm": 2.881446599960327, + "learning_rate": 2.271375372395777e-06, + "loss": 0.3791, + "step": 6762 + }, + { + "epoch": 3.1976359338061466, + "grad_norm": 3.085054874420166, + "learning_rate": 2.270754166576252e-06, + "loss": 0.4324, + "step": 6763 + }, + { + "epoch": 3.1981087470449174, + "grad_norm": 3.3494462966918945, + "learning_rate": 2.270132975030859e-06, + "loss": 0.4242, + "step": 6764 + }, + { + "epoch": 3.198581560283688, + "grad_norm": 2.8617660999298096, + "learning_rate": 2.2695117977982785e-06, + "loss": 0.3563, + "step": 6765 + }, + { + "epoch": 3.1990543735224586, + "grad_norm": 2.7437968254089355, + "learning_rate": 2.2688906349171873e-06, + "loss": 0.4042, + "step": 6766 + }, + { + "epoch": 3.1995271867612294, + "grad_norm": 3.1129143238067627, + "learning_rate": 2.268269486426262e-06, + "loss": 0.3761, + "step": 6767 + }, + { + "epoch": 3.2, + "grad_norm": 3.32441782951355, + "learning_rate": 2.2676483523641807e-06, + 
"loss": 0.4439, + "step": 6768 + }, + { + "epoch": 3.2004728132387705, + "grad_norm": 2.8744730949401855, + "learning_rate": 2.267027232769617e-06, + "loss": 0.4015, + "step": 6769 + }, + { + "epoch": 3.2009456264775413, + "grad_norm": 3.6283397674560547, + "learning_rate": 2.2664061276812465e-06, + "loss": 0.3634, + "step": 6770 + }, + { + "epoch": 3.201418439716312, + "grad_norm": 2.7826597690582275, + "learning_rate": 2.2657850371377426e-06, + "loss": 0.3178, + "step": 6771 + }, + { + "epoch": 3.201891252955083, + "grad_norm": 2.668173313140869, + "learning_rate": 2.265163961177776e-06, + "loss": 0.3662, + "step": 6772 + }, + { + "epoch": 3.2023640661938533, + "grad_norm": 2.868441104888916, + "learning_rate": 2.264542899840021e-06, + "loss": 0.4235, + "step": 6773 + }, + { + "epoch": 3.202836879432624, + "grad_norm": 3.2715935707092285, + "learning_rate": 2.263921853163147e-06, + "loss": 0.4741, + "step": 6774 + }, + { + "epoch": 3.203309692671395, + "grad_norm": 2.8647544384002686, + "learning_rate": 2.2633008211858233e-06, + "loss": 0.3885, + "step": 6775 + }, + { + "epoch": 3.2037825059101657, + "grad_norm": 3.070164680480957, + "learning_rate": 2.2626798039467207e-06, + "loss": 0.4191, + "step": 6776 + }, + { + "epoch": 3.204255319148936, + "grad_norm": 2.846686840057373, + "learning_rate": 2.262058801484505e-06, + "loss": 0.3619, + "step": 6777 + }, + { + "epoch": 3.204728132387707, + "grad_norm": 2.767031192779541, + "learning_rate": 2.261437813837845e-06, + "loss": 0.3248, + "step": 6778 + }, + { + "epoch": 3.2052009456264776, + "grad_norm": 2.6819260120391846, + "learning_rate": 2.2608168410454065e-06, + "loss": 0.3871, + "step": 6779 + }, + { + "epoch": 3.2056737588652484, + "grad_norm": 3.1176788806915283, + "learning_rate": 2.260195883145854e-06, + "loss": 0.3929, + "step": 6780 + }, + { + "epoch": 3.2061465721040188, + "grad_norm": 3.143209457397461, + "learning_rate": 2.2595749401778524e-06, + "loss": 0.4188, + "step": 6781 + }, + { + "epoch": 3.2066193853427896, + "grad_norm": 2.9685657024383545, + "learning_rate": 2.2589540121800647e-06, + "loss": 0.4049, + "step": 6782 + }, + { + "epoch": 3.2070921985815604, + "grad_norm": 2.6853368282318115, + "learning_rate": 2.258333099191155e-06, + "loss": 0.349, + "step": 6783 + }, + { + "epoch": 3.207565011820331, + "grad_norm": 2.8418309688568115, + "learning_rate": 2.257712201249783e-06, + "loss": 0.4121, + "step": 6784 + }, + { + "epoch": 3.2080378250591015, + "grad_norm": 2.9441449642181396, + "learning_rate": 2.2570913183946085e-06, + "loss": 0.3846, + "step": 6785 + }, + { + "epoch": 3.2085106382978723, + "grad_norm": 2.9956493377685547, + "learning_rate": 2.256470450664294e-06, + "loss": 0.3941, + "step": 6786 + }, + { + "epoch": 3.208983451536643, + "grad_norm": 3.1774401664733887, + "learning_rate": 2.255849598097496e-06, + "loss": 0.4252, + "step": 6787 + }, + { + "epoch": 3.209456264775414, + "grad_norm": 2.8948934078216553, + "learning_rate": 2.255228760732873e-06, + "loss": 0.3963, + "step": 6788 + }, + { + "epoch": 3.2099290780141843, + "grad_norm": 3.440021276473999, + "learning_rate": 2.2546079386090825e-06, + "loss": 0.3777, + "step": 6789 + }, + { + "epoch": 3.210401891252955, + "grad_norm": 3.1573195457458496, + "learning_rate": 2.253987131764779e-06, + "loss": 0.3896, + "step": 6790 + }, + { + "epoch": 3.210874704491726, + "grad_norm": 3.4218719005584717, + "learning_rate": 2.2533663402386183e-06, + "loss": 0.3979, + "step": 6791 + }, + { + "epoch": 3.2113475177304966, + "grad_norm": 3.3442487716674805, + 
"learning_rate": 2.252745564069253e-06, + "loss": 0.406, + "step": 6792 + }, + { + "epoch": 3.211820330969267, + "grad_norm": 2.6089327335357666, + "learning_rate": 2.2521248032953387e-06, + "loss": 0.3539, + "step": 6793 + }, + { + "epoch": 3.212293144208038, + "grad_norm": 3.8015971183776855, + "learning_rate": 2.251504057955526e-06, + "loss": 0.4184, + "step": 6794 + }, + { + "epoch": 3.2127659574468086, + "grad_norm": 3.797565460205078, + "learning_rate": 2.250883328088465e-06, + "loss": 0.3392, + "step": 6795 + }, + { + "epoch": 3.2132387706855794, + "grad_norm": 3.290762186050415, + "learning_rate": 2.2502626137328077e-06, + "loss": 0.3726, + "step": 6796 + }, + { + "epoch": 3.2137115839243497, + "grad_norm": 3.149158000946045, + "learning_rate": 2.2496419149272023e-06, + "loss": 0.3869, + "step": 6797 + }, + { + "epoch": 3.2141843971631205, + "grad_norm": 2.652902364730835, + "learning_rate": 2.2490212317102964e-06, + "loss": 0.3256, + "step": 6798 + }, + { + "epoch": 3.2146572104018913, + "grad_norm": 3.3039770126342773, + "learning_rate": 2.248400564120739e-06, + "loss": 0.4231, + "step": 6799 + }, + { + "epoch": 3.215130023640662, + "grad_norm": 3.0190038681030273, + "learning_rate": 2.247779912197174e-06, + "loss": 0.4319, + "step": 6800 + }, + { + "epoch": 3.2156028368794325, + "grad_norm": 2.861393690109253, + "learning_rate": 2.2471592759782485e-06, + "loss": 0.465, + "step": 6801 + }, + { + "epoch": 3.2160756501182033, + "grad_norm": 2.7796146869659424, + "learning_rate": 2.246538655502606e-06, + "loss": 0.3896, + "step": 6802 + }, + { + "epoch": 3.216548463356974, + "grad_norm": 3.1849005222320557, + "learning_rate": 2.24591805080889e-06, + "loss": 0.3782, + "step": 6803 + }, + { + "epoch": 3.217021276595745, + "grad_norm": 3.076164960861206, + "learning_rate": 2.2452974619357435e-06, + "loss": 0.4023, + "step": 6804 + }, + { + "epoch": 3.2174940898345152, + "grad_norm": 2.7006006240844727, + "learning_rate": 2.2446768889218064e-06, + "loss": 0.3902, + "step": 6805 + }, + { + "epoch": 3.217966903073286, + "grad_norm": 2.9310474395751953, + "learning_rate": 2.2440563318057205e-06, + "loss": 0.366, + "step": 6806 + }, + { + "epoch": 3.218439716312057, + "grad_norm": 3.057248592376709, + "learning_rate": 2.2434357906261246e-06, + "loss": 0.4042, + "step": 6807 + }, + { + "epoch": 3.2189125295508276, + "grad_norm": 3.3720197677612305, + "learning_rate": 2.242815265421656e-06, + "loss": 0.3816, + "step": 6808 + }, + { + "epoch": 3.219385342789598, + "grad_norm": 2.9626352787017822, + "learning_rate": 2.2421947562309545e-06, + "loss": 0.363, + "step": 6809 + }, + { + "epoch": 3.219858156028369, + "grad_norm": 2.7848782539367676, + "learning_rate": 2.2415742630926533e-06, + "loss": 0.3597, + "step": 6810 + }, + { + "epoch": 3.2203309692671396, + "grad_norm": 2.757319450378418, + "learning_rate": 2.2409537860453913e-06, + "loss": 0.3304, + "step": 6811 + }, + { + "epoch": 3.2208037825059104, + "grad_norm": 2.7765560150146484, + "learning_rate": 2.240333325127801e-06, + "loss": 0.3896, + "step": 6812 + }, + { + "epoch": 3.2212765957446807, + "grad_norm": 2.9882447719573975, + "learning_rate": 2.239712880378515e-06, + "loss": 0.4004, + "step": 6813 + }, + { + "epoch": 3.2217494089834515, + "grad_norm": 2.8551244735717773, + "learning_rate": 2.2390924518361673e-06, + "loss": 0.4167, + "step": 6814 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 2.8051679134368896, + "learning_rate": 2.2384720395393878e-06, + "loss": 0.3319, + "step": 6815 + }, + { + "epoch": 3.222695035460993, 
+ "grad_norm": 3.1172873973846436, + "learning_rate": 2.2378516435268086e-06, + "loss": 0.379, + "step": 6816 + }, + { + "epoch": 3.2231678486997635, + "grad_norm": 3.0282177925109863, + "learning_rate": 2.237231263837058e-06, + "loss": 0.3855, + "step": 6817 + }, + { + "epoch": 3.2236406619385343, + "grad_norm": 2.7156803607940674, + "learning_rate": 2.236610900508763e-06, + "loss": 0.4062, + "step": 6818 + }, + { + "epoch": 3.224113475177305, + "grad_norm": 2.721327781677246, + "learning_rate": 2.235990553580554e-06, + "loss": 0.3726, + "step": 6819 + }, + { + "epoch": 3.2245862884160754, + "grad_norm": 2.881181240081787, + "learning_rate": 2.235370223091055e-06, + "loss": 0.421, + "step": 6820 + }, + { + "epoch": 3.225059101654846, + "grad_norm": 2.8074657917022705, + "learning_rate": 2.234749909078892e-06, + "loss": 0.3628, + "step": 6821 + }, + { + "epoch": 3.225531914893617, + "grad_norm": 2.8781638145446777, + "learning_rate": 2.234129611582689e-06, + "loss": 0.3857, + "step": 6822 + }, + { + "epoch": 3.226004728132388, + "grad_norm": 2.9473299980163574, + "learning_rate": 2.233509330641068e-06, + "loss": 0.4358, + "step": 6823 + }, + { + "epoch": 3.2264775413711586, + "grad_norm": 3.261209011077881, + "learning_rate": 2.2328890662926543e-06, + "loss": 0.4115, + "step": 6824 + }, + { + "epoch": 3.226950354609929, + "grad_norm": 3.2796943187713623, + "learning_rate": 2.232268818576067e-06, + "loss": 0.3846, + "step": 6825 + }, + { + "epoch": 3.2274231678486998, + "grad_norm": 3.1083059310913086, + "learning_rate": 2.2316485875299247e-06, + "loss": 0.3452, + "step": 6826 + }, + { + "epoch": 3.2278959810874706, + "grad_norm": 2.7947003841400146, + "learning_rate": 2.23102837319285e-06, + "loss": 0.3733, + "step": 6827 + }, + { + "epoch": 3.228368794326241, + "grad_norm": 2.792348861694336, + "learning_rate": 2.230408175603458e-06, + "loss": 0.411, + "step": 6828 + }, + { + "epoch": 3.2288416075650117, + "grad_norm": 2.8563876152038574, + "learning_rate": 2.229787994800368e-06, + "loss": 0.4303, + "step": 6829 + }, + { + "epoch": 3.2293144208037825, + "grad_norm": 2.9573659896850586, + "learning_rate": 2.2291678308221943e-06, + "loss": 0.4124, + "step": 6830 + }, + { + "epoch": 3.2297872340425533, + "grad_norm": 2.8554422855377197, + "learning_rate": 2.228547683707551e-06, + "loss": 0.3715, + "step": 6831 + }, + { + "epoch": 3.230260047281324, + "grad_norm": 2.9457242488861084, + "learning_rate": 2.227927553495054e-06, + "loss": 0.4339, + "step": 6832 + }, + { + "epoch": 3.2307328605200945, + "grad_norm": 2.799135684967041, + "learning_rate": 2.227307440223315e-06, + "loss": 0.3335, + "step": 6833 + }, + { + "epoch": 3.2312056737588652, + "grad_norm": 2.768529176712036, + "learning_rate": 2.2266873439309465e-06, + "loss": 0.3929, + "step": 6834 + }, + { + "epoch": 3.231678486997636, + "grad_norm": 3.124069929122925, + "learning_rate": 2.2260672646565585e-06, + "loss": 0.4205, + "step": 6835 + }, + { + "epoch": 3.2321513002364064, + "grad_norm": 2.8153982162475586, + "learning_rate": 2.2254472024387603e-06, + "loss": 0.3565, + "step": 6836 + }, + { + "epoch": 3.232624113475177, + "grad_norm": 3.1802141666412354, + "learning_rate": 2.224827157316162e-06, + "loss": 0.4614, + "step": 6837 + }, + { + "epoch": 3.233096926713948, + "grad_norm": 2.669651746749878, + "learning_rate": 2.2242071293273682e-06, + "loss": 0.3581, + "step": 6838 + }, + { + "epoch": 3.233569739952719, + "grad_norm": 3.073127269744873, + "learning_rate": 2.223587118510989e-06, + "loss": 0.3581, + "step": 6839 + }, + { + 
"epoch": 3.2340425531914896, + "grad_norm": 2.875955820083618, + "learning_rate": 2.222967124905627e-06, + "loss": 0.3905, + "step": 6840 + }, + { + "epoch": 3.23451536643026, + "grad_norm": 2.887744903564453, + "learning_rate": 2.2223471485498872e-06, + "loss": 0.4131, + "step": 6841 + }, + { + "epoch": 3.2349881796690307, + "grad_norm": 2.6957902908325195, + "learning_rate": 2.2217271894823735e-06, + "loss": 0.3631, + "step": 6842 + }, + { + "epoch": 3.2354609929078015, + "grad_norm": 2.7098400592803955, + "learning_rate": 2.221107247741688e-06, + "loss": 0.3959, + "step": 6843 + }, + { + "epoch": 3.235933806146572, + "grad_norm": 2.986271858215332, + "learning_rate": 2.22048732336643e-06, + "loss": 0.3515, + "step": 6844 + }, + { + "epoch": 3.2364066193853427, + "grad_norm": 3.0537121295928955, + "learning_rate": 2.2198674163952015e-06, + "loss": 0.438, + "step": 6845 + }, + { + "epoch": 3.2368794326241135, + "grad_norm": 2.8351151943206787, + "learning_rate": 2.2192475268666e-06, + "loss": 0.4069, + "step": 6846 + }, + { + "epoch": 3.2373522458628843, + "grad_norm": 2.6455280780792236, + "learning_rate": 2.218627654819225e-06, + "loss": 0.3626, + "step": 6847 + }, + { + "epoch": 3.237825059101655, + "grad_norm": 3.060352325439453, + "learning_rate": 2.2180078002916717e-06, + "loss": 0.3306, + "step": 6848 + }, + { + "epoch": 3.2382978723404254, + "grad_norm": 3.0178887844085693, + "learning_rate": 2.2173879633225355e-06, + "loss": 0.4111, + "step": 6849 + }, + { + "epoch": 3.2387706855791962, + "grad_norm": 2.895822763442993, + "learning_rate": 2.2167681439504123e-06, + "loss": 0.4053, + "step": 6850 + }, + { + "epoch": 3.239243498817967, + "grad_norm": 2.7295608520507812, + "learning_rate": 2.2161483422138945e-06, + "loss": 0.4021, + "step": 6851 + }, + { + "epoch": 3.2397163120567374, + "grad_norm": 3.1004912853240967, + "learning_rate": 2.2155285581515747e-06, + "loss": 0.3882, + "step": 6852 + }, + { + "epoch": 3.240189125295508, + "grad_norm": 2.927987813949585, + "learning_rate": 2.214908791802045e-06, + "loss": 0.4036, + "step": 6853 + }, + { + "epoch": 3.240661938534279, + "grad_norm": 3.1679599285125732, + "learning_rate": 2.2142890432038943e-06, + "loss": 0.3897, + "step": 6854 + }, + { + "epoch": 3.2411347517730498, + "grad_norm": 3.2094008922576904, + "learning_rate": 2.213669312395712e-06, + "loss": 0.4429, + "step": 6855 + }, + { + "epoch": 3.24160756501182, + "grad_norm": 4.637594223022461, + "learning_rate": 2.2130495994160857e-06, + "loss": 0.3708, + "step": 6856 + }, + { + "epoch": 3.242080378250591, + "grad_norm": 3.0063490867614746, + "learning_rate": 2.212429904303603e-06, + "loss": 0.3949, + "step": 6857 + }, + { + "epoch": 3.2425531914893617, + "grad_norm": 3.285444736480713, + "learning_rate": 2.21181022709685e-06, + "loss": 0.4236, + "step": 6858 + }, + { + "epoch": 3.2430260047281325, + "grad_norm": 3.02506422996521, + "learning_rate": 2.2111905678344086e-06, + "loss": 0.368, + "step": 6859 + }, + { + "epoch": 3.243498817966903, + "grad_norm": 2.9845006465911865, + "learning_rate": 2.2105709265548657e-06, + "loss": 0.4154, + "step": 6860 + }, + { + "epoch": 3.2439716312056737, + "grad_norm": 3.2537527084350586, + "learning_rate": 2.2099513032968013e-06, + "loss": 0.4385, + "step": 6861 + }, + { + "epoch": 3.2444444444444445, + "grad_norm": 2.8521063327789307, + "learning_rate": 2.2093316980987985e-06, + "loss": 0.384, + "step": 6862 + }, + { + "epoch": 3.2449172576832153, + "grad_norm": 3.186844825744629, + "learning_rate": 2.208712110999436e-06, + "loss": 
0.4131, + "step": 6863 + }, + { + "epoch": 3.2453900709219856, + "grad_norm": 2.932058095932007, + "learning_rate": 2.208092542037292e-06, + "loss": 0.3341, + "step": 6864 + }, + { + "epoch": 3.2458628841607564, + "grad_norm": 3.0818707942962646, + "learning_rate": 2.2074729912509462e-06, + "loss": 0.4149, + "step": 6865 + }, + { + "epoch": 3.246335697399527, + "grad_norm": 2.9788503646850586, + "learning_rate": 2.2068534586789735e-06, + "loss": 0.3572, + "step": 6866 + }, + { + "epoch": 3.246808510638298, + "grad_norm": 2.84075665473938, + "learning_rate": 2.206233944359952e-06, + "loss": 0.3561, + "step": 6867 + }, + { + "epoch": 3.2472813238770684, + "grad_norm": 2.966459035873413, + "learning_rate": 2.2056144483324545e-06, + "loss": 0.3909, + "step": 6868 + }, + { + "epoch": 3.247754137115839, + "grad_norm": 2.892038106918335, + "learning_rate": 2.204994970635054e-06, + "loss": 0.3557, + "step": 6869 + }, + { + "epoch": 3.24822695035461, + "grad_norm": 2.7458810806274414, + "learning_rate": 2.2043755113063233e-06, + "loss": 0.3551, + "step": 6870 + }, + { + "epoch": 3.2486997635933808, + "grad_norm": 2.766803741455078, + "learning_rate": 2.2037560703848334e-06, + "loss": 0.3343, + "step": 6871 + }, + { + "epoch": 3.249172576832151, + "grad_norm": 2.9780561923980713, + "learning_rate": 2.2031366479091533e-06, + "loss": 0.4004, + "step": 6872 + }, + { + "epoch": 3.249645390070922, + "grad_norm": 2.8848516941070557, + "learning_rate": 2.202517243917853e-06, + "loss": 0.3467, + "step": 6873 + }, + { + "epoch": 3.2501182033096927, + "grad_norm": 2.9962213039398193, + "learning_rate": 2.201897858449499e-06, + "loss": 0.3796, + "step": 6874 + }, + { + "epoch": 3.2505910165484635, + "grad_norm": 2.838131904602051, + "learning_rate": 2.201278491542659e-06, + "loss": 0.3683, + "step": 6875 + }, + { + "epoch": 3.251063829787234, + "grad_norm": 3.0232505798339844, + "learning_rate": 2.200659143235897e-06, + "loss": 0.3793, + "step": 6876 + }, + { + "epoch": 3.2515366430260046, + "grad_norm": 3.0690126419067383, + "learning_rate": 2.2000398135677776e-06, + "loss": 0.417, + "step": 6877 + }, + { + "epoch": 3.2520094562647754, + "grad_norm": 3.1838719844818115, + "learning_rate": 2.1994205025768643e-06, + "loss": 0.4608, + "step": 6878 + }, + { + "epoch": 3.2524822695035462, + "grad_norm": 3.1187257766723633, + "learning_rate": 2.198801210301717e-06, + "loss": 0.3396, + "step": 6879 + }, + { + "epoch": 3.2529550827423166, + "grad_norm": 2.7608656883239746, + "learning_rate": 2.1981819367808984e-06, + "loss": 0.386, + "step": 6880 + }, + { + "epoch": 3.2534278959810874, + "grad_norm": 3.027456283569336, + "learning_rate": 2.197562682052968e-06, + "loss": 0.3941, + "step": 6881 + }, + { + "epoch": 3.253900709219858, + "grad_norm": 2.925515651702881, + "learning_rate": 2.1969434461564816e-06, + "loss": 0.3608, + "step": 6882 + }, + { + "epoch": 3.254373522458629, + "grad_norm": 2.946770668029785, + "learning_rate": 2.196324229129999e-06, + "loss": 0.4116, + "step": 6883 + }, + { + "epoch": 3.2548463356973993, + "grad_norm": 2.6497952938079834, + "learning_rate": 2.1957050310120746e-06, + "loss": 0.338, + "step": 6884 + }, + { + "epoch": 3.25531914893617, + "grad_norm": 2.6915128231048584, + "learning_rate": 2.195085851841264e-06, + "loss": 0.3372, + "step": 6885 + }, + { + "epoch": 3.255791962174941, + "grad_norm": 3.4022350311279297, + "learning_rate": 2.1944666916561205e-06, + "loss": 0.3844, + "step": 6886 + }, + { + "epoch": 3.2562647754137117, + "grad_norm": 2.7463366985321045, + "learning_rate": 
2.1938475504951958e-06, + "loss": 0.3268, + "step": 6887 + }, + { + "epoch": 3.256737588652482, + "grad_norm": 2.828810691833496, + "learning_rate": 2.193228428397042e-06, + "loss": 0.3275, + "step": 6888 + }, + { + "epoch": 3.257210401891253, + "grad_norm": 3.4016268253326416, + "learning_rate": 2.192609325400208e-06, + "loss": 0.3916, + "step": 6889 + }, + { + "epoch": 3.2576832151300237, + "grad_norm": 2.4980733394622803, + "learning_rate": 2.191990241543245e-06, + "loss": 0.3636, + "step": 6890 + }, + { + "epoch": 3.2581560283687945, + "grad_norm": 3.0384702682495117, + "learning_rate": 2.191371176864698e-06, + "loss": 0.398, + "step": 6891 + }, + { + "epoch": 3.258628841607565, + "grad_norm": 2.8949527740478516, + "learning_rate": 2.190752131403115e-06, + "loss": 0.3919, + "step": 6892 + }, + { + "epoch": 3.2591016548463356, + "grad_norm": 2.765617609024048, + "learning_rate": 2.190133105197041e-06, + "loss": 0.3799, + "step": 6893 + }, + { + "epoch": 3.2595744680851064, + "grad_norm": 2.6149277687072754, + "learning_rate": 2.18951409828502e-06, + "loss": 0.3895, + "step": 6894 + }, + { + "epoch": 3.260047281323877, + "grad_norm": 2.9738945960998535, + "learning_rate": 2.1888951107055934e-06, + "loss": 0.3879, + "step": 6895 + }, + { + "epoch": 3.2605200945626476, + "grad_norm": 2.9438633918762207, + "learning_rate": 2.1882761424973053e-06, + "loss": 0.438, + "step": 6896 + }, + { + "epoch": 3.2609929078014184, + "grad_norm": 3.114243984222412, + "learning_rate": 2.1876571936986936e-06, + "loss": 0.4737, + "step": 6897 + }, + { + "epoch": 3.261465721040189, + "grad_norm": 3.017526388168335, + "learning_rate": 2.1870382643483e-06, + "loss": 0.4039, + "step": 6898 + }, + { + "epoch": 3.26193853427896, + "grad_norm": 3.1475703716278076, + "learning_rate": 2.1864193544846613e-06, + "loss": 0.3825, + "step": 6899 + }, + { + "epoch": 3.2624113475177303, + "grad_norm": 2.75502872467041, + "learning_rate": 2.1858004641463142e-06, + "loss": 0.3507, + "step": 6900 + }, + { + "epoch": 3.262884160756501, + "grad_norm": 3.0467209815979004, + "learning_rate": 2.1851815933717944e-06, + "loss": 0.3938, + "step": 6901 + }, + { + "epoch": 3.263356973995272, + "grad_norm": 2.993014097213745, + "learning_rate": 2.184562742199636e-06, + "loss": 0.3711, + "step": 6902 + }, + { + "epoch": 3.2638297872340427, + "grad_norm": 2.607309341430664, + "learning_rate": 2.183943910668373e-06, + "loss": 0.3689, + "step": 6903 + }, + { + "epoch": 3.264302600472813, + "grad_norm": 2.961653470993042, + "learning_rate": 2.1833250988165373e-06, + "loss": 0.3806, + "step": 6904 + }, + { + "epoch": 3.264775413711584, + "grad_norm": 2.8202552795410156, + "learning_rate": 2.1827063066826574e-06, + "loss": 0.391, + "step": 6905 + }, + { + "epoch": 3.2652482269503547, + "grad_norm": 3.032648801803589, + "learning_rate": 2.1820875343052666e-06, + "loss": 0.4011, + "step": 6906 + }, + { + "epoch": 3.2657210401891255, + "grad_norm": 2.8265180587768555, + "learning_rate": 2.1814687817228896e-06, + "loss": 0.3923, + "step": 6907 + }, + { + "epoch": 3.266193853427896, + "grad_norm": 3.1425564289093018, + "learning_rate": 2.1808500489740555e-06, + "loss": 0.4913, + "step": 6908 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 2.977809429168701, + "learning_rate": 2.18023133609729e-06, + "loss": 0.379, + "step": 6909 + }, + { + "epoch": 3.2671394799054374, + "grad_norm": 3.509551525115967, + "learning_rate": 2.1796126431311153e-06, + "loss": 0.4025, + "step": 6910 + }, + { + "epoch": 3.267612293144208, + "grad_norm": 
2.9133846759796143, + "learning_rate": 2.178993970114058e-06, + "loss": 0.4209, + "step": 6911 + }, + { + "epoch": 3.2680851063829786, + "grad_norm": 2.945513963699341, + "learning_rate": 2.178375317084637e-06, + "loss": 0.3882, + "step": 6912 + }, + { + "epoch": 3.2685579196217494, + "grad_norm": 2.7868733406066895, + "learning_rate": 2.1777566840813763e-06, + "loss": 0.3456, + "step": 6913 + }, + { + "epoch": 3.26903073286052, + "grad_norm": 2.803220748901367, + "learning_rate": 2.1771380711427937e-06, + "loss": 0.3394, + "step": 6914 + }, + { + "epoch": 3.269503546099291, + "grad_norm": 3.1293554306030273, + "learning_rate": 2.176519478307407e-06, + "loss": 0.402, + "step": 6915 + }, + { + "epoch": 3.2699763593380613, + "grad_norm": 2.843971014022827, + "learning_rate": 2.1759009056137347e-06, + "loss": 0.3449, + "step": 6916 + }, + { + "epoch": 3.270449172576832, + "grad_norm": 2.9983274936676025, + "learning_rate": 2.1752823531002917e-06, + "loss": 0.4091, + "step": 6917 + }, + { + "epoch": 3.270921985815603, + "grad_norm": 2.686722993850708, + "learning_rate": 2.174663820805592e-06, + "loss": 0.4303, + "step": 6918 + }, + { + "epoch": 3.2713947990543737, + "grad_norm": 2.669349431991577, + "learning_rate": 2.1740453087681508e-06, + "loss": 0.3796, + "step": 6919 + }, + { + "epoch": 3.271867612293144, + "grad_norm": 2.992138624191284, + "learning_rate": 2.173426817026477e-06, + "loss": 0.4125, + "step": 6920 + }, + { + "epoch": 3.272340425531915, + "grad_norm": 3.332834243774414, + "learning_rate": 2.1728083456190852e-06, + "loss": 0.3885, + "step": 6921 + }, + { + "epoch": 3.2728132387706856, + "grad_norm": 2.869673013687134, + "learning_rate": 2.1721898945844825e-06, + "loss": 0.3941, + "step": 6922 + }, + { + "epoch": 3.2732860520094564, + "grad_norm": 2.804440975189209, + "learning_rate": 2.1715714639611774e-06, + "loss": 0.4007, + "step": 6923 + }, + { + "epoch": 3.273758865248227, + "grad_norm": 3.1751439571380615, + "learning_rate": 2.1709530537876774e-06, + "loss": 0.3981, + "step": 6924 + }, + { + "epoch": 3.2742316784869976, + "grad_norm": 2.6367175579071045, + "learning_rate": 2.1703346641024878e-06, + "loss": 0.3582, + "step": 6925 + }, + { + "epoch": 3.2747044917257684, + "grad_norm": 2.99164080619812, + "learning_rate": 2.1697162949441137e-06, + "loss": 0.3846, + "step": 6926 + }, + { + "epoch": 3.275177304964539, + "grad_norm": 3.3206982612609863, + "learning_rate": 2.169097946351057e-06, + "loss": 0.3689, + "step": 6927 + }, + { + "epoch": 3.2756501182033095, + "grad_norm": 2.927907943725586, + "learning_rate": 2.16847961836182e-06, + "loss": 0.3536, + "step": 6928 + }, + { + "epoch": 3.2761229314420803, + "grad_norm": 3.1950864791870117, + "learning_rate": 2.167861311014904e-06, + "loss": 0.4154, + "step": 6929 + }, + { + "epoch": 3.276595744680851, + "grad_norm": 2.888383388519287, + "learning_rate": 2.1672430243488073e-06, + "loss": 0.3702, + "step": 6930 + }, + { + "epoch": 3.277068557919622, + "grad_norm": 2.842287063598633, + "learning_rate": 2.166624758402029e-06, + "loss": 0.3623, + "step": 6931 + }, + { + "epoch": 3.2775413711583923, + "grad_norm": 2.84350323677063, + "learning_rate": 2.166006513213065e-06, + "loss": 0.3757, + "step": 6932 + }, + { + "epoch": 3.278014184397163, + "grad_norm": 3.105626344680786, + "learning_rate": 2.165388288820411e-06, + "loss": 0.3955, + "step": 6933 + }, + { + "epoch": 3.278486997635934, + "grad_norm": 3.273508071899414, + "learning_rate": 2.164770085262561e-06, + "loss": 0.4046, + "step": 6934 + }, + { + "epoch": 
3.2789598108747047, + "grad_norm": 3.2530124187469482, + "learning_rate": 2.1641519025780066e-06, + "loss": 0.3141, + "step": 6935 + }, + { + "epoch": 3.279432624113475, + "grad_norm": 2.822849750518799, + "learning_rate": 2.163533740805242e-06, + "loss": 0.3973, + "step": 6936 + }, + { + "epoch": 3.279905437352246, + "grad_norm": 2.772097587585449, + "learning_rate": 2.162915599982756e-06, + "loss": 0.3606, + "step": 6937 + }, + { + "epoch": 3.2803782505910166, + "grad_norm": 3.150696039199829, + "learning_rate": 2.1622974801490365e-06, + "loss": 0.4709, + "step": 6938 + }, + { + "epoch": 3.2808510638297874, + "grad_norm": 3.2072134017944336, + "learning_rate": 2.1616793813425736e-06, + "loss": 0.3946, + "step": 6939 + }, + { + "epoch": 3.2813238770685578, + "grad_norm": 2.9922473430633545, + "learning_rate": 2.1610613036018515e-06, + "loss": 0.3263, + "step": 6940 + }, + { + "epoch": 3.2817966903073286, + "grad_norm": 2.7818009853363037, + "learning_rate": 2.1604432469653555e-06, + "loss": 0.3887, + "step": 6941 + }, + { + "epoch": 3.2822695035460994, + "grad_norm": 3.12998628616333, + "learning_rate": 2.15982521147157e-06, + "loss": 0.3522, + "step": 6942 + }, + { + "epoch": 3.28274231678487, + "grad_norm": 2.876678228378296, + "learning_rate": 2.159207197158976e-06, + "loss": 0.3643, + "step": 6943 + }, + { + "epoch": 3.2832151300236405, + "grad_norm": 2.825488805770874, + "learning_rate": 2.1585892040660565e-06, + "loss": 0.3223, + "step": 6944 + }, + { + "epoch": 3.2836879432624113, + "grad_norm": 2.8724498748779297, + "learning_rate": 2.1579712322312906e-06, + "loss": 0.3855, + "step": 6945 + }, + { + "epoch": 3.284160756501182, + "grad_norm": 2.841064691543579, + "learning_rate": 2.1573532816931547e-06, + "loss": 0.4106, + "step": 6946 + }, + { + "epoch": 3.284633569739953, + "grad_norm": 3.053391218185425, + "learning_rate": 2.1567353524901288e-06, + "loss": 0.4875, + "step": 6947 + }, + { + "epoch": 3.2851063829787233, + "grad_norm": 2.7294771671295166, + "learning_rate": 2.156117444660687e-06, + "loss": 0.3856, + "step": 6948 + }, + { + "epoch": 3.285579196217494, + "grad_norm": 3.0965659618377686, + "learning_rate": 2.155499558243304e-06, + "loss": 0.4104, + "step": 6949 + }, + { + "epoch": 3.286052009456265, + "grad_norm": 2.778923511505127, + "learning_rate": 2.1548816932764536e-06, + "loss": 0.3636, + "step": 6950 + }, + { + "epoch": 3.2865248226950357, + "grad_norm": 2.890679121017456, + "learning_rate": 2.1542638497986054e-06, + "loss": 0.4026, + "step": 6951 + }, + { + "epoch": 3.286997635933806, + "grad_norm": 3.0466806888580322, + "learning_rate": 2.1536460278482326e-06, + "loss": 0.3856, + "step": 6952 + }, + { + "epoch": 3.287470449172577, + "grad_norm": 3.1367077827453613, + "learning_rate": 2.1530282274638013e-06, + "loss": 0.3767, + "step": 6953 + }, + { + "epoch": 3.2879432624113476, + "grad_norm": 2.984694719314575, + "learning_rate": 2.1524104486837823e-06, + "loss": 0.4142, + "step": 6954 + }, + { + "epoch": 3.2884160756501184, + "grad_norm": 3.1542797088623047, + "learning_rate": 2.151792691546641e-06, + "loss": 0.4361, + "step": 6955 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 2.7306816577911377, + "learning_rate": 2.1511749560908405e-06, + "loss": 0.3692, + "step": 6956 + }, + { + "epoch": 3.2893617021276595, + "grad_norm": 3.6679904460906982, + "learning_rate": 2.150557242354847e-06, + "loss": 0.4496, + "step": 6957 + }, + { + "epoch": 3.2898345153664303, + "grad_norm": 3.2040863037109375, + "learning_rate": 2.1499395503771207e-06, + "loss": 
0.3526, + "step": 6958 + }, + { + "epoch": 3.290307328605201, + "grad_norm": 3.2416043281555176, + "learning_rate": 2.1493218801961246e-06, + "loss": 0.3955, + "step": 6959 + }, + { + "epoch": 3.2907801418439715, + "grad_norm": 2.8164525032043457, + "learning_rate": 2.1487042318503174e-06, + "loss": 0.3727, + "step": 6960 + }, + { + "epoch": 3.2912529550827423, + "grad_norm": 2.5954513549804688, + "learning_rate": 2.148086605378156e-06, + "loss": 0.3315, + "step": 6961 + }, + { + "epoch": 3.291725768321513, + "grad_norm": 2.8068149089813232, + "learning_rate": 2.1474690008181e-06, + "loss": 0.3702, + "step": 6962 + }, + { + "epoch": 3.2921985815602834, + "grad_norm": 2.9063730239868164, + "learning_rate": 2.1468514182086025e-06, + "loss": 0.3357, + "step": 6963 + }, + { + "epoch": 3.2926713947990542, + "grad_norm": 2.7623207569122314, + "learning_rate": 2.1462338575881197e-06, + "loss": 0.381, + "step": 6964 + }, + { + "epoch": 3.293144208037825, + "grad_norm": 2.6818830966949463, + "learning_rate": 2.145616318995103e-06, + "loss": 0.3733, + "step": 6965 + }, + { + "epoch": 3.293617021276596, + "grad_norm": 2.7966864109039307, + "learning_rate": 2.1449988024680034e-06, + "loss": 0.3993, + "step": 6966 + }, + { + "epoch": 3.2940898345153666, + "grad_norm": 3.0644514560699463, + "learning_rate": 2.1443813080452728e-06, + "loss": 0.3541, + "step": 6967 + }, + { + "epoch": 3.294562647754137, + "grad_norm": 3.03204607963562, + "learning_rate": 2.1437638357653586e-06, + "loss": 0.3864, + "step": 6968 + }, + { + "epoch": 3.295035460992908, + "grad_norm": 2.980565071105957, + "learning_rate": 2.143146385666707e-06, + "loss": 0.36, + "step": 6969 + }, + { + "epoch": 3.2955082742316786, + "grad_norm": 3.1261661052703857, + "learning_rate": 2.1425289577877675e-06, + "loss": 0.4053, + "step": 6970 + }, + { + "epoch": 3.295981087470449, + "grad_norm": 3.0194897651672363, + "learning_rate": 2.1419115521669804e-06, + "loss": 0.4553, + "step": 6971 + }, + { + "epoch": 3.2964539007092197, + "grad_norm": 2.7620482444763184, + "learning_rate": 2.141294168842792e-06, + "loss": 0.3846, + "step": 6972 + }, + { + "epoch": 3.2969267139479905, + "grad_norm": 2.9575016498565674, + "learning_rate": 2.1406768078536427e-06, + "loss": 0.4415, + "step": 6973 + }, + { + "epoch": 3.2973995271867613, + "grad_norm": 3.17909574508667, + "learning_rate": 2.1400594692379717e-06, + "loss": 0.4514, + "step": 6974 + }, + { + "epoch": 3.297872340425532, + "grad_norm": 3.128613233566284, + "learning_rate": 2.1394421530342207e-06, + "loss": 0.3757, + "step": 6975 + }, + { + "epoch": 3.2983451536643025, + "grad_norm": 3.0247111320495605, + "learning_rate": 2.1388248592808243e-06, + "loss": 0.3881, + "step": 6976 + }, + { + "epoch": 3.2988179669030733, + "grad_norm": 2.8091228008270264, + "learning_rate": 2.1382075880162217e-06, + "loss": 0.3782, + "step": 6977 + }, + { + "epoch": 3.299290780141844, + "grad_norm": 2.985105514526367, + "learning_rate": 2.137590339278846e-06, + "loss": 0.3783, + "step": 6978 + }, + { + "epoch": 3.2997635933806144, + "grad_norm": 3.1862502098083496, + "learning_rate": 2.1369731131071304e-06, + "loss": 0.4776, + "step": 6979 + }, + { + "epoch": 3.300236406619385, + "grad_norm": 3.3138091564178467, + "learning_rate": 2.1363559095395075e-06, + "loss": 0.4056, + "step": 6980 + }, + { + "epoch": 3.300709219858156, + "grad_norm": 3.023695707321167, + "learning_rate": 2.135738728614407e-06, + "loss": 0.3716, + "step": 6981 + }, + { + "epoch": 3.301182033096927, + "grad_norm": 6.149252414703369, + 
"learning_rate": 2.135121570370259e-06, + "loss": 0.3713, + "step": 6982 + }, + { + "epoch": 3.3016548463356976, + "grad_norm": 2.689671754837036, + "learning_rate": 2.134504434845491e-06, + "loss": 0.3541, + "step": 6983 + }, + { + "epoch": 3.302127659574468, + "grad_norm": 3.241212844848633, + "learning_rate": 2.1338873220785284e-06, + "loss": 0.4328, + "step": 6984 + }, + { + "epoch": 3.3026004728132388, + "grad_norm": 3.6037068367004395, + "learning_rate": 2.133270232107798e-06, + "loss": 0.4091, + "step": 6985 + }, + { + "epoch": 3.3030732860520096, + "grad_norm": 3.300031900405884, + "learning_rate": 2.1326531649717216e-06, + "loss": 0.3742, + "step": 6986 + }, + { + "epoch": 3.30354609929078, + "grad_norm": 2.82257342338562, + "learning_rate": 2.1320361207087225e-06, + "loss": 0.3622, + "step": 6987 + }, + { + "epoch": 3.3040189125295507, + "grad_norm": 3.297513246536255, + "learning_rate": 2.1314190993572196e-06, + "loss": 0.4606, + "step": 6988 + }, + { + "epoch": 3.3044917257683215, + "grad_norm": 2.676440954208374, + "learning_rate": 2.130802100955634e-06, + "loss": 0.382, + "step": 6989 + }, + { + "epoch": 3.3049645390070923, + "grad_norm": 2.9548017978668213, + "learning_rate": 2.130185125542383e-06, + "loss": 0.3751, + "step": 6990 + }, + { + "epoch": 3.305437352245863, + "grad_norm": 2.800647020339966, + "learning_rate": 2.129568173155882e-06, + "loss": 0.3868, + "step": 6991 + }, + { + "epoch": 3.3059101654846335, + "grad_norm": 3.3789260387420654, + "learning_rate": 2.128951243834546e-06, + "loss": 0.4373, + "step": 6992 + }, + { + "epoch": 3.3063829787234043, + "grad_norm": 2.944807767868042, + "learning_rate": 2.12833433761679e-06, + "loss": 0.4205, + "step": 6993 + }, + { + "epoch": 3.306855791962175, + "grad_norm": 2.577975273132324, + "learning_rate": 2.127717454541025e-06, + "loss": 0.4197, + "step": 6994 + }, + { + "epoch": 3.3073286052009454, + "grad_norm": 3.0542666912078857, + "learning_rate": 2.127100594645661e-06, + "loss": 0.3811, + "step": 6995 + }, + { + "epoch": 3.307801418439716, + "grad_norm": 3.163015842437744, + "learning_rate": 2.1264837579691088e-06, + "loss": 0.415, + "step": 6996 + }, + { + "epoch": 3.308274231678487, + "grad_norm": 2.9161269664764404, + "learning_rate": 2.1258669445497746e-06, + "loss": 0.3714, + "step": 6997 + }, + { + "epoch": 3.308747044917258, + "grad_norm": 2.934483289718628, + "learning_rate": 2.1252501544260657e-06, + "loss": 0.4085, + "step": 6998 + }, + { + "epoch": 3.3092198581560286, + "grad_norm": 3.155613660812378, + "learning_rate": 2.1246333876363852e-06, + "loss": 0.4698, + "step": 6999 + }, + { + "epoch": 3.309692671394799, + "grad_norm": 2.648171901702881, + "learning_rate": 2.124016644219139e-06, + "loss": 0.3091, + "step": 7000 + }, + { + "epoch": 3.3101654846335697, + "grad_norm": 2.908219814300537, + "learning_rate": 2.123399924212728e-06, + "loss": 0.4063, + "step": 7001 + }, + { + "epoch": 3.3106382978723405, + "grad_norm": 3.138749361038208, + "learning_rate": 2.122783227655551e-06, + "loss": 0.4296, + "step": 7002 + }, + { + "epoch": 3.311111111111111, + "grad_norm": 3.044466018676758, + "learning_rate": 2.1221665545860094e-06, + "loss": 0.4424, + "step": 7003 + }, + { + "epoch": 3.3115839243498817, + "grad_norm": 2.6758792400360107, + "learning_rate": 2.121549905042499e-06, + "loss": 0.4073, + "step": 7004 + }, + { + "epoch": 3.3120567375886525, + "grad_norm": 2.8901989459991455, + "learning_rate": 2.1209332790634174e-06, + "loss": 0.3842, + "step": 7005 + }, + { + "epoch": 3.3125295508274233, + 
"grad_norm": 2.8179712295532227, + "learning_rate": 2.1203166766871582e-06, + "loss": 0.366, + "step": 7006 + }, + { + "epoch": 3.313002364066194, + "grad_norm": 2.6536550521850586, + "learning_rate": 2.1197000979521138e-06, + "loss": 0.3851, + "step": 7007 + }, + { + "epoch": 3.3134751773049644, + "grad_norm": 3.1277682781219482, + "learning_rate": 2.1190835428966775e-06, + "loss": 0.4249, + "step": 7008 + }, + { + "epoch": 3.3139479905437352, + "grad_norm": 2.924666166305542, + "learning_rate": 2.1184670115592383e-06, + "loss": 0.3873, + "step": 7009 + }, + { + "epoch": 3.314420803782506, + "grad_norm": 2.7921009063720703, + "learning_rate": 2.1178505039781856e-06, + "loss": 0.3754, + "step": 7010 + }, + { + "epoch": 3.3148936170212764, + "grad_norm": 2.5349879264831543, + "learning_rate": 2.1172340201919067e-06, + "loss": 0.3701, + "step": 7011 + }, + { + "epoch": 3.315366430260047, + "grad_norm": 2.849376678466797, + "learning_rate": 2.1166175602387866e-06, + "loss": 0.3963, + "step": 7012 + }, + { + "epoch": 3.315839243498818, + "grad_norm": 3.141280174255371, + "learning_rate": 2.11600112415721e-06, + "loss": 0.4158, + "step": 7013 + }, + { + "epoch": 3.3163120567375888, + "grad_norm": 2.922807455062866, + "learning_rate": 2.11538471198556e-06, + "loss": 0.3667, + "step": 7014 + }, + { + "epoch": 3.3167848699763596, + "grad_norm": 2.770400047302246, + "learning_rate": 2.114768323762216e-06, + "loss": 0.3674, + "step": 7015 + }, + { + "epoch": 3.31725768321513, + "grad_norm": 2.7706570625305176, + "learning_rate": 2.114151959525561e-06, + "loss": 0.3761, + "step": 7016 + }, + { + "epoch": 3.3177304964539007, + "grad_norm": 3.041755437850952, + "learning_rate": 2.1135356193139704e-06, + "loss": 0.4483, + "step": 7017 + }, + { + "epoch": 3.3182033096926715, + "grad_norm": 3.5757904052734375, + "learning_rate": 2.1129193031658227e-06, + "loss": 0.4094, + "step": 7018 + }, + { + "epoch": 3.318676122931442, + "grad_norm": 2.9292917251586914, + "learning_rate": 2.1123030111194936e-06, + "loss": 0.3514, + "step": 7019 + }, + { + "epoch": 3.3191489361702127, + "grad_norm": 3.1443874835968018, + "learning_rate": 2.111686743213355e-06, + "loss": 0.4098, + "step": 7020 + }, + { + "epoch": 3.3196217494089835, + "grad_norm": 2.9738030433654785, + "learning_rate": 2.1110704994857804e-06, + "loss": 0.3584, + "step": 7021 + }, + { + "epoch": 3.3200945626477543, + "grad_norm": 2.8961563110351562, + "learning_rate": 2.1104542799751397e-06, + "loss": 0.3736, + "step": 7022 + }, + { + "epoch": 3.320567375886525, + "grad_norm": 3.9264683723449707, + "learning_rate": 2.1098380847198037e-06, + "loss": 0.457, + "step": 7023 + }, + { + "epoch": 3.3210401891252954, + "grad_norm": 2.8742756843566895, + "learning_rate": 2.109221913758139e-06, + "loss": 0.4252, + "step": 7024 + }, + { + "epoch": 3.321513002364066, + "grad_norm": 3.7229559421539307, + "learning_rate": 2.108605767128512e-06, + "loss": 0.4451, + "step": 7025 + }, + { + "epoch": 3.321985815602837, + "grad_norm": 2.6417593955993652, + "learning_rate": 2.1079896448692884e-06, + "loss": 0.3658, + "step": 7026 + }, + { + "epoch": 3.3224586288416074, + "grad_norm": 2.8780412673950195, + "learning_rate": 2.10737354701883e-06, + "loss": 0.4225, + "step": 7027 + }, + { + "epoch": 3.322931442080378, + "grad_norm": 2.557816505432129, + "learning_rate": 2.1067574736155e-06, + "loss": 0.3812, + "step": 7028 + }, + { + "epoch": 3.323404255319149, + "grad_norm": 2.859062910079956, + "learning_rate": 2.106141424697658e-06, + "loss": 0.3629, + "step": 7029 + }, + { 
+ "epoch": 3.3238770685579198, + "grad_norm": 2.4776878356933594, + "learning_rate": 2.1055254003036607e-06, + "loss": 0.3591, + "step": 7030 + }, + { + "epoch": 3.3243498817966906, + "grad_norm": 3.085066795349121, + "learning_rate": 2.1049094004718687e-06, + "loss": 0.4237, + "step": 7031 + }, + { + "epoch": 3.324822695035461, + "grad_norm": 2.862592935562134, + "learning_rate": 2.1042934252406345e-06, + "loss": 0.3185, + "step": 7032 + }, + { + "epoch": 3.3252955082742317, + "grad_norm": 2.965743064880371, + "learning_rate": 2.1036774746483145e-06, + "loss": 0.4058, + "step": 7033 + }, + { + "epoch": 3.3257683215130025, + "grad_norm": 2.7420589923858643, + "learning_rate": 2.103061548733261e-06, + "loss": 0.3566, + "step": 7034 + }, + { + "epoch": 3.326241134751773, + "grad_norm": 2.7824347019195557, + "learning_rate": 2.1024456475338235e-06, + "loss": 0.3553, + "step": 7035 + }, + { + "epoch": 3.3267139479905437, + "grad_norm": 3.0410704612731934, + "learning_rate": 2.1018297710883528e-06, + "loss": 0.3772, + "step": 7036 + }, + { + "epoch": 3.3271867612293144, + "grad_norm": 3.0811562538146973, + "learning_rate": 2.101213919435196e-06, + "loss": 0.3738, + "step": 7037 + }, + { + "epoch": 3.3276595744680852, + "grad_norm": 2.939445734024048, + "learning_rate": 2.100598092612699e-06, + "loss": 0.4107, + "step": 7038 + }, + { + "epoch": 3.3281323877068556, + "grad_norm": 3.05804705619812, + "learning_rate": 2.0999822906592086e-06, + "loss": 0.3972, + "step": 7039 + }, + { + "epoch": 3.3286052009456264, + "grad_norm": 2.803558111190796, + "learning_rate": 2.0993665136130657e-06, + "loss": 0.3487, + "step": 7040 + }, + { + "epoch": 3.329078014184397, + "grad_norm": 2.937675714492798, + "learning_rate": 2.0987507615126147e-06, + "loss": 0.4095, + "step": 7041 + }, + { + "epoch": 3.329550827423168, + "grad_norm": 2.853905439376831, + "learning_rate": 2.098135034396194e-06, + "loss": 0.3775, + "step": 7042 + }, + { + "epoch": 3.3300236406619383, + "grad_norm": 3.3520495891571045, + "learning_rate": 2.097519332302142e-06, + "loss": 0.4065, + "step": 7043 + }, + { + "epoch": 3.330496453900709, + "grad_norm": 2.8787078857421875, + "learning_rate": 2.096903655268797e-06, + "loss": 0.3452, + "step": 7044 + }, + { + "epoch": 3.33096926713948, + "grad_norm": 2.993896007537842, + "learning_rate": 2.096288003334493e-06, + "loss": 0.3814, + "step": 7045 + }, + { + "epoch": 3.3314420803782507, + "grad_norm": 3.5248336791992188, + "learning_rate": 2.0956723765375655e-06, + "loss": 0.3852, + "step": 7046 + }, + { + "epoch": 3.331914893617021, + "grad_norm": 3.2227890491485596, + "learning_rate": 2.0950567749163463e-06, + "loss": 0.3913, + "step": 7047 + }, + { + "epoch": 3.332387706855792, + "grad_norm": 3.390401601791382, + "learning_rate": 2.094441198509165e-06, + "loss": 0.3944, + "step": 7048 + }, + { + "epoch": 3.3328605200945627, + "grad_norm": 3.2057554721832275, + "learning_rate": 2.0938256473543534e-06, + "loss": 0.404, + "step": 7049 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 2.866708755493164, + "learning_rate": 2.0932101214902367e-06, + "loss": 0.4345, + "step": 7050 + }, + { + "epoch": 3.333806146572104, + "grad_norm": 3.4304039478302, + "learning_rate": 2.0925946209551428e-06, + "loss": 0.4209, + "step": 7051 + }, + { + "epoch": 3.3342789598108746, + "grad_norm": 3.996561288833618, + "learning_rate": 2.091979145787395e-06, + "loss": 0.4394, + "step": 7052 + }, + { + "epoch": 3.3347517730496454, + "grad_norm": 3.1932613849639893, + "learning_rate": 2.0913636960253166e-06, + 
"loss": 0.3837, + "step": 7053 + }, + { + "epoch": 3.3352245862884162, + "grad_norm": 2.908832311630249, + "learning_rate": 2.0907482717072293e-06, + "loss": 0.3526, + "step": 7054 + }, + { + "epoch": 3.3356973995271866, + "grad_norm": 2.7319607734680176, + "learning_rate": 2.090132872871452e-06, + "loss": 0.3686, + "step": 7055 + }, + { + "epoch": 3.3361702127659574, + "grad_norm": 2.9213504791259766, + "learning_rate": 2.0895174995563043e-06, + "loss": 0.4034, + "step": 7056 + }, + { + "epoch": 3.336643026004728, + "grad_norm": 2.8093936443328857, + "learning_rate": 2.0889021518001017e-06, + "loss": 0.4151, + "step": 7057 + }, + { + "epoch": 3.337115839243499, + "grad_norm": 3.1840829849243164, + "learning_rate": 2.0882868296411594e-06, + "loss": 0.3501, + "step": 7058 + }, + { + "epoch": 3.3375886524822693, + "grad_norm": 2.793567657470703, + "learning_rate": 2.087671533117791e-06, + "loss": 0.3911, + "step": 7059 + }, + { + "epoch": 3.33806146572104, + "grad_norm": 3.0820090770721436, + "learning_rate": 2.0870562622683077e-06, + "loss": 0.432, + "step": 7060 + }, + { + "epoch": 3.338534278959811, + "grad_norm": 2.774630546569824, + "learning_rate": 2.0864410171310213e-06, + "loss": 0.3434, + "step": 7061 + }, + { + "epoch": 3.3390070921985817, + "grad_norm": 2.70447039604187, + "learning_rate": 2.085825797744239e-06, + "loss": 0.3787, + "step": 7062 + }, + { + "epoch": 3.339479905437352, + "grad_norm": 3.1014437675476074, + "learning_rate": 2.0852106041462672e-06, + "loss": 0.4568, + "step": 7063 + }, + { + "epoch": 3.339952718676123, + "grad_norm": 3.312680244445801, + "learning_rate": 2.0845954363754133e-06, + "loss": 0.4285, + "step": 7064 + }, + { + "epoch": 3.3404255319148937, + "grad_norm": 2.7070534229278564, + "learning_rate": 2.0839802944699806e-06, + "loss": 0.4096, + "step": 7065 + }, + { + "epoch": 3.3408983451536645, + "grad_norm": 2.8172531127929688, + "learning_rate": 2.083365178468269e-06, + "loss": 0.3652, + "step": 7066 + }, + { + "epoch": 3.341371158392435, + "grad_norm": 2.896378517150879, + "learning_rate": 2.082750088408582e-06, + "loss": 0.3778, + "step": 7067 + }, + { + "epoch": 3.3418439716312056, + "grad_norm": 2.769805669784546, + "learning_rate": 2.0821350243292175e-06, + "loss": 0.3593, + "step": 7068 + }, + { + "epoch": 3.3423167848699764, + "grad_norm": 2.672520875930786, + "learning_rate": 2.0815199862684728e-06, + "loss": 0.3873, + "step": 7069 + }, + { + "epoch": 3.342789598108747, + "grad_norm": 2.841327428817749, + "learning_rate": 2.0809049742646435e-06, + "loss": 0.41, + "step": 7070 + }, + { + "epoch": 3.3432624113475176, + "grad_norm": 3.0540482997894287, + "learning_rate": 2.080289988356023e-06, + "loss": 0.32, + "step": 7071 + }, + { + "epoch": 3.3437352245862884, + "grad_norm": 3.471684217453003, + "learning_rate": 2.079675028580905e-06, + "loss": 0.3779, + "step": 7072 + }, + { + "epoch": 3.344208037825059, + "grad_norm": 2.8545875549316406, + "learning_rate": 2.07906009497758e-06, + "loss": 0.4645, + "step": 7073 + }, + { + "epoch": 3.34468085106383, + "grad_norm": 2.7771127223968506, + "learning_rate": 2.078445187584337e-06, + "loss": 0.3889, + "step": 7074 + }, + { + "epoch": 3.3451536643026003, + "grad_norm": 2.769188165664673, + "learning_rate": 2.0778303064394647e-06, + "loss": 0.3745, + "step": 7075 + }, + { + "epoch": 3.345626477541371, + "grad_norm": 2.739577531814575, + "learning_rate": 2.0772154515812467e-06, + "loss": 0.4402, + "step": 7076 + }, + { + "epoch": 3.346099290780142, + "grad_norm": 2.6124343872070312, + 
"learning_rate": 2.0766006230479696e-06, + "loss": 0.3595, + "step": 7077 + }, + { + "epoch": 3.3465721040189127, + "grad_norm": 2.7100563049316406, + "learning_rate": 2.0759858208779136e-06, + "loss": 0.3641, + "step": 7078 + }, + { + "epoch": 3.347044917257683, + "grad_norm": 2.8594000339508057, + "learning_rate": 2.075371045109363e-06, + "loss": 0.402, + "step": 7079 + }, + { + "epoch": 3.347517730496454, + "grad_norm": 3.2045278549194336, + "learning_rate": 2.0747562957805955e-06, + "loss": 0.4719, + "step": 7080 + }, + { + "epoch": 3.3479905437352246, + "grad_norm": 2.825594663619995, + "learning_rate": 2.0741415729298874e-06, + "loss": 0.4127, + "step": 7081 + }, + { + "epoch": 3.3484633569739954, + "grad_norm": 2.992403984069824, + "learning_rate": 2.0735268765955173e-06, + "loss": 0.3943, + "step": 7082 + }, + { + "epoch": 3.348936170212766, + "grad_norm": 3.0629165172576904, + "learning_rate": 2.072912206815758e-06, + "loss": 0.4132, + "step": 7083 + }, + { + "epoch": 3.3494089834515366, + "grad_norm": 2.7553658485412598, + "learning_rate": 2.0722975636288836e-06, + "loss": 0.3667, + "step": 7084 + }, + { + "epoch": 3.3498817966903074, + "grad_norm": 3.1556780338287354, + "learning_rate": 2.0716829470731647e-06, + "loss": 0.4383, + "step": 7085 + }, + { + "epoch": 3.350354609929078, + "grad_norm": 2.6693310737609863, + "learning_rate": 2.071068357186869e-06, + "loss": 0.363, + "step": 7086 + }, + { + "epoch": 3.3508274231678485, + "grad_norm": 2.738314628601074, + "learning_rate": 2.0704537940082673e-06, + "loss": 0.3493, + "step": 7087 + }, + { + "epoch": 3.3513002364066193, + "grad_norm": 3.2205989360809326, + "learning_rate": 2.069839257575624e-06, + "loss": 0.3802, + "step": 7088 + }, + { + "epoch": 3.35177304964539, + "grad_norm": 2.8969876766204834, + "learning_rate": 2.069224747927203e-06, + "loss": 0.3846, + "step": 7089 + }, + { + "epoch": 3.352245862884161, + "grad_norm": 2.833179473876953, + "learning_rate": 2.0686102651012694e-06, + "loss": 0.3892, + "step": 7090 + }, + { + "epoch": 3.3527186761229313, + "grad_norm": 3.303830623626709, + "learning_rate": 2.067995809136082e-06, + "loss": 0.4009, + "step": 7091 + }, + { + "epoch": 3.353191489361702, + "grad_norm": 3.3684141635894775, + "learning_rate": 2.0673813800699024e-06, + "loss": 0.4434, + "step": 7092 + }, + { + "epoch": 3.353664302600473, + "grad_norm": 2.6549112796783447, + "learning_rate": 2.066766977940987e-06, + "loss": 0.3941, + "step": 7093 + }, + { + "epoch": 3.3541371158392437, + "grad_norm": 2.852935314178467, + "learning_rate": 2.066152602787591e-06, + "loss": 0.4143, + "step": 7094 + }, + { + "epoch": 3.354609929078014, + "grad_norm": 2.9621706008911133, + "learning_rate": 2.0655382546479713e-06, + "loss": 0.4502, + "step": 7095 + }, + { + "epoch": 3.355082742316785, + "grad_norm": 3.2836413383483887, + "learning_rate": 2.064923933560378e-06, + "loss": 0.3993, + "step": 7096 + }, + { + "epoch": 3.3555555555555556, + "grad_norm": 2.8187968730926514, + "learning_rate": 2.0643096395630654e-06, + "loss": 0.3766, + "step": 7097 + }, + { + "epoch": 3.3560283687943264, + "grad_norm": 2.7965118885040283, + "learning_rate": 2.0636953726942803e-06, + "loss": 0.4258, + "step": 7098 + }, + { + "epoch": 3.3565011820330968, + "grad_norm": 3.002030611038208, + "learning_rate": 2.063081132992271e-06, + "loss": 0.3548, + "step": 7099 + }, + { + "epoch": 3.3569739952718676, + "grad_norm": 2.927603006362915, + "learning_rate": 2.0624669204952847e-06, + "loss": 0.3759, + "step": 7100 + }, + { + "epoch": 
3.3574468085106384, + "grad_norm": 2.911393165588379, + "learning_rate": 2.061852735241563e-06, + "loss": 0.3599, + "step": 7101 + }, + { + "epoch": 3.357919621749409, + "grad_norm": 3.0596864223480225, + "learning_rate": 2.0612385772693517e-06, + "loss": 0.3557, + "step": 7102 + }, + { + "epoch": 3.3583924349881795, + "grad_norm": 2.7869808673858643, + "learning_rate": 2.0606244466168905e-06, + "loss": 0.3696, + "step": 7103 + }, + { + "epoch": 3.3588652482269503, + "grad_norm": 2.927715539932251, + "learning_rate": 2.060010343322417e-06, + "loss": 0.3309, + "step": 7104 + }, + { + "epoch": 3.359338061465721, + "grad_norm": 3.44653058052063, + "learning_rate": 2.059396267424171e-06, + "loss": 0.4453, + "step": 7105 + }, + { + "epoch": 3.359810874704492, + "grad_norm": 3.047652244567871, + "learning_rate": 2.0587822189603873e-06, + "loss": 0.3615, + "step": 7106 + }, + { + "epoch": 3.3602836879432623, + "grad_norm": 2.6640517711639404, + "learning_rate": 2.0581681979693002e-06, + "loss": 0.3716, + "step": 7107 + }, + { + "epoch": 3.360756501182033, + "grad_norm": 2.8253493309020996, + "learning_rate": 2.0575542044891424e-06, + "loss": 0.3485, + "step": 7108 + }, + { + "epoch": 3.361229314420804, + "grad_norm": 3.0512938499450684, + "learning_rate": 2.0569402385581433e-06, + "loss": 0.4582, + "step": 7109 + }, + { + "epoch": 3.3617021276595747, + "grad_norm": 2.935060739517212, + "learning_rate": 2.0563263002145333e-06, + "loss": 0.425, + "step": 7110 + }, + { + "epoch": 3.362174940898345, + "grad_norm": 3.2708780765533447, + "learning_rate": 2.0557123894965396e-06, + "loss": 0.4193, + "step": 7111 + }, + { + "epoch": 3.362647754137116, + "grad_norm": 2.758329391479492, + "learning_rate": 2.055098506442386e-06, + "loss": 0.3754, + "step": 7112 + }, + { + "epoch": 3.3631205673758866, + "grad_norm": 3.0359015464782715, + "learning_rate": 2.0544846510902987e-06, + "loss": 0.4207, + "step": 7113 + }, + { + "epoch": 3.3635933806146574, + "grad_norm": 3.096968412399292, + "learning_rate": 2.0538708234784983e-06, + "loss": 0.4303, + "step": 7114 + }, + { + "epoch": 3.3640661938534278, + "grad_norm": 3.0777673721313477, + "learning_rate": 2.053257023645206e-06, + "loss": 0.3904, + "step": 7115 + }, + { + "epoch": 3.3645390070921986, + "grad_norm": 2.9483232498168945, + "learning_rate": 2.0526432516286394e-06, + "loss": 0.3949, + "step": 7116 + }, + { + "epoch": 3.3650118203309693, + "grad_norm": 2.839067220687866, + "learning_rate": 2.0520295074670154e-06, + "loss": 0.3705, + "step": 7117 + }, + { + "epoch": 3.36548463356974, + "grad_norm": 3.0450778007507324, + "learning_rate": 2.0514157911985506e-06, + "loss": 0.3987, + "step": 7118 + }, + { + "epoch": 3.3659574468085105, + "grad_norm": 3.425318717956543, + "learning_rate": 2.0508021028614564e-06, + "loss": 0.3941, + "step": 7119 + }, + { + "epoch": 3.3664302600472813, + "grad_norm": 2.9509286880493164, + "learning_rate": 2.0501884424939465e-06, + "loss": 0.354, + "step": 7120 + }, + { + "epoch": 3.366903073286052, + "grad_norm": 2.799504518508911, + "learning_rate": 2.0495748101342303e-06, + "loss": 0.3891, + "step": 7121 + }, + { + "epoch": 3.3673758865248224, + "grad_norm": 2.9140994548797607, + "learning_rate": 2.048961205820515e-06, + "loss": 0.3638, + "step": 7122 + }, + { + "epoch": 3.3678486997635932, + "grad_norm": 2.8074216842651367, + "learning_rate": 2.0483476295910077e-06, + "loss": 0.3501, + "step": 7123 + }, + { + "epoch": 3.368321513002364, + "grad_norm": 2.770829677581787, + "learning_rate": 2.0477340814839126e-06, + "loss": 
0.3774, + "step": 7124 + }, + { + "epoch": 3.368794326241135, + "grad_norm": 2.581655502319336, + "learning_rate": 2.047120561537434e-06, + "loss": 0.3523, + "step": 7125 + }, + { + "epoch": 3.3692671394799056, + "grad_norm": 3.4234209060668945, + "learning_rate": 2.046507069789772e-06, + "loss": 0.4191, + "step": 7126 + }, + { + "epoch": 3.369739952718676, + "grad_norm": 2.669860601425171, + "learning_rate": 2.045893606279126e-06, + "loss": 0.3542, + "step": 7127 + }, + { + "epoch": 3.370212765957447, + "grad_norm": 3.2426629066467285, + "learning_rate": 2.045280171043694e-06, + "loss": 0.4416, + "step": 7128 + }, + { + "epoch": 3.3706855791962176, + "grad_norm": 3.1318910121917725, + "learning_rate": 2.044666764121672e-06, + "loss": 0.3999, + "step": 7129 + }, + { + "epoch": 3.371158392434988, + "grad_norm": 2.7044012546539307, + "learning_rate": 2.044053385551254e-06, + "loss": 0.3907, + "step": 7130 + }, + { + "epoch": 3.3716312056737587, + "grad_norm": 2.9429895877838135, + "learning_rate": 2.0434400353706322e-06, + "loss": 0.3827, + "step": 7131 + }, + { + "epoch": 3.3721040189125295, + "grad_norm": 2.7258787155151367, + "learning_rate": 2.0428267136179973e-06, + "loss": 0.3688, + "step": 7132 + }, + { + "epoch": 3.3725768321513003, + "grad_norm": 2.765108108520508, + "learning_rate": 2.042213420331539e-06, + "loss": 0.4078, + "step": 7133 + }, + { + "epoch": 3.373049645390071, + "grad_norm": 3.2951347827911377, + "learning_rate": 2.0416001555494435e-06, + "loss": 0.4259, + "step": 7134 + }, + { + "epoch": 3.3735224586288415, + "grad_norm": 3.3917062282562256, + "learning_rate": 2.040986919309895e-06, + "loss": 0.5094, + "step": 7135 + }, + { + "epoch": 3.3739952718676123, + "grad_norm": 2.746434450149536, + "learning_rate": 2.04037371165108e-06, + "loss": 0.3513, + "step": 7136 + }, + { + "epoch": 3.374468085106383, + "grad_norm": 3.268731117248535, + "learning_rate": 2.0397605326111774e-06, + "loss": 0.3909, + "step": 7137 + }, + { + "epoch": 3.3749408983451534, + "grad_norm": 2.8498165607452393, + "learning_rate": 2.0391473822283692e-06, + "loss": 0.3657, + "step": 7138 + }, + { + "epoch": 3.3754137115839242, + "grad_norm": 2.855966567993164, + "learning_rate": 2.0385342605408325e-06, + "loss": 0.3927, + "step": 7139 + }, + { + "epoch": 3.375886524822695, + "grad_norm": 3.1839048862457275, + "learning_rate": 2.0379211675867438e-06, + "loss": 0.4476, + "step": 7140 + }, + { + "epoch": 3.376359338061466, + "grad_norm": 2.9379947185516357, + "learning_rate": 2.037308103404278e-06, + "loss": 0.3657, + "step": 7141 + }, + { + "epoch": 3.3768321513002366, + "grad_norm": 2.9251210689544678, + "learning_rate": 2.0366950680316073e-06, + "loss": 0.3975, + "step": 7142 + }, + { + "epoch": 3.377304964539007, + "grad_norm": 2.811885118484497, + "learning_rate": 2.036082061506904e-06, + "loss": 0.3064, + "step": 7143 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 2.755229949951172, + "learning_rate": 2.0354690838683363e-06, + "loss": 0.3328, + "step": 7144 + }, + { + "epoch": 3.3782505910165486, + "grad_norm": 3.006819725036621, + "learning_rate": 2.0348561351540706e-06, + "loss": 0.4168, + "step": 7145 + }, + { + "epoch": 3.378723404255319, + "grad_norm": 2.8788509368896484, + "learning_rate": 2.034243215402275e-06, + "loss": 0.4123, + "step": 7146 + }, + { + "epoch": 3.3791962174940897, + "grad_norm": 2.9732980728149414, + "learning_rate": 2.033630324651112e-06, + "loss": 0.3371, + "step": 7147 + }, + { + "epoch": 3.3796690307328605, + "grad_norm": 2.7731754779815674, + 
"learning_rate": 2.033017462938744e-06, + "loss": 0.382, + "step": 7148 + }, + { + "epoch": 3.3801418439716313, + "grad_norm": 2.766395092010498, + "learning_rate": 2.032404630303331e-06, + "loss": 0.3295, + "step": 7149 + }, + { + "epoch": 3.380614657210402, + "grad_norm": 3.197960138320923, + "learning_rate": 2.03179182678303e-06, + "loss": 0.354, + "step": 7150 + }, + { + "epoch": 3.3810874704491725, + "grad_norm": 3.048553228378296, + "learning_rate": 2.031179052416e-06, + "loss": 0.4027, + "step": 7151 + }, + { + "epoch": 3.3815602836879433, + "grad_norm": 3.1527998447418213, + "learning_rate": 2.0305663072403934e-06, + "loss": 0.4229, + "step": 7152 + }, + { + "epoch": 3.382033096926714, + "grad_norm": 3.0407028198242188, + "learning_rate": 2.029953591294366e-06, + "loss": 0.4254, + "step": 7153 + }, + { + "epoch": 3.3825059101654844, + "grad_norm": 2.7170357704162598, + "learning_rate": 2.0293409046160673e-06, + "loss": 0.3307, + "step": 7154 + }, + { + "epoch": 3.382978723404255, + "grad_norm": 3.0128726959228516, + "learning_rate": 2.028728247243646e-06, + "loss": 0.3873, + "step": 7155 + }, + { + "epoch": 3.383451536643026, + "grad_norm": 4.861877918243408, + "learning_rate": 2.0281156192152507e-06, + "loss": 0.4371, + "step": 7156 + }, + { + "epoch": 3.383924349881797, + "grad_norm": 2.890249252319336, + "learning_rate": 2.0275030205690257e-06, + "loss": 0.3899, + "step": 7157 + }, + { + "epoch": 3.3843971631205676, + "grad_norm": 3.0774779319763184, + "learning_rate": 2.026890451343117e-06, + "loss": 0.4151, + "step": 7158 + }, + { + "epoch": 3.384869976359338, + "grad_norm": 2.8705947399139404, + "learning_rate": 2.026277911575665e-06, + "loss": 0.4004, + "step": 7159 + }, + { + "epoch": 3.3853427895981087, + "grad_norm": 3.170760154724121, + "learning_rate": 2.0256654013048096e-06, + "loss": 0.4442, + "step": 7160 + }, + { + "epoch": 3.3858156028368795, + "grad_norm": 4.211156368255615, + "learning_rate": 2.0250529205686905e-06, + "loss": 0.4605, + "step": 7161 + }, + { + "epoch": 3.38628841607565, + "grad_norm": 2.513519287109375, + "learning_rate": 2.0244404694054435e-06, + "loss": 0.3506, + "step": 7162 + }, + { + "epoch": 3.3867612293144207, + "grad_norm": 3.1558821201324463, + "learning_rate": 2.023828047853203e-06, + "loss": 0.43, + "step": 7163 + }, + { + "epoch": 3.3872340425531915, + "grad_norm": 3.6770291328430176, + "learning_rate": 2.023215655950102e-06, + "loss": 0.3911, + "step": 7164 + }, + { + "epoch": 3.3877068557919623, + "grad_norm": 2.6544485092163086, + "learning_rate": 2.022603293734271e-06, + "loss": 0.3306, + "step": 7165 + }, + { + "epoch": 3.388179669030733, + "grad_norm": 3.34232759475708, + "learning_rate": 2.0219909612438405e-06, + "loss": 0.4233, + "step": 7166 + }, + { + "epoch": 3.3886524822695034, + "grad_norm": 3.388561725616455, + "learning_rate": 2.0213786585169363e-06, + "loss": 0.4171, + "step": 7167 + }, + { + "epoch": 3.3891252955082742, + "grad_norm": 2.8606953620910645, + "learning_rate": 2.020766385591684e-06, + "loss": 0.3864, + "step": 7168 + }, + { + "epoch": 3.389598108747045, + "grad_norm": 3.0135979652404785, + "learning_rate": 2.020154142506208e-06, + "loss": 0.3933, + "step": 7169 + }, + { + "epoch": 3.3900709219858154, + "grad_norm": 2.5003163814544678, + "learning_rate": 2.0195419292986294e-06, + "loss": 0.3852, + "step": 7170 + }, + { + "epoch": 3.390543735224586, + "grad_norm": 2.8591368198394775, + "learning_rate": 2.0189297460070685e-06, + "loss": 0.3962, + "step": 7171 + }, + { + "epoch": 3.391016548463357, + 
"grad_norm": 2.8830223083496094, + "learning_rate": 2.0183175926696427e-06, + "loss": 0.3632, + "step": 7172 + }, + { + "epoch": 3.391489361702128, + "grad_norm": 3.3904542922973633, + "learning_rate": 2.0177054693244674e-06, + "loss": 0.4284, + "step": 7173 + }, + { + "epoch": 3.3919621749408986, + "grad_norm": 3.0325920581817627, + "learning_rate": 2.0170933760096585e-06, + "loss": 0.4331, + "step": 7174 + }, + { + "epoch": 3.392434988179669, + "grad_norm": 2.60345196723938, + "learning_rate": 2.016481312763327e-06, + "loss": 0.4077, + "step": 7175 + }, + { + "epoch": 3.3929078014184397, + "grad_norm": 2.8146891593933105, + "learning_rate": 2.0158692796235845e-06, + "loss": 0.4224, + "step": 7176 + }, + { + "epoch": 3.3933806146572105, + "grad_norm": 2.8158490657806396, + "learning_rate": 2.0152572766285396e-06, + "loss": 0.3454, + "step": 7177 + }, + { + "epoch": 3.393853427895981, + "grad_norm": 3.2753400802612305, + "learning_rate": 2.0146453038162978e-06, + "loss": 0.3615, + "step": 7178 + }, + { + "epoch": 3.3943262411347517, + "grad_norm": 3.0527124404907227, + "learning_rate": 2.0140333612249655e-06, + "loss": 0.415, + "step": 7179 + }, + { + "epoch": 3.3947990543735225, + "grad_norm": 2.6813764572143555, + "learning_rate": 2.0134214488926435e-06, + "loss": 0.3391, + "step": 7180 + }, + { + "epoch": 3.3952718676122933, + "grad_norm": 2.809319496154785, + "learning_rate": 2.0128095668574356e-06, + "loss": 0.3123, + "step": 7181 + }, + { + "epoch": 3.395744680851064, + "grad_norm": 2.6619064807891846, + "learning_rate": 2.0121977151574396e-06, + "loss": 0.4222, + "step": 7182 + }, + { + "epoch": 3.3962174940898344, + "grad_norm": 2.9201200008392334, + "learning_rate": 2.0115858938307516e-06, + "loss": 0.3712, + "step": 7183 + }, + { + "epoch": 3.396690307328605, + "grad_norm": 3.2058637142181396, + "learning_rate": 2.0109741029154696e-06, + "loss": 0.4004, + "step": 7184 + }, + { + "epoch": 3.397163120567376, + "grad_norm": 2.821855306625366, + "learning_rate": 2.0103623424496862e-06, + "loss": 0.4053, + "step": 7185 + }, + { + "epoch": 3.3976359338061464, + "grad_norm": 3.0371549129486084, + "learning_rate": 2.009750612471492e-06, + "loss": 0.4246, + "step": 7186 + }, + { + "epoch": 3.398108747044917, + "grad_norm": 2.8827290534973145, + "learning_rate": 2.009138913018978e-06, + "loss": 0.3256, + "step": 7187 + }, + { + "epoch": 3.398581560283688, + "grad_norm": 3.168039560317993, + "learning_rate": 2.0085272441302305e-06, + "loss": 0.4233, + "step": 7188 + }, + { + "epoch": 3.3990543735224588, + "grad_norm": 3.259723663330078, + "learning_rate": 2.0079156058433374e-06, + "loss": 0.4168, + "step": 7189 + }, + { + "epoch": 3.3995271867612296, + "grad_norm": 2.456231117248535, + "learning_rate": 2.007303998196382e-06, + "loss": 0.3383, + "step": 7190 + }, + { + "epoch": 3.4, + "grad_norm": 2.735180377960205, + "learning_rate": 2.006692421227445e-06, + "loss": 0.3475, + "step": 7191 + }, + { + "epoch": 3.4004728132387707, + "grad_norm": 2.76263427734375, + "learning_rate": 2.006080874974609e-06, + "loss": 0.3651, + "step": 7192 + }, + { + "epoch": 3.4009456264775415, + "grad_norm": 3.36867094039917, + "learning_rate": 2.0054693594759504e-06, + "loss": 0.4479, + "step": 7193 + }, + { + "epoch": 3.401418439716312, + "grad_norm": 2.532167673110962, + "learning_rate": 2.004857874769547e-06, + "loss": 0.3818, + "step": 7194 + }, + { + "epoch": 3.4018912529550827, + "grad_norm": 2.8723537921905518, + "learning_rate": 2.0042464208934724e-06, + "loss": 0.3332, + "step": 7195 + }, + { + 
"epoch": 3.4023640661938535, + "grad_norm": 2.676460027694702, + "learning_rate": 2.0036349978857987e-06, + "loss": 0.3488, + "step": 7196 + }, + { + "epoch": 3.4028368794326243, + "grad_norm": 2.805851459503174, + "learning_rate": 2.0030236057845983e-06, + "loss": 0.3796, + "step": 7197 + }, + { + "epoch": 3.403309692671395, + "grad_norm": 2.688988447189331, + "learning_rate": 2.0024122446279377e-06, + "loss": 0.3707, + "step": 7198 + }, + { + "epoch": 3.4037825059101654, + "grad_norm": 3.118720293045044, + "learning_rate": 2.0018009144538853e-06, + "loss": 0.4064, + "step": 7199 + }, + { + "epoch": 3.404255319148936, + "grad_norm": 2.876507520675659, + "learning_rate": 2.001189615300506e-06, + "loss": 0.3543, + "step": 7200 + }, + { + "epoch": 3.404728132387707, + "grad_norm": 3.0043466091156006, + "learning_rate": 2.000578347205861e-06, + "loss": 0.3833, + "step": 7201 + }, + { + "epoch": 3.4052009456264773, + "grad_norm": 3.1057114601135254, + "learning_rate": 1.9999671102080133e-06, + "loss": 0.4154, + "step": 7202 + }, + { + "epoch": 3.405673758865248, + "grad_norm": 2.9791855812072754, + "learning_rate": 1.9993559043450202e-06, + "loss": 0.3865, + "step": 7203 + }, + { + "epoch": 3.406146572104019, + "grad_norm": 3.4403460025787354, + "learning_rate": 1.9987447296549407e-06, + "loss": 0.3883, + "step": 7204 + }, + { + "epoch": 3.4066193853427897, + "grad_norm": 2.9962027072906494, + "learning_rate": 1.998133586175829e-06, + "loss": 0.3796, + "step": 7205 + }, + { + "epoch": 3.40709219858156, + "grad_norm": 3.0613129138946533, + "learning_rate": 1.997522473945737e-06, + "loss": 0.3917, + "step": 7206 + }, + { + "epoch": 3.407565011820331, + "grad_norm": 3.065985679626465, + "learning_rate": 1.996911393002718e-06, + "loss": 0.3521, + "step": 7207 + }, + { + "epoch": 3.4080378250591017, + "grad_norm": 2.976177930831909, + "learning_rate": 1.996300343384821e-06, + "loss": 0.3852, + "step": 7208 + }, + { + "epoch": 3.4085106382978725, + "grad_norm": 3.3587961196899414, + "learning_rate": 1.995689325130092e-06, + "loss": 0.3947, + "step": 7209 + }, + { + "epoch": 3.408983451536643, + "grad_norm": 2.626983165740967, + "learning_rate": 1.995078338276578e-06, + "loss": 0.316, + "step": 7210 + }, + { + "epoch": 3.4094562647754136, + "grad_norm": 3.14713978767395, + "learning_rate": 1.9944673828623217e-06, + "loss": 0.4008, + "step": 7211 + }, + { + "epoch": 3.4099290780141844, + "grad_norm": 2.968918800354004, + "learning_rate": 1.993856458925365e-06, + "loss": 0.439, + "step": 7212 + }, + { + "epoch": 3.4104018912529552, + "grad_norm": 2.7724127769470215, + "learning_rate": 1.9932455665037476e-06, + "loss": 0.3941, + "step": 7213 + }, + { + "epoch": 3.4108747044917256, + "grad_norm": 2.963146448135376, + "learning_rate": 1.9926347056355057e-06, + "loss": 0.3893, + "step": 7214 + }, + { + "epoch": 3.4113475177304964, + "grad_norm": 2.791637420654297, + "learning_rate": 1.9920238763586765e-06, + "loss": 0.4068, + "step": 7215 + }, + { + "epoch": 3.411820330969267, + "grad_norm": 3.030275583267212, + "learning_rate": 1.9914130787112924e-06, + "loss": 0.3828, + "step": 7216 + }, + { + "epoch": 3.412293144208038, + "grad_norm": 3.113128900527954, + "learning_rate": 1.990802312731387e-06, + "loss": 0.3903, + "step": 7217 + }, + { + "epoch": 3.4127659574468083, + "grad_norm": 3.104170322418213, + "learning_rate": 1.9901915784569884e-06, + "loss": 0.4171, + "step": 7218 + }, + { + "epoch": 3.413238770685579, + "grad_norm": 3.1247572898864746, + "learning_rate": 1.989580875926125e-06, + "loss": 
0.4022, + "step": 7219 + }, + { + "epoch": 3.41371158392435, + "grad_norm": 2.9487457275390625, + "learning_rate": 1.988970205176822e-06, + "loss": 0.3948, + "step": 7220 + }, + { + "epoch": 3.4141843971631207, + "grad_norm": 2.8763654232025146, + "learning_rate": 1.9883595662471028e-06, + "loss": 0.3588, + "step": 7221 + }, + { + "epoch": 3.414657210401891, + "grad_norm": 2.563152551651001, + "learning_rate": 1.987748959174991e-06, + "loss": 0.3509, + "step": 7222 + }, + { + "epoch": 3.415130023640662, + "grad_norm": 3.148759365081787, + "learning_rate": 1.9871383839985053e-06, + "loss": 0.4364, + "step": 7223 + }, + { + "epoch": 3.4156028368794327, + "grad_norm": 2.8187363147735596, + "learning_rate": 1.986527840755663e-06, + "loss": 0.3803, + "step": 7224 + }, + { + "epoch": 3.4160756501182035, + "grad_norm": 3.009376287460327, + "learning_rate": 1.985917329484481e-06, + "loss": 0.3841, + "step": 7225 + }, + { + "epoch": 3.416548463356974, + "grad_norm": 2.869291067123413, + "learning_rate": 1.985306850222972e-06, + "loss": 0.3877, + "step": 7226 + }, + { + "epoch": 3.4170212765957446, + "grad_norm": 3.108461856842041, + "learning_rate": 1.9846964030091497e-06, + "loss": 0.3767, + "step": 7227 + }, + { + "epoch": 3.4174940898345154, + "grad_norm": 3.096320629119873, + "learning_rate": 1.9840859878810226e-06, + "loss": 0.4603, + "step": 7228 + }, + { + "epoch": 3.417966903073286, + "grad_norm": 2.8519909381866455, + "learning_rate": 1.983475604876598e-06, + "loss": 0.3263, + "step": 7229 + }, + { + "epoch": 3.4184397163120566, + "grad_norm": 3.192051410675049, + "learning_rate": 1.9828652540338835e-06, + "loss": 0.4132, + "step": 7230 + }, + { + "epoch": 3.4189125295508274, + "grad_norm": 3.0398056507110596, + "learning_rate": 1.9822549353908817e-06, + "loss": 0.4038, + "step": 7231 + }, + { + "epoch": 3.419385342789598, + "grad_norm": 3.12247896194458, + "learning_rate": 1.9816446489855944e-06, + "loss": 0.409, + "step": 7232 + }, + { + "epoch": 3.419858156028369, + "grad_norm": 3.20316481590271, + "learning_rate": 1.9810343948560223e-06, + "loss": 0.4058, + "step": 7233 + }, + { + "epoch": 3.4203309692671393, + "grad_norm": 3.3397457599639893, + "learning_rate": 1.9804241730401625e-06, + "loss": 0.3657, + "step": 7234 + }, + { + "epoch": 3.42080378250591, + "grad_norm": 3.928691864013672, + "learning_rate": 1.979813983576012e-06, + "loss": 0.361, + "step": 7235 + }, + { + "epoch": 3.421276595744681, + "grad_norm": 3.5814051628112793, + "learning_rate": 1.9792038265015635e-06, + "loss": 0.3975, + "step": 7236 + }, + { + "epoch": 3.4217494089834517, + "grad_norm": 2.8578879833221436, + "learning_rate": 1.9785937018548086e-06, + "loss": 0.3915, + "step": 7237 + }, + { + "epoch": 3.422222222222222, + "grad_norm": 3.0343220233917236, + "learning_rate": 1.977983609673738e-06, + "loss": 0.3686, + "step": 7238 + }, + { + "epoch": 3.422695035460993, + "grad_norm": 3.2719056606292725, + "learning_rate": 1.977373549996338e-06, + "loss": 0.3905, + "step": 7239 + }, + { + "epoch": 3.4231678486997636, + "grad_norm": 2.6638169288635254, + "learning_rate": 1.976763522860597e-06, + "loss": 0.3631, + "step": 7240 + }, + { + "epoch": 3.4236406619385344, + "grad_norm": 2.7679927349090576, + "learning_rate": 1.9761535283044967e-06, + "loss": 0.377, + "step": 7241 + }, + { + "epoch": 3.424113475177305, + "grad_norm": 2.774540424346924, + "learning_rate": 1.975543566366019e-06, + "loss": 0.3509, + "step": 7242 + }, + { + "epoch": 3.4245862884160756, + "grad_norm": 2.811659336090088, + "learning_rate": 
1.9749336370831438e-06, + "loss": 0.3835, + "step": 7243 + }, + { + "epoch": 3.4250591016548464, + "grad_norm": 2.8533360958099365, + "learning_rate": 1.9743237404938478e-06, + "loss": 0.3765, + "step": 7244 + }, + { + "epoch": 3.425531914893617, + "grad_norm": 2.712301015853882, + "learning_rate": 1.9737138766361084e-06, + "loss": 0.3797, + "step": 7245 + }, + { + "epoch": 3.4260047281323875, + "grad_norm": 2.9763426780700684, + "learning_rate": 1.9731040455478986e-06, + "loss": 0.4223, + "step": 7246 + }, + { + "epoch": 3.4264775413711583, + "grad_norm": 2.8802297115325928, + "learning_rate": 1.9724942472671882e-06, + "loss": 0.3666, + "step": 7247 + }, + { + "epoch": 3.426950354609929, + "grad_norm": 2.934107542037964, + "learning_rate": 1.9718844818319486e-06, + "loss": 0.3612, + "step": 7248 + }, + { + "epoch": 3.4274231678487, + "grad_norm": 3.0172696113586426, + "learning_rate": 1.9712747492801467e-06, + "loss": 0.3643, + "step": 7249 + }, + { + "epoch": 3.4278959810874703, + "grad_norm": 3.368419647216797, + "learning_rate": 1.970665049649748e-06, + "loss": 0.4511, + "step": 7250 + }, + { + "epoch": 3.428368794326241, + "grad_norm": 3.077819585800171, + "learning_rate": 1.9700553829787162e-06, + "loss": 0.4013, + "step": 7251 + }, + { + "epoch": 3.428841607565012, + "grad_norm": 2.690673828125, + "learning_rate": 1.96944574930501e-06, + "loss": 0.3776, + "step": 7252 + }, + { + "epoch": 3.4293144208037827, + "grad_norm": 3.1122169494628906, + "learning_rate": 1.9688361486665924e-06, + "loss": 0.3802, + "step": 7253 + }, + { + "epoch": 3.429787234042553, + "grad_norm": 2.9874207973480225, + "learning_rate": 1.968226581101417e-06, + "loss": 0.4492, + "step": 7254 + }, + { + "epoch": 3.430260047281324, + "grad_norm": 2.885493278503418, + "learning_rate": 1.967617046647442e-06, + "loss": 0.3958, + "step": 7255 + }, + { + "epoch": 3.4307328605200946, + "grad_norm": 2.953897476196289, + "learning_rate": 1.9670075453426195e-06, + "loss": 0.3973, + "step": 7256 + }, + { + "epoch": 3.4312056737588654, + "grad_norm": 2.685088634490967, + "learning_rate": 1.966398077224899e-06, + "loss": 0.393, + "step": 7257 + }, + { + "epoch": 3.431678486997636, + "grad_norm": 4.035208702087402, + "learning_rate": 1.9657886423322313e-06, + "loss": 0.4263, + "step": 7258 + }, + { + "epoch": 3.4321513002364066, + "grad_norm": 2.942042827606201, + "learning_rate": 1.965179240702562e-06, + "loss": 0.4319, + "step": 7259 + }, + { + "epoch": 3.4326241134751774, + "grad_norm": 3.0794999599456787, + "learning_rate": 1.9645698723738356e-06, + "loss": 0.4199, + "step": 7260 + }, + { + "epoch": 3.433096926713948, + "grad_norm": 3.0653584003448486, + "learning_rate": 1.963960537383996e-06, + "loss": 0.3723, + "step": 7261 + }, + { + "epoch": 3.4335697399527185, + "grad_norm": 3.1571545600891113, + "learning_rate": 1.963351235770983e-06, + "loss": 0.4211, + "step": 7262 + }, + { + "epoch": 3.4340425531914893, + "grad_norm": 2.6681735515594482, + "learning_rate": 1.962741967572736e-06, + "loss": 0.3333, + "step": 7263 + }, + { + "epoch": 3.43451536643026, + "grad_norm": 2.9747934341430664, + "learning_rate": 1.9621327328271907e-06, + "loss": 0.3896, + "step": 7264 + }, + { + "epoch": 3.434988179669031, + "grad_norm": 2.7994508743286133, + "learning_rate": 1.9615235315722814e-06, + "loss": 0.3642, + "step": 7265 + }, + { + "epoch": 3.4354609929078013, + "grad_norm": 2.933928966522217, + "learning_rate": 1.9609143638459405e-06, + "loss": 0.3955, + "step": 7266 + }, + { + "epoch": 3.435933806146572, + "grad_norm": 
2.9577367305755615, + "learning_rate": 1.9603052296860983e-06, + "loss": 0.3437, + "step": 7267 + }, + { + "epoch": 3.436406619385343, + "grad_norm": 3.017282009124756, + "learning_rate": 1.959696129130684e-06, + "loss": 0.3784, + "step": 7268 + }, + { + "epoch": 3.4368794326241137, + "grad_norm": 3.2072815895080566, + "learning_rate": 1.959087062217622e-06, + "loss": 0.3901, + "step": 7269 + }, + { + "epoch": 3.437352245862884, + "grad_norm": 2.91153621673584, + "learning_rate": 1.9584780289848358e-06, + "loss": 0.4402, + "step": 7270 + }, + { + "epoch": 3.437825059101655, + "grad_norm": 2.846842050552368, + "learning_rate": 1.9578690294702495e-06, + "loss": 0.3804, + "step": 7271 + }, + { + "epoch": 3.4382978723404256, + "grad_norm": 3.0958521366119385, + "learning_rate": 1.957260063711781e-06, + "loss": 0.4103, + "step": 7272 + }, + { + "epoch": 3.4387706855791964, + "grad_norm": 2.9808530807495117, + "learning_rate": 1.9566511317473483e-06, + "loss": 0.4127, + "step": 7273 + }, + { + "epoch": 3.4392434988179668, + "grad_norm": 2.725851058959961, + "learning_rate": 1.9560422336148678e-06, + "loss": 0.3493, + "step": 7274 + }, + { + "epoch": 3.4397163120567376, + "grad_norm": 2.7861814498901367, + "learning_rate": 1.9554333693522515e-06, + "loss": 0.3703, + "step": 7275 + }, + { + "epoch": 3.4401891252955084, + "grad_norm": 3.128708839416504, + "learning_rate": 1.954824538997412e-06, + "loss": 0.3917, + "step": 7276 + }, + { + "epoch": 3.440661938534279, + "grad_norm": 3.117403268814087, + "learning_rate": 1.954215742588257e-06, + "loss": 0.3581, + "step": 7277 + }, + { + "epoch": 3.4411347517730495, + "grad_norm": 2.710076093673706, + "learning_rate": 1.9536069801626957e-06, + "loss": 0.3255, + "step": 7278 + }, + { + "epoch": 3.4416075650118203, + "grad_norm": 2.7732627391815186, + "learning_rate": 1.952998251758632e-06, + "loss": 0.375, + "step": 7279 + }, + { + "epoch": 3.442080378250591, + "grad_norm": 2.896050453186035, + "learning_rate": 1.9523895574139673e-06, + "loss": 0.4087, + "step": 7280 + }, + { + "epoch": 3.4425531914893615, + "grad_norm": 2.9051663875579834, + "learning_rate": 1.9517808971666048e-06, + "loss": 0.3423, + "step": 7281 + }, + { + "epoch": 3.4430260047281322, + "grad_norm": 3.0232038497924805, + "learning_rate": 1.9511722710544417e-06, + "loss": 0.364, + "step": 7282 + }, + { + "epoch": 3.443498817966903, + "grad_norm": 2.753870725631714, + "learning_rate": 1.9505636791153744e-06, + "loss": 0.3484, + "step": 7283 + }, + { + "epoch": 3.443971631205674, + "grad_norm": 2.944079637527466, + "learning_rate": 1.9499551213872983e-06, + "loss": 0.3354, + "step": 7284 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 3.1531970500946045, + "learning_rate": 1.949346597908104e-06, + "loss": 0.3394, + "step": 7285 + }, + { + "epoch": 3.444917257683215, + "grad_norm": 3.0357189178466797, + "learning_rate": 1.948738108715683e-06, + "loss": 0.4302, + "step": 7286 + }, + { + "epoch": 3.445390070921986, + "grad_norm": 3.3698086738586426, + "learning_rate": 1.948129653847923e-06, + "loss": 0.419, + "step": 7287 + }, + { + "epoch": 3.4458628841607566, + "grad_norm": 3.343132495880127, + "learning_rate": 1.947521233342709e-06, + "loss": 0.3895, + "step": 7288 + }, + { + "epoch": 3.446335697399527, + "grad_norm": 3.1905252933502197, + "learning_rate": 1.9469128472379257e-06, + "loss": 0.429, + "step": 7289 + }, + { + "epoch": 3.4468085106382977, + "grad_norm": 2.8517212867736816, + "learning_rate": 1.946304495571454e-06, + "loss": 0.3513, + "step": 7290 + }, + { + "epoch": 
3.4472813238770685, + "grad_norm": 2.7713496685028076, + "learning_rate": 1.9456961783811735e-06, + "loss": 0.4331, + "step": 7291 + }, + { + "epoch": 3.4477541371158393, + "grad_norm": 2.8258652687072754, + "learning_rate": 1.945087895704962e-06, + "loss": 0.3539, + "step": 7292 + }, + { + "epoch": 3.44822695035461, + "grad_norm": 2.757322072982788, + "learning_rate": 1.9444796475806925e-06, + "loss": 0.3865, + "step": 7293 + }, + { + "epoch": 3.4486997635933805, + "grad_norm": 2.8410696983337402, + "learning_rate": 1.943871434046241e-06, + "loss": 0.3612, + "step": 7294 + }, + { + "epoch": 3.4491725768321513, + "grad_norm": 3.2297637462615967, + "learning_rate": 1.9432632551394753e-06, + "loss": 0.3956, + "step": 7295 + }, + { + "epoch": 3.449645390070922, + "grad_norm": 2.991351842880249, + "learning_rate": 1.9426551108982666e-06, + "loss": 0.3864, + "step": 7296 + }, + { + "epoch": 3.4501182033096924, + "grad_norm": 2.7942168712615967, + "learning_rate": 1.94204700136048e-06, + "loss": 0.4314, + "step": 7297 + }, + { + "epoch": 3.4505910165484632, + "grad_norm": 2.8188698291778564, + "learning_rate": 1.9414389265639805e-06, + "loss": 0.3585, + "step": 7298 + }, + { + "epoch": 3.451063829787234, + "grad_norm": 3.2826895713806152, + "learning_rate": 1.9408308865466295e-06, + "loss": 0.4614, + "step": 7299 + }, + { + "epoch": 3.451536643026005, + "grad_norm": 3.273867130279541, + "learning_rate": 1.9402228813462865e-06, + "loss": 0.3533, + "step": 7300 + }, + { + "epoch": 3.4520094562647756, + "grad_norm": 3.5334157943725586, + "learning_rate": 1.939614911000811e-06, + "loss": 0.4088, + "step": 7301 + }, + { + "epoch": 3.452482269503546, + "grad_norm": 2.983908176422119, + "learning_rate": 1.9390069755480583e-06, + "loss": 0.3725, + "step": 7302 + }, + { + "epoch": 3.4529550827423168, + "grad_norm": 2.893660306930542, + "learning_rate": 1.93839907502588e-06, + "loss": 0.3746, + "step": 7303 + }, + { + "epoch": 3.4534278959810876, + "grad_norm": 3.1762871742248535, + "learning_rate": 1.9377912094721295e-06, + "loss": 0.446, + "step": 7304 + }, + { + "epoch": 3.453900709219858, + "grad_norm": 3.3231537342071533, + "learning_rate": 1.9371833789246554e-06, + "loss": 0.4837, + "step": 7305 + }, + { + "epoch": 3.4543735224586287, + "grad_norm": 3.548333168029785, + "learning_rate": 1.936575583421304e-06, + "loss": 0.3911, + "step": 7306 + }, + { + "epoch": 3.4548463356973995, + "grad_norm": 3.0627071857452393, + "learning_rate": 1.9359678229999213e-06, + "loss": 0.3751, + "step": 7307 + }, + { + "epoch": 3.4553191489361703, + "grad_norm": 2.797663927078247, + "learning_rate": 1.9353600976983475e-06, + "loss": 0.41, + "step": 7308 + }, + { + "epoch": 3.455791962174941, + "grad_norm": 2.803269624710083, + "learning_rate": 1.9347524075544258e-06, + "loss": 0.3775, + "step": 7309 + }, + { + "epoch": 3.4562647754137115, + "grad_norm": 2.828010320663452, + "learning_rate": 1.934144752605993e-06, + "loss": 0.375, + "step": 7310 + }, + { + "epoch": 3.4567375886524823, + "grad_norm": 3.456477165222168, + "learning_rate": 1.933537132890884e-06, + "loss": 0.4764, + "step": 7311 + }, + { + "epoch": 3.457210401891253, + "grad_norm": 2.723670244216919, + "learning_rate": 1.9329295484469354e-06, + "loss": 0.3581, + "step": 7312 + }, + { + "epoch": 3.4576832151300234, + "grad_norm": 3.9723474979400635, + "learning_rate": 1.9323219993119766e-06, + "loss": 0.3951, + "step": 7313 + }, + { + "epoch": 3.458156028368794, + "grad_norm": 2.951300859451294, + "learning_rate": 1.931714485523838e-06, + "loss": 0.3865, + 
"step": 7314 + }, + { + "epoch": 3.458628841607565, + "grad_norm": 2.9265835285186768, + "learning_rate": 1.931107007120347e-06, + "loss": 0.3731, + "step": 7315 + }, + { + "epoch": 3.459101654846336, + "grad_norm": 3.271883249282837, + "learning_rate": 1.930499564139327e-06, + "loss": 0.3971, + "step": 7316 + }, + { + "epoch": 3.4595744680851066, + "grad_norm": 2.8716280460357666, + "learning_rate": 1.929892156618603e-06, + "loss": 0.3332, + "step": 7317 + }, + { + "epoch": 3.460047281323877, + "grad_norm": 2.9820191860198975, + "learning_rate": 1.929284784595993e-06, + "loss": 0.3907, + "step": 7318 + }, + { + "epoch": 3.4605200945626478, + "grad_norm": 3.313225269317627, + "learning_rate": 1.9286774481093183e-06, + "loss": 0.3678, + "step": 7319 + }, + { + "epoch": 3.4609929078014185, + "grad_norm": 3.365387439727783, + "learning_rate": 1.928070147196394e-06, + "loss": 0.4894, + "step": 7320 + }, + { + "epoch": 3.461465721040189, + "grad_norm": 3.1723599433898926, + "learning_rate": 1.927462881895033e-06, + "loss": 0.4607, + "step": 7321 + }, + { + "epoch": 3.4619385342789597, + "grad_norm": 2.7644999027252197, + "learning_rate": 1.9268556522430483e-06, + "loss": 0.3627, + "step": 7322 + }, + { + "epoch": 3.4624113475177305, + "grad_norm": 2.65572190284729, + "learning_rate": 1.9262484582782483e-06, + "loss": 0.3893, + "step": 7323 + }, + { + "epoch": 3.4628841607565013, + "grad_norm": 2.992037773132324, + "learning_rate": 1.9256413000384415e-06, + "loss": 0.4175, + "step": 7324 + }, + { + "epoch": 3.463356973995272, + "grad_norm": 3.020496368408203, + "learning_rate": 1.925034177561433e-06, + "loss": 0.42, + "step": 7325 + }, + { + "epoch": 3.4638297872340424, + "grad_norm": 2.780334234237671, + "learning_rate": 1.9244270908850236e-06, + "loss": 0.4195, + "step": 7326 + }, + { + "epoch": 3.4643026004728132, + "grad_norm": 2.863028049468994, + "learning_rate": 1.9238200400470166e-06, + "loss": 0.3706, + "step": 7327 + }, + { + "epoch": 3.464775413711584, + "grad_norm": 3.2766900062561035, + "learning_rate": 1.923213025085209e-06, + "loss": 0.4506, + "step": 7328 + }, + { + "epoch": 3.4652482269503544, + "grad_norm": 2.7300634384155273, + "learning_rate": 1.9226060460373975e-06, + "loss": 0.3463, + "step": 7329 + }, + { + "epoch": 3.465721040189125, + "grad_norm": 3.136104106903076, + "learning_rate": 1.921999102941376e-06, + "loss": 0.3839, + "step": 7330 + }, + { + "epoch": 3.466193853427896, + "grad_norm": 2.944932699203491, + "learning_rate": 1.921392195834934e-06, + "loss": 0.432, + "step": 7331 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 3.428375005722046, + "learning_rate": 1.9207853247558647e-06, + "loss": 0.3407, + "step": 7332 + }, + { + "epoch": 3.4671394799054376, + "grad_norm": 3.3732450008392334, + "learning_rate": 1.9201784897419535e-06, + "loss": 0.361, + "step": 7333 + }, + { + "epoch": 3.467612293144208, + "grad_norm": 2.8291900157928467, + "learning_rate": 1.9195716908309836e-06, + "loss": 0.3805, + "step": 7334 + }, + { + "epoch": 3.4680851063829787, + "grad_norm": 3.3229610919952393, + "learning_rate": 1.9189649280607407e-06, + "loss": 0.3756, + "step": 7335 + }, + { + "epoch": 3.4685579196217495, + "grad_norm": 2.949416160583496, + "learning_rate": 1.918358201469004e-06, + "loss": 0.4316, + "step": 7336 + }, + { + "epoch": 3.46903073286052, + "grad_norm": 3.525501251220703, + "learning_rate": 1.9177515110935515e-06, + "loss": 0.4018, + "step": 7337 + }, + { + "epoch": 3.4695035460992907, + "grad_norm": 3.1439104080200195, + "learning_rate": 
1.917144856972159e-06, + "loss": 0.4176, + "step": 7338 + }, + { + "epoch": 3.4699763593380615, + "grad_norm": 3.0022377967834473, + "learning_rate": 1.9165382391426006e-06, + "loss": 0.3962, + "step": 7339 + }, + { + "epoch": 3.4704491725768323, + "grad_norm": 3.2174794673919678, + "learning_rate": 1.9159316576426482e-06, + "loss": 0.441, + "step": 7340 + }, + { + "epoch": 3.470921985815603, + "grad_norm": 2.965123414993286, + "learning_rate": 1.9153251125100694e-06, + "loss": 0.4105, + "step": 7341 + }, + { + "epoch": 3.4713947990543734, + "grad_norm": 2.722904920578003, + "learning_rate": 1.9147186037826333e-06, + "loss": 0.4102, + "step": 7342 + }, + { + "epoch": 3.4718676122931442, + "grad_norm": 3.4894051551818848, + "learning_rate": 1.9141121314981033e-06, + "loss": 0.4225, + "step": 7343 + }, + { + "epoch": 3.472340425531915, + "grad_norm": 2.828497886657715, + "learning_rate": 1.913505695694241e-06, + "loss": 0.374, + "step": 7344 + }, + { + "epoch": 3.4728132387706854, + "grad_norm": 3.3046014308929443, + "learning_rate": 1.9128992964088077e-06, + "loss": 0.3568, + "step": 7345 + }, + { + "epoch": 3.473286052009456, + "grad_norm": 2.927281618118286, + "learning_rate": 1.9122929336795605e-06, + "loss": 0.4308, + "step": 7346 + }, + { + "epoch": 3.473758865248227, + "grad_norm": 2.9569990634918213, + "learning_rate": 1.911686607544256e-06, + "loss": 0.3226, + "step": 7347 + }, + { + "epoch": 3.4742316784869978, + "grad_norm": 3.1061038970947266, + "learning_rate": 1.9110803180406468e-06, + "loss": 0.4426, + "step": 7348 + }, + { + "epoch": 3.4747044917257686, + "grad_norm": 2.9609580039978027, + "learning_rate": 1.9104740652064825e-06, + "loss": 0.3835, + "step": 7349 + }, + { + "epoch": 3.475177304964539, + "grad_norm": 3.1547608375549316, + "learning_rate": 1.9098678490795147e-06, + "loss": 0.3814, + "step": 7350 + }, + { + "epoch": 3.4756501182033097, + "grad_norm": 2.869022846221924, + "learning_rate": 1.909261669697487e-06, + "loss": 0.4048, + "step": 7351 + }, + { + "epoch": 3.4761229314420805, + "grad_norm": 3.0565078258514404, + "learning_rate": 1.908655527098146e-06, + "loss": 0.3736, + "step": 7352 + }, + { + "epoch": 3.476595744680851, + "grad_norm": 2.893603563308716, + "learning_rate": 1.9080494213192317e-06, + "loss": 0.3906, + "step": 7353 + }, + { + "epoch": 3.4770685579196217, + "grad_norm": 2.818938732147217, + "learning_rate": 1.9074433523984844e-06, + "loss": 0.3958, + "step": 7354 + }, + { + "epoch": 3.4775413711583925, + "grad_norm": 2.675461769104004, + "learning_rate": 1.9068373203736419e-06, + "loss": 0.3371, + "step": 7355 + }, + { + "epoch": 3.4780141843971633, + "grad_norm": 2.5831551551818848, + "learning_rate": 1.9062313252824384e-06, + "loss": 0.3365, + "step": 7356 + }, + { + "epoch": 3.478486997635934, + "grad_norm": 3.299736738204956, + "learning_rate": 1.9056253671626054e-06, + "loss": 0.3923, + "step": 7357 + }, + { + "epoch": 3.4789598108747044, + "grad_norm": 2.508787155151367, + "learning_rate": 1.905019446051876e-06, + "loss": 0.3367, + "step": 7358 + }, + { + "epoch": 3.479432624113475, + "grad_norm": 2.980327606201172, + "learning_rate": 1.9044135619879753e-06, + "loss": 0.3842, + "step": 7359 + }, + { + "epoch": 3.479905437352246, + "grad_norm": 3.2114269733428955, + "learning_rate": 1.9038077150086317e-06, + "loss": 0.4625, + "step": 7360 + }, + { + "epoch": 3.4803782505910164, + "grad_norm": 3.2119715213775635, + "learning_rate": 1.9032019051515677e-06, + "loss": 0.4197, + "step": 7361 + }, + { + "epoch": 3.480851063829787, + 
"grad_norm": 3.2967300415039062, + "learning_rate": 1.9025961324545034e-06, + "loss": 0.4462, + "step": 7362 + }, + { + "epoch": 3.481323877068558, + "grad_norm": 3.132643461227417, + "learning_rate": 1.9019903969551589e-06, + "loss": 0.4355, + "step": 7363 + }, + { + "epoch": 3.4817966903073287, + "grad_norm": 2.9940602779388428, + "learning_rate": 1.9013846986912493e-06, + "loss": 0.3584, + "step": 7364 + }, + { + "epoch": 3.482269503546099, + "grad_norm": 2.901935577392578, + "learning_rate": 1.9007790377004907e-06, + "loss": 0.3987, + "step": 7365 + }, + { + "epoch": 3.48274231678487, + "grad_norm": 3.143404006958008, + "learning_rate": 1.9001734140205939e-06, + "loss": 0.3642, + "step": 7366 + }, + { + "epoch": 3.4832151300236407, + "grad_norm": 3.6701388359069824, + "learning_rate": 1.899567827689267e-06, + "loss": 0.4111, + "step": 7367 + }, + { + "epoch": 3.4836879432624115, + "grad_norm": 2.6821651458740234, + "learning_rate": 1.8989622787442202e-06, + "loss": 0.3827, + "step": 7368 + }, + { + "epoch": 3.484160756501182, + "grad_norm": 3.3916640281677246, + "learning_rate": 1.8983567672231562e-06, + "loss": 0.4214, + "step": 7369 + }, + { + "epoch": 3.4846335697399526, + "grad_norm": 3.3192927837371826, + "learning_rate": 1.8977512931637788e-06, + "loss": 0.4375, + "step": 7370 + }, + { + "epoch": 3.4851063829787234, + "grad_norm": 3.177884578704834, + "learning_rate": 1.8971458566037877e-06, + "loss": 0.3944, + "step": 7371 + }, + { + "epoch": 3.4855791962174942, + "grad_norm": 3.1554532051086426, + "learning_rate": 1.896540457580879e-06, + "loss": 0.3712, + "step": 7372 + }, + { + "epoch": 3.4860520094562646, + "grad_norm": 2.7773821353912354, + "learning_rate": 1.8959350961327516e-06, + "loss": 0.3534, + "step": 7373 + }, + { + "epoch": 3.4865248226950354, + "grad_norm": 2.4792168140411377, + "learning_rate": 1.8953297722970956e-06, + "loss": 0.3571, + "step": 7374 + }, + { + "epoch": 3.486997635933806, + "grad_norm": 2.676361083984375, + "learning_rate": 1.8947244861116044e-06, + "loss": 0.3365, + "step": 7375 + }, + { + "epoch": 3.487470449172577, + "grad_norm": 2.9340765476226807, + "learning_rate": 1.8941192376139655e-06, + "loss": 0.4656, + "step": 7376 + }, + { + "epoch": 3.4879432624113473, + "grad_norm": 3.7924742698669434, + "learning_rate": 1.8935140268418646e-06, + "loss": 0.3639, + "step": 7377 + }, + { + "epoch": 3.488416075650118, + "grad_norm": 2.798912286758423, + "learning_rate": 1.892908853832986e-06, + "loss": 0.3741, + "step": 7378 + }, + { + "epoch": 3.488888888888889, + "grad_norm": 3.1731197834014893, + "learning_rate": 1.8923037186250112e-06, + "loss": 0.4041, + "step": 7379 + }, + { + "epoch": 3.4893617021276597, + "grad_norm": 2.893725633621216, + "learning_rate": 1.8916986212556182e-06, + "loss": 0.3103, + "step": 7380 + }, + { + "epoch": 3.48983451536643, + "grad_norm": 3.2489001750946045, + "learning_rate": 1.891093561762486e-06, + "loss": 0.328, + "step": 7381 + }, + { + "epoch": 3.490307328605201, + "grad_norm": 2.8076415061950684, + "learning_rate": 1.8904885401832862e-06, + "loss": 0.426, + "step": 7382 + }, + { + "epoch": 3.4907801418439717, + "grad_norm": 3.076544761657715, + "learning_rate": 1.8898835565556938e-06, + "loss": 0.3664, + "step": 7383 + }, + { + "epoch": 3.4912529550827425, + "grad_norm": 2.7615935802459717, + "learning_rate": 1.8892786109173769e-06, + "loss": 0.3718, + "step": 7384 + }, + { + "epoch": 3.491725768321513, + "grad_norm": 2.9050116539001465, + "learning_rate": 1.8886737033060023e-06, + "loss": 0.3456, + "step": 7385 
+ }, + { + "epoch": 3.4921985815602836, + "grad_norm": 2.4928293228149414, + "learning_rate": 1.8880688337592366e-06, + "loss": 0.3487, + "step": 7386 + }, + { + "epoch": 3.4926713947990544, + "grad_norm": 2.773418426513672, + "learning_rate": 1.88746400231474e-06, + "loss": 0.3771, + "step": 7387 + }, + { + "epoch": 3.493144208037825, + "grad_norm": 2.7137296199798584, + "learning_rate": 1.886859209010175e-06, + "loss": 0.376, + "step": 7388 + }, + { + "epoch": 3.4936170212765956, + "grad_norm": 3.327976942062378, + "learning_rate": 1.886254453883199e-06, + "loss": 0.3481, + "step": 7389 + }, + { + "epoch": 3.4940898345153664, + "grad_norm": 3.8637235164642334, + "learning_rate": 1.8856497369714655e-06, + "loss": 0.3726, + "step": 7390 + }, + { + "epoch": 3.494562647754137, + "grad_norm": 3.1517951488494873, + "learning_rate": 1.88504505831263e-06, + "loss": 0.4459, + "step": 7391 + }, + { + "epoch": 3.495035460992908, + "grad_norm": 3.160130262374878, + "learning_rate": 1.884440417944342e-06, + "loss": 0.3918, + "step": 7392 + }, + { + "epoch": 3.4955082742316783, + "grad_norm": 2.6518726348876953, + "learning_rate": 1.8838358159042503e-06, + "loss": 0.3493, + "step": 7393 + }, + { + "epoch": 3.495981087470449, + "grad_norm": 2.7487380504608154, + "learning_rate": 1.8832312522300009e-06, + "loss": 0.3846, + "step": 7394 + }, + { + "epoch": 3.49645390070922, + "grad_norm": 3.062293291091919, + "learning_rate": 1.8826267269592355e-06, + "loss": 0.3792, + "step": 7395 + }, + { + "epoch": 3.4969267139479907, + "grad_norm": 3.3636794090270996, + "learning_rate": 1.8820222401295979e-06, + "loss": 0.4504, + "step": 7396 + }, + { + "epoch": 3.497399527186761, + "grad_norm": 3.230196237564087, + "learning_rate": 1.8814177917787246e-06, + "loss": 0.3953, + "step": 7397 + }, + { + "epoch": 3.497872340425532, + "grad_norm": 2.891002893447876, + "learning_rate": 1.8808133819442541e-06, + "loss": 0.3923, + "step": 7398 + }, + { + "epoch": 3.4983451536643027, + "grad_norm": 2.7478551864624023, + "learning_rate": 1.8802090106638196e-06, + "loss": 0.4115, + "step": 7399 + }, + { + "epoch": 3.4988179669030735, + "grad_norm": 3.0452797412872314, + "learning_rate": 1.8796046779750515e-06, + "loss": 0.4154, + "step": 7400 + }, + { + "epoch": 3.499290780141844, + "grad_norm": 3.0759124755859375, + "learning_rate": 1.87900038391558e-06, + "loss": 0.4277, + "step": 7401 + }, + { + "epoch": 3.4997635933806146, + "grad_norm": 2.7563929557800293, + "learning_rate": 1.8783961285230314e-06, + "loss": 0.3896, + "step": 7402 + }, + { + "epoch": 3.5002364066193854, + "grad_norm": 2.661916494369507, + "learning_rate": 1.87779191183503e-06, + "loss": 0.3625, + "step": 7403 + }, + { + "epoch": 3.500709219858156, + "grad_norm": 2.881241798400879, + "learning_rate": 1.877187733889199e-06, + "loss": 0.3724, + "step": 7404 + }, + { + "epoch": 3.5011820330969265, + "grad_norm": 3.2405693531036377, + "learning_rate": 1.8765835947231554e-06, + "loss": 0.3974, + "step": 7405 + }, + { + "epoch": 3.5016548463356973, + "grad_norm": 2.924288034439087, + "learning_rate": 1.8759794943745184e-06, + "loss": 0.3467, + "step": 7406 + }, + { + "epoch": 3.502127659574468, + "grad_norm": 3.031663656234741, + "learning_rate": 1.8753754328809027e-06, + "loss": 0.3995, + "step": 7407 + }, + { + "epoch": 3.5026004728132385, + "grad_norm": 3.028277635574341, + "learning_rate": 1.874771410279919e-06, + "loss": 0.3741, + "step": 7408 + }, + { + "epoch": 3.5030732860520093, + "grad_norm": 3.0211644172668457, + "learning_rate": 1.8741674266091782e-06, 
+ "loss": 0.4018, + "step": 7409 + }, + { + "epoch": 3.50354609929078, + "grad_norm": 2.732234239578247, + "learning_rate": 1.8735634819062875e-06, + "loss": 0.313, + "step": 7410 + }, + { + "epoch": 3.504018912529551, + "grad_norm": 3.139596939086914, + "learning_rate": 1.8729595762088525e-06, + "loss": 0.4112, + "step": 7411 + }, + { + "epoch": 3.5044917257683217, + "grad_norm": 2.894230365753174, + "learning_rate": 1.8723557095544754e-06, + "loss": 0.3891, + "step": 7412 + }, + { + "epoch": 3.504964539007092, + "grad_norm": 2.850205659866333, + "learning_rate": 1.8717518819807547e-06, + "loss": 0.424, + "step": 7413 + }, + { + "epoch": 3.505437352245863, + "grad_norm": 3.047736644744873, + "learning_rate": 1.8711480935252907e-06, + "loss": 0.3757, + "step": 7414 + }, + { + "epoch": 3.5059101654846336, + "grad_norm": 3.0174455642700195, + "learning_rate": 1.8705443442256772e-06, + "loss": 0.3625, + "step": 7415 + }, + { + "epoch": 3.506382978723404, + "grad_norm": 2.840681552886963, + "learning_rate": 1.869940634119507e-06, + "loss": 0.3595, + "step": 7416 + }, + { + "epoch": 3.506855791962175, + "grad_norm": 3.067473888397217, + "learning_rate": 1.8693369632443713e-06, + "loss": 0.432, + "step": 7417 + }, + { + "epoch": 3.5073286052009456, + "grad_norm": 2.94655179977417, + "learning_rate": 1.8687333316378572e-06, + "loss": 0.4222, + "step": 7418 + }, + { + "epoch": 3.5078014184397164, + "grad_norm": 2.968548536300659, + "learning_rate": 1.868129739337551e-06, + "loss": 0.4098, + "step": 7419 + }, + { + "epoch": 3.508274231678487, + "grad_norm": 2.70094895362854, + "learning_rate": 1.867526186381034e-06, + "loss": 0.386, + "step": 7420 + }, + { + "epoch": 3.5087470449172575, + "grad_norm": 3.25897216796875, + "learning_rate": 1.8669226728058895e-06, + "loss": 0.4411, + "step": 7421 + }, + { + "epoch": 3.5092198581560283, + "grad_norm": 4.281215667724609, + "learning_rate": 1.866319198649694e-06, + "loss": 0.4011, + "step": 7422 + }, + { + "epoch": 3.509692671394799, + "grad_norm": 2.8394858837127686, + "learning_rate": 1.8657157639500223e-06, + "loss": 0.4162, + "step": 7423 + }, + { + "epoch": 3.5101654846335695, + "grad_norm": 2.732691764831543, + "learning_rate": 1.86511236874445e-06, + "loss": 0.3603, + "step": 7424 + }, + { + "epoch": 3.5106382978723403, + "grad_norm": 3.0152828693389893, + "learning_rate": 1.8645090130705463e-06, + "loss": 0.3811, + "step": 7425 + }, + { + "epoch": 3.511111111111111, + "grad_norm": 3.1762008666992188, + "learning_rate": 1.8639056969658793e-06, + "loss": 0.3985, + "step": 7426 + }, + { + "epoch": 3.511583924349882, + "grad_norm": 3.151123523712158, + "learning_rate": 1.863302420468016e-06, + "loss": 0.3582, + "step": 7427 + }, + { + "epoch": 3.5120567375886527, + "grad_norm": 2.738206386566162, + "learning_rate": 1.862699183614518e-06, + "loss": 0.3768, + "step": 7428 + }, + { + "epoch": 3.512529550827423, + "grad_norm": 3.235212564468384, + "learning_rate": 1.8620959864429487e-06, + "loss": 0.3964, + "step": 7429 + }, + { + "epoch": 3.513002364066194, + "grad_norm": 3.1113579273223877, + "learning_rate": 1.8614928289908648e-06, + "loss": 0.3979, + "step": 7430 + }, + { + "epoch": 3.5134751773049646, + "grad_norm": 2.6802520751953125, + "learning_rate": 1.860889711295822e-06, + "loss": 0.327, + "step": 7431 + }, + { + "epoch": 3.513947990543735, + "grad_norm": 2.9212403297424316, + "learning_rate": 1.860286633395375e-06, + "loss": 0.4104, + "step": 7432 + }, + { + "epoch": 3.5144208037825058, + "grad_norm": 2.868861198425293, + "learning_rate": 
1.8596835953270742e-06, + "loss": 0.383, + "step": 7433 + }, + { + "epoch": 3.5148936170212766, + "grad_norm": 2.831655740737915, + "learning_rate": 1.8590805971284686e-06, + "loss": 0.3615, + "step": 7434 + }, + { + "epoch": 3.5153664302600474, + "grad_norm": 3.1540114879608154, + "learning_rate": 1.8584776388371039e-06, + "loss": 0.3914, + "step": 7435 + }, + { + "epoch": 3.515839243498818, + "grad_norm": 3.22031307220459, + "learning_rate": 1.8578747204905223e-06, + "loss": 0.4358, + "step": 7436 + }, + { + "epoch": 3.5163120567375885, + "grad_norm": 3.2922887802124023, + "learning_rate": 1.8572718421262677e-06, + "loss": 0.3894, + "step": 7437 + }, + { + "epoch": 3.5167848699763593, + "grad_norm": 2.936475992202759, + "learning_rate": 1.856669003781876e-06, + "loss": 0.3748, + "step": 7438 + }, + { + "epoch": 3.51725768321513, + "grad_norm": 3.4542860984802246, + "learning_rate": 1.8560662054948856e-06, + "loss": 0.3362, + "step": 7439 + }, + { + "epoch": 3.5177304964539005, + "grad_norm": 3.1532278060913086, + "learning_rate": 1.8554634473028288e-06, + "loss": 0.411, + "step": 7440 + }, + { + "epoch": 3.5182033096926713, + "grad_norm": 3.1678943634033203, + "learning_rate": 1.854860729243237e-06, + "loss": 0.4357, + "step": 7441 + }, + { + "epoch": 3.518676122931442, + "grad_norm": 2.608930826187134, + "learning_rate": 1.8542580513536385e-06, + "loss": 0.3851, + "step": 7442 + }, + { + "epoch": 3.519148936170213, + "grad_norm": 3.127915143966675, + "learning_rate": 1.853655413671559e-06, + "loss": 0.4227, + "step": 7443 + }, + { + "epoch": 3.5196217494089836, + "grad_norm": 3.0593245029449463, + "learning_rate": 1.8530528162345238e-06, + "loss": 0.4315, + "step": 7444 + }, + { + "epoch": 3.520094562647754, + "grad_norm": 2.7818729877471924, + "learning_rate": 1.852450259080053e-06, + "loss": 0.4018, + "step": 7445 + }, + { + "epoch": 3.520567375886525, + "grad_norm": 3.2635445594787598, + "learning_rate": 1.8518477422456639e-06, + "loss": 0.415, + "step": 7446 + }, + { + "epoch": 3.5210401891252956, + "grad_norm": 2.5713813304901123, + "learning_rate": 1.851245265768875e-06, + "loss": 0.3309, + "step": 7447 + }, + { + "epoch": 3.521513002364066, + "grad_norm": 2.6778969764709473, + "learning_rate": 1.8506428296871982e-06, + "loss": 0.3106, + "step": 7448 + }, + { + "epoch": 3.5219858156028367, + "grad_norm": 2.901095390319824, + "learning_rate": 1.8500404340381455e-06, + "loss": 0.3729, + "step": 7449 + }, + { + "epoch": 3.5224586288416075, + "grad_norm": 3.1000046730041504, + "learning_rate": 1.849438078859225e-06, + "loss": 0.438, + "step": 7450 + }, + { + "epoch": 3.5229314420803783, + "grad_norm": 2.901890993118286, + "learning_rate": 1.8488357641879417e-06, + "loss": 0.3934, + "step": 7451 + }, + { + "epoch": 3.523404255319149, + "grad_norm": 3.2212157249450684, + "learning_rate": 1.8482334900618009e-06, + "loss": 0.4359, + "step": 7452 + }, + { + "epoch": 3.5238770685579195, + "grad_norm": 3.3780901432037354, + "learning_rate": 1.847631256518303e-06, + "loss": 0.4022, + "step": 7453 + }, + { + "epoch": 3.5243498817966903, + "grad_norm": 2.9996445178985596, + "learning_rate": 1.847029063594945e-06, + "loss": 0.3989, + "step": 7454 + }, + { + "epoch": 3.524822695035461, + "grad_norm": 2.8581080436706543, + "learning_rate": 1.8464269113292255e-06, + "loss": 0.3401, + "step": 7455 + }, + { + "epoch": 3.5252955082742314, + "grad_norm": 2.9551661014556885, + "learning_rate": 1.8458247997586354e-06, + "loss": 0.4556, + "step": 7456 + }, + { + "epoch": 3.5257683215130022, + "grad_norm": 
2.9672555923461914, + "learning_rate": 1.8452227289206672e-06, + "loss": 0.3575, + "step": 7457 + }, + { + "epoch": 3.526241134751773, + "grad_norm": 3.226273536682129, + "learning_rate": 1.8446206988528087e-06, + "loss": 0.3769, + "step": 7458 + }, + { + "epoch": 3.526713947990544, + "grad_norm": 2.994356155395508, + "learning_rate": 1.8440187095925443e-06, + "loss": 0.3653, + "step": 7459 + }, + { + "epoch": 3.5271867612293146, + "grad_norm": 2.489049196243286, + "learning_rate": 1.8434167611773595e-06, + "loss": 0.3454, + "step": 7460 + }, + { + "epoch": 3.527659574468085, + "grad_norm": 2.7897472381591797, + "learning_rate": 1.8428148536447333e-06, + "loss": 0.3526, + "step": 7461 + }, + { + "epoch": 3.5281323877068558, + "grad_norm": 2.947746992111206, + "learning_rate": 1.842212987032145e-06, + "loss": 0.3542, + "step": 7462 + }, + { + "epoch": 3.5286052009456266, + "grad_norm": 2.9303736686706543, + "learning_rate": 1.84161116137707e-06, + "loss": 0.3618, + "step": 7463 + }, + { + "epoch": 3.529078014184397, + "grad_norm": 2.81052827835083, + "learning_rate": 1.8410093767169807e-06, + "loss": 0.3833, + "step": 7464 + }, + { + "epoch": 3.5295508274231677, + "grad_norm": 3.4084126949310303, + "learning_rate": 1.840407633089348e-06, + "loss": 0.3868, + "step": 7465 + }, + { + "epoch": 3.5300236406619385, + "grad_norm": 2.8372802734375, + "learning_rate": 1.839805930531639e-06, + "loss": 0.3407, + "step": 7466 + }, + { + "epoch": 3.5304964539007093, + "grad_norm": 2.9218525886535645, + "learning_rate": 1.8392042690813205e-06, + "loss": 0.3772, + "step": 7467 + }, + { + "epoch": 3.53096926713948, + "grad_norm": 3.425274610519409, + "learning_rate": 1.8386026487758552e-06, + "loss": 0.3996, + "step": 7468 + }, + { + "epoch": 3.5314420803782505, + "grad_norm": 3.027423858642578, + "learning_rate": 1.8380010696527015e-06, + "loss": 0.3752, + "step": 7469 + }, + { + "epoch": 3.5319148936170213, + "grad_norm": 2.974896192550659, + "learning_rate": 1.8373995317493193e-06, + "loss": 0.3657, + "step": 7470 + }, + { + "epoch": 3.532387706855792, + "grad_norm": 2.837458610534668, + "learning_rate": 1.8367980351031628e-06, + "loss": 0.3949, + "step": 7471 + }, + { + "epoch": 3.5328605200945624, + "grad_norm": 2.8257288932800293, + "learning_rate": 1.8361965797516844e-06, + "loss": 0.3253, + "step": 7472 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 2.8278095722198486, + "learning_rate": 1.8355951657323351e-06, + "loss": 0.3588, + "step": 7473 + }, + { + "epoch": 3.533806146572104, + "grad_norm": 2.641160249710083, + "learning_rate": 1.8349937930825601e-06, + "loss": 0.3423, + "step": 7474 + }, + { + "epoch": 3.534278959810875, + "grad_norm": 2.6909263134002686, + "learning_rate": 1.8343924618398065e-06, + "loss": 0.3973, + "step": 7475 + }, + { + "epoch": 3.5347517730496456, + "grad_norm": 3.0727429389953613, + "learning_rate": 1.8337911720415157e-06, + "loss": 0.4207, + "step": 7476 + }, + { + "epoch": 3.535224586288416, + "grad_norm": 3.218925714492798, + "learning_rate": 1.8331899237251265e-06, + "loss": 0.3955, + "step": 7477 + }, + { + "epoch": 3.5356973995271868, + "grad_norm": 3.163914918899536, + "learning_rate": 1.832588716928078e-06, + "loss": 0.4655, + "step": 7478 + }, + { + "epoch": 3.5361702127659576, + "grad_norm": 2.8622686862945557, + "learning_rate": 1.831987551687803e-06, + "loss": 0.4084, + "step": 7479 + }, + { + "epoch": 3.536643026004728, + "grad_norm": 2.8534188270568848, + "learning_rate": 1.831386428041734e-06, + "loss": 0.4144, + "step": 7480 + }, + { + "epoch": 
3.5371158392434987, + "grad_norm": 2.8138554096221924, + "learning_rate": 1.8307853460273008e-06, + "loss": 0.3835, + "step": 7481 + }, + { + "epoch": 3.5375886524822695, + "grad_norm": 3.061960458755493, + "learning_rate": 1.830184305681929e-06, + "loss": 0.4128, + "step": 7482 + }, + { + "epoch": 3.5380614657210403, + "grad_norm": 2.8524835109710693, + "learning_rate": 1.8295833070430444e-06, + "loss": 0.3372, + "step": 7483 + }, + { + "epoch": 3.538534278959811, + "grad_norm": 3.2567028999328613, + "learning_rate": 1.8289823501480663e-06, + "loss": 0.4533, + "step": 7484 + }, + { + "epoch": 3.5390070921985815, + "grad_norm": 2.945634603500366, + "learning_rate": 1.8283814350344158e-06, + "loss": 0.3565, + "step": 7485 + }, + { + "epoch": 3.5394799054373522, + "grad_norm": 2.903287649154663, + "learning_rate": 1.8277805617395089e-06, + "loss": 0.349, + "step": 7486 + }, + { + "epoch": 3.539952718676123, + "grad_norm": 3.249272584915161, + "learning_rate": 1.827179730300757e-06, + "loss": 0.4076, + "step": 7487 + }, + { + "epoch": 3.5404255319148934, + "grad_norm": 2.9591739177703857, + "learning_rate": 1.8265789407555748e-06, + "loss": 0.3439, + "step": 7488 + }, + { + "epoch": 3.540898345153664, + "grad_norm": 3.8527538776397705, + "learning_rate": 1.8259781931413683e-06, + "loss": 0.4684, + "step": 7489 + }, + { + "epoch": 3.541371158392435, + "grad_norm": 2.7392261028289795, + "learning_rate": 1.8253774874955449e-06, + "loss": 0.3494, + "step": 7490 + }, + { + "epoch": 3.541843971631206, + "grad_norm": 2.880993127822876, + "learning_rate": 1.8247768238555069e-06, + "loss": 0.3546, + "step": 7491 + }, + { + "epoch": 3.5423167848699766, + "grad_norm": 2.9944894313812256, + "learning_rate": 1.8241762022586545e-06, + "loss": 0.3594, + "step": 7492 + }, + { + "epoch": 3.542789598108747, + "grad_norm": 3.0084292888641357, + "learning_rate": 1.8235756227423878e-06, + "loss": 0.408, + "step": 7493 + }, + { + "epoch": 3.5432624113475177, + "grad_norm": 2.75227689743042, + "learning_rate": 1.8229750853440998e-06, + "loss": 0.3515, + "step": 7494 + }, + { + "epoch": 3.5437352245862885, + "grad_norm": 3.041893243789673, + "learning_rate": 1.8223745901011856e-06, + "loss": 0.401, + "step": 7495 + }, + { + "epoch": 3.544208037825059, + "grad_norm": 2.8728370666503906, + "learning_rate": 1.8217741370510345e-06, + "loss": 0.3832, + "step": 7496 + }, + { + "epoch": 3.5446808510638297, + "grad_norm": 3.095460891723633, + "learning_rate": 1.8211737262310331e-06, + "loss": 0.3086, + "step": 7497 + }, + { + "epoch": 3.5451536643026005, + "grad_norm": 3.1869826316833496, + "learning_rate": 1.8205733576785678e-06, + "loss": 0.3666, + "step": 7498 + }, + { + "epoch": 3.5456264775413713, + "grad_norm": 3.307560443878174, + "learning_rate": 1.8199730314310204e-06, + "loss": 0.4489, + "step": 7499 + }, + { + "epoch": 3.546099290780142, + "grad_norm": 2.9531142711639404, + "learning_rate": 1.8193727475257697e-06, + "loss": 0.4017, + "step": 7500 + }, + { + "epoch": 3.5465721040189124, + "grad_norm": 3.2969162464141846, + "learning_rate": 1.8187725060001942e-06, + "loss": 0.4179, + "step": 7501 + }, + { + "epoch": 3.5470449172576832, + "grad_norm": 2.9434688091278076, + "learning_rate": 1.818172306891667e-06, + "loss": 0.3562, + "step": 7502 + }, + { + "epoch": 3.547517730496454, + "grad_norm": 3.070732355117798, + "learning_rate": 1.8175721502375616e-06, + "loss": 0.393, + "step": 7503 + }, + { + "epoch": 3.5479905437352244, + "grad_norm": 2.970898389816284, + "learning_rate": 1.8169720360752457e-06, + "loss": 
0.4448, + "step": 7504 + }, + { + "epoch": 3.548463356973995, + "grad_norm": 2.7050931453704834, + "learning_rate": 1.8163719644420858e-06, + "loss": 0.3562, + "step": 7505 + }, + { + "epoch": 3.548936170212766, + "grad_norm": 3.0073063373565674, + "learning_rate": 1.8157719353754467e-06, + "loss": 0.4429, + "step": 7506 + }, + { + "epoch": 3.5494089834515368, + "grad_norm": 3.0240445137023926, + "learning_rate": 1.8151719489126874e-06, + "loss": 0.4073, + "step": 7507 + }, + { + "epoch": 3.5498817966903076, + "grad_norm": 3.558763265609741, + "learning_rate": 1.8145720050911695e-06, + "loss": 0.4025, + "step": 7508 + }, + { + "epoch": 3.550354609929078, + "grad_norm": 3.637258768081665, + "learning_rate": 1.8139721039482473e-06, + "loss": 0.5074, + "step": 7509 + }, + { + "epoch": 3.5508274231678487, + "grad_norm": 2.804719924926758, + "learning_rate": 1.8133722455212726e-06, + "loss": 0.3727, + "step": 7510 + }, + { + "epoch": 3.5513002364066195, + "grad_norm": 2.9034759998321533, + "learning_rate": 1.8127724298475984e-06, + "loss": 0.3642, + "step": 7511 + }, + { + "epoch": 3.55177304964539, + "grad_norm": 3.1304872035980225, + "learning_rate": 1.8121726569645714e-06, + "loss": 0.434, + "step": 7512 + }, + { + "epoch": 3.5522458628841607, + "grad_norm": 3.019956111907959, + "learning_rate": 1.8115729269095378e-06, + "loss": 0.4235, + "step": 7513 + }, + { + "epoch": 3.5527186761229315, + "grad_norm": 2.7984633445739746, + "learning_rate": 1.810973239719839e-06, + "loss": 0.3344, + "step": 7514 + }, + { + "epoch": 3.5531914893617023, + "grad_norm": 2.839709997177124, + "learning_rate": 1.8103735954328145e-06, + "loss": 0.3708, + "step": 7515 + }, + { + "epoch": 3.553664302600473, + "grad_norm": 2.766819477081299, + "learning_rate": 1.809773994085803e-06, + "loss": 0.3402, + "step": 7516 + }, + { + "epoch": 3.5541371158392434, + "grad_norm": 2.707942247390747, + "learning_rate": 1.8091744357161372e-06, + "loss": 0.4327, + "step": 7517 + }, + { + "epoch": 3.554609929078014, + "grad_norm": 3.512702465057373, + "learning_rate": 1.8085749203611516e-06, + "loss": 0.3965, + "step": 7518 + }, + { + "epoch": 3.555082742316785, + "grad_norm": 2.717024803161621, + "learning_rate": 1.8079754480581738e-06, + "loss": 0.3237, + "step": 7519 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 2.659001350402832, + "learning_rate": 1.8073760188445296e-06, + "loss": 0.3546, + "step": 7520 + }, + { + "epoch": 3.556028368794326, + "grad_norm": 2.615028142929077, + "learning_rate": 1.8067766327575445e-06, + "loss": 0.3232, + "step": 7521 + }, + { + "epoch": 3.556501182033097, + "grad_norm": 2.659428119659424, + "learning_rate": 1.8061772898345386e-06, + "loss": 0.3769, + "step": 7522 + }, + { + "epoch": 3.5569739952718678, + "grad_norm": 3.142369270324707, + "learning_rate": 1.8055779901128296e-06, + "loss": 0.4292, + "step": 7523 + }, + { + "epoch": 3.5574468085106385, + "grad_norm": 3.0832736492156982, + "learning_rate": 1.8049787336297352e-06, + "loss": 0.3871, + "step": 7524 + }, + { + "epoch": 3.557919621749409, + "grad_norm": 2.778411865234375, + "learning_rate": 1.8043795204225664e-06, + "loss": 0.3938, + "step": 7525 + }, + { + "epoch": 3.5583924349881797, + "grad_norm": 3.1651480197906494, + "learning_rate": 1.8037803505286355e-06, + "loss": 0.3315, + "step": 7526 + }, + { + "epoch": 3.5588652482269505, + "grad_norm": 3.266508102416992, + "learning_rate": 1.8031812239852498e-06, + "loss": 0.4156, + "step": 7527 + }, + { + "epoch": 3.559338061465721, + "grad_norm": 3.1345436573028564, + 
"learning_rate": 1.8025821408297127e-06, + "loss": 0.3813, + "step": 7528 + }, + { + "epoch": 3.5598108747044916, + "grad_norm": 3.1535425186157227, + "learning_rate": 1.8019831010993289e-06, + "loss": 0.3897, + "step": 7529 + }, + { + "epoch": 3.5602836879432624, + "grad_norm": 3.0934345722198486, + "learning_rate": 1.8013841048313952e-06, + "loss": 0.4074, + "step": 7530 + }, + { + "epoch": 3.5607565011820332, + "grad_norm": 3.224876642227173, + "learning_rate": 1.8007851520632108e-06, + "loss": 0.3969, + "step": 7531 + }, + { + "epoch": 3.561229314420804, + "grad_norm": 3.082303285598755, + "learning_rate": 1.8001862428320693e-06, + "loss": 0.3559, + "step": 7532 + }, + { + "epoch": 3.5617021276595744, + "grad_norm": 3.5289969444274902, + "learning_rate": 1.7995873771752608e-06, + "loss": 0.3961, + "step": 7533 + }, + { + "epoch": 3.562174940898345, + "grad_norm": 3.1893370151519775, + "learning_rate": 1.7989885551300762e-06, + "loss": 0.3721, + "step": 7534 + }, + { + "epoch": 3.562647754137116, + "grad_norm": 2.6911089420318604, + "learning_rate": 1.7983897767337999e-06, + "loss": 0.3801, + "step": 7535 + }, + { + "epoch": 3.5631205673758863, + "grad_norm": 3.0837483406066895, + "learning_rate": 1.797791042023716e-06, + "loss": 0.3886, + "step": 7536 + }, + { + "epoch": 3.563593380614657, + "grad_norm": 2.973459005355835, + "learning_rate": 1.7971923510371054e-06, + "loss": 0.438, + "step": 7537 + }, + { + "epoch": 3.564066193853428, + "grad_norm": 3.1537392139434814, + "learning_rate": 1.7965937038112435e-06, + "loss": 0.4022, + "step": 7538 + }, + { + "epoch": 3.5645390070921987, + "grad_norm": 3.2339680194854736, + "learning_rate": 1.795995100383409e-06, + "loss": 0.3883, + "step": 7539 + }, + { + "epoch": 3.5650118203309695, + "grad_norm": 2.5029079914093018, + "learning_rate": 1.7953965407908714e-06, + "loss": 0.3522, + "step": 7540 + }, + { + "epoch": 3.56548463356974, + "grad_norm": 3.1560211181640625, + "learning_rate": 1.7947980250709027e-06, + "loss": 0.4024, + "step": 7541 + }, + { + "epoch": 3.5659574468085107, + "grad_norm": 2.950477361679077, + "learning_rate": 1.7941995532607687e-06, + "loss": 0.3598, + "step": 7542 + }, + { + "epoch": 3.5664302600472815, + "grad_norm": 3.1263279914855957, + "learning_rate": 1.793601125397733e-06, + "loss": 0.3535, + "step": 7543 + }, + { + "epoch": 3.566903073286052, + "grad_norm": 2.986631393432617, + "learning_rate": 1.7930027415190587e-06, + "loss": 0.4251, + "step": 7544 + }, + { + "epoch": 3.5673758865248226, + "grad_norm": 2.6882247924804688, + "learning_rate": 1.7924044016620022e-06, + "loss": 0.3584, + "step": 7545 + }, + { + "epoch": 3.5678486997635934, + "grad_norm": 2.9358696937561035, + "learning_rate": 1.791806105863822e-06, + "loss": 0.3671, + "step": 7546 + }, + { + "epoch": 3.568321513002364, + "grad_norm": 2.774198055267334, + "learning_rate": 1.7912078541617704e-06, + "loss": 0.3505, + "step": 7547 + }, + { + "epoch": 3.568794326241135, + "grad_norm": 2.7384231090545654, + "learning_rate": 1.7906096465930964e-06, + "loss": 0.3992, + "step": 7548 + }, + { + "epoch": 3.5692671394799054, + "grad_norm": 2.8625354766845703, + "learning_rate": 1.7900114831950506e-06, + "loss": 0.3858, + "step": 7549 + }, + { + "epoch": 3.569739952718676, + "grad_norm": 2.737884044647217, + "learning_rate": 1.7894133640048761e-06, + "loss": 0.3973, + "step": 7550 + }, + { + "epoch": 3.570212765957447, + "grad_norm": 2.9817614555358887, + "learning_rate": 1.7888152890598154e-06, + "loss": 0.3613, + "step": 7551 + }, + { + "epoch": 
3.5706855791962173, + "grad_norm": 2.760956287384033, + "learning_rate": 1.7882172583971081e-06, + "loss": 0.3645, + "step": 7552 + }, + { + "epoch": 3.571158392434988, + "grad_norm": 2.6867735385894775, + "learning_rate": 1.7876192720539908e-06, + "loss": 0.3771, + "step": 7553 + }, + { + "epoch": 3.571631205673759, + "grad_norm": 3.3362443447113037, + "learning_rate": 1.7870213300676986e-06, + "loss": 0.3989, + "step": 7554 + }, + { + "epoch": 3.5721040189125297, + "grad_norm": 2.8359227180480957, + "learning_rate": 1.7864234324754617e-06, + "loss": 0.3645, + "step": 7555 + }, + { + "epoch": 3.5725768321513005, + "grad_norm": 3.3070647716522217, + "learning_rate": 1.7858255793145076e-06, + "loss": 0.4128, + "step": 7556 + }, + { + "epoch": 3.573049645390071, + "grad_norm": 2.544879913330078, + "learning_rate": 1.7852277706220644e-06, + "loss": 0.3779, + "step": 7557 + }, + { + "epoch": 3.5735224586288417, + "grad_norm": 2.890796661376953, + "learning_rate": 1.7846300064353525e-06, + "loss": 0.373, + "step": 7558 + }, + { + "epoch": 3.5739952718676125, + "grad_norm": 2.9703400135040283, + "learning_rate": 1.7840322867915944e-06, + "loss": 0.3619, + "step": 7559 + }, + { + "epoch": 3.574468085106383, + "grad_norm": 3.0122430324554443, + "learning_rate": 1.7834346117280066e-06, + "loss": 0.4079, + "step": 7560 + }, + { + "epoch": 3.5749408983451536, + "grad_norm": 2.904963493347168, + "learning_rate": 1.7828369812818025e-06, + "loss": 0.38, + "step": 7561 + }, + { + "epoch": 3.5754137115839244, + "grad_norm": 3.0917439460754395, + "learning_rate": 1.7822393954901957e-06, + "loss": 0.383, + "step": 7562 + }, + { + "epoch": 3.575886524822695, + "grad_norm": 2.633920907974243, + "learning_rate": 1.7816418543903935e-06, + "loss": 0.3823, + "step": 7563 + }, + { + "epoch": 3.576359338061466, + "grad_norm": 2.9266390800476074, + "learning_rate": 1.781044358019604e-06, + "loss": 0.4642, + "step": 7564 + }, + { + "epoch": 3.5768321513002364, + "grad_norm": 2.878138780593872, + "learning_rate": 1.7804469064150299e-06, + "loss": 0.4056, + "step": 7565 + }, + { + "epoch": 3.577304964539007, + "grad_norm": 2.949370861053467, + "learning_rate": 1.7798494996138708e-06, + "loss": 0.3863, + "step": 7566 + }, + { + "epoch": 3.5777777777777775, + "grad_norm": 3.1444685459136963, + "learning_rate": 1.7792521376533264e-06, + "loss": 0.3611, + "step": 7567 + }, + { + "epoch": 3.5782505910165483, + "grad_norm": 3.0719716548919678, + "learning_rate": 1.7786548205705906e-06, + "loss": 0.3866, + "step": 7568 + }, + { + "epoch": 3.578723404255319, + "grad_norm": 3.155343770980835, + "learning_rate": 1.7780575484028566e-06, + "loss": 0.3896, + "step": 7569 + }, + { + "epoch": 3.57919621749409, + "grad_norm": 3.0447211265563965, + "learning_rate": 1.7774603211873138e-06, + "loss": 0.3713, + "step": 7570 + }, + { + "epoch": 3.5796690307328607, + "grad_norm": 2.8683619499206543, + "learning_rate": 1.7768631389611471e-06, + "loss": 0.3808, + "step": 7571 + }, + { + "epoch": 3.580141843971631, + "grad_norm": 3.1548070907592773, + "learning_rate": 1.776266001761543e-06, + "loss": 0.3714, + "step": 7572 + }, + { + "epoch": 3.580614657210402, + "grad_norm": 2.8699257373809814, + "learning_rate": 1.7756689096256816e-06, + "loss": 0.3694, + "step": 7573 + }, + { + "epoch": 3.5810874704491726, + "grad_norm": 2.834714412689209, + "learning_rate": 1.7750718625907398e-06, + "loss": 0.3935, + "step": 7574 + }, + { + "epoch": 3.581560283687943, + "grad_norm": 3.3828539848327637, + "learning_rate": 1.7744748606938957e-06, + "loss": 
0.4783, + "step": 7575 + }, + { + "epoch": 3.582033096926714, + "grad_norm": 3.3892476558685303, + "learning_rate": 1.7738779039723202e-06, + "loss": 0.41, + "step": 7576 + }, + { + "epoch": 3.5825059101654846, + "grad_norm": 3.014289379119873, + "learning_rate": 1.7732809924631842e-06, + "loss": 0.3516, + "step": 7577 + }, + { + "epoch": 3.5829787234042554, + "grad_norm": 3.477212429046631, + "learning_rate": 1.772684126203654e-06, + "loss": 0.4144, + "step": 7578 + }, + { + "epoch": 3.583451536643026, + "grad_norm": 2.9156792163848877, + "learning_rate": 1.772087305230893e-06, + "loss": 0.3772, + "step": 7579 + }, + { + "epoch": 3.5839243498817965, + "grad_norm": 2.639169931411743, + "learning_rate": 1.7714905295820651e-06, + "loss": 0.3487, + "step": 7580 + }, + { + "epoch": 3.5843971631205673, + "grad_norm": 3.196894407272339, + "learning_rate": 1.7708937992943263e-06, + "loss": 0.4852, + "step": 7581 + }, + { + "epoch": 3.584869976359338, + "grad_norm": 2.9140779972076416, + "learning_rate": 1.7702971144048347e-06, + "loss": 0.3703, + "step": 7582 + }, + { + "epoch": 3.5853427895981085, + "grad_norm": 3.3844895362854004, + "learning_rate": 1.7697004749507418e-06, + "loss": 0.4227, + "step": 7583 + }, + { + "epoch": 3.5858156028368793, + "grad_norm": 3.080061912536621, + "learning_rate": 1.769103880969198e-06, + "loss": 0.4237, + "step": 7584 + }, + { + "epoch": 3.58628841607565, + "grad_norm": 3.037505865097046, + "learning_rate": 1.7685073324973506e-06, + "loss": 0.3902, + "step": 7585 + }, + { + "epoch": 3.586761229314421, + "grad_norm": 3.6563873291015625, + "learning_rate": 1.7679108295723436e-06, + "loss": 0.3956, + "step": 7586 + }, + { + "epoch": 3.5872340425531917, + "grad_norm": 3.158935546875, + "learning_rate": 1.76731437223132e-06, + "loss": 0.3898, + "step": 7587 + }, + { + "epoch": 3.587706855791962, + "grad_norm": 3.059199571609497, + "learning_rate": 1.7667179605114176e-06, + "loss": 0.4183, + "step": 7588 + }, + { + "epoch": 3.588179669030733, + "grad_norm": 2.8123233318328857, + "learning_rate": 1.7661215944497716e-06, + "loss": 0.3731, + "step": 7589 + }, + { + "epoch": 3.5886524822695036, + "grad_norm": 3.094287633895874, + "learning_rate": 1.7655252740835169e-06, + "loss": 0.4562, + "step": 7590 + }, + { + "epoch": 3.589125295508274, + "grad_norm": 2.886833667755127, + "learning_rate": 1.7649289994497822e-06, + "loss": 0.4178, + "step": 7591 + }, + { + "epoch": 3.5895981087470448, + "grad_norm": 3.3040647506713867, + "learning_rate": 1.764332770585696e-06, + "loss": 0.4311, + "step": 7592 + }, + { + "epoch": 3.5900709219858156, + "grad_norm": 2.7948951721191406, + "learning_rate": 1.7637365875283827e-06, + "loss": 0.3704, + "step": 7593 + }, + { + "epoch": 3.5905437352245864, + "grad_norm": 3.092221975326538, + "learning_rate": 1.7631404503149623e-06, + "loss": 0.4166, + "step": 7594 + }, + { + "epoch": 3.591016548463357, + "grad_norm": 3.6018600463867188, + "learning_rate": 1.7625443589825564e-06, + "loss": 0.4251, + "step": 7595 + }, + { + "epoch": 3.5914893617021275, + "grad_norm": 2.708017110824585, + "learning_rate": 1.7619483135682791e-06, + "loss": 0.3775, + "step": 7596 + }, + { + "epoch": 3.5919621749408983, + "grad_norm": 2.8069381713867188, + "learning_rate": 1.7613523141092438e-06, + "loss": 0.3929, + "step": 7597 + }, + { + "epoch": 3.592434988179669, + "grad_norm": 3.097787380218506, + "learning_rate": 1.7607563606425616e-06, + "loss": 0.3992, + "step": 7598 + }, + { + "epoch": 3.5929078014184395, + "grad_norm": 2.9691715240478516, + "learning_rate": 
1.7601604532053385e-06, + "loss": 0.4001, + "step": 7599 + }, + { + "epoch": 3.5933806146572103, + "grad_norm": 2.5511624813079834, + "learning_rate": 1.7595645918346807e-06, + "loss": 0.3136, + "step": 7600 + }, + { + "epoch": 3.593853427895981, + "grad_norm": 2.4688427448272705, + "learning_rate": 1.7589687765676891e-06, + "loss": 0.3922, + "step": 7601 + }, + { + "epoch": 3.594326241134752, + "grad_norm": 3.004023790359497, + "learning_rate": 1.7583730074414613e-06, + "loss": 0.4203, + "step": 7602 + }, + { + "epoch": 3.5947990543735227, + "grad_norm": 2.902641773223877, + "learning_rate": 1.7577772844930957e-06, + "loss": 0.3855, + "step": 7603 + }, + { + "epoch": 3.595271867612293, + "grad_norm": 3.851375102996826, + "learning_rate": 1.7571816077596826e-06, + "loss": 0.3769, + "step": 7604 + }, + { + "epoch": 3.595744680851064, + "grad_norm": 3.03249192237854, + "learning_rate": 1.756585977278315e-06, + "loss": 0.3448, + "step": 7605 + }, + { + "epoch": 3.5962174940898346, + "grad_norm": 2.992363214492798, + "learning_rate": 1.7559903930860789e-06, + "loss": 0.3893, + "step": 7606 + }, + { + "epoch": 3.596690307328605, + "grad_norm": 2.9322855472564697, + "learning_rate": 1.7553948552200577e-06, + "loss": 0.4337, + "step": 7607 + }, + { + "epoch": 3.5971631205673757, + "grad_norm": 3.2564096450805664, + "learning_rate": 1.7547993637173347e-06, + "loss": 0.3943, + "step": 7608 + }, + { + "epoch": 3.5976359338061465, + "grad_norm": 2.9988484382629395, + "learning_rate": 1.7542039186149867e-06, + "loss": 0.3421, + "step": 7609 + }, + { + "epoch": 3.5981087470449173, + "grad_norm": 2.8188817501068115, + "learning_rate": 1.7536085199500914e-06, + "loss": 0.3657, + "step": 7610 + }, + { + "epoch": 3.598581560283688, + "grad_norm": 3.0583255290985107, + "learning_rate": 1.7530131677597206e-06, + "loss": 0.4036, + "step": 7611 + }, + { + "epoch": 3.5990543735224585, + "grad_norm": 2.8700921535491943, + "learning_rate": 1.7524178620809435e-06, + "loss": 0.3928, + "step": 7612 + }, + { + "epoch": 3.5995271867612293, + "grad_norm": 3.4497945308685303, + "learning_rate": 1.751822602950829e-06, + "loss": 0.3517, + "step": 7613 + }, + { + "epoch": 3.6, + "grad_norm": 3.334191083908081, + "learning_rate": 1.75122739040644e-06, + "loss": 0.3414, + "step": 7614 + }, + { + "epoch": 3.6004728132387704, + "grad_norm": 3.1435158252716064, + "learning_rate": 1.7506322244848387e-06, + "loss": 0.4075, + "step": 7615 + }, + { + "epoch": 3.6009456264775412, + "grad_norm": 3.178990125656128, + "learning_rate": 1.7500371052230824e-06, + "loss": 0.4688, + "step": 7616 + }, + { + "epoch": 3.601418439716312, + "grad_norm": 2.9292044639587402, + "learning_rate": 1.7494420326582267e-06, + "loss": 0.3882, + "step": 7617 + }, + { + "epoch": 3.601891252955083, + "grad_norm": 2.6899197101593018, + "learning_rate": 1.7488470068273256e-06, + "loss": 0.3916, + "step": 7618 + }, + { + "epoch": 3.6023640661938536, + "grad_norm": 2.8319191932678223, + "learning_rate": 1.7482520277674273e-06, + "loss": 0.3924, + "step": 7619 + }, + { + "epoch": 3.602836879432624, + "grad_norm": 2.74589204788208, + "learning_rate": 1.747657095515578e-06, + "loss": 0.2911, + "step": 7620 + }, + { + "epoch": 3.603309692671395, + "grad_norm": 2.857028007507324, + "learning_rate": 1.7470622101088233e-06, + "loss": 0.3618, + "step": 7621 + }, + { + "epoch": 3.6037825059101656, + "grad_norm": 3.3715617656707764, + "learning_rate": 1.746467371584203e-06, + "loss": 0.4186, + "step": 7622 + }, + { + "epoch": 3.604255319148936, + "grad_norm": 
2.839526414871216, + "learning_rate": 1.745872579978755e-06, + "loss": 0.4088, + "step": 7623 + }, + { + "epoch": 3.6047281323877067, + "grad_norm": 3.7689156532287598, + "learning_rate": 1.7452778353295155e-06, + "loss": 0.4748, + "step": 7624 + }, + { + "epoch": 3.6052009456264775, + "grad_norm": 2.9345123767852783, + "learning_rate": 1.7446831376735152e-06, + "loss": 0.4117, + "step": 7625 + }, + { + "epoch": 3.6056737588652483, + "grad_norm": 2.7898924350738525, + "learning_rate": 1.7440884870477845e-06, + "loss": 0.3515, + "step": 7626 + }, + { + "epoch": 3.606146572104019, + "grad_norm": 3.4268569946289062, + "learning_rate": 1.7434938834893481e-06, + "loss": 0.4051, + "step": 7627 + }, + { + "epoch": 3.6066193853427895, + "grad_norm": 3.019066095352173, + "learning_rate": 1.7428993270352311e-06, + "loss": 0.4128, + "step": 7628 + }, + { + "epoch": 3.6070921985815603, + "grad_norm": 3.1277568340301514, + "learning_rate": 1.742304817722454e-06, + "loss": 0.37, + "step": 7629 + }, + { + "epoch": 3.607565011820331, + "grad_norm": 2.924818277359009, + "learning_rate": 1.7417103555880318e-06, + "loss": 0.3792, + "step": 7630 + }, + { + "epoch": 3.6080378250591014, + "grad_norm": 2.664699077606201, + "learning_rate": 1.7411159406689821e-06, + "loss": 0.3584, + "step": 7631 + }, + { + "epoch": 3.608510638297872, + "grad_norm": 3.223729133605957, + "learning_rate": 1.7405215730023144e-06, + "loss": 0.3956, + "step": 7632 + }, + { + "epoch": 3.608983451536643, + "grad_norm": 2.934225559234619, + "learning_rate": 1.7399272526250388e-06, + "loss": 0.4179, + "step": 7633 + }, + { + "epoch": 3.609456264775414, + "grad_norm": 2.833798885345459, + "learning_rate": 1.7393329795741603e-06, + "loss": 0.3283, + "step": 7634 + }, + { + "epoch": 3.6099290780141846, + "grad_norm": 3.008798837661743, + "learning_rate": 1.738738753886681e-06, + "loss": 0.3704, + "step": 7635 + }, + { + "epoch": 3.610401891252955, + "grad_norm": 2.8714520931243896, + "learning_rate": 1.7381445755996023e-06, + "loss": 0.3646, + "step": 7636 + }, + { + "epoch": 3.6108747044917258, + "grad_norm": 3.083554267883301, + "learning_rate": 1.7375504447499193e-06, + "loss": 0.3785, + "step": 7637 + }, + { + "epoch": 3.6113475177304966, + "grad_norm": 3.270347833633423, + "learning_rate": 1.7369563613746277e-06, + "loss": 0.4426, + "step": 7638 + }, + { + "epoch": 3.611820330969267, + "grad_norm": 2.7754862308502197, + "learning_rate": 1.7363623255107175e-06, + "loss": 0.3448, + "step": 7639 + }, + { + "epoch": 3.6122931442080377, + "grad_norm": 2.98140025138855, + "learning_rate": 1.7357683371951767e-06, + "loss": 0.4027, + "step": 7640 + }, + { + "epoch": 3.6127659574468085, + "grad_norm": 3.1640074253082275, + "learning_rate": 1.7351743964649908e-06, + "loss": 0.3913, + "step": 7641 + }, + { + "epoch": 3.6132387706855793, + "grad_norm": 2.758202075958252, + "learning_rate": 1.7345805033571417e-06, + "loss": 0.4148, + "step": 7642 + }, + { + "epoch": 3.61371158392435, + "grad_norm": 3.1030571460723877, + "learning_rate": 1.7339866579086074e-06, + "loss": 0.4002, + "step": 7643 + }, + { + "epoch": 3.6141843971631205, + "grad_norm": 3.2414135932922363, + "learning_rate": 1.733392860156366e-06, + "loss": 0.4732, + "step": 7644 + }, + { + "epoch": 3.6146572104018913, + "grad_norm": 2.8720390796661377, + "learning_rate": 1.7327991101373886e-06, + "loss": 0.4112, + "step": 7645 + }, + { + "epoch": 3.615130023640662, + "grad_norm": 3.0104875564575195, + "learning_rate": 1.7322054078886474e-06, + "loss": 0.3934, + "step": 7646 + }, + { + 
"epoch": 3.6156028368794324, + "grad_norm": 2.8615126609802246, + "learning_rate": 1.7316117534471091e-06, + "loss": 0.3437, + "step": 7647 + }, + { + "epoch": 3.616075650118203, + "grad_norm": 2.8283586502075195, + "learning_rate": 1.7310181468497369e-06, + "loss": 0.374, + "step": 7648 + }, + { + "epoch": 3.616548463356974, + "grad_norm": 3.2289321422576904, + "learning_rate": 1.7304245881334935e-06, + "loss": 0.3899, + "step": 7649 + }, + { + "epoch": 3.617021276595745, + "grad_norm": 3.126882791519165, + "learning_rate": 1.7298310773353356e-06, + "loss": 0.388, + "step": 7650 + }, + { + "epoch": 3.6174940898345156, + "grad_norm": 3.013657569885254, + "learning_rate": 1.7292376144922201e-06, + "loss": 0.379, + "step": 7651 + }, + { + "epoch": 3.617966903073286, + "grad_norm": 3.070192337036133, + "learning_rate": 1.7286441996410989e-06, + "loss": 0.3801, + "step": 7652 + }, + { + "epoch": 3.6184397163120567, + "grad_norm": 2.805380344390869, + "learning_rate": 1.7280508328189199e-06, + "loss": 0.3577, + "step": 7653 + }, + { + "epoch": 3.6189125295508275, + "grad_norm": 3.2853379249572754, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.4168, + "step": 7654 + }, + { + "epoch": 3.619385342789598, + "grad_norm": 3.16316819190979, + "learning_rate": 1.7268642434091761e-06, + "loss": 0.425, + "step": 7655 + }, + { + "epoch": 3.6198581560283687, + "grad_norm": 3.2971179485321045, + "learning_rate": 1.7262710208954947e-06, + "loss": 0.3884, + "step": 7656 + }, + { + "epoch": 3.6203309692671395, + "grad_norm": 3.1823747158050537, + "learning_rate": 1.725677846558524e-06, + "loss": 0.3419, + "step": 7657 + }, + { + "epoch": 3.6208037825059103, + "grad_norm": 3.114654779434204, + "learning_rate": 1.7250847204351973e-06, + "loss": 0.3951, + "step": 7658 + }, + { + "epoch": 3.621276595744681, + "grad_norm": 3.0272440910339355, + "learning_rate": 1.7244916425624482e-06, + "loss": 0.4102, + "step": 7659 + }, + { + "epoch": 3.6217494089834514, + "grad_norm": 2.973611354827881, + "learning_rate": 1.7238986129772035e-06, + "loss": 0.3827, + "step": 7660 + }, + { + "epoch": 3.6222222222222222, + "grad_norm": 3.063713312149048, + "learning_rate": 1.7233056317163894e-06, + "loss": 0.3909, + "step": 7661 + }, + { + "epoch": 3.622695035460993, + "grad_norm": 3.203725576400757, + "learning_rate": 1.7227126988169283e-06, + "loss": 0.3933, + "step": 7662 + }, + { + "epoch": 3.6231678486997634, + "grad_norm": 2.945887327194214, + "learning_rate": 1.7221198143157386e-06, + "loss": 0.3722, + "step": 7663 + }, + { + "epoch": 3.623640661938534, + "grad_norm": 3.042691469192505, + "learning_rate": 1.7215269782497373e-06, + "loss": 0.4108, + "step": 7664 + }, + { + "epoch": 3.624113475177305, + "grad_norm": 2.8496763706207275, + "learning_rate": 1.720934190655837e-06, + "loss": 0.3867, + "step": 7665 + }, + { + "epoch": 3.6245862884160758, + "grad_norm": 2.7017154693603516, + "learning_rate": 1.7203414515709493e-06, + "loss": 0.3246, + "step": 7666 + }, + { + "epoch": 3.6250591016548466, + "grad_norm": 2.66630482673645, + "learning_rate": 1.7197487610319808e-06, + "loss": 0.365, + "step": 7667 + }, + { + "epoch": 3.625531914893617, + "grad_norm": 2.8724591732025146, + "learning_rate": 1.7191561190758348e-06, + "loss": 0.3361, + "step": 7668 + }, + { + "epoch": 3.6260047281323877, + "grad_norm": 3.1413803100585938, + "learning_rate": 1.7185635257394143e-06, + "loss": 0.3949, + "step": 7669 + }, + { + "epoch": 3.6264775413711585, + "grad_norm": 2.9866268634796143, + "learning_rate": 1.7179709810596163e-06, + 
"loss": 0.3728, + "step": 7670 + }, + { + "epoch": 3.626950354609929, + "grad_norm": 3.003497838973999, + "learning_rate": 1.717378485073336e-06, + "loss": 0.384, + "step": 7671 + }, + { + "epoch": 3.6274231678486997, + "grad_norm": 3.0043468475341797, + "learning_rate": 1.716786037817466e-06, + "loss": 0.3432, + "step": 7672 + }, + { + "epoch": 3.6278959810874705, + "grad_norm": 3.216550827026367, + "learning_rate": 1.7161936393288945e-06, + "loss": 0.3963, + "step": 7673 + }, + { + "epoch": 3.6283687943262413, + "grad_norm": 3.1091387271881104, + "learning_rate": 1.715601289644509e-06, + "loss": 0.4347, + "step": 7674 + }, + { + "epoch": 3.628841607565012, + "grad_norm": 3.2288286685943604, + "learning_rate": 1.7150089888011916e-06, + "loss": 0.4291, + "step": 7675 + }, + { + "epoch": 3.6293144208037824, + "grad_norm": 2.943941831588745, + "learning_rate": 1.7144167368358216e-06, + "loss": 0.3643, + "step": 7676 + }, + { + "epoch": 3.629787234042553, + "grad_norm": 2.819683313369751, + "learning_rate": 1.7138245337852774e-06, + "loss": 0.4051, + "step": 7677 + }, + { + "epoch": 3.630260047281324, + "grad_norm": 2.9988269805908203, + "learning_rate": 1.713232379686432e-06, + "loss": 0.4102, + "step": 7678 + }, + { + "epoch": 3.6307328605200944, + "grad_norm": 3.0041310787200928, + "learning_rate": 1.7126402745761566e-06, + "loss": 0.3854, + "step": 7679 + }, + { + "epoch": 3.631205673758865, + "grad_norm": 2.8700194358825684, + "learning_rate": 1.7120482184913192e-06, + "loss": 0.3441, + "step": 7680 + }, + { + "epoch": 3.631678486997636, + "grad_norm": 3.5275180339813232, + "learning_rate": 1.7114562114687833e-06, + "loss": 0.3808, + "step": 7681 + }, + { + "epoch": 3.6321513002364068, + "grad_norm": 3.182326078414917, + "learning_rate": 1.710864253545412e-06, + "loss": 0.4178, + "step": 7682 + }, + { + "epoch": 3.6326241134751776, + "grad_norm": 3.0514512062072754, + "learning_rate": 1.7102723447580627e-06, + "loss": 0.3527, + "step": 7683 + }, + { + "epoch": 3.633096926713948, + "grad_norm": 2.8293066024780273, + "learning_rate": 1.7096804851435922e-06, + "loss": 0.3723, + "step": 7684 + }, + { + "epoch": 3.6335697399527187, + "grad_norm": 2.9601097106933594, + "learning_rate": 1.709088674738853e-06, + "loss": 0.3704, + "step": 7685 + }, + { + "epoch": 3.6340425531914895, + "grad_norm": 2.8070995807647705, + "learning_rate": 1.7084969135806933e-06, + "loss": 0.346, + "step": 7686 + }, + { + "epoch": 3.63451536643026, + "grad_norm": 3.0162715911865234, + "learning_rate": 1.70790520170596e-06, + "loss": 0.39, + "step": 7687 + }, + { + "epoch": 3.6349881796690307, + "grad_norm": 3.018763780593872, + "learning_rate": 1.7073135391514967e-06, + "loss": 0.4621, + "step": 7688 + }, + { + "epoch": 3.6354609929078014, + "grad_norm": 2.963604688644409, + "learning_rate": 1.706721925954144e-06, + "loss": 0.339, + "step": 7689 + }, + { + "epoch": 3.6359338061465722, + "grad_norm": 2.8532896041870117, + "learning_rate": 1.7061303621507383e-06, + "loss": 0.3915, + "step": 7690 + }, + { + "epoch": 3.636406619385343, + "grad_norm": 3.248006820678711, + "learning_rate": 1.7055388477781133e-06, + "loss": 0.3712, + "step": 7691 + }, + { + "epoch": 3.6368794326241134, + "grad_norm": 3.2195777893066406, + "learning_rate": 1.7049473828731011e-06, + "loss": 0.4358, + "step": 7692 + }, + { + "epoch": 3.637352245862884, + "grad_norm": 2.7190768718719482, + "learning_rate": 1.7043559674725296e-06, + "loss": 0.341, + "step": 7693 + }, + { + "epoch": 3.637825059101655, + "grad_norm": 2.6047232151031494, + 
"learning_rate": 1.7037646016132223e-06, + "loss": 0.3513, + "step": 7694 + }, + { + "epoch": 3.6382978723404253, + "grad_norm": 3.0824201107025146, + "learning_rate": 1.7031732853320026e-06, + "loss": 0.4097, + "step": 7695 + }, + { + "epoch": 3.638770685579196, + "grad_norm": 2.845461130142212, + "learning_rate": 1.7025820186656883e-06, + "loss": 0.3395, + "step": 7696 + }, + { + "epoch": 3.639243498817967, + "grad_norm": 2.937863826751709, + "learning_rate": 1.7019908016510953e-06, + "loss": 0.395, + "step": 7697 + }, + { + "epoch": 3.6397163120567377, + "grad_norm": 3.349780559539795, + "learning_rate": 1.701399634325036e-06, + "loss": 0.3889, + "step": 7698 + }, + { + "epoch": 3.6401891252955085, + "grad_norm": 2.8527066707611084, + "learning_rate": 1.7008085167243187e-06, + "loss": 0.3753, + "step": 7699 + }, + { + "epoch": 3.640661938534279, + "grad_norm": 2.8112385272979736, + "learning_rate": 1.7002174488857517e-06, + "loss": 0.3912, + "step": 7700 + }, + { + "epoch": 3.6411347517730497, + "grad_norm": 2.731933832168579, + "learning_rate": 1.6996264308461363e-06, + "loss": 0.4142, + "step": 7701 + }, + { + "epoch": 3.6416075650118205, + "grad_norm": 3.70465350151062, + "learning_rate": 1.6990354626422744e-06, + "loss": 0.4089, + "step": 7702 + }, + { + "epoch": 3.642080378250591, + "grad_norm": 2.8656258583068848, + "learning_rate": 1.698444544310962e-06, + "loss": 0.3771, + "step": 7703 + }, + { + "epoch": 3.6425531914893616, + "grad_norm": 2.878830671310425, + "learning_rate": 1.697853675888993e-06, + "loss": 0.3754, + "step": 7704 + }, + { + "epoch": 3.6430260047281324, + "grad_norm": 3.440528154373169, + "learning_rate": 1.6972628574131586e-06, + "loss": 0.4543, + "step": 7705 + }, + { + "epoch": 3.6434988179669032, + "grad_norm": 2.70736026763916, + "learning_rate": 1.6966720889202451e-06, + "loss": 0.4049, + "step": 7706 + }, + { + "epoch": 3.643971631205674, + "grad_norm": 2.787992238998413, + "learning_rate": 1.6960813704470391e-06, + "loss": 0.3854, + "step": 7707 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 2.631490707397461, + "learning_rate": 1.6954907020303213e-06, + "loss": 0.3775, + "step": 7708 + }, + { + "epoch": 3.644917257683215, + "grad_norm": 3.052255392074585, + "learning_rate": 1.6949000837068685e-06, + "loss": 0.3873, + "step": 7709 + }, + { + "epoch": 3.645390070921986, + "grad_norm": 2.7443203926086426, + "learning_rate": 1.6943095155134586e-06, + "loss": 0.3362, + "step": 7710 + }, + { + "epoch": 3.6458628841607563, + "grad_norm": 2.931688070297241, + "learning_rate": 1.6937189974868618e-06, + "loss": 0.3839, + "step": 7711 + }, + { + "epoch": 3.646335697399527, + "grad_norm": 2.950242757797241, + "learning_rate": 1.6931285296638479e-06, + "loss": 0.3552, + "step": 7712 + }, + { + "epoch": 3.646808510638298, + "grad_norm": 2.940735340118408, + "learning_rate": 1.6925381120811823e-06, + "loss": 0.3881, + "step": 7713 + }, + { + "epoch": 3.6472813238770687, + "grad_norm": 2.771355390548706, + "learning_rate": 1.6919477447756273e-06, + "loss": 0.3578, + "step": 7714 + }, + { + "epoch": 3.6477541371158395, + "grad_norm": 2.919004201889038, + "learning_rate": 1.6913574277839435e-06, + "loss": 0.3971, + "step": 7715 + }, + { + "epoch": 3.64822695035461, + "grad_norm": 3.293705463409424, + "learning_rate": 1.6907671611428872e-06, + "loss": 0.422, + "step": 7716 + }, + { + "epoch": 3.6486997635933807, + "grad_norm": 2.744239091873169, + "learning_rate": 1.6901769448892103e-06, + "loss": 0.398, + "step": 7717 + }, + { + "epoch": 3.6491725768321515, + 
"grad_norm": 3.1726129055023193, + "learning_rate": 1.689586779059665e-06, + "loss": 0.39, + "step": 7718 + }, + { + "epoch": 3.649645390070922, + "grad_norm": 3.146743059158325, + "learning_rate": 1.688996663690997e-06, + "loss": 0.4059, + "step": 7719 + }, + { + "epoch": 3.6501182033096926, + "grad_norm": 2.941025495529175, + "learning_rate": 1.688406598819951e-06, + "loss": 0.3479, + "step": 7720 + }, + { + "epoch": 3.6505910165484634, + "grad_norm": 3.3480939865112305, + "learning_rate": 1.6878165844832679e-06, + "loss": 0.4141, + "step": 7721 + }, + { + "epoch": 3.651063829787234, + "grad_norm": 2.9145030975341797, + "learning_rate": 1.6872266207176833e-06, + "loss": 0.3497, + "step": 7722 + }, + { + "epoch": 3.651536643026005, + "grad_norm": 3.119502067565918, + "learning_rate": 1.686636707559934e-06, + "loss": 0.424, + "step": 7723 + }, + { + "epoch": 3.6520094562647754, + "grad_norm": 3.0867667198181152, + "learning_rate": 1.6860468450467497e-06, + "loss": 0.3998, + "step": 7724 + }, + { + "epoch": 3.652482269503546, + "grad_norm": 2.9128987789154053, + "learning_rate": 1.6854570332148602e-06, + "loss": 0.4043, + "step": 7725 + }, + { + "epoch": 3.652955082742317, + "grad_norm": 2.9973206520080566, + "learning_rate": 1.6848672721009896e-06, + "loss": 0.3395, + "step": 7726 + }, + { + "epoch": 3.6534278959810873, + "grad_norm": 2.824916124343872, + "learning_rate": 1.6842775617418591e-06, + "loss": 0.4102, + "step": 7727 + }, + { + "epoch": 3.653900709219858, + "grad_norm": 2.7984440326690674, + "learning_rate": 1.6836879021741887e-06, + "loss": 0.3823, + "step": 7728 + }, + { + "epoch": 3.654373522458629, + "grad_norm": 2.8412179946899414, + "learning_rate": 1.6830982934346917e-06, + "loss": 0.3755, + "step": 7729 + }, + { + "epoch": 3.6548463356973997, + "grad_norm": 3.1677138805389404, + "learning_rate": 1.6825087355600836e-06, + "loss": 0.4224, + "step": 7730 + }, + { + "epoch": 3.65531914893617, + "grad_norm": 3.097085475921631, + "learning_rate": 1.6819192285870718e-06, + "loss": 0.4103, + "step": 7731 + }, + { + "epoch": 3.655791962174941, + "grad_norm": 2.9802496433258057, + "learning_rate": 1.6813297725523613e-06, + "loss": 0.4297, + "step": 7732 + }, + { + "epoch": 3.6562647754137116, + "grad_norm": 3.0135059356689453, + "learning_rate": 1.680740367492657e-06, + "loss": 0.4526, + "step": 7733 + }, + { + "epoch": 3.656737588652482, + "grad_norm": 2.7776739597320557, + "learning_rate": 1.6801510134446575e-06, + "loss": 0.3924, + "step": 7734 + }, + { + "epoch": 3.657210401891253, + "grad_norm": 2.7500126361846924, + "learning_rate": 1.6795617104450595e-06, + "loss": 0.3785, + "step": 7735 + }, + { + "epoch": 3.6576832151300236, + "grad_norm": 3.494142770767212, + "learning_rate": 1.6789724585305566e-06, + "loss": 0.3483, + "step": 7736 + }, + { + "epoch": 3.6581560283687944, + "grad_norm": 3.055081605911255, + "learning_rate": 1.6783832577378377e-06, + "loss": 0.4481, + "step": 7737 + }, + { + "epoch": 3.658628841607565, + "grad_norm": 2.781412124633789, + "learning_rate": 1.6777941081035914e-06, + "loss": 0.3969, + "step": 7738 + }, + { + "epoch": 3.6591016548463355, + "grad_norm": 3.1672184467315674, + "learning_rate": 1.677205009664501e-06, + "loss": 0.3959, + "step": 7739 + }, + { + "epoch": 3.6595744680851063, + "grad_norm": 3.0597715377807617, + "learning_rate": 1.6766159624572458e-06, + "loss": 0.418, + "step": 7740 + }, + { + "epoch": 3.660047281323877, + "grad_norm": 3.2906267642974854, + "learning_rate": 1.676026966518505e-06, + "loss": 0.4335, + "step": 7741 + }, 
+ { + "epoch": 3.6605200945626475, + "grad_norm": 3.2519290447235107, + "learning_rate": 1.6754380218849515e-06, + "loss": 0.3786, + "step": 7742 + }, + { + "epoch": 3.6609929078014183, + "grad_norm": 3.24716854095459, + "learning_rate": 1.6748491285932572e-06, + "loss": 0.3599, + "step": 7743 + }, + { + "epoch": 3.661465721040189, + "grad_norm": 3.2940993309020996, + "learning_rate": 1.6742602866800897e-06, + "loss": 0.3934, + "step": 7744 + }, + { + "epoch": 3.66193853427896, + "grad_norm": 2.917409896850586, + "learning_rate": 1.6736714961821124e-06, + "loss": 0.4197, + "step": 7745 + }, + { + "epoch": 3.6624113475177307, + "grad_norm": 3.005068063735962, + "learning_rate": 1.6730827571359887e-06, + "loss": 0.4239, + "step": 7746 + }, + { + "epoch": 3.662884160756501, + "grad_norm": 2.751880168914795, + "learning_rate": 1.6724940695783745e-06, + "loss": 0.4257, + "step": 7747 + }, + { + "epoch": 3.663356973995272, + "grad_norm": 3.090670585632324, + "learning_rate": 1.6719054335459273e-06, + "loss": 0.3686, + "step": 7748 + }, + { + "epoch": 3.6638297872340426, + "grad_norm": 3.250251293182373, + "learning_rate": 1.6713168490752974e-06, + "loss": 0.4249, + "step": 7749 + }, + { + "epoch": 3.664302600472813, + "grad_norm": 2.8662827014923096, + "learning_rate": 1.6707283162031335e-06, + "loss": 0.3692, + "step": 7750 + }, + { + "epoch": 3.6647754137115838, + "grad_norm": 2.8709118366241455, + "learning_rate": 1.6701398349660813e-06, + "loss": 0.3929, + "step": 7751 + }, + { + "epoch": 3.6652482269503546, + "grad_norm": 2.992035388946533, + "learning_rate": 1.6695514054007822e-06, + "loss": 0.4131, + "step": 7752 + }, + { + "epoch": 3.6657210401891254, + "grad_norm": 3.0427589416503906, + "learning_rate": 1.668963027543876e-06, + "loss": 0.387, + "step": 7753 + }, + { + "epoch": 3.666193853427896, + "grad_norm": 3.0147807598114014, + "learning_rate": 1.6683747014319987e-06, + "loss": 0.3648, + "step": 7754 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 2.5483829975128174, + "learning_rate": 1.6677864271017811e-06, + "loss": 0.3643, + "step": 7755 + }, + { + "epoch": 3.6671394799054373, + "grad_norm": 2.7661986351013184, + "learning_rate": 1.6671982045898544e-06, + "loss": 0.3731, + "step": 7756 + }, + { + "epoch": 3.667612293144208, + "grad_norm": 2.778036117553711, + "learning_rate": 1.666610033932843e-06, + "loss": 0.3744, + "step": 7757 + }, + { + "epoch": 3.6680851063829785, + "grad_norm": 2.9028329849243164, + "learning_rate": 1.6660219151673712e-06, + "loss": 0.4286, + "step": 7758 + }, + { + "epoch": 3.6685579196217493, + "grad_norm": 2.826687812805176, + "learning_rate": 1.6654338483300575e-06, + "loss": 0.318, + "step": 7759 + }, + { + "epoch": 3.66903073286052, + "grad_norm": 2.7063660621643066, + "learning_rate": 1.6648458334575186e-06, + "loss": 0.3351, + "step": 7760 + }, + { + "epoch": 3.669503546099291, + "grad_norm": 2.708361864089966, + "learning_rate": 1.664257870586368e-06, + "loss": 0.376, + "step": 7761 + }, + { + "epoch": 3.6699763593380617, + "grad_norm": 3.1139161586761475, + "learning_rate": 1.6636699597532141e-06, + "loss": 0.3572, + "step": 7762 + }, + { + "epoch": 3.670449172576832, + "grad_norm": 3.0858285427093506, + "learning_rate": 1.6630821009946658e-06, + "loss": 0.4204, + "step": 7763 + }, + { + "epoch": 3.670921985815603, + "grad_norm": 3.5593984127044678, + "learning_rate": 1.6624942943473252e-06, + "loss": 0.463, + "step": 7764 + }, + { + "epoch": 3.6713947990543736, + "grad_norm": 2.863851308822632, + "learning_rate": 1.6619065398477921e-06, 
+ "loss": 0.4272, + "step": 7765 + }, + { + "epoch": 3.671867612293144, + "grad_norm": 2.833399772644043, + "learning_rate": 1.6613188375326638e-06, + "loss": 0.3509, + "step": 7766 + }, + { + "epoch": 3.6723404255319148, + "grad_norm": 2.988948345184326, + "learning_rate": 1.6607311874385346e-06, + "loss": 0.3572, + "step": 7767 + }, + { + "epoch": 3.6728132387706856, + "grad_norm": 2.7349398136138916, + "learning_rate": 1.6601435896019936e-06, + "loss": 0.3875, + "step": 7768 + }, + { + "epoch": 3.6732860520094563, + "grad_norm": 2.8544445037841797, + "learning_rate": 1.659556044059629e-06, + "loss": 0.4057, + "step": 7769 + }, + { + "epoch": 3.673758865248227, + "grad_norm": 3.0341904163360596, + "learning_rate": 1.6589685508480235e-06, + "loss": 0.3935, + "step": 7770 + }, + { + "epoch": 3.6742316784869975, + "grad_norm": 2.7495710849761963, + "learning_rate": 1.6583811100037595e-06, + "loss": 0.352, + "step": 7771 + }, + { + "epoch": 3.6747044917257683, + "grad_norm": 3.258525848388672, + "learning_rate": 1.6577937215634133e-06, + "loss": 0.4414, + "step": 7772 + }, + { + "epoch": 3.675177304964539, + "grad_norm": 3.0686328411102295, + "learning_rate": 1.657206385563558e-06, + "loss": 0.353, + "step": 7773 + }, + { + "epoch": 3.6756501182033094, + "grad_norm": 3.1168248653411865, + "learning_rate": 1.6566191020407668e-06, + "loss": 0.4064, + "step": 7774 + }, + { + "epoch": 3.6761229314420802, + "grad_norm": 2.7837352752685547, + "learning_rate": 1.6560318710316053e-06, + "loss": 0.3956, + "step": 7775 + }, + { + "epoch": 3.676595744680851, + "grad_norm": 3.1514039039611816, + "learning_rate": 1.6554446925726391e-06, + "loss": 0.4044, + "step": 7776 + }, + { + "epoch": 3.677068557919622, + "grad_norm": 3.010352611541748, + "learning_rate": 1.6548575667004285e-06, + "loss": 0.4162, + "step": 7777 + }, + { + "epoch": 3.6775413711583926, + "grad_norm": 3.1727633476257324, + "learning_rate": 1.6542704934515308e-06, + "loss": 0.411, + "step": 7778 + }, + { + "epoch": 3.678014184397163, + "grad_norm": 3.6771271228790283, + "learning_rate": 1.6536834728625018e-06, + "loss": 0.4562, + "step": 7779 + }, + { + "epoch": 3.678486997635934, + "grad_norm": 2.9793131351470947, + "learning_rate": 1.6530965049698908e-06, + "loss": 0.4039, + "step": 7780 + }, + { + "epoch": 3.6789598108747046, + "grad_norm": 3.193751096725464, + "learning_rate": 1.6525095898102478e-06, + "loss": 0.4064, + "step": 7781 + }, + { + "epoch": 3.679432624113475, + "grad_norm": 2.6643173694610596, + "learning_rate": 1.6519227274201169e-06, + "loss": 0.3731, + "step": 7782 + }, + { + "epoch": 3.6799054373522457, + "grad_norm": 3.4855685234069824, + "learning_rate": 1.6513359178360384e-06, + "loss": 0.3815, + "step": 7783 + }, + { + "epoch": 3.6803782505910165, + "grad_norm": 3.320537567138672, + "learning_rate": 1.6507491610945514e-06, + "loss": 0.4065, + "step": 7784 + }, + { + "epoch": 3.6808510638297873, + "grad_norm": 3.2793102264404297, + "learning_rate": 1.6501624572321895e-06, + "loss": 0.429, + "step": 7785 + }, + { + "epoch": 3.681323877068558, + "grad_norm": 2.8609631061553955, + "learning_rate": 1.6495758062854854e-06, + "loss": 0.3881, + "step": 7786 + }, + { + "epoch": 3.6817966903073285, + "grad_norm": 2.956533193588257, + "learning_rate": 1.6489892082909675e-06, + "loss": 0.3662, + "step": 7787 + }, + { + "epoch": 3.6822695035460993, + "grad_norm": 2.949092149734497, + "learning_rate": 1.6484026632851591e-06, + "loss": 0.4021, + "step": 7788 + }, + { + "epoch": 3.68274231678487, + "grad_norm": 2.802572250366211, 
+ "learning_rate": 1.6478161713045831e-06, + "loss": 0.3862, + "step": 7789 + }, + { + "epoch": 3.6832151300236404, + "grad_norm": 3.0064797401428223, + "learning_rate": 1.6472297323857578e-06, + "loss": 0.3495, + "step": 7790 + }, + { + "epoch": 3.6836879432624112, + "grad_norm": 2.74037766456604, + "learning_rate": 1.646643346565197e-06, + "loss": 0.362, + "step": 7791 + }, + { + "epoch": 3.684160756501182, + "grad_norm": 3.423326253890991, + "learning_rate": 1.646057013879414e-06, + "loss": 0.4383, + "step": 7792 + }, + { + "epoch": 3.684633569739953, + "grad_norm": 2.7426443099975586, + "learning_rate": 1.645470734364915e-06, + "loss": 0.3741, + "step": 7793 + }, + { + "epoch": 3.6851063829787236, + "grad_norm": 2.879296064376831, + "learning_rate": 1.6448845080582077e-06, + "loss": 0.4003, + "step": 7794 + }, + { + "epoch": 3.685579196217494, + "grad_norm": 2.6821188926696777, + "learning_rate": 1.6442983349957924e-06, + "loss": 0.3545, + "step": 7795 + }, + { + "epoch": 3.6860520094562648, + "grad_norm": 2.714059829711914, + "learning_rate": 1.6437122152141665e-06, + "loss": 0.3702, + "step": 7796 + }, + { + "epoch": 3.6865248226950356, + "grad_norm": 2.593811511993408, + "learning_rate": 1.6431261487498274e-06, + "loss": 0.3512, + "step": 7797 + }, + { + "epoch": 3.686997635933806, + "grad_norm": 2.687533378601074, + "learning_rate": 1.6425401356392652e-06, + "loss": 0.36, + "step": 7798 + }, + { + "epoch": 3.6874704491725767, + "grad_norm": 3.1675431728363037, + "learning_rate": 1.6419541759189694e-06, + "loss": 0.3349, + "step": 7799 + }, + { + "epoch": 3.6879432624113475, + "grad_norm": 2.777310371398926, + "learning_rate": 1.6413682696254246e-06, + "loss": 0.317, + "step": 7800 + }, + { + "epoch": 3.6884160756501183, + "grad_norm": 3.0121655464172363, + "learning_rate": 1.640782416795112e-06, + "loss": 0.3612, + "step": 7801 + }, + { + "epoch": 3.688888888888889, + "grad_norm": 3.0532145500183105, + "learning_rate": 1.6401966174645113e-06, + "loss": 0.4065, + "step": 7802 + }, + { + "epoch": 3.6893617021276595, + "grad_norm": 2.8221664428710938, + "learning_rate": 1.6396108716700961e-06, + "loss": 0.3669, + "step": 7803 + }, + { + "epoch": 3.6898345153664303, + "grad_norm": 2.966357707977295, + "learning_rate": 1.6390251794483405e-06, + "loss": 0.391, + "step": 7804 + }, + { + "epoch": 3.690307328605201, + "grad_norm": 3.460252046585083, + "learning_rate": 1.6384395408357118e-06, + "loss": 0.429, + "step": 7805 + }, + { + "epoch": 3.6907801418439714, + "grad_norm": 2.8907718658447266, + "learning_rate": 1.637853955868674e-06, + "loss": 0.3761, + "step": 7806 + }, + { + "epoch": 3.691252955082742, + "grad_norm": 3.114612102508545, + "learning_rate": 1.6372684245836912e-06, + "loss": 0.4376, + "step": 7807 + }, + { + "epoch": 3.691725768321513, + "grad_norm": 2.9361326694488525, + "learning_rate": 1.6366829470172191e-06, + "loss": 0.3672, + "step": 7808 + }, + { + "epoch": 3.692198581560284, + "grad_norm": 3.2719476222991943, + "learning_rate": 1.6360975232057156e-06, + "loss": 0.4266, + "step": 7809 + }, + { + "epoch": 3.6926713947990546, + "grad_norm": 2.873952865600586, + "learning_rate": 1.635512153185631e-06, + "loss": 0.4056, + "step": 7810 + }, + { + "epoch": 3.693144208037825, + "grad_norm": 3.0273401737213135, + "learning_rate": 1.634926836993413e-06, + "loss": 0.3947, + "step": 7811 + }, + { + "epoch": 3.6936170212765957, + "grad_norm": 2.868738889694214, + "learning_rate": 1.634341574665509e-06, + "loss": 0.3935, + "step": 7812 + }, + { + "epoch": 3.6940898345153665, + 
"grad_norm": 3.3080437183380127, + "learning_rate": 1.6337563662383591e-06, + "loss": 0.3606, + "step": 7813 + }, + { + "epoch": 3.694562647754137, + "grad_norm": 2.8339016437530518, + "learning_rate": 1.6331712117484014e-06, + "loss": 0.4019, + "step": 7814 + }, + { + "epoch": 3.6950354609929077, + "grad_norm": 2.666815996170044, + "learning_rate": 1.6325861112320717e-06, + "loss": 0.3502, + "step": 7815 + }, + { + "epoch": 3.6955082742316785, + "grad_norm": 2.7624311447143555, + "learning_rate": 1.6320010647258008e-06, + "loss": 0.3481, + "step": 7816 + }, + { + "epoch": 3.6959810874704493, + "grad_norm": 2.7796332836151123, + "learning_rate": 1.6314160722660183e-06, + "loss": 0.3735, + "step": 7817 + }, + { + "epoch": 3.69645390070922, + "grad_norm": 2.954318046569824, + "learning_rate": 1.6308311338891484e-06, + "loss": 0.3933, + "step": 7818 + }, + { + "epoch": 3.6969267139479904, + "grad_norm": 2.821072816848755, + "learning_rate": 1.6302462496316115e-06, + "loss": 0.3437, + "step": 7819 + }, + { + "epoch": 3.6973995271867612, + "grad_norm": 3.436192750930786, + "learning_rate": 1.629661419529828e-06, + "loss": 0.4469, + "step": 7820 + }, + { + "epoch": 3.697872340425532, + "grad_norm": 3.1361067295074463, + "learning_rate": 1.629076643620211e-06, + "loss": 0.3887, + "step": 7821 + }, + { + "epoch": 3.6983451536643024, + "grad_norm": 3.355024576187134, + "learning_rate": 1.6284919219391732e-06, + "loss": 0.424, + "step": 7822 + }, + { + "epoch": 3.698817966903073, + "grad_norm": 2.7671639919281006, + "learning_rate": 1.6279072545231212e-06, + "loss": 0.3765, + "step": 7823 + }, + { + "epoch": 3.699290780141844, + "grad_norm": 2.9509360790252686, + "learning_rate": 1.6273226414084606e-06, + "loss": 0.4057, + "step": 7824 + }, + { + "epoch": 3.699763593380615, + "grad_norm": 2.9852921962738037, + "learning_rate": 1.6267380826315932e-06, + "loss": 0.4238, + "step": 7825 + }, + { + "epoch": 3.7002364066193856, + "grad_norm": 2.826594114303589, + "learning_rate": 1.626153578228915e-06, + "loss": 0.3958, + "step": 7826 + }, + { + "epoch": 3.700709219858156, + "grad_norm": 2.9103410243988037, + "learning_rate": 1.6255691282368228e-06, + "loss": 0.394, + "step": 7827 + }, + { + "epoch": 3.7011820330969267, + "grad_norm": 3.362992525100708, + "learning_rate": 1.6249847326917068e-06, + "loss": 0.4233, + "step": 7828 + }, + { + "epoch": 3.7016548463356975, + "grad_norm": 2.711280107498169, + "learning_rate": 1.624400391629954e-06, + "loss": 0.2977, + "step": 7829 + }, + { + "epoch": 3.702127659574468, + "grad_norm": 2.8354649543762207, + "learning_rate": 1.6238161050879497e-06, + "loss": 0.3549, + "step": 7830 + }, + { + "epoch": 3.7026004728132387, + "grad_norm": 3.096376895904541, + "learning_rate": 1.6232318731020743e-06, + "loss": 0.3486, + "step": 7831 + }, + { + "epoch": 3.7030732860520095, + "grad_norm": 2.918267250061035, + "learning_rate": 1.6226476957087064e-06, + "loss": 0.3659, + "step": 7832 + }, + { + "epoch": 3.7035460992907803, + "grad_norm": 2.705399513244629, + "learning_rate": 1.6220635729442195e-06, + "loss": 0.4301, + "step": 7833 + }, + { + "epoch": 3.704018912529551, + "grad_norm": 2.9192235469818115, + "learning_rate": 1.621479504844983e-06, + "loss": 0.3384, + "step": 7834 + }, + { + "epoch": 3.7044917257683214, + "grad_norm": 2.78623104095459, + "learning_rate": 1.6208954914473669e-06, + "loss": 0.3528, + "step": 7835 + }, + { + "epoch": 3.704964539007092, + "grad_norm": 3.0218069553375244, + "learning_rate": 1.6203115327877333e-06, + "loss": 0.3698, + "step": 7836 + 
}, + { + "epoch": 3.705437352245863, + "grad_norm": 3.019101619720459, + "learning_rate": 1.6197276289024422e-06, + "loss": 0.4398, + "step": 7837 + }, + { + "epoch": 3.7059101654846334, + "grad_norm": 2.9220848083496094, + "learning_rate": 1.6191437798278531e-06, + "loss": 0.3803, + "step": 7838 + }, + { + "epoch": 3.706382978723404, + "grad_norm": 3.2731969356536865, + "learning_rate": 1.6185599856003181e-06, + "loss": 0.4529, + "step": 7839 + }, + { + "epoch": 3.706855791962175, + "grad_norm": 2.85239577293396, + "learning_rate": 1.617976246256188e-06, + "loss": 0.3801, + "step": 7840 + }, + { + "epoch": 3.7073286052009458, + "grad_norm": 2.8250765800476074, + "learning_rate": 1.6173925618318092e-06, + "loss": 0.3267, + "step": 7841 + }, + { + "epoch": 3.7078014184397166, + "grad_norm": 2.9152321815490723, + "learning_rate": 1.616808932363525e-06, + "loss": 0.428, + "step": 7842 + }, + { + "epoch": 3.708274231678487, + "grad_norm": 2.912656545639038, + "learning_rate": 1.6162253578876766e-06, + "loss": 0.3802, + "step": 7843 + }, + { + "epoch": 3.7087470449172577, + "grad_norm": 3.0700762271881104, + "learning_rate": 1.6156418384405992e-06, + "loss": 0.377, + "step": 7844 + }, + { + "epoch": 3.7092198581560285, + "grad_norm": 2.873141050338745, + "learning_rate": 1.6150583740586274e-06, + "loss": 0.399, + "step": 7845 + }, + { + "epoch": 3.709692671394799, + "grad_norm": 2.899555206298828, + "learning_rate": 1.6144749647780906e-06, + "loss": 0.402, + "step": 7846 + }, + { + "epoch": 3.7101654846335697, + "grad_norm": 3.257697343826294, + "learning_rate": 1.6138916106353139e-06, + "loss": 0.4193, + "step": 7847 + }, + { + "epoch": 3.7106382978723405, + "grad_norm": 2.6879804134368896, + "learning_rate": 1.613308311666622e-06, + "loss": 0.3474, + "step": 7848 + }, + { + "epoch": 3.7111111111111112, + "grad_norm": 2.712491273880005, + "learning_rate": 1.6127250679083323e-06, + "loss": 0.3315, + "step": 7849 + }, + { + "epoch": 3.711583924349882, + "grad_norm": 2.9762673377990723, + "learning_rate": 1.6121418793967631e-06, + "loss": 0.3953, + "step": 7850 + }, + { + "epoch": 3.7120567375886524, + "grad_norm": 2.743668556213379, + "learning_rate": 1.6115587461682258e-06, + "loss": 0.381, + "step": 7851 + }, + { + "epoch": 3.712529550827423, + "grad_norm": 3.0545318126678467, + "learning_rate": 1.6109756682590288e-06, + "loss": 0.412, + "step": 7852 + }, + { + "epoch": 3.713002364066194, + "grad_norm": 3.0125906467437744, + "learning_rate": 1.61039264570548e-06, + "loss": 0.3931, + "step": 7853 + }, + { + "epoch": 3.7134751773049643, + "grad_norm": 2.809302806854248, + "learning_rate": 1.6098096785438794e-06, + "loss": 0.3943, + "step": 7854 + }, + { + "epoch": 3.713947990543735, + "grad_norm": 3.092452049255371, + "learning_rate": 1.6092267668105276e-06, + "loss": 0.3932, + "step": 7855 + }, + { + "epoch": 3.714420803782506, + "grad_norm": 2.9878969192504883, + "learning_rate": 1.608643910541719e-06, + "loss": 0.4289, + "step": 7856 + }, + { + "epoch": 3.7148936170212767, + "grad_norm": 2.693387508392334, + "learning_rate": 1.6080611097737444e-06, + "loss": 0.373, + "step": 7857 + }, + { + "epoch": 3.7153664302600475, + "grad_norm": 3.4097673892974854, + "learning_rate": 1.6074783645428945e-06, + "loss": 0.4487, + "step": 7858 + }, + { + "epoch": 3.715839243498818, + "grad_norm": 3.1466784477233887, + "learning_rate": 1.6068956748854525e-06, + "loss": 0.3648, + "step": 7859 + }, + { + "epoch": 3.7163120567375887, + "grad_norm": 3.062107563018799, + "learning_rate": 1.6063130408377015e-06, + 
"loss": 0.3899, + "step": 7860 + }, + { + "epoch": 3.7167848699763595, + "grad_norm": 3.2298364639282227, + "learning_rate": 1.6057304624359188e-06, + "loss": 0.4243, + "step": 7861 + }, + { + "epoch": 3.71725768321513, + "grad_norm": 3.0285773277282715, + "learning_rate": 1.6051479397163784e-06, + "loss": 0.3469, + "step": 7862 + }, + { + "epoch": 3.7177304964539006, + "grad_norm": 2.8438515663146973, + "learning_rate": 1.6045654727153525e-06, + "loss": 0.3363, + "step": 7863 + }, + { + "epoch": 3.7182033096926714, + "grad_norm": 3.1558034420013428, + "learning_rate": 1.6039830614691081e-06, + "loss": 0.4326, + "step": 7864 + }, + { + "epoch": 3.7186761229314422, + "grad_norm": 2.438640594482422, + "learning_rate": 1.603400706013909e-06, + "loss": 0.3647, + "step": 7865 + }, + { + "epoch": 3.719148936170213, + "grad_norm": 3.0443127155303955, + "learning_rate": 1.6028184063860168e-06, + "loss": 0.4295, + "step": 7866 + }, + { + "epoch": 3.7196217494089834, + "grad_norm": 2.8009512424468994, + "learning_rate": 1.602236162621688e-06, + "loss": 0.4055, + "step": 7867 + }, + { + "epoch": 3.720094562647754, + "grad_norm": 3.227698802947998, + "learning_rate": 1.6016539747571775e-06, + "loss": 0.4681, + "step": 7868 + }, + { + "epoch": 3.720567375886525, + "grad_norm": 2.8242595195770264, + "learning_rate": 1.601071842828735e-06, + "loss": 0.3707, + "step": 7869 + }, + { + "epoch": 3.7210401891252953, + "grad_norm": 3.602937698364258, + "learning_rate": 1.6004897668726067e-06, + "loss": 0.5201, + "step": 7870 + }, + { + "epoch": 3.721513002364066, + "grad_norm": 3.578422784805298, + "learning_rate": 1.599907746925037e-06, + "loss": 0.4514, + "step": 7871 + }, + { + "epoch": 3.721985815602837, + "grad_norm": 2.7365758419036865, + "learning_rate": 1.5993257830222635e-06, + "loss": 0.356, + "step": 7872 + }, + { + "epoch": 3.7224586288416077, + "grad_norm": 3.125636577606201, + "learning_rate": 1.5987438752005258e-06, + "loss": 0.4277, + "step": 7873 + }, + { + "epoch": 3.7229314420803785, + "grad_norm": 2.7157294750213623, + "learning_rate": 1.5981620234960549e-06, + "loss": 0.363, + "step": 7874 + }, + { + "epoch": 3.723404255319149, + "grad_norm": 2.90950083732605, + "learning_rate": 1.5975802279450793e-06, + "loss": 0.4027, + "step": 7875 + }, + { + "epoch": 3.7238770685579197, + "grad_norm": 2.659787178039551, + "learning_rate": 1.596998488583827e-06, + "loss": 0.3632, + "step": 7876 + }, + { + "epoch": 3.7243498817966905, + "grad_norm": 3.221623182296753, + "learning_rate": 1.5964168054485185e-06, + "loss": 0.4295, + "step": 7877 + }, + { + "epoch": 3.724822695035461, + "grad_norm": 2.6838672161102295, + "learning_rate": 1.595835178575374e-06, + "loss": 0.3413, + "step": 7878 + }, + { + "epoch": 3.7252955082742316, + "grad_norm": 2.804706335067749, + "learning_rate": 1.5952536080006084e-06, + "loss": 0.3801, + "step": 7879 + }, + { + "epoch": 3.7257683215130024, + "grad_norm": 2.7647509574890137, + "learning_rate": 1.5946720937604326e-06, + "loss": 0.3941, + "step": 7880 + }, + { + "epoch": 3.726241134751773, + "grad_norm": 2.8363754749298096, + "learning_rate": 1.5940906358910566e-06, + "loss": 0.3772, + "step": 7881 + }, + { + "epoch": 3.726713947990544, + "grad_norm": 3.4147698879241943, + "learning_rate": 1.5935092344286835e-06, + "loss": 0.399, + "step": 7882 + }, + { + "epoch": 3.7271867612293144, + "grad_norm": 2.984090805053711, + "learning_rate": 1.5929278894095162e-06, + "loss": 0.3373, + "step": 7883 + }, + { + "epoch": 3.727659574468085, + "grad_norm": 3.250173330307007, + 
"learning_rate": 1.5923466008697521e-06, + "loss": 0.3932, + "step": 7884 + }, + { + "epoch": 3.728132387706856, + "grad_norm": 3.2699649333953857, + "learning_rate": 1.5917653688455848e-06, + "loss": 0.4529, + "step": 7885 + }, + { + "epoch": 3.7286052009456263, + "grad_norm": 3.175934076309204, + "learning_rate": 1.591184193373206e-06, + "loss": 0.3726, + "step": 7886 + }, + { + "epoch": 3.729078014184397, + "grad_norm": 2.8128812313079834, + "learning_rate": 1.5906030744888024e-06, + "loss": 0.3648, + "step": 7887 + }, + { + "epoch": 3.729550827423168, + "grad_norm": 3.025012493133545, + "learning_rate": 1.5900220122285564e-06, + "loss": 0.3945, + "step": 7888 + }, + { + "epoch": 3.7300236406619387, + "grad_norm": 3.237680435180664, + "learning_rate": 1.5894410066286512e-06, + "loss": 0.3815, + "step": 7889 + }, + { + "epoch": 3.7304964539007095, + "grad_norm": 3.458033323287964, + "learning_rate": 1.5888600577252605e-06, + "loss": 0.4104, + "step": 7890 + }, + { + "epoch": 3.73096926713948, + "grad_norm": 2.718867540359497, + "learning_rate": 1.58827916555456e-06, + "loss": 0.3243, + "step": 7891 + }, + { + "epoch": 3.7314420803782506, + "grad_norm": 3.047157049179077, + "learning_rate": 1.5876983301527176e-06, + "loss": 0.3689, + "step": 7892 + }, + { + "epoch": 3.731914893617021, + "grad_norm": 3.2904715538024902, + "learning_rate": 1.5871175515558995e-06, + "loss": 0.4045, + "step": 7893 + }, + { + "epoch": 3.732387706855792, + "grad_norm": 2.956467866897583, + "learning_rate": 1.5865368298002692e-06, + "loss": 0.3806, + "step": 7894 + }, + { + "epoch": 3.7328605200945626, + "grad_norm": 3.3309173583984375, + "learning_rate": 1.5859561649219843e-06, + "loss": 0.4011, + "step": 7895 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 2.7853524684906006, + "learning_rate": 1.5853755569572018e-06, + "loss": 0.3239, + "step": 7896 + }, + { + "epoch": 3.733806146572104, + "grad_norm": 2.9832780361175537, + "learning_rate": 1.584795005942073e-06, + "loss": 0.4582, + "step": 7897 + }, + { + "epoch": 3.7342789598108745, + "grad_norm": 3.2866461277008057, + "learning_rate": 1.584214511912745e-06, + "loss": 0.3876, + "step": 7898 + }, + { + "epoch": 3.7347517730496453, + "grad_norm": 3.018526792526245, + "learning_rate": 1.5836340749053646e-06, + "loss": 0.3221, + "step": 7899 + }, + { + "epoch": 3.735224586288416, + "grad_norm": 2.9109885692596436, + "learning_rate": 1.583053694956072e-06, + "loss": 0.4225, + "step": 7900 + }, + { + "epoch": 3.7356973995271865, + "grad_norm": 3.104146718978882, + "learning_rate": 1.5824733721010051e-06, + "loss": 0.3843, + "step": 7901 + }, + { + "epoch": 3.7361702127659573, + "grad_norm": 3.0982813835144043, + "learning_rate": 1.5818931063762989e-06, + "loss": 0.4223, + "step": 7902 + }, + { + "epoch": 3.736643026004728, + "grad_norm": 2.7797579765319824, + "learning_rate": 1.5813128978180819e-06, + "loss": 0.3536, + "step": 7903 + }, + { + "epoch": 3.737115839243499, + "grad_norm": 2.870884656906128, + "learning_rate": 1.5807327464624835e-06, + "loss": 0.3053, + "step": 7904 + }, + { + "epoch": 3.7375886524822697, + "grad_norm": 2.896674633026123, + "learning_rate": 1.5801526523456251e-06, + "loss": 0.3806, + "step": 7905 + }, + { + "epoch": 3.73806146572104, + "grad_norm": 3.009662389755249, + "learning_rate": 1.5795726155036284e-06, + "loss": 0.3568, + "step": 7906 + }, + { + "epoch": 3.738534278959811, + "grad_norm": 2.6860599517822266, + "learning_rate": 1.578992635972609e-06, + "loss": 0.4392, + "step": 7907 + }, + { + "epoch": 3.7390070921985816, 
+ "grad_norm": 2.9046099185943604, + "learning_rate": 1.578412713788679e-06, + "loss": 0.3756, + "step": 7908 + }, + { + "epoch": 3.739479905437352, + "grad_norm": 2.8035101890563965, + "learning_rate": 1.5778328489879488e-06, + "loss": 0.3576, + "step": 7909 + }, + { + "epoch": 3.739952718676123, + "grad_norm": 2.767514228820801, + "learning_rate": 1.5772530416065238e-06, + "loss": 0.4037, + "step": 7910 + }, + { + "epoch": 3.7404255319148936, + "grad_norm": 3.0867795944213867, + "learning_rate": 1.576673291680505e-06, + "loss": 0.4394, + "step": 7911 + }, + { + "epoch": 3.7408983451536644, + "grad_norm": 3.295976161956787, + "learning_rate": 1.5760935992459926e-06, + "loss": 0.3938, + "step": 7912 + }, + { + "epoch": 3.741371158392435, + "grad_norm": 2.725949287414551, + "learning_rate": 1.5755139643390794e-06, + "loss": 0.3633, + "step": 7913 + }, + { + "epoch": 3.7418439716312055, + "grad_norm": 3.0864083766937256, + "learning_rate": 1.5749343869958585e-06, + "loss": 0.3034, + "step": 7914 + }, + { + "epoch": 3.7423167848699763, + "grad_norm": 3.707273244857788, + "learning_rate": 1.5743548672524175e-06, + "loss": 0.4206, + "step": 7915 + }, + { + "epoch": 3.742789598108747, + "grad_norm": 2.9829516410827637, + "learning_rate": 1.573775405144839e-06, + "loss": 0.333, + "step": 7916 + }, + { + "epoch": 3.7432624113475175, + "grad_norm": 3.3303117752075195, + "learning_rate": 1.5731960007092056e-06, + "loss": 0.4558, + "step": 7917 + }, + { + "epoch": 3.7437352245862883, + "grad_norm": 2.63291335105896, + "learning_rate": 1.5726166539815925e-06, + "loss": 0.39, + "step": 7918 + }, + { + "epoch": 3.744208037825059, + "grad_norm": 3.0533673763275146, + "learning_rate": 1.572037364998075e-06, + "loss": 0.3586, + "step": 7919 + }, + { + "epoch": 3.74468085106383, + "grad_norm": 2.9185104370117188, + "learning_rate": 1.5714581337947216e-06, + "loss": 0.3809, + "step": 7920 + }, + { + "epoch": 3.7451536643026007, + "grad_norm": 3.1863298416137695, + "learning_rate": 1.5708789604075975e-06, + "loss": 0.4132, + "step": 7921 + }, + { + "epoch": 3.745626477541371, + "grad_norm": 3.2700514793395996, + "learning_rate": 1.5702998448727674e-06, + "loss": 0.4601, + "step": 7922 + }, + { + "epoch": 3.746099290780142, + "grad_norm": 3.4729206562042236, + "learning_rate": 1.5697207872262886e-06, + "loss": 0.4585, + "step": 7923 + }, + { + "epoch": 3.7465721040189126, + "grad_norm": 3.1432926654815674, + "learning_rate": 1.5691417875042182e-06, + "loss": 0.3128, + "step": 7924 + }, + { + "epoch": 3.747044917257683, + "grad_norm": 3.096121072769165, + "learning_rate": 1.5685628457426066e-06, + "loss": 0.3903, + "step": 7925 + }, + { + "epoch": 3.7475177304964538, + "grad_norm": 2.6897027492523193, + "learning_rate": 1.5679839619775023e-06, + "loss": 0.3707, + "step": 7926 + }, + { + "epoch": 3.7479905437352246, + "grad_norm": 2.8020687103271484, + "learning_rate": 1.5674051362449503e-06, + "loss": 0.3986, + "step": 7927 + }, + { + "epoch": 3.7484633569739954, + "grad_norm": 3.1278326511383057, + "learning_rate": 1.56682636858099e-06, + "loss": 0.3899, + "step": 7928 + }, + { + "epoch": 3.748936170212766, + "grad_norm": 2.907982587814331, + "learning_rate": 1.5662476590216613e-06, + "loss": 0.3422, + "step": 7929 + }, + { + "epoch": 3.7494089834515365, + "grad_norm": 3.1246347427368164, + "learning_rate": 1.5656690076029962e-06, + "loss": 0.452, + "step": 7930 + }, + { + "epoch": 3.7498817966903073, + "grad_norm": 2.9161367416381836, + "learning_rate": 1.565090414361024e-06, + "loss": 0.3284, + "step": 7931 
+ }, + { + "epoch": 3.750354609929078, + "grad_norm": 2.943183422088623, + "learning_rate": 1.564511879331773e-06, + "loss": 0.3478, + "step": 7932 + }, + { + "epoch": 3.7508274231678485, + "grad_norm": 3.2308566570281982, + "learning_rate": 1.563933402551266e-06, + "loss": 0.4143, + "step": 7933 + }, + { + "epoch": 3.7513002364066192, + "grad_norm": 2.6846251487731934, + "learning_rate": 1.5633549840555206e-06, + "loss": 0.3681, + "step": 7934 + }, + { + "epoch": 3.75177304964539, + "grad_norm": 3.0995283126831055, + "learning_rate": 1.562776623880554e-06, + "loss": 0.4642, + "step": 7935 + }, + { + "epoch": 3.752245862884161, + "grad_norm": 2.7406163215637207, + "learning_rate": 1.562198322062376e-06, + "loss": 0.3823, + "step": 7936 + }, + { + "epoch": 3.7527186761229316, + "grad_norm": 2.85732364654541, + "learning_rate": 1.5616200786369978e-06, + "loss": 0.3053, + "step": 7937 + }, + { + "epoch": 3.753191489361702, + "grad_norm": 2.812526226043701, + "learning_rate": 1.5610418936404223e-06, + "loss": 0.3944, + "step": 7938 + }, + { + "epoch": 3.753664302600473, + "grad_norm": 2.8886849880218506, + "learning_rate": 1.5604637671086499e-06, + "loss": 0.3936, + "step": 7939 + }, + { + "epoch": 3.7541371158392436, + "grad_norm": 2.831774950027466, + "learning_rate": 1.5598856990776801e-06, + "loss": 0.3146, + "step": 7940 + }, + { + "epoch": 3.754609929078014, + "grad_norm": 2.8853790760040283, + "learning_rate": 1.5593076895835052e-06, + "loss": 0.3286, + "step": 7941 + }, + { + "epoch": 3.7550827423167847, + "grad_norm": 3.2724483013153076, + "learning_rate": 1.5587297386621158e-06, + "loss": 0.3396, + "step": 7942 + }, + { + "epoch": 3.7555555555555555, + "grad_norm": 3.5077168941497803, + "learning_rate": 1.5581518463494983e-06, + "loss": 0.4528, + "step": 7943 + }, + { + "epoch": 3.7560283687943263, + "grad_norm": 3.031503915786743, + "learning_rate": 1.5575740126816346e-06, + "loss": 0.3803, + "step": 7944 + }, + { + "epoch": 3.756501182033097, + "grad_norm": 3.0939114093780518, + "learning_rate": 1.556996237694506e-06, + "loss": 0.3931, + "step": 7945 + }, + { + "epoch": 3.7569739952718675, + "grad_norm": 2.9404146671295166, + "learning_rate": 1.556418521424085e-06, + "loss": 0.3608, + "step": 7946 + }, + { + "epoch": 3.7574468085106383, + "grad_norm": 3.4363012313842773, + "learning_rate": 1.5558408639063465e-06, + "loss": 0.4335, + "step": 7947 + }, + { + "epoch": 3.757919621749409, + "grad_norm": 3.2819864749908447, + "learning_rate": 1.5552632651772575e-06, + "loss": 0.4147, + "step": 7948 + }, + { + "epoch": 3.7583924349881794, + "grad_norm": 2.917788505554199, + "learning_rate": 1.554685725272782e-06, + "loss": 0.3516, + "step": 7949 + }, + { + "epoch": 3.7588652482269502, + "grad_norm": 2.8425943851470947, + "learning_rate": 1.5541082442288818e-06, + "loss": 0.3596, + "step": 7950 + }, + { + "epoch": 3.759338061465721, + "grad_norm": 3.087005376815796, + "learning_rate": 1.5535308220815126e-06, + "loss": 0.3968, + "step": 7951 + }, + { + "epoch": 3.759810874704492, + "grad_norm": 2.743110179901123, + "learning_rate": 1.5529534588666298e-06, + "loss": 0.3802, + "step": 7952 + }, + { + "epoch": 3.7602836879432626, + "grad_norm": 2.914424180984497, + "learning_rate": 1.5523761546201825e-06, + "loss": 0.4055, + "step": 7953 + }, + { + "epoch": 3.760756501182033, + "grad_norm": 2.9691991806030273, + "learning_rate": 1.551798909378116e-06, + "loss": 0.3384, + "step": 7954 + }, + { + "epoch": 3.7612293144208038, + "grad_norm": 2.433657646179199, + "learning_rate": 
1.5512217231763747e-06, + "loss": 0.3019, + "step": 7955 + }, + { + "epoch": 3.7617021276595746, + "grad_norm": 2.7904880046844482, + "learning_rate": 1.5506445960508957e-06, + "loss": 0.389, + "step": 7956 + }, + { + "epoch": 3.762174940898345, + "grad_norm": 2.9241607189178467, + "learning_rate": 1.5500675280376154e-06, + "loss": 0.4291, + "step": 7957 + }, + { + "epoch": 3.7626477541371157, + "grad_norm": 3.216491222381592, + "learning_rate": 1.549490519172465e-06, + "loss": 0.4065, + "step": 7958 + }, + { + "epoch": 3.7631205673758865, + "grad_norm": 2.8859689235687256, + "learning_rate": 1.548913569491371e-06, + "loss": 0.353, + "step": 7959 + }, + { + "epoch": 3.7635933806146573, + "grad_norm": 2.958773136138916, + "learning_rate": 1.5483366790302594e-06, + "loss": 0.3829, + "step": 7960 + }, + { + "epoch": 3.764066193853428, + "grad_norm": 2.868649482727051, + "learning_rate": 1.5477598478250505e-06, + "loss": 0.3591, + "step": 7961 + }, + { + "epoch": 3.7645390070921985, + "grad_norm": 2.6912996768951416, + "learning_rate": 1.5471830759116591e-06, + "loss": 0.3695, + "step": 7962 + }, + { + "epoch": 3.7650118203309693, + "grad_norm": 3.3318257331848145, + "learning_rate": 1.5466063633260004e-06, + "loss": 0.4126, + "step": 7963 + }, + { + "epoch": 3.76548463356974, + "grad_norm": 2.865525007247925, + "learning_rate": 1.5460297101039825e-06, + "loss": 0.4235, + "step": 7964 + }, + { + "epoch": 3.7659574468085104, + "grad_norm": 2.8639180660247803, + "learning_rate": 1.5454531162815123e-06, + "loss": 0.4392, + "step": 7965 + }, + { + "epoch": 3.766430260047281, + "grad_norm": 2.5752499103546143, + "learning_rate": 1.5448765818944902e-06, + "loss": 0.4113, + "step": 7966 + }, + { + "epoch": 3.766903073286052, + "grad_norm": 2.7622742652893066, + "learning_rate": 1.5443001069788155e-06, + "loss": 0.3785, + "step": 7967 + }, + { + "epoch": 3.767375886524823, + "grad_norm": 2.965579032897949, + "learning_rate": 1.5437236915703829e-06, + "loss": 0.335, + "step": 7968 + }, + { + "epoch": 3.7678486997635936, + "grad_norm": 3.0587408542633057, + "learning_rate": 1.5431473357050816e-06, + "loss": 0.4047, + "step": 7969 + }, + { + "epoch": 3.768321513002364, + "grad_norm": 3.2929413318634033, + "learning_rate": 1.5425710394188014e-06, + "loss": 0.4061, + "step": 7970 + }, + { + "epoch": 3.7687943262411348, + "grad_norm": 2.663043975830078, + "learning_rate": 1.541994802747424e-06, + "loss": 0.3478, + "step": 7971 + }, + { + "epoch": 3.7692671394799055, + "grad_norm": 3.0657591819763184, + "learning_rate": 1.5414186257268293e-06, + "loss": 0.3735, + "step": 7972 + }, + { + "epoch": 3.769739952718676, + "grad_norm": 2.963189125061035, + "learning_rate": 1.5408425083928939e-06, + "loss": 0.4743, + "step": 7973 + }, + { + "epoch": 3.7702127659574467, + "grad_norm": 3.1509387493133545, + "learning_rate": 1.540266450781489e-06, + "loss": 0.4164, + "step": 7974 + }, + { + "epoch": 3.7706855791962175, + "grad_norm": 3.4436306953430176, + "learning_rate": 1.539690452928485e-06, + "loss": 0.4583, + "step": 7975 + }, + { + "epoch": 3.7711583924349883, + "grad_norm": 3.1746156215667725, + "learning_rate": 1.5391145148697454e-06, + "loss": 0.4042, + "step": 7976 + }, + { + "epoch": 3.771631205673759, + "grad_norm": 3.531028985977173, + "learning_rate": 1.5385386366411304e-06, + "loss": 0.4304, + "step": 7977 + }, + { + "epoch": 3.7721040189125294, + "grad_norm": 2.867871046066284, + "learning_rate": 1.5379628182785e-06, + "loss": 0.4023, + "step": 7978 + }, + { + "epoch": 3.7725768321513002, + "grad_norm": 
3.0504629611968994, + "learning_rate": 1.5373870598177051e-06, + "loss": 0.3785, + "step": 7979 + }, + { + "epoch": 3.773049645390071, + "grad_norm": 2.8188650608062744, + "learning_rate": 1.5368113612945983e-06, + "loss": 0.3808, + "step": 7980 + }, + { + "epoch": 3.7735224586288414, + "grad_norm": 3.0809133052825928, + "learning_rate": 1.5362357227450248e-06, + "loss": 0.3912, + "step": 7981 + }, + { + "epoch": 3.773995271867612, + "grad_norm": 3.223273277282715, + "learning_rate": 1.5356601442048257e-06, + "loss": 0.3802, + "step": 7982 + }, + { + "epoch": 3.774468085106383, + "grad_norm": 2.7513339519500732, + "learning_rate": 1.535084625709842e-06, + "loss": 0.3822, + "step": 7983 + }, + { + "epoch": 3.774940898345154, + "grad_norm": 3.085592031478882, + "learning_rate": 1.5345091672959074e-06, + "loss": 0.4348, + "step": 7984 + }, + { + "epoch": 3.7754137115839246, + "grad_norm": 3.315108299255371, + "learning_rate": 1.5339337689988525e-06, + "loss": 0.4196, + "step": 7985 + }, + { + "epoch": 3.775886524822695, + "grad_norm": 3.713372230529785, + "learning_rate": 1.533358430854507e-06, + "loss": 0.4292, + "step": 7986 + }, + { + "epoch": 3.7763593380614657, + "grad_norm": 2.7899155616760254, + "learning_rate": 1.532783152898692e-06, + "loss": 0.3874, + "step": 7987 + }, + { + "epoch": 3.7768321513002365, + "grad_norm": 2.918851852416992, + "learning_rate": 1.5322079351672297e-06, + "loss": 0.4073, + "step": 7988 + }, + { + "epoch": 3.777304964539007, + "grad_norm": 3.13395619392395, + "learning_rate": 1.5316327776959361e-06, + "loss": 0.3441, + "step": 7989 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 3.2320916652679443, + "learning_rate": 1.531057680520623e-06, + "loss": 0.372, + "step": 7990 + }, + { + "epoch": 3.7782505910165485, + "grad_norm": 3.1130621433258057, + "learning_rate": 1.5304826436770991e-06, + "loss": 0.3514, + "step": 7991 + }, + { + "epoch": 3.7787234042553193, + "grad_norm": 3.223207712173462, + "learning_rate": 1.5299076672011696e-06, + "loss": 0.44, + "step": 7992 + }, + { + "epoch": 3.77919621749409, + "grad_norm": 3.0757877826690674, + "learning_rate": 1.5293327511286366e-06, + "loss": 0.4051, + "step": 7993 + }, + { + "epoch": 3.7796690307328604, + "grad_norm": 2.936678409576416, + "learning_rate": 1.528757895495297e-06, + "loss": 0.3965, + "step": 7994 + }, + { + "epoch": 3.780141843971631, + "grad_norm": 2.993445873260498, + "learning_rate": 1.5281831003369435e-06, + "loss": 0.4222, + "step": 7995 + }, + { + "epoch": 3.780614657210402, + "grad_norm": 2.9140853881835938, + "learning_rate": 1.5276083656893679e-06, + "loss": 0.3662, + "step": 7996 + }, + { + "epoch": 3.7810874704491724, + "grad_norm": 3.2649893760681152, + "learning_rate": 1.5270336915883549e-06, + "loss": 0.4272, + "step": 7997 + }, + { + "epoch": 3.781560283687943, + "grad_norm": 3.0631372928619385, + "learning_rate": 1.5264590780696887e-06, + "loss": 0.4111, + "step": 7998 + }, + { + "epoch": 3.782033096926714, + "grad_norm": 2.791299343109131, + "learning_rate": 1.5258845251691463e-06, + "loss": 0.416, + "step": 7999 + }, + { + "epoch": 3.7825059101654848, + "grad_norm": 3.262294054031372, + "learning_rate": 1.5253100329225023e-06, + "loss": 0.4236, + "step": 8000 + }, + { + "epoch": 3.7829787234042556, + "grad_norm": 2.574486017227173, + "learning_rate": 1.5247356013655295e-06, + "loss": 0.4089, + "step": 8001 + }, + { + "epoch": 3.783451536643026, + "grad_norm": 3.1566531658172607, + "learning_rate": 1.5241612305339936e-06, + "loss": 0.3955, + "step": 8002 + }, + { + "epoch": 
3.7839243498817967, + "grad_norm": 2.5845813751220703, + "learning_rate": 1.5235869204636602e-06, + "loss": 0.3672, + "step": 8003 + }, + { + "epoch": 3.7843971631205675, + "grad_norm": 2.877570629119873, + "learning_rate": 1.5230126711902876e-06, + "loss": 0.3919, + "step": 8004 + }, + { + "epoch": 3.784869976359338, + "grad_norm": 3.183061122894287, + "learning_rate": 1.5224384827496314e-06, + "loss": 0.3291, + "step": 8005 + }, + { + "epoch": 3.7853427895981087, + "grad_norm": 3.0778391361236572, + "learning_rate": 1.5218643551774451e-06, + "loss": 0.3571, + "step": 8006 + }, + { + "epoch": 3.7858156028368795, + "grad_norm": 3.2364399433135986, + "learning_rate": 1.5212902885094762e-06, + "loss": 0.4045, + "step": 8007 + }, + { + "epoch": 3.7862884160756503, + "grad_norm": 3.0571746826171875, + "learning_rate": 1.5207162827814687e-06, + "loss": 0.4181, + "step": 8008 + }, + { + "epoch": 3.786761229314421, + "grad_norm": 2.7215163707733154, + "learning_rate": 1.5201423380291652e-06, + "loss": 0.3328, + "step": 8009 + }, + { + "epoch": 3.7872340425531914, + "grad_norm": 3.0521233081817627, + "learning_rate": 1.5195684542883007e-06, + "loss": 0.4072, + "step": 8010 + }, + { + "epoch": 3.787706855791962, + "grad_norm": 2.541666269302368, + "learning_rate": 1.5189946315946104e-06, + "loss": 0.3293, + "step": 8011 + }, + { + "epoch": 3.788179669030733, + "grad_norm": 3.0041720867156982, + "learning_rate": 1.5184208699838232e-06, + "loss": 0.3998, + "step": 8012 + }, + { + "epoch": 3.7886524822695034, + "grad_norm": 3.0763001441955566, + "learning_rate": 1.5178471694916635e-06, + "loss": 0.38, + "step": 8013 + }, + { + "epoch": 3.789125295508274, + "grad_norm": 3.0788497924804688, + "learning_rate": 1.5172735301538544e-06, + "loss": 0.3986, + "step": 8014 + }, + { + "epoch": 3.789598108747045, + "grad_norm": 2.830225944519043, + "learning_rate": 1.5166999520061127e-06, + "loss": 0.3977, + "step": 8015 + }, + { + "epoch": 3.7900709219858157, + "grad_norm": 3.196078062057495, + "learning_rate": 1.5161264350841543e-06, + "loss": 0.4058, + "step": 8016 + }, + { + "epoch": 3.7905437352245865, + "grad_norm": 9.898200988769531, + "learning_rate": 1.5155529794236884e-06, + "loss": 0.3451, + "step": 8017 + }, + { + "epoch": 3.791016548463357, + "grad_norm": 3.0028066635131836, + "learning_rate": 1.514979585060421e-06, + "loss": 0.4029, + "step": 8018 + }, + { + "epoch": 3.7914893617021277, + "grad_norm": 2.984926223754883, + "learning_rate": 1.5144062520300562e-06, + "loss": 0.3995, + "step": 8019 + }, + { + "epoch": 3.7919621749408985, + "grad_norm": 2.938596487045288, + "learning_rate": 1.5138329803682925e-06, + "loss": 0.386, + "step": 8020 + }, + { + "epoch": 3.792434988179669, + "grad_norm": 2.992565393447876, + "learning_rate": 1.513259770110825e-06, + "loss": 0.3919, + "step": 8021 + }, + { + "epoch": 3.7929078014184396, + "grad_norm": 3.0182361602783203, + "learning_rate": 1.5126866212933453e-06, + "loss": 0.3506, + "step": 8022 + }, + { + "epoch": 3.7933806146572104, + "grad_norm": 3.2039108276367188, + "learning_rate": 1.5121135339515392e-06, + "loss": 0.3807, + "step": 8023 + }, + { + "epoch": 3.7938534278959812, + "grad_norm": 2.9290878772735596, + "learning_rate": 1.5115405081210927e-06, + "loss": 0.3596, + "step": 8024 + }, + { + "epoch": 3.794326241134752, + "grad_norm": 3.106152057647705, + "learning_rate": 1.510967543837683e-06, + "loss": 0.3703, + "step": 8025 + }, + { + "epoch": 3.7947990543735224, + "grad_norm": 2.9752190113067627, + "learning_rate": 1.510394641136989e-06, + "loss": 
0.4049, + "step": 8026 + }, + { + "epoch": 3.795271867612293, + "grad_norm": 2.996206283569336, + "learning_rate": 1.5098218000546815e-06, + "loss": 0.4286, + "step": 8027 + }, + { + "epoch": 3.795744680851064, + "grad_norm": 2.9403493404388428, + "learning_rate": 1.5092490206264281e-06, + "loss": 0.3628, + "step": 8028 + }, + { + "epoch": 3.7962174940898343, + "grad_norm": 2.8101110458374023, + "learning_rate": 1.5086763028878943e-06, + "loss": 0.4016, + "step": 8029 + }, + { + "epoch": 3.796690307328605, + "grad_norm": 3.162264108657837, + "learning_rate": 1.5081036468747401e-06, + "loss": 0.4133, + "step": 8030 + }, + { + "epoch": 3.797163120567376, + "grad_norm": 2.6871988773345947, + "learning_rate": 1.5075310526226223e-06, + "loss": 0.3748, + "step": 8031 + }, + { + "epoch": 3.7976359338061467, + "grad_norm": 2.997924327850342, + "learning_rate": 1.5069585201671944e-06, + "loss": 0.4083, + "step": 8032 + }, + { + "epoch": 3.7981087470449175, + "grad_norm": 2.8266279697418213, + "learning_rate": 1.506386049544104e-06, + "loss": 0.4488, + "step": 8033 + }, + { + "epoch": 3.798581560283688, + "grad_norm": 2.7106378078460693, + "learning_rate": 1.5058136407889985e-06, + "loss": 0.363, + "step": 8034 + }, + { + "epoch": 3.7990543735224587, + "grad_norm": 2.8983304500579834, + "learning_rate": 1.5052412939375183e-06, + "loss": 0.4156, + "step": 8035 + }, + { + "epoch": 3.7995271867612295, + "grad_norm": 3.0333914756774902, + "learning_rate": 1.5046690090253001e-06, + "loss": 0.3694, + "step": 8036 + }, + { + "epoch": 3.8, + "grad_norm": 2.872662305831909, + "learning_rate": 1.5040967860879785e-06, + "loss": 0.3492, + "step": 8037 + }, + { + "epoch": 3.8004728132387706, + "grad_norm": 2.7279646396636963, + "learning_rate": 1.5035246251611835e-06, + "loss": 0.327, + "step": 8038 + }, + { + "epoch": 3.8009456264775414, + "grad_norm": 2.969326972961426, + "learning_rate": 1.5029525262805405e-06, + "loss": 0.3977, + "step": 8039 + }, + { + "epoch": 3.801418439716312, + "grad_norm": 3.073899745941162, + "learning_rate": 1.5023804894816723e-06, + "loss": 0.388, + "step": 8040 + }, + { + "epoch": 3.801891252955083, + "grad_norm": 3.026284694671631, + "learning_rate": 1.5018085148001953e-06, + "loss": 0.3761, + "step": 8041 + }, + { + "epoch": 3.8023640661938534, + "grad_norm": 3.0478618144989014, + "learning_rate": 1.5012366022717262e-06, + "loss": 0.4415, + "step": 8042 + }, + { + "epoch": 3.802836879432624, + "grad_norm": 2.801584005355835, + "learning_rate": 1.500664751931874e-06, + "loss": 0.4079, + "step": 8043 + }, + { + "epoch": 3.803309692671395, + "grad_norm": 3.4839112758636475, + "learning_rate": 1.5000929638162459e-06, + "loss": 0.4391, + "step": 8044 + }, + { + "epoch": 3.8037825059101653, + "grad_norm": 2.6945605278015137, + "learning_rate": 1.4995212379604446e-06, + "loss": 0.3564, + "step": 8045 + }, + { + "epoch": 3.804255319148936, + "grad_norm": 3.0870234966278076, + "learning_rate": 1.4989495744000687e-06, + "loss": 0.3801, + "step": 8046 + }, + { + "epoch": 3.804728132387707, + "grad_norm": 2.975332021713257, + "learning_rate": 1.4983779731707135e-06, + "loss": 0.3408, + "step": 8047 + }, + { + "epoch": 3.8052009456264777, + "grad_norm": 2.9920027256011963, + "learning_rate": 1.497806434307969e-06, + "loss": 0.3875, + "step": 8048 + }, + { + "epoch": 3.8056737588652485, + "grad_norm": 3.1974916458129883, + "learning_rate": 1.4972349578474244e-06, + "loss": 0.4492, + "step": 8049 + }, + { + "epoch": 3.806146572104019, + "grad_norm": 2.839503526687622, + "learning_rate": 
1.4966635438246622e-06, + "loss": 0.3785, + "step": 8050 + }, + { + "epoch": 3.8066193853427897, + "grad_norm": 3.274502992630005, + "learning_rate": 1.4960921922752603e-06, + "loss": 0.4404, + "step": 8051 + }, + { + "epoch": 3.8070921985815604, + "grad_norm": 3.0852737426757812, + "learning_rate": 1.4955209032347967e-06, + "loss": 0.4047, + "step": 8052 + }, + { + "epoch": 3.807565011820331, + "grad_norm": 2.9251608848571777, + "learning_rate": 1.4949496767388417e-06, + "loss": 0.3654, + "step": 8053 + }, + { + "epoch": 3.8080378250591016, + "grad_norm": 2.518220901489258, + "learning_rate": 1.4943785128229635e-06, + "loss": 0.3157, + "step": 8054 + }, + { + "epoch": 3.8085106382978724, + "grad_norm": 3.3993279933929443, + "learning_rate": 1.4938074115227257e-06, + "loss": 0.4204, + "step": 8055 + }, + { + "epoch": 3.808983451536643, + "grad_norm": 3.2847096920013428, + "learning_rate": 1.4932363728736876e-06, + "loss": 0.339, + "step": 8056 + }, + { + "epoch": 3.8094562647754135, + "grad_norm": 2.7779417037963867, + "learning_rate": 1.492665396911407e-06, + "loss": 0.3538, + "step": 8057 + }, + { + "epoch": 3.8099290780141843, + "grad_norm": 2.958131790161133, + "learning_rate": 1.4920944836714353e-06, + "loss": 0.363, + "step": 8058 + }, + { + "epoch": 3.810401891252955, + "grad_norm": 3.1873440742492676, + "learning_rate": 1.491523633189319e-06, + "loss": 0.3785, + "step": 8059 + }, + { + "epoch": 3.8108747044917255, + "grad_norm": 3.132652759552002, + "learning_rate": 1.4909528455006055e-06, + "loss": 0.375, + "step": 8060 + }, + { + "epoch": 3.8113475177304963, + "grad_norm": 2.8598761558532715, + "learning_rate": 1.490382120640833e-06, + "loss": 0.4152, + "step": 8061 + }, + { + "epoch": 3.811820330969267, + "grad_norm": 3.115870952606201, + "learning_rate": 1.4898114586455399e-06, + "loss": 0.4609, + "step": 8062 + }, + { + "epoch": 3.812293144208038, + "grad_norm": 3.347944974899292, + "learning_rate": 1.4892408595502571e-06, + "loss": 0.3836, + "step": 8063 + }, + { + "epoch": 3.8127659574468087, + "grad_norm": 3.1747031211853027, + "learning_rate": 1.4886703233905132e-06, + "loss": 0.374, + "step": 8064 + }, + { + "epoch": 3.813238770685579, + "grad_norm": 2.945139169692993, + "learning_rate": 1.4880998502018345e-06, + "loss": 0.3652, + "step": 8065 + }, + { + "epoch": 3.81371158392435, + "grad_norm": 2.8911492824554443, + "learning_rate": 1.4875294400197403e-06, + "loss": 0.3683, + "step": 8066 + }, + { + "epoch": 3.8141843971631206, + "grad_norm": 3.080268383026123, + "learning_rate": 1.4869590928797491e-06, + "loss": 0.3919, + "step": 8067 + }, + { + "epoch": 3.814657210401891, + "grad_norm": 3.0834288597106934, + "learning_rate": 1.4863888088173734e-06, + "loss": 0.3988, + "step": 8068 + }, + { + "epoch": 3.815130023640662, + "grad_norm": 2.765702724456787, + "learning_rate": 1.4858185878681213e-06, + "loss": 0.3659, + "step": 8069 + }, + { + "epoch": 3.8156028368794326, + "grad_norm": 3.074059247970581, + "learning_rate": 1.4852484300674993e-06, + "loss": 0.3888, + "step": 8070 + }, + { + "epoch": 3.8160756501182034, + "grad_norm": 3.0009944438934326, + "learning_rate": 1.484678335451007e-06, + "loss": 0.417, + "step": 8071 + }, + { + "epoch": 3.816548463356974, + "grad_norm": 2.6661112308502197, + "learning_rate": 1.4841083040541438e-06, + "loss": 0.3544, + "step": 8072 + }, + { + "epoch": 3.8170212765957445, + "grad_norm": 2.7849514484405518, + "learning_rate": 1.4835383359124018e-06, + "loss": 0.3691, + "step": 8073 + }, + { + "epoch": 3.8174940898345153, + "grad_norm": 
3.008070707321167, + "learning_rate": 1.4829684310612697e-06, + "loss": 0.4228, + "step": 8074 + }, + { + "epoch": 3.817966903073286, + "grad_norm": 2.649296998977661, + "learning_rate": 1.4823985895362348e-06, + "loss": 0.3642, + "step": 8075 + }, + { + "epoch": 3.8184397163120565, + "grad_norm": 2.6017661094665527, + "learning_rate": 1.4818288113727768e-06, + "loss": 0.3537, + "step": 8076 + }, + { + "epoch": 3.8189125295508273, + "grad_norm": 2.9071972370147705, + "learning_rate": 1.481259096606375e-06, + "loss": 0.3096, + "step": 8077 + }, + { + "epoch": 3.819385342789598, + "grad_norm": 3.0866518020629883, + "learning_rate": 1.4806894452725024e-06, + "loss": 0.4148, + "step": 8078 + }, + { + "epoch": 3.819858156028369, + "grad_norm": 3.2099499702453613, + "learning_rate": 1.4801198574066272e-06, + "loss": 0.4058, + "step": 8079 + }, + { + "epoch": 3.8203309692671397, + "grad_norm": 3.0204920768737793, + "learning_rate": 1.4795503330442176e-06, + "loss": 0.3427, + "step": 8080 + }, + { + "epoch": 3.82080378250591, + "grad_norm": 2.88667368888855, + "learning_rate": 1.478980872220734e-06, + "loss": 0.4075, + "step": 8081 + }, + { + "epoch": 3.821276595744681, + "grad_norm": 2.926673173904419, + "learning_rate": 1.4784114749716338e-06, + "loss": 0.3449, + "step": 8082 + }, + { + "epoch": 3.8217494089834516, + "grad_norm": 2.818936347961426, + "learning_rate": 1.4778421413323723e-06, + "loss": 0.3628, + "step": 8083 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 2.960322380065918, + "learning_rate": 1.4772728713383983e-06, + "loss": 0.3669, + "step": 8084 + }, + { + "epoch": 3.8226950354609928, + "grad_norm": 2.940131902694702, + "learning_rate": 1.4767036650251584e-06, + "loss": 0.4357, + "step": 8085 + }, + { + "epoch": 3.8231678486997636, + "grad_norm": 2.9251785278320312, + "learning_rate": 1.4761345224280943e-06, + "loss": 0.4046, + "step": 8086 + }, + { + "epoch": 3.8236406619385344, + "grad_norm": 3.115590810775757, + "learning_rate": 1.475565443582643e-06, + "loss": 0.3712, + "step": 8087 + }, + { + "epoch": 3.824113475177305, + "grad_norm": 2.5968618392944336, + "learning_rate": 1.4749964285242408e-06, + "loss": 0.3432, + "step": 8088 + }, + { + "epoch": 3.8245862884160755, + "grad_norm": 3.195409059524536, + "learning_rate": 1.4744274772883148e-06, + "loss": 0.3717, + "step": 8089 + }, + { + "epoch": 3.8250591016548463, + "grad_norm": 2.8658018112182617, + "learning_rate": 1.4738585899102942e-06, + "loss": 0.3807, + "step": 8090 + }, + { + "epoch": 3.825531914893617, + "grad_norm": 2.9005510807037354, + "learning_rate": 1.4732897664255998e-06, + "loss": 0.3988, + "step": 8091 + }, + { + "epoch": 3.8260047281323875, + "grad_norm": 3.9155731201171875, + "learning_rate": 1.472721006869649e-06, + "loss": 0.3981, + "step": 8092 + }, + { + "epoch": 3.8264775413711583, + "grad_norm": 2.89312744140625, + "learning_rate": 1.4721523112778575e-06, + "loss": 0.3286, + "step": 8093 + }, + { + "epoch": 3.826950354609929, + "grad_norm": 3.006071090698242, + "learning_rate": 1.4715836796856332e-06, + "loss": 0.3901, + "step": 8094 + }, + { + "epoch": 3.8274231678487, + "grad_norm": 3.083411693572998, + "learning_rate": 1.4710151121283845e-06, + "loss": 0.3741, + "step": 8095 + }, + { + "epoch": 3.8278959810874706, + "grad_norm": 2.864989995956421, + "learning_rate": 1.4704466086415131e-06, + "loss": 0.3887, + "step": 8096 + }, + { + "epoch": 3.828368794326241, + "grad_norm": 2.4846417903900146, + "learning_rate": 1.4698781692604158e-06, + "loss": 0.33, + "step": 8097 + }, + { + "epoch": 
3.828841607565012, + "grad_norm": 3.2497007846832275, + "learning_rate": 1.4693097940204893e-06, + "loss": 0.4011, + "step": 8098 + }, + { + "epoch": 3.8293144208037826, + "grad_norm": 3.0079777240753174, + "learning_rate": 1.4687414829571218e-06, + "loss": 0.4263, + "step": 8099 + }, + { + "epoch": 3.829787234042553, + "grad_norm": 2.8538410663604736, + "learning_rate": 1.4681732361057005e-06, + "loss": 0.3651, + "step": 8100 + }, + { + "epoch": 3.8302600472813237, + "grad_norm": 3.238163948059082, + "learning_rate": 1.4676050535016076e-06, + "loss": 0.392, + "step": 8101 + }, + { + "epoch": 3.8307328605200945, + "grad_norm": 2.9991304874420166, + "learning_rate": 1.46703693518022e-06, + "loss": 0.3643, + "step": 8102 + }, + { + "epoch": 3.8312056737588653, + "grad_norm": 2.9816839694976807, + "learning_rate": 1.466468881176914e-06, + "loss": 0.3803, + "step": 8103 + }, + { + "epoch": 3.831678486997636, + "grad_norm": 3.2009265422821045, + "learning_rate": 1.465900891527059e-06, + "loss": 0.3828, + "step": 8104 + }, + { + "epoch": 3.8321513002364065, + "grad_norm": 2.9479124546051025, + "learning_rate": 1.4653329662660201e-06, + "loss": 0.3683, + "step": 8105 + }, + { + "epoch": 3.8326241134751773, + "grad_norm": 2.938507080078125, + "learning_rate": 1.4647651054291614e-06, + "loss": 0.3703, + "step": 8106 + }, + { + "epoch": 3.833096926713948, + "grad_norm": 2.7777645587921143, + "learning_rate": 1.4641973090518397e-06, + "loss": 0.3982, + "step": 8107 + }, + { + "epoch": 3.8335697399527184, + "grad_norm": 3.2470149993896484, + "learning_rate": 1.4636295771694099e-06, + "loss": 0.3748, + "step": 8108 + }, + { + "epoch": 3.8340425531914892, + "grad_norm": 2.869310140609741, + "learning_rate": 1.4630619098172223e-06, + "loss": 0.3577, + "step": 8109 + }, + { + "epoch": 3.83451536643026, + "grad_norm": 3.1245369911193848, + "learning_rate": 1.4624943070306225e-06, + "loss": 0.4518, + "step": 8110 + }, + { + "epoch": 3.834988179669031, + "grad_norm": 3.0390701293945312, + "learning_rate": 1.4619267688449529e-06, + "loss": 0.5051, + "step": 8111 + }, + { + "epoch": 3.8354609929078016, + "grad_norm": 2.929943799972534, + "learning_rate": 1.4613592952955507e-06, + "loss": 0.4207, + "step": 8112 + }, + { + "epoch": 3.835933806146572, + "grad_norm": 3.17008376121521, + "learning_rate": 1.4607918864177523e-06, + "loss": 0.3836, + "step": 8113 + }, + { + "epoch": 3.8364066193853428, + "grad_norm": 3.0689237117767334, + "learning_rate": 1.460224542246886e-06, + "loss": 0.3413, + "step": 8114 + }, + { + "epoch": 3.8368794326241136, + "grad_norm": 2.9966423511505127, + "learning_rate": 1.4596572628182774e-06, + "loss": 0.4367, + "step": 8115 + }, + { + "epoch": 3.837352245862884, + "grad_norm": 3.0572052001953125, + "learning_rate": 1.45909004816725e-06, + "loss": 0.4089, + "step": 8116 + }, + { + "epoch": 3.8378250591016547, + "grad_norm": 2.911263942718506, + "learning_rate": 1.4585228983291203e-06, + "loss": 0.3848, + "step": 8117 + }, + { + "epoch": 3.8382978723404255, + "grad_norm": 2.9233853816986084, + "learning_rate": 1.4579558133392038e-06, + "loss": 0.4012, + "step": 8118 + }, + { + "epoch": 3.8387706855791963, + "grad_norm": 2.7813868522644043, + "learning_rate": 1.4573887932328097e-06, + "loss": 0.3898, + "step": 8119 + }, + { + "epoch": 3.839243498817967, + "grad_norm": 2.8727006912231445, + "learning_rate": 1.4568218380452436e-06, + "loss": 0.3965, + "step": 8120 + }, + { + "epoch": 3.8397163120567375, + "grad_norm": 3.0381174087524414, + "learning_rate": 1.4562549478118077e-06, + 
"loss": 0.4304, + "step": 8121 + }, + { + "epoch": 3.8401891252955083, + "grad_norm": 2.7406346797943115, + "learning_rate": 1.4556881225677982e-06, + "loss": 0.3636, + "step": 8122 + }, + { + "epoch": 3.840661938534279, + "grad_norm": 3.3900108337402344, + "learning_rate": 1.4551213623485111e-06, + "loss": 0.3863, + "step": 8123 + }, + { + "epoch": 3.8411347517730494, + "grad_norm": 2.885150909423828, + "learning_rate": 1.4545546671892354e-06, + "loss": 0.3679, + "step": 8124 + }, + { + "epoch": 3.84160756501182, + "grad_norm": 3.3361690044403076, + "learning_rate": 1.4539880371252555e-06, + "loss": 0.4333, + "step": 8125 + }, + { + "epoch": 3.842080378250591, + "grad_norm": 3.1547763347625732, + "learning_rate": 1.4534214721918545e-06, + "loss": 0.4477, + "step": 8126 + }, + { + "epoch": 3.842553191489362, + "grad_norm": 3.0337510108947754, + "learning_rate": 1.4528549724243095e-06, + "loss": 0.3647, + "step": 8127 + }, + { + "epoch": 3.8430260047281326, + "grad_norm": 2.8390069007873535, + "learning_rate": 1.452288537857893e-06, + "loss": 0.3698, + "step": 8128 + }, + { + "epoch": 3.843498817966903, + "grad_norm": 2.857513427734375, + "learning_rate": 1.451722168527876e-06, + "loss": 0.3842, + "step": 8129 + }, + { + "epoch": 3.8439716312056738, + "grad_norm": 3.015320062637329, + "learning_rate": 1.451155864469522e-06, + "loss": 0.4058, + "step": 8130 + }, + { + "epoch": 3.8444444444444446, + "grad_norm": 2.923957347869873, + "learning_rate": 1.450589625718094e-06, + "loss": 0.3976, + "step": 8131 + }, + { + "epoch": 3.844917257683215, + "grad_norm": 3.332338571548462, + "learning_rate": 1.4500234523088492e-06, + "loss": 0.4118, + "step": 8132 + }, + { + "epoch": 3.8453900709219857, + "grad_norm": 3.0403711795806885, + "learning_rate": 1.4494573442770381e-06, + "loss": 0.3715, + "step": 8133 + }, + { + "epoch": 3.8458628841607565, + "grad_norm": 3.2310287952423096, + "learning_rate": 1.4488913016579135e-06, + "loss": 0.4587, + "step": 8134 + }, + { + "epoch": 3.8463356973995273, + "grad_norm": 3.091282844543457, + "learning_rate": 1.448325324486718e-06, + "loss": 0.4234, + "step": 8135 + }, + { + "epoch": 3.846808510638298, + "grad_norm": 3.11161208152771, + "learning_rate": 1.4477594127986933e-06, + "loss": 0.4176, + "step": 8136 + }, + { + "epoch": 3.8472813238770684, + "grad_norm": 3.21042537689209, + "learning_rate": 1.4471935666290751e-06, + "loss": 0.4326, + "step": 8137 + }, + { + "epoch": 3.8477541371158392, + "grad_norm": 3.411543846130371, + "learning_rate": 1.4466277860130981e-06, + "loss": 0.4525, + "step": 8138 + }, + { + "epoch": 3.84822695035461, + "grad_norm": 3.0475308895111084, + "learning_rate": 1.4460620709859898e-06, + "loss": 0.3906, + "step": 8139 + }, + { + "epoch": 3.8486997635933804, + "grad_norm": 2.989367723464966, + "learning_rate": 1.4454964215829742e-06, + "loss": 0.3732, + "step": 8140 + }, + { + "epoch": 3.849172576832151, + "grad_norm": 2.8130393028259277, + "learning_rate": 1.4449308378392734e-06, + "loss": 0.3733, + "step": 8141 + }, + { + "epoch": 3.849645390070922, + "grad_norm": 12.2243013381958, + "learning_rate": 1.444365319790103e-06, + "loss": 0.3506, + "step": 8142 + }, + { + "epoch": 3.850118203309693, + "grad_norm": 3.075556516647339, + "learning_rate": 1.4437998674706743e-06, + "loss": 0.376, + "step": 8143 + }, + { + "epoch": 3.8505910165484636, + "grad_norm": 2.765650510787964, + "learning_rate": 1.4432344809161974e-06, + "loss": 0.3865, + "step": 8144 + }, + { + "epoch": 3.851063829787234, + "grad_norm": 3.171588897705078, + 
"learning_rate": 1.4426691601618747e-06, + "loss": 0.4391, + "step": 8145 + }, + { + "epoch": 3.8515366430260047, + "grad_norm": 2.8378992080688477, + "learning_rate": 1.4421039052429083e-06, + "loss": 0.3984, + "step": 8146 + }, + { + "epoch": 3.8520094562647755, + "grad_norm": 2.6588387489318848, + "learning_rate": 1.4415387161944929e-06, + "loss": 0.3961, + "step": 8147 + }, + { + "epoch": 3.852482269503546, + "grad_norm": 2.919325351715088, + "learning_rate": 1.4409735930518197e-06, + "loss": 0.4058, + "step": 8148 + }, + { + "epoch": 3.8529550827423167, + "grad_norm": 3.2239115238189697, + "learning_rate": 1.4404085358500778e-06, + "loss": 0.4018, + "step": 8149 + }, + { + "epoch": 3.8534278959810875, + "grad_norm": 3.2509875297546387, + "learning_rate": 1.4398435446244502e-06, + "loss": 0.4078, + "step": 8150 + }, + { + "epoch": 3.8539007092198583, + "grad_norm": 3.124782085418701, + "learning_rate": 1.4392786194101155e-06, + "loss": 0.4459, + "step": 8151 + }, + { + "epoch": 3.854373522458629, + "grad_norm": 2.924095392227173, + "learning_rate": 1.4387137602422512e-06, + "loss": 0.3686, + "step": 8152 + }, + { + "epoch": 3.8548463356973994, + "grad_norm": 2.9307191371917725, + "learning_rate": 1.4381489671560272e-06, + "loss": 0.4345, + "step": 8153 + }, + { + "epoch": 3.8553191489361702, + "grad_norm": 2.868488073348999, + "learning_rate": 1.4375842401866113e-06, + "loss": 0.366, + "step": 8154 + }, + { + "epoch": 3.855791962174941, + "grad_norm": 2.9893085956573486, + "learning_rate": 1.4370195793691661e-06, + "loss": 0.3401, + "step": 8155 + }, + { + "epoch": 3.8562647754137114, + "grad_norm": 3.0113472938537598, + "learning_rate": 1.4364549847388492e-06, + "loss": 0.4051, + "step": 8156 + }, + { + "epoch": 3.856737588652482, + "grad_norm": 3.4693121910095215, + "learning_rate": 1.4358904563308184e-06, + "loss": 0.4505, + "step": 8157 + }, + { + "epoch": 3.857210401891253, + "grad_norm": 2.9048118591308594, + "learning_rate": 1.4353259941802216e-06, + "loss": 0.3973, + "step": 8158 + }, + { + "epoch": 3.8576832151300238, + "grad_norm": 3.264910936355591, + "learning_rate": 1.434761598322208e-06, + "loss": 0.4317, + "step": 8159 + }, + { + "epoch": 3.8581560283687946, + "grad_norm": 2.973742723464966, + "learning_rate": 1.4341972687919186e-06, + "loss": 0.3896, + "step": 8160 + }, + { + "epoch": 3.858628841607565, + "grad_norm": 2.7802605628967285, + "learning_rate": 1.4336330056244906e-06, + "loss": 0.4063, + "step": 8161 + }, + { + "epoch": 3.8591016548463357, + "grad_norm": 3.1401731967926025, + "learning_rate": 1.433068808855061e-06, + "loss": 0.4068, + "step": 8162 + }, + { + "epoch": 3.8595744680851065, + "grad_norm": 3.132723331451416, + "learning_rate": 1.432504678518757e-06, + "loss": 0.4724, + "step": 8163 + }, + { + "epoch": 3.860047281323877, + "grad_norm": 2.94944167137146, + "learning_rate": 1.4319406146507068e-06, + "loss": 0.3666, + "step": 8164 + }, + { + "epoch": 3.8605200945626477, + "grad_norm": 2.972322463989258, + "learning_rate": 1.4313766172860311e-06, + "loss": 0.4226, + "step": 8165 + }, + { + "epoch": 3.8609929078014185, + "grad_norm": 2.9808123111724854, + "learning_rate": 1.430812686459847e-06, + "loss": 0.4079, + "step": 8166 + }, + { + "epoch": 3.8614657210401893, + "grad_norm": 2.9656291007995605, + "learning_rate": 1.4302488222072698e-06, + "loss": 0.3423, + "step": 8167 + }, + { + "epoch": 3.86193853427896, + "grad_norm": 2.886765241622925, + "learning_rate": 1.4296850245634073e-06, + "loss": 0.3577, + "step": 8168 + }, + { + "epoch": 
3.8624113475177304, + "grad_norm": 3.0613043308258057, + "learning_rate": 1.4291212935633653e-06, + "loss": 0.4121, + "step": 8169 + }, + { + "epoch": 3.862884160756501, + "grad_norm": 2.842050313949585, + "learning_rate": 1.4285576292422445e-06, + "loss": 0.373, + "step": 8170 + }, + { + "epoch": 3.863356973995272, + "grad_norm": 3.0604517459869385, + "learning_rate": 1.4279940316351413e-06, + "loss": 0.3938, + "step": 8171 + }, + { + "epoch": 3.8638297872340424, + "grad_norm": 3.9742302894592285, + "learning_rate": 1.42743050077715e-06, + "loss": 0.4463, + "step": 8172 + }, + { + "epoch": 3.864302600472813, + "grad_norm": 2.8330607414245605, + "learning_rate": 1.4268670367033572e-06, + "loss": 0.4423, + "step": 8173 + }, + { + "epoch": 3.864775413711584, + "grad_norm": 2.953256607055664, + "learning_rate": 1.4263036394488497e-06, + "loss": 0.3553, + "step": 8174 + }, + { + "epoch": 3.8652482269503547, + "grad_norm": 2.865849018096924, + "learning_rate": 1.4257403090487065e-06, + "loss": 0.3348, + "step": 8175 + }, + { + "epoch": 3.8657210401891255, + "grad_norm": 2.712502956390381, + "learning_rate": 1.4251770455380027e-06, + "loss": 0.3896, + "step": 8176 + }, + { + "epoch": 3.866193853427896, + "grad_norm": 2.798898220062256, + "learning_rate": 1.4246138489518123e-06, + "loss": 0.4275, + "step": 8177 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 2.830899953842163, + "learning_rate": 1.4240507193252023e-06, + "loss": 0.3952, + "step": 8178 + }, + { + "epoch": 3.8671394799054375, + "grad_norm": 2.5789451599121094, + "learning_rate": 1.4234876566932348e-06, + "loss": 0.3483, + "step": 8179 + }, + { + "epoch": 3.867612293144208, + "grad_norm": 2.8513095378875732, + "learning_rate": 1.422924661090972e-06, + "loss": 0.3403, + "step": 8180 + }, + { + "epoch": 3.8680851063829786, + "grad_norm": 3.5031449794769287, + "learning_rate": 1.4223617325534664e-06, + "loss": 0.3964, + "step": 8181 + }, + { + "epoch": 3.8685579196217494, + "grad_norm": 2.7495479583740234, + "learning_rate": 1.4217988711157715e-06, + "loss": 0.3376, + "step": 8182 + }, + { + "epoch": 3.8690307328605202, + "grad_norm": 2.8609421253204346, + "learning_rate": 1.421236076812933e-06, + "loss": 0.3967, + "step": 8183 + }, + { + "epoch": 3.869503546099291, + "grad_norm": 3.0624637603759766, + "learning_rate": 1.420673349679994e-06, + "loss": 0.3764, + "step": 8184 + }, + { + "epoch": 3.8699763593380614, + "grad_norm": 3.3084404468536377, + "learning_rate": 1.4201106897519926e-06, + "loss": 0.4567, + "step": 8185 + }, + { + "epoch": 3.870449172576832, + "grad_norm": 3.164116382598877, + "learning_rate": 1.4195480970639624e-06, + "loss": 0.4217, + "step": 8186 + }, + { + "epoch": 3.870921985815603, + "grad_norm": 2.971390724182129, + "learning_rate": 1.4189855716509355e-06, + "loss": 0.3981, + "step": 8187 + }, + { + "epoch": 3.8713947990543733, + "grad_norm": 3.0537233352661133, + "learning_rate": 1.418423113547937e-06, + "loss": 0.4093, + "step": 8188 + }, + { + "epoch": 3.871867612293144, + "grad_norm": 3.698120594024658, + "learning_rate": 1.4178607227899877e-06, + "loss": 0.3158, + "step": 8189 + }, + { + "epoch": 3.872340425531915, + "grad_norm": 3.0320451259613037, + "learning_rate": 1.417298399412107e-06, + "loss": 0.3903, + "step": 8190 + }, + { + "epoch": 3.8728132387706857, + "grad_norm": 2.913296699523926, + "learning_rate": 1.4167361434493068e-06, + "loss": 0.3396, + "step": 8191 + }, + { + "epoch": 3.8732860520094565, + "grad_norm": 3.011906147003174, + "learning_rate": 1.4161739549365976e-06, + "loss": 
0.3915, + "step": 8192 + }, + { + "epoch": 3.873758865248227, + "grad_norm": 3.2707724571228027, + "learning_rate": 1.4156118339089842e-06, + "loss": 0.4466, + "step": 8193 + }, + { + "epoch": 3.8742316784869977, + "grad_norm": 3.036747694015503, + "learning_rate": 1.4150497804014656e-06, + "loss": 0.4095, + "step": 8194 + }, + { + "epoch": 3.8747044917257685, + "grad_norm": 2.8851394653320312, + "learning_rate": 1.4144877944490411e-06, + "loss": 0.4235, + "step": 8195 + }, + { + "epoch": 3.875177304964539, + "grad_norm": 3.099785566329956, + "learning_rate": 1.4139258760867008e-06, + "loss": 0.4102, + "step": 8196 + }, + { + "epoch": 3.8756501182033096, + "grad_norm": 3.0752081871032715, + "learning_rate": 1.4133640253494347e-06, + "loss": 0.4165, + "step": 8197 + }, + { + "epoch": 3.8761229314420804, + "grad_norm": 2.842257261276245, + "learning_rate": 1.412802242272226e-06, + "loss": 0.3573, + "step": 8198 + }, + { + "epoch": 3.876595744680851, + "grad_norm": 2.93868350982666, + "learning_rate": 1.4122405268900547e-06, + "loss": 0.36, + "step": 8199 + }, + { + "epoch": 3.877068557919622, + "grad_norm": 2.674356460571289, + "learning_rate": 1.411678879237896e-06, + "loss": 0.3763, + "step": 8200 + }, + { + "epoch": 3.8775413711583924, + "grad_norm": 2.710617780685425, + "learning_rate": 1.411117299350721e-06, + "loss": 0.358, + "step": 8201 + }, + { + "epoch": 3.878014184397163, + "grad_norm": 3.0299410820007324, + "learning_rate": 1.4105557872634968e-06, + "loss": 0.3723, + "step": 8202 + }, + { + "epoch": 3.878486997635934, + "grad_norm": 3.1951241493225098, + "learning_rate": 1.4099943430111874e-06, + "loss": 0.4163, + "step": 8203 + }, + { + "epoch": 3.8789598108747043, + "grad_norm": 2.752410411834717, + "learning_rate": 1.4094329666287495e-06, + "loss": 0.3753, + "step": 8204 + }, + { + "epoch": 3.879432624113475, + "grad_norm": 3.1242496967315674, + "learning_rate": 1.40887165815114e-06, + "loss": 0.3694, + "step": 8205 + }, + { + "epoch": 3.879905437352246, + "grad_norm": 5.16750431060791, + "learning_rate": 1.4083104176133079e-06, + "loss": 0.3869, + "step": 8206 + }, + { + "epoch": 3.8803782505910167, + "grad_norm": 3.2995245456695557, + "learning_rate": 1.4077492450501978e-06, + "loss": 0.4194, + "step": 8207 + }, + { + "epoch": 3.8808510638297875, + "grad_norm": 3.506807804107666, + "learning_rate": 1.4071881404967541e-06, + "loss": 0.3873, + "step": 8208 + }, + { + "epoch": 3.881323877068558, + "grad_norm": 3.1201252937316895, + "learning_rate": 1.4066271039879123e-06, + "loss": 0.3625, + "step": 8209 + }, + { + "epoch": 3.8817966903073287, + "grad_norm": 2.870683193206787, + "learning_rate": 1.4060661355586073e-06, + "loss": 0.4039, + "step": 8210 + }, + { + "epoch": 3.8822695035460995, + "grad_norm": 3.177701234817505, + "learning_rate": 1.405505235243767e-06, + "loss": 0.3715, + "step": 8211 + }, + { + "epoch": 3.88274231678487, + "grad_norm": 3.0319771766662598, + "learning_rate": 1.4049444030783157e-06, + "loss": 0.3588, + "step": 8212 + }, + { + "epoch": 3.8832151300236406, + "grad_norm": 2.4598889350891113, + "learning_rate": 1.404383639097176e-06, + "loss": 0.2788, + "step": 8213 + }, + { + "epoch": 3.8836879432624114, + "grad_norm": 2.916987419128418, + "learning_rate": 1.4038229433352623e-06, + "loss": 0.4167, + "step": 8214 + }, + { + "epoch": 3.884160756501182, + "grad_norm": 3.005075216293335, + "learning_rate": 1.4032623158274872e-06, + "loss": 0.4251, + "step": 8215 + }, + { + "epoch": 3.8846335697399526, + "grad_norm": 3.1718621253967285, + "learning_rate": 
1.4027017566087591e-06, + "loss": 0.383, + "step": 8216 + }, + { + "epoch": 3.8851063829787233, + "grad_norm": 2.954662322998047, + "learning_rate": 1.402141265713981e-06, + "loss": 0.373, + "step": 8217 + }, + { + "epoch": 3.885579196217494, + "grad_norm": 3.408008337020874, + "learning_rate": 1.4015808431780526e-06, + "loss": 0.4216, + "step": 8218 + }, + { + "epoch": 3.8860520094562645, + "grad_norm": 3.1599369049072266, + "learning_rate": 1.4010204890358675e-06, + "loss": 0.4544, + "step": 8219 + }, + { + "epoch": 3.8865248226950353, + "grad_norm": 2.8919107913970947, + "learning_rate": 1.4004602033223186e-06, + "loss": 0.3785, + "step": 8220 + }, + { + "epoch": 3.886997635933806, + "grad_norm": 3.522581100463867, + "learning_rate": 1.3998999860722918e-06, + "loss": 0.4276, + "step": 8221 + }, + { + "epoch": 3.887470449172577, + "grad_norm": 2.9278945922851562, + "learning_rate": 1.399339837320668e-06, + "loss": 0.409, + "step": 8222 + }, + { + "epoch": 3.8879432624113477, + "grad_norm": 3.032557725906372, + "learning_rate": 1.398779757102327e-06, + "loss": 0.3973, + "step": 8223 + }, + { + "epoch": 3.888416075650118, + "grad_norm": 2.843118667602539, + "learning_rate": 1.3982197454521423e-06, + "loss": 0.3418, + "step": 8224 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 2.8620638847351074, + "learning_rate": 1.3976598024049815e-06, + "loss": 0.3751, + "step": 8225 + }, + { + "epoch": 3.8893617021276596, + "grad_norm": 2.532327175140381, + "learning_rate": 1.3970999279957124e-06, + "loss": 0.3541, + "step": 8226 + }, + { + "epoch": 3.88983451536643, + "grad_norm": 3.1074535846710205, + "learning_rate": 1.3965401222591935e-06, + "loss": 0.4706, + "step": 8227 + }, + { + "epoch": 3.890307328605201, + "grad_norm": 3.1558735370635986, + "learning_rate": 1.3959803852302839e-06, + "loss": 0.448, + "step": 8228 + }, + { + "epoch": 3.8907801418439716, + "grad_norm": 3.0862064361572266, + "learning_rate": 1.3954207169438344e-06, + "loss": 0.3308, + "step": 8229 + }, + { + "epoch": 3.8912529550827424, + "grad_norm": 2.9246280193328857, + "learning_rate": 1.3948611174346927e-06, + "loss": 0.3771, + "step": 8230 + }, + { + "epoch": 3.891725768321513, + "grad_norm": 2.7959492206573486, + "learning_rate": 1.394301586737704e-06, + "loss": 0.4248, + "step": 8231 + }, + { + "epoch": 3.8921985815602835, + "grad_norm": 2.787670373916626, + "learning_rate": 1.3937421248877075e-06, + "loss": 0.3416, + "step": 8232 + }, + { + "epoch": 3.8926713947990543, + "grad_norm": 3.0775792598724365, + "learning_rate": 1.393182731919538e-06, + "loss": 0.4345, + "step": 8233 + }, + { + "epoch": 3.893144208037825, + "grad_norm": 2.6338887214660645, + "learning_rate": 1.3926234078680268e-06, + "loss": 0.3995, + "step": 8234 + }, + { + "epoch": 3.8936170212765955, + "grad_norm": 2.9975900650024414, + "learning_rate": 1.392064152767999e-06, + "loss": 0.3997, + "step": 8235 + }, + { + "epoch": 3.8940898345153663, + "grad_norm": 2.8615779876708984, + "learning_rate": 1.3915049666542791e-06, + "loss": 0.3687, + "step": 8236 + }, + { + "epoch": 3.894562647754137, + "grad_norm": 3.0132436752319336, + "learning_rate": 1.3909458495616835e-06, + "loss": 0.4085, + "step": 8237 + }, + { + "epoch": 3.895035460992908, + "grad_norm": 3.141291379928589, + "learning_rate": 1.3903868015250278e-06, + "loss": 0.3903, + "step": 8238 + }, + { + "epoch": 3.8955082742316787, + "grad_norm": 2.6998603343963623, + "learning_rate": 1.3898278225791204e-06, + "loss": 0.3576, + "step": 8239 + }, + { + "epoch": 3.895981087470449, + "grad_norm": 
3.212578535079956, + "learning_rate": 1.3892689127587656e-06, + "loss": 0.4321, + "step": 8240 + }, + { + "epoch": 3.89645390070922, + "grad_norm": 3.15732741355896, + "learning_rate": 1.3887100720987662e-06, + "loss": 0.4247, + "step": 8241 + }, + { + "epoch": 3.8969267139479906, + "grad_norm": 2.6001040935516357, + "learning_rate": 1.3881513006339168e-06, + "loss": 0.3376, + "step": 8242 + }, + { + "epoch": 3.897399527186761, + "grad_norm": 2.766188859939575, + "learning_rate": 1.3875925983990113e-06, + "loss": 0.3771, + "step": 8243 + }, + { + "epoch": 3.8978723404255318, + "grad_norm": 2.7471580505371094, + "learning_rate": 1.3870339654288372e-06, + "loss": 0.3311, + "step": 8244 + }, + { + "epoch": 3.8983451536643026, + "grad_norm": 3.577664375305176, + "learning_rate": 1.3864754017581769e-06, + "loss": 0.3725, + "step": 8245 + }, + { + "epoch": 3.8988179669030734, + "grad_norm": 2.8747243881225586, + "learning_rate": 1.3859169074218116e-06, + "loss": 0.3706, + "step": 8246 + }, + { + "epoch": 3.899290780141844, + "grad_norm": 2.5249671936035156, + "learning_rate": 1.3853584824545152e-06, + "loss": 0.3621, + "step": 8247 + }, + { + "epoch": 3.8997635933806145, + "grad_norm": 2.7290890216827393, + "learning_rate": 1.3848001268910589e-06, + "loss": 0.3209, + "step": 8248 + }, + { + "epoch": 3.9002364066193853, + "grad_norm": 3.0917534828186035, + "learning_rate": 1.3842418407662084e-06, + "loss": 0.3904, + "step": 8249 + }, + { + "epoch": 3.900709219858156, + "grad_norm": 3.099494695663452, + "learning_rate": 1.383683624114725e-06, + "loss": 0.3714, + "step": 8250 + }, + { + "epoch": 3.9011820330969265, + "grad_norm": 3.077505588531494, + "learning_rate": 1.3831254769713687e-06, + "loss": 0.4166, + "step": 8251 + }, + { + "epoch": 3.9016548463356973, + "grad_norm": 2.9983766078948975, + "learning_rate": 1.3825673993708915e-06, + "loss": 0.3909, + "step": 8252 + }, + { + "epoch": 3.902127659574468, + "grad_norm": 2.7958667278289795, + "learning_rate": 1.3820093913480415e-06, + "loss": 0.3966, + "step": 8253 + }, + { + "epoch": 3.902600472813239, + "grad_norm": 3.0938336849212646, + "learning_rate": 1.3814514529375656e-06, + "loss": 0.4118, + "step": 8254 + }, + { + "epoch": 3.9030732860520096, + "grad_norm": 3.2711637020111084, + "learning_rate": 1.3808935841742016e-06, + "loss": 0.4021, + "step": 8255 + }, + { + "epoch": 3.90354609929078, + "grad_norm": 3.23563814163208, + "learning_rate": 1.3803357850926885e-06, + "loss": 0.3679, + "step": 8256 + }, + { + "epoch": 3.904018912529551, + "grad_norm": 2.77942156791687, + "learning_rate": 1.3797780557277563e-06, + "loss": 0.3938, + "step": 8257 + }, + { + "epoch": 3.9044917257683216, + "grad_norm": 3.1273257732391357, + "learning_rate": 1.3792203961141313e-06, + "loss": 0.3579, + "step": 8258 + }, + { + "epoch": 3.904964539007092, + "grad_norm": 3.69164776802063, + "learning_rate": 1.378662806286539e-06, + "loss": 0.3712, + "step": 8259 + }, + { + "epoch": 3.9054373522458627, + "grad_norm": 2.8818306922912598, + "learning_rate": 1.3781052862796957e-06, + "loss": 0.3972, + "step": 8260 + }, + { + "epoch": 3.9059101654846335, + "grad_norm": 2.776651382446289, + "learning_rate": 1.377547836128318e-06, + "loss": 0.3605, + "step": 8261 + }, + { + "epoch": 3.9063829787234043, + "grad_norm": 3.1498706340789795, + "learning_rate": 1.376990455867115e-06, + "loss": 0.3995, + "step": 8262 + }, + { + "epoch": 3.906855791962175, + "grad_norm": 2.777390956878662, + "learning_rate": 1.3764331455307916e-06, + "loss": 0.3463, + "step": 8263 + }, + { + 
"epoch": 3.9073286052009455, + "grad_norm": 2.9953835010528564, + "learning_rate": 1.3758759051540496e-06, + "loss": 0.3881, + "step": 8264 + }, + { + "epoch": 3.9078014184397163, + "grad_norm": 3.737194538116455, + "learning_rate": 1.375318734771585e-06, + "loss": 0.4456, + "step": 8265 + }, + { + "epoch": 3.908274231678487, + "grad_norm": 3.1575849056243896, + "learning_rate": 1.374761634418092e-06, + "loss": 0.3613, + "step": 8266 + }, + { + "epoch": 3.9087470449172574, + "grad_norm": 3.140662908554077, + "learning_rate": 1.374204604128258e-06, + "loss": 0.4462, + "step": 8267 + }, + { + "epoch": 3.9092198581560282, + "grad_norm": 3.2106714248657227, + "learning_rate": 1.3736476439367663e-06, + "loss": 0.3801, + "step": 8268 + }, + { + "epoch": 3.909692671394799, + "grad_norm": 2.888345956802368, + "learning_rate": 1.3730907538782976e-06, + "loss": 0.4209, + "step": 8269 + }, + { + "epoch": 3.91016548463357, + "grad_norm": 2.8903355598449707, + "learning_rate": 1.3725339339875252e-06, + "loss": 0.3612, + "step": 8270 + }, + { + "epoch": 3.9106382978723406, + "grad_norm": 3.2661736011505127, + "learning_rate": 1.371977184299122e-06, + "loss": 0.4151, + "step": 8271 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 3.1532459259033203, + "learning_rate": 1.3714205048477535e-06, + "loss": 0.3706, + "step": 8272 + }, + { + "epoch": 3.911583924349882, + "grad_norm": 2.907306432723999, + "learning_rate": 1.3708638956680804e-06, + "loss": 0.4113, + "step": 8273 + }, + { + "epoch": 3.9120567375886526, + "grad_norm": 2.7301599979400635, + "learning_rate": 1.3703073567947622e-06, + "loss": 0.355, + "step": 8274 + }, + { + "epoch": 3.912529550827423, + "grad_norm": 2.595625877380371, + "learning_rate": 1.3697508882624516e-06, + "loss": 0.3733, + "step": 8275 + }, + { + "epoch": 3.9130023640661937, + "grad_norm": 2.784294366836548, + "learning_rate": 1.369194490105796e-06, + "loss": 0.3366, + "step": 8276 + }, + { + "epoch": 3.9134751773049645, + "grad_norm": 3.0179800987243652, + "learning_rate": 1.3686381623594419e-06, + "loss": 0.3922, + "step": 8277 + }, + { + "epoch": 3.9139479905437353, + "grad_norm": 2.6641111373901367, + "learning_rate": 1.3680819050580291e-06, + "loss": 0.3324, + "step": 8278 + }, + { + "epoch": 3.914420803782506, + "grad_norm": 2.917741060256958, + "learning_rate": 1.3675257182361923e-06, + "loss": 0.3784, + "step": 8279 + }, + { + "epoch": 3.9148936170212765, + "grad_norm": 2.959599018096924, + "learning_rate": 1.3669696019285626e-06, + "loss": 0.3846, + "step": 8280 + }, + { + "epoch": 3.9153664302600473, + "grad_norm": 3.078824043273926, + "learning_rate": 1.3664135561697683e-06, + "loss": 0.4357, + "step": 8281 + }, + { + "epoch": 3.915839243498818, + "grad_norm": 3.0174930095672607, + "learning_rate": 1.3658575809944313e-06, + "loss": 0.3643, + "step": 8282 + }, + { + "epoch": 3.9163120567375884, + "grad_norm": 2.6805408000946045, + "learning_rate": 1.365301676437169e-06, + "loss": 0.3193, + "step": 8283 + }, + { + "epoch": 3.916784869976359, + "grad_norm": 2.6996054649353027, + "learning_rate": 1.3647458425325966e-06, + "loss": 0.3378, + "step": 8284 + }, + { + "epoch": 3.91725768321513, + "grad_norm": 2.7950546741485596, + "learning_rate": 1.3641900793153223e-06, + "loss": 0.3864, + "step": 8285 + }, + { + "epoch": 3.917730496453901, + "grad_norm": 2.9658634662628174, + "learning_rate": 1.363634386819951e-06, + "loss": 0.3452, + "step": 8286 + }, + { + "epoch": 3.9182033096926716, + "grad_norm": 3.0684404373168945, + "learning_rate": 1.363078765081084e-06, + 
"loss": 0.3278, + "step": 8287 + }, + { + "epoch": 3.918676122931442, + "grad_norm": 3.0293614864349365, + "learning_rate": 1.3625232141333164e-06, + "loss": 0.3827, + "step": 8288 + }, + { + "epoch": 3.9191489361702128, + "grad_norm": 2.9969890117645264, + "learning_rate": 1.3619677340112413e-06, + "loss": 0.3412, + "step": 8289 + }, + { + "epoch": 3.9196217494089836, + "grad_norm": 2.991654396057129, + "learning_rate": 1.3614123247494457e-06, + "loss": 0.3683, + "step": 8290 + }, + { + "epoch": 3.920094562647754, + "grad_norm": 3.032158374786377, + "learning_rate": 1.360856986382511e-06, + "loss": 0.421, + "step": 8291 + }, + { + "epoch": 3.9205673758865247, + "grad_norm": 3.1413731575012207, + "learning_rate": 1.3603017189450173e-06, + "loss": 0.3818, + "step": 8292 + }, + { + "epoch": 3.9210401891252955, + "grad_norm": 3.295527219772339, + "learning_rate": 1.3597465224715387e-06, + "loss": 0.4828, + "step": 8293 + }, + { + "epoch": 3.9215130023640663, + "grad_norm": 3.116053581237793, + "learning_rate": 1.359191396996643e-06, + "loss": 0.4108, + "step": 8294 + }, + { + "epoch": 3.921985815602837, + "grad_norm": 2.957446336746216, + "learning_rate": 1.3586363425548975e-06, + "loss": 0.3482, + "step": 8295 + }, + { + "epoch": 3.9224586288416075, + "grad_norm": 2.745471715927124, + "learning_rate": 1.3580813591808627e-06, + "loss": 0.4184, + "step": 8296 + }, + { + "epoch": 3.9229314420803783, + "grad_norm": 3.0920722484588623, + "learning_rate": 1.3575264469090943e-06, + "loss": 0.3826, + "step": 8297 + }, + { + "epoch": 3.923404255319149, + "grad_norm": 2.8719749450683594, + "learning_rate": 1.3569716057741444e-06, + "loss": 0.3953, + "step": 8298 + }, + { + "epoch": 3.9238770685579194, + "grad_norm": 3.1278762817382812, + "learning_rate": 1.3564168358105597e-06, + "loss": 0.3658, + "step": 8299 + }, + { + "epoch": 3.92434988179669, + "grad_norm": 2.7752785682678223, + "learning_rate": 1.3558621370528851e-06, + "loss": 0.3447, + "step": 8300 + }, + { + "epoch": 3.924822695035461, + "grad_norm": 2.948575735092163, + "learning_rate": 1.3553075095356575e-06, + "loss": 0.3803, + "step": 8301 + }, + { + "epoch": 3.925295508274232, + "grad_norm": 2.8164193630218506, + "learning_rate": 1.354752953293413e-06, + "loss": 0.3724, + "step": 8302 + }, + { + "epoch": 3.9257683215130026, + "grad_norm": 3.2431271076202393, + "learning_rate": 1.3541984683606798e-06, + "loss": 0.382, + "step": 8303 + }, + { + "epoch": 3.926241134751773, + "grad_norm": 2.8485286235809326, + "learning_rate": 1.353644054771983e-06, + "loss": 0.3632, + "step": 8304 + }, + { + "epoch": 3.9267139479905437, + "grad_norm": 3.334914445877075, + "learning_rate": 1.3530897125618456e-06, + "loss": 0.5286, + "step": 8305 + }, + { + "epoch": 3.9271867612293145, + "grad_norm": 3.3895132541656494, + "learning_rate": 1.3525354417647815e-06, + "loss": 0.3838, + "step": 8306 + }, + { + "epoch": 3.927659574468085, + "grad_norm": 3.141935110092163, + "learning_rate": 1.351981242415305e-06, + "loss": 0.3928, + "step": 8307 + }, + { + "epoch": 3.9281323877068557, + "grad_norm": 3.3013596534729004, + "learning_rate": 1.3514271145479225e-06, + "loss": 0.4046, + "step": 8308 + }, + { + "epoch": 3.9286052009456265, + "grad_norm": 2.8704745769500732, + "learning_rate": 1.3508730581971363e-06, + "loss": 0.3542, + "step": 8309 + }, + { + "epoch": 3.9290780141843973, + "grad_norm": 3.179405689239502, + "learning_rate": 1.3503190733974472e-06, + "loss": 0.3911, + "step": 8310 + }, + { + "epoch": 3.929550827423168, + "grad_norm": 3.1091885566711426, + 
"learning_rate": 1.3497651601833481e-06, + "loss": 0.3552, + "step": 8311 + }, + { + "epoch": 3.9300236406619384, + "grad_norm": 2.687678813934326, + "learning_rate": 1.3492113185893288e-06, + "loss": 0.3462, + "step": 8312 + }, + { + "epoch": 3.9304964539007092, + "grad_norm": 3.4954965114593506, + "learning_rate": 1.3486575486498749e-06, + "loss": 0.4358, + "step": 8313 + }, + { + "epoch": 3.93096926713948, + "grad_norm": 2.8652899265289307, + "learning_rate": 1.3481038503994652e-06, + "loss": 0.3434, + "step": 8314 + }, + { + "epoch": 3.9314420803782504, + "grad_norm": 3.927623748779297, + "learning_rate": 1.3475502238725797e-06, + "loss": 0.4662, + "step": 8315 + }, + { + "epoch": 3.931914893617021, + "grad_norm": 3.1166276931762695, + "learning_rate": 1.346996669103687e-06, + "loss": 0.3953, + "step": 8316 + }, + { + "epoch": 3.932387706855792, + "grad_norm": 3.140003204345703, + "learning_rate": 1.346443186127257e-06, + "loss": 0.3616, + "step": 8317 + }, + { + "epoch": 3.9328605200945628, + "grad_norm": 3.335466146469116, + "learning_rate": 1.3458897749777516e-06, + "loss": 0.3854, + "step": 8318 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 2.8305466175079346, + "learning_rate": 1.3453364356896282e-06, + "loss": 0.374, + "step": 8319 + }, + { + "epoch": 3.933806146572104, + "grad_norm": 2.9511806964874268, + "learning_rate": 1.344783168297343e-06, + "loss": 0.4235, + "step": 8320 + }, + { + "epoch": 3.9342789598108747, + "grad_norm": 3.1868233680725098, + "learning_rate": 1.3442299728353448e-06, + "loss": 0.4384, + "step": 8321 + }, + { + "epoch": 3.9347517730496455, + "grad_norm": 3.1358237266540527, + "learning_rate": 1.3436768493380766e-06, + "loss": 0.4011, + "step": 8322 + }, + { + "epoch": 3.935224586288416, + "grad_norm": 3.126192808151245, + "learning_rate": 1.343123797839982e-06, + "loss": 0.4061, + "step": 8323 + }, + { + "epoch": 3.9356973995271867, + "grad_norm": 2.9724647998809814, + "learning_rate": 1.3425708183754949e-06, + "loss": 0.3859, + "step": 8324 + }, + { + "epoch": 3.9361702127659575, + "grad_norm": 3.1526355743408203, + "learning_rate": 1.3420179109790485e-06, + "loss": 0.3543, + "step": 8325 + }, + { + "epoch": 3.9366430260047283, + "grad_norm": 3.1289172172546387, + "learning_rate": 1.3414650756850695e-06, + "loss": 0.3836, + "step": 8326 + }, + { + "epoch": 3.937115839243499, + "grad_norm": 2.851264715194702, + "learning_rate": 1.34091231252798e-06, + "loss": 0.3294, + "step": 8327 + }, + { + "epoch": 3.9375886524822694, + "grad_norm": 2.921872138977051, + "learning_rate": 1.3403596215421981e-06, + "loss": 0.3698, + "step": 8328 + }, + { + "epoch": 3.93806146572104, + "grad_norm": 2.947258234024048, + "learning_rate": 1.339807002762137e-06, + "loss": 0.3616, + "step": 8329 + }, + { + "epoch": 3.938534278959811, + "grad_norm": 3.011021375656128, + "learning_rate": 1.3392544562222077e-06, + "loss": 0.3387, + "step": 8330 + }, + { + "epoch": 3.9390070921985814, + "grad_norm": 3.5230746269226074, + "learning_rate": 1.3387019819568134e-06, + "loss": 0.4054, + "step": 8331 + }, + { + "epoch": 3.939479905437352, + "grad_norm": 3.120321035385132, + "learning_rate": 1.3381495800003536e-06, + "loss": 0.4389, + "step": 8332 + }, + { + "epoch": 3.939952718676123, + "grad_norm": 3.0090999603271484, + "learning_rate": 1.3375972503872259e-06, + "loss": 0.4158, + "step": 8333 + }, + { + "epoch": 3.9404255319148938, + "grad_norm": 3.4807989597320557, + "learning_rate": 1.3370449931518198e-06, + "loss": 0.4144, + "step": 8334 + }, + { + "epoch": 
3.9408983451536646, + "grad_norm": 2.8535733222961426, + "learning_rate": 1.336492808328523e-06, + "loss": 0.4281, + "step": 8335 + }, + { + "epoch": 3.941371158392435, + "grad_norm": 2.9032745361328125, + "learning_rate": 1.3359406959517174e-06, + "loss": 0.3389, + "step": 8336 + }, + { + "epoch": 3.9418439716312057, + "grad_norm": 2.725823163986206, + "learning_rate": 1.3353886560557793e-06, + "loss": 0.369, + "step": 8337 + }, + { + "epoch": 3.9423167848699765, + "grad_norm": 3.1965179443359375, + "learning_rate": 1.3348366886750844e-06, + "loss": 0.4031, + "step": 8338 + }, + { + "epoch": 3.942789598108747, + "grad_norm": 2.6991076469421387, + "learning_rate": 1.3342847938439985e-06, + "loss": 0.3434, + "step": 8339 + }, + { + "epoch": 3.9432624113475176, + "grad_norm": 4.491400718688965, + "learning_rate": 1.3337329715968877e-06, + "loss": 0.4175, + "step": 8340 + }, + { + "epoch": 3.9437352245862884, + "grad_norm": 4.005452632904053, + "learning_rate": 1.3331812219681112e-06, + "loss": 0.4191, + "step": 8341 + }, + { + "epoch": 3.9442080378250592, + "grad_norm": 3.1575794219970703, + "learning_rate": 1.3326295449920238e-06, + "loss": 0.4135, + "step": 8342 + }, + { + "epoch": 3.94468085106383, + "grad_norm": 3.2383973598480225, + "learning_rate": 1.3320779407029755e-06, + "loss": 0.38, + "step": 8343 + }, + { + "epoch": 3.9451536643026004, + "grad_norm": 2.873703718185425, + "learning_rate": 1.3315264091353119e-06, + "loss": 0.4128, + "step": 8344 + }, + { + "epoch": 3.945626477541371, + "grad_norm": 2.947274923324585, + "learning_rate": 1.330974950323376e-06, + "loss": 0.3342, + "step": 8345 + }, + { + "epoch": 3.946099290780142, + "grad_norm": 3.2874088287353516, + "learning_rate": 1.330423564301504e-06, + "loss": 0.3849, + "step": 8346 + }, + { + "epoch": 3.9465721040189123, + "grad_norm": 2.885772466659546, + "learning_rate": 1.3298722511040275e-06, + "loss": 0.3562, + "step": 8347 + }, + { + "epoch": 3.947044917257683, + "grad_norm": 3.0031309127807617, + "learning_rate": 1.3293210107652753e-06, + "loss": 0.3593, + "step": 8348 + }, + { + "epoch": 3.947517730496454, + "grad_norm": 2.815854549407959, + "learning_rate": 1.3287698433195712e-06, + "loss": 0.3633, + "step": 8349 + }, + { + "epoch": 3.9479905437352247, + "grad_norm": 3.228415012359619, + "learning_rate": 1.328218748801232e-06, + "loss": 0.3869, + "step": 8350 + }, + { + "epoch": 3.9484633569739955, + "grad_norm": 2.8497684001922607, + "learning_rate": 1.3276677272445743e-06, + "loss": 0.3833, + "step": 8351 + }, + { + "epoch": 3.948936170212766, + "grad_norm": 3.2330706119537354, + "learning_rate": 1.3271167786839057e-06, + "loss": 0.4414, + "step": 8352 + }, + { + "epoch": 3.9494089834515367, + "grad_norm": 3.2179152965545654, + "learning_rate": 1.3265659031535332e-06, + "loss": 0.3821, + "step": 8353 + }, + { + "epoch": 3.9498817966903075, + "grad_norm": 3.0861377716064453, + "learning_rate": 1.3260151006877567e-06, + "loss": 0.4014, + "step": 8354 + }, + { + "epoch": 3.950354609929078, + "grad_norm": 3.118872880935669, + "learning_rate": 1.325464371320871e-06, + "loss": 0.3836, + "step": 8355 + }, + { + "epoch": 3.9508274231678486, + "grad_norm": 2.9787702560424805, + "learning_rate": 1.32491371508717e-06, + "loss": 0.3794, + "step": 8356 + }, + { + "epoch": 3.9513002364066194, + "grad_norm": 3.132089853286743, + "learning_rate": 1.3243631320209387e-06, + "loss": 0.3698, + "step": 8357 + }, + { + "epoch": 3.9517730496453902, + "grad_norm": 2.5304882526397705, + "learning_rate": 1.32381262215646e-06, + "loss": 
0.3687, + "step": 8358 + }, + { + "epoch": 3.952245862884161, + "grad_norm": 2.9121861457824707, + "learning_rate": 1.3232621855280126e-06, + "loss": 0.3704, + "step": 8359 + }, + { + "epoch": 3.9527186761229314, + "grad_norm": 3.0885608196258545, + "learning_rate": 1.3227118221698688e-06, + "loss": 0.4303, + "step": 8360 + }, + { + "epoch": 3.953191489361702, + "grad_norm": 2.7274837493896484, + "learning_rate": 1.3221615321162979e-06, + "loss": 0.3556, + "step": 8361 + }, + { + "epoch": 3.953664302600473, + "grad_norm": 3.1329922676086426, + "learning_rate": 1.3216113154015625e-06, + "loss": 0.4042, + "step": 8362 + }, + { + "epoch": 3.9541371158392433, + "grad_norm": 2.937380313873291, + "learning_rate": 1.3210611720599243e-06, + "loss": 0.3358, + "step": 8363 + }, + { + "epoch": 3.954609929078014, + "grad_norm": 2.939194440841675, + "learning_rate": 1.3205111021256378e-06, + "loss": 0.3885, + "step": 8364 + }, + { + "epoch": 3.955082742316785, + "grad_norm": 2.9151997566223145, + "learning_rate": 1.3199611056329516e-06, + "loss": 0.4094, + "step": 8365 + }, + { + "epoch": 3.9555555555555557, + "grad_norm": 3.029733419418335, + "learning_rate": 1.3194111826161143e-06, + "loss": 0.3999, + "step": 8366 + }, + { + "epoch": 3.9560283687943265, + "grad_norm": 2.7899951934814453, + "learning_rate": 1.3188613331093653e-06, + "loss": 0.321, + "step": 8367 + }, + { + "epoch": 3.956501182033097, + "grad_norm": 3.1109507083892822, + "learning_rate": 1.3183115571469425e-06, + "loss": 0.4266, + "step": 8368 + }, + { + "epoch": 3.9569739952718677, + "grad_norm": 3.085594415664673, + "learning_rate": 1.3177618547630774e-06, + "loss": 0.4412, + "step": 8369 + }, + { + "epoch": 3.9574468085106385, + "grad_norm": 3.0980300903320312, + "learning_rate": 1.3172122259919968e-06, + "loss": 0.3385, + "step": 8370 + }, + { + "epoch": 3.957919621749409, + "grad_norm": 3.103438138961792, + "learning_rate": 1.3166626708679256e-06, + "loss": 0.3887, + "step": 8371 + }, + { + "epoch": 3.9583924349881796, + "grad_norm": 2.8235526084899902, + "learning_rate": 1.3161131894250812e-06, + "loss": 0.3759, + "step": 8372 + }, + { + "epoch": 3.9588652482269504, + "grad_norm": 2.8316404819488525, + "learning_rate": 1.3155637816976762e-06, + "loss": 0.3666, + "step": 8373 + }, + { + "epoch": 3.959338061465721, + "grad_norm": 2.7873756885528564, + "learning_rate": 1.3150144477199218e-06, + "loss": 0.3284, + "step": 8374 + }, + { + "epoch": 3.959810874704492, + "grad_norm": 3.355039119720459, + "learning_rate": 1.3144651875260218e-06, + "loss": 0.4197, + "step": 8375 + }, + { + "epoch": 3.9602836879432624, + "grad_norm": 3.477721929550171, + "learning_rate": 1.3139160011501761e-06, + "loss": 0.3298, + "step": 8376 + }, + { + "epoch": 3.960756501182033, + "grad_norm": 3.557152032852173, + "learning_rate": 1.3133668886265805e-06, + "loss": 0.3788, + "step": 8377 + }, + { + "epoch": 3.961229314420804, + "grad_norm": 3.06707763671875, + "learning_rate": 1.312817849989424e-06, + "loss": 0.3613, + "step": 8378 + }, + { + "epoch": 3.9617021276595743, + "grad_norm": 2.7702202796936035, + "learning_rate": 1.3122688852728956e-06, + "loss": 0.402, + "step": 8379 + }, + { + "epoch": 3.962174940898345, + "grad_norm": 2.8121016025543213, + "learning_rate": 1.3117199945111746e-06, + "loss": 0.3576, + "step": 8380 + }, + { + "epoch": 3.962647754137116, + "grad_norm": 2.809282064437866, + "learning_rate": 1.3111711777384403e-06, + "loss": 0.3741, + "step": 8381 + }, + { + "epoch": 3.9631205673758867, + "grad_norm": 3.1175687313079834, + 
"learning_rate": 1.3106224349888638e-06, + "loss": 0.3388, + "step": 8382 + }, + { + "epoch": 3.963593380614657, + "grad_norm": 2.930525064468384, + "learning_rate": 1.310073766296612e-06, + "loss": 0.3593, + "step": 8383 + }, + { + "epoch": 3.964066193853428, + "grad_norm": 3.0673177242279053, + "learning_rate": 1.3095251716958501e-06, + "loss": 0.402, + "step": 8384 + }, + { + "epoch": 3.9645390070921986, + "grad_norm": 2.9725706577301025, + "learning_rate": 1.3089766512207347e-06, + "loss": 0.3707, + "step": 8385 + }, + { + "epoch": 3.965011820330969, + "grad_norm": 2.9790916442871094, + "learning_rate": 1.3084282049054218e-06, + "loss": 0.3292, + "step": 8386 + }, + { + "epoch": 3.96548463356974, + "grad_norm": 3.257035493850708, + "learning_rate": 1.3078798327840598e-06, + "loss": 0.3753, + "step": 8387 + }, + { + "epoch": 3.9659574468085106, + "grad_norm": 3.0534379482269287, + "learning_rate": 1.307331534890792e-06, + "loss": 0.4134, + "step": 8388 + }, + { + "epoch": 3.9664302600472814, + "grad_norm": 2.919243812561035, + "learning_rate": 1.306783311259761e-06, + "loss": 0.4283, + "step": 8389 + }, + { + "epoch": 3.966903073286052, + "grad_norm": 2.7643322944641113, + "learning_rate": 1.306235161925101e-06, + "loss": 0.3454, + "step": 8390 + }, + { + "epoch": 3.9673758865248225, + "grad_norm": 3.0208916664123535, + "learning_rate": 1.3056870869209431e-06, + "loss": 0.385, + "step": 8391 + }, + { + "epoch": 3.9678486997635933, + "grad_norm": 2.8657243251800537, + "learning_rate": 1.3051390862814135e-06, + "loss": 0.3614, + "step": 8392 + }, + { + "epoch": 3.968321513002364, + "grad_norm": 3.2093591690063477, + "learning_rate": 1.3045911600406325e-06, + "loss": 0.3774, + "step": 8393 + }, + { + "epoch": 3.9687943262411345, + "grad_norm": 3.091618537902832, + "learning_rate": 1.3040433082327192e-06, + "loss": 0.4157, + "step": 8394 + }, + { + "epoch": 3.9692671394799053, + "grad_norm": 2.99763560295105, + "learning_rate": 1.3034955308917849e-06, + "loss": 0.4017, + "step": 8395 + }, + { + "epoch": 3.969739952718676, + "grad_norm": 3.063109874725342, + "learning_rate": 1.3029478280519364e-06, + "loss": 0.4568, + "step": 8396 + }, + { + "epoch": 3.970212765957447, + "grad_norm": 3.2660679817199707, + "learning_rate": 1.3024001997472791e-06, + "loss": 0.3999, + "step": 8397 + }, + { + "epoch": 3.9706855791962177, + "grad_norm": 2.860121250152588, + "learning_rate": 1.3018526460119088e-06, + "loss": 0.433, + "step": 8398 + }, + { + "epoch": 3.971158392434988, + "grad_norm": 3.1037673950195312, + "learning_rate": 1.3013051668799216e-06, + "loss": 0.4526, + "step": 8399 + }, + { + "epoch": 3.971631205673759, + "grad_norm": 2.9408578872680664, + "learning_rate": 1.3007577623854053e-06, + "loss": 0.3722, + "step": 8400 + }, + { + "epoch": 3.9721040189125296, + "grad_norm": 3.0684635639190674, + "learning_rate": 1.3002104325624436e-06, + "loss": 0.3789, + "step": 8401 + }, + { + "epoch": 3.9725768321513, + "grad_norm": 2.6469366550445557, + "learning_rate": 1.2996631774451187e-06, + "loss": 0.3409, + "step": 8402 + }, + { + "epoch": 3.9730496453900708, + "grad_norm": 3.3741610050201416, + "learning_rate": 1.2991159970675033e-06, + "loss": 0.3544, + "step": 8403 + }, + { + "epoch": 3.9735224586288416, + "grad_norm": 3.3716588020324707, + "learning_rate": 1.2985688914636701e-06, + "loss": 0.3747, + "step": 8404 + }, + { + "epoch": 3.9739952718676124, + "grad_norm": 3.000469923019409, + "learning_rate": 1.2980218606676837e-06, + "loss": 0.4506, + "step": 8405 + }, + { + "epoch": 
3.974468085106383, + "grad_norm": 3.0139408111572266, + "learning_rate": 1.2974749047136057e-06, + "loss": 0.4156, + "step": 8406 + }, + { + "epoch": 3.9749408983451535, + "grad_norm": 2.9494218826293945, + "learning_rate": 1.2969280236354925e-06, + "loss": 0.3378, + "step": 8407 + }, + { + "epoch": 3.9754137115839243, + "grad_norm": 2.6061158180236816, + "learning_rate": 1.2963812174673948e-06, + "loss": 0.3887, + "step": 8408 + }, + { + "epoch": 3.975886524822695, + "grad_norm": 2.873987913131714, + "learning_rate": 1.295834486243362e-06, + "loss": 0.3202, + "step": 8409 + }, + { + "epoch": 3.9763593380614655, + "grad_norm": 3.0106539726257324, + "learning_rate": 1.2952878299974358e-06, + "loss": 0.4142, + "step": 8410 + }, + { + "epoch": 3.9768321513002363, + "grad_norm": 3.0011982917785645, + "learning_rate": 1.2947412487636527e-06, + "loss": 0.4121, + "step": 8411 + }, + { + "epoch": 3.977304964539007, + "grad_norm": 3.1321003437042236, + "learning_rate": 1.294194742576048e-06, + "loss": 0.4033, + "step": 8412 + }, + { + "epoch": 3.977777777777778, + "grad_norm": 2.812255382537842, + "learning_rate": 1.2936483114686487e-06, + "loss": 0.3414, + "step": 8413 + }, + { + "epoch": 3.9782505910165487, + "grad_norm": 2.9594221115112305, + "learning_rate": 1.2931019554754804e-06, + "loss": 0.3666, + "step": 8414 + }, + { + "epoch": 3.978723404255319, + "grad_norm": 3.119440793991089, + "learning_rate": 1.2925556746305612e-06, + "loss": 0.3902, + "step": 8415 + }, + { + "epoch": 3.97919621749409, + "grad_norm": 3.042102098464966, + "learning_rate": 1.2920094689679047e-06, + "loss": 0.344, + "step": 8416 + }, + { + "epoch": 3.9796690307328606, + "grad_norm": 2.8443872928619385, + "learning_rate": 1.2914633385215225e-06, + "loss": 0.372, + "step": 8417 + }, + { + "epoch": 3.980141843971631, + "grad_norm": 3.483201265335083, + "learning_rate": 1.2909172833254187e-06, + "loss": 0.4028, + "step": 8418 + }, + { + "epoch": 3.9806146572104018, + "grad_norm": 2.966996431350708, + "learning_rate": 1.2903713034135934e-06, + "loss": 0.3527, + "step": 8419 + }, + { + "epoch": 3.9810874704491725, + "grad_norm": 2.7813172340393066, + "learning_rate": 1.2898253988200437e-06, + "loss": 0.3873, + "step": 8420 + }, + { + "epoch": 3.9815602836879433, + "grad_norm": 3.24611234664917, + "learning_rate": 1.2892795695787602e-06, + "loss": 0.4783, + "step": 8421 + }, + { + "epoch": 3.982033096926714, + "grad_norm": 3.345573663711548, + "learning_rate": 1.2887338157237289e-06, + "loss": 0.4179, + "step": 8422 + }, + { + "epoch": 3.9825059101654845, + "grad_norm": 3.1726880073547363, + "learning_rate": 1.288188137288931e-06, + "loss": 0.3725, + "step": 8423 + }, + { + "epoch": 3.9829787234042553, + "grad_norm": 3.398966073989868, + "learning_rate": 1.2876425343083449e-06, + "loss": 0.4117, + "step": 8424 + }, + { + "epoch": 3.983451536643026, + "grad_norm": 2.615680456161499, + "learning_rate": 1.2870970068159423e-06, + "loss": 0.324, + "step": 8425 + }, + { + "epoch": 3.9839243498817964, + "grad_norm": 3.0505547523498535, + "learning_rate": 1.2865515548456893e-06, + "loss": 0.3698, + "step": 8426 + }, + { + "epoch": 3.9843971631205672, + "grad_norm": 3.077404260635376, + "learning_rate": 1.2860061784315514e-06, + "loss": 0.3592, + "step": 8427 + }, + { + "epoch": 3.984869976359338, + "grad_norm": 2.654080390930176, + "learning_rate": 1.2854608776074855e-06, + "loss": 0.3451, + "step": 8428 + }, + { + "epoch": 3.985342789598109, + "grad_norm": 3.1023523807525635, + "learning_rate": 1.284915652407444e-06, + "loss": 
0.3809, + "step": 8429 + }, + { + "epoch": 3.9858156028368796, + "grad_norm": 3.0526652336120605, + "learning_rate": 1.2843705028653783e-06, + "loss": 0.3633, + "step": 8430 + }, + { + "epoch": 3.98628841607565, + "grad_norm": 2.7829604148864746, + "learning_rate": 1.2838254290152296e-06, + "loss": 0.3213, + "step": 8431 + }, + { + "epoch": 3.986761229314421, + "grad_norm": 3.2218687534332275, + "learning_rate": 1.28328043089094e-06, + "loss": 0.465, + "step": 8432 + }, + { + "epoch": 3.9872340425531916, + "grad_norm": 2.952998161315918, + "learning_rate": 1.2827355085264425e-06, + "loss": 0.4405, + "step": 8433 + }, + { + "epoch": 3.987706855791962, + "grad_norm": 2.81211519241333, + "learning_rate": 1.2821906619556667e-06, + "loss": 0.3444, + "step": 8434 + }, + { + "epoch": 3.9881796690307327, + "grad_norm": 3.1707375049591064, + "learning_rate": 1.281645891212539e-06, + "loss": 0.4019, + "step": 8435 + }, + { + "epoch": 3.9886524822695035, + "grad_norm": 2.791504383087158, + "learning_rate": 1.2811011963309788e-06, + "loss": 0.3606, + "step": 8436 + }, + { + "epoch": 3.9891252955082743, + "grad_norm": 2.954782247543335, + "learning_rate": 1.280556577344903e-06, + "loss": 0.3141, + "step": 8437 + }, + { + "epoch": 3.989598108747045, + "grad_norm": 2.718273878097534, + "learning_rate": 1.2800120342882223e-06, + "loss": 0.3715, + "step": 8438 + }, + { + "epoch": 3.9900709219858155, + "grad_norm": 3.2916250228881836, + "learning_rate": 1.2794675671948425e-06, + "loss": 0.4048, + "step": 8439 + }, + { + "epoch": 3.9905437352245863, + "grad_norm": 3.060060977935791, + "learning_rate": 1.2789231760986655e-06, + "loss": 0.4032, + "step": 8440 + }, + { + "epoch": 3.991016548463357, + "grad_norm": 2.8467273712158203, + "learning_rate": 1.2783788610335882e-06, + "loss": 0.4041, + "step": 8441 + }, + { + "epoch": 3.9914893617021274, + "grad_norm": 3.161790132522583, + "learning_rate": 1.2778346220335013e-06, + "loss": 0.4049, + "step": 8442 + }, + { + "epoch": 3.9919621749408982, + "grad_norm": 2.6512296199798584, + "learning_rate": 1.277290459132295e-06, + "loss": 0.3598, + "step": 8443 + }, + { + "epoch": 3.992434988179669, + "grad_norm": 2.792736291885376, + "learning_rate": 1.276746372363849e-06, + "loss": 0.3874, + "step": 8444 + }, + { + "epoch": 3.99290780141844, + "grad_norm": 2.887047052383423, + "learning_rate": 1.2762023617620433e-06, + "loss": 0.4255, + "step": 8445 + }, + { + "epoch": 3.9933806146572106, + "grad_norm": 3.0420780181884766, + "learning_rate": 1.275658427360751e-06, + "loss": 0.4489, + "step": 8446 + }, + { + "epoch": 3.993853427895981, + "grad_norm": 3.107618570327759, + "learning_rate": 1.2751145691938383e-06, + "loss": 0.4354, + "step": 8447 + }, + { + "epoch": 3.9943262411347518, + "grad_norm": 2.656224250793457, + "learning_rate": 1.2745707872951718e-06, + "loss": 0.4188, + "step": 8448 + }, + { + "epoch": 3.9947990543735226, + "grad_norm": 2.9895219802856445, + "learning_rate": 1.2740270816986079e-06, + "loss": 0.391, + "step": 8449 + }, + { + "epoch": 3.995271867612293, + "grad_norm": 2.919255018234253, + "learning_rate": 1.2734834524380025e-06, + "loss": 0.4058, + "step": 8450 + }, + { + "epoch": 3.9957446808510637, + "grad_norm": 3.4418535232543945, + "learning_rate": 1.2729398995472048e-06, + "loss": 0.3977, + "step": 8451 + }, + { + "epoch": 3.9962174940898345, + "grad_norm": 2.980224132537842, + "learning_rate": 1.272396423060058e-06, + "loss": 0.4417, + "step": 8452 + }, + { + "epoch": 3.9966903073286053, + "grad_norm": 3.6488101482391357, + "learning_rate": 
1.2718530230104043e-06, + "loss": 0.4472, + "step": 8453 + }, + { + "epoch": 3.997163120567376, + "grad_norm": 2.725437641143799, + "learning_rate": 1.2713096994320774e-06, + "loss": 0.3125, + "step": 8454 + }, + { + "epoch": 3.9976359338061465, + "grad_norm": 3.453794002532959, + "learning_rate": 1.2707664523589076e-06, + "loss": 0.3792, + "step": 8455 + }, + { + "epoch": 3.9981087470449173, + "grad_norm": 2.8443076610565186, + "learning_rate": 1.270223281824721e-06, + "loss": 0.3627, + "step": 8456 + }, + { + "epoch": 3.998581560283688, + "grad_norm": 3.1851959228515625, + "learning_rate": 1.2696801878633372e-06, + "loss": 0.3745, + "step": 8457 + }, + { + "epoch": 3.9990543735224584, + "grad_norm": 2.897239923477173, + "learning_rate": 1.2691371705085743e-06, + "loss": 0.3817, + "step": 8458 + }, + { + "epoch": 3.999527186761229, + "grad_norm": 2.92111873626709, + "learning_rate": 1.2685942297942416e-06, + "loss": 0.3824, + "step": 8459 + }, + { + "epoch": 4.0, + "grad_norm": 2.9540340900421143, + "learning_rate": 1.268051365754148e-06, + "loss": 0.3656, + "step": 8460 + }, + { + "epoch": 4.000472813238771, + "grad_norm": 2.6815075874328613, + "learning_rate": 1.2675085784220936e-06, + "loss": 0.3296, + "step": 8461 + }, + { + "epoch": 4.000945626477542, + "grad_norm": 3.0823302268981934, + "learning_rate": 1.2669658678318747e-06, + "loss": 0.3918, + "step": 8462 + }, + { + "epoch": 4.001418439716312, + "grad_norm": 2.5451176166534424, + "learning_rate": 1.2664232340172855e-06, + "loss": 0.3528, + "step": 8463 + }, + { + "epoch": 4.001891252955082, + "grad_norm": 2.539541721343994, + "learning_rate": 1.2658806770121119e-06, + "loss": 0.3034, + "step": 8464 + }, + { + "epoch": 4.002364066193853, + "grad_norm": 2.5537798404693604, + "learning_rate": 1.2653381968501374e-06, + "loss": 0.2981, + "step": 8465 + }, + { + "epoch": 4.002836879432624, + "grad_norm": 2.6316089630126953, + "learning_rate": 1.26479579356514e-06, + "loss": 0.3328, + "step": 8466 + }, + { + "epoch": 4.003309692671395, + "grad_norm": 3.080700635910034, + "learning_rate": 1.2642534671908914e-06, + "loss": 0.3471, + "step": 8467 + }, + { + "epoch": 4.0037825059101655, + "grad_norm": 3.0111753940582275, + "learning_rate": 1.2637112177611614e-06, + "loss": 0.3147, + "step": 8468 + }, + { + "epoch": 4.004255319148936, + "grad_norm": 2.759606122970581, + "learning_rate": 1.2631690453097128e-06, + "loss": 0.2634, + "step": 8469 + }, + { + "epoch": 4.004728132387707, + "grad_norm": 2.862098217010498, + "learning_rate": 1.2626269498703048e-06, + "loss": 0.3333, + "step": 8470 + }, + { + "epoch": 4.005200945626478, + "grad_norm": 3.122239589691162, + "learning_rate": 1.262084931476691e-06, + "loss": 0.3311, + "step": 8471 + }, + { + "epoch": 4.005673758865248, + "grad_norm": 2.6428070068359375, + "learning_rate": 1.261542990162619e-06, + "loss": 0.3534, + "step": 8472 + }, + { + "epoch": 4.006146572104019, + "grad_norm": 3.2870724201202393, + "learning_rate": 1.261001125961836e-06, + "loss": 0.3373, + "step": 8473 + }, + { + "epoch": 4.006619385342789, + "grad_norm": 2.7675375938415527, + "learning_rate": 1.26045933890808e-06, + "loss": 0.3117, + "step": 8474 + }, + { + "epoch": 4.00709219858156, + "grad_norm": 2.811736583709717, + "learning_rate": 1.2599176290350844e-06, + "loss": 0.3087, + "step": 8475 + }, + { + "epoch": 4.007565011820331, + "grad_norm": 2.9146902561187744, + "learning_rate": 1.2593759963765817e-06, + "loss": 0.336, + "step": 8476 + }, + { + "epoch": 4.008037825059102, + "grad_norm": 3.074338674545288, + 
"learning_rate": 1.2588344409662945e-06, + "loss": 0.384, + "step": 8477 + }, + { + "epoch": 4.008510638297873, + "grad_norm": 3.5597734451293945, + "learning_rate": 1.2582929628379455e-06, + "loss": 0.4061, + "step": 8478 + }, + { + "epoch": 4.008983451536643, + "grad_norm": 3.0091497898101807, + "learning_rate": 1.2577515620252489e-06, + "loss": 0.3783, + "step": 8479 + }, + { + "epoch": 4.009456264775413, + "grad_norm": 2.9654228687286377, + "learning_rate": 1.2572102385619145e-06, + "loss": 0.3541, + "step": 8480 + }, + { + "epoch": 4.009929078014184, + "grad_norm": 3.140733242034912, + "learning_rate": 1.2566689924816502e-06, + "loss": 0.3706, + "step": 8481 + }, + { + "epoch": 4.010401891252955, + "grad_norm": 3.2180161476135254, + "learning_rate": 1.2561278238181548e-06, + "loss": 0.3573, + "step": 8482 + }, + { + "epoch": 4.010874704491726, + "grad_norm": 2.91209077835083, + "learning_rate": 1.2555867326051265e-06, + "loss": 0.3619, + "step": 8483 + }, + { + "epoch": 4.0113475177304965, + "grad_norm": 3.016916036605835, + "learning_rate": 1.255045718876256e-06, + "loss": 0.3866, + "step": 8484 + }, + { + "epoch": 4.011820330969267, + "grad_norm": 3.1008472442626953, + "learning_rate": 1.2545047826652294e-06, + "loss": 0.3352, + "step": 8485 + }, + { + "epoch": 4.012293144208038, + "grad_norm": 3.136124610900879, + "learning_rate": 1.2539639240057287e-06, + "loss": 0.3478, + "step": 8486 + }, + { + "epoch": 4.012765957446809, + "grad_norm": 3.0021555423736572, + "learning_rate": 1.2534231429314299e-06, + "loss": 0.3522, + "step": 8487 + }, + { + "epoch": 4.013238770685579, + "grad_norm": 3.2261948585510254, + "learning_rate": 1.2528824394760065e-06, + "loss": 0.3632, + "step": 8488 + }, + { + "epoch": 4.01371158392435, + "grad_norm": 3.0598134994506836, + "learning_rate": 1.2523418136731252e-06, + "loss": 0.3422, + "step": 8489 + }, + { + "epoch": 4.01418439716312, + "grad_norm": 2.9821391105651855, + "learning_rate": 1.2518012655564476e-06, + "loss": 0.3324, + "step": 8490 + }, + { + "epoch": 4.014657210401891, + "grad_norm": 2.583130359649658, + "learning_rate": 1.251260795159633e-06, + "loss": 0.3509, + "step": 8491 + }, + { + "epoch": 4.015130023640662, + "grad_norm": 3.3090853691101074, + "learning_rate": 1.2507204025163333e-06, + "loss": 0.3494, + "step": 8492 + }, + { + "epoch": 4.015602836879433, + "grad_norm": 2.6412856578826904, + "learning_rate": 1.250180087660195e-06, + "loss": 0.3419, + "step": 8493 + }, + { + "epoch": 4.0160756501182036, + "grad_norm": 2.729210615158081, + "learning_rate": 1.2496398506248634e-06, + "loss": 0.3591, + "step": 8494 + }, + { + "epoch": 4.016548463356974, + "grad_norm": 2.892150402069092, + "learning_rate": 1.2490996914439745e-06, + "loss": 0.3866, + "step": 8495 + }, + { + "epoch": 4.017021276595744, + "grad_norm": 3.1967804431915283, + "learning_rate": 1.2485596101511638e-06, + "loss": 0.358, + "step": 8496 + }, + { + "epoch": 4.017494089834515, + "grad_norm": 3.0190439224243164, + "learning_rate": 1.2480196067800588e-06, + "loss": 0.3723, + "step": 8497 + }, + { + "epoch": 4.017966903073286, + "grad_norm": 2.856370210647583, + "learning_rate": 1.2474796813642822e-06, + "loss": 0.3519, + "step": 8498 + }, + { + "epoch": 4.018439716312057, + "grad_norm": 2.979842185974121, + "learning_rate": 1.2469398339374546e-06, + "loss": 0.3483, + "step": 8499 + }, + { + "epoch": 4.0189125295508275, + "grad_norm": 3.0953211784362793, + "learning_rate": 1.246400064533189e-06, + "loss": 0.355, + "step": 8500 + }, + { + "epoch": 4.019385342789598, + 
"grad_norm": 3.342609167098999, + "learning_rate": 1.2458603731850938e-06, + "loss": 0.4258, + "step": 8501 + }, + { + "epoch": 4.019858156028369, + "grad_norm": 3.2789435386657715, + "learning_rate": 1.2453207599267747e-06, + "loss": 0.3653, + "step": 8502 + }, + { + "epoch": 4.02033096926714, + "grad_norm": 2.8867030143737793, + "learning_rate": 1.2447812247918303e-06, + "loss": 0.3128, + "step": 8503 + }, + { + "epoch": 4.02080378250591, + "grad_norm": 2.9467437267303467, + "learning_rate": 1.2442417678138552e-06, + "loss": 0.3149, + "step": 8504 + }, + { + "epoch": 4.0212765957446805, + "grad_norm": 2.6293485164642334, + "learning_rate": 1.2437023890264377e-06, + "loss": 0.2751, + "step": 8505 + }, + { + "epoch": 4.021749408983451, + "grad_norm": 2.9672160148620605, + "learning_rate": 1.2431630884631648e-06, + "loss": 0.3858, + "step": 8506 + }, + { + "epoch": 4.022222222222222, + "grad_norm": 3.0518734455108643, + "learning_rate": 1.2426238661576154e-06, + "loss": 0.3404, + "step": 8507 + }, + { + "epoch": 4.022695035460993, + "grad_norm": 2.829012632369995, + "learning_rate": 1.2420847221433633e-06, + "loss": 0.3211, + "step": 8508 + }, + { + "epoch": 4.023167848699764, + "grad_norm": 2.855806589126587, + "learning_rate": 1.2415456564539808e-06, + "loss": 0.3462, + "step": 8509 + }, + { + "epoch": 4.0236406619385345, + "grad_norm": 3.491786003112793, + "learning_rate": 1.2410066691230311e-06, + "loss": 0.3793, + "step": 8510 + }, + { + "epoch": 4.024113475177305, + "grad_norm": 2.9612972736358643, + "learning_rate": 1.2404677601840765e-06, + "loss": 0.3899, + "step": 8511 + }, + { + "epoch": 4.024586288416075, + "grad_norm": 2.949498176574707, + "learning_rate": 1.2399289296706718e-06, + "loss": 0.3655, + "step": 8512 + }, + { + "epoch": 4.025059101654846, + "grad_norm": 2.736524820327759, + "learning_rate": 1.2393901776163664e-06, + "loss": 0.318, + "step": 8513 + }, + { + "epoch": 4.025531914893617, + "grad_norm": 3.005297899246216, + "learning_rate": 1.2388515040547077e-06, + "loss": 0.3484, + "step": 8514 + }, + { + "epoch": 4.026004728132388, + "grad_norm": 2.9835290908813477, + "learning_rate": 1.2383129090192361e-06, + "loss": 0.3205, + "step": 8515 + }, + { + "epoch": 4.026477541371158, + "grad_norm": 3.1437056064605713, + "learning_rate": 1.2377743925434865e-06, + "loss": 0.3524, + "step": 8516 + }, + { + "epoch": 4.026950354609929, + "grad_norm": 3.0250096321105957, + "learning_rate": 1.2372359546609917e-06, + "loss": 0.3398, + "step": 8517 + }, + { + "epoch": 4.0274231678487, + "grad_norm": 3.109083890914917, + "learning_rate": 1.2366975954052767e-06, + "loss": 0.3317, + "step": 8518 + }, + { + "epoch": 4.027895981087471, + "grad_norm": 2.7713027000427246, + "learning_rate": 1.2361593148098634e-06, + "loss": 0.335, + "step": 8519 + }, + { + "epoch": 4.028368794326241, + "grad_norm": 2.9302117824554443, + "learning_rate": 1.2356211129082673e-06, + "loss": 0.3054, + "step": 8520 + }, + { + "epoch": 4.0288416075650115, + "grad_norm": 3.1805200576782227, + "learning_rate": 1.2350829897339996e-06, + "loss": 0.3219, + "step": 8521 + }, + { + "epoch": 4.029314420803782, + "grad_norm": 3.2687618732452393, + "learning_rate": 1.2345449453205688e-06, + "loss": 0.3966, + "step": 8522 + }, + { + "epoch": 4.029787234042553, + "grad_norm": 3.2010693550109863, + "learning_rate": 1.2340069797014741e-06, + "loss": 0.3547, + "step": 8523 + }, + { + "epoch": 4.030260047281324, + "grad_norm": 2.7061285972595215, + "learning_rate": 1.233469092910215e-06, + "loss": 0.2829, + "step": 8524 + }, + 
{ + "epoch": 4.030732860520095, + "grad_norm": 3.1565401554107666, + "learning_rate": 1.2329312849802817e-06, + "loss": 0.3376, + "step": 8525 + }, + { + "epoch": 4.0312056737588655, + "grad_norm": 2.8864760398864746, + "learning_rate": 1.2323935559451603e-06, + "loss": 0.3946, + "step": 8526 + }, + { + "epoch": 4.031678486997636, + "grad_norm": 3.4621710777282715, + "learning_rate": 1.2318559058383348e-06, + "loss": 0.3859, + "step": 8527 + }, + { + "epoch": 4.032151300236406, + "grad_norm": 3.074201822280884, + "learning_rate": 1.2313183346932806e-06, + "loss": 0.3583, + "step": 8528 + }, + { + "epoch": 4.032624113475177, + "grad_norm": 3.1746935844421387, + "learning_rate": 1.2307808425434715e-06, + "loss": 0.3766, + "step": 8529 + }, + { + "epoch": 4.033096926713948, + "grad_norm": 3.327202081680298, + "learning_rate": 1.2302434294223738e-06, + "loss": 0.3556, + "step": 8530 + }, + { + "epoch": 4.033569739952719, + "grad_norm": 3.375643730163574, + "learning_rate": 1.2297060953634496e-06, + "loss": 0.3574, + "step": 8531 + }, + { + "epoch": 4.034042553191489, + "grad_norm": 2.8553316593170166, + "learning_rate": 1.2291688404001573e-06, + "loss": 0.2807, + "step": 8532 + }, + { + "epoch": 4.03451536643026, + "grad_norm": 3.439772367477417, + "learning_rate": 1.2286316645659492e-06, + "loss": 0.3519, + "step": 8533 + }, + { + "epoch": 4.034988179669031, + "grad_norm": 2.794694662094116, + "learning_rate": 1.2280945678942724e-06, + "loss": 0.3117, + "step": 8534 + }, + { + "epoch": 4.035460992907802, + "grad_norm": 2.9869043827056885, + "learning_rate": 1.2275575504185697e-06, + "loss": 0.3663, + "step": 8535 + }, + { + "epoch": 4.035933806146572, + "grad_norm": 2.711435317993164, + "learning_rate": 1.2270206121722777e-06, + "loss": 0.3547, + "step": 8536 + }, + { + "epoch": 4.0364066193853425, + "grad_norm": 2.843391180038452, + "learning_rate": 1.2264837531888317e-06, + "loss": 0.3124, + "step": 8537 + }, + { + "epoch": 4.036879432624113, + "grad_norm": 3.2082388401031494, + "learning_rate": 1.225946973501658e-06, + "loss": 0.3573, + "step": 8538 + }, + { + "epoch": 4.037352245862884, + "grad_norm": 2.799604654312134, + "learning_rate": 1.2254102731441786e-06, + "loss": 0.3234, + "step": 8539 + }, + { + "epoch": 4.037825059101655, + "grad_norm": 2.682777166366577, + "learning_rate": 1.2248736521498137e-06, + "loss": 0.3087, + "step": 8540 + }, + { + "epoch": 4.038297872340426, + "grad_norm": 2.8138248920440674, + "learning_rate": 1.2243371105519741e-06, + "loss": 0.3668, + "step": 8541 + }, + { + "epoch": 4.0387706855791965, + "grad_norm": 3.3388478755950928, + "learning_rate": 1.2238006483840702e-06, + "loss": 0.3294, + "step": 8542 + }, + { + "epoch": 4.039243498817967, + "grad_norm": 3.06247615814209, + "learning_rate": 1.2232642656795039e-06, + "loss": 0.3348, + "step": 8543 + }, + { + "epoch": 4.039716312056737, + "grad_norm": 2.742628335952759, + "learning_rate": 1.2227279624716724e-06, + "loss": 0.3427, + "step": 8544 + }, + { + "epoch": 4.040189125295508, + "grad_norm": 3.0785365104675293, + "learning_rate": 1.222191738793971e-06, + "loss": 0.3762, + "step": 8545 + }, + { + "epoch": 4.040661938534279, + "grad_norm": 3.0352790355682373, + "learning_rate": 1.2216555946797862e-06, + "loss": 0.3311, + "step": 8546 + }, + { + "epoch": 4.04113475177305, + "grad_norm": 3.1949729919433594, + "learning_rate": 1.2211195301625028e-06, + "loss": 0.3429, + "step": 8547 + }, + { + "epoch": 4.04160756501182, + "grad_norm": 3.214021921157837, + "learning_rate": 1.2205835452754989e-06, + "loss": 
0.3528, + "step": 8548 + }, + { + "epoch": 4.042080378250591, + "grad_norm": 3.206296443939209, + "learning_rate": 1.2200476400521474e-06, + "loss": 0.3499, + "step": 8549 + }, + { + "epoch": 4.042553191489362, + "grad_norm": 3.0067825317382812, + "learning_rate": 1.2195118145258167e-06, + "loss": 0.3597, + "step": 8550 + }, + { + "epoch": 4.043026004728133, + "grad_norm": 2.7811057567596436, + "learning_rate": 1.21897606872987e-06, + "loss": 0.3268, + "step": 8551 + }, + { + "epoch": 4.043498817966903, + "grad_norm": 3.1679844856262207, + "learning_rate": 1.218440402697667e-06, + "loss": 0.4025, + "step": 8552 + }, + { + "epoch": 4.0439716312056735, + "grad_norm": 3.2010326385498047, + "learning_rate": 1.217904816462561e-06, + "loss": 0.3426, + "step": 8553 + }, + { + "epoch": 4.044444444444444, + "grad_norm": 3.381863832473755, + "learning_rate": 1.217369310057899e-06, + "loss": 0.3693, + "step": 8554 + }, + { + "epoch": 4.044917257683215, + "grad_norm": 3.471402168273926, + "learning_rate": 1.2168338835170267e-06, + "loss": 0.3977, + "step": 8555 + }, + { + "epoch": 4.045390070921986, + "grad_norm": 3.0549192428588867, + "learning_rate": 1.2162985368732813e-06, + "loss": 0.3262, + "step": 8556 + }, + { + "epoch": 4.045862884160757, + "grad_norm": 3.02451229095459, + "learning_rate": 1.215763270159998e-06, + "loss": 0.3408, + "step": 8557 + }, + { + "epoch": 4.0463356973995275, + "grad_norm": 3.1335513591766357, + "learning_rate": 1.215228083410505e-06, + "loss": 0.3275, + "step": 8558 + }, + { + "epoch": 4.046808510638298, + "grad_norm": 3.379655599594116, + "learning_rate": 1.2146929766581242e-06, + "loss": 0.3511, + "step": 8559 + }, + { + "epoch": 4.047281323877068, + "grad_norm": 3.210146903991699, + "learning_rate": 1.2141579499361772e-06, + "loss": 0.3607, + "step": 8560 + }, + { + "epoch": 4.047754137115839, + "grad_norm": 3.3693792819976807, + "learning_rate": 1.2136230032779753e-06, + "loss": 0.3642, + "step": 8561 + }, + { + "epoch": 4.04822695035461, + "grad_norm": 3.0397274494171143, + "learning_rate": 1.2130881367168292e-06, + "loss": 0.3376, + "step": 8562 + }, + { + "epoch": 4.048699763593381, + "grad_norm": 3.119372606277466, + "learning_rate": 1.212553350286042e-06, + "loss": 0.3581, + "step": 8563 + }, + { + "epoch": 4.049172576832151, + "grad_norm": 2.9431848526000977, + "learning_rate": 1.2120186440189124e-06, + "loss": 0.3453, + "step": 8564 + }, + { + "epoch": 4.049645390070922, + "grad_norm": 3.256748914718628, + "learning_rate": 1.2114840179487333e-06, + "loss": 0.3766, + "step": 8565 + }, + { + "epoch": 4.050118203309693, + "grad_norm": 2.792759656906128, + "learning_rate": 1.2109494721087953e-06, + "loss": 0.3396, + "step": 8566 + }, + { + "epoch": 4.050591016548464, + "grad_norm": 2.9790122509002686, + "learning_rate": 1.2104150065323813e-06, + "loss": 0.3631, + "step": 8567 + }, + { + "epoch": 4.051063829787234, + "grad_norm": 2.7998805046081543, + "learning_rate": 1.2098806212527705e-06, + "loss": 0.3442, + "step": 8568 + }, + { + "epoch": 4.0515366430260045, + "grad_norm": 3.1292848587036133, + "learning_rate": 1.2093463163032351e-06, + "loss": 0.3798, + "step": 8569 + }, + { + "epoch": 4.052009456264775, + "grad_norm": 3.156205892562866, + "learning_rate": 1.2088120917170465e-06, + "loss": 0.309, + "step": 8570 + }, + { + "epoch": 4.052482269503546, + "grad_norm": 2.8891193866729736, + "learning_rate": 1.208277947527467e-06, + "loss": 0.2989, + "step": 8571 + }, + { + "epoch": 4.052955082742317, + "grad_norm": 3.087719678878784, + "learning_rate": 
1.2077438837677548e-06, + "loss": 0.3348, + "step": 8572 + }, + { + "epoch": 4.053427895981088, + "grad_norm": 3.345583915710449, + "learning_rate": 1.2072099004711657e-06, + "loss": 0.3395, + "step": 8573 + }, + { + "epoch": 4.0539007092198585, + "grad_norm": 2.9834377765655518, + "learning_rate": 1.2066759976709463e-06, + "loss": 0.3252, + "step": 8574 + }, + { + "epoch": 4.054373522458629, + "grad_norm": 3.0764353275299072, + "learning_rate": 1.2061421754003425e-06, + "loss": 0.3467, + "step": 8575 + }, + { + "epoch": 4.054846335697399, + "grad_norm": 3.332232713699341, + "learning_rate": 1.2056084336925919e-06, + "loss": 0.3448, + "step": 8576 + }, + { + "epoch": 4.05531914893617, + "grad_norm": 3.1885993480682373, + "learning_rate": 1.2050747725809275e-06, + "loss": 0.325, + "step": 8577 + }, + { + "epoch": 4.055791962174941, + "grad_norm": 3.2727091312408447, + "learning_rate": 1.2045411920985798e-06, + "loss": 0.3755, + "step": 8578 + }, + { + "epoch": 4.0562647754137116, + "grad_norm": 3.0687687397003174, + "learning_rate": 1.2040076922787708e-06, + "loss": 0.2791, + "step": 8579 + }, + { + "epoch": 4.056737588652482, + "grad_norm": 3.2538771629333496, + "learning_rate": 1.2034742731547211e-06, + "loss": 0.3409, + "step": 8580 + }, + { + "epoch": 4.057210401891253, + "grad_norm": 3.237423896789551, + "learning_rate": 1.2029409347596429e-06, + "loss": 0.3803, + "step": 8581 + }, + { + "epoch": 4.057683215130024, + "grad_norm": 3.3347854614257812, + "learning_rate": 1.2024076771267457e-06, + "loss": 0.3123, + "step": 8582 + }, + { + "epoch": 4.058156028368795, + "grad_norm": 3.1294021606445312, + "learning_rate": 1.2018745002892327e-06, + "loss": 0.33, + "step": 8583 + }, + { + "epoch": 4.058628841607565, + "grad_norm": 2.9440014362335205, + "learning_rate": 1.2013414042803013e-06, + "loss": 0.3698, + "step": 8584 + }, + { + "epoch": 4.0591016548463354, + "grad_norm": 3.602764129638672, + "learning_rate": 1.200808389133147e-06, + "loss": 0.3733, + "step": 8585 + }, + { + "epoch": 4.059574468085106, + "grad_norm": 3.2689952850341797, + "learning_rate": 1.2002754548809578e-06, + "loss": 0.3188, + "step": 8586 + }, + { + "epoch": 4.060047281323877, + "grad_norm": 3.15454363822937, + "learning_rate": 1.199742601556916e-06, + "loss": 0.3493, + "step": 8587 + }, + { + "epoch": 4.060520094562648, + "grad_norm": 2.843860387802124, + "learning_rate": 1.1992098291942016e-06, + "loss": 0.3277, + "step": 8588 + }, + { + "epoch": 4.060992907801419, + "grad_norm": 3.0749056339263916, + "learning_rate": 1.1986771378259876e-06, + "loss": 0.3465, + "step": 8589 + }, + { + "epoch": 4.061465721040189, + "grad_norm": 3.3339948654174805, + "learning_rate": 1.1981445274854412e-06, + "loss": 0.3507, + "step": 8590 + }, + { + "epoch": 4.06193853427896, + "grad_norm": 2.7992780208587646, + "learning_rate": 1.1976119982057275e-06, + "loss": 0.302, + "step": 8591 + }, + { + "epoch": 4.06241134751773, + "grad_norm": 3.0862269401550293, + "learning_rate": 1.1970795500200028e-06, + "loss": 0.3365, + "step": 8592 + }, + { + "epoch": 4.062884160756501, + "grad_norm": 3.263456106185913, + "learning_rate": 1.1965471829614222e-06, + "loss": 0.3764, + "step": 8593 + }, + { + "epoch": 4.063356973995272, + "grad_norm": 3.0682623386383057, + "learning_rate": 1.1960148970631332e-06, + "loss": 0.3488, + "step": 8594 + }, + { + "epoch": 4.0638297872340425, + "grad_norm": 2.8910646438598633, + "learning_rate": 1.195482692358278e-06, + "loss": 0.3224, + "step": 8595 + }, + { + "epoch": 4.064302600472813, + "grad_norm": 
3.170072555541992, + "learning_rate": 1.1949505688799961e-06, + "loss": 0.3058, + "step": 8596 + }, + { + "epoch": 4.064775413711584, + "grad_norm": 3.018674373626709, + "learning_rate": 1.19441852666142e-06, + "loss": 0.3824, + "step": 8597 + }, + { + "epoch": 4.065248226950355, + "grad_norm": 3.0038044452667236, + "learning_rate": 1.1938865657356773e-06, + "loss": 0.3657, + "step": 8598 + }, + { + "epoch": 4.065721040189126, + "grad_norm": 3.248204469680786, + "learning_rate": 1.193354686135891e-06, + "loss": 0.3305, + "step": 8599 + }, + { + "epoch": 4.066193853427896, + "grad_norm": 3.144714832305908, + "learning_rate": 1.192822887895178e-06, + "loss": 0.3395, + "step": 8600 + }, + { + "epoch": 4.066666666666666, + "grad_norm": 2.9457240104675293, + "learning_rate": 1.1922911710466531e-06, + "loss": 0.3288, + "step": 8601 + }, + { + "epoch": 4.067139479905437, + "grad_norm": 3.1602869033813477, + "learning_rate": 1.1917595356234218e-06, + "loss": 0.3713, + "step": 8602 + }, + { + "epoch": 4.067612293144208, + "grad_norm": 3.0820837020874023, + "learning_rate": 1.1912279816585888e-06, + "loss": 0.2987, + "step": 8603 + }, + { + "epoch": 4.068085106382979, + "grad_norm": 3.0366809368133545, + "learning_rate": 1.1906965091852502e-06, + "loss": 0.4151, + "step": 8604 + }, + { + "epoch": 4.06855791962175, + "grad_norm": 3.229402780532837, + "learning_rate": 1.190165118236498e-06, + "loss": 0.321, + "step": 8605 + }, + { + "epoch": 4.06903073286052, + "grad_norm": 2.832232713699341, + "learning_rate": 1.1896338088454217e-06, + "loss": 0.3551, + "step": 8606 + }, + { + "epoch": 4.069503546099291, + "grad_norm": 3.5618600845336914, + "learning_rate": 1.1891025810451012e-06, + "loss": 0.3704, + "step": 8607 + }, + { + "epoch": 4.069976359338061, + "grad_norm": 3.287827491760254, + "learning_rate": 1.1885714348686158e-06, + "loss": 0.3469, + "step": 8608 + }, + { + "epoch": 4.070449172576832, + "grad_norm": 3.468825101852417, + "learning_rate": 1.188040370349037e-06, + "loss": 0.3687, + "step": 8609 + }, + { + "epoch": 4.070921985815603, + "grad_norm": 3.2931180000305176, + "learning_rate": 1.1875093875194302e-06, + "loss": 0.3832, + "step": 8610 + }, + { + "epoch": 4.0713947990543735, + "grad_norm": 2.9613003730773926, + "learning_rate": 1.18697848641286e-06, + "loss": 0.3314, + "step": 8611 + }, + { + "epoch": 4.071867612293144, + "grad_norm": 3.1507649421691895, + "learning_rate": 1.1864476670623816e-06, + "loss": 0.3153, + "step": 8612 + }, + { + "epoch": 4.072340425531915, + "grad_norm": 2.844064950942993, + "learning_rate": 1.1859169295010478e-06, + "loss": 0.3566, + "step": 8613 + }, + { + "epoch": 4.072813238770686, + "grad_norm": 3.227264881134033, + "learning_rate": 1.1853862737619042e-06, + "loss": 0.3717, + "step": 8614 + }, + { + "epoch": 4.073286052009456, + "grad_norm": 2.9416239261627197, + "learning_rate": 1.1848556998779922e-06, + "loss": 0.3438, + "step": 8615 + }, + { + "epoch": 4.073758865248227, + "grad_norm": 4.1662492752075195, + "learning_rate": 1.18432520788235e-06, + "loss": 0.362, + "step": 8616 + }, + { + "epoch": 4.074231678486997, + "grad_norm": 3.47951602935791, + "learning_rate": 1.183794797808008e-06, + "loss": 0.3672, + "step": 8617 + }, + { + "epoch": 4.074704491725768, + "grad_norm": 2.998969793319702, + "learning_rate": 1.1832644696879919e-06, + "loss": 0.3281, + "step": 8618 + }, + { + "epoch": 4.075177304964539, + "grad_norm": 2.956167221069336, + "learning_rate": 1.182734223555324e-06, + "loss": 0.3059, + "step": 8619 + }, + { + "epoch": 4.07565011820331, 
+ "grad_norm": 3.447821855545044, + "learning_rate": 1.1822040594430195e-06, + "loss": 0.333, + "step": 8620 + }, + { + "epoch": 4.076122931442081, + "grad_norm": 3.072972059249878, + "learning_rate": 1.1816739773840905e-06, + "loss": 0.3737, + "step": 8621 + }, + { + "epoch": 4.076595744680851, + "grad_norm": 3.142913341522217, + "learning_rate": 1.1811439774115424e-06, + "loss": 0.3697, + "step": 8622 + }, + { + "epoch": 4.077068557919622, + "grad_norm": 3.4997763633728027, + "learning_rate": 1.1806140595583745e-06, + "loss": 0.4177, + "step": 8623 + }, + { + "epoch": 4.077541371158392, + "grad_norm": 3.032951831817627, + "learning_rate": 1.1800842238575853e-06, + "loss": 0.351, + "step": 8624 + }, + { + "epoch": 4.078014184397163, + "grad_norm": 2.8878438472747803, + "learning_rate": 1.1795544703421625e-06, + "loss": 0.3409, + "step": 8625 + }, + { + "epoch": 4.078486997635934, + "grad_norm": 2.931614637374878, + "learning_rate": 1.1790247990450936e-06, + "loss": 0.3416, + "step": 8626 + }, + { + "epoch": 4.0789598108747045, + "grad_norm": 3.1719822883605957, + "learning_rate": 1.1784952099993586e-06, + "loss": 0.3574, + "step": 8627 + }, + { + "epoch": 4.079432624113475, + "grad_norm": 2.960068464279175, + "learning_rate": 1.1779657032379322e-06, + "loss": 0.3557, + "step": 8628 + }, + { + "epoch": 4.079905437352246, + "grad_norm": 3.1410937309265137, + "learning_rate": 1.1774362787937843e-06, + "loss": 0.3839, + "step": 8629 + }, + { + "epoch": 4.080378250591017, + "grad_norm": 3.596153736114502, + "learning_rate": 1.1769069366998793e-06, + "loss": 0.3135, + "step": 8630 + }, + { + "epoch": 4.080851063829787, + "grad_norm": 3.385826587677002, + "learning_rate": 1.1763776769891786e-06, + "loss": 0.3624, + "step": 8631 + }, + { + "epoch": 4.081323877068558, + "grad_norm": 3.2531018257141113, + "learning_rate": 1.175848499694636e-06, + "loss": 0.3593, + "step": 8632 + }, + { + "epoch": 4.081796690307328, + "grad_norm": 3.3864004611968994, + "learning_rate": 1.1753194048492004e-06, + "loss": 0.3929, + "step": 8633 + }, + { + "epoch": 4.082269503546099, + "grad_norm": 2.8734285831451416, + "learning_rate": 1.1747903924858175e-06, + "loss": 0.3145, + "step": 8634 + }, + { + "epoch": 4.08274231678487, + "grad_norm": 3.3261659145355225, + "learning_rate": 1.174261462637426e-06, + "loss": 0.3351, + "step": 8635 + }, + { + "epoch": 4.083215130023641, + "grad_norm": 3.413990020751953, + "learning_rate": 1.1737326153369594e-06, + "loss": 0.3984, + "step": 8636 + }, + { + "epoch": 4.083687943262412, + "grad_norm": 3.311741590499878, + "learning_rate": 1.1732038506173481e-06, + "loss": 0.3716, + "step": 8637 + }, + { + "epoch": 4.084160756501182, + "grad_norm": 3.691573143005371, + "learning_rate": 1.1726751685115142e-06, + "loss": 0.3542, + "step": 8638 + }, + { + "epoch": 4.084633569739952, + "grad_norm": 3.1951167583465576, + "learning_rate": 1.1721465690523784e-06, + "loss": 0.3683, + "step": 8639 + }, + { + "epoch": 4.085106382978723, + "grad_norm": 3.1731514930725098, + "learning_rate": 1.1716180522728534e-06, + "loss": 0.3552, + "step": 8640 + }, + { + "epoch": 4.085579196217494, + "grad_norm": 3.1588845252990723, + "learning_rate": 1.1710896182058465e-06, + "loss": 0.3908, + "step": 8641 + }, + { + "epoch": 4.086052009456265, + "grad_norm": 3.6902294158935547, + "learning_rate": 1.1705612668842628e-06, + "loss": 0.4099, + "step": 8642 + }, + { + "epoch": 4.0865248226950355, + "grad_norm": 4.56397819519043, + "learning_rate": 1.1700329983409988e-06, + "loss": 0.3456, + "step": 8643 + }, + { 
+ "epoch": 4.086997635933806, + "grad_norm": 2.924715995788574, + "learning_rate": 1.1695048126089492e-06, + "loss": 0.3885, + "step": 8644 + }, + { + "epoch": 4.087470449172577, + "grad_norm": 3.537550687789917, + "learning_rate": 1.1689767097210009e-06, + "loss": 0.3551, + "step": 8645 + }, + { + "epoch": 4.087943262411348, + "grad_norm": 3.0198440551757812, + "learning_rate": 1.1684486897100364e-06, + "loss": 0.3448, + "step": 8646 + }, + { + "epoch": 4.088416075650118, + "grad_norm": 3.448965072631836, + "learning_rate": 1.1679207526089334e-06, + "loss": 0.3252, + "step": 8647 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 3.057326078414917, + "learning_rate": 1.167392898450563e-06, + "loss": 0.3231, + "step": 8648 + }, + { + "epoch": 4.089361702127659, + "grad_norm": 3.0788655281066895, + "learning_rate": 1.1668651272677948e-06, + "loss": 0.3273, + "step": 8649 + }, + { + "epoch": 4.08983451536643, + "grad_norm": 2.9126291275024414, + "learning_rate": 1.1663374390934893e-06, + "loss": 0.3162, + "step": 8650 + }, + { + "epoch": 4.090307328605201, + "grad_norm": 3.278874635696411, + "learning_rate": 1.1658098339605027e-06, + "loss": 0.3123, + "step": 8651 + }, + { + "epoch": 4.090780141843972, + "grad_norm": 2.8490889072418213, + "learning_rate": 1.1652823119016882e-06, + "loss": 0.3408, + "step": 8652 + }, + { + "epoch": 4.091252955082743, + "grad_norm": 3.0473995208740234, + "learning_rate": 1.164754872949891e-06, + "loss": 0.3349, + "step": 8653 + }, + { + "epoch": 4.091725768321513, + "grad_norm": 2.9052987098693848, + "learning_rate": 1.1642275171379535e-06, + "loss": 0.3113, + "step": 8654 + }, + { + "epoch": 4.092198581560283, + "grad_norm": 3.1060919761657715, + "learning_rate": 1.1637002444987116e-06, + "loss": 0.3139, + "step": 8655 + }, + { + "epoch": 4.092671394799054, + "grad_norm": 3.172394275665283, + "learning_rate": 1.163173055064995e-06, + "loss": 0.3555, + "step": 8656 + }, + { + "epoch": 4.093144208037825, + "grad_norm": 3.36523699760437, + "learning_rate": 1.1626459488696313e-06, + "loss": 0.3835, + "step": 8657 + }, + { + "epoch": 4.093617021276596, + "grad_norm": 2.9513938426971436, + "learning_rate": 1.1621189259454393e-06, + "loss": 0.3432, + "step": 8658 + }, + { + "epoch": 4.0940898345153665, + "grad_norm": 2.8415515422821045, + "learning_rate": 1.1615919863252365e-06, + "loss": 0.3494, + "step": 8659 + }, + { + "epoch": 4.094562647754137, + "grad_norm": 3.3759984970092773, + "learning_rate": 1.1610651300418315e-06, + "loss": 0.3519, + "step": 8660 + }, + { + "epoch": 4.095035460992908, + "grad_norm": 3.1927380561828613, + "learning_rate": 1.1605383571280304e-06, + "loss": 0.3675, + "step": 8661 + }, + { + "epoch": 4.095508274231679, + "grad_norm": 2.800658941268921, + "learning_rate": 1.1600116676166321e-06, + "loss": 0.3291, + "step": 8662 + }, + { + "epoch": 4.095981087470449, + "grad_norm": 2.9948630332946777, + "learning_rate": 1.1594850615404316e-06, + "loss": 0.332, + "step": 8663 + }, + { + "epoch": 4.0964539007092196, + "grad_norm": 3.032003879547119, + "learning_rate": 1.1589585389322176e-06, + "loss": 0.3583, + "step": 8664 + }, + { + "epoch": 4.09692671394799, + "grad_norm": 2.9765310287475586, + "learning_rate": 1.1584320998247757e-06, + "loss": 0.3296, + "step": 8665 + }, + { + "epoch": 4.097399527186761, + "grad_norm": 3.049954414367676, + "learning_rate": 1.1579057442508838e-06, + "loss": 0.4007, + "step": 8666 + }, + { + "epoch": 4.097872340425532, + "grad_norm": 3.3874928951263428, + "learning_rate": 1.1573794722433168e-06, + "loss": 
0.3856, + "step": 8667 + }, + { + "epoch": 4.098345153664303, + "grad_norm": 2.56701397895813, + "learning_rate": 1.1568532838348432e-06, + "loss": 0.3291, + "step": 8668 + }, + { + "epoch": 4.0988179669030735, + "grad_norm": 2.956408739089966, + "learning_rate": 1.1563271790582247e-06, + "loss": 0.3538, + "step": 8669 + }, + { + "epoch": 4.099290780141844, + "grad_norm": 3.827467679977417, + "learning_rate": 1.1558011579462225e-06, + "loss": 0.3764, + "step": 8670 + }, + { + "epoch": 4.099763593380614, + "grad_norm": 3.3271424770355225, + "learning_rate": 1.1552752205315867e-06, + "loss": 0.339, + "step": 8671 + }, + { + "epoch": 4.100236406619385, + "grad_norm": 3.0050785541534424, + "learning_rate": 1.1547493668470675e-06, + "loss": 0.3764, + "step": 8672 + }, + { + "epoch": 4.100709219858156, + "grad_norm": 2.6030385494232178, + "learning_rate": 1.1542235969254065e-06, + "loss": 0.3507, + "step": 8673 + }, + { + "epoch": 4.101182033096927, + "grad_norm": 3.081695556640625, + "learning_rate": 1.1536979107993402e-06, + "loss": 0.3386, + "step": 8674 + }, + { + "epoch": 4.101654846335697, + "grad_norm": 2.7685163021087646, + "learning_rate": 1.1531723085016025e-06, + "loss": 0.3456, + "step": 8675 + }, + { + "epoch": 4.102127659574468, + "grad_norm": 3.037252902984619, + "learning_rate": 1.1526467900649195e-06, + "loss": 0.3489, + "step": 8676 + }, + { + "epoch": 4.102600472813239, + "grad_norm": 2.9675045013427734, + "learning_rate": 1.1521213555220129e-06, + "loss": 0.3638, + "step": 8677 + }, + { + "epoch": 4.10307328605201, + "grad_norm": 3.3377575874328613, + "learning_rate": 1.1515960049055994e-06, + "loss": 0.3557, + "step": 8678 + }, + { + "epoch": 4.10354609929078, + "grad_norm": 2.7452030181884766, + "learning_rate": 1.1510707382483888e-06, + "loss": 0.285, + "step": 8679 + }, + { + "epoch": 4.1040189125295505, + "grad_norm": 2.9602560997009277, + "learning_rate": 1.1505455555830897e-06, + "loss": 0.321, + "step": 8680 + }, + { + "epoch": 4.104491725768321, + "grad_norm": 3.122945547103882, + "learning_rate": 1.1500204569424007e-06, + "loss": 0.375, + "step": 8681 + }, + { + "epoch": 4.104964539007092, + "grad_norm": 3.334885835647583, + "learning_rate": 1.149495442359019e-06, + "loss": 0.3399, + "step": 8682 + }, + { + "epoch": 4.105437352245863, + "grad_norm": 3.3663594722747803, + "learning_rate": 1.1489705118656346e-06, + "loss": 0.3482, + "step": 8683 + }, + { + "epoch": 4.105910165484634, + "grad_norm": 2.9761641025543213, + "learning_rate": 1.1484456654949313e-06, + "loss": 0.3317, + "step": 8684 + }, + { + "epoch": 4.1063829787234045, + "grad_norm": 3.486905574798584, + "learning_rate": 1.147920903279591e-06, + "loss": 0.3737, + "step": 8685 + }, + { + "epoch": 4.106855791962175, + "grad_norm": 3.4820523262023926, + "learning_rate": 1.1473962252522875e-06, + "loss": 0.4135, + "step": 8686 + }, + { + "epoch": 4.107328605200945, + "grad_norm": 3.314117431640625, + "learning_rate": 1.146871631445689e-06, + "loss": 0.369, + "step": 8687 + }, + { + "epoch": 4.107801418439716, + "grad_norm": 2.9497411251068115, + "learning_rate": 1.1463471218924615e-06, + "loss": 0.3233, + "step": 8688 + }, + { + "epoch": 4.108274231678487, + "grad_norm": 3.1337075233459473, + "learning_rate": 1.1458226966252624e-06, + "loss": 0.401, + "step": 8689 + }, + { + "epoch": 4.108747044917258, + "grad_norm": 3.0163166522979736, + "learning_rate": 1.1452983556767473e-06, + "loss": 0.3812, + "step": 8690 + }, + { + "epoch": 4.109219858156028, + "grad_norm": 2.976491928100586, + "learning_rate": 
1.1447740990795629e-06, + "loss": 0.3508, + "step": 8691 + }, + { + "epoch": 4.109692671394799, + "grad_norm": 3.2449910640716553, + "learning_rate": 1.144249926866353e-06, + "loss": 0.3056, + "step": 8692 + }, + { + "epoch": 4.11016548463357, + "grad_norm": 2.562558650970459, + "learning_rate": 1.1437258390697553e-06, + "loss": 0.2878, + "step": 8693 + }, + { + "epoch": 4.110638297872341, + "grad_norm": 3.1823108196258545, + "learning_rate": 1.1432018357224017e-06, + "loss": 0.2849, + "step": 8694 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 2.9045653343200684, + "learning_rate": 1.1426779168569217e-06, + "loss": 0.3264, + "step": 8695 + }, + { + "epoch": 4.1115839243498815, + "grad_norm": 2.7991254329681396, + "learning_rate": 1.1421540825059355e-06, + "loss": 0.3427, + "step": 8696 + }, + { + "epoch": 4.112056737588652, + "grad_norm": 2.9184927940368652, + "learning_rate": 1.14163033270206e-06, + "loss": 0.3073, + "step": 8697 + }, + { + "epoch": 4.112529550827423, + "grad_norm": 3.189335584640503, + "learning_rate": 1.1411066674779084e-06, + "loss": 0.3836, + "step": 8698 + }, + { + "epoch": 4.113002364066194, + "grad_norm": 2.899711847305298, + "learning_rate": 1.140583086866085e-06, + "loss": 0.3378, + "step": 8699 + }, + { + "epoch": 4.113475177304965, + "grad_norm": 3.167665481567383, + "learning_rate": 1.1400595908991927e-06, + "loss": 0.3273, + "step": 8700 + }, + { + "epoch": 4.1139479905437355, + "grad_norm": 3.2930212020874023, + "learning_rate": 1.1395361796098268e-06, + "loss": 0.3221, + "step": 8701 + }, + { + "epoch": 4.114420803782506, + "grad_norm": 3.0603861808776855, + "learning_rate": 1.1390128530305764e-06, + "loss": 0.3371, + "step": 8702 + }, + { + "epoch": 4.114893617021276, + "grad_norm": 3.6339457035064697, + "learning_rate": 1.1384896111940289e-06, + "loss": 0.3986, + "step": 8703 + }, + { + "epoch": 4.115366430260047, + "grad_norm": 2.975799322128296, + "learning_rate": 1.1379664541327623e-06, + "loss": 0.3021, + "step": 8704 + }, + { + "epoch": 4.115839243498818, + "grad_norm": 2.9100987911224365, + "learning_rate": 1.1374433818793534e-06, + "loss": 0.3473, + "step": 8705 + }, + { + "epoch": 4.116312056737589, + "grad_norm": 2.9515233039855957, + "learning_rate": 1.1369203944663704e-06, + "loss": 0.3004, + "step": 8706 + }, + { + "epoch": 4.116784869976359, + "grad_norm": 3.283583879470825, + "learning_rate": 1.1363974919263774e-06, + "loss": 0.401, + "step": 8707 + }, + { + "epoch": 4.11725768321513, + "grad_norm": 3.307530641555786, + "learning_rate": 1.1358746742919325e-06, + "loss": 0.322, + "step": 8708 + }, + { + "epoch": 4.117730496453901, + "grad_norm": 3.6834614276885986, + "learning_rate": 1.135351941595591e-06, + "loss": 0.3703, + "step": 8709 + }, + { + "epoch": 4.118203309692672, + "grad_norm": 3.0829904079437256, + "learning_rate": 1.1348292938699e-06, + "loss": 0.3283, + "step": 8710 + }, + { + "epoch": 4.118676122931442, + "grad_norm": 2.914794921875, + "learning_rate": 1.1343067311474033e-06, + "loss": 0.337, + "step": 8711 + }, + { + "epoch": 4.1191489361702125, + "grad_norm": 3.550536870956421, + "learning_rate": 1.1337842534606368e-06, + "loss": 0.3752, + "step": 8712 + }, + { + "epoch": 4.119621749408983, + "grad_norm": 3.337012767791748, + "learning_rate": 1.1332618608421353e-06, + "loss": 0.3604, + "step": 8713 + }, + { + "epoch": 4.120094562647754, + "grad_norm": 2.7749485969543457, + "learning_rate": 1.1327395533244248e-06, + "loss": 0.3712, + "step": 8714 + }, + { + "epoch": 4.120567375886525, + "grad_norm": 
3.571261405944824, + "learning_rate": 1.1322173309400258e-06, + "loss": 0.4148, + "step": 8715 + }, + { + "epoch": 4.121040189125296, + "grad_norm": 3.264871597290039, + "learning_rate": 1.1316951937214573e-06, + "loss": 0.3229, + "step": 8716 + }, + { + "epoch": 4.1215130023640665, + "grad_norm": 2.974625825881958, + "learning_rate": 1.131173141701228e-06, + "loss": 0.3372, + "step": 8717 + }, + { + "epoch": 4.121985815602837, + "grad_norm": 3.18060302734375, + "learning_rate": 1.1306511749118466e-06, + "loss": 0.4041, + "step": 8718 + }, + { + "epoch": 4.122458628841607, + "grad_norm": 2.7793190479278564, + "learning_rate": 1.1301292933858115e-06, + "loss": 0.3329, + "step": 8719 + }, + { + "epoch": 4.122931442080378, + "grad_norm": 3.0883100032806396, + "learning_rate": 1.1296074971556179e-06, + "loss": 0.3999, + "step": 8720 + }, + { + "epoch": 4.123404255319149, + "grad_norm": 2.984799385070801, + "learning_rate": 1.1290857862537573e-06, + "loss": 0.3432, + "step": 8721 + }, + { + "epoch": 4.12387706855792, + "grad_norm": 3.0691094398498535, + "learning_rate": 1.1285641607127127e-06, + "loss": 0.3043, + "step": 8722 + }, + { + "epoch": 4.12434988179669, + "grad_norm": 3.2218985557556152, + "learning_rate": 1.128042620564965e-06, + "loss": 0.363, + "step": 8723 + }, + { + "epoch": 4.124822695035461, + "grad_norm": 2.951098918914795, + "learning_rate": 1.1275211658429877e-06, + "loss": 0.3459, + "step": 8724 + }, + { + "epoch": 4.125295508274232, + "grad_norm": 3.038513660430908, + "learning_rate": 1.1269997965792493e-06, + "loss": 0.3073, + "step": 8725 + }, + { + "epoch": 4.125768321513003, + "grad_norm": 2.7548015117645264, + "learning_rate": 1.1264785128062129e-06, + "loss": 0.3587, + "step": 8726 + }, + { + "epoch": 4.126241134751773, + "grad_norm": 3.618379592895508, + "learning_rate": 1.125957314556336e-06, + "loss": 0.4009, + "step": 8727 + }, + { + "epoch": 4.1267139479905435, + "grad_norm": 3.264702320098877, + "learning_rate": 1.1254362018620728e-06, + "loss": 0.3684, + "step": 8728 + }, + { + "epoch": 4.127186761229314, + "grad_norm": 3.209995746612549, + "learning_rate": 1.1249151747558704e-06, + "loss": 0.3796, + "step": 8729 + }, + { + "epoch": 4.127659574468085, + "grad_norm": 3.164973735809326, + "learning_rate": 1.1243942332701693e-06, + "loss": 0.3147, + "step": 8730 + }, + { + "epoch": 4.128132387706856, + "grad_norm": 3.309659957885742, + "learning_rate": 1.1238733774374087e-06, + "loss": 0.308, + "step": 8731 + }, + { + "epoch": 4.128605200945627, + "grad_norm": 3.138901710510254, + "learning_rate": 1.1233526072900184e-06, + "loss": 0.3721, + "step": 8732 + }, + { + "epoch": 4.1290780141843975, + "grad_norm": 3.5710649490356445, + "learning_rate": 1.122831922860424e-06, + "loss": 0.3872, + "step": 8733 + }, + { + "epoch": 4.129550827423168, + "grad_norm": 3.192469835281372, + "learning_rate": 1.1223113241810482e-06, + "loss": 0.349, + "step": 8734 + }, + { + "epoch": 4.130023640661938, + "grad_norm": 2.9302608966827393, + "learning_rate": 1.121790811284304e-06, + "loss": 0.3207, + "step": 8735 + }, + { + "epoch": 4.130496453900709, + "grad_norm": 3.022963047027588, + "learning_rate": 1.121270384202604e-06, + "loss": 0.3487, + "step": 8736 + }, + { + "epoch": 4.13096926713948, + "grad_norm": 3.0473732948303223, + "learning_rate": 1.1207500429683513e-06, + "loss": 0.3083, + "step": 8737 + }, + { + "epoch": 4.131442080378251, + "grad_norm": 2.9411537647247314, + "learning_rate": 1.1202297876139448e-06, + "loss": 0.3077, + "step": 8738 + }, + { + "epoch": 
4.131914893617021, + "grad_norm": 2.9274520874023438, + "learning_rate": 1.1197096181717804e-06, + "loss": 0.3071, + "step": 8739 + }, + { + "epoch": 4.132387706855792, + "grad_norm": 2.79213285446167, + "learning_rate": 1.1191895346742454e-06, + "loss": 0.3346, + "step": 8740 + }, + { + "epoch": 4.132860520094563, + "grad_norm": 3.2763726711273193, + "learning_rate": 1.1186695371537235e-06, + "loss": 0.3753, + "step": 8741 + }, + { + "epoch": 4.133333333333334, + "grad_norm": 3.245525598526001, + "learning_rate": 1.1181496256425927e-06, + "loss": 0.3586, + "step": 8742 + }, + { + "epoch": 4.133806146572104, + "grad_norm": 3.557176351547241, + "learning_rate": 1.1176298001732244e-06, + "loss": 0.3547, + "step": 8743 + }, + { + "epoch": 4.1342789598108745, + "grad_norm": 3.674633741378784, + "learning_rate": 1.117110060777988e-06, + "loss": 0.3994, + "step": 8744 + }, + { + "epoch": 4.134751773049645, + "grad_norm": 3.168025016784668, + "learning_rate": 1.1165904074892433e-06, + "loss": 0.3568, + "step": 8745 + }, + { + "epoch": 4.135224586288416, + "grad_norm": 2.9492177963256836, + "learning_rate": 1.1160708403393488e-06, + "loss": 0.3257, + "step": 8746 + }, + { + "epoch": 4.135697399527187, + "grad_norm": 3.139941930770874, + "learning_rate": 1.1155513593606548e-06, + "loss": 0.3464, + "step": 8747 + }, + { + "epoch": 4.136170212765958, + "grad_norm": 3.1875250339508057, + "learning_rate": 1.115031964585506e-06, + "loss": 0.3154, + "step": 8748 + }, + { + "epoch": 4.136643026004728, + "grad_norm": 3.0219457149505615, + "learning_rate": 1.1145126560462447e-06, + "loss": 0.3433, + "step": 8749 + }, + { + "epoch": 4.137115839243499, + "grad_norm": 2.992807149887085, + "learning_rate": 1.1139934337752046e-06, + "loss": 0.3127, + "step": 8750 + }, + { + "epoch": 4.137588652482269, + "grad_norm": 3.354733943939209, + "learning_rate": 1.1134742978047163e-06, + "loss": 0.3166, + "step": 8751 + }, + { + "epoch": 4.13806146572104, + "grad_norm": 3.1885886192321777, + "learning_rate": 1.1129552481671042e-06, + "loss": 0.3872, + "step": 8752 + }, + { + "epoch": 4.138534278959811, + "grad_norm": 2.8869078159332275, + "learning_rate": 1.1124362848946858e-06, + "loss": 0.3218, + "step": 8753 + }, + { + "epoch": 4.1390070921985815, + "grad_norm": 3.818469285964966, + "learning_rate": 1.1119174080197762e-06, + "loss": 0.3442, + "step": 8754 + }, + { + "epoch": 4.139479905437352, + "grad_norm": 3.2445592880249023, + "learning_rate": 1.1113986175746833e-06, + "loss": 0.3858, + "step": 8755 + }, + { + "epoch": 4.139952718676123, + "grad_norm": 2.654083490371704, + "learning_rate": 1.1108799135917098e-06, + "loss": 0.3023, + "step": 8756 + }, + { + "epoch": 4.140425531914894, + "grad_norm": 3.129635810852051, + "learning_rate": 1.1103612961031527e-06, + "loss": 0.3179, + "step": 8757 + }, + { + "epoch": 4.140898345153665, + "grad_norm": 2.8118138313293457, + "learning_rate": 1.1098427651413035e-06, + "loss": 0.3374, + "step": 8758 + }, + { + "epoch": 4.141371158392435, + "grad_norm": 3.104051113128662, + "learning_rate": 1.1093243207384506e-06, + "loss": 0.3202, + "step": 8759 + }, + { + "epoch": 4.141843971631205, + "grad_norm": 3.12392520904541, + "learning_rate": 1.1088059629268744e-06, + "loss": 0.3567, + "step": 8760 + }, + { + "epoch": 4.142316784869976, + "grad_norm": 3.467481851577759, + "learning_rate": 1.1082876917388497e-06, + "loss": 0.4148, + "step": 8761 + }, + { + "epoch": 4.142789598108747, + "grad_norm": 3.1120564937591553, + "learning_rate": 1.1077695072066488e-06, + "loss": 0.3838, + 
"step": 8762 + }, + { + "epoch": 4.143262411347518, + "grad_norm": 3.028073310852051, + "learning_rate": 1.107251409362535e-06, + "loss": 0.3703, + "step": 8763 + }, + { + "epoch": 4.143735224586289, + "grad_norm": 3.091510057449341, + "learning_rate": 1.1067333982387699e-06, + "loss": 0.3695, + "step": 8764 + }, + { + "epoch": 4.144208037825059, + "grad_norm": 3.9426586627960205, + "learning_rate": 1.1062154738676067e-06, + "loss": 0.3934, + "step": 8765 + }, + { + "epoch": 4.14468085106383, + "grad_norm": 2.923741102218628, + "learning_rate": 1.1056976362812939e-06, + "loss": 0.3679, + "step": 8766 + }, + { + "epoch": 4.1451536643026, + "grad_norm": 3.1010327339172363, + "learning_rate": 1.1051798855120757e-06, + "loss": 0.3314, + "step": 8767 + }, + { + "epoch": 4.145626477541371, + "grad_norm": 2.9165778160095215, + "learning_rate": 1.1046622215921896e-06, + "loss": 0.3473, + "step": 8768 + }, + { + "epoch": 4.146099290780142, + "grad_norm": 2.8494462966918945, + "learning_rate": 1.1041446445538692e-06, + "loss": 0.3226, + "step": 8769 + }, + { + "epoch": 4.1465721040189125, + "grad_norm": 2.53379225730896, + "learning_rate": 1.1036271544293412e-06, + "loss": 0.3123, + "step": 8770 + }, + { + "epoch": 4.147044917257683, + "grad_norm": 3.0433695316314697, + "learning_rate": 1.1031097512508274e-06, + "loss": 0.37, + "step": 8771 + }, + { + "epoch": 4.147517730496454, + "grad_norm": 3.418458938598633, + "learning_rate": 1.1025924350505431e-06, + "loss": 0.3266, + "step": 8772 + }, + { + "epoch": 4.147990543735225, + "grad_norm": 2.843733787536621, + "learning_rate": 1.1020752058607017e-06, + "loss": 0.3548, + "step": 8773 + }, + { + "epoch": 4.148463356973995, + "grad_norm": 3.122965097427368, + "learning_rate": 1.1015580637135073e-06, + "loss": 0.3214, + "step": 8774 + }, + { + "epoch": 4.148936170212766, + "grad_norm": 3.0042455196380615, + "learning_rate": 1.1010410086411601e-06, + "loss": 0.3395, + "step": 8775 + }, + { + "epoch": 4.149408983451536, + "grad_norm": 2.841426372528076, + "learning_rate": 1.1005240406758546e-06, + "loss": 0.3381, + "step": 8776 + }, + { + "epoch": 4.149881796690307, + "grad_norm": 2.8241262435913086, + "learning_rate": 1.100007159849781e-06, + "loss": 0.3504, + "step": 8777 + }, + { + "epoch": 4.150354609929078, + "grad_norm": 2.5685677528381348, + "learning_rate": 1.0994903661951223e-06, + "loss": 0.309, + "step": 8778 + }, + { + "epoch": 4.150827423167849, + "grad_norm": 3.197665214538574, + "learning_rate": 1.0989736597440581e-06, + "loss": 0.3722, + "step": 8779 + }, + { + "epoch": 4.15130023640662, + "grad_norm": 3.1483469009399414, + "learning_rate": 1.098457040528761e-06, + "loss": 0.3301, + "step": 8780 + }, + { + "epoch": 4.15177304964539, + "grad_norm": 2.8838415145874023, + "learning_rate": 1.0979405085813972e-06, + "loss": 0.3212, + "step": 8781 + }, + { + "epoch": 4.152245862884161, + "grad_norm": 3.1998705863952637, + "learning_rate": 1.0974240639341312e-06, + "loss": 0.3557, + "step": 8782 + }, + { + "epoch": 4.152718676122931, + "grad_norm": 2.9004411697387695, + "learning_rate": 1.0969077066191187e-06, + "loss": 0.351, + "step": 8783 + }, + { + "epoch": 4.153191489361702, + "grad_norm": 3.036574125289917, + "learning_rate": 1.0963914366685096e-06, + "loss": 0.3762, + "step": 8784 + }, + { + "epoch": 4.153664302600473, + "grad_norm": 3.6683623790740967, + "learning_rate": 1.0958752541144523e-06, + "loss": 0.3938, + "step": 8785 + }, + { + "epoch": 4.1541371158392435, + "grad_norm": 2.922271490097046, + "learning_rate": 
1.0953591589890852e-06, + "loss": 0.3375, + "step": 8786 + }, + { + "epoch": 4.154609929078014, + "grad_norm": 3.1750547885894775, + "learning_rate": 1.094843151324545e-06, + "loss": 0.3455, + "step": 8787 + }, + { + "epoch": 4.155082742316785, + "grad_norm": 2.7836148738861084, + "learning_rate": 1.0943272311529602e-06, + "loss": 0.3359, + "step": 8788 + }, + { + "epoch": 4.155555555555556, + "grad_norm": 3.4582557678222656, + "learning_rate": 1.0938113985064553e-06, + "loss": 0.3358, + "step": 8789 + }, + { + "epoch": 4.156028368794326, + "grad_norm": 3.0436923503875732, + "learning_rate": 1.0932956534171483e-06, + "loss": 0.3531, + "step": 8790 + }, + { + "epoch": 4.156501182033097, + "grad_norm": 3.1420092582702637, + "learning_rate": 1.092779995917152e-06, + "loss": 0.3917, + "step": 8791 + }, + { + "epoch": 4.156973995271867, + "grad_norm": 2.9556260108947754, + "learning_rate": 1.0922644260385756e-06, + "loss": 0.3259, + "step": 8792 + }, + { + "epoch": 4.157446808510638, + "grad_norm": 2.8876030445098877, + "learning_rate": 1.091748943813521e-06, + "loss": 0.3447, + "step": 8793 + }, + { + "epoch": 4.157919621749409, + "grad_norm": 3.039207696914673, + "learning_rate": 1.0912335492740836e-06, + "loss": 0.3216, + "step": 8794 + }, + { + "epoch": 4.15839243498818, + "grad_norm": 2.852355480194092, + "learning_rate": 1.0907182424523568e-06, + "loss": 0.2906, + "step": 8795 + }, + { + "epoch": 4.158865248226951, + "grad_norm": 2.931675434112549, + "learning_rate": 1.0902030233804245e-06, + "loss": 0.3124, + "step": 8796 + }, + { + "epoch": 4.159338061465721, + "grad_norm": 3.010590076446533, + "learning_rate": 1.0896878920903691e-06, + "loss": 0.283, + "step": 8797 + }, + { + "epoch": 4.159810874704492, + "grad_norm": 3.093153953552246, + "learning_rate": 1.0891728486142648e-06, + "loss": 0.3269, + "step": 8798 + }, + { + "epoch": 4.160283687943262, + "grad_norm": 2.93019437789917, + "learning_rate": 1.0886578929841798e-06, + "loss": 0.3261, + "step": 8799 + }, + { + "epoch": 4.160756501182033, + "grad_norm": 3.176790475845337, + "learning_rate": 1.0881430252321803e-06, + "loss": 0.2805, + "step": 8800 + }, + { + "epoch": 4.161229314420804, + "grad_norm": 3.215359687805176, + "learning_rate": 1.0876282453903228e-06, + "loss": 0.4022, + "step": 8801 + }, + { + "epoch": 4.1617021276595745, + "grad_norm": 3.3343284130096436, + "learning_rate": 1.0871135534906623e-06, + "loss": 0.3469, + "step": 8802 + }, + { + "epoch": 4.162174940898345, + "grad_norm": 3.030043363571167, + "learning_rate": 1.0865989495652456e-06, + "loss": 0.3548, + "step": 8803 + }, + { + "epoch": 4.162647754137116, + "grad_norm": 2.9456260204315186, + "learning_rate": 1.0860844336461146e-06, + "loss": 0.3356, + "step": 8804 + }, + { + "epoch": 4.163120567375887, + "grad_norm": 2.9399044513702393, + "learning_rate": 1.0855700057653063e-06, + "loss": 0.362, + "step": 8805 + }, + { + "epoch": 4.163593380614657, + "grad_norm": 3.3188061714172363, + "learning_rate": 1.0850556659548513e-06, + "loss": 0.3866, + "step": 8806 + }, + { + "epoch": 4.164066193853428, + "grad_norm": 3.1601030826568604, + "learning_rate": 1.084541414246775e-06, + "loss": 0.3662, + "step": 8807 + }, + { + "epoch": 4.164539007092198, + "grad_norm": 3.0458695888519287, + "learning_rate": 1.0840272506730993e-06, + "loss": 0.3318, + "step": 8808 + }, + { + "epoch": 4.165011820330969, + "grad_norm": 3.056387186050415, + "learning_rate": 1.0835131752658365e-06, + "loss": 0.3538, + "step": 8809 + }, + { + "epoch": 4.16548463356974, + "grad_norm": 
2.9833531379699707, + "learning_rate": 1.0829991880569984e-06, + "loss": 0.3088, + "step": 8810 + }, + { + "epoch": 4.165957446808511, + "grad_norm": 3.325438976287842, + "learning_rate": 1.0824852890785876e-06, + "loss": 0.3524, + "step": 8811 + }, + { + "epoch": 4.166430260047282, + "grad_norm": 2.781290054321289, + "learning_rate": 1.0819714783626009e-06, + "loss": 0.3925, + "step": 8812 + }, + { + "epoch": 4.166903073286052, + "grad_norm": 16.3265323638916, + "learning_rate": 1.0814577559410336e-06, + "loss": 0.4248, + "step": 8813 + }, + { + "epoch": 4.167375886524822, + "grad_norm": 2.906619071960449, + "learning_rate": 1.0809441218458708e-06, + "loss": 0.3904, + "step": 8814 + }, + { + "epoch": 4.167848699763593, + "grad_norm": 2.7133800983428955, + "learning_rate": 1.0804305761090957e-06, + "loss": 0.2855, + "step": 8815 + }, + { + "epoch": 4.168321513002364, + "grad_norm": 3.252946376800537, + "learning_rate": 1.0799171187626844e-06, + "loss": 0.3285, + "step": 8816 + }, + { + "epoch": 4.168794326241135, + "grad_norm": 3.0832788944244385, + "learning_rate": 1.0794037498386062e-06, + "loss": 0.3175, + "step": 8817 + }, + { + "epoch": 4.1692671394799055, + "grad_norm": 3.046424150466919, + "learning_rate": 1.0788904693688284e-06, + "loss": 0.3545, + "step": 8818 + }, + { + "epoch": 4.169739952718676, + "grad_norm": 3.643488645553589, + "learning_rate": 1.0783772773853095e-06, + "loss": 0.3889, + "step": 8819 + }, + { + "epoch": 4.170212765957447, + "grad_norm": 3.433997392654419, + "learning_rate": 1.077864173920004e-06, + "loss": 0.311, + "step": 8820 + }, + { + "epoch": 4.170685579196218, + "grad_norm": 3.287684679031372, + "learning_rate": 1.0773511590048605e-06, + "loss": 0.3708, + "step": 8821 + }, + { + "epoch": 4.171158392434988, + "grad_norm": 3.5546534061431885, + "learning_rate": 1.0768382326718212e-06, + "loss": 0.3845, + "step": 8822 + }, + { + "epoch": 4.171631205673759, + "grad_norm": 3.2245540618896484, + "learning_rate": 1.076325394952826e-06, + "loss": 0.4412, + "step": 8823 + }, + { + "epoch": 4.172104018912529, + "grad_norm": 3.199784994125366, + "learning_rate": 1.0758126458798046e-06, + "loss": 0.3635, + "step": 8824 + }, + { + "epoch": 4.1725768321513, + "grad_norm": 2.961003303527832, + "learning_rate": 1.075299985484686e-06, + "loss": 0.3167, + "step": 8825 + }, + { + "epoch": 4.173049645390071, + "grad_norm": 2.8316452503204346, + "learning_rate": 1.07478741379939e-06, + "loss": 0.3342, + "step": 8826 + }, + { + "epoch": 4.173522458628842, + "grad_norm": 3.0721595287323, + "learning_rate": 1.0742749308558316e-06, + "loss": 0.3642, + "step": 8827 + }, + { + "epoch": 4.1739952718676125, + "grad_norm": 3.001324415206909, + "learning_rate": 1.0737625366859225e-06, + "loss": 0.3479, + "step": 8828 + }, + { + "epoch": 4.174468085106383, + "grad_norm": 3.199108839035034, + "learning_rate": 1.0732502313215665e-06, + "loss": 0.3434, + "step": 8829 + }, + { + "epoch": 4.174940898345153, + "grad_norm": 3.602139472961426, + "learning_rate": 1.072738014794661e-06, + "loss": 0.401, + "step": 8830 + }, + { + "epoch": 4.175413711583924, + "grad_norm": 3.2303357124328613, + "learning_rate": 1.0722258871371025e-06, + "loss": 0.3603, + "step": 8831 + }, + { + "epoch": 4.175886524822695, + "grad_norm": 3.138611316680908, + "learning_rate": 1.0717138483807766e-06, + "loss": 0.3481, + "step": 8832 + }, + { + "epoch": 4.176359338061466, + "grad_norm": 3.059134006500244, + "learning_rate": 1.071201898557567e-06, + "loss": 0.357, + "step": 8833 + }, + { + "epoch": 4.176832151300236, 
+ "grad_norm": 3.237121820449829, + "learning_rate": 1.0706900376993501e-06, + "loss": 0.3424, + "step": 8834 + }, + { + "epoch": 4.177304964539007, + "grad_norm": 3.1065425872802734, + "learning_rate": 1.0701782658379974e-06, + "loss": 0.3506, + "step": 8835 + }, + { + "epoch": 4.177777777777778, + "grad_norm": 2.9971365928649902, + "learning_rate": 1.0696665830053743e-06, + "loss": 0.3205, + "step": 8836 + }, + { + "epoch": 4.178250591016549, + "grad_norm": 3.2898313999176025, + "learning_rate": 1.0691549892333406e-06, + "loss": 0.3297, + "step": 8837 + }, + { + "epoch": 4.178723404255319, + "grad_norm": 3.166144609451294, + "learning_rate": 1.0686434845537525e-06, + "loss": 0.3097, + "step": 8838 + }, + { + "epoch": 4.1791962174940895, + "grad_norm": 2.9629571437835693, + "learning_rate": 1.0681320689984581e-06, + "loss": 0.3709, + "step": 8839 + }, + { + "epoch": 4.17966903073286, + "grad_norm": 3.2954351902008057, + "learning_rate": 1.0676207425993004e-06, + "loss": 0.3448, + "step": 8840 + }, + { + "epoch": 4.180141843971631, + "grad_norm": 2.8537824153900146, + "learning_rate": 1.0671095053881194e-06, + "loss": 0.3069, + "step": 8841 + }, + { + "epoch": 4.180614657210402, + "grad_norm": 3.382916212081909, + "learning_rate": 1.0665983573967453e-06, + "loss": 0.3909, + "step": 8842 + }, + { + "epoch": 4.181087470449173, + "grad_norm": 3.4717860221862793, + "learning_rate": 1.0660872986570072e-06, + "loss": 0.3641, + "step": 8843 + }, + { + "epoch": 4.1815602836879435, + "grad_norm": 3.088916778564453, + "learning_rate": 1.0655763292007256e-06, + "loss": 0.3184, + "step": 8844 + }, + { + "epoch": 4.182033096926714, + "grad_norm": 2.8693177700042725, + "learning_rate": 1.065065449059715e-06, + "loss": 0.3486, + "step": 8845 + }, + { + "epoch": 4.182505910165484, + "grad_norm": 3.162811517715454, + "learning_rate": 1.0645546582657881e-06, + "loss": 0.3559, + "step": 8846 + }, + { + "epoch": 4.182978723404255, + "grad_norm": 3.8519816398620605, + "learning_rate": 1.0640439568507475e-06, + "loss": 0.4159, + "step": 8847 + }, + { + "epoch": 4.183451536643026, + "grad_norm": 2.9316959381103516, + "learning_rate": 1.063533344846394e-06, + "loss": 0.34, + "step": 8848 + }, + { + "epoch": 4.183924349881797, + "grad_norm": 3.018986463546753, + "learning_rate": 1.0630228222845205e-06, + "loss": 0.3378, + "step": 8849 + }, + { + "epoch": 4.184397163120567, + "grad_norm": 2.949428081512451, + "learning_rate": 1.062512389196914e-06, + "loss": 0.3634, + "step": 8850 + }, + { + "epoch": 4.184869976359338, + "grad_norm": 3.3298749923706055, + "learning_rate": 1.0620020456153585e-06, + "loss": 0.3067, + "step": 8851 + }, + { + "epoch": 4.185342789598109, + "grad_norm": 3.0566864013671875, + "learning_rate": 1.0614917915716302e-06, + "loss": 0.3534, + "step": 8852 + }, + { + "epoch": 4.18581560283688, + "grad_norm": 3.156620979309082, + "learning_rate": 1.0609816270975007e-06, + "loss": 0.3684, + "step": 8853 + }, + { + "epoch": 4.18628841607565, + "grad_norm": 3.0776474475860596, + "learning_rate": 1.0604715522247352e-06, + "loss": 0.3616, + "step": 8854 + }, + { + "epoch": 4.1867612293144205, + "grad_norm": 3.1254587173461914, + "learning_rate": 1.059961566985093e-06, + "loss": 0.3455, + "step": 8855 + }, + { + "epoch": 4.187234042553191, + "grad_norm": 2.8769783973693848, + "learning_rate": 1.0594516714103306e-06, + "loss": 0.2754, + "step": 8856 + }, + { + "epoch": 4.187706855791962, + "grad_norm": 3.461308240890503, + "learning_rate": 1.0589418655321962e-06, + "loss": 0.3744, + "step": 8857 + }, + { 
+ "epoch": 4.188179669030733, + "grad_norm": 3.3546712398529053, + "learning_rate": 1.0584321493824317e-06, + "loss": 0.4116, + "step": 8858 + }, + { + "epoch": 4.188652482269504, + "grad_norm": 3.233792543411255, + "learning_rate": 1.0579225229927775e-06, + "loss": 0.3591, + "step": 8859 + }, + { + "epoch": 4.1891252955082745, + "grad_norm": 3.295444965362549, + "learning_rate": 1.0574129863949633e-06, + "loss": 0.3179, + "step": 8860 + }, + { + "epoch": 4.189598108747045, + "grad_norm": 3.403062105178833, + "learning_rate": 1.0569035396207178e-06, + "loss": 0.3948, + "step": 8861 + }, + { + "epoch": 4.190070921985815, + "grad_norm": 2.901970148086548, + "learning_rate": 1.0563941827017613e-06, + "loss": 0.3537, + "step": 8862 + }, + { + "epoch": 4.190543735224586, + "grad_norm": 3.1239142417907715, + "learning_rate": 1.0558849156698078e-06, + "loss": 0.3764, + "step": 8863 + }, + { + "epoch": 4.191016548463357, + "grad_norm": 2.8480169773101807, + "learning_rate": 1.0553757385565694e-06, + "loss": 0.3085, + "step": 8864 + }, + { + "epoch": 4.191489361702128, + "grad_norm": 3.0914061069488525, + "learning_rate": 1.0548666513937487e-06, + "loss": 0.3003, + "step": 8865 + }, + { + "epoch": 4.191962174940898, + "grad_norm": 2.9875683784484863, + "learning_rate": 1.0543576542130452e-06, + "loss": 0.3178, + "step": 8866 + }, + { + "epoch": 4.192434988179669, + "grad_norm": 2.952052354812622, + "learning_rate": 1.053848747046152e-06, + "loss": 0.3221, + "step": 8867 + }, + { + "epoch": 4.19290780141844, + "grad_norm": 3.2211997509002686, + "learning_rate": 1.0533399299247559e-06, + "loss": 0.3698, + "step": 8868 + }, + { + "epoch": 4.193380614657211, + "grad_norm": 3.2954046726226807, + "learning_rate": 1.0528312028805392e-06, + "loss": 0.3697, + "step": 8869 + }, + { + "epoch": 4.193853427895981, + "grad_norm": 2.978306293487549, + "learning_rate": 1.0523225659451768e-06, + "loss": 0.3358, + "step": 8870 + }, + { + "epoch": 4.1943262411347515, + "grad_norm": 3.3803653717041016, + "learning_rate": 1.0518140191503415e-06, + "loss": 0.3851, + "step": 8871 + }, + { + "epoch": 4.194799054373522, + "grad_norm": 3.282294273376465, + "learning_rate": 1.051305562527697e-06, + "loss": 0.4518, + "step": 8872 + }, + { + "epoch": 4.195271867612293, + "grad_norm": 2.950310468673706, + "learning_rate": 1.0507971961089017e-06, + "loss": 0.3045, + "step": 8873 + }, + { + "epoch": 4.195744680851064, + "grad_norm": 3.4069037437438965, + "learning_rate": 1.0502889199256114e-06, + "loss": 0.3832, + "step": 8874 + }, + { + "epoch": 4.196217494089835, + "grad_norm": 3.1440858840942383, + "learning_rate": 1.0497807340094722e-06, + "loss": 0.2958, + "step": 8875 + }, + { + "epoch": 4.1966903073286055, + "grad_norm": 3.050755262374878, + "learning_rate": 1.049272638392129e-06, + "loss": 0.3494, + "step": 8876 + }, + { + "epoch": 4.197163120567376, + "grad_norm": 2.908078670501709, + "learning_rate": 1.0487646331052171e-06, + "loss": 0.349, + "step": 8877 + }, + { + "epoch": 4.197635933806146, + "grad_norm": 3.2089946269989014, + "learning_rate": 1.048256718180367e-06, + "loss": 0.3507, + "step": 8878 + }, + { + "epoch": 4.198108747044917, + "grad_norm": 2.984745740890503, + "learning_rate": 1.0477488936492067e-06, + "loss": 0.3252, + "step": 8879 + }, + { + "epoch": 4.198581560283688, + "grad_norm": 2.9207515716552734, + "learning_rate": 1.0472411595433545e-06, + "loss": 0.3192, + "step": 8880 + }, + { + "epoch": 4.199054373522459, + "grad_norm": 3.0090811252593994, + "learning_rate": 1.0467335158944242e-06, + "loss": 
0.3827, + "step": 8881 + }, + { + "epoch": 4.199527186761229, + "grad_norm": 3.2763171195983887, + "learning_rate": 1.0462259627340265e-06, + "loss": 0.3481, + "step": 8882 + }, + { + "epoch": 4.2, + "grad_norm": 3.068268299102783, + "learning_rate": 1.0457185000937636e-06, + "loss": 0.3926, + "step": 8883 + }, + { + "epoch": 4.200472813238771, + "grad_norm": 2.6999998092651367, + "learning_rate": 1.0452111280052326e-06, + "loss": 0.2884, + "step": 8884 + }, + { + "epoch": 4.200945626477542, + "grad_norm": 3.1187727451324463, + "learning_rate": 1.044703846500026e-06, + "loss": 0.3797, + "step": 8885 + }, + { + "epoch": 4.201418439716312, + "grad_norm": 2.7876172065734863, + "learning_rate": 1.0441966556097283e-06, + "loss": 0.3284, + "step": 8886 + }, + { + "epoch": 4.2018912529550825, + "grad_norm": 2.973261833190918, + "learning_rate": 1.0436895553659224e-06, + "loss": 0.2845, + "step": 8887 + }, + { + "epoch": 4.202364066193853, + "grad_norm": 3.496096611022949, + "learning_rate": 1.0431825458001811e-06, + "loss": 0.3341, + "step": 8888 + }, + { + "epoch": 4.202836879432624, + "grad_norm": 3.370410680770874, + "learning_rate": 1.0426756269440761e-06, + "loss": 0.3459, + "step": 8889 + }, + { + "epoch": 4.203309692671395, + "grad_norm": 2.864126682281494, + "learning_rate": 1.0421687988291693e-06, + "loss": 0.3195, + "step": 8890 + }, + { + "epoch": 4.203782505910166, + "grad_norm": 3.3575501441955566, + "learning_rate": 1.0416620614870181e-06, + "loss": 0.3424, + "step": 8891 + }, + { + "epoch": 4.2042553191489365, + "grad_norm": 3.4441967010498047, + "learning_rate": 1.0411554149491766e-06, + "loss": 0.3677, + "step": 8892 + }, + { + "epoch": 4.204728132387707, + "grad_norm": 3.014472007751465, + "learning_rate": 1.0406488592471898e-06, + "loss": 0.3004, + "step": 8893 + }, + { + "epoch": 4.205200945626477, + "grad_norm": 3.1186721324920654, + "learning_rate": 1.0401423944126002e-06, + "loss": 0.4182, + "step": 8894 + }, + { + "epoch": 4.205673758865248, + "grad_norm": 3.166337013244629, + "learning_rate": 1.0396360204769426e-06, + "loss": 0.3303, + "step": 8895 + }, + { + "epoch": 4.206146572104019, + "grad_norm": 3.081855058670044, + "learning_rate": 1.0391297374717454e-06, + "loss": 0.3096, + "step": 8896 + }, + { + "epoch": 4.20661938534279, + "grad_norm": 3.0924830436706543, + "learning_rate": 1.0386235454285348e-06, + "loss": 0.3238, + "step": 8897 + }, + { + "epoch": 4.20709219858156, + "grad_norm": 3.043519973754883, + "learning_rate": 1.0381174443788277e-06, + "loss": 0.3322, + "step": 8898 + }, + { + "epoch": 4.207565011820331, + "grad_norm": 3.160785675048828, + "learning_rate": 1.0376114343541377e-06, + "loss": 0.3244, + "step": 8899 + }, + { + "epoch": 4.208037825059102, + "grad_norm": 2.9988417625427246, + "learning_rate": 1.037105515385971e-06, + "loss": 0.3386, + "step": 8900 + }, + { + "epoch": 4.208510638297873, + "grad_norm": 2.981959342956543, + "learning_rate": 1.0365996875058284e-06, + "loss": 0.3412, + "step": 8901 + }, + { + "epoch": 4.208983451536643, + "grad_norm": 3.144815683364868, + "learning_rate": 1.0360939507452075e-06, + "loss": 0.3716, + "step": 8902 + }, + { + "epoch": 4.2094562647754135, + "grad_norm": 2.9644055366516113, + "learning_rate": 1.0355883051355972e-06, + "loss": 0.3488, + "step": 8903 + }, + { + "epoch": 4.209929078014184, + "grad_norm": 3.3212029933929443, + "learning_rate": 1.035082750708481e-06, + "loss": 0.3048, + "step": 8904 + }, + { + "epoch": 4.210401891252955, + "grad_norm": 2.82843279838562, + "learning_rate": 
1.034577287495339e-06, + "loss": 0.3141, + "step": 8905 + }, + { + "epoch": 4.210874704491726, + "grad_norm": 3.040215253829956, + "learning_rate": 1.034071915527643e-06, + "loss": 0.3517, + "step": 8906 + }, + { + "epoch": 4.211347517730497, + "grad_norm": 2.850985288619995, + "learning_rate": 1.033566634836862e-06, + "loss": 0.3556, + "step": 8907 + }, + { + "epoch": 4.2118203309692674, + "grad_norm": 3.522962808609009, + "learning_rate": 1.0330614454544564e-06, + "loss": 0.3432, + "step": 8908 + }, + { + "epoch": 4.212293144208038, + "grad_norm": 3.0228631496429443, + "learning_rate": 1.032556347411881e-06, + "loss": 0.3165, + "step": 8909 + }, + { + "epoch": 4.212765957446808, + "grad_norm": 3.275134563446045, + "learning_rate": 1.0320513407405886e-06, + "loss": 0.413, + "step": 8910 + }, + { + "epoch": 4.213238770685579, + "grad_norm": 2.850020408630371, + "learning_rate": 1.0315464254720213e-06, + "loss": 0.3051, + "step": 8911 + }, + { + "epoch": 4.21371158392435, + "grad_norm": 3.153916597366333, + "learning_rate": 1.0310416016376203e-06, + "loss": 0.2973, + "step": 8912 + }, + { + "epoch": 4.2141843971631205, + "grad_norm": 3.423772096633911, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.302, + "step": 8913 + }, + { + "epoch": 4.214657210401891, + "grad_norm": 3.420687198638916, + "learning_rate": 1.0300322283970404e-06, + "loss": 0.3732, + "step": 8914 + }, + { + "epoch": 4.215130023640662, + "grad_norm": 3.2490479946136475, + "learning_rate": 1.02952767905371e-06, + "loss": 0.3793, + "step": 8915 + }, + { + "epoch": 4.215602836879433, + "grad_norm": 3.3043079376220703, + "learning_rate": 1.0290232212702438e-06, + "loss": 0.3472, + "step": 8916 + }, + { + "epoch": 4.216075650118204, + "grad_norm": 3.152435779571533, + "learning_rate": 1.0285188550780516e-06, + "loss": 0.3617, + "step": 8917 + }, + { + "epoch": 4.216548463356974, + "grad_norm": 3.311063766479492, + "learning_rate": 1.0280145805085384e-06, + "loss": 0.3681, + "step": 8918 + }, + { + "epoch": 4.217021276595744, + "grad_norm": 3.1113057136535645, + "learning_rate": 1.0275103975931016e-06, + "loss": 0.3526, + "step": 8919 + }, + { + "epoch": 4.217494089834515, + "grad_norm": 2.7904412746429443, + "learning_rate": 1.0270063063631369e-06, + "loss": 0.3125, + "step": 8920 + }, + { + "epoch": 4.217966903073286, + "grad_norm": 3.3566761016845703, + "learning_rate": 1.0265023068500293e-06, + "loss": 0.3305, + "step": 8921 + }, + { + "epoch": 4.218439716312057, + "grad_norm": 2.97943115234375, + "learning_rate": 1.0259983990851633e-06, + "loss": 0.3277, + "step": 8922 + }, + { + "epoch": 4.218912529550828, + "grad_norm": 3.1507925987243652, + "learning_rate": 1.0254945830999134e-06, + "loss": 0.385, + "step": 8923 + }, + { + "epoch": 4.219385342789598, + "grad_norm": 2.632859706878662, + "learning_rate": 1.0249908589256493e-06, + "loss": 0.2889, + "step": 8924 + }, + { + "epoch": 4.219858156028369, + "grad_norm": 2.9816136360168457, + "learning_rate": 1.0244872265937378e-06, + "loss": 0.2838, + "step": 8925 + }, + { + "epoch": 4.220330969267139, + "grad_norm": 2.751431465148926, + "learning_rate": 1.0239836861355369e-06, + "loss": 0.3069, + "step": 8926 + }, + { + "epoch": 4.22080378250591, + "grad_norm": 3.3390228748321533, + "learning_rate": 1.0234802375823985e-06, + "loss": 0.3074, + "step": 8927 + }, + { + "epoch": 4.221276595744681, + "grad_norm": 3.345242500305176, + "learning_rate": 1.0229768809656726e-06, + "loss": 0.3603, + "step": 8928 + }, + { + "epoch": 4.2217494089834515, + "grad_norm": 
3.0684640407562256, + "learning_rate": 1.0224736163166984e-06, + "loss": 0.3343, + "step": 8929 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 3.2813572883605957, + "learning_rate": 1.0219704436668146e-06, + "loss": 0.3173, + "step": 8930 + }, + { + "epoch": 4.222695035460993, + "grad_norm": 3.135668992996216, + "learning_rate": 1.0214673630473504e-06, + "loss": 0.3608, + "step": 8931 + }, + { + "epoch": 4.223167848699764, + "grad_norm": 2.8798727989196777, + "learning_rate": 1.0209643744896303e-06, + "loss": 0.3317, + "step": 8932 + }, + { + "epoch": 4.223640661938534, + "grad_norm": 3.2659590244293213, + "learning_rate": 1.0204614780249731e-06, + "loss": 0.3017, + "step": 8933 + }, + { + "epoch": 4.224113475177305, + "grad_norm": 2.988126516342163, + "learning_rate": 1.0199586736846911e-06, + "loss": 0.3555, + "step": 8934 + }, + { + "epoch": 4.224586288416075, + "grad_norm": 3.3775575160980225, + "learning_rate": 1.0194559615000937e-06, + "loss": 0.3966, + "step": 8935 + }, + { + "epoch": 4.225059101654846, + "grad_norm": 3.1004798412323, + "learning_rate": 1.0189533415024817e-06, + "loss": 0.3192, + "step": 8936 + }, + { + "epoch": 4.225531914893617, + "grad_norm": 3.1722211837768555, + "learning_rate": 1.0184508137231498e-06, + "loss": 0.3075, + "step": 8937 + }, + { + "epoch": 4.226004728132388, + "grad_norm": 3.0679538249969482, + "learning_rate": 1.0179483781933903e-06, + "loss": 0.3475, + "step": 8938 + }, + { + "epoch": 4.226477541371159, + "grad_norm": 3.079246759414673, + "learning_rate": 1.0174460349444857e-06, + "loss": 0.4054, + "step": 8939 + }, + { + "epoch": 4.226950354609929, + "grad_norm": 3.308229684829712, + "learning_rate": 1.0169437840077169e-06, + "loss": 0.3455, + "step": 8940 + }, + { + "epoch": 4.2274231678487, + "grad_norm": 3.363147258758545, + "learning_rate": 1.0164416254143552e-06, + "loss": 0.3538, + "step": 8941 + }, + { + "epoch": 4.22789598108747, + "grad_norm": 2.7227768898010254, + "learning_rate": 1.0159395591956677e-06, + "loss": 0.3206, + "step": 8942 + }, + { + "epoch": 4.228368794326241, + "grad_norm": 3.0010764598846436, + "learning_rate": 1.0154375853829175e-06, + "loss": 0.3593, + "step": 8943 + }, + { + "epoch": 4.228841607565012, + "grad_norm": 3.0478785037994385, + "learning_rate": 1.0149357040073581e-06, + "loss": 0.3808, + "step": 8944 + }, + { + "epoch": 4.2293144208037825, + "grad_norm": 2.804421901702881, + "learning_rate": 1.0144339151002416e-06, + "loss": 0.3633, + "step": 8945 + }, + { + "epoch": 4.229787234042553, + "grad_norm": 3.563140630722046, + "learning_rate": 1.013932218692811e-06, + "loss": 0.2843, + "step": 8946 + }, + { + "epoch": 4.230260047281324, + "grad_norm": 3.1959750652313232, + "learning_rate": 1.0134306148163051e-06, + "loss": 0.377, + "step": 8947 + }, + { + "epoch": 4.230732860520095, + "grad_norm": 3.0841214656829834, + "learning_rate": 1.0129291035019565e-06, + "loss": 0.3234, + "step": 8948 + }, + { + "epoch": 4.231205673758865, + "grad_norm": 3.3893179893493652, + "learning_rate": 1.0124276847809911e-06, + "loss": 0.3823, + "step": 8949 + }, + { + "epoch": 4.231678486997636, + "grad_norm": 3.2250518798828125, + "learning_rate": 1.0119263586846316e-06, + "loss": 0.35, + "step": 8950 + }, + { + "epoch": 4.232151300236406, + "grad_norm": 3.287285566329956, + "learning_rate": 1.0114251252440928e-06, + "loss": 0.3306, + "step": 8951 + }, + { + "epoch": 4.232624113475177, + "grad_norm": 3.5018274784088135, + "learning_rate": 1.0109239844905836e-06, + "loss": 0.378, + "step": 8952 + }, + { + "epoch": 
4.233096926713948, + "grad_norm": 3.224838972091675, + "learning_rate": 1.0104229364553093e-06, + "loss": 0.3379, + "step": 8953 + }, + { + "epoch": 4.233569739952719, + "grad_norm": 3.2302494049072266, + "learning_rate": 1.0099219811694668e-06, + "loss": 0.358, + "step": 8954 + }, + { + "epoch": 4.23404255319149, + "grad_norm": 3.098205804824829, + "learning_rate": 1.0094211186642483e-06, + "loss": 0.3669, + "step": 8955 + }, + { + "epoch": 4.23451536643026, + "grad_norm": 3.0045907497406006, + "learning_rate": 1.0089203489708415e-06, + "loss": 0.3293, + "step": 8956 + }, + { + "epoch": 4.234988179669031, + "grad_norm": 3.245818853378296, + "learning_rate": 1.0084196721204254e-06, + "loss": 0.3365, + "step": 8957 + }, + { + "epoch": 4.235460992907801, + "grad_norm": 2.8547208309173584, + "learning_rate": 1.007919088144177e-06, + "loss": 0.3072, + "step": 8958 + }, + { + "epoch": 4.235933806146572, + "grad_norm": 3.2914109230041504, + "learning_rate": 1.0074185970732642e-06, + "loss": 0.3734, + "step": 8959 + }, + { + "epoch": 4.236406619385343, + "grad_norm": 2.527096748352051, + "learning_rate": 1.0069181989388496e-06, + "loss": 0.3091, + "step": 8960 + }, + { + "epoch": 4.2368794326241135, + "grad_norm": 2.921369791030884, + "learning_rate": 1.006417893772093e-06, + "loss": 0.3162, + "step": 8961 + }, + { + "epoch": 4.237352245862884, + "grad_norm": 2.8698911666870117, + "learning_rate": 1.005917681604145e-06, + "loss": 0.3818, + "step": 8962 + }, + { + "epoch": 4.237825059101655, + "grad_norm": 2.958021402359009, + "learning_rate": 1.0054175624661514e-06, + "loss": 0.2934, + "step": 8963 + }, + { + "epoch": 4.238297872340426, + "grad_norm": 3.0945863723754883, + "learning_rate": 1.0049175363892527e-06, + "loss": 0.384, + "step": 8964 + }, + { + "epoch": 4.238770685579196, + "grad_norm": 2.890333890914917, + "learning_rate": 1.0044176034045822e-06, + "loss": 0.3312, + "step": 8965 + }, + { + "epoch": 4.239243498817967, + "grad_norm": 3.460975408554077, + "learning_rate": 1.0039177635432706e-06, + "loss": 0.4015, + "step": 8966 + }, + { + "epoch": 4.239716312056737, + "grad_norm": 3.5411946773529053, + "learning_rate": 1.003418016836439e-06, + "loss": 0.3307, + "step": 8967 + }, + { + "epoch": 4.240189125295508, + "grad_norm": 3.3310446739196777, + "learning_rate": 1.0029183633152061e-06, + "loss": 0.3154, + "step": 8968 + }, + { + "epoch": 4.240661938534279, + "grad_norm": 3.121110677719116, + "learning_rate": 1.0024188030106822e-06, + "loss": 0.3827, + "step": 8969 + }, + { + "epoch": 4.24113475177305, + "grad_norm": 3.421278715133667, + "learning_rate": 1.0019193359539717e-06, + "loss": 0.3677, + "step": 8970 + }, + { + "epoch": 4.241607565011821, + "grad_norm": 3.1790332794189453, + "learning_rate": 1.0014199621761761e-06, + "loss": 0.3219, + "step": 8971 + }, + { + "epoch": 4.242080378250591, + "grad_norm": 3.238412380218506, + "learning_rate": 1.0009206817083878e-06, + "loss": 0.3296, + "step": 8972 + }, + { + "epoch": 4.242553191489361, + "grad_norm": 3.0206923484802246, + "learning_rate": 1.0004214945816959e-06, + "loss": 0.3769, + "step": 8973 + }, + { + "epoch": 4.243026004728132, + "grad_norm": 3.2117667198181152, + "learning_rate": 9.999224008271822e-07, + "loss": 0.3876, + "step": 8974 + }, + { + "epoch": 4.243498817966903, + "grad_norm": 2.849250316619873, + "learning_rate": 9.99423400475922e-07, + "loss": 0.3267, + "step": 8975 + }, + { + "epoch": 4.243971631205674, + "grad_norm": 3.084845542907715, + "learning_rate": 9.989244935589878e-07, + "loss": 0.3074, + "step": 8976 
+ }, + { + "epoch": 4.2444444444444445, + "grad_norm": 3.0177342891693115, + "learning_rate": 9.984256801074434e-07, + "loss": 0.3524, + "step": 8977 + }, + { + "epoch": 4.244917257683215, + "grad_norm": 3.196692943572998, + "learning_rate": 9.979269601523477e-07, + "loss": 0.3943, + "step": 8978 + }, + { + "epoch": 4.245390070921986, + "grad_norm": 2.849760055541992, + "learning_rate": 9.97428333724753e-07, + "loss": 0.3145, + "step": 8979 + }, + { + "epoch": 4.245862884160757, + "grad_norm": 3.003265857696533, + "learning_rate": 9.969298008557083e-07, + "loss": 0.3393, + "step": 8980 + }, + { + "epoch": 4.246335697399527, + "grad_norm": 2.925597667694092, + "learning_rate": 9.96431361576254e-07, + "loss": 0.331, + "step": 8981 + }, + { + "epoch": 4.246808510638298, + "grad_norm": 2.87599515914917, + "learning_rate": 9.959330159174257e-07, + "loss": 0.3102, + "step": 8982 + }, + { + "epoch": 4.247281323877068, + "grad_norm": 2.841588020324707, + "learning_rate": 9.954347639102528e-07, + "loss": 0.3261, + "step": 8983 + }, + { + "epoch": 4.247754137115839, + "grad_norm": 3.14918851852417, + "learning_rate": 9.949366055857605e-07, + "loss": 0.3366, + "step": 8984 + }, + { + "epoch": 4.24822695035461, + "grad_norm": 3.113927125930786, + "learning_rate": 9.944385409749654e-07, + "loss": 0.3532, + "step": 8985 + }, + { + "epoch": 4.248699763593381, + "grad_norm": 3.0749151706695557, + "learning_rate": 9.939405701088818e-07, + "loss": 0.3659, + "step": 8986 + }, + { + "epoch": 4.2491725768321515, + "grad_norm": 2.831846237182617, + "learning_rate": 9.934426930185145e-07, + "loss": 0.2965, + "step": 8987 + }, + { + "epoch": 4.249645390070922, + "grad_norm": 3.0280253887176514, + "learning_rate": 9.929449097348642e-07, + "loss": 0.3144, + "step": 8988 + }, + { + "epoch": 4.250118203309692, + "grad_norm": 3.250284433364868, + "learning_rate": 9.924472202889267e-07, + "loss": 0.3414, + "step": 8989 + }, + { + "epoch": 4.250591016548463, + "grad_norm": 3.582306146621704, + "learning_rate": 9.9194962471169e-07, + "loss": 0.3593, + "step": 8990 + }, + { + "epoch": 4.251063829787234, + "grad_norm": 2.8985490798950195, + "learning_rate": 9.914521230341382e-07, + "loss": 0.2948, + "step": 8991 + }, + { + "epoch": 4.251536643026005, + "grad_norm": 3.399209499359131, + "learning_rate": 9.909547152872476e-07, + "loss": 0.3942, + "step": 8992 + }, + { + "epoch": 4.2520094562647754, + "grad_norm": 3.344658613204956, + "learning_rate": 9.904574015019895e-07, + "loss": 0.3649, + "step": 8993 + }, + { + "epoch": 4.252482269503546, + "grad_norm": 3.057995319366455, + "learning_rate": 9.899601817093305e-07, + "loss": 0.317, + "step": 8994 + }, + { + "epoch": 4.252955082742317, + "grad_norm": 3.4610090255737305, + "learning_rate": 9.894630559402296e-07, + "loss": 0.3235, + "step": 8995 + }, + { + "epoch": 4.253427895981088, + "grad_norm": 3.6014657020568848, + "learning_rate": 9.889660242256407e-07, + "loss": 0.3057, + "step": 8996 + }, + { + "epoch": 4.253900709219858, + "grad_norm": 2.850391149520874, + "learning_rate": 9.884690865965118e-07, + "loss": 0.3584, + "step": 8997 + }, + { + "epoch": 4.2543735224586285, + "grad_norm": 3.100820541381836, + "learning_rate": 9.879722430837844e-07, + "loss": 0.3802, + "step": 8998 + }, + { + "epoch": 4.254846335697399, + "grad_norm": 3.1044704914093018, + "learning_rate": 9.874754937183962e-07, + "loss": 0.3293, + "step": 8999 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 2.750356912612915, + "learning_rate": 9.869788385312764e-07, + "loss": 0.3218, + "step": 9000 + 
}, + { + "epoch": 4.255791962174941, + "grad_norm": 3.1008687019348145, + "learning_rate": 9.864822775533494e-07, + "loss": 0.3316, + "step": 9001 + }, + { + "epoch": 4.256264775413712, + "grad_norm": 3.2051985263824463, + "learning_rate": 9.859858108155351e-07, + "loss": 0.3661, + "step": 9002 + }, + { + "epoch": 4.2567375886524825, + "grad_norm": 3.1303839683532715, + "learning_rate": 9.854894383487448e-07, + "loss": 0.3683, + "step": 9003 + }, + { + "epoch": 4.257210401891253, + "grad_norm": 3.0718302726745605, + "learning_rate": 9.84993160183887e-07, + "loss": 0.3284, + "step": 9004 + }, + { + "epoch": 4.257683215130023, + "grad_norm": 2.9759013652801514, + "learning_rate": 9.844969763518625e-07, + "loss": 0.3465, + "step": 9005 + }, + { + "epoch": 4.258156028368794, + "grad_norm": 3.1965582370758057, + "learning_rate": 9.840008868835647e-07, + "loss": 0.3593, + "step": 9006 + }, + { + "epoch": 4.258628841607565, + "grad_norm": 3.2931249141693115, + "learning_rate": 9.835048918098853e-07, + "loss": 0.3631, + "step": 9007 + }, + { + "epoch": 4.259101654846336, + "grad_norm": 3.070627450942993, + "learning_rate": 9.830089911617054e-07, + "loss": 0.3541, + "step": 9008 + }, + { + "epoch": 4.259574468085106, + "grad_norm": 3.209110736846924, + "learning_rate": 9.825131849699051e-07, + "loss": 0.344, + "step": 9009 + }, + { + "epoch": 4.260047281323877, + "grad_norm": 3.2239089012145996, + "learning_rate": 9.820174732653545e-07, + "loss": 0.3469, + "step": 9010 + }, + { + "epoch": 4.260520094562648, + "grad_norm": 3.0812292098999023, + "learning_rate": 9.815218560789199e-07, + "loss": 0.2898, + "step": 9011 + }, + { + "epoch": 4.260992907801419, + "grad_norm": 3.1709752082824707, + "learning_rate": 9.81026333441461e-07, + "loss": 0.381, + "step": 9012 + }, + { + "epoch": 4.261465721040189, + "grad_norm": 3.1551907062530518, + "learning_rate": 9.805309053838308e-07, + "loss": 0.2959, + "step": 9013 + }, + { + "epoch": 4.2619385342789595, + "grad_norm": 3.3751494884490967, + "learning_rate": 9.800355719368793e-07, + "loss": 0.3806, + "step": 9014 + }, + { + "epoch": 4.26241134751773, + "grad_norm": 3.2392799854278564, + "learning_rate": 9.795403331314479e-07, + "loss": 0.3006, + "step": 9015 + }, + { + "epoch": 4.262884160756501, + "grad_norm": 3.1428463459014893, + "learning_rate": 9.790451889983724e-07, + "loss": 0.3212, + "step": 9016 + }, + { + "epoch": 4.263356973995272, + "grad_norm": 3.353379726409912, + "learning_rate": 9.785501395684844e-07, + "loss": 0.3555, + "step": 9017 + }, + { + "epoch": 4.263829787234043, + "grad_norm": 3.3555281162261963, + "learning_rate": 9.780551848726068e-07, + "loss": 0.3729, + "step": 9018 + }, + { + "epoch": 4.2643026004728135, + "grad_norm": 3.0275049209594727, + "learning_rate": 9.775603249415606e-07, + "loss": 0.3579, + "step": 9019 + }, + { + "epoch": 4.264775413711584, + "grad_norm": 3.2631473541259766, + "learning_rate": 9.770655598061569e-07, + "loss": 0.3755, + "step": 9020 + }, + { + "epoch": 4.265248226950354, + "grad_norm": 2.9419705867767334, + "learning_rate": 9.76570889497202e-07, + "loss": 0.28, + "step": 9021 + }, + { + "epoch": 4.265721040189125, + "grad_norm": 2.931673288345337, + "learning_rate": 9.76076314045499e-07, + "loss": 0.3497, + "step": 9022 + }, + { + "epoch": 4.266193853427896, + "grad_norm": 3.218503952026367, + "learning_rate": 9.755818334818416e-07, + "loss": 0.3775, + "step": 9023 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 3.1422977447509766, + "learning_rate": 9.750874478370181e-07, + "loss": 0.3957, + 
"step": 9024 + }, + { + "epoch": 4.267139479905437, + "grad_norm": 3.066502571105957, + "learning_rate": 9.745931571418134e-07, + "loss": 0.3361, + "step": 9025 + }, + { + "epoch": 4.267612293144208, + "grad_norm": 3.186897039413452, + "learning_rate": 9.740989614270044e-07, + "loss": 0.3794, + "step": 9026 + }, + { + "epoch": 4.268085106382979, + "grad_norm": 3.2698588371276855, + "learning_rate": 9.736048607233623e-07, + "loss": 0.3595, + "step": 9027 + }, + { + "epoch": 4.26855791962175, + "grad_norm": 2.9609718322753906, + "learning_rate": 9.731108550616523e-07, + "loss": 0.3387, + "step": 9028 + }, + { + "epoch": 4.26903073286052, + "grad_norm": 3.10768985748291, + "learning_rate": 9.72616944472633e-07, + "loss": 0.387, + "step": 9029 + }, + { + "epoch": 4.2695035460992905, + "grad_norm": 2.8060896396636963, + "learning_rate": 9.721231289870602e-07, + "loss": 0.3132, + "step": 9030 + }, + { + "epoch": 4.269976359338061, + "grad_norm": 3.0502681732177734, + "learning_rate": 9.716294086356801e-07, + "loss": 0.3246, + "step": 9031 + }, + { + "epoch": 4.270449172576832, + "grad_norm": 2.8298611640930176, + "learning_rate": 9.711357834492356e-07, + "loss": 0.2958, + "step": 9032 + }, + { + "epoch": 4.270921985815603, + "grad_norm": 2.693819761276245, + "learning_rate": 9.70642253458462e-07, + "loss": 0.325, + "step": 9033 + }, + { + "epoch": 4.271394799054374, + "grad_norm": 2.8179452419281006, + "learning_rate": 9.701488186940885e-07, + "loss": 0.3252, + "step": 9034 + }, + { + "epoch": 4.2718676122931445, + "grad_norm": 2.9885077476501465, + "learning_rate": 9.696554791868406e-07, + "loss": 0.3234, + "step": 9035 + }, + { + "epoch": 4.272340425531915, + "grad_norm": 4.8119378089904785, + "learning_rate": 9.691622349674349e-07, + "loss": 0.3814, + "step": 9036 + }, + { + "epoch": 4.272813238770685, + "grad_norm": 3.971498966217041, + "learning_rate": 9.68669086066585e-07, + "loss": 0.3684, + "step": 9037 + }, + { + "epoch": 4.273286052009456, + "grad_norm": 3.0153439044952393, + "learning_rate": 9.681760325149967e-07, + "loss": 0.3449, + "step": 9038 + }, + { + "epoch": 4.273758865248227, + "grad_norm": 3.4421799182891846, + "learning_rate": 9.676830743433688e-07, + "loss": 0.3578, + "step": 9039 + }, + { + "epoch": 4.274231678486998, + "grad_norm": 3.2896533012390137, + "learning_rate": 9.67190211582398e-07, + "loss": 0.3421, + "step": 9040 + }, + { + "epoch": 4.274704491725768, + "grad_norm": 3.388833522796631, + "learning_rate": 9.666974442627717e-07, + "loss": 0.4089, + "step": 9041 + }, + { + "epoch": 4.275177304964539, + "grad_norm": 3.1000685691833496, + "learning_rate": 9.662047724151718e-07, + "loss": 0.4046, + "step": 9042 + }, + { + "epoch": 4.27565011820331, + "grad_norm": 3.5651235580444336, + "learning_rate": 9.657121960702753e-07, + "loss": 0.4275, + "step": 9043 + }, + { + "epoch": 4.276122931442081, + "grad_norm": 2.944434881210327, + "learning_rate": 9.65219715258752e-07, + "loss": 0.3395, + "step": 9044 + }, + { + "epoch": 4.276595744680851, + "grad_norm": 2.7315311431884766, + "learning_rate": 9.64727330011268e-07, + "loss": 0.3305, + "step": 9045 + }, + { + "epoch": 4.2770685579196215, + "grad_norm": 3.423567533493042, + "learning_rate": 9.642350403584805e-07, + "loss": 0.3605, + "step": 9046 + }, + { + "epoch": 4.277541371158392, + "grad_norm": 3.239745616912842, + "learning_rate": 9.637428463310435e-07, + "loss": 0.3519, + "step": 9047 + }, + { + "epoch": 4.278014184397163, + "grad_norm": 3.388700008392334, + "learning_rate": 9.632507479596035e-07, + "loss": 0.359, 
+ "step": 9048 + }, + { + "epoch": 4.278486997635934, + "grad_norm": 3.3524253368377686, + "learning_rate": 9.627587452747996e-07, + "loss": 0.3381, + "step": 9049 + }, + { + "epoch": 4.278959810874705, + "grad_norm": 3.1089365482330322, + "learning_rate": 9.622668383072695e-07, + "loss": 0.3143, + "step": 9050 + }, + { + "epoch": 4.2794326241134755, + "grad_norm": 3.3477213382720947, + "learning_rate": 9.617750270876402e-07, + "loss": 0.3788, + "step": 9051 + }, + { + "epoch": 4.279905437352246, + "grad_norm": 2.934818983078003, + "learning_rate": 9.612833116465342e-07, + "loss": 0.3589, + "step": 9052 + }, + { + "epoch": 4.280378250591016, + "grad_norm": 3.125014305114746, + "learning_rate": 9.607916920145704e-07, + "loss": 0.3181, + "step": 9053 + }, + { + "epoch": 4.280851063829787, + "grad_norm": 3.5860400199890137, + "learning_rate": 9.60300168222358e-07, + "loss": 0.3597, + "step": 9054 + }, + { + "epoch": 4.281323877068558, + "grad_norm": 3.1414008140563965, + "learning_rate": 9.598087403005032e-07, + "loss": 0.3186, + "step": 9055 + }, + { + "epoch": 4.281796690307329, + "grad_norm": 2.841228723526001, + "learning_rate": 9.593174082796046e-07, + "loss": 0.3547, + "step": 9056 + }, + { + "epoch": 4.282269503546099, + "grad_norm": 3.1145405769348145, + "learning_rate": 9.588261721902547e-07, + "loss": 0.3317, + "step": 9057 + }, + { + "epoch": 4.28274231678487, + "grad_norm": 2.9518024921417236, + "learning_rate": 9.58335032063042e-07, + "loss": 0.3723, + "step": 9058 + }, + { + "epoch": 4.283215130023641, + "grad_norm": 2.887479782104492, + "learning_rate": 9.578439879285467e-07, + "loss": 0.3288, + "step": 9059 + }, + { + "epoch": 4.283687943262412, + "grad_norm": 3.253427267074585, + "learning_rate": 9.573530398173444e-07, + "loss": 0.326, + "step": 9060 + }, + { + "epoch": 4.284160756501182, + "grad_norm": 3.0442020893096924, + "learning_rate": 9.568621877600038e-07, + "loss": 0.3807, + "step": 9061 + }, + { + "epoch": 4.2846335697399525, + "grad_norm": 2.928743600845337, + "learning_rate": 9.563714317870877e-07, + "loss": 0.2977, + "step": 9062 + }, + { + "epoch": 4.285106382978723, + "grad_norm": 3.2095022201538086, + "learning_rate": 9.558807719291543e-07, + "loss": 0.3571, + "step": 9063 + }, + { + "epoch": 4.285579196217494, + "grad_norm": 3.3752429485321045, + "learning_rate": 9.55390208216754e-07, + "loss": 0.3928, + "step": 9064 + }, + { + "epoch": 4.286052009456265, + "grad_norm": 3.125702381134033, + "learning_rate": 9.548997406804333e-07, + "loss": 0.3334, + "step": 9065 + }, + { + "epoch": 4.286524822695036, + "grad_norm": 3.058772563934326, + "learning_rate": 9.544093693507308e-07, + "loss": 0.3184, + "step": 9066 + }, + { + "epoch": 4.2869976359338064, + "grad_norm": 3.1085948944091797, + "learning_rate": 9.539190942581785e-07, + "loss": 0.3626, + "step": 9067 + }, + { + "epoch": 4.287470449172577, + "grad_norm": 2.8497378826141357, + "learning_rate": 9.53428915433306e-07, + "loss": 0.3451, + "step": 9068 + }, + { + "epoch": 4.287943262411347, + "grad_norm": 3.411508798599243, + "learning_rate": 9.529388329066325e-07, + "loss": 0.3608, + "step": 9069 + }, + { + "epoch": 4.288416075650118, + "grad_norm": 3.1312575340270996, + "learning_rate": 9.524488467086751e-07, + "loss": 0.3419, + "step": 9070 + }, + { + "epoch": 4.288888888888889, + "grad_norm": 3.1531126499176025, + "learning_rate": 9.519589568699419e-07, + "loss": 0.3261, + "step": 9071 + }, + { + "epoch": 4.2893617021276595, + "grad_norm": 2.8852546215057373, + "learning_rate": 9.514691634209361e-07, + 
"loss": 0.3258, + "step": 9072 + }, + { + "epoch": 4.28983451536643, + "grad_norm": 3.0486297607421875, + "learning_rate": 9.50979466392156e-07, + "loss": 0.3207, + "step": 9073 + }, + { + "epoch": 4.290307328605201, + "grad_norm": 3.017788887023926, + "learning_rate": 9.504898658140924e-07, + "loss": 0.3288, + "step": 9074 + }, + { + "epoch": 4.290780141843972, + "grad_norm": 3.24040150642395, + "learning_rate": 9.500003617172302e-07, + "loss": 0.2847, + "step": 9075 + }, + { + "epoch": 4.291252955082742, + "grad_norm": 3.6793692111968994, + "learning_rate": 9.49510954132049e-07, + "loss": 0.425, + "step": 9076 + }, + { + "epoch": 4.291725768321513, + "grad_norm": 2.7292215824127197, + "learning_rate": 9.490216430890215e-07, + "loss": 0.3208, + "step": 9077 + }, + { + "epoch": 4.292198581560283, + "grad_norm": 2.650388479232788, + "learning_rate": 9.485324286186159e-07, + "loss": 0.2842, + "step": 9078 + }, + { + "epoch": 4.292671394799054, + "grad_norm": 3.1459171772003174, + "learning_rate": 9.480433107512932e-07, + "loss": 0.3287, + "step": 9079 + }, + { + "epoch": 4.293144208037825, + "grad_norm": 3.1777186393737793, + "learning_rate": 9.475542895175074e-07, + "loss": 0.3385, + "step": 9080 + }, + { + "epoch": 4.293617021276596, + "grad_norm": 3.5608465671539307, + "learning_rate": 9.470653649477096e-07, + "loss": 0.3574, + "step": 9081 + }, + { + "epoch": 4.294089834515367, + "grad_norm": 2.58306884765625, + "learning_rate": 9.465765370723415e-07, + "loss": 0.3156, + "step": 9082 + }, + { + "epoch": 4.294562647754137, + "grad_norm": 3.3265857696533203, + "learning_rate": 9.460878059218415e-07, + "loss": 0.3678, + "step": 9083 + }, + { + "epoch": 4.295035460992908, + "grad_norm": 3.259326696395874, + "learning_rate": 9.455991715266403e-07, + "loss": 0.3675, + "step": 9084 + }, + { + "epoch": 4.295508274231678, + "grad_norm": 3.430608034133911, + "learning_rate": 9.451106339171618e-07, + "loss": 0.3147, + "step": 9085 + }, + { + "epoch": 4.295981087470449, + "grad_norm": 3.2896342277526855, + "learning_rate": 9.44622193123827e-07, + "loss": 0.3482, + "step": 9086 + }, + { + "epoch": 4.29645390070922, + "grad_norm": 2.9680557250976562, + "learning_rate": 9.441338491770474e-07, + "loss": 0.3504, + "step": 9087 + }, + { + "epoch": 4.2969267139479905, + "grad_norm": 2.9656941890716553, + "learning_rate": 9.436456021072313e-07, + "loss": 0.3782, + "step": 9088 + }, + { + "epoch": 4.297399527186761, + "grad_norm": 3.463456630706787, + "learning_rate": 9.431574519447794e-07, + "loss": 0.3517, + "step": 9089 + }, + { + "epoch": 4.297872340425532, + "grad_norm": 3.3658525943756104, + "learning_rate": 9.426693987200864e-07, + "loss": 0.3535, + "step": 9090 + }, + { + "epoch": 4.298345153664303, + "grad_norm": 3.087533712387085, + "learning_rate": 9.421814424635414e-07, + "loss": 0.3007, + "step": 9091 + }, + { + "epoch": 4.298817966903073, + "grad_norm": 3.4596481323242188, + "learning_rate": 9.41693583205526e-07, + "loss": 0.3797, + "step": 9092 + }, + { + "epoch": 4.299290780141844, + "grad_norm": 3.647507667541504, + "learning_rate": 9.412058209764191e-07, + "loss": 0.3803, + "step": 9093 + }, + { + "epoch": 4.299763593380614, + "grad_norm": 2.9130196571350098, + "learning_rate": 9.407181558065909e-07, + "loss": 0.32, + "step": 9094 + }, + { + "epoch": 4.300236406619385, + "grad_norm": 3.2562668323516846, + "learning_rate": 9.402305877264048e-07, + "loss": 0.4103, + "step": 9095 + }, + { + "epoch": 4.300709219858156, + "grad_norm": 3.1416616439819336, + "learning_rate": 9.397431167662216e-07, 
+ "loss": 0.3498, + "step": 9096 + }, + { + "epoch": 4.301182033096927, + "grad_norm": 2.9540042877197266, + "learning_rate": 9.392557429563929e-07, + "loss": 0.3073, + "step": 9097 + }, + { + "epoch": 4.301654846335698, + "grad_norm": 3.0450825691223145, + "learning_rate": 9.387684663272645e-07, + "loss": 0.3295, + "step": 9098 + }, + { + "epoch": 4.302127659574468, + "grad_norm": 3.1060359477996826, + "learning_rate": 9.38281286909179e-07, + "loss": 0.3582, + "step": 9099 + }, + { + "epoch": 4.302600472813239, + "grad_norm": 2.901136636734009, + "learning_rate": 9.377942047324687e-07, + "loss": 0.3162, + "step": 9100 + }, + { + "epoch": 4.303073286052009, + "grad_norm": 3.5618929862976074, + "learning_rate": 9.373072198274641e-07, + "loss": 0.2917, + "step": 9101 + }, + { + "epoch": 4.30354609929078, + "grad_norm": 3.0853395462036133, + "learning_rate": 9.368203322244871e-07, + "loss": 0.3124, + "step": 9102 + }, + { + "epoch": 4.304018912529551, + "grad_norm": 3.00398588180542, + "learning_rate": 9.363335419538524e-07, + "loss": 0.3167, + "step": 9103 + }, + { + "epoch": 4.3044917257683215, + "grad_norm": 3.4705588817596436, + "learning_rate": 9.358468490458725e-07, + "loss": 0.3188, + "step": 9104 + }, + { + "epoch": 4.304964539007092, + "grad_norm": 2.948302745819092, + "learning_rate": 9.353602535308509e-07, + "loss": 0.2739, + "step": 9105 + }, + { + "epoch": 4.305437352245863, + "grad_norm": 3.4512269496917725, + "learning_rate": 9.348737554390852e-07, + "loss": 0.3256, + "step": 9106 + }, + { + "epoch": 4.305910165484634, + "grad_norm": 2.9979147911071777, + "learning_rate": 9.343873548008684e-07, + "loss": 0.3184, + "step": 9107 + }, + { + "epoch": 4.306382978723404, + "grad_norm": 3.1008479595184326, + "learning_rate": 9.339010516464847e-07, + "loss": 0.3251, + "step": 9108 + }, + { + "epoch": 4.306855791962175, + "grad_norm": 2.86930775642395, + "learning_rate": 9.334148460062165e-07, + "loss": 0.3322, + "step": 9109 + }, + { + "epoch": 4.307328605200945, + "grad_norm": 3.2068963050842285, + "learning_rate": 9.329287379103355e-07, + "loss": 0.3845, + "step": 9110 + }, + { + "epoch": 4.307801418439716, + "grad_norm": 3.567309856414795, + "learning_rate": 9.324427273891115e-07, + "loss": 0.4037, + "step": 9111 + }, + { + "epoch": 4.308274231678487, + "grad_norm": 3.2064783573150635, + "learning_rate": 9.319568144728056e-07, + "loss": 0.3481, + "step": 9112 + }, + { + "epoch": 4.308747044917258, + "grad_norm": 3.2492294311523438, + "learning_rate": 9.314709991916721e-07, + "loss": 0.3657, + "step": 9113 + }, + { + "epoch": 4.309219858156029, + "grad_norm": 2.990755081176758, + "learning_rate": 9.309852815759626e-07, + "loss": 0.3582, + "step": 9114 + }, + { + "epoch": 4.309692671394799, + "grad_norm": 3.3375513553619385, + "learning_rate": 9.304996616559187e-07, + "loss": 0.3657, + "step": 9115 + }, + { + "epoch": 4.31016548463357, + "grad_norm": 2.945552349090576, + "learning_rate": 9.300141394617798e-07, + "loss": 0.3075, + "step": 9116 + }, + { + "epoch": 4.31063829787234, + "grad_norm": 3.5318517684936523, + "learning_rate": 9.295287150237764e-07, + "loss": 0.39, + "step": 9117 + }, + { + "epoch": 4.311111111111111, + "grad_norm": 3.452049732208252, + "learning_rate": 9.290433883721326e-07, + "loss": 0.3821, + "step": 9118 + }, + { + "epoch": 4.311583924349882, + "grad_norm": 3.0762388706207275, + "learning_rate": 9.285581595370693e-07, + "loss": 0.316, + "step": 9119 + }, + { + "epoch": 4.3120567375886525, + "grad_norm": 4.13551664352417, + "learning_rate": 
9.28073028548799e-07, + "loss": 0.3661, + "step": 9120 + }, + { + "epoch": 4.312529550827423, + "grad_norm": 3.1915719509124756, + "learning_rate": 9.275879954375286e-07, + "loss": 0.371, + "step": 9121 + }, + { + "epoch": 4.313002364066194, + "grad_norm": 3.118861198425293, + "learning_rate": 9.271030602334577e-07, + "loss": 0.3943, + "step": 9122 + }, + { + "epoch": 4.313475177304965, + "grad_norm": 3.042757987976074, + "learning_rate": 9.266182229667836e-07, + "loss": 0.3779, + "step": 9123 + }, + { + "epoch": 4.313947990543735, + "grad_norm": 2.949110746383667, + "learning_rate": 9.261334836676933e-07, + "loss": 0.3721, + "step": 9124 + }, + { + "epoch": 4.314420803782506, + "grad_norm": 2.982090950012207, + "learning_rate": 9.256488423663701e-07, + "loss": 0.2865, + "step": 9125 + }, + { + "epoch": 4.314893617021276, + "grad_norm": 3.6527535915374756, + "learning_rate": 9.25164299092989e-07, + "loss": 0.3689, + "step": 9126 + }, + { + "epoch": 4.315366430260047, + "grad_norm": 3.3310744762420654, + "learning_rate": 9.246798538777227e-07, + "loss": 0.3198, + "step": 9127 + }, + { + "epoch": 4.315839243498818, + "grad_norm": 2.8298583030700684, + "learning_rate": 9.241955067507332e-07, + "loss": 0.3711, + "step": 9128 + }, + { + "epoch": 4.316312056737589, + "grad_norm": 3.636894702911377, + "learning_rate": 9.237112577421809e-07, + "loss": 0.374, + "step": 9129 + }, + { + "epoch": 4.31678486997636, + "grad_norm": 2.896251678466797, + "learning_rate": 9.232271068822166e-07, + "loss": 0.3372, + "step": 9130 + }, + { + "epoch": 4.31725768321513, + "grad_norm": 3.2836971282958984, + "learning_rate": 9.227430542009854e-07, + "loss": 0.3584, + "step": 9131 + }, + { + "epoch": 4.317730496453901, + "grad_norm": 2.9452571868896484, + "learning_rate": 9.222590997286293e-07, + "loss": 0.3658, + "step": 9132 + }, + { + "epoch": 4.318203309692671, + "grad_norm": 2.88613224029541, + "learning_rate": 9.217752434952801e-07, + "loss": 0.3221, + "step": 9133 + }, + { + "epoch": 4.318676122931442, + "grad_norm": 2.7794570922851562, + "learning_rate": 9.212914855310667e-07, + "loss": 0.3142, + "step": 9134 + }, + { + "epoch": 4.319148936170213, + "grad_norm": 3.0195112228393555, + "learning_rate": 9.208078258661102e-07, + "loss": 0.3039, + "step": 9135 + }, + { + "epoch": 4.3196217494089835, + "grad_norm": 3.5178396701812744, + "learning_rate": 9.203242645305253e-07, + "loss": 0.3912, + "step": 9136 + }, + { + "epoch": 4.320094562647754, + "grad_norm": 3.145413875579834, + "learning_rate": 9.198408015544222e-07, + "loss": 0.3045, + "step": 9137 + }, + { + "epoch": 4.320567375886525, + "grad_norm": 3.151193380355835, + "learning_rate": 9.193574369679037e-07, + "loss": 0.341, + "step": 9138 + }, + { + "epoch": 4.321040189125296, + "grad_norm": 3.248255968093872, + "learning_rate": 9.188741708010668e-07, + "loss": 0.4344, + "step": 9139 + }, + { + "epoch": 4.321513002364066, + "grad_norm": 2.953218460083008, + "learning_rate": 9.183910030840021e-07, + "loss": 0.343, + "step": 9140 + }, + { + "epoch": 4.321985815602837, + "grad_norm": 3.1873161792755127, + "learning_rate": 9.179079338467936e-07, + "loss": 0.3082, + "step": 9141 + }, + { + "epoch": 4.322458628841607, + "grad_norm": 3.2587013244628906, + "learning_rate": 9.174249631195218e-07, + "loss": 0.3855, + "step": 9142 + }, + { + "epoch": 4.322931442080378, + "grad_norm": 2.956145763397217, + "learning_rate": 9.169420909322573e-07, + "loss": 0.3156, + "step": 9143 + }, + { + "epoch": 4.323404255319149, + "grad_norm": 3.1664650440216064, + 
"learning_rate": 9.164593173150683e-07, + "loss": 0.352, + "step": 9144 + }, + { + "epoch": 4.32387706855792, + "grad_norm": 3.2792744636535645, + "learning_rate": 9.159766422980138e-07, + "loss": 0.3963, + "step": 9145 + }, + { + "epoch": 4.3243498817966906, + "grad_norm": 3.1249687671661377, + "learning_rate": 9.154940659111472e-07, + "loss": 0.3405, + "step": 9146 + }, + { + "epoch": 4.324822695035461, + "grad_norm": 3.128340244293213, + "learning_rate": 9.150115881845181e-07, + "loss": 0.2733, + "step": 9147 + }, + { + "epoch": 4.325295508274232, + "grad_norm": 3.1790847778320312, + "learning_rate": 9.145292091481675e-07, + "loss": 0.3676, + "step": 9148 + }, + { + "epoch": 4.325768321513002, + "grad_norm": 2.874678134918213, + "learning_rate": 9.1404692883213e-07, + "loss": 0.2937, + "step": 9149 + }, + { + "epoch": 4.326241134751773, + "grad_norm": 3.102196216583252, + "learning_rate": 9.135647472664369e-07, + "loss": 0.2772, + "step": 9150 + }, + { + "epoch": 4.326713947990544, + "grad_norm": 3.0027546882629395, + "learning_rate": 9.130826644811099e-07, + "loss": 0.3171, + "step": 9151 + }, + { + "epoch": 4.3271867612293144, + "grad_norm": 2.750152587890625, + "learning_rate": 9.126006805061679e-07, + "loss": 0.3689, + "step": 9152 + }, + { + "epoch": 4.327659574468085, + "grad_norm": 3.251054525375366, + "learning_rate": 9.12118795371621e-07, + "loss": 0.3463, + "step": 9153 + }, + { + "epoch": 4.328132387706856, + "grad_norm": 2.8849353790283203, + "learning_rate": 9.116370091074738e-07, + "loss": 0.3, + "step": 9154 + }, + { + "epoch": 4.328605200945627, + "grad_norm": 3.4823720455169678, + "learning_rate": 9.111553217437255e-07, + "loss": 0.336, + "step": 9155 + }, + { + "epoch": 4.329078014184397, + "grad_norm": 2.8170886039733887, + "learning_rate": 9.106737333103677e-07, + "loss": 0.3237, + "step": 9156 + }, + { + "epoch": 4.3295508274231675, + "grad_norm": 3.095379114151001, + "learning_rate": 9.101922438373881e-07, + "loss": 0.3438, + "step": 9157 + }, + { + "epoch": 4.330023640661938, + "grad_norm": 3.1764986515045166, + "learning_rate": 9.097108533547667e-07, + "loss": 0.3174, + "step": 9158 + }, + { + "epoch": 4.330496453900709, + "grad_norm": 3.3972036838531494, + "learning_rate": 9.092295618924763e-07, + "loss": 0.3118, + "step": 9159 + }, + { + "epoch": 4.33096926713948, + "grad_norm": 3.112926959991455, + "learning_rate": 9.087483694804863e-07, + "loss": 0.3521, + "step": 9160 + }, + { + "epoch": 4.331442080378251, + "grad_norm": 3.395550012588501, + "learning_rate": 9.082672761487573e-07, + "loss": 0.3423, + "step": 9161 + }, + { + "epoch": 4.3319148936170215, + "grad_norm": 3.486910343170166, + "learning_rate": 9.077862819272465e-07, + "loss": 0.3655, + "step": 9162 + }, + { + "epoch": 4.332387706855792, + "grad_norm": 3.0986499786376953, + "learning_rate": 9.07305386845902e-07, + "loss": 0.2865, + "step": 9163 + }, + { + "epoch": 4.332860520094562, + "grad_norm": 2.962139844894409, + "learning_rate": 9.068245909346665e-07, + "loss": 0.315, + "step": 9164 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 3.0887413024902344, + "learning_rate": 9.063438942234787e-07, + "loss": 0.3207, + "step": 9165 + }, + { + "epoch": 4.333806146572104, + "grad_norm": 2.909770965576172, + "learning_rate": 9.058632967422678e-07, + "loss": 0.3221, + "step": 9166 + }, + { + "epoch": 4.334278959810875, + "grad_norm": 3.1872079372406006, + "learning_rate": 9.053827985209604e-07, + "loss": 0.3856, + "step": 9167 + }, + { + "epoch": 4.334751773049645, + "grad_norm": 3.196985960006714, 
+ "learning_rate": 9.049023995894738e-07, + "loss": 0.2994, + "step": 9168 + }, + { + "epoch": 4.335224586288416, + "grad_norm": 3.3150243759155273, + "learning_rate": 9.044220999777204e-07, + "loss": 0.3619, + "step": 9169 + }, + { + "epoch": 4.335697399527187, + "grad_norm": 3.1662707328796387, + "learning_rate": 9.039418997156066e-07, + "loss": 0.3347, + "step": 9170 + }, + { + "epoch": 4.336170212765958, + "grad_norm": 3.6789329051971436, + "learning_rate": 9.034617988330318e-07, + "loss": 0.4106, + "step": 9171 + }, + { + "epoch": 4.336643026004728, + "grad_norm": 3.5192553997039795, + "learning_rate": 9.029817973598898e-07, + "loss": 0.3561, + "step": 9172 + }, + { + "epoch": 4.3371158392434985, + "grad_norm": 3.4309239387512207, + "learning_rate": 9.025018953260692e-07, + "loss": 0.3739, + "step": 9173 + }, + { + "epoch": 4.337588652482269, + "grad_norm": 3.1840806007385254, + "learning_rate": 9.020220927614498e-07, + "loss": 0.3479, + "step": 9174 + }, + { + "epoch": 4.33806146572104, + "grad_norm": 3.6679139137268066, + "learning_rate": 9.015423896959088e-07, + "loss": 0.3739, + "step": 9175 + }, + { + "epoch": 4.338534278959811, + "grad_norm": 3.125296115875244, + "learning_rate": 9.010627861593143e-07, + "loss": 0.3046, + "step": 9176 + }, + { + "epoch": 4.339007092198582, + "grad_norm": 3.0710368156433105, + "learning_rate": 9.005832821815278e-07, + "loss": 0.322, + "step": 9177 + }, + { + "epoch": 4.3394799054373525, + "grad_norm": 3.068833351135254, + "learning_rate": 9.001038777924082e-07, + "loss": 0.3229, + "step": 9178 + }, + { + "epoch": 4.339952718676123, + "grad_norm": 3.2746002674102783, + "learning_rate": 8.996245730218037e-07, + "loss": 0.369, + "step": 9179 + }, + { + "epoch": 4.340425531914893, + "grad_norm": 3.1138477325439453, + "learning_rate": 8.991453678995607e-07, + "loss": 0.3456, + "step": 9180 + }, + { + "epoch": 4.340898345153664, + "grad_norm": 3.2195467948913574, + "learning_rate": 8.986662624555159e-07, + "loss": 0.377, + "step": 9181 + }, + { + "epoch": 4.341371158392435, + "grad_norm": 3.1197304725646973, + "learning_rate": 8.981872567195008e-07, + "loss": 0.3007, + "step": 9182 + }, + { + "epoch": 4.341843971631206, + "grad_norm": 3.295881748199463, + "learning_rate": 8.977083507213418e-07, + "loss": 0.4007, + "step": 9183 + }, + { + "epoch": 4.342316784869976, + "grad_norm": 3.8013954162597656, + "learning_rate": 8.972295444908582e-07, + "loss": 0.4322, + "step": 9184 + }, + { + "epoch": 4.342789598108747, + "grad_norm": 3.133434295654297, + "learning_rate": 8.967508380578633e-07, + "loss": 0.3379, + "step": 9185 + }, + { + "epoch": 4.343262411347518, + "grad_norm": 3.0942039489746094, + "learning_rate": 8.962722314521625e-07, + "loss": 0.3753, + "step": 9186 + }, + { + "epoch": 4.343735224586289, + "grad_norm": 2.8691020011901855, + "learning_rate": 8.957937247035583e-07, + "loss": 0.3003, + "step": 9187 + }, + { + "epoch": 4.344208037825059, + "grad_norm": 2.8353092670440674, + "learning_rate": 8.95315317841845e-07, + "loss": 0.3068, + "step": 9188 + }, + { + "epoch": 4.3446808510638295, + "grad_norm": 3.071207046508789, + "learning_rate": 8.948370108968097e-07, + "loss": 0.3147, + "step": 9189 + }, + { + "epoch": 4.3451536643026, + "grad_norm": 3.3605904579162598, + "learning_rate": 8.943588038982359e-07, + "loss": 0.3216, + "step": 9190 + }, + { + "epoch": 4.345626477541371, + "grad_norm": 3.0702717304229736, + "learning_rate": 8.93880696875899e-07, + "loss": 0.3507, + "step": 9191 + }, + { + "epoch": 4.346099290780142, + "grad_norm": 
3.261456251144409, + "learning_rate": 8.934026898595675e-07, + "loss": 0.3677, + "step": 9192 + }, + { + "epoch": 4.346572104018913, + "grad_norm": 3.0827512741088867, + "learning_rate": 8.929247828790066e-07, + "loss": 0.3786, + "step": 9193 + }, + { + "epoch": 4.3470449172576835, + "grad_norm": 3.488949775695801, + "learning_rate": 8.924469759639728e-07, + "loss": 0.3685, + "step": 9194 + }, + { + "epoch": 4.347517730496454, + "grad_norm": 2.8565423488616943, + "learning_rate": 8.919692691442162e-07, + "loss": 0.3449, + "step": 9195 + }, + { + "epoch": 4.347990543735224, + "grad_norm": 4.654722213745117, + "learning_rate": 8.914916624494829e-07, + "loss": 0.344, + "step": 9196 + }, + { + "epoch": 4.348463356973995, + "grad_norm": 3.256714344024658, + "learning_rate": 8.910141559095098e-07, + "loss": 0.3487, + "step": 9197 + }, + { + "epoch": 4.348936170212766, + "grad_norm": 3.0921413898468018, + "learning_rate": 8.90536749554031e-07, + "loss": 0.3171, + "step": 9198 + }, + { + "epoch": 4.349408983451537, + "grad_norm": 3.1129112243652344, + "learning_rate": 8.900594434127712e-07, + "loss": 0.3501, + "step": 9199 + }, + { + "epoch": 4.349881796690307, + "grad_norm": 3.077688217163086, + "learning_rate": 8.8958223751545e-07, + "loss": 0.2781, + "step": 9200 + }, + { + "epoch": 4.350354609929078, + "grad_norm": 3.2839295864105225, + "learning_rate": 8.891051318917821e-07, + "loss": 0.3699, + "step": 9201 + }, + { + "epoch": 4.350827423167849, + "grad_norm": 3.0370850563049316, + "learning_rate": 8.886281265714741e-07, + "loss": 0.3344, + "step": 9202 + }, + { + "epoch": 4.35130023640662, + "grad_norm": 3.439702033996582, + "learning_rate": 8.88151221584227e-07, + "loss": 0.3865, + "step": 9203 + }, + { + "epoch": 4.35177304964539, + "grad_norm": 3.133317470550537, + "learning_rate": 8.876744169597357e-07, + "loss": 0.3352, + "step": 9204 + }, + { + "epoch": 4.3522458628841605, + "grad_norm": 3.2529115676879883, + "learning_rate": 8.871977127276876e-07, + "loss": 0.3708, + "step": 9205 + }, + { + "epoch": 4.352718676122931, + "grad_norm": 3.2149887084960938, + "learning_rate": 8.867211089177669e-07, + "loss": 0.3298, + "step": 9206 + }, + { + "epoch": 4.353191489361702, + "grad_norm": 2.778116464614868, + "learning_rate": 8.86244605559648e-07, + "loss": 0.3319, + "step": 9207 + }, + { + "epoch": 4.353664302600473, + "grad_norm": 3.206336736679077, + "learning_rate": 8.85768202683002e-07, + "loss": 0.3793, + "step": 9208 + }, + { + "epoch": 4.354137115839244, + "grad_norm": 3.4236080646514893, + "learning_rate": 8.852919003174921e-07, + "loss": 0.3341, + "step": 9209 + }, + { + "epoch": 4.3546099290780145, + "grad_norm": 3.049886703491211, + "learning_rate": 8.848156984927742e-07, + "loss": 0.3153, + "step": 9210 + }, + { + "epoch": 4.355082742316785, + "grad_norm": 4.048248291015625, + "learning_rate": 8.843395972385013e-07, + "loss": 0.3857, + "step": 9211 + }, + { + "epoch": 4.355555555555555, + "grad_norm": 3.3379292488098145, + "learning_rate": 8.838635965843165e-07, + "loss": 0.3167, + "step": 9212 + }, + { + "epoch": 4.356028368794326, + "grad_norm": 2.963364839553833, + "learning_rate": 8.833876965598598e-07, + "loss": 0.3427, + "step": 9213 + }, + { + "epoch": 4.356501182033097, + "grad_norm": 3.1309237480163574, + "learning_rate": 8.829118971947625e-07, + "loss": 0.3694, + "step": 9214 + }, + { + "epoch": 4.356973995271868, + "grad_norm": 3.4728028774261475, + "learning_rate": 8.824361985186497e-07, + "loss": 0.3769, + "step": 9215 + }, + { + "epoch": 4.357446808510638, + 
"grad_norm": 2.7183408737182617, + "learning_rate": 8.819606005611431e-07, + "loss": 0.3053, + "step": 9216 + }, + { + "epoch": 4.357919621749409, + "grad_norm": 3.34867262840271, + "learning_rate": 8.814851033518549e-07, + "loss": 0.3316, + "step": 9217 + }, + { + "epoch": 4.35839243498818, + "grad_norm": 3.288097858428955, + "learning_rate": 8.810097069203924e-07, + "loss": 0.362, + "step": 9218 + }, + { + "epoch": 4.358865248226951, + "grad_norm": 3.2768566608428955, + "learning_rate": 8.805344112963563e-07, + "loss": 0.3762, + "step": 9219 + }, + { + "epoch": 4.359338061465721, + "grad_norm": 2.730982542037964, + "learning_rate": 8.800592165093405e-07, + "loss": 0.3184, + "step": 9220 + }, + { + "epoch": 4.3598108747044915, + "grad_norm": 3.2347333431243896, + "learning_rate": 8.795841225889348e-07, + "loss": 0.3414, + "step": 9221 + }, + { + "epoch": 4.360283687943262, + "grad_norm": 2.8792049884796143, + "learning_rate": 8.791091295647208e-07, + "loss": 0.3312, + "step": 9222 + }, + { + "epoch": 4.360756501182033, + "grad_norm": 3.2037971019744873, + "learning_rate": 8.786342374662726e-07, + "loss": 0.3772, + "step": 9223 + }, + { + "epoch": 4.361229314420804, + "grad_norm": 3.765244245529175, + "learning_rate": 8.781594463231621e-07, + "loss": 0.3724, + "step": 9224 + }, + { + "epoch": 4.361702127659575, + "grad_norm": 3.085339069366455, + "learning_rate": 8.776847561649504e-07, + "loss": 0.3468, + "step": 9225 + }, + { + "epoch": 4.3621749408983455, + "grad_norm": 2.8031229972839355, + "learning_rate": 8.772101670211963e-07, + "loss": 0.3219, + "step": 9226 + }, + { + "epoch": 4.362647754137116, + "grad_norm": 2.667694091796875, + "learning_rate": 8.76735678921449e-07, + "loss": 0.3381, + "step": 9227 + }, + { + "epoch": 4.363120567375886, + "grad_norm": 2.898273229598999, + "learning_rate": 8.762612918952526e-07, + "loss": 0.3526, + "step": 9228 + }, + { + "epoch": 4.363593380614657, + "grad_norm": 3.1458849906921387, + "learning_rate": 8.757870059721465e-07, + "loss": 0.3516, + "step": 9229 + }, + { + "epoch": 4.364066193853428, + "grad_norm": 3.1719279289245605, + "learning_rate": 8.753128211816609e-07, + "loss": 0.328, + "step": 9230 + }, + { + "epoch": 4.3645390070921986, + "grad_norm": 3.0799217224121094, + "learning_rate": 8.748387375533224e-07, + "loss": 0.2802, + "step": 9231 + }, + { + "epoch": 4.365011820330969, + "grad_norm": 3.1218812465667725, + "learning_rate": 8.743647551166498e-07, + "loss": 0.3264, + "step": 9232 + }, + { + "epoch": 4.36548463356974, + "grad_norm": 3.231175184249878, + "learning_rate": 8.738908739011556e-07, + "loss": 0.3192, + "step": 9233 + }, + { + "epoch": 4.365957446808511, + "grad_norm": 3.088284730911255, + "learning_rate": 8.734170939363465e-07, + "loss": 0.3569, + "step": 9234 + }, + { + "epoch": 4.366430260047281, + "grad_norm": 3.2510828971862793, + "learning_rate": 8.729434152517217e-07, + "loss": 0.3977, + "step": 9235 + }, + { + "epoch": 4.366903073286052, + "grad_norm": 3.435762405395508, + "learning_rate": 8.724698378767768e-07, + "loss": 0.3201, + "step": 9236 + }, + { + "epoch": 4.3673758865248224, + "grad_norm": 3.6876676082611084, + "learning_rate": 8.719963618409985e-07, + "loss": 0.381, + "step": 9237 + }, + { + "epoch": 4.367848699763593, + "grad_norm": 2.7620339393615723, + "learning_rate": 8.715229871738676e-07, + "loss": 0.2939, + "step": 9238 + }, + { + "epoch": 4.368321513002364, + "grad_norm": 3.412893056869507, + "learning_rate": 8.710497139048604e-07, + "loss": 0.3592, + "step": 9239 + }, + { + "epoch": 
4.368794326241135, + "grad_norm": 3.2498574256896973, + "learning_rate": 8.705765420634446e-07, + "loss": 0.4054, + "step": 9240 + }, + { + "epoch": 4.369267139479906, + "grad_norm": 3.138425827026367, + "learning_rate": 8.701034716790821e-07, + "loss": 0.3609, + "step": 9241 + }, + { + "epoch": 4.369739952718676, + "grad_norm": 2.7645158767700195, + "learning_rate": 8.696305027812301e-07, + "loss": 0.3085, + "step": 9242 + }, + { + "epoch": 4.370212765957447, + "grad_norm": 3.5948917865753174, + "learning_rate": 8.691576353993372e-07, + "loss": 0.3846, + "step": 9243 + }, + { + "epoch": 4.370685579196217, + "grad_norm": 3.2185158729553223, + "learning_rate": 8.68684869562848e-07, + "loss": 0.3516, + "step": 9244 + }, + { + "epoch": 4.371158392434988, + "grad_norm": 3.057281494140625, + "learning_rate": 8.68212205301199e-07, + "loss": 0.3197, + "step": 9245 + }, + { + "epoch": 4.371631205673759, + "grad_norm": 2.9788076877593994, + "learning_rate": 8.677396426438198e-07, + "loss": 0.3283, + "step": 9246 + }, + { + "epoch": 4.3721040189125295, + "grad_norm": 2.9246625900268555, + "learning_rate": 8.672671816201366e-07, + "loss": 0.3482, + "step": 9247 + }, + { + "epoch": 4.3725768321513, + "grad_norm": 2.9994964599609375, + "learning_rate": 8.667948222595671e-07, + "loss": 0.3802, + "step": 9248 + }, + { + "epoch": 4.373049645390071, + "grad_norm": 2.692626476287842, + "learning_rate": 8.663225645915222e-07, + "loss": 0.3045, + "step": 9249 + }, + { + "epoch": 4.373522458628842, + "grad_norm": 2.794236660003662, + "learning_rate": 8.658504086454078e-07, + "loss": 0.3056, + "step": 9250 + }, + { + "epoch": 4.373995271867612, + "grad_norm": 3.020534038543701, + "learning_rate": 8.653783544506222e-07, + "loss": 0.3341, + "step": 9251 + }, + { + "epoch": 4.374468085106383, + "grad_norm": 3.2142958641052246, + "learning_rate": 8.649064020365596e-07, + "loss": 0.3435, + "step": 9252 + }, + { + "epoch": 4.374940898345153, + "grad_norm": 3.3818624019622803, + "learning_rate": 8.644345514326049e-07, + "loss": 0.3744, + "step": 9253 + }, + { + "epoch": 4.375413711583924, + "grad_norm": 3.1566405296325684, + "learning_rate": 8.639628026681399e-07, + "loss": 0.3568, + "step": 9254 + }, + { + "epoch": 4.375886524822695, + "grad_norm": 3.3773083686828613, + "learning_rate": 8.63491155772537e-07, + "loss": 0.3523, + "step": 9255 + }, + { + "epoch": 4.376359338061466, + "grad_norm": 3.0850939750671387, + "learning_rate": 8.630196107751634e-07, + "loss": 0.3356, + "step": 9256 + }, + { + "epoch": 4.376832151300237, + "grad_norm": 3.361496686935425, + "learning_rate": 8.625481677053815e-07, + "loss": 0.3619, + "step": 9257 + }, + { + "epoch": 4.377304964539007, + "grad_norm": 3.026015043258667, + "learning_rate": 8.620768265925444e-07, + "loss": 0.3476, + "step": 9258 + }, + { + "epoch": 4.377777777777778, + "grad_norm": 3.142747640609741, + "learning_rate": 8.61605587466002e-07, + "loss": 0.3391, + "step": 9259 + }, + { + "epoch": 4.378250591016548, + "grad_norm": 3.0910356044769287, + "learning_rate": 8.611344503550956e-07, + "loss": 0.3201, + "step": 9260 + }, + { + "epoch": 4.378723404255319, + "grad_norm": 3.4462292194366455, + "learning_rate": 8.606634152891599e-07, + "loss": 0.4075, + "step": 9261 + }, + { + "epoch": 4.37919621749409, + "grad_norm": 2.984248638153076, + "learning_rate": 8.601924822975258e-07, + "loss": 0.3415, + "step": 9262 + }, + { + "epoch": 4.3796690307328605, + "grad_norm": 2.944971799850464, + "learning_rate": 8.597216514095155e-07, + "loss": 0.3163, + "step": 9263 + }, + { + 
"epoch": 4.380141843971631, + "grad_norm": 3.1562247276306152, + "learning_rate": 8.592509226544457e-07, + "loss": 0.3093, + "step": 9264 + }, + { + "epoch": 4.380614657210402, + "grad_norm": 2.911339282989502, + "learning_rate": 8.587802960616254e-07, + "loss": 0.3287, + "step": 9265 + }, + { + "epoch": 4.381087470449173, + "grad_norm": 3.5560295581817627, + "learning_rate": 8.583097716603605e-07, + "loss": 0.3763, + "step": 9266 + }, + { + "epoch": 4.381560283687943, + "grad_norm": 3.35855770111084, + "learning_rate": 8.578393494799478e-07, + "loss": 0.3703, + "step": 9267 + }, + { + "epoch": 4.382033096926714, + "grad_norm": 3.0229954719543457, + "learning_rate": 8.573690295496778e-07, + "loss": 0.3421, + "step": 9268 + }, + { + "epoch": 4.382505910165484, + "grad_norm": 3.0842833518981934, + "learning_rate": 8.568988118988348e-07, + "loss": 0.3473, + "step": 9269 + }, + { + "epoch": 4.382978723404255, + "grad_norm": 3.2471694946289062, + "learning_rate": 8.564286965566989e-07, + "loss": 0.3025, + "step": 9270 + }, + { + "epoch": 4.383451536643026, + "grad_norm": 3.4435837268829346, + "learning_rate": 8.559586835525404e-07, + "loss": 0.394, + "step": 9271 + }, + { + "epoch": 4.383924349881797, + "grad_norm": 3.4572243690490723, + "learning_rate": 8.554887729156267e-07, + "loss": 0.3745, + "step": 9272 + }, + { + "epoch": 4.384397163120568, + "grad_norm": 3.3646514415740967, + "learning_rate": 8.550189646752161e-07, + "loss": 0.308, + "step": 9273 + }, + { + "epoch": 4.384869976359338, + "grad_norm": 2.794933319091797, + "learning_rate": 8.545492588605606e-07, + "loss": 0.3039, + "step": 9274 + }, + { + "epoch": 4.385342789598109, + "grad_norm": 2.969306707382202, + "learning_rate": 8.540796555009084e-07, + "loss": 0.3815, + "step": 9275 + }, + { + "epoch": 4.385815602836879, + "grad_norm": 2.9203877449035645, + "learning_rate": 8.536101546254982e-07, + "loss": 0.3143, + "step": 9276 + }, + { + "epoch": 4.38628841607565, + "grad_norm": 3.451172113418579, + "learning_rate": 8.531407562635655e-07, + "loss": 0.3673, + "step": 9277 + }, + { + "epoch": 4.386761229314421, + "grad_norm": 3.1196818351745605, + "learning_rate": 8.526714604443365e-07, + "loss": 0.3449, + "step": 9278 + }, + { + "epoch": 4.3872340425531915, + "grad_norm": 3.0087406635284424, + "learning_rate": 8.522022671970312e-07, + "loss": 0.2898, + "step": 9279 + }, + { + "epoch": 4.387706855791962, + "grad_norm": 2.885667085647583, + "learning_rate": 8.517331765508666e-07, + "loss": 0.3119, + "step": 9280 + }, + { + "epoch": 4.388179669030733, + "grad_norm": 3.115769624710083, + "learning_rate": 8.512641885350494e-07, + "loss": 0.3662, + "step": 9281 + }, + { + "epoch": 4.388652482269504, + "grad_norm": 2.935692071914673, + "learning_rate": 8.507953031787818e-07, + "loss": 0.2957, + "step": 9282 + }, + { + "epoch": 4.389125295508274, + "grad_norm": 2.96824312210083, + "learning_rate": 8.503265205112593e-07, + "loss": 0.301, + "step": 9283 + }, + { + "epoch": 4.389598108747045, + "grad_norm": 2.8329155445098877, + "learning_rate": 8.498578405616697e-07, + "loss": 0.3289, + "step": 9284 + }, + { + "epoch": 4.390070921985815, + "grad_norm": 3.3063509464263916, + "learning_rate": 8.493892633591976e-07, + "loss": 0.371, + "step": 9285 + }, + { + "epoch": 4.390543735224586, + "grad_norm": 3.036324977874756, + "learning_rate": 8.489207889330175e-07, + "loss": 0.3111, + "step": 9286 + }, + { + "epoch": 4.391016548463357, + "grad_norm": 3.221714496612549, + "learning_rate": 8.48452417312301e-07, + "loss": 0.3034, + "step": 9287 + }, + 
{ + "epoch": 4.391489361702128, + "grad_norm": 2.956813097000122, + "learning_rate": 8.479841485262108e-07, + "loss": 0.2826, + "step": 9288 + }, + { + "epoch": 4.391962174940899, + "grad_norm": 3.3818461894989014, + "learning_rate": 8.475159826039028e-07, + "loss": 0.3701, + "step": 9289 + }, + { + "epoch": 4.392434988179669, + "grad_norm": 3.1623525619506836, + "learning_rate": 8.470479195745293e-07, + "loss": 0.3405, + "step": 9290 + }, + { + "epoch": 4.39290780141844, + "grad_norm": 3.8068127632141113, + "learning_rate": 8.465799594672342e-07, + "loss": 0.3498, + "step": 9291 + }, + { + "epoch": 4.39338061465721, + "grad_norm": 3.042862892150879, + "learning_rate": 8.461121023111541e-07, + "loss": 0.3025, + "step": 9292 + }, + { + "epoch": 4.393853427895981, + "grad_norm": 3.0237231254577637, + "learning_rate": 8.456443481354221e-07, + "loss": 0.3351, + "step": 9293 + }, + { + "epoch": 4.394326241134752, + "grad_norm": 3.233386754989624, + "learning_rate": 8.451766969691614e-07, + "loss": 0.3314, + "step": 9294 + }, + { + "epoch": 4.3947990543735225, + "grad_norm": 2.922518014907837, + "learning_rate": 8.447091488414924e-07, + "loss": 0.2876, + "step": 9295 + }, + { + "epoch": 4.395271867612293, + "grad_norm": 3.2621119022369385, + "learning_rate": 8.442417037815268e-07, + "loss": 0.3731, + "step": 9296 + }, + { + "epoch": 4.395744680851064, + "grad_norm": 3.4238440990448, + "learning_rate": 8.437743618183697e-07, + "loss": 0.3563, + "step": 9297 + }, + { + "epoch": 4.396217494089835, + "grad_norm": 3.607088804244995, + "learning_rate": 8.43307122981121e-07, + "loss": 0.315, + "step": 9298 + }, + { + "epoch": 4.396690307328605, + "grad_norm": 3.0737040042877197, + "learning_rate": 8.428399872988724e-07, + "loss": 0.2699, + "step": 9299 + }, + { + "epoch": 4.397163120567376, + "grad_norm": 3.2364611625671387, + "learning_rate": 8.423729548007123e-07, + "loss": 0.3054, + "step": 9300 + }, + { + "epoch": 4.397635933806146, + "grad_norm": 3.505194664001465, + "learning_rate": 8.419060255157199e-07, + "loss": 0.3377, + "step": 9301 + }, + { + "epoch": 4.398108747044917, + "grad_norm": 3.337815523147583, + "learning_rate": 8.414391994729676e-07, + "loss": 0.3709, + "step": 9302 + }, + { + "epoch": 4.398581560283688, + "grad_norm": 3.310739040374756, + "learning_rate": 8.409724767015248e-07, + "loss": 0.3125, + "step": 9303 + }, + { + "epoch": 4.399054373522459, + "grad_norm": 2.9035723209381104, + "learning_rate": 8.405058572304506e-07, + "loss": 0.3294, + "step": 9304 + }, + { + "epoch": 4.39952718676123, + "grad_norm": 3.162543535232544, + "learning_rate": 8.400393410888008e-07, + "loss": 0.3563, + "step": 9305 + }, + { + "epoch": 4.4, + "grad_norm": 3.320204973220825, + "learning_rate": 8.395729283056222e-07, + "loss": 0.3681, + "step": 9306 + }, + { + "epoch": 4.400472813238771, + "grad_norm": 3.2953343391418457, + "learning_rate": 8.391066189099562e-07, + "loss": 0.3574, + "step": 9307 + }, + { + "epoch": 4.400945626477541, + "grad_norm": 3.2041780948638916, + "learning_rate": 8.386404129308387e-07, + "loss": 0.3787, + "step": 9308 + }, + { + "epoch": 4.401418439716312, + "grad_norm": 3.493856906890869, + "learning_rate": 8.381743103972973e-07, + "loss": 0.3678, + "step": 9309 + }, + { + "epoch": 4.401891252955083, + "grad_norm": 3.3114027976989746, + "learning_rate": 8.377083113383553e-07, + "loss": 0.3853, + "step": 9310 + }, + { + "epoch": 4.4023640661938535, + "grad_norm": 3.148033857345581, + "learning_rate": 8.372424157830281e-07, + "loss": 0.41, + "step": 9311 + }, + { + "epoch": 
4.402836879432624, + "grad_norm": 3.1810758113861084, + "learning_rate": 8.367766237603245e-07, + "loss": 0.3536, + "step": 9312 + }, + { + "epoch": 4.403309692671395, + "grad_norm": 3.110158920288086, + "learning_rate": 8.363109352992474e-07, + "loss": 0.3453, + "step": 9313 + }, + { + "epoch": 4.403782505910166, + "grad_norm": 3.140287399291992, + "learning_rate": 8.358453504287934e-07, + "loss": 0.3617, + "step": 9314 + }, + { + "epoch": 4.404255319148936, + "grad_norm": 3.0819156169891357, + "learning_rate": 8.353798691779516e-07, + "loss": 0.3033, + "step": 9315 + }, + { + "epoch": 4.4047281323877066, + "grad_norm": 3.167506217956543, + "learning_rate": 8.349144915757071e-07, + "loss": 0.325, + "step": 9316 + }, + { + "epoch": 4.405200945626477, + "grad_norm": 3.0423221588134766, + "learning_rate": 8.34449217651035e-07, + "loss": 0.3213, + "step": 9317 + }, + { + "epoch": 4.405673758865248, + "grad_norm": 3.4442083835601807, + "learning_rate": 8.339840474329078e-07, + "loss": 0.3493, + "step": 9318 + }, + { + "epoch": 4.406146572104019, + "grad_norm": 3.2931764125823975, + "learning_rate": 8.335189809502886e-07, + "loss": 0.3947, + "step": 9319 + }, + { + "epoch": 4.40661938534279, + "grad_norm": 3.217146396636963, + "learning_rate": 8.330540182321345e-07, + "loss": 0.3622, + "step": 9320 + }, + { + "epoch": 4.4070921985815605, + "grad_norm": 3.0024805068969727, + "learning_rate": 8.325891593073981e-07, + "loss": 0.3333, + "step": 9321 + }, + { + "epoch": 4.407565011820331, + "grad_norm": 2.9128856658935547, + "learning_rate": 8.321244042050225e-07, + "loss": 0.3069, + "step": 9322 + }, + { + "epoch": 4.408037825059101, + "grad_norm": 3.1456804275512695, + "learning_rate": 8.316597529539477e-07, + "loss": 0.3749, + "step": 9323 + }, + { + "epoch": 4.408510638297872, + "grad_norm": 3.093379497528076, + "learning_rate": 8.31195205583105e-07, + "loss": 0.3899, + "step": 9324 + }, + { + "epoch": 4.408983451536643, + "grad_norm": 2.95357608795166, + "learning_rate": 8.307307621214181e-07, + "loss": 0.3525, + "step": 9325 + }, + { + "epoch": 4.409456264775414, + "grad_norm": 3.0832929611206055, + "learning_rate": 8.30266422597808e-07, + "loss": 0.3976, + "step": 9326 + }, + { + "epoch": 4.409929078014184, + "grad_norm": 3.203678846359253, + "learning_rate": 8.298021870411862e-07, + "loss": 0.3954, + "step": 9327 + }, + { + "epoch": 4.410401891252955, + "grad_norm": 3.0880157947540283, + "learning_rate": 8.293380554804586e-07, + "loss": 0.3153, + "step": 9328 + }, + { + "epoch": 4.410874704491726, + "grad_norm": 2.9109299182891846, + "learning_rate": 8.28874027944524e-07, + "loss": 0.3526, + "step": 9329 + }, + { + "epoch": 4.411347517730497, + "grad_norm": 3.4241647720336914, + "learning_rate": 8.284101044622767e-07, + "loss": 0.3288, + "step": 9330 + }, + { + "epoch": 4.411820330969267, + "grad_norm": 3.110163450241089, + "learning_rate": 8.279462850626024e-07, + "loss": 0.314, + "step": 9331 + }, + { + "epoch": 4.4122931442080375, + "grad_norm": 3.024353504180908, + "learning_rate": 8.274825697743805e-07, + "loss": 0.3107, + "step": 9332 + }, + { + "epoch": 4.412765957446808, + "grad_norm": 2.8271758556365967, + "learning_rate": 8.270189586264859e-07, + "loss": 0.3339, + "step": 9333 + }, + { + "epoch": 4.413238770685579, + "grad_norm": 3.179032325744629, + "learning_rate": 8.265554516477853e-07, + "loss": 0.3365, + "step": 9334 + }, + { + "epoch": 4.41371158392435, + "grad_norm": 2.9746336936950684, + "learning_rate": 8.260920488671376e-07, + "loss": 0.3127, + "step": 9335 + }, + { + 
"epoch": 4.414184397163121, + "grad_norm": 3.247529983520508, + "learning_rate": 8.256287503133992e-07, + "loss": 0.3251, + "step": 9336 + }, + { + "epoch": 4.4146572104018915, + "grad_norm": 3.3317253589630127, + "learning_rate": 8.251655560154168e-07, + "loss": 0.3246, + "step": 9337 + }, + { + "epoch": 4.415130023640662, + "grad_norm": 2.9890010356903076, + "learning_rate": 8.247024660020303e-07, + "loss": 0.3317, + "step": 9338 + }, + { + "epoch": 4.415602836879432, + "grad_norm": 3.3956406116485596, + "learning_rate": 8.242394803020759e-07, + "loss": 0.4055, + "step": 9339 + }, + { + "epoch": 4.416075650118203, + "grad_norm": 2.9918906688690186, + "learning_rate": 8.237765989443805e-07, + "loss": 0.3415, + "step": 9340 + }, + { + "epoch": 4.416548463356974, + "grad_norm": 3.4310927391052246, + "learning_rate": 8.233138219577671e-07, + "loss": 0.3654, + "step": 9341 + }, + { + "epoch": 4.417021276595745, + "grad_norm": 3.207947254180908, + "learning_rate": 8.2285114937105e-07, + "loss": 0.3773, + "step": 9342 + }, + { + "epoch": 4.417494089834515, + "grad_norm": 3.202953338623047, + "learning_rate": 8.223885812130367e-07, + "loss": 0.3476, + "step": 9343 + }, + { + "epoch": 4.417966903073286, + "grad_norm": 3.160951614379883, + "learning_rate": 8.219261175125315e-07, + "loss": 0.3583, + "step": 9344 + }, + { + "epoch": 4.418439716312057, + "grad_norm": 2.892636775970459, + "learning_rate": 8.214637582983284e-07, + "loss": 0.3241, + "step": 9345 + }, + { + "epoch": 4.418912529550828, + "grad_norm": 2.830085277557373, + "learning_rate": 8.210015035992172e-07, + "loss": 0.2921, + "step": 9346 + }, + { + "epoch": 4.419385342789598, + "grad_norm": 3.2846477031707764, + "learning_rate": 8.205393534439801e-07, + "loss": 0.4281, + "step": 9347 + }, + { + "epoch": 4.4198581560283685, + "grad_norm": 2.6153810024261475, + "learning_rate": 8.200773078613924e-07, + "loss": 0.2848, + "step": 9348 + }, + { + "epoch": 4.420330969267139, + "grad_norm": 3.0541396141052246, + "learning_rate": 8.196153668802253e-07, + "loss": 0.3619, + "step": 9349 + }, + { + "epoch": 4.42080378250591, + "grad_norm": 3.516235589981079, + "learning_rate": 8.191535305292406e-07, + "loss": 0.3996, + "step": 9350 + }, + { + "epoch": 4.421276595744681, + "grad_norm": 3.2205963134765625, + "learning_rate": 8.186917988371956e-07, + "loss": 0.3219, + "step": 9351 + }, + { + "epoch": 4.421749408983452, + "grad_norm": 3.2431082725524902, + "learning_rate": 8.1823017183284e-07, + "loss": 0.3033, + "step": 9352 + }, + { + "epoch": 4.4222222222222225, + "grad_norm": 3.337085485458374, + "learning_rate": 8.177686495449166e-07, + "loss": 0.3467, + "step": 9353 + }, + { + "epoch": 4.422695035460993, + "grad_norm": 3.2539291381835938, + "learning_rate": 8.173072320021641e-07, + "loss": 0.3565, + "step": 9354 + }, + { + "epoch": 4.423167848699763, + "grad_norm": 3.069993734359741, + "learning_rate": 8.168459192333105e-07, + "loss": 0.3627, + "step": 9355 + }, + { + "epoch": 4.423640661938534, + "grad_norm": 3.068195104598999, + "learning_rate": 8.163847112670826e-07, + "loss": 0.2945, + "step": 9356 + }, + { + "epoch": 4.424113475177305, + "grad_norm": 3.273607015609741, + "learning_rate": 8.159236081321959e-07, + "loss": 0.3451, + "step": 9357 + }, + { + "epoch": 4.424586288416076, + "grad_norm": 3.355647325515747, + "learning_rate": 8.154626098573607e-07, + "loss": 0.3469, + "step": 9358 + }, + { + "epoch": 4.425059101654846, + "grad_norm": 2.792948007583618, + "learning_rate": 8.150017164712831e-07, + "loss": 0.3393, + "step": 9359 + }, 
+ { + "epoch": 4.425531914893617, + "grad_norm": 3.031167507171631, + "learning_rate": 8.145409280026607e-07, + "loss": 0.2924, + "step": 9360 + }, + { + "epoch": 4.426004728132388, + "grad_norm": 3.0713601112365723, + "learning_rate": 8.140802444801835e-07, + "loss": 0.3221, + "step": 9361 + }, + { + "epoch": 4.426477541371159, + "grad_norm": 3.225785493850708, + "learning_rate": 8.136196659325374e-07, + "loss": 0.3626, + "step": 9362 + }, + { + "epoch": 4.426950354609929, + "grad_norm": 2.9779045581817627, + "learning_rate": 8.131591923883991e-07, + "loss": 0.3329, + "step": 9363 + }, + { + "epoch": 4.4274231678486995, + "grad_norm": 3.534536600112915, + "learning_rate": 8.126988238764422e-07, + "loss": 0.4029, + "step": 9364 + }, + { + "epoch": 4.42789598108747, + "grad_norm": 3.4237616062164307, + "learning_rate": 8.122385604253311e-07, + "loss": 0.3763, + "step": 9365 + }, + { + "epoch": 4.428368794326241, + "grad_norm": 2.8711681365966797, + "learning_rate": 8.117784020637231e-07, + "loss": 0.3141, + "step": 9366 + }, + { + "epoch": 4.428841607565012, + "grad_norm": 3.0277621746063232, + "learning_rate": 8.113183488202725e-07, + "loss": 0.3848, + "step": 9367 + }, + { + "epoch": 4.429314420803783, + "grad_norm": 3.1275761127471924, + "learning_rate": 8.108584007236226e-07, + "loss": 0.3343, + "step": 9368 + }, + { + "epoch": 4.4297872340425535, + "grad_norm": 3.2320117950439453, + "learning_rate": 8.103985578024143e-07, + "loss": 0.3211, + "step": 9369 + }, + { + "epoch": 4.430260047281324, + "grad_norm": 3.258829355239868, + "learning_rate": 8.099388200852792e-07, + "loss": 0.3473, + "step": 9370 + }, + { + "epoch": 4.430732860520094, + "grad_norm": 2.9609436988830566, + "learning_rate": 8.094791876008423e-07, + "loss": 0.318, + "step": 9371 + }, + { + "epoch": 4.431205673758865, + "grad_norm": 3.128053665161133, + "learning_rate": 8.090196603777245e-07, + "loss": 0.3372, + "step": 9372 + }, + { + "epoch": 4.431678486997636, + "grad_norm": 3.013979196548462, + "learning_rate": 8.085602384445368e-07, + "loss": 0.3098, + "step": 9373 + }, + { + "epoch": 4.432151300236407, + "grad_norm": 3.603433132171631, + "learning_rate": 8.081009218298871e-07, + "loss": 0.4016, + "step": 9374 + }, + { + "epoch": 4.432624113475177, + "grad_norm": 2.687730312347412, + "learning_rate": 8.076417105623743e-07, + "loss": 0.3173, + "step": 9375 + }, + { + "epoch": 4.433096926713948, + "grad_norm": 3.3575692176818848, + "learning_rate": 8.071826046705913e-07, + "loss": 0.3173, + "step": 9376 + }, + { + "epoch": 4.433569739952719, + "grad_norm": 3.3599679470062256, + "learning_rate": 8.06723604183125e-07, + "loss": 0.3466, + "step": 9377 + }, + { + "epoch": 4.43404255319149, + "grad_norm": 3.101696014404297, + "learning_rate": 8.062647091285542e-07, + "loss": 0.3322, + "step": 9378 + }, + { + "epoch": 4.43451536643026, + "grad_norm": 3.0010359287261963, + "learning_rate": 8.05805919535454e-07, + "loss": 0.3863, + "step": 9379 + }, + { + "epoch": 4.4349881796690305, + "grad_norm": 3.1428821086883545, + "learning_rate": 8.053472354323902e-07, + "loss": 0.3402, + "step": 9380 + }, + { + "epoch": 4.435460992907801, + "grad_norm": 3.416954517364502, + "learning_rate": 8.048886568479222e-07, + "loss": 0.3637, + "step": 9381 + }, + { + "epoch": 4.435933806146572, + "grad_norm": 3.015092611312866, + "learning_rate": 8.044301838106059e-07, + "loss": 0.3371, + "step": 9382 + }, + { + "epoch": 4.436406619385343, + "grad_norm": 2.6680097579956055, + "learning_rate": 8.039718163489862e-07, + "loss": 0.3148, + "step": 
9383 + }, + { + "epoch": 4.436879432624114, + "grad_norm": 3.098219633102417, + "learning_rate": 8.035135544916056e-07, + "loss": 0.3348, + "step": 9384 + }, + { + "epoch": 4.4373522458628845, + "grad_norm": 3.5400390625, + "learning_rate": 8.030553982669969e-07, + "loss": 0.4211, + "step": 9385 + }, + { + "epoch": 4.437825059101655, + "grad_norm": 3.6074001789093018, + "learning_rate": 8.025973477036872e-07, + "loss": 0.3179, + "step": 9386 + }, + { + "epoch": 4.438297872340425, + "grad_norm": 3.111982583999634, + "learning_rate": 8.021394028301982e-07, + "loss": 0.3414, + "step": 9387 + }, + { + "epoch": 4.438770685579196, + "grad_norm": 3.5494184494018555, + "learning_rate": 8.016815636750439e-07, + "loss": 0.3847, + "step": 9388 + }, + { + "epoch": 4.439243498817967, + "grad_norm": 3.3602912425994873, + "learning_rate": 8.012238302667308e-07, + "loss": 0.3293, + "step": 9389 + }, + { + "epoch": 4.439716312056738, + "grad_norm": 3.1263039112091064, + "learning_rate": 8.007662026337617e-07, + "loss": 0.3675, + "step": 9390 + }, + { + "epoch": 4.440189125295508, + "grad_norm": 3.239863157272339, + "learning_rate": 8.003086808046304e-07, + "loss": 0.3445, + "step": 9391 + }, + { + "epoch": 4.440661938534279, + "grad_norm": 2.8646275997161865, + "learning_rate": 7.998512648078244e-07, + "loss": 0.2715, + "step": 9392 + }, + { + "epoch": 4.44113475177305, + "grad_norm": 2.9777262210845947, + "learning_rate": 7.993939546718255e-07, + "loss": 0.3143, + "step": 9393 + }, + { + "epoch": 4.441607565011821, + "grad_norm": 3.5436604022979736, + "learning_rate": 7.98936750425107e-07, + "loss": 0.406, + "step": 9394 + }, + { + "epoch": 4.442080378250591, + "grad_norm": 3.1395277976989746, + "learning_rate": 7.984796520961391e-07, + "loss": 0.3893, + "step": 9395 + }, + { + "epoch": 4.4425531914893615, + "grad_norm": 3.6379475593566895, + "learning_rate": 7.980226597133814e-07, + "loss": 0.3878, + "step": 9396 + }, + { + "epoch": 4.443026004728132, + "grad_norm": 3.3866498470306396, + "learning_rate": 7.975657733052908e-07, + "loss": 0.3399, + "step": 9397 + }, + { + "epoch": 4.443498817966903, + "grad_norm": 2.9472098350524902, + "learning_rate": 7.971089929003142e-07, + "loss": 0.3436, + "step": 9398 + }, + { + "epoch": 4.443971631205674, + "grad_norm": 3.314652442932129, + "learning_rate": 7.966523185268929e-07, + "loss": 0.305, + "step": 9399 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 3.8230092525482178, + "learning_rate": 7.961957502134638e-07, + "loss": 0.3632, + "step": 9400 + }, + { + "epoch": 4.444917257683215, + "grad_norm": 3.088292360305786, + "learning_rate": 7.957392879884534e-07, + "loss": 0.3373, + "step": 9401 + }, + { + "epoch": 4.445390070921986, + "grad_norm": 3.1412665843963623, + "learning_rate": 7.952829318802854e-07, + "loss": 0.3703, + "step": 9402 + }, + { + "epoch": 4.445862884160756, + "grad_norm": 3.464963674545288, + "learning_rate": 7.948266819173745e-07, + "loss": 0.3485, + "step": 9403 + }, + { + "epoch": 4.446335697399527, + "grad_norm": 3.2092626094818115, + "learning_rate": 7.943705381281281e-07, + "loss": 0.341, + "step": 9404 + }, + { + "epoch": 4.446808510638298, + "grad_norm": 2.5458641052246094, + "learning_rate": 7.939145005409502e-07, + "loss": 0.3059, + "step": 9405 + }, + { + "epoch": 4.4472813238770685, + "grad_norm": 4.1484150886535645, + "learning_rate": 7.934585691842353e-07, + "loss": 0.3405, + "step": 9406 + }, + { + "epoch": 4.447754137115839, + "grad_norm": 2.9794130325317383, + "learning_rate": 7.930027440863716e-07, + "loss": 0.359, + 
"step": 9407 + }, + { + "epoch": 4.44822695035461, + "grad_norm": 2.951674222946167, + "learning_rate": 7.92547025275743e-07, + "loss": 0.3087, + "step": 9408 + }, + { + "epoch": 4.448699763593381, + "grad_norm": 3.2857377529144287, + "learning_rate": 7.920914127807241e-07, + "loss": 0.3327, + "step": 9409 + }, + { + "epoch": 4.449172576832151, + "grad_norm": 3.4455840587615967, + "learning_rate": 7.916359066296839e-07, + "loss": 0.3731, + "step": 9410 + }, + { + "epoch": 4.449645390070922, + "grad_norm": 3.392779588699341, + "learning_rate": 7.911805068509848e-07, + "loss": 0.3559, + "step": 9411 + }, + { + "epoch": 4.450118203309692, + "grad_norm": 3.3362300395965576, + "learning_rate": 7.90725213472982e-07, + "loss": 0.333, + "step": 9412 + }, + { + "epoch": 4.450591016548463, + "grad_norm": 3.0671608448028564, + "learning_rate": 7.902700265240259e-07, + "loss": 0.3342, + "step": 9413 + }, + { + "epoch": 4.451063829787234, + "grad_norm": 2.8350744247436523, + "learning_rate": 7.898149460324575e-07, + "loss": 0.3157, + "step": 9414 + }, + { + "epoch": 4.451536643026005, + "grad_norm": 2.932446241378784, + "learning_rate": 7.893599720266143e-07, + "loss": 0.379, + "step": 9415 + }, + { + "epoch": 4.452009456264776, + "grad_norm": 3.354112386703491, + "learning_rate": 7.889051045348245e-07, + "loss": 0.3001, + "step": 9416 + }, + { + "epoch": 4.452482269503546, + "grad_norm": 3.068276882171631, + "learning_rate": 7.884503435854104e-07, + "loss": 0.3466, + "step": 9417 + }, + { + "epoch": 4.452955082742317, + "grad_norm": 2.833534002304077, + "learning_rate": 7.879956892066892e-07, + "loss": 0.3278, + "step": 9418 + }, + { + "epoch": 4.453427895981087, + "grad_norm": 2.9622433185577393, + "learning_rate": 7.875411414269687e-07, + "loss": 0.3725, + "step": 9419 + }, + { + "epoch": 4.453900709219858, + "grad_norm": 3.2055954933166504, + "learning_rate": 7.870867002745533e-07, + "loss": 0.3215, + "step": 9420 + }, + { + "epoch": 4.454373522458629, + "grad_norm": 2.877063274383545, + "learning_rate": 7.86632365777738e-07, + "loss": 0.2845, + "step": 9421 + }, + { + "epoch": 4.4548463356973995, + "grad_norm": 3.2809367179870605, + "learning_rate": 7.861781379648117e-07, + "loss": 0.39, + "step": 9422 + }, + { + "epoch": 4.45531914893617, + "grad_norm": 3.404816150665283, + "learning_rate": 7.857240168640587e-07, + "loss": 0.3003, + "step": 9423 + }, + { + "epoch": 4.455791962174941, + "grad_norm": 3.367253303527832, + "learning_rate": 7.85270002503754e-07, + "loss": 0.3414, + "step": 9424 + }, + { + "epoch": 4.456264775413712, + "grad_norm": 3.1247670650482178, + "learning_rate": 7.848160949121678e-07, + "loss": 0.2922, + "step": 9425 + }, + { + "epoch": 4.456737588652482, + "grad_norm": 3.474435806274414, + "learning_rate": 7.843622941175624e-07, + "loss": 0.3601, + "step": 9426 + }, + { + "epoch": 4.457210401891253, + "grad_norm": 3.0552384853363037, + "learning_rate": 7.839086001481933e-07, + "loss": 0.3905, + "step": 9427 + }, + { + "epoch": 4.457683215130023, + "grad_norm": 3.3532586097717285, + "learning_rate": 7.834550130323115e-07, + "loss": 0.3783, + "step": 9428 + }, + { + "epoch": 4.458156028368794, + "grad_norm": 3.7321903705596924, + "learning_rate": 7.830015327981585e-07, + "loss": 0.3765, + "step": 9429 + }, + { + "epoch": 4.458628841607565, + "grad_norm": 3.070158004760742, + "learning_rate": 7.82548159473972e-07, + "loss": 0.3279, + "step": 9430 + }, + { + "epoch": 4.459101654846336, + "grad_norm": 3.498399257659912, + "learning_rate": 7.820948930879807e-07, + "loss": 0.3864, + 
"step": 9431 + }, + { + "epoch": 4.459574468085107, + "grad_norm": 3.0352776050567627, + "learning_rate": 7.816417336684071e-07, + "loss": 0.2963, + "step": 9432 + }, + { + "epoch": 4.460047281323877, + "grad_norm": 3.190154790878296, + "learning_rate": 7.811886812434686e-07, + "loss": 0.354, + "step": 9433 + }, + { + "epoch": 4.460520094562648, + "grad_norm": 3.1933085918426514, + "learning_rate": 7.807357358413742e-07, + "loss": 0.3613, + "step": 9434 + }, + { + "epoch": 4.460992907801418, + "grad_norm": 4.0385637283325195, + "learning_rate": 7.80282897490326e-07, + "loss": 0.3257, + "step": 9435 + }, + { + "epoch": 4.461465721040189, + "grad_norm": 3.365485191345215, + "learning_rate": 7.798301662185218e-07, + "loss": 0.3093, + "step": 9436 + }, + { + "epoch": 4.46193853427896, + "grad_norm": 3.5345213413238525, + "learning_rate": 7.793775420541497e-07, + "loss": 0.3262, + "step": 9437 + }, + { + "epoch": 4.4624113475177305, + "grad_norm": 3.2894418239593506, + "learning_rate": 7.789250250253941e-07, + "loss": 0.3417, + "step": 9438 + }, + { + "epoch": 4.462884160756501, + "grad_norm": 2.972001791000366, + "learning_rate": 7.784726151604305e-07, + "loss": 0.3396, + "step": 9439 + }, + { + "epoch": 4.463356973995272, + "grad_norm": 3.161794424057007, + "learning_rate": 7.780203124874283e-07, + "loss": 0.3583, + "step": 9440 + }, + { + "epoch": 4.463829787234043, + "grad_norm": 3.0976521968841553, + "learning_rate": 7.775681170345508e-07, + "loss": 0.3743, + "step": 9441 + }, + { + "epoch": 4.464302600472813, + "grad_norm": 3.1454756259918213, + "learning_rate": 7.771160288299534e-07, + "loss": 0.3483, + "step": 9442 + }, + { + "epoch": 4.464775413711584, + "grad_norm": 3.467618942260742, + "learning_rate": 7.766640479017868e-07, + "loss": 0.3253, + "step": 9443 + }, + { + "epoch": 4.465248226950354, + "grad_norm": 3.3349552154541016, + "learning_rate": 7.762121742781933e-07, + "loss": 0.3579, + "step": 9444 + }, + { + "epoch": 4.465721040189125, + "grad_norm": 3.442701578140259, + "learning_rate": 7.757604079873085e-07, + "loss": 0.3854, + "step": 9445 + }, + { + "epoch": 4.466193853427896, + "grad_norm": 3.3095569610595703, + "learning_rate": 7.753087490572633e-07, + "loss": 0.3385, + "step": 9446 + }, + { + "epoch": 4.466666666666667, + "grad_norm": 3.0978634357452393, + "learning_rate": 7.748571975161786e-07, + "loss": 0.3511, + "step": 9447 + }, + { + "epoch": 4.467139479905438, + "grad_norm": 2.9801225662231445, + "learning_rate": 7.744057533921731e-07, + "loss": 0.3239, + "step": 9448 + }, + { + "epoch": 4.467612293144208, + "grad_norm": 3.116586923599243, + "learning_rate": 7.739544167133545e-07, + "loss": 0.3786, + "step": 9449 + }, + { + "epoch": 4.468085106382979, + "grad_norm": 3.2235381603240967, + "learning_rate": 7.73503187507825e-07, + "loss": 0.3285, + "step": 9450 + }, + { + "epoch": 4.468557919621749, + "grad_norm": 3.175649404525757, + "learning_rate": 7.730520658036825e-07, + "loss": 0.315, + "step": 9451 + }, + { + "epoch": 4.46903073286052, + "grad_norm": 3.013848066329956, + "learning_rate": 7.726010516290144e-07, + "loss": 0.3533, + "step": 9452 + }, + { + "epoch": 4.469503546099291, + "grad_norm": 2.87581467628479, + "learning_rate": 7.721501450119057e-07, + "loss": 0.2948, + "step": 9453 + }, + { + "epoch": 4.4699763593380615, + "grad_norm": 3.504119873046875, + "learning_rate": 7.716993459804306e-07, + "loss": 0.354, + "step": 9454 + }, + { + "epoch": 4.470449172576832, + "grad_norm": 3.2914042472839355, + "learning_rate": 7.712486545626591e-07, + "loss": 
0.3724, + "step": 9455 + }, + { + "epoch": 4.470921985815603, + "grad_norm": 3.007551908493042, + "learning_rate": 7.707980707866533e-07, + "loss": 0.3923, + "step": 9456 + }, + { + "epoch": 4.471394799054374, + "grad_norm": 3.2758076190948486, + "learning_rate": 7.703475946804687e-07, + "loss": 0.4092, + "step": 9457 + }, + { + "epoch": 4.471867612293144, + "grad_norm": 3.265875816345215, + "learning_rate": 7.698972262721557e-07, + "loss": 0.4249, + "step": 9458 + }, + { + "epoch": 4.472340425531915, + "grad_norm": 3.0962677001953125, + "learning_rate": 7.694469655897565e-07, + "loss": 0.3273, + "step": 9459 + }, + { + "epoch": 4.472813238770685, + "grad_norm": 3.2247416973114014, + "learning_rate": 7.689968126613053e-07, + "loss": 0.366, + "step": 9460 + }, + { + "epoch": 4.473286052009456, + "grad_norm": 3.326211929321289, + "learning_rate": 7.685467675148334e-07, + "loss": 0.4044, + "step": 9461 + }, + { + "epoch": 4.473758865248227, + "grad_norm": 2.9795444011688232, + "learning_rate": 7.68096830178362e-07, + "loss": 0.3763, + "step": 9462 + }, + { + "epoch": 4.474231678486998, + "grad_norm": 3.0721724033355713, + "learning_rate": 7.676470006799061e-07, + "loss": 0.3586, + "step": 9463 + }, + { + "epoch": 4.474704491725769, + "grad_norm": 3.1191349029541016, + "learning_rate": 7.67197279047476e-07, + "loss": 0.3111, + "step": 9464 + }, + { + "epoch": 4.475177304964539, + "grad_norm": 3.2980053424835205, + "learning_rate": 7.667476653090727e-07, + "loss": 0.3413, + "step": 9465 + }, + { + "epoch": 4.47565011820331, + "grad_norm": 3.159794807434082, + "learning_rate": 7.662981594926927e-07, + "loss": 0.3559, + "step": 9466 + }, + { + "epoch": 4.47612293144208, + "grad_norm": 2.9250876903533936, + "learning_rate": 7.658487616263244e-07, + "loss": 0.3582, + "step": 9467 + }, + { + "epoch": 4.476595744680851, + "grad_norm": 2.915234088897705, + "learning_rate": 7.65399471737949e-07, + "loss": 0.3466, + "step": 9468 + }, + { + "epoch": 4.477068557919622, + "grad_norm": 3.2557425498962402, + "learning_rate": 7.649502898555431e-07, + "loss": 0.3772, + "step": 9469 + }, + { + "epoch": 4.4775413711583925, + "grad_norm": 3.1906673908233643, + "learning_rate": 7.645012160070748e-07, + "loss": 0.3379, + "step": 9470 + }, + { + "epoch": 4.478014184397163, + "grad_norm": 3.1513144969940186, + "learning_rate": 7.640522502205056e-07, + "loss": 0.3529, + "step": 9471 + }, + { + "epoch": 4.478486997635934, + "grad_norm": 3.1969199180603027, + "learning_rate": 7.636033925237904e-07, + "loss": 0.363, + "step": 9472 + }, + { + "epoch": 4.478959810874705, + "grad_norm": 3.4546799659729004, + "learning_rate": 7.631546429448785e-07, + "loss": 0.2944, + "step": 9473 + }, + { + "epoch": 4.479432624113475, + "grad_norm": 3.102057456970215, + "learning_rate": 7.627060015117116e-07, + "loss": 0.3309, + "step": 9474 + }, + { + "epoch": 4.479905437352246, + "grad_norm": 3.125751495361328, + "learning_rate": 7.622574682522232e-07, + "loss": 0.3576, + "step": 9475 + }, + { + "epoch": 4.480378250591016, + "grad_norm": 3.071798324584961, + "learning_rate": 7.618090431943432e-07, + "loss": 0.3298, + "step": 9476 + }, + { + "epoch": 4.480851063829787, + "grad_norm": 3.0060672760009766, + "learning_rate": 7.613607263659922e-07, + "loss": 0.3528, + "step": 9477 + }, + { + "epoch": 4.481323877068558, + "grad_norm": 3.254667043685913, + "learning_rate": 7.609125177950846e-07, + "loss": 0.3701, + "step": 9478 + }, + { + "epoch": 4.481796690307329, + "grad_norm": 3.282247304916382, + "learning_rate": 7.604644175095293e-07, + 
"loss": 0.4025, + "step": 9479 + }, + { + "epoch": 4.4822695035460995, + "grad_norm": 3.231097936630249, + "learning_rate": 7.600164255372266e-07, + "loss": 0.3395, + "step": 9480 + }, + { + "epoch": 4.48274231678487, + "grad_norm": 3.08368182182312, + "learning_rate": 7.595685419060722e-07, + "loss": 0.3356, + "step": 9481 + }, + { + "epoch": 4.48321513002364, + "grad_norm": 3.0406503677368164, + "learning_rate": 7.591207666439532e-07, + "loss": 0.2851, + "step": 9482 + }, + { + "epoch": 4.483687943262411, + "grad_norm": 3.021157741546631, + "learning_rate": 7.586730997787495e-07, + "loss": 0.3691, + "step": 9483 + }, + { + "epoch": 4.484160756501182, + "grad_norm": 2.8793535232543945, + "learning_rate": 7.582255413383375e-07, + "loss": 0.332, + "step": 9484 + }, + { + "epoch": 4.484633569739953, + "grad_norm": 3.319021224975586, + "learning_rate": 7.577780913505833e-07, + "loss": 0.3489, + "step": 9485 + }, + { + "epoch": 4.485106382978723, + "grad_norm": 3.0267672538757324, + "learning_rate": 7.573307498433472e-07, + "loss": 0.2989, + "step": 9486 + }, + { + "epoch": 4.485579196217494, + "grad_norm": 2.8953561782836914, + "learning_rate": 7.568835168444849e-07, + "loss": 0.306, + "step": 9487 + }, + { + "epoch": 4.486052009456265, + "grad_norm": 3.0559732913970947, + "learning_rate": 7.564363923818424e-07, + "loss": 0.3122, + "step": 9488 + }, + { + "epoch": 4.486524822695036, + "grad_norm": 3.369352340698242, + "learning_rate": 7.559893764832607e-07, + "loss": 0.3999, + "step": 9489 + }, + { + "epoch": 4.486997635933806, + "grad_norm": 3.3339598178863525, + "learning_rate": 7.555424691765731e-07, + "loss": 0.3566, + "step": 9490 + }, + { + "epoch": 4.4874704491725765, + "grad_norm": 3.6563758850097656, + "learning_rate": 7.550956704896062e-07, + "loss": 0.3686, + "step": 9491 + }, + { + "epoch": 4.487943262411347, + "grad_norm": 3.492706537246704, + "learning_rate": 7.546489804501811e-07, + "loss": 0.3382, + "step": 9492 + }, + { + "epoch": 4.488416075650118, + "grad_norm": 3.1645941734313965, + "learning_rate": 7.542023990861106e-07, + "loss": 0.3583, + "step": 9493 + }, + { + "epoch": 4.488888888888889, + "grad_norm": 2.9827258586883545, + "learning_rate": 7.537559264252021e-07, + "loss": 0.3243, + "step": 9494 + }, + { + "epoch": 4.48936170212766, + "grad_norm": 3.2876698970794678, + "learning_rate": 7.533095624952547e-07, + "loss": 0.373, + "step": 9495 + }, + { + "epoch": 4.4898345153664305, + "grad_norm": 3.110868453979492, + "learning_rate": 7.528633073240616e-07, + "loss": 0.351, + "step": 9496 + }, + { + "epoch": 4.490307328605201, + "grad_norm": 3.3962604999542236, + "learning_rate": 7.524171609394099e-07, + "loss": 0.3686, + "step": 9497 + }, + { + "epoch": 4.490780141843971, + "grad_norm": 3.272610902786255, + "learning_rate": 7.519711233690777e-07, + "loss": 0.3667, + "step": 9498 + }, + { + "epoch": 4.491252955082742, + "grad_norm": 3.0907654762268066, + "learning_rate": 7.515251946408398e-07, + "loss": 0.3247, + "step": 9499 + }, + { + "epoch": 4.491725768321513, + "grad_norm": 4.225870609283447, + "learning_rate": 7.510793747824613e-07, + "loss": 0.3636, + "step": 9500 + }, + { + "epoch": 4.492198581560284, + "grad_norm": 3.1911606788635254, + "learning_rate": 7.506336638217004e-07, + "loss": 0.3661, + "step": 9501 + }, + { + "epoch": 4.492671394799054, + "grad_norm": 2.907573699951172, + "learning_rate": 7.501880617863114e-07, + "loss": 0.3285, + "step": 9502 + }, + { + "epoch": 4.493144208037825, + "grad_norm": 3.388460397720337, + "learning_rate": 
7.497425687040388e-07, + "loss": 0.3354, + "step": 9503 + }, + { + "epoch": 4.493617021276596, + "grad_norm": 2.7236225605010986, + "learning_rate": 7.49297184602622e-07, + "loss": 0.3389, + "step": 9504 + }, + { + "epoch": 4.494089834515367, + "grad_norm": 3.1962947845458984, + "learning_rate": 7.488519095097929e-07, + "loss": 0.3377, + "step": 9505 + }, + { + "epoch": 4.494562647754137, + "grad_norm": 2.936845541000366, + "learning_rate": 7.484067434532763e-07, + "loss": 0.2893, + "step": 9506 + }, + { + "epoch": 4.4950354609929075, + "grad_norm": 2.8567588329315186, + "learning_rate": 7.47961686460792e-07, + "loss": 0.3026, + "step": 9507 + }, + { + "epoch": 4.495508274231678, + "grad_norm": 3.3522651195526123, + "learning_rate": 7.475167385600507e-07, + "loss": 0.3517, + "step": 9508 + }, + { + "epoch": 4.495981087470449, + "grad_norm": 3.338757276535034, + "learning_rate": 7.470718997787572e-07, + "loss": 0.3224, + "step": 9509 + }, + { + "epoch": 4.49645390070922, + "grad_norm": 3.1484947204589844, + "learning_rate": 7.466271701446107e-07, + "loss": 0.3872, + "step": 9510 + }, + { + "epoch": 4.496926713947991, + "grad_norm": 3.3275411128997803, + "learning_rate": 7.461825496853012e-07, + "loss": 0.3287, + "step": 9511 + }, + { + "epoch": 4.4973995271867615, + "grad_norm": 3.101416826248169, + "learning_rate": 7.457380384285151e-07, + "loss": 0.3223, + "step": 9512 + }, + { + "epoch": 4.497872340425532, + "grad_norm": 2.761810779571533, + "learning_rate": 7.45293636401929e-07, + "loss": 0.3264, + "step": 9513 + }, + { + "epoch": 4.498345153664302, + "grad_norm": 3.215078592300415, + "learning_rate": 7.448493436332132e-07, + "loss": 0.3548, + "step": 9514 + }, + { + "epoch": 4.498817966903073, + "grad_norm": 3.00111722946167, + "learning_rate": 7.444051601500335e-07, + "loss": 0.3271, + "step": 9515 + }, + { + "epoch": 4.499290780141844, + "grad_norm": 3.2428977489471436, + "learning_rate": 7.439610859800456e-07, + "loss": 0.3024, + "step": 9516 + }, + { + "epoch": 4.499763593380615, + "grad_norm": 2.7977585792541504, + "learning_rate": 7.435171211509018e-07, + "loss": 0.334, + "step": 9517 + }, + { + "epoch": 4.500236406619385, + "grad_norm": 3.273468494415283, + "learning_rate": 7.430732656902447e-07, + "loss": 0.337, + "step": 9518 + }, + { + "epoch": 4.500709219858156, + "grad_norm": 4.663364410400391, + "learning_rate": 7.426295196257116e-07, + "loss": 0.3492, + "step": 9519 + }, + { + "epoch": 4.501182033096927, + "grad_norm": 3.1147210597991943, + "learning_rate": 7.421858829849327e-07, + "loss": 0.3273, + "step": 9520 + }, + { + "epoch": 4.501654846335697, + "grad_norm": 3.1411445140838623, + "learning_rate": 7.4174235579553e-07, + "loss": 0.3413, + "step": 9521 + }, + { + "epoch": 4.502127659574468, + "grad_norm": 3.0361053943634033, + "learning_rate": 7.412989380851218e-07, + "loss": 0.2908, + "step": 9522 + }, + { + "epoch": 4.5026004728132385, + "grad_norm": 3.4923086166381836, + "learning_rate": 7.408556298813172e-07, + "loss": 0.3659, + "step": 9523 + }, + { + "epoch": 4.503073286052009, + "grad_norm": 3.6827056407928467, + "learning_rate": 7.40412431211718e-07, + "loss": 0.3485, + "step": 9524 + }, + { + "epoch": 4.50354609929078, + "grad_norm": 3.257322311401367, + "learning_rate": 7.399693421039219e-07, + "loss": 0.3592, + "step": 9525 + }, + { + "epoch": 4.504018912529551, + "grad_norm": 3.515291929244995, + "learning_rate": 7.395263625855167e-07, + "loss": 0.3662, + "step": 9526 + }, + { + "epoch": 4.504491725768322, + "grad_norm": 2.899764060974121, + 
"learning_rate": 7.390834926840865e-07, + "loss": 0.3564, + "step": 9527 + }, + { + "epoch": 4.5049645390070925, + "grad_norm": 3.2578322887420654, + "learning_rate": 7.386407324272055e-07, + "loss": 0.3074, + "step": 9528 + }, + { + "epoch": 4.505437352245863, + "grad_norm": 3.3826515674591064, + "learning_rate": 7.381980818424419e-07, + "loss": 0.3669, + "step": 9529 + }, + { + "epoch": 4.505910165484633, + "grad_norm": 3.470733404159546, + "learning_rate": 7.377555409573594e-07, + "loss": 0.3905, + "step": 9530 + }, + { + "epoch": 4.506382978723404, + "grad_norm": 3.228917121887207, + "learning_rate": 7.373131097995123e-07, + "loss": 0.3336, + "step": 9531 + }, + { + "epoch": 4.506855791962175, + "grad_norm": 3.2193191051483154, + "learning_rate": 7.368707883964476e-07, + "loss": 0.3285, + "step": 9532 + }, + { + "epoch": 4.507328605200946, + "grad_norm": 3.19169020652771, + "learning_rate": 7.36428576775709e-07, + "loss": 0.4022, + "step": 9533 + }, + { + "epoch": 4.507801418439716, + "grad_norm": 3.1887755393981934, + "learning_rate": 7.359864749648296e-07, + "loss": 0.3749, + "step": 9534 + }, + { + "epoch": 4.508274231678487, + "grad_norm": 3.574314832687378, + "learning_rate": 7.355444829913375e-07, + "loss": 0.3549, + "step": 9535 + }, + { + "epoch": 4.508747044917258, + "grad_norm": 3.3482754230499268, + "learning_rate": 7.351026008827527e-07, + "loss": 0.311, + "step": 9536 + }, + { + "epoch": 4.509219858156028, + "grad_norm": 3.5075576305389404, + "learning_rate": 7.34660828666591e-07, + "loss": 0.3575, + "step": 9537 + }, + { + "epoch": 4.509692671394799, + "grad_norm": 3.233328104019165, + "learning_rate": 7.342191663703588e-07, + "loss": 0.3087, + "step": 9538 + }, + { + "epoch": 4.5101654846335695, + "grad_norm": 3.3704137802124023, + "learning_rate": 7.337776140215555e-07, + "loss": 0.356, + "step": 9539 + }, + { + "epoch": 4.51063829787234, + "grad_norm": 4.084654331207275, + "learning_rate": 7.333361716476761e-07, + "loss": 0.3382, + "step": 9540 + }, + { + "epoch": 4.511111111111111, + "grad_norm": 2.985344886779785, + "learning_rate": 7.32894839276207e-07, + "loss": 0.3476, + "step": 9541 + }, + { + "epoch": 4.511583924349882, + "grad_norm": 3.405877113342285, + "learning_rate": 7.324536169346269e-07, + "loss": 0.3402, + "step": 9542 + }, + { + "epoch": 4.512056737588653, + "grad_norm": 3.0168516635894775, + "learning_rate": 7.320125046504103e-07, + "loss": 0.2936, + "step": 9543 + }, + { + "epoch": 4.5125295508274235, + "grad_norm": 2.9846513271331787, + "learning_rate": 7.315715024510219e-07, + "loss": 0.3165, + "step": 9544 + }, + { + "epoch": 4.513002364066194, + "grad_norm": 3.1661694049835205, + "learning_rate": 7.311306103639224e-07, + "loss": 0.3372, + "step": 9545 + }, + { + "epoch": 4.513475177304964, + "grad_norm": 3.3390371799468994, + "learning_rate": 7.306898284165637e-07, + "loss": 0.3427, + "step": 9546 + }, + { + "epoch": 4.513947990543735, + "grad_norm": 3.1360137462615967, + "learning_rate": 7.302491566363904e-07, + "loss": 0.3786, + "step": 9547 + }, + { + "epoch": 4.514420803782506, + "grad_norm": 3.3442773818969727, + "learning_rate": 7.298085950508427e-07, + "loss": 0.3516, + "step": 9548 + }, + { + "epoch": 4.514893617021277, + "grad_norm": 3.1403257846832275, + "learning_rate": 7.293681436873518e-07, + "loss": 0.32, + "step": 9549 + }, + { + "epoch": 4.515366430260047, + "grad_norm": 5.084080696105957, + "learning_rate": 7.289278025733417e-07, + "loss": 0.3036, + "step": 9550 + }, + { + "epoch": 4.515839243498818, + "grad_norm": 
3.055558919906616, + "learning_rate": 7.284875717362322e-07, + "loss": 0.3625, + "step": 9551 + }, + { + "epoch": 4.516312056737589, + "grad_norm": 3.3781931400299072, + "learning_rate": 7.280474512034338e-07, + "loss": 0.3759, + "step": 9552 + }, + { + "epoch": 4.516784869976359, + "grad_norm": 3.5266852378845215, + "learning_rate": 7.27607441002351e-07, + "loss": 0.4057, + "step": 9553 + }, + { + "epoch": 4.51725768321513, + "grad_norm": 3.290174961090088, + "learning_rate": 7.271675411603802e-07, + "loss": 0.3471, + "step": 9554 + }, + { + "epoch": 4.5177304964539005, + "grad_norm": 3.399919033050537, + "learning_rate": 7.267277517049137e-07, + "loss": 0.394, + "step": 9555 + }, + { + "epoch": 4.518203309692671, + "grad_norm": 3.4410784244537354, + "learning_rate": 7.262880726633348e-07, + "loss": 0.4351, + "step": 9556 + }, + { + "epoch": 4.518676122931442, + "grad_norm": 2.7317543029785156, + "learning_rate": 7.258485040630192e-07, + "loss": 0.3039, + "step": 9557 + }, + { + "epoch": 4.519148936170213, + "grad_norm": 3.769446849822998, + "learning_rate": 7.254090459313384e-07, + "loss": 0.4061, + "step": 9558 + }, + { + "epoch": 4.519621749408984, + "grad_norm": 3.5625245571136475, + "learning_rate": 7.249696982956553e-07, + "loss": 0.4148, + "step": 9559 + }, + { + "epoch": 4.520094562647754, + "grad_norm": 3.1534764766693115, + "learning_rate": 7.245304611833248e-07, + "loss": 0.3369, + "step": 9560 + }, + { + "epoch": 4.520567375886525, + "grad_norm": 3.179197311401367, + "learning_rate": 7.240913346216982e-07, + "loss": 0.2798, + "step": 9561 + }, + { + "epoch": 4.521040189125295, + "grad_norm": 3.2608659267425537, + "learning_rate": 7.236523186381162e-07, + "loss": 0.3513, + "step": 9562 + }, + { + "epoch": 4.521513002364066, + "grad_norm": 3.055513381958008, + "learning_rate": 7.232134132599158e-07, + "loss": 0.3098, + "step": 9563 + }, + { + "epoch": 4.521985815602837, + "grad_norm": 3.159937620162964, + "learning_rate": 7.227746185144258e-07, + "loss": 0.3234, + "step": 9564 + }, + { + "epoch": 4.5224586288416075, + "grad_norm": 3.176802635192871, + "learning_rate": 7.22335934428966e-07, + "loss": 0.3547, + "step": 9565 + }, + { + "epoch": 4.522931442080378, + "grad_norm": 3.476203680038452, + "learning_rate": 7.218973610308538e-07, + "loss": 0.3659, + "step": 9566 + }, + { + "epoch": 4.523404255319149, + "grad_norm": 3.277595043182373, + "learning_rate": 7.214588983473964e-07, + "loss": 0.3448, + "step": 9567 + }, + { + "epoch": 4.52387706855792, + "grad_norm": 3.0068325996398926, + "learning_rate": 7.210205464058944e-07, + "loss": 0.3341, + "step": 9568 + }, + { + "epoch": 4.52434988179669, + "grad_norm": 3.3836655616760254, + "learning_rate": 7.205823052336425e-07, + "loss": 0.3253, + "step": 9569 + }, + { + "epoch": 4.524822695035461, + "grad_norm": 3.4170496463775635, + "learning_rate": 7.201441748579271e-07, + "loss": 0.3754, + "step": 9570 + }, + { + "epoch": 4.525295508274231, + "grad_norm": 3.5432863235473633, + "learning_rate": 7.197061553060303e-07, + "loss": 0.3265, + "step": 9571 + }, + { + "epoch": 4.525768321513002, + "grad_norm": 2.900520086288452, + "learning_rate": 7.192682466052243e-07, + "loss": 0.2961, + "step": 9572 + }, + { + "epoch": 4.526241134751773, + "grad_norm": 3.028733491897583, + "learning_rate": 7.188304487827768e-07, + "loss": 0.347, + "step": 9573 + }, + { + "epoch": 4.526713947990544, + "grad_norm": 2.8739330768585205, + "learning_rate": 7.183927618659473e-07, + "loss": 0.3265, + "step": 9574 + }, + { + "epoch": 4.527186761229315, + 
"grad_norm": 3.4727251529693604, + "learning_rate": 7.179551858819873e-07, + "loss": 0.3882, + "step": 9575 + }, + { + "epoch": 4.527659574468085, + "grad_norm": 2.950634002685547, + "learning_rate": 7.175177208581449e-07, + "loss": 0.2699, + "step": 9576 + }, + { + "epoch": 4.528132387706856, + "grad_norm": 3.035752773284912, + "learning_rate": 7.170803668216572e-07, + "loss": 0.3939, + "step": 9577 + }, + { + "epoch": 4.528605200945626, + "grad_norm": 2.9155373573303223, + "learning_rate": 7.166431237997579e-07, + "loss": 0.3112, + "step": 9578 + }, + { + "epoch": 4.529078014184397, + "grad_norm": 3.2655560970306396, + "learning_rate": 7.162059918196715e-07, + "loss": 0.3605, + "step": 9579 + }, + { + "epoch": 4.529550827423168, + "grad_norm": 3.0889620780944824, + "learning_rate": 7.157689709086157e-07, + "loss": 0.3333, + "step": 9580 + }, + { + "epoch": 4.5300236406619385, + "grad_norm": 3.193974256515503, + "learning_rate": 7.153320610938031e-07, + "loss": 0.3206, + "step": 9581 + }, + { + "epoch": 4.530496453900709, + "grad_norm": 3.0833280086517334, + "learning_rate": 7.148952624024374e-07, + "loss": 0.3483, + "step": 9582 + }, + { + "epoch": 4.53096926713948, + "grad_norm": 2.8866562843322754, + "learning_rate": 7.144585748617163e-07, + "loss": 0.3147, + "step": 9583 + }, + { + "epoch": 4.531442080378251, + "grad_norm": 3.2411928176879883, + "learning_rate": 7.140219984988305e-07, + "loss": 0.336, + "step": 9584 + }, + { + "epoch": 4.531914893617021, + "grad_norm": 3.0993618965148926, + "learning_rate": 7.13585533340963e-07, + "loss": 0.3599, + "step": 9585 + }, + { + "epoch": 4.532387706855792, + "grad_norm": 3.361176013946533, + "learning_rate": 7.131491794152917e-07, + "loss": 0.3448, + "step": 9586 + }, + { + "epoch": 4.532860520094562, + "grad_norm": 3.0916879177093506, + "learning_rate": 7.12712936748986e-07, + "loss": 0.3479, + "step": 9587 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 3.254135847091675, + "learning_rate": 7.122768053692078e-07, + "loss": 0.3536, + "step": 9588 + }, + { + "epoch": 4.533806146572104, + "grad_norm": 3.120321035385132, + "learning_rate": 7.118407853031148e-07, + "loss": 0.3604, + "step": 9589 + }, + { + "epoch": 4.534278959810875, + "grad_norm": 3.0456507205963135, + "learning_rate": 7.114048765778544e-07, + "loss": 0.3473, + "step": 9590 + }, + { + "epoch": 4.534751773049646, + "grad_norm": 3.7177469730377197, + "learning_rate": 7.109690792205704e-07, + "loss": 0.374, + "step": 9591 + }, + { + "epoch": 4.535224586288416, + "grad_norm": 3.2694458961486816, + "learning_rate": 7.105333932583972e-07, + "loss": 0.4206, + "step": 9592 + }, + { + "epoch": 4.535697399527187, + "grad_norm": 3.506195068359375, + "learning_rate": 7.100978187184624e-07, + "loss": 0.3483, + "step": 9593 + }, + { + "epoch": 4.536170212765957, + "grad_norm": 3.569413661956787, + "learning_rate": 7.096623556278887e-07, + "loss": 0.3389, + "step": 9594 + }, + { + "epoch": 4.536643026004728, + "grad_norm": 3.2686502933502197, + "learning_rate": 7.092270040137886e-07, + "loss": 0.3571, + "step": 9595 + }, + { + "epoch": 4.537115839243499, + "grad_norm": 3.2042582035064697, + "learning_rate": 7.087917639032718e-07, + "loss": 0.3742, + "step": 9596 + }, + { + "epoch": 4.5375886524822695, + "grad_norm": 3.014989137649536, + "learning_rate": 7.083566353234375e-07, + "loss": 0.3294, + "step": 9597 + }, + { + "epoch": 4.53806146572104, + "grad_norm": 3.4535064697265625, + "learning_rate": 7.079216183013793e-07, + "loss": 0.3434, + "step": 9598 + }, + { + "epoch": 
4.538534278959811, + "grad_norm": 3.123633623123169, + "learning_rate": 7.074867128641841e-07, + "loss": 0.357, + "step": 9599 + }, + { + "epoch": 4.539007092198582, + "grad_norm": 3.0646567344665527, + "learning_rate": 7.070519190389305e-07, + "loss": 0.3488, + "step": 9600 + }, + { + "epoch": 4.539479905437352, + "grad_norm": 2.951892852783203, + "learning_rate": 7.066172368526927e-07, + "loss": 0.329, + "step": 9601 + }, + { + "epoch": 4.539952718676123, + "grad_norm": 2.8071751594543457, + "learning_rate": 7.061826663325361e-07, + "loss": 0.2788, + "step": 9602 + }, + { + "epoch": 4.540425531914893, + "grad_norm": 3.9670250415802, + "learning_rate": 7.057482075055183e-07, + "loss": 0.3776, + "step": 9603 + }, + { + "epoch": 4.540898345153664, + "grad_norm": 2.683743476867676, + "learning_rate": 7.053138603986928e-07, + "loss": 0.3044, + "step": 9604 + }, + { + "epoch": 4.541371158392435, + "grad_norm": 2.9766221046447754, + "learning_rate": 7.048796250391038e-07, + "loss": 0.3542, + "step": 9605 + }, + { + "epoch": 4.541843971631206, + "grad_norm": 3.1156277656555176, + "learning_rate": 7.044455014537882e-07, + "loss": 0.3401, + "step": 9606 + }, + { + "epoch": 4.542316784869977, + "grad_norm": 2.8444416522979736, + "learning_rate": 7.040114896697789e-07, + "loss": 0.3437, + "step": 9607 + }, + { + "epoch": 4.542789598108747, + "grad_norm": 2.9964232444763184, + "learning_rate": 7.035775897140984e-07, + "loss": 0.331, + "step": 9608 + }, + { + "epoch": 4.543262411347518, + "grad_norm": 3.511500597000122, + "learning_rate": 7.031438016137648e-07, + "loss": 0.3685, + "step": 9609 + }, + { + "epoch": 4.543735224586288, + "grad_norm": 3.541271686553955, + "learning_rate": 7.027101253957877e-07, + "loss": 0.3945, + "step": 9610 + }, + { + "epoch": 4.544208037825059, + "grad_norm": 3.1483919620513916, + "learning_rate": 7.022765610871696e-07, + "loss": 0.3681, + "step": 9611 + }, + { + "epoch": 4.54468085106383, + "grad_norm": 2.908977508544922, + "learning_rate": 7.01843108714908e-07, + "loss": 0.3468, + "step": 9612 + }, + { + "epoch": 4.5451536643026005, + "grad_norm": 3.3107962608337402, + "learning_rate": 7.014097683059912e-07, + "loss": 0.419, + "step": 9613 + }, + { + "epoch": 4.545626477541371, + "grad_norm": 3.5597898960113525, + "learning_rate": 7.009765398874008e-07, + "loss": 0.3238, + "step": 9614 + }, + { + "epoch": 4.546099290780142, + "grad_norm": 3.091235399246216, + "learning_rate": 7.005434234861136e-07, + "loss": 0.3632, + "step": 9615 + }, + { + "epoch": 4.546572104018913, + "grad_norm": 3.279076099395752, + "learning_rate": 7.001104191290972e-07, + "loss": 0.4006, + "step": 9616 + }, + { + "epoch": 4.547044917257683, + "grad_norm": 3.3877902030944824, + "learning_rate": 6.996775268433126e-07, + "loss": 0.4183, + "step": 9617 + }, + { + "epoch": 4.547517730496454, + "grad_norm": 2.979999542236328, + "learning_rate": 6.992447466557134e-07, + "loss": 0.2921, + "step": 9618 + }, + { + "epoch": 4.547990543735224, + "grad_norm": 3.196361780166626, + "learning_rate": 6.988120785932484e-07, + "loss": 0.3352, + "step": 9619 + }, + { + "epoch": 4.548463356973995, + "grad_norm": 3.3237528800964355, + "learning_rate": 6.983795226828577e-07, + "loss": 0.3487, + "step": 9620 + }, + { + "epoch": 4.548936170212766, + "grad_norm": 3.0740649700164795, + "learning_rate": 6.979470789514731e-07, + "loss": 0.3497, + "step": 9621 + }, + { + "epoch": 4.549408983451537, + "grad_norm": 3.3443479537963867, + "learning_rate": 6.97514747426023e-07, + "loss": 0.3752, + "step": 9622 + }, + { + 
"epoch": 4.549881796690308, + "grad_norm": 3.450427293777466, + "learning_rate": 6.970825281334254e-07, + "loss": 0.3981, + "step": 9623 + }, + { + "epoch": 4.550354609929078, + "grad_norm": 3.4733047485351562, + "learning_rate": 6.966504211005937e-07, + "loss": 0.3953, + "step": 9624 + }, + { + "epoch": 4.550827423167849, + "grad_norm": 3.1651546955108643, + "learning_rate": 6.962184263544328e-07, + "loss": 0.3012, + "step": 9625 + }, + { + "epoch": 4.551300236406619, + "grad_norm": 2.9222865104675293, + "learning_rate": 6.957865439218405e-07, + "loss": 0.2774, + "step": 9626 + }, + { + "epoch": 4.55177304964539, + "grad_norm": 2.972437620162964, + "learning_rate": 6.953547738297095e-07, + "loss": 0.3478, + "step": 9627 + }, + { + "epoch": 4.552245862884161, + "grad_norm": 3.2741193771362305, + "learning_rate": 6.949231161049239e-07, + "loss": 0.355, + "step": 9628 + }, + { + "epoch": 4.5527186761229315, + "grad_norm": 2.8715150356292725, + "learning_rate": 6.9449157077436e-07, + "loss": 0.3055, + "step": 9629 + }, + { + "epoch": 4.553191489361702, + "grad_norm": 4.50998592376709, + "learning_rate": 6.940601378648895e-07, + "loss": 0.3732, + "step": 9630 + }, + { + "epoch": 4.553664302600473, + "grad_norm": 2.9277849197387695, + "learning_rate": 6.936288174033757e-07, + "loss": 0.3367, + "step": 9631 + }, + { + "epoch": 4.554137115839244, + "grad_norm": 3.169978380203247, + "learning_rate": 6.931976094166746e-07, + "loss": 0.3529, + "step": 9632 + }, + { + "epoch": 4.554609929078014, + "grad_norm": 2.9629712104797363, + "learning_rate": 6.927665139316359e-07, + "loss": 0.3416, + "step": 9633 + }, + { + "epoch": 4.555082742316785, + "grad_norm": 3.1368603706359863, + "learning_rate": 6.923355309751012e-07, + "loss": 0.3267, + "step": 9634 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 2.9895052909851074, + "learning_rate": 6.919046605739071e-07, + "loss": 0.3411, + "step": 9635 + }, + { + "epoch": 4.556028368794326, + "grad_norm": 3.1592509746551514, + "learning_rate": 6.914739027548809e-07, + "loss": 0.3488, + "step": 9636 + }, + { + "epoch": 4.556501182033097, + "grad_norm": 3.0848731994628906, + "learning_rate": 6.910432575448456e-07, + "loss": 0.3732, + "step": 9637 + }, + { + "epoch": 4.556973995271868, + "grad_norm": 3.1475934982299805, + "learning_rate": 6.906127249706143e-07, + "loss": 0.3525, + "step": 9638 + }, + { + "epoch": 4.5574468085106385, + "grad_norm": 2.9435455799102783, + "learning_rate": 6.90182305058994e-07, + "loss": 0.3155, + "step": 9639 + }, + { + "epoch": 4.557919621749409, + "grad_norm": 3.4412894248962402, + "learning_rate": 6.897519978367867e-07, + "loss": 0.3511, + "step": 9640 + }, + { + "epoch": 4.55839243498818, + "grad_norm": 3.3600406646728516, + "learning_rate": 6.893218033307838e-07, + "loss": 0.4311, + "step": 9641 + }, + { + "epoch": 4.55886524822695, + "grad_norm": 3.35927414894104, + "learning_rate": 6.888917215677734e-07, + "loss": 0.387, + "step": 9642 + }, + { + "epoch": 4.559338061465721, + "grad_norm": 3.2481210231781006, + "learning_rate": 6.884617525745343e-07, + "loss": 0.3456, + "step": 9643 + }, + { + "epoch": 4.559810874704492, + "grad_norm": 3.661160469055176, + "learning_rate": 6.880318963778374e-07, + "loss": 0.4276, + "step": 9644 + }, + { + "epoch": 4.560283687943262, + "grad_norm": 3.038726806640625, + "learning_rate": 6.876021530044502e-07, + "loss": 0.3288, + "step": 9645 + }, + { + "epoch": 4.560756501182033, + "grad_norm": 3.0502963066101074, + "learning_rate": 6.871725224811296e-07, + "loss": 0.3334, + "step": 9646 + }, 
+ { + "epoch": 4.561229314420804, + "grad_norm": 3.1810805797576904, + "learning_rate": 6.867430048346268e-07, + "loss": 0.3335, + "step": 9647 + }, + { + "epoch": 4.561702127659575, + "grad_norm": 3.028670072555542, + "learning_rate": 6.863136000916864e-07, + "loss": 0.3235, + "step": 9648 + }, + { + "epoch": 4.562174940898345, + "grad_norm": 2.805989980697632, + "learning_rate": 6.858843082790447e-07, + "loss": 0.3201, + "step": 9649 + }, + { + "epoch": 4.5626477541371155, + "grad_norm": 3.0792744159698486, + "learning_rate": 6.854551294234333e-07, + "loss": 0.3757, + "step": 9650 + }, + { + "epoch": 4.563120567375886, + "grad_norm": 3.115539312362671, + "learning_rate": 6.850260635515735e-07, + "loss": 0.3311, + "step": 9651 + }, + { + "epoch": 4.563593380614657, + "grad_norm": 3.003520965576172, + "learning_rate": 6.845971106901831e-07, + "loss": 0.35, + "step": 9652 + }, + { + "epoch": 4.564066193853428, + "grad_norm": 2.954759359359741, + "learning_rate": 6.841682708659702e-07, + "loss": 0.279, + "step": 9653 + }, + { + "epoch": 4.564539007092199, + "grad_norm": 3.1510894298553467, + "learning_rate": 6.83739544105636e-07, + "loss": 0.333, + "step": 9654 + }, + { + "epoch": 4.5650118203309695, + "grad_norm": 3.3958635330200195, + "learning_rate": 6.833109304358776e-07, + "loss": 0.3668, + "step": 9655 + }, + { + "epoch": 4.56548463356974, + "grad_norm": 3.493522882461548, + "learning_rate": 6.828824298833811e-07, + "loss": 0.3522, + "step": 9656 + }, + { + "epoch": 4.565957446808511, + "grad_norm": 3.217268705368042, + "learning_rate": 6.824540424748275e-07, + "loss": 0.4066, + "step": 9657 + }, + { + "epoch": 4.566430260047281, + "grad_norm": 3.148505210876465, + "learning_rate": 6.820257682368914e-07, + "loss": 0.3252, + "step": 9658 + }, + { + "epoch": 4.566903073286052, + "grad_norm": 3.070316791534424, + "learning_rate": 6.815976071962385e-07, + "loss": 0.3362, + "step": 9659 + }, + { + "epoch": 4.567375886524823, + "grad_norm": 3.0421791076660156, + "learning_rate": 6.811695593795301e-07, + "loss": 0.3894, + "step": 9660 + }, + { + "epoch": 4.567848699763593, + "grad_norm": 2.9165565967559814, + "learning_rate": 6.807416248134177e-07, + "loss": 0.3147, + "step": 9661 + }, + { + "epoch": 4.568321513002364, + "grad_norm": 3.361647129058838, + "learning_rate": 6.803138035245471e-07, + "loss": 0.3346, + "step": 9662 + }, + { + "epoch": 4.568794326241135, + "grad_norm": 3.0013155937194824, + "learning_rate": 6.79886095539557e-07, + "loss": 0.3476, + "step": 9663 + }, + { + "epoch": 4.569267139479906, + "grad_norm": 2.9030165672302246, + "learning_rate": 6.794585008850779e-07, + "loss": 0.3118, + "step": 9664 + }, + { + "epoch": 4.569739952718676, + "grad_norm": 3.229907989501953, + "learning_rate": 6.790310195877361e-07, + "loss": 0.3257, + "step": 9665 + }, + { + "epoch": 4.5702127659574465, + "grad_norm": 3.4075570106506348, + "learning_rate": 6.786036516741479e-07, + "loss": 0.3545, + "step": 9666 + }, + { + "epoch": 4.570685579196217, + "grad_norm": 2.9831581115722656, + "learning_rate": 6.781763971709229e-07, + "loss": 0.3173, + "step": 9667 + }, + { + "epoch": 4.571158392434988, + "grad_norm": 3.8512840270996094, + "learning_rate": 6.777492561046659e-07, + "loss": 0.288, + "step": 9668 + }, + { + "epoch": 4.571631205673759, + "grad_norm": 3.3054401874542236, + "learning_rate": 6.773222285019718e-07, + "loss": 0.369, + "step": 9669 + }, + { + "epoch": 4.57210401891253, + "grad_norm": 2.9155004024505615, + "learning_rate": 6.768953143894308e-07, + "loss": 0.3334, + "step": 9670 + 
}, + { + "epoch": 4.5725768321513005, + "grad_norm": 3.60557222366333, + "learning_rate": 6.764685137936247e-07, + "loss": 0.4094, + "step": 9671 + }, + { + "epoch": 4.573049645390071, + "grad_norm": 3.271256446838379, + "learning_rate": 6.760418267411275e-07, + "loss": 0.3646, + "step": 9672 + }, + { + "epoch": 4.573522458628842, + "grad_norm": 2.970238447189331, + "learning_rate": 6.756152532585086e-07, + "loss": 0.34, + "step": 9673 + }, + { + "epoch": 4.573995271867612, + "grad_norm": 3.412712574005127, + "learning_rate": 6.751887933723277e-07, + "loss": 0.3674, + "step": 9674 + }, + { + "epoch": 4.574468085106383, + "grad_norm": 2.9984517097473145, + "learning_rate": 6.747624471091396e-07, + "loss": 0.3579, + "step": 9675 + }, + { + "epoch": 4.574940898345154, + "grad_norm": 2.863788366317749, + "learning_rate": 6.743362144954907e-07, + "loss": 0.3234, + "step": 9676 + }, + { + "epoch": 4.575413711583924, + "grad_norm": 3.313793897628784, + "learning_rate": 6.739100955579203e-07, + "loss": 0.334, + "step": 9677 + }, + { + "epoch": 4.575886524822695, + "grad_norm": 3.5350630283355713, + "learning_rate": 6.734840903229611e-07, + "loss": 0.3682, + "step": 9678 + }, + { + "epoch": 4.576359338061466, + "grad_norm": 3.531888723373413, + "learning_rate": 6.730581988171378e-07, + "loss": 0.3434, + "step": 9679 + }, + { + "epoch": 4.576832151300237, + "grad_norm": 3.358574867248535, + "learning_rate": 6.726324210669702e-07, + "loss": 0.3751, + "step": 9680 + }, + { + "epoch": 4.577304964539007, + "grad_norm": 2.9723873138427734, + "learning_rate": 6.722067570989691e-07, + "loss": 0.3077, + "step": 9681 + }, + { + "epoch": 4.5777777777777775, + "grad_norm": 3.2287187576293945, + "learning_rate": 6.717812069396379e-07, + "loss": 0.3493, + "step": 9682 + }, + { + "epoch": 4.578250591016548, + "grad_norm": 2.9089417457580566, + "learning_rate": 6.71355770615475e-07, + "loss": 0.324, + "step": 9683 + }, + { + "epoch": 4.578723404255319, + "grad_norm": 3.2894415855407715, + "learning_rate": 6.709304481529703e-07, + "loss": 0.3066, + "step": 9684 + }, + { + "epoch": 4.57919621749409, + "grad_norm": 3.1914620399475098, + "learning_rate": 6.705052395786052e-07, + "loss": 0.3453, + "step": 9685 + }, + { + "epoch": 4.579669030732861, + "grad_norm": 3.1095924377441406, + "learning_rate": 6.700801449188577e-07, + "loss": 0.3678, + "step": 9686 + }, + { + "epoch": 4.5801418439716315, + "grad_norm": 3.416944980621338, + "learning_rate": 6.696551642001948e-07, + "loss": 0.3754, + "step": 9687 + }, + { + "epoch": 4.580614657210402, + "grad_norm": 3.7102952003479004, + "learning_rate": 6.692302974490797e-07, + "loss": 0.3723, + "step": 9688 + }, + { + "epoch": 4.581087470449172, + "grad_norm": 3.296607494354248, + "learning_rate": 6.688055446919664e-07, + "loss": 0.3607, + "step": 9689 + }, + { + "epoch": 4.581560283687943, + "grad_norm": 3.4449238777160645, + "learning_rate": 6.683809059553014e-07, + "loss": 0.3102, + "step": 9690 + }, + { + "epoch": 4.582033096926714, + "grad_norm": 3.202671766281128, + "learning_rate": 6.679563812655268e-07, + "loss": 0.3535, + "step": 9691 + }, + { + "epoch": 4.582505910165485, + "grad_norm": 3.25919771194458, + "learning_rate": 6.675319706490744e-07, + "loss": 0.3778, + "step": 9692 + }, + { + "epoch": 4.582978723404255, + "grad_norm": 3.316021680831909, + "learning_rate": 6.671076741323718e-07, + "loss": 0.2943, + "step": 9693 + }, + { + "epoch": 4.583451536643026, + "grad_norm": 3.2375826835632324, + "learning_rate": 6.666834917418371e-07, + "loss": 0.3249, + "step": 
9694 + }, + { + "epoch": 4.583924349881797, + "grad_norm": 2.875436782836914, + "learning_rate": 6.662594235038827e-07, + "loss": 0.2991, + "step": 9695 + }, + { + "epoch": 4.584397163120567, + "grad_norm": 3.75874924659729, + "learning_rate": 6.658354694449134e-07, + "loss": 0.3718, + "step": 9696 + }, + { + "epoch": 4.584869976359338, + "grad_norm": 3.060943126678467, + "learning_rate": 6.65411629591326e-07, + "loss": 0.3183, + "step": 9697 + }, + { + "epoch": 4.5853427895981085, + "grad_norm": 3.024336576461792, + "learning_rate": 6.649879039695126e-07, + "loss": 0.3118, + "step": 9698 + }, + { + "epoch": 4.585815602836879, + "grad_norm": 3.3640875816345215, + "learning_rate": 6.645642926058562e-07, + "loss": 0.3408, + "step": 9699 + }, + { + "epoch": 4.58628841607565, + "grad_norm": 2.8885910511016846, + "learning_rate": 6.641407955267326e-07, + "loss": 0.3304, + "step": 9700 + }, + { + "epoch": 4.586761229314421, + "grad_norm": 3.8225393295288086, + "learning_rate": 6.637174127585122e-07, + "loss": 0.3469, + "step": 9701 + }, + { + "epoch": 4.587234042553192, + "grad_norm": 3.0624778270721436, + "learning_rate": 6.632941443275567e-07, + "loss": 0.3177, + "step": 9702 + }, + { + "epoch": 4.5877068557919625, + "grad_norm": 3.1422903537750244, + "learning_rate": 6.628709902602204e-07, + "loss": 0.3205, + "step": 9703 + }, + { + "epoch": 4.588179669030733, + "grad_norm": 3.1315362453460693, + "learning_rate": 6.62447950582853e-07, + "loss": 0.3443, + "step": 9704 + }, + { + "epoch": 4.588652482269503, + "grad_norm": 3.096041202545166, + "learning_rate": 6.62025025321793e-07, + "loss": 0.3567, + "step": 9705 + }, + { + "epoch": 4.589125295508274, + "grad_norm": 3.225820302963257, + "learning_rate": 6.616022145033766e-07, + "loss": 0.3873, + "step": 9706 + }, + { + "epoch": 4.589598108747045, + "grad_norm": 3.3879058361053467, + "learning_rate": 6.611795181539288e-07, + "loss": 0.4379, + "step": 9707 + }, + { + "epoch": 4.590070921985816, + "grad_norm": 3.508265733718872, + "learning_rate": 6.60756936299769e-07, + "loss": 0.3182, + "step": 9708 + }, + { + "epoch": 4.590543735224586, + "grad_norm": 3.278857946395874, + "learning_rate": 6.603344689672106e-07, + "loss": 0.3508, + "step": 9709 + }, + { + "epoch": 4.591016548463357, + "grad_norm": 2.9961371421813965, + "learning_rate": 6.599121161825581e-07, + "loss": 0.3178, + "step": 9710 + }, + { + "epoch": 4.591489361702128, + "grad_norm": 3.413717269897461, + "learning_rate": 6.594898779721092e-07, + "loss": 0.363, + "step": 9711 + }, + { + "epoch": 4.591962174940898, + "grad_norm": 3.2014074325561523, + "learning_rate": 6.590677543621557e-07, + "loss": 0.392, + "step": 9712 + }, + { + "epoch": 4.592434988179669, + "grad_norm": 3.0421640872955322, + "learning_rate": 6.586457453789802e-07, + "loss": 0.3119, + "step": 9713 + }, + { + "epoch": 4.5929078014184395, + "grad_norm": 3.0515928268432617, + "learning_rate": 6.582238510488604e-07, + "loss": 0.3021, + "step": 9714 + }, + { + "epoch": 4.59338061465721, + "grad_norm": 3.0824668407440186, + "learning_rate": 6.578020713980648e-07, + "loss": 0.3551, + "step": 9715 + }, + { + "epoch": 4.593853427895981, + "grad_norm": 3.0002171993255615, + "learning_rate": 6.573804064528574e-07, + "loss": 0.3691, + "step": 9716 + }, + { + "epoch": 4.594326241134752, + "grad_norm": 3.0174765586853027, + "learning_rate": 6.569588562394924e-07, + "loss": 0.3289, + "step": 9717 + }, + { + "epoch": 4.594799054373523, + "grad_norm": 4.098819732666016, + "learning_rate": 6.565374207842171e-07, + "loss": 0.3637, + 
"step": 9718 + }, + { + "epoch": 4.5952718676122934, + "grad_norm": 3.396275281906128, + "learning_rate": 6.561161001132737e-07, + "loss": 0.348, + "step": 9719 + }, + { + "epoch": 4.595744680851064, + "grad_norm": 3.6430864334106445, + "learning_rate": 6.556948942528952e-07, + "loss": 0.3543, + "step": 9720 + }, + { + "epoch": 4.596217494089834, + "grad_norm": 3.170236587524414, + "learning_rate": 6.552738032293093e-07, + "loss": 0.3565, + "step": 9721 + }, + { + "epoch": 4.596690307328605, + "grad_norm": 3.402683734893799, + "learning_rate": 6.548528270687349e-07, + "loss": 0.4001, + "step": 9722 + }, + { + "epoch": 4.597163120567376, + "grad_norm": 2.861463785171509, + "learning_rate": 6.544319657973833e-07, + "loss": 0.3436, + "step": 9723 + }, + { + "epoch": 4.5976359338061465, + "grad_norm": 3.209259510040283, + "learning_rate": 6.540112194414613e-07, + "loss": 0.3317, + "step": 9724 + }, + { + "epoch": 4.598108747044917, + "grad_norm": 3.099533796310425, + "learning_rate": 6.535905880271662e-07, + "loss": 0.3416, + "step": 9725 + }, + { + "epoch": 4.598581560283688, + "grad_norm": 3.3558053970336914, + "learning_rate": 6.531700715806891e-07, + "loss": 0.3567, + "step": 9726 + }, + { + "epoch": 4.599054373522459, + "grad_norm": 3.1330227851867676, + "learning_rate": 6.527496701282135e-07, + "loss": 0.3025, + "step": 9727 + }, + { + "epoch": 4.599527186761229, + "grad_norm": 3.140184164047241, + "learning_rate": 6.523293836959152e-07, + "loss": 0.3195, + "step": 9728 + }, + { + "epoch": 4.6, + "grad_norm": 3.246844530105591, + "learning_rate": 6.519092123099652e-07, + "loss": 0.3367, + "step": 9729 + }, + { + "epoch": 4.60047281323877, + "grad_norm": 3.1590709686279297, + "learning_rate": 6.51489155996525e-07, + "loss": 0.367, + "step": 9730 + }, + { + "epoch": 4.600945626477541, + "grad_norm": 3.122746467590332, + "learning_rate": 6.510692147817488e-07, + "loss": 0.3401, + "step": 9731 + }, + { + "epoch": 4.601418439716312, + "grad_norm": 3.0418715476989746, + "learning_rate": 6.506493886917859e-07, + "loss": 0.2958, + "step": 9732 + }, + { + "epoch": 4.601891252955083, + "grad_norm": 3.06303334236145, + "learning_rate": 6.502296777527756e-07, + "loss": 0.3459, + "step": 9733 + }, + { + "epoch": 4.602364066193854, + "grad_norm": 3.0807206630706787, + "learning_rate": 6.498100819908532e-07, + "loss": 0.3473, + "step": 9734 + }, + { + "epoch": 4.602836879432624, + "grad_norm": 3.790008306503296, + "learning_rate": 6.493906014321441e-07, + "loss": 0.3541, + "step": 9735 + }, + { + "epoch": 4.603309692671395, + "grad_norm": 3.3040049076080322, + "learning_rate": 6.489712361027667e-07, + "loss": 0.3317, + "step": 9736 + }, + { + "epoch": 4.603782505910165, + "grad_norm": 3.3145735263824463, + "learning_rate": 6.485519860288347e-07, + "loss": 0.3351, + "step": 9737 + }, + { + "epoch": 4.604255319148936, + "grad_norm": 3.1374423503875732, + "learning_rate": 6.481328512364515e-07, + "loss": 0.3544, + "step": 9738 + }, + { + "epoch": 4.604728132387707, + "grad_norm": 3.3598453998565674, + "learning_rate": 6.477138317517162e-07, + "loss": 0.4219, + "step": 9739 + }, + { + "epoch": 4.6052009456264775, + "grad_norm": 3.227466583251953, + "learning_rate": 6.472949276007187e-07, + "loss": 0.3179, + "step": 9740 + }, + { + "epoch": 4.605673758865248, + "grad_norm": 2.9815897941589355, + "learning_rate": 6.46876138809542e-07, + "loss": 0.3753, + "step": 9741 + }, + { + "epoch": 4.606146572104019, + "grad_norm": 3.072967290878296, + "learning_rate": 6.464574654042624e-07, + "loss": 0.3288, + "step": 
9742 + }, + { + "epoch": 4.60661938534279, + "grad_norm": 3.695613145828247, + "learning_rate": 6.460389074109482e-07, + "loss": 0.3305, + "step": 9743 + }, + { + "epoch": 4.60709219858156, + "grad_norm": 3.205684185028076, + "learning_rate": 6.456204648556628e-07, + "loss": 0.3305, + "step": 9744 + }, + { + "epoch": 4.607565011820331, + "grad_norm": 3.216615915298462, + "learning_rate": 6.452021377644596e-07, + "loss": 0.3416, + "step": 9745 + }, + { + "epoch": 4.608037825059101, + "grad_norm": 3.2224013805389404, + "learning_rate": 6.447839261633856e-07, + "loss": 0.3773, + "step": 9746 + }, + { + "epoch": 4.608510638297872, + "grad_norm": 3.2811145782470703, + "learning_rate": 6.443658300784824e-07, + "loss": 0.3292, + "step": 9747 + }, + { + "epoch": 4.608983451536643, + "grad_norm": 3.8610804080963135, + "learning_rate": 6.439478495357815e-07, + "loss": 0.3975, + "step": 9748 + }, + { + "epoch": 4.609456264775414, + "grad_norm": 3.2154266834259033, + "learning_rate": 6.435299845613102e-07, + "loss": 0.3367, + "step": 9749 + }, + { + "epoch": 4.609929078014185, + "grad_norm": 3.18072509765625, + "learning_rate": 6.431122351810862e-07, + "loss": 0.3972, + "step": 9750 + }, + { + "epoch": 4.610401891252955, + "grad_norm": 3.513521194458008, + "learning_rate": 6.426946014211205e-07, + "loss": 0.374, + "step": 9751 + }, + { + "epoch": 4.610874704491726, + "grad_norm": 3.2900753021240234, + "learning_rate": 6.422770833074188e-07, + "loss": 0.3823, + "step": 9752 + }, + { + "epoch": 4.611347517730496, + "grad_norm": 2.791400194168091, + "learning_rate": 6.418596808659772e-07, + "loss": 0.3187, + "step": 9753 + }, + { + "epoch": 4.611820330969267, + "grad_norm": 3.042336940765381, + "learning_rate": 6.414423941227846e-07, + "loss": 0.3832, + "step": 9754 + }, + { + "epoch": 4.612293144208038, + "grad_norm": 3.130197286605835, + "learning_rate": 6.410252231038255e-07, + "loss": 0.3152, + "step": 9755 + }, + { + "epoch": 4.6127659574468085, + "grad_norm": 3.28125262260437, + "learning_rate": 6.406081678350745e-07, + "loss": 0.3082, + "step": 9756 + }, + { + "epoch": 4.613238770685579, + "grad_norm": 3.5695526599884033, + "learning_rate": 6.401912283424988e-07, + "loss": 0.4303, + "step": 9757 + }, + { + "epoch": 4.61371158392435, + "grad_norm": 2.9045464992523193, + "learning_rate": 6.397744046520612e-07, + "loss": 0.3392, + "step": 9758 + }, + { + "epoch": 4.614184397163121, + "grad_norm": 3.4325780868530273, + "learning_rate": 6.393576967897145e-07, + "loss": 0.3446, + "step": 9759 + }, + { + "epoch": 4.614657210401891, + "grad_norm": 3.1146414279937744, + "learning_rate": 6.389411047814053e-07, + "loss": 0.3444, + "step": 9760 + }, + { + "epoch": 4.615130023640662, + "grad_norm": 3.9922995567321777, + "learning_rate": 6.385246286530722e-07, + "loss": 0.3431, + "step": 9761 + }, + { + "epoch": 4.615602836879432, + "grad_norm": 2.868818759918213, + "learning_rate": 6.381082684306491e-07, + "loss": 0.2819, + "step": 9762 + }, + { + "epoch": 4.616075650118203, + "grad_norm": 3.1957287788391113, + "learning_rate": 6.376920241400597e-07, + "loss": 0.315, + "step": 9763 + }, + { + "epoch": 4.616548463356974, + "grad_norm": 3.327913999557495, + "learning_rate": 6.372758958072215e-07, + "loss": 0.3224, + "step": 9764 + }, + { + "epoch": 4.617021276595745, + "grad_norm": 3.2451798915863037, + "learning_rate": 6.368598834580461e-07, + "loss": 0.3219, + "step": 9765 + }, + { + "epoch": 4.617494089834516, + "grad_norm": 3.328977346420288, + "learning_rate": 6.364439871184355e-07, + "loss": 0.3123, + 
"step": 9766 + }, + { + "epoch": 4.617966903073286, + "grad_norm": 2.929624557495117, + "learning_rate": 6.36028206814287e-07, + "loss": 0.3137, + "step": 9767 + }, + { + "epoch": 4.618439716312057, + "grad_norm": 3.2356855869293213, + "learning_rate": 6.356125425714888e-07, + "loss": 0.3672, + "step": 9768 + }, + { + "epoch": 4.618912529550827, + "grad_norm": 3.099452018737793, + "learning_rate": 6.351969944159217e-07, + "loss": 0.3875, + "step": 9769 + }, + { + "epoch": 4.619385342789598, + "grad_norm": 4.037657260894775, + "learning_rate": 6.347815623734616e-07, + "loss": 0.3984, + "step": 9770 + }, + { + "epoch": 4.619858156028369, + "grad_norm": 3.350639581680298, + "learning_rate": 6.343662464699743e-07, + "loss": 0.3325, + "step": 9771 + }, + { + "epoch": 4.6203309692671395, + "grad_norm": 3.3933796882629395, + "learning_rate": 6.339510467313206e-07, + "loss": 0.3922, + "step": 9772 + }, + { + "epoch": 4.62080378250591, + "grad_norm": 2.8599045276641846, + "learning_rate": 6.335359631833532e-07, + "loss": 0.2677, + "step": 9773 + }, + { + "epoch": 4.621276595744681, + "grad_norm": 3.0792534351348877, + "learning_rate": 6.331209958519172e-07, + "loss": 0.3784, + "step": 9774 + }, + { + "epoch": 4.621749408983452, + "grad_norm": 3.1678860187530518, + "learning_rate": 6.327061447628507e-07, + "loss": 0.3698, + "step": 9775 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 3.500584602355957, + "learning_rate": 6.322914099419846e-07, + "loss": 0.281, + "step": 9776 + }, + { + "epoch": 4.622695035460993, + "grad_norm": 3.089900016784668, + "learning_rate": 6.318767914151422e-07, + "loss": 0.3202, + "step": 9777 + }, + { + "epoch": 4.623167848699763, + "grad_norm": 3.353118896484375, + "learning_rate": 6.31462289208141e-07, + "loss": 0.3584, + "step": 9778 + }, + { + "epoch": 4.623640661938534, + "grad_norm": 3.1742143630981445, + "learning_rate": 6.310479033467893e-07, + "loss": 0.3309, + "step": 9779 + }, + { + "epoch": 4.624113475177305, + "grad_norm": 3.5430498123168945, + "learning_rate": 6.306336338568903e-07, + "loss": 0.3972, + "step": 9780 + }, + { + "epoch": 4.624586288416076, + "grad_norm": 3.141406774520874, + "learning_rate": 6.302194807642379e-07, + "loss": 0.3875, + "step": 9781 + }, + { + "epoch": 4.625059101654847, + "grad_norm": 3.1661601066589355, + "learning_rate": 6.298054440946188e-07, + "loss": 0.3969, + "step": 9782 + }, + { + "epoch": 4.625531914893617, + "grad_norm": 2.9834651947021484, + "learning_rate": 6.293915238738149e-07, + "loss": 0.3357, + "step": 9783 + }, + { + "epoch": 4.626004728132388, + "grad_norm": 3.497030258178711, + "learning_rate": 6.289777201275979e-07, + "loss": 0.3683, + "step": 9784 + }, + { + "epoch": 4.626477541371158, + "grad_norm": 3.519390106201172, + "learning_rate": 6.285640328817347e-07, + "loss": 0.3647, + "step": 9785 + }, + { + "epoch": 4.626950354609929, + "grad_norm": 3.0032200813293457, + "learning_rate": 6.281504621619833e-07, + "loss": 0.2854, + "step": 9786 + }, + { + "epoch": 4.6274231678487, + "grad_norm": 2.9891152381896973, + "learning_rate": 6.277370079940939e-07, + "loss": 0.3771, + "step": 9787 + }, + { + "epoch": 4.6278959810874705, + "grad_norm": 3.379671812057495, + "learning_rate": 6.273236704038122e-07, + "loss": 0.3916, + "step": 9788 + }, + { + "epoch": 4.628368794326241, + "grad_norm": 2.9964048862457275, + "learning_rate": 6.26910449416874e-07, + "loss": 0.3618, + "step": 9789 + }, + { + "epoch": 4.628841607565012, + "grad_norm": 3.0143628120422363, + "learning_rate": 6.264973450590089e-07, + "loss": 
0.336, + "step": 9790 + }, + { + "epoch": 4.629314420803783, + "grad_norm": 2.956737756729126, + "learning_rate": 6.260843573559392e-07, + "loss": 0.3657, + "step": 9791 + }, + { + "epoch": 4.629787234042553, + "grad_norm": 3.057551145553589, + "learning_rate": 6.256714863333787e-07, + "loss": 0.3475, + "step": 9792 + }, + { + "epoch": 4.630260047281324, + "grad_norm": 3.9289608001708984, + "learning_rate": 6.25258732017037e-07, + "loss": 0.3679, + "step": 9793 + }, + { + "epoch": 4.630732860520094, + "grad_norm": 3.8519062995910645, + "learning_rate": 6.248460944326129e-07, + "loss": 0.4182, + "step": 9794 + }, + { + "epoch": 4.631205673758865, + "grad_norm": 3.6360673904418945, + "learning_rate": 6.244335736058007e-07, + "loss": 0.3836, + "step": 9795 + }, + { + "epoch": 4.631678486997636, + "grad_norm": 3.1905548572540283, + "learning_rate": 6.240211695622861e-07, + "loss": 0.357, + "step": 9796 + }, + { + "epoch": 4.632151300236407, + "grad_norm": 3.3542017936706543, + "learning_rate": 6.236088823277465e-07, + "loss": 0.3191, + "step": 9797 + }, + { + "epoch": 4.6326241134751776, + "grad_norm": 3.453275442123413, + "learning_rate": 6.231967119278546e-07, + "loss": 0.3346, + "step": 9798 + }, + { + "epoch": 4.633096926713948, + "grad_norm": 3.559972047805786, + "learning_rate": 6.227846583882741e-07, + "loss": 0.365, + "step": 9799 + }, + { + "epoch": 4.633569739952719, + "grad_norm": 2.795891046524048, + "learning_rate": 6.223727217346606e-07, + "loss": 0.3346, + "step": 9800 + }, + { + "epoch": 4.634042553191489, + "grad_norm": 3.176762342453003, + "learning_rate": 6.219609019926653e-07, + "loss": 0.3692, + "step": 9801 + }, + { + "epoch": 4.63451536643026, + "grad_norm": 3.490229845046997, + "learning_rate": 6.215491991879294e-07, + "loss": 0.3334, + "step": 9802 + }, + { + "epoch": 4.634988179669031, + "grad_norm": 3.27502179145813, + "learning_rate": 6.211376133460884e-07, + "loss": 0.3484, + "step": 9803 + }, + { + "epoch": 4.6354609929078014, + "grad_norm": 2.9768311977386475, + "learning_rate": 6.207261444927698e-07, + "loss": 0.3342, + "step": 9804 + }, + { + "epoch": 4.635933806146572, + "grad_norm": 3.1726930141448975, + "learning_rate": 6.203147926535938e-07, + "loss": 0.3187, + "step": 9805 + }, + { + "epoch": 4.636406619385343, + "grad_norm": 3.1797916889190674, + "learning_rate": 6.199035578541737e-07, + "loss": 0.3418, + "step": 9806 + }, + { + "epoch": 4.636879432624114, + "grad_norm": 3.1262030601501465, + "learning_rate": 6.194924401201141e-07, + "loss": 0.3099, + "step": 9807 + }, + { + "epoch": 4.637352245862884, + "grad_norm": 3.556866407394409, + "learning_rate": 6.190814394770153e-07, + "loss": 0.2879, + "step": 9808 + }, + { + "epoch": 4.6378250591016545, + "grad_norm": 3.508984327316284, + "learning_rate": 6.186705559504678e-07, + "loss": 0.3414, + "step": 9809 + }, + { + "epoch": 4.638297872340425, + "grad_norm": 3.266221761703491, + "learning_rate": 6.182597895660544e-07, + "loss": 0.3281, + "step": 9810 + }, + { + "epoch": 4.638770685579196, + "grad_norm": 3.3781862258911133, + "learning_rate": 6.178491403493537e-07, + "loss": 0.3583, + "step": 9811 + }, + { + "epoch": 4.639243498817967, + "grad_norm": 3.4480984210968018, + "learning_rate": 6.174386083259329e-07, + "loss": 0.3704, + "step": 9812 + }, + { + "epoch": 4.639716312056738, + "grad_norm": 3.2882535457611084, + "learning_rate": 6.170281935213563e-07, + "loss": 0.3515, + "step": 9813 + }, + { + "epoch": 4.6401891252955085, + "grad_norm": 2.852627992630005, + "learning_rate": 6.166178959611774e-07, + 
"loss": 0.3266, + "step": 9814 + }, + { + "epoch": 4.640661938534279, + "grad_norm": 3.5469841957092285, + "learning_rate": 6.162077156709431e-07, + "loss": 0.3374, + "step": 9815 + }, + { + "epoch": 4.64113475177305, + "grad_norm": 3.343583345413208, + "learning_rate": 6.157976526761947e-07, + "loss": 0.3084, + "step": 9816 + }, + { + "epoch": 4.64160756501182, + "grad_norm": 3.028337001800537, + "learning_rate": 6.153877070024639e-07, + "loss": 0.3083, + "step": 9817 + }, + { + "epoch": 4.642080378250591, + "grad_norm": 3.1543455123901367, + "learning_rate": 6.149778786752775e-07, + "loss": 0.3273, + "step": 9818 + }, + { + "epoch": 4.642553191489362, + "grad_norm": 3.2126576900482178, + "learning_rate": 6.145681677201529e-07, + "loss": 0.3107, + "step": 9819 + }, + { + "epoch": 4.643026004728132, + "grad_norm": 3.4443142414093018, + "learning_rate": 6.141585741626014e-07, + "loss": 0.3193, + "step": 9820 + }, + { + "epoch": 4.643498817966903, + "grad_norm": 3.1558680534362793, + "learning_rate": 6.137490980281255e-07, + "loss": 0.3855, + "step": 9821 + }, + { + "epoch": 4.643971631205674, + "grad_norm": 3.370654821395874, + "learning_rate": 6.133397393422228e-07, + "loss": 0.309, + "step": 9822 + }, + { + "epoch": 4.644444444444445, + "grad_norm": 3.0980682373046875, + "learning_rate": 6.129304981303822e-07, + "loss": 0.2784, + "step": 9823 + }, + { + "epoch": 4.644917257683215, + "grad_norm": 3.102229356765747, + "learning_rate": 6.125213744180844e-07, + "loss": 0.3064, + "step": 9824 + }, + { + "epoch": 4.6453900709219855, + "grad_norm": 2.9737658500671387, + "learning_rate": 6.121123682308039e-07, + "loss": 0.2926, + "step": 9825 + }, + { + "epoch": 4.645862884160756, + "grad_norm": 3.3927671909332275, + "learning_rate": 6.117034795940089e-07, + "loss": 0.404, + "step": 9826 + }, + { + "epoch": 4.646335697399527, + "grad_norm": 2.885082721710205, + "learning_rate": 6.112947085331581e-07, + "loss": 0.3375, + "step": 9827 + }, + { + "epoch": 4.646808510638298, + "grad_norm": 2.9711341857910156, + "learning_rate": 6.108860550737034e-07, + "loss": 0.3051, + "step": 9828 + }, + { + "epoch": 4.647281323877069, + "grad_norm": 3.1437952518463135, + "learning_rate": 6.104775192410911e-07, + "loss": 0.3408, + "step": 9829 + }, + { + "epoch": 4.6477541371158395, + "grad_norm": 3.055950164794922, + "learning_rate": 6.100691010607579e-07, + "loss": 0.316, + "step": 9830 + }, + { + "epoch": 4.64822695035461, + "grad_norm": 3.515423536300659, + "learning_rate": 6.096608005581353e-07, + "loss": 0.3994, + "step": 9831 + }, + { + "epoch": 4.648699763593381, + "grad_norm": 3.1165153980255127, + "learning_rate": 6.092526177586455e-07, + "loss": 0.3908, + "step": 9832 + }, + { + "epoch": 4.649172576832151, + "grad_norm": 3.504673719406128, + "learning_rate": 6.088445526877043e-07, + "loss": 0.3328, + "step": 9833 + }, + { + "epoch": 4.649645390070922, + "grad_norm": 3.4175243377685547, + "learning_rate": 6.084366053707208e-07, + "loss": 0.3234, + "step": 9834 + }, + { + "epoch": 4.650118203309693, + "grad_norm": 3.14725661277771, + "learning_rate": 6.080287758330946e-07, + "loss": 0.3118, + "step": 9835 + }, + { + "epoch": 4.650591016548463, + "grad_norm": 3.7654550075531006, + "learning_rate": 6.076210641002217e-07, + "loss": 0.4177, + "step": 9836 + }, + { + "epoch": 4.651063829787234, + "grad_norm": 3.188804864883423, + "learning_rate": 6.072134701974871e-07, + "loss": 0.3468, + "step": 9837 + }, + { + "epoch": 4.651536643026005, + "grad_norm": 3.2176342010498047, + "learning_rate": 
6.068059941502702e-07, + "loss": 0.3486, + "step": 9838 + }, + { + "epoch": 4.652009456264776, + "grad_norm": 4.188257217407227, + "learning_rate": 6.063986359839424e-07, + "loss": 0.3973, + "step": 9839 + }, + { + "epoch": 4.652482269503546, + "grad_norm": 3.206559896469116, + "learning_rate": 6.059913957238678e-07, + "loss": 0.3088, + "step": 9840 + }, + { + "epoch": 4.6529550827423165, + "grad_norm": 3.033918857574463, + "learning_rate": 6.055842733954048e-07, + "loss": 0.3331, + "step": 9841 + }, + { + "epoch": 4.653427895981087, + "grad_norm": 3.2453384399414062, + "learning_rate": 6.051772690239022e-07, + "loss": 0.323, + "step": 9842 + }, + { + "epoch": 4.653900709219858, + "grad_norm": 3.001999855041504, + "learning_rate": 6.047703826347017e-07, + "loss": 0.3763, + "step": 9843 + }, + { + "epoch": 4.654373522458629, + "grad_norm": 3.845486640930176, + "learning_rate": 6.043636142531401e-07, + "loss": 0.3595, + "step": 9844 + }, + { + "epoch": 4.6548463356974, + "grad_norm": 3.347628593444824, + "learning_rate": 6.039569639045434e-07, + "loss": 0.3388, + "step": 9845 + }, + { + "epoch": 4.6553191489361705, + "grad_norm": 3.1889400482177734, + "learning_rate": 6.035504316142333e-07, + "loss": 0.335, + "step": 9846 + }, + { + "epoch": 4.655791962174941, + "grad_norm": 3.3385977745056152, + "learning_rate": 6.031440174075221e-07, + "loss": 0.3985, + "step": 9847 + }, + { + "epoch": 4.656264775413711, + "grad_norm": 2.849853277206421, + "learning_rate": 6.027377213097146e-07, + "loss": 0.3604, + "step": 9848 + }, + { + "epoch": 4.656737588652482, + "grad_norm": 3.243053436279297, + "learning_rate": 6.02331543346111e-07, + "loss": 0.3257, + "step": 9849 + }, + { + "epoch": 4.657210401891253, + "grad_norm": 3.344167709350586, + "learning_rate": 6.01925483542001e-07, + "loss": 0.3511, + "step": 9850 + }, + { + "epoch": 4.657683215130024, + "grad_norm": 2.9741430282592773, + "learning_rate": 6.015195419226677e-07, + "loss": 0.3303, + "step": 9851 + }, + { + "epoch": 4.658156028368794, + "grad_norm": 3.0257937908172607, + "learning_rate": 6.011137185133883e-07, + "loss": 0.3716, + "step": 9852 + }, + { + "epoch": 4.658628841607565, + "grad_norm": 3.5770089626312256, + "learning_rate": 6.007080133394316e-07, + "loss": 0.3258, + "step": 9853 + }, + { + "epoch": 4.659101654846336, + "grad_norm": 3.363703489303589, + "learning_rate": 6.003024264260587e-07, + "loss": 0.3924, + "step": 9854 + }, + { + "epoch": 4.659574468085106, + "grad_norm": 3.3533787727355957, + "learning_rate": 5.998969577985239e-07, + "loss": 0.3242, + "step": 9855 + }, + { + "epoch": 4.660047281323877, + "grad_norm": 2.7335259914398193, + "learning_rate": 5.994916074820731e-07, + "loss": 0.3269, + "step": 9856 + }, + { + "epoch": 4.6605200945626475, + "grad_norm": 3.7654764652252197, + "learning_rate": 5.990863755019471e-07, + "loss": 0.4, + "step": 9857 + }, + { + "epoch": 4.660992907801418, + "grad_norm": 3.372542381286621, + "learning_rate": 5.986812618833765e-07, + "loss": 0.3423, + "step": 9858 + }, + { + "epoch": 4.661465721040189, + "grad_norm": 2.797814130783081, + "learning_rate": 5.982762666515873e-07, + "loss": 0.3228, + "step": 9859 + }, + { + "epoch": 4.66193853427896, + "grad_norm": 3.0121023654937744, + "learning_rate": 5.978713898317964e-07, + "loss": 0.3063, + "step": 9860 + }, + { + "epoch": 4.662411347517731, + "grad_norm": 3.052292823791504, + "learning_rate": 5.974666314492126e-07, + "loss": 0.3029, + "step": 9861 + }, + { + "epoch": 4.6628841607565015, + "grad_norm": 2.990906238555908, + 
"learning_rate": 5.970619915290399e-07, + "loss": 0.3788, + "step": 9862 + }, + { + "epoch": 4.663356973995272, + "grad_norm": 3.214334726333618, + "learning_rate": 5.966574700964722e-07, + "loss": 0.3138, + "step": 9863 + }, + { + "epoch": 4.663829787234042, + "grad_norm": 3.5982940196990967, + "learning_rate": 5.962530671766989e-07, + "loss": 0.3685, + "step": 9864 + }, + { + "epoch": 4.664302600472813, + "grad_norm": 3.2522151470184326, + "learning_rate": 5.958487827948991e-07, + "loss": 0.3086, + "step": 9865 + }, + { + "epoch": 4.664775413711584, + "grad_norm": 3.070181131362915, + "learning_rate": 5.954446169762457e-07, + "loss": 0.3534, + "step": 9866 + }, + { + "epoch": 4.665248226950355, + "grad_norm": 3.3051350116729736, + "learning_rate": 5.950405697459055e-07, + "loss": 0.3871, + "step": 9867 + }, + { + "epoch": 4.665721040189125, + "grad_norm": 2.8587753772735596, + "learning_rate": 5.946366411290358e-07, + "loss": 0.3157, + "step": 9868 + }, + { + "epoch": 4.666193853427896, + "grad_norm": 3.154926061630249, + "learning_rate": 5.942328311507878e-07, + "loss": 0.2967, + "step": 9869 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 3.0322320461273193, + "learning_rate": 5.938291398363049e-07, + "loss": 0.323, + "step": 9870 + }, + { + "epoch": 4.667139479905437, + "grad_norm": 3.0678954124450684, + "learning_rate": 5.934255672107222e-07, + "loss": 0.3337, + "step": 9871 + }, + { + "epoch": 4.667612293144208, + "grad_norm": 3.4822635650634766, + "learning_rate": 5.930221132991704e-07, + "loss": 0.4052, + "step": 9872 + }, + { + "epoch": 4.6680851063829785, + "grad_norm": 3.26842999458313, + "learning_rate": 5.926187781267695e-07, + "loss": 0.3501, + "step": 9873 + }, + { + "epoch": 4.668557919621749, + "grad_norm": 2.911407709121704, + "learning_rate": 5.922155617186332e-07, + "loss": 0.3544, + "step": 9874 + }, + { + "epoch": 4.66903073286052, + "grad_norm": 3.1876001358032227, + "learning_rate": 5.91812464099869e-07, + "loss": 0.3389, + "step": 9875 + }, + { + "epoch": 4.669503546099291, + "grad_norm": 3.4954607486724854, + "learning_rate": 5.914094852955749e-07, + "loss": 0.3461, + "step": 9876 + }, + { + "epoch": 4.669976359338062, + "grad_norm": 3.8845367431640625, + "learning_rate": 5.910066253308439e-07, + "loss": 0.3868, + "step": 9877 + }, + { + "epoch": 4.6704491725768325, + "grad_norm": 3.18038272857666, + "learning_rate": 5.906038842307598e-07, + "loss": 0.3311, + "step": 9878 + }, + { + "epoch": 4.670921985815603, + "grad_norm": 3.5944042205810547, + "learning_rate": 5.902012620203984e-07, + "loss": 0.3246, + "step": 9879 + }, + { + "epoch": 4.671394799054373, + "grad_norm": 2.980142116546631, + "learning_rate": 5.897987587248311e-07, + "loss": 0.3361, + "step": 9880 + }, + { + "epoch": 4.671867612293144, + "grad_norm": 4.3120269775390625, + "learning_rate": 5.893963743691183e-07, + "loss": 0.3213, + "step": 9881 + }, + { + "epoch": 4.672340425531915, + "grad_norm": 3.42366361618042, + "learning_rate": 5.889941089783163e-07, + "loss": 0.3515, + "step": 9882 + }, + { + "epoch": 4.6728132387706856, + "grad_norm": 2.910720109939575, + "learning_rate": 5.885919625774716e-07, + "loss": 0.3417, + "step": 9883 + }, + { + "epoch": 4.673286052009456, + "grad_norm": 3.122042179107666, + "learning_rate": 5.881899351916242e-07, + "loss": 0.3714, + "step": 9884 + }, + { + "epoch": 4.673758865248227, + "grad_norm": 3.0564188957214355, + "learning_rate": 5.877880268458064e-07, + "loss": 0.3146, + "step": 9885 + }, + { + "epoch": 4.674231678486998, + "grad_norm": 
3.303421974182129, + "learning_rate": 5.873862375650427e-07, + "loss": 0.3476, + "step": 9886 + }, + { + "epoch": 4.674704491725768, + "grad_norm": 3.3057096004486084, + "learning_rate": 5.869845673743521e-07, + "loss": 0.3237, + "step": 9887 + }, + { + "epoch": 4.675177304964539, + "grad_norm": 3.1843838691711426, + "learning_rate": 5.865830162987443e-07, + "loss": 0.3789, + "step": 9888 + }, + { + "epoch": 4.6756501182033094, + "grad_norm": 2.865844964981079, + "learning_rate": 5.861815843632213e-07, + "loss": 0.3173, + "step": 9889 + }, + { + "epoch": 4.67612293144208, + "grad_norm": 2.986262083053589, + "learning_rate": 5.857802715927796e-07, + "loss": 0.2697, + "step": 9890 + }, + { + "epoch": 4.676595744680851, + "grad_norm": 3.2936089038848877, + "learning_rate": 5.853790780124063e-07, + "loss": 0.3839, + "step": 9891 + }, + { + "epoch": 4.677068557919622, + "grad_norm": 2.7130303382873535, + "learning_rate": 5.849780036470831e-07, + "loss": 0.3004, + "step": 9892 + }, + { + "epoch": 4.677541371158393, + "grad_norm": 3.0076770782470703, + "learning_rate": 5.845770485217827e-07, + "loss": 0.3467, + "step": 9893 + }, + { + "epoch": 4.678014184397163, + "grad_norm": 3.5340375900268555, + "learning_rate": 5.841762126614697e-07, + "loss": 0.3759, + "step": 9894 + }, + { + "epoch": 4.678486997635934, + "grad_norm": 3.0034375190734863, + "learning_rate": 5.837754960911041e-07, + "loss": 0.3099, + "step": 9895 + }, + { + "epoch": 4.678959810874704, + "grad_norm": 3.576899766921997, + "learning_rate": 5.833748988356358e-07, + "loss": 0.3612, + "step": 9896 + }, + { + "epoch": 4.679432624113475, + "grad_norm": 3.0961546897888184, + "learning_rate": 5.829744209200077e-07, + "loss": 0.3098, + "step": 9897 + }, + { + "epoch": 4.679905437352246, + "grad_norm": 3.1387925148010254, + "learning_rate": 5.825740623691576e-07, + "loss": 0.3538, + "step": 9898 + }, + { + "epoch": 4.6803782505910165, + "grad_norm": 3.4131572246551514, + "learning_rate": 5.821738232080127e-07, + "loss": 0.3984, + "step": 9899 + }, + { + "epoch": 4.680851063829787, + "grad_norm": 3.1346065998077393, + "learning_rate": 5.817737034614934e-07, + "loss": 0.3585, + "step": 9900 + }, + { + "epoch": 4.681323877068558, + "grad_norm": 3.148144483566284, + "learning_rate": 5.813737031545155e-07, + "loss": 0.3774, + "step": 9901 + }, + { + "epoch": 4.681796690307329, + "grad_norm": 3.2461299896240234, + "learning_rate": 5.809738223119843e-07, + "loss": 0.3181, + "step": 9902 + }, + { + "epoch": 4.682269503546099, + "grad_norm": 3.1998214721679688, + "learning_rate": 5.805740609587981e-07, + "loss": 0.3452, + "step": 9903 + }, + { + "epoch": 4.68274231678487, + "grad_norm": 2.897399425506592, + "learning_rate": 5.801744191198483e-07, + "loss": 0.3247, + "step": 9904 + }, + { + "epoch": 4.68321513002364, + "grad_norm": 2.921877384185791, + "learning_rate": 5.797748968200198e-07, + "loss": 0.2842, + "step": 9905 + }, + { + "epoch": 4.683687943262411, + "grad_norm": 3.17667818069458, + "learning_rate": 5.793754940841887e-07, + "loss": 0.3218, + "step": 9906 + }, + { + "epoch": 4.684160756501182, + "grad_norm": 3.499068260192871, + "learning_rate": 5.78976210937223e-07, + "loss": 0.3352, + "step": 9907 + }, + { + "epoch": 4.684633569739953, + "grad_norm": 3.2782368659973145, + "learning_rate": 5.785770474039859e-07, + "loss": 0.3671, + "step": 9908 + }, + { + "epoch": 4.685106382978724, + "grad_norm": 3.089757204055786, + "learning_rate": 5.781780035093304e-07, + "loss": 0.3613, + "step": 9909 + }, + { + "epoch": 4.685579196217494, + 
"grad_norm": 3.082561492919922, + "learning_rate": 5.77779079278104e-07, + "loss": 0.3351, + "step": 9910 + }, + { + "epoch": 4.686052009456265, + "grad_norm": 3.6009864807128906, + "learning_rate": 5.773802747351462e-07, + "loss": 0.3545, + "step": 9911 + }, + { + "epoch": 4.686524822695035, + "grad_norm": 3.488717555999756, + "learning_rate": 5.769815899052872e-07, + "loss": 0.3926, + "step": 9912 + }, + { + "epoch": 4.686997635933806, + "grad_norm": 3.5619056224823, + "learning_rate": 5.765830248133531e-07, + "loss": 0.4171, + "step": 9913 + }, + { + "epoch": 4.687470449172577, + "grad_norm": 3.30653977394104, + "learning_rate": 5.761845794841594e-07, + "loss": 0.3713, + "step": 9914 + }, + { + "epoch": 4.6879432624113475, + "grad_norm": 2.8256847858428955, + "learning_rate": 5.757862539425171e-07, + "loss": 0.2633, + "step": 9915 + }, + { + "epoch": 4.688416075650118, + "grad_norm": 3.0387041568756104, + "learning_rate": 5.753880482132274e-07, + "loss": 0.3169, + "step": 9916 + }, + { + "epoch": 4.688888888888889, + "grad_norm": 3.2312963008880615, + "learning_rate": 5.749899623210845e-07, + "loss": 0.3238, + "step": 9917 + }, + { + "epoch": 4.68936170212766, + "grad_norm": 3.077155351638794, + "learning_rate": 5.74591996290876e-07, + "loss": 0.3101, + "step": 9918 + }, + { + "epoch": 4.68983451536643, + "grad_norm": 3.461580991744995, + "learning_rate": 5.741941501473811e-07, + "loss": 0.3756, + "step": 9919 + }, + { + "epoch": 4.690307328605201, + "grad_norm": 3.8845605850219727, + "learning_rate": 5.737964239153712e-07, + "loss": 0.3747, + "step": 9920 + }, + { + "epoch": 4.690780141843971, + "grad_norm": 3.1688292026519775, + "learning_rate": 5.733988176196129e-07, + "loss": 0.3663, + "step": 9921 + }, + { + "epoch": 4.691252955082742, + "grad_norm": 3.2730917930603027, + "learning_rate": 5.730013312848614e-07, + "loss": 0.3697, + "step": 9922 + }, + { + "epoch": 4.691725768321513, + "grad_norm": 3.093761682510376, + "learning_rate": 5.726039649358681e-07, + "loss": 0.3215, + "step": 9923 + }, + { + "epoch": 4.692198581560284, + "grad_norm": 3.1679420471191406, + "learning_rate": 5.722067185973746e-07, + "loss": 0.3019, + "step": 9924 + }, + { + "epoch": 4.692671394799055, + "grad_norm": 3.4821531772613525, + "learning_rate": 5.718095922941147e-07, + "loss": 0.3659, + "step": 9925 + }, + { + "epoch": 4.693144208037825, + "grad_norm": 2.985276699066162, + "learning_rate": 5.714125860508177e-07, + "loss": 0.3293, + "step": 9926 + }, + { + "epoch": 4.693617021276596, + "grad_norm": 3.171663999557495, + "learning_rate": 5.710156998922015e-07, + "loss": 0.3647, + "step": 9927 + }, + { + "epoch": 4.694089834515366, + "grad_norm": 3.3699564933776855, + "learning_rate": 5.706189338429798e-07, + "loss": 0.4021, + "step": 9928 + }, + { + "epoch": 4.694562647754137, + "grad_norm": 3.0827202796936035, + "learning_rate": 5.702222879278571e-07, + "loss": 0.329, + "step": 9929 + }, + { + "epoch": 4.695035460992908, + "grad_norm": 3.5798332691192627, + "learning_rate": 5.698257621715303e-07, + "loss": 0.3777, + "step": 9930 + }, + { + "epoch": 4.6955082742316785, + "grad_norm": 2.741230010986328, + "learning_rate": 5.6942935659869e-07, + "loss": 0.31, + "step": 9931 + }, + { + "epoch": 4.695981087470449, + "grad_norm": 2.9929327964782715, + "learning_rate": 5.690330712340187e-07, + "loss": 0.3132, + "step": 9932 + }, + { + "epoch": 4.69645390070922, + "grad_norm": 3.062685489654541, + "learning_rate": 5.68636906102191e-07, + "loss": 0.3204, + "step": 9933 + }, + { + "epoch": 4.696926713947991, + 
"grad_norm": 3.166281223297119, + "learning_rate": 5.682408612278742e-07, + "loss": 0.3444, + "step": 9934 + }, + { + "epoch": 4.697399527186761, + "grad_norm": 3.0413401126861572, + "learning_rate": 5.678449366357278e-07, + "loss": 0.3506, + "step": 9935 + }, + { + "epoch": 4.697872340425532, + "grad_norm": 3.7843124866485596, + "learning_rate": 5.674491323504059e-07, + "loss": 0.3349, + "step": 9936 + }, + { + "epoch": 4.698345153664302, + "grad_norm": 2.9070212841033936, + "learning_rate": 5.670534483965514e-07, + "loss": 0.2954, + "step": 9937 + }, + { + "epoch": 4.698817966903073, + "grad_norm": 2.924229383468628, + "learning_rate": 5.666578847988041e-07, + "loss": 0.3392, + "step": 9938 + }, + { + "epoch": 4.699290780141844, + "grad_norm": 3.1302332878112793, + "learning_rate": 5.662624415817924e-07, + "loss": 0.3198, + "step": 9939 + }, + { + "epoch": 4.699763593380615, + "grad_norm": 3.163005828857422, + "learning_rate": 5.65867118770139e-07, + "loss": 0.3817, + "step": 9940 + }, + { + "epoch": 4.700236406619386, + "grad_norm": 3.4002792835235596, + "learning_rate": 5.654719163884598e-07, + "loss": 0.3961, + "step": 9941 + }, + { + "epoch": 4.700709219858156, + "grad_norm": 3.9756014347076416, + "learning_rate": 5.650768344613616e-07, + "loss": 0.4011, + "step": 9942 + }, + { + "epoch": 4.701182033096927, + "grad_norm": 3.118243455886841, + "learning_rate": 5.64681873013444e-07, + "loss": 0.3675, + "step": 9943 + }, + { + "epoch": 4.701654846335697, + "grad_norm": 3.0520825386047363, + "learning_rate": 5.642870320693005e-07, + "loss": 0.2782, + "step": 9944 + }, + { + "epoch": 4.702127659574468, + "grad_norm": 3.380565643310547, + "learning_rate": 5.638923116535152e-07, + "loss": 0.3632, + "step": 9945 + }, + { + "epoch": 4.702600472813239, + "grad_norm": 3.2340569496154785, + "learning_rate": 5.634977117906668e-07, + "loss": 0.3754, + "step": 9946 + }, + { + "epoch": 4.7030732860520095, + "grad_norm": 3.0068717002868652, + "learning_rate": 5.631032325053243e-07, + "loss": 0.3879, + "step": 9947 + }, + { + "epoch": 4.70354609929078, + "grad_norm": 3.4717891216278076, + "learning_rate": 5.627088738220507e-07, + "loss": 0.4053, + "step": 9948 + }, + { + "epoch": 4.704018912529551, + "grad_norm": 4.362999439239502, + "learning_rate": 5.623146357654008e-07, + "loss": 0.3115, + "step": 9949 + }, + { + "epoch": 4.704491725768322, + "grad_norm": 3.2190041542053223, + "learning_rate": 5.619205183599211e-07, + "loss": 0.3267, + "step": 9950 + }, + { + "epoch": 4.704964539007092, + "grad_norm": 3.413800001144409, + "learning_rate": 5.615265216301532e-07, + "loss": 0.4012, + "step": 9951 + }, + { + "epoch": 4.705437352245863, + "grad_norm": 3.5244312286376953, + "learning_rate": 5.611326456006291e-07, + "loss": 0.3484, + "step": 9952 + }, + { + "epoch": 4.705910165484633, + "grad_norm": 3.055433511734009, + "learning_rate": 5.607388902958727e-07, + "loss": 0.3637, + "step": 9953 + }, + { + "epoch": 4.706382978723404, + "grad_norm": 3.4459595680236816, + "learning_rate": 5.603452557404029e-07, + "loss": 0.3632, + "step": 9954 + }, + { + "epoch": 4.706855791962175, + "grad_norm": 2.972321033477783, + "learning_rate": 5.59951741958728e-07, + "loss": 0.3411, + "step": 9955 + }, + { + "epoch": 4.707328605200946, + "grad_norm": 3.2460532188415527, + "learning_rate": 5.595583489753523e-07, + "loss": 0.3779, + "step": 9956 + }, + { + "epoch": 4.707801418439717, + "grad_norm": 3.514521837234497, + "learning_rate": 5.591650768147694e-07, + "loss": 0.3313, + "step": 9957 + }, + { + "epoch": 
4.708274231678487, + "grad_norm": 2.8473336696624756, + "learning_rate": 5.587719255014662e-07, + "loss": 0.3078, + "step": 9958 + }, + { + "epoch": 4.708747044917258, + "grad_norm": 3.309263229370117, + "learning_rate": 5.583788950599239e-07, + "loss": 0.3905, + "step": 9959 + }, + { + "epoch": 4.709219858156028, + "grad_norm": 3.435980796813965, + "learning_rate": 5.579859855146133e-07, + "loss": 0.3507, + "step": 9960 + }, + { + "epoch": 4.709692671394799, + "grad_norm": 3.0237598419189453, + "learning_rate": 5.575931968900006e-07, + "loss": 0.3349, + "step": 9961 + }, + { + "epoch": 4.71016548463357, + "grad_norm": 3.6978237628936768, + "learning_rate": 5.572005292105426e-07, + "loss": 0.3672, + "step": 9962 + }, + { + "epoch": 4.7106382978723405, + "grad_norm": 3.3029704093933105, + "learning_rate": 5.568079825006883e-07, + "loss": 0.3438, + "step": 9963 + }, + { + "epoch": 4.711111111111111, + "grad_norm": 2.9121241569519043, + "learning_rate": 5.5641555678488e-07, + "loss": 0.3299, + "step": 9964 + }, + { + "epoch": 4.711583924349882, + "grad_norm": 3.2730703353881836, + "learning_rate": 5.56023252087553e-07, + "loss": 0.3572, + "step": 9965 + }, + { + "epoch": 4.712056737588653, + "grad_norm": 3.316593885421753, + "learning_rate": 5.556310684331343e-07, + "loss": 0.3139, + "step": 9966 + }, + { + "epoch": 4.712529550827423, + "grad_norm": 3.1281843185424805, + "learning_rate": 5.552390058460427e-07, + "loss": 0.3362, + "step": 9967 + }, + { + "epoch": 4.7130023640661936, + "grad_norm": 3.3069980144500732, + "learning_rate": 5.548470643506904e-07, + "loss": 0.3839, + "step": 9968 + }, + { + "epoch": 4.713475177304964, + "grad_norm": 4.4018354415893555, + "learning_rate": 5.544552439714826e-07, + "loss": 0.2954, + "step": 9969 + }, + { + "epoch": 4.713947990543735, + "grad_norm": 2.797149658203125, + "learning_rate": 5.540635447328161e-07, + "loss": 0.3253, + "step": 9970 + }, + { + "epoch": 4.714420803782506, + "grad_norm": 3.0065677165985107, + "learning_rate": 5.536719666590792e-07, + "loss": 0.3376, + "step": 9971 + }, + { + "epoch": 4.714893617021277, + "grad_norm": 3.1383140087127686, + "learning_rate": 5.532805097746552e-07, + "loss": 0.3444, + "step": 9972 + }, + { + "epoch": 4.7153664302600475, + "grad_norm": 2.983229398727417, + "learning_rate": 5.528891741039169e-07, + "loss": 0.3173, + "step": 9973 + }, + { + "epoch": 4.715839243498818, + "grad_norm": 3.119361162185669, + "learning_rate": 5.524979596712326e-07, + "loss": 0.3829, + "step": 9974 + }, + { + "epoch": 4.716312056737589, + "grad_norm": 3.4099128246307373, + "learning_rate": 5.52106866500961e-07, + "loss": 0.3363, + "step": 9975 + }, + { + "epoch": 4.716784869976359, + "grad_norm": 2.818964719772339, + "learning_rate": 5.517158946174528e-07, + "loss": 0.321, + "step": 9976 + }, + { + "epoch": 4.71725768321513, + "grad_norm": 3.4968421459198, + "learning_rate": 5.513250440450538e-07, + "loss": 0.3973, + "step": 9977 + }, + { + "epoch": 4.717730496453901, + "grad_norm": 3.3777382373809814, + "learning_rate": 5.509343148080987e-07, + "loss": 0.3607, + "step": 9978 + }, + { + "epoch": 4.718203309692671, + "grad_norm": 2.95882511138916, + "learning_rate": 5.50543706930918e-07, + "loss": 0.3483, + "step": 9979 + }, + { + "epoch": 4.718676122931442, + "grad_norm": 2.8768858909606934, + "learning_rate": 5.501532204378327e-07, + "loss": 0.3488, + "step": 9980 + }, + { + "epoch": 4.719148936170213, + "grad_norm": 2.9310572147369385, + "learning_rate": 5.497628553531565e-07, + "loss": 0.3174, + "step": 9981 + }, + { + 
"epoch": 4.719621749408984, + "grad_norm": 3.1057486534118652, + "learning_rate": 5.493726117011957e-07, + "loss": 0.346, + "step": 9982 + }, + { + "epoch": 4.720094562647754, + "grad_norm": 3.681593418121338, + "learning_rate": 5.489824895062487e-07, + "loss": 0.3371, + "step": 9983 + }, + { + "epoch": 4.7205673758865245, + "grad_norm": 3.0641729831695557, + "learning_rate": 5.485924887926075e-07, + "loss": 0.3614, + "step": 9984 + }, + { + "epoch": 4.721040189125295, + "grad_norm": 3.2925705909729004, + "learning_rate": 5.482026095845555e-07, + "loss": 0.3023, + "step": 9985 + }, + { + "epoch": 4.721513002364066, + "grad_norm": 2.963693141937256, + "learning_rate": 5.47812851906368e-07, + "loss": 0.3706, + "step": 9986 + }, + { + "epoch": 4.721985815602837, + "grad_norm": 3.187870740890503, + "learning_rate": 5.474232157823147e-07, + "loss": 0.3332, + "step": 9987 + }, + { + "epoch": 4.722458628841608, + "grad_norm": 3.9346799850463867, + "learning_rate": 5.470337012366556e-07, + "loss": 0.3738, + "step": 9988 + }, + { + "epoch": 4.7229314420803785, + "grad_norm": 3.385035753250122, + "learning_rate": 5.466443082936446e-07, + "loss": 0.3194, + "step": 9989 + }, + { + "epoch": 4.723404255319149, + "grad_norm": 3.0829477310180664, + "learning_rate": 5.462550369775277e-07, + "loss": 0.2877, + "step": 9990 + }, + { + "epoch": 4.72387706855792, + "grad_norm": 2.8730506896972656, + "learning_rate": 5.458658873125419e-07, + "loss": 0.3352, + "step": 9991 + }, + { + "epoch": 4.72434988179669, + "grad_norm": 3.198498249053955, + "learning_rate": 5.454768593229193e-07, + "loss": 0.3697, + "step": 9992 + }, + { + "epoch": 4.724822695035461, + "grad_norm": 3.37144136428833, + "learning_rate": 5.450879530328824e-07, + "loss": 0.4245, + "step": 9993 + }, + { + "epoch": 4.725295508274232, + "grad_norm": 3.6235079765319824, + "learning_rate": 5.446991684666461e-07, + "loss": 0.3707, + "step": 9994 + }, + { + "epoch": 4.725768321513002, + "grad_norm": 3.5587494373321533, + "learning_rate": 5.443105056484194e-07, + "loss": 0.3297, + "step": 9995 + }, + { + "epoch": 4.726241134751773, + "grad_norm": 3.5308549404144287, + "learning_rate": 5.439219646024018e-07, + "loss": 0.3521, + "step": 9996 + }, + { + "epoch": 4.726713947990544, + "grad_norm": 3.16542649269104, + "learning_rate": 5.435335453527868e-07, + "loss": 0.3499, + "step": 9997 + }, + { + "epoch": 4.727186761229315, + "grad_norm": 3.2565104961395264, + "learning_rate": 5.431452479237586e-07, + "loss": 0.338, + "step": 9998 + }, + { + "epoch": 4.727659574468085, + "grad_norm": 3.371232032775879, + "learning_rate": 5.427570723394951e-07, + "loss": 0.3641, + "step": 9999 + }, + { + "epoch": 4.7281323877068555, + "grad_norm": 2.9784507751464844, + "learning_rate": 5.423690186241668e-07, + "loss": 0.3667, + "step": 10000 + }, + { + "epoch": 4.728605200945626, + "grad_norm": 3.0877480506896973, + "learning_rate": 5.419810868019351e-07, + "loss": 0.3098, + "step": 10001 + }, + { + "epoch": 4.729078014184397, + "grad_norm": 4.132823467254639, + "learning_rate": 5.415932768969562e-07, + "loss": 0.3712, + "step": 10002 + }, + { + "epoch": 4.729550827423168, + "grad_norm": 2.8105905055999756, + "learning_rate": 5.412055889333767e-07, + "loss": 0.2829, + "step": 10003 + }, + { + "epoch": 4.730023640661939, + "grad_norm": 3.543795585632324, + "learning_rate": 5.408180229353352e-07, + "loss": 0.3101, + "step": 10004 + }, + { + "epoch": 4.7304964539007095, + "grad_norm": 3.307525157928467, + "learning_rate": 5.404305789269657e-07, + "loss": 0.3585, + "step": 
10005 + }, + { + "epoch": 4.73096926713948, + "grad_norm": 3.0976414680480957, + "learning_rate": 5.400432569323905e-07, + "loss": 0.3202, + "step": 10006 + }, + { + "epoch": 4.73144208037825, + "grad_norm": 3.0249791145324707, + "learning_rate": 5.396560569757284e-07, + "loss": 0.3468, + "step": 10007 + }, + { + "epoch": 4.731914893617021, + "grad_norm": 3.0199971199035645, + "learning_rate": 5.392689790810879e-07, + "loss": 0.3483, + "step": 10008 + }, + { + "epoch": 4.732387706855792, + "grad_norm": 3.177297592163086, + "learning_rate": 5.388820232725697e-07, + "loss": 0.3333, + "step": 10009 + }, + { + "epoch": 4.732860520094563, + "grad_norm": 3.247121572494507, + "learning_rate": 5.384951895742693e-07, + "loss": 0.2881, + "step": 10010 + }, + { + "epoch": 4.733333333333333, + "grad_norm": 3.513106346130371, + "learning_rate": 5.381084780102727e-07, + "loss": 0.3786, + "step": 10011 + }, + { + "epoch": 4.733806146572104, + "grad_norm": 2.8936305046081543, + "learning_rate": 5.377218886046584e-07, + "loss": 0.3174, + "step": 10012 + }, + { + "epoch": 4.734278959810875, + "grad_norm": 3.1088016033172607, + "learning_rate": 5.373354213814977e-07, + "loss": 0.3108, + "step": 10013 + }, + { + "epoch": 4.734751773049645, + "grad_norm": 2.693617343902588, + "learning_rate": 5.369490763648539e-07, + "loss": 0.3441, + "step": 10014 + }, + { + "epoch": 4.735224586288416, + "grad_norm": 3.4399259090423584, + "learning_rate": 5.365628535787837e-07, + "loss": 0.3937, + "step": 10015 + }, + { + "epoch": 4.7356973995271865, + "grad_norm": 3.28714919090271, + "learning_rate": 5.361767530473355e-07, + "loss": 0.2993, + "step": 10016 + }, + { + "epoch": 4.736170212765957, + "grad_norm": 3.1407346725463867, + "learning_rate": 5.35790774794549e-07, + "loss": 0.3605, + "step": 10017 + }, + { + "epoch": 4.736643026004728, + "grad_norm": 3.464386224746704, + "learning_rate": 5.354049188444588e-07, + "loss": 0.382, + "step": 10018 + }, + { + "epoch": 4.737115839243499, + "grad_norm": 3.303809881210327, + "learning_rate": 5.350191852210889e-07, + "loss": 0.3438, + "step": 10019 + }, + { + "epoch": 4.73758865248227, + "grad_norm": 3.3727755546569824, + "learning_rate": 5.346335739484593e-07, + "loss": 0.3524, + "step": 10020 + }, + { + "epoch": 4.7380614657210405, + "grad_norm": 3.125762939453125, + "learning_rate": 5.342480850505788e-07, + "loss": 0.3762, + "step": 10021 + }, + { + "epoch": 4.738534278959811, + "grad_norm": 3.32598876953125, + "learning_rate": 5.3386271855145e-07, + "loss": 0.345, + "step": 10022 + }, + { + "epoch": 4.739007092198581, + "grad_norm": 2.889338970184326, + "learning_rate": 5.334774744750692e-07, + "loss": 0.3245, + "step": 10023 + }, + { + "epoch": 4.739479905437352, + "grad_norm": 3.3369252681732178, + "learning_rate": 5.330923528454223e-07, + "loss": 0.366, + "step": 10024 + }, + { + "epoch": 4.739952718676123, + "grad_norm": 3.008836269378662, + "learning_rate": 5.327073536864908e-07, + "loss": 0.358, + "step": 10025 + }, + { + "epoch": 4.740425531914894, + "grad_norm": 3.1076738834381104, + "learning_rate": 5.323224770222457e-07, + "loss": 0.3398, + "step": 10026 + }, + { + "epoch": 4.740898345153664, + "grad_norm": 3.269164800643921, + "learning_rate": 5.319377228766523e-07, + "loss": 0.3364, + "step": 10027 + }, + { + "epoch": 4.741371158392435, + "grad_norm": 3.3928871154785156, + "learning_rate": 5.315530912736671e-07, + "loss": 0.3376, + "step": 10028 + }, + { + "epoch": 4.741843971631206, + "grad_norm": 2.7413101196289062, + "learning_rate": 5.31168582237239e-07, + 
"loss": 0.3551, + "step": 10029 + }, + { + "epoch": 4.742316784869976, + "grad_norm": 2.837280035018921, + "learning_rate": 5.307841957913104e-07, + "loss": 0.316, + "step": 10030 + }, + { + "epoch": 4.742789598108747, + "grad_norm": 3.140482187271118, + "learning_rate": 5.303999319598158e-07, + "loss": 0.3951, + "step": 10031 + }, + { + "epoch": 4.7432624113475175, + "grad_norm": 2.978053331375122, + "learning_rate": 5.3001579076668e-07, + "loss": 0.3328, + "step": 10032 + }, + { + "epoch": 4.743735224586288, + "grad_norm": 3.3469338417053223, + "learning_rate": 5.296317722358235e-07, + "loss": 0.328, + "step": 10033 + }, + { + "epoch": 4.744208037825059, + "grad_norm": 3.1574513912200928, + "learning_rate": 5.29247876391156e-07, + "loss": 0.3375, + "step": 10034 + }, + { + "epoch": 4.74468085106383, + "grad_norm": 2.9314582347869873, + "learning_rate": 5.288641032565825e-07, + "loss": 0.3025, + "step": 10035 + }, + { + "epoch": 4.745153664302601, + "grad_norm": 3.298856258392334, + "learning_rate": 5.284804528559981e-07, + "loss": 0.3071, + "step": 10036 + }, + { + "epoch": 4.7456264775413715, + "grad_norm": 3.489758014678955, + "learning_rate": 5.280969252132903e-07, + "loss": 0.3392, + "step": 10037 + }, + { + "epoch": 4.746099290780142, + "grad_norm": 3.1727964878082275, + "learning_rate": 5.277135203523412e-07, + "loss": 0.3472, + "step": 10038 + }, + { + "epoch": 4.746572104018912, + "grad_norm": 3.267204761505127, + "learning_rate": 5.27330238297023e-07, + "loss": 0.3555, + "step": 10039 + }, + { + "epoch": 4.747044917257683, + "grad_norm": 3.376077175140381, + "learning_rate": 5.269470790712003e-07, + "loss": 0.4018, + "step": 10040 + }, + { + "epoch": 4.747517730496454, + "grad_norm": 3.2389678955078125, + "learning_rate": 5.265640426987321e-07, + "loss": 0.3742, + "step": 10041 + }, + { + "epoch": 4.7479905437352246, + "grad_norm": 3.0280439853668213, + "learning_rate": 5.261811292034668e-07, + "loss": 0.3254, + "step": 10042 + }, + { + "epoch": 4.748463356973995, + "grad_norm": 3.1756322383880615, + "learning_rate": 5.257983386092486e-07, + "loss": 0.3434, + "step": 10043 + }, + { + "epoch": 4.748936170212766, + "grad_norm": 3.220245599746704, + "learning_rate": 5.254156709399111e-07, + "loss": 0.3795, + "step": 10044 + }, + { + "epoch": 4.749408983451537, + "grad_norm": 3.4887516498565674, + "learning_rate": 5.250331262192815e-07, + "loss": 0.353, + "step": 10045 + }, + { + "epoch": 4.749881796690307, + "grad_norm": 3.1106226444244385, + "learning_rate": 5.246507044711791e-07, + "loss": 0.3329, + "step": 10046 + }, + { + "epoch": 4.750354609929078, + "grad_norm": 3.0493836402893066, + "learning_rate": 5.24268405719415e-07, + "loss": 0.3372, + "step": 10047 + }, + { + "epoch": 4.7508274231678485, + "grad_norm": 3.0885660648345947, + "learning_rate": 5.238862299877948e-07, + "loss": 0.3583, + "step": 10048 + }, + { + "epoch": 4.751300236406619, + "grad_norm": 3.194566011428833, + "learning_rate": 5.23504177300114e-07, + "loss": 0.3886, + "step": 10049 + }, + { + "epoch": 4.75177304964539, + "grad_norm": 2.9062368869781494, + "learning_rate": 5.231222476801606e-07, + "loss": 0.3267, + "step": 10050 + }, + { + "epoch": 4.752245862884161, + "grad_norm": 2.9814155101776123, + "learning_rate": 5.227404411517173e-07, + "loss": 0.3817, + "step": 10051 + }, + { + "epoch": 4.752718676122932, + "grad_norm": 3.526301383972168, + "learning_rate": 5.22358757738556e-07, + "loss": 0.3405, + "step": 10052 + }, + { + "epoch": 4.753191489361702, + "grad_norm": 3.2342031002044678, + 
"learning_rate": 5.219771974644439e-07, + "loss": 0.3429, + "step": 10053 + }, + { + "epoch": 4.753664302600473, + "grad_norm": 3.0213656425476074, + "learning_rate": 5.215957603531383e-07, + "loss": 0.3482, + "step": 10054 + }, + { + "epoch": 4.754137115839243, + "grad_norm": 3.566260576248169, + "learning_rate": 5.212144464283889e-07, + "loss": 0.3633, + "step": 10055 + }, + { + "epoch": 4.754609929078014, + "grad_norm": 3.3363420963287354, + "learning_rate": 5.208332557139398e-07, + "loss": 0.3528, + "step": 10056 + }, + { + "epoch": 4.755082742316785, + "grad_norm": 3.3407959938049316, + "learning_rate": 5.204521882335251e-07, + "loss": 0.3219, + "step": 10057 + }, + { + "epoch": 4.7555555555555555, + "grad_norm": 2.9756882190704346, + "learning_rate": 5.200712440108729e-07, + "loss": 0.3141, + "step": 10058 + }, + { + "epoch": 4.756028368794326, + "grad_norm": 3.7191832065582275, + "learning_rate": 5.19690423069703e-07, + "loss": 0.3657, + "step": 10059 + }, + { + "epoch": 4.756501182033097, + "grad_norm": 3.175494432449341, + "learning_rate": 5.193097254337268e-07, + "loss": 0.2922, + "step": 10060 + }, + { + "epoch": 4.756973995271868, + "grad_norm": 2.9288907051086426, + "learning_rate": 5.189291511266489e-07, + "loss": 0.3097, + "step": 10061 + }, + { + "epoch": 4.757446808510638, + "grad_norm": 3.1014389991760254, + "learning_rate": 5.185487001721656e-07, + "loss": 0.3443, + "step": 10062 + }, + { + "epoch": 4.757919621749409, + "grad_norm": 3.3224666118621826, + "learning_rate": 5.181683725939668e-07, + "loss": 0.3408, + "step": 10063 + }, + { + "epoch": 4.758392434988179, + "grad_norm": 3.248089075088501, + "learning_rate": 5.177881684157335e-07, + "loss": 0.366, + "step": 10064 + }, + { + "epoch": 4.75886524822695, + "grad_norm": 3.3183906078338623, + "learning_rate": 5.174080876611385e-07, + "loss": 0.3774, + "step": 10065 + }, + { + "epoch": 4.759338061465721, + "grad_norm": 3.1653311252593994, + "learning_rate": 5.17028130353849e-07, + "loss": 0.3208, + "step": 10066 + }, + { + "epoch": 4.759810874704492, + "grad_norm": 2.9300882816314697, + "learning_rate": 5.166482965175229e-07, + "loss": 0.3494, + "step": 10067 + }, + { + "epoch": 4.760283687943263, + "grad_norm": 3.504225254058838, + "learning_rate": 5.162685861758099e-07, + "loss": 0.3777, + "step": 10068 + }, + { + "epoch": 4.760756501182033, + "grad_norm": 3.3933908939361572, + "learning_rate": 5.158889993523544e-07, + "loss": 0.3575, + "step": 10069 + }, + { + "epoch": 4.761229314420804, + "grad_norm": 4.30021333694458, + "learning_rate": 5.155095360707901e-07, + "loss": 0.3435, + "step": 10070 + }, + { + "epoch": 4.761702127659574, + "grad_norm": 3.226658582687378, + "learning_rate": 5.151301963547462e-07, + "loss": 0.3473, + "step": 10071 + }, + { + "epoch": 4.762174940898345, + "grad_norm": 3.222884178161621, + "learning_rate": 5.14750980227841e-07, + "loss": 0.314, + "step": 10072 + }, + { + "epoch": 4.762647754137116, + "grad_norm": 3.077139377593994, + "learning_rate": 5.143718877136872e-07, + "loss": 0.2929, + "step": 10073 + }, + { + "epoch": 4.7631205673758865, + "grad_norm": 2.9789531230926514, + "learning_rate": 5.139929188358894e-07, + "loss": 0.3594, + "step": 10074 + }, + { + "epoch": 4.763593380614657, + "grad_norm": 3.558417797088623, + "learning_rate": 5.136140736180445e-07, + "loss": 0.356, + "step": 10075 + }, + { + "epoch": 4.764066193853428, + "grad_norm": 2.8887953758239746, + "learning_rate": 5.13235352083741e-07, + "loss": 0.2957, + "step": 10076 + }, + { + "epoch": 4.764539007092199, + 
"grad_norm": 3.187857151031494, + "learning_rate": 5.128567542565605e-07, + "loss": 0.3879, + "step": 10077 + }, + { + "epoch": 4.765011820330969, + "grad_norm": 3.761465072631836, + "learning_rate": 5.124782801600758e-07, + "loss": 0.3163, + "step": 10078 + }, + { + "epoch": 4.76548463356974, + "grad_norm": 3.4338560104370117, + "learning_rate": 5.120999298178541e-07, + "loss": 0.3924, + "step": 10079 + }, + { + "epoch": 4.76595744680851, + "grad_norm": 2.8551666736602783, + "learning_rate": 5.117217032534528e-07, + "loss": 0.329, + "step": 10080 + }, + { + "epoch": 4.766430260047281, + "grad_norm": 3.4713878631591797, + "learning_rate": 5.113436004904232e-07, + "loss": 0.3802, + "step": 10081 + }, + { + "epoch": 4.766903073286052, + "grad_norm": 3.1913888454437256, + "learning_rate": 5.109656215523076e-07, + "loss": 0.3273, + "step": 10082 + }, + { + "epoch": 4.767375886524823, + "grad_norm": 2.8070812225341797, + "learning_rate": 5.105877664626402e-07, + "loss": 0.3398, + "step": 10083 + }, + { + "epoch": 4.767848699763594, + "grad_norm": 3.316321849822998, + "learning_rate": 5.102100352449502e-07, + "loss": 0.3649, + "step": 10084 + }, + { + "epoch": 4.768321513002364, + "grad_norm": 3.3555870056152344, + "learning_rate": 5.098324279227557e-07, + "loss": 0.333, + "step": 10085 + }, + { + "epoch": 4.768794326241135, + "grad_norm": 3.0964810848236084, + "learning_rate": 5.094549445195699e-07, + "loss": 0.3384, + "step": 10086 + }, + { + "epoch": 4.769267139479905, + "grad_norm": 3.0406007766723633, + "learning_rate": 5.090775850588963e-07, + "loss": 0.3582, + "step": 10087 + }, + { + "epoch": 4.769739952718676, + "grad_norm": 2.934340238571167, + "learning_rate": 5.087003495642309e-07, + "loss": 0.3306, + "step": 10088 + }, + { + "epoch": 4.770212765957447, + "grad_norm": 3.441734552383423, + "learning_rate": 5.083232380590641e-07, + "loss": 0.386, + "step": 10089 + }, + { + "epoch": 4.7706855791962175, + "grad_norm": 3.176483631134033, + "learning_rate": 5.079462505668758e-07, + "loss": 0.3516, + "step": 10090 + }, + { + "epoch": 4.771158392434988, + "grad_norm": 3.1490824222564697, + "learning_rate": 5.075693871111395e-07, + "loss": 0.3233, + "step": 10091 + }, + { + "epoch": 4.771631205673759, + "grad_norm": 3.300335645675659, + "learning_rate": 5.07192647715321e-07, + "loss": 0.2975, + "step": 10092 + }, + { + "epoch": 4.77210401891253, + "grad_norm": 3.199085235595703, + "learning_rate": 5.068160324028776e-07, + "loss": 0.3468, + "step": 10093 + }, + { + "epoch": 4.7725768321513, + "grad_norm": 3.4611270427703857, + "learning_rate": 5.064395411972605e-07, + "loss": 0.3319, + "step": 10094 + }, + { + "epoch": 4.773049645390071, + "grad_norm": 3.0549957752227783, + "learning_rate": 5.060631741219119e-07, + "loss": 0.3542, + "step": 10095 + }, + { + "epoch": 4.773522458628841, + "grad_norm": 3.085744619369507, + "learning_rate": 5.056869312002655e-07, + "loss": 0.3611, + "step": 10096 + }, + { + "epoch": 4.773995271867612, + "grad_norm": 3.4383676052093506, + "learning_rate": 5.053108124557496e-07, + "loss": 0.3606, + "step": 10097 + }, + { + "epoch": 4.774468085106383, + "grad_norm": 2.8119592666625977, + "learning_rate": 5.049348179117825e-07, + "loss": 0.3192, + "step": 10098 + }, + { + "epoch": 4.774940898345154, + "grad_norm": 2.8554961681365967, + "learning_rate": 5.045589475917767e-07, + "loss": 0.321, + "step": 10099 + }, + { + "epoch": 4.775413711583925, + "grad_norm": 3.612732410430908, + "learning_rate": 5.041832015191356e-07, + "loss": 0.3385, + "step": 10100 + }, + { + 
"epoch": 4.775886524822695, + "grad_norm": 3.432650327682495, + "learning_rate": 5.038075797172543e-07, + "loss": 0.3494, + "step": 10101 + }, + { + "epoch": 4.776359338061466, + "grad_norm": 3.241612672805786, + "learning_rate": 5.034320822095228e-07, + "loss": 0.3377, + "step": 10102 + }, + { + "epoch": 4.776832151300236, + "grad_norm": 3.5062692165374756, + "learning_rate": 5.030567090193203e-07, + "loss": 0.4038, + "step": 10103 + }, + { + "epoch": 4.777304964539007, + "grad_norm": 2.9015917778015137, + "learning_rate": 5.026814601700205e-07, + "loss": 0.2987, + "step": 10104 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 3.0691189765930176, + "learning_rate": 5.023063356849886e-07, + "loss": 0.3725, + "step": 10105 + }, + { + "epoch": 4.7782505910165485, + "grad_norm": 3.1556789875030518, + "learning_rate": 5.019313355875813e-07, + "loss": 0.3554, + "step": 10106 + }, + { + "epoch": 4.778723404255319, + "grad_norm": 2.84529447555542, + "learning_rate": 5.01556459901148e-07, + "loss": 0.3369, + "step": 10107 + }, + { + "epoch": 4.77919621749409, + "grad_norm": 3.322565793991089, + "learning_rate": 5.011817086490315e-07, + "loss": 0.3641, + "step": 10108 + }, + { + "epoch": 4.779669030732861, + "grad_norm": 3.011988639831543, + "learning_rate": 5.008070818545654e-07, + "loss": 0.3153, + "step": 10109 + }, + { + "epoch": 4.780141843971631, + "grad_norm": 2.937770128250122, + "learning_rate": 5.004325795410764e-07, + "loss": 0.3235, + "step": 10110 + }, + { + "epoch": 4.780614657210402, + "grad_norm": 3.0186142921447754, + "learning_rate": 5.00058201731882e-07, + "loss": 0.3443, + "step": 10111 + }, + { + "epoch": 4.781087470449172, + "grad_norm": 3.1810684204101562, + "learning_rate": 4.996839484502946e-07, + "loss": 0.3511, + "step": 10112 + }, + { + "epoch": 4.781560283687943, + "grad_norm": 3.5470240116119385, + "learning_rate": 4.993098197196167e-07, + "loss": 0.4096, + "step": 10113 + }, + { + "epoch": 4.782033096926714, + "grad_norm": 3.1422345638275146, + "learning_rate": 4.989358155631427e-07, + "loss": 0.3566, + "step": 10114 + }, + { + "epoch": 4.782505910165485, + "grad_norm": 3.3392271995544434, + "learning_rate": 4.985619360041619e-07, + "loss": 0.3278, + "step": 10115 + }, + { + "epoch": 4.782978723404256, + "grad_norm": 3.020026206970215, + "learning_rate": 4.981881810659525e-07, + "loss": 0.3349, + "step": 10116 + }, + { + "epoch": 4.783451536643026, + "grad_norm": 3.061652660369873, + "learning_rate": 4.97814550771788e-07, + "loss": 0.3275, + "step": 10117 + }, + { + "epoch": 4.783924349881797, + "grad_norm": 3.5875346660614014, + "learning_rate": 4.974410451449321e-07, + "loss": 0.3694, + "step": 10118 + }, + { + "epoch": 4.784397163120567, + "grad_norm": 3.848348379135132, + "learning_rate": 4.970676642086408e-07, + "loss": 0.3539, + "step": 10119 + }, + { + "epoch": 4.784869976359338, + "grad_norm": 3.237959146499634, + "learning_rate": 4.966944079861641e-07, + "loss": 0.3468, + "step": 10120 + }, + { + "epoch": 4.785342789598109, + "grad_norm": 3.4829745292663574, + "learning_rate": 4.96321276500742e-07, + "loss": 0.348, + "step": 10121 + }, + { + "epoch": 4.7858156028368795, + "grad_norm": 3.29961895942688, + "learning_rate": 4.959482697756085e-07, + "loss": 0.3499, + "step": 10122 + }, + { + "epoch": 4.78628841607565, + "grad_norm": 3.291260242462158, + "learning_rate": 4.955753878339886e-07, + "loss": 0.3525, + "step": 10123 + }, + { + "epoch": 4.786761229314421, + "grad_norm": 3.543893575668335, + "learning_rate": 4.952026306991004e-07, + "loss": 0.4274, + 
"step": 10124 + }, + { + "epoch": 4.787234042553192, + "grad_norm": 3.551354169845581, + "learning_rate": 4.948299983941534e-07, + "loss": 0.3116, + "step": 10125 + }, + { + "epoch": 4.787706855791962, + "grad_norm": 3.1988296508789062, + "learning_rate": 4.944574909423497e-07, + "loss": 0.3273, + "step": 10126 + }, + { + "epoch": 4.7881796690307326, + "grad_norm": 2.8899428844451904, + "learning_rate": 4.940851083668843e-07, + "loss": 0.3518, + "step": 10127 + }, + { + "epoch": 4.788652482269503, + "grad_norm": 3.279688835144043, + "learning_rate": 4.937128506909439e-07, + "loss": 0.3735, + "step": 10128 + }, + { + "epoch": 4.789125295508274, + "grad_norm": 3.0784502029418945, + "learning_rate": 4.933407179377059e-07, + "loss": 0.327, + "step": 10129 + }, + { + "epoch": 4.789598108747045, + "grad_norm": 3.390169858932495, + "learning_rate": 4.929687101303435e-07, + "loss": 0.3895, + "step": 10130 + }, + { + "epoch": 4.790070921985816, + "grad_norm": 3.72928524017334, + "learning_rate": 4.925968272920181e-07, + "loss": 0.3598, + "step": 10131 + }, + { + "epoch": 4.7905437352245865, + "grad_norm": 3.3786826133728027, + "learning_rate": 4.922250694458866e-07, + "loss": 0.363, + "step": 10132 + }, + { + "epoch": 4.791016548463357, + "grad_norm": 3.086150884628296, + "learning_rate": 4.918534366150965e-07, + "loss": 0.2877, + "step": 10133 + }, + { + "epoch": 4.791489361702128, + "grad_norm": 3.3568673133850098, + "learning_rate": 4.914819288227865e-07, + "loss": 0.3153, + "step": 10134 + }, + { + "epoch": 4.791962174940898, + "grad_norm": 3.294382095336914, + "learning_rate": 4.911105460920904e-07, + "loss": 0.3327, + "step": 10135 + }, + { + "epoch": 4.792434988179669, + "grad_norm": 3.0562479496002197, + "learning_rate": 4.907392884461321e-07, + "loss": 0.3368, + "step": 10136 + }, + { + "epoch": 4.79290780141844, + "grad_norm": 2.928912878036499, + "learning_rate": 4.90368155908027e-07, + "loss": 0.295, + "step": 10137 + }, + { + "epoch": 4.79338061465721, + "grad_norm": 3.0252797603607178, + "learning_rate": 4.899971485008858e-07, + "loss": 0.2985, + "step": 10138 + }, + { + "epoch": 4.793853427895981, + "grad_norm": 2.830035924911499, + "learning_rate": 4.896262662478085e-07, + "loss": 0.3518, + "step": 10139 + }, + { + "epoch": 4.794326241134752, + "grad_norm": 3.042524576187134, + "learning_rate": 4.892555091718884e-07, + "loss": 0.2871, + "step": 10140 + }, + { + "epoch": 4.794799054373523, + "grad_norm": 2.920741558074951, + "learning_rate": 4.888848772962107e-07, + "loss": 0.3234, + "step": 10141 + }, + { + "epoch": 4.795271867612293, + "grad_norm": 3.3935956954956055, + "learning_rate": 4.885143706438527e-07, + "loss": 0.3612, + "step": 10142 + }, + { + "epoch": 4.7957446808510635, + "grad_norm": 3.1501455307006836, + "learning_rate": 4.881439892378853e-07, + "loss": 0.3239, + "step": 10143 + }, + { + "epoch": 4.796217494089834, + "grad_norm": 3.233794927597046, + "learning_rate": 4.877737331013696e-07, + "loss": 0.3185, + "step": 10144 + }, + { + "epoch": 4.796690307328605, + "grad_norm": 3.1155240535736084, + "learning_rate": 4.874036022573605e-07, + "loss": 0.3128, + "step": 10145 + }, + { + "epoch": 4.797163120567376, + "grad_norm": 3.313546895980835, + "learning_rate": 4.870335967289042e-07, + "loss": 0.3136, + "step": 10146 + }, + { + "epoch": 4.797635933806147, + "grad_norm": 5.024696350097656, + "learning_rate": 4.866637165390387e-07, + "loss": 0.4032, + "step": 10147 + }, + { + "epoch": 4.7981087470449175, + "grad_norm": 3.109086275100708, + "learning_rate": 
4.862939617107959e-07, + "loss": 0.3822, + "step": 10148 + }, + { + "epoch": 4.798581560283688, + "grad_norm": 3.144777536392212, + "learning_rate": 4.859243322671978e-07, + "loss": 0.3362, + "step": 10149 + }, + { + "epoch": 4.799054373522459, + "grad_norm": 3.402974843978882, + "learning_rate": 4.855548282312605e-07, + "loss": 0.3803, + "step": 10150 + }, + { + "epoch": 4.799527186761229, + "grad_norm": 2.6077685356140137, + "learning_rate": 4.851854496259911e-07, + "loss": 0.3043, + "step": 10151 + }, + { + "epoch": 4.8, + "grad_norm": 3.38386607170105, + "learning_rate": 4.848161964743883e-07, + "loss": 0.3129, + "step": 10152 + }, + { + "epoch": 4.800472813238771, + "grad_norm": 3.193723440170288, + "learning_rate": 4.844470687994454e-07, + "loss": 0.3544, + "step": 10153 + }, + { + "epoch": 4.800945626477541, + "grad_norm": 2.9620895385742188, + "learning_rate": 4.840780666241457e-07, + "loss": 0.3376, + "step": 10154 + }, + { + "epoch": 4.801418439716312, + "grad_norm": 4.192742824554443, + "learning_rate": 4.83709189971465e-07, + "loss": 0.3325, + "step": 10155 + }, + { + "epoch": 4.801891252955083, + "grad_norm": 2.996617555618286, + "learning_rate": 4.83340438864372e-07, + "loss": 0.3237, + "step": 10156 + }, + { + "epoch": 4.802364066193854, + "grad_norm": 3.255037307739258, + "learning_rate": 4.829718133258263e-07, + "loss": 0.3575, + "step": 10157 + }, + { + "epoch": 4.802836879432624, + "grad_norm": 3.1065316200256348, + "learning_rate": 4.826033133787822e-07, + "loss": 0.347, + "step": 10158 + }, + { + "epoch": 4.8033096926713945, + "grad_norm": 3.321096420288086, + "learning_rate": 4.822349390461831e-07, + "loss": 0.3628, + "step": 10159 + }, + { + "epoch": 4.803782505910165, + "grad_norm": 3.549182653427124, + "learning_rate": 4.818666903509672e-07, + "loss": 0.3539, + "step": 10160 + }, + { + "epoch": 4.804255319148936, + "grad_norm": 2.9063286781311035, + "learning_rate": 4.814985673160633e-07, + "loss": 0.2956, + "step": 10161 + }, + { + "epoch": 4.804728132387707, + "grad_norm": 3.1669399738311768, + "learning_rate": 4.81130569964392e-07, + "loss": 0.3263, + "step": 10162 + }, + { + "epoch": 4.805200945626478, + "grad_norm": 3.667128562927246, + "learning_rate": 4.807626983188684e-07, + "loss": 0.3514, + "step": 10163 + }, + { + "epoch": 4.8056737588652485, + "grad_norm": 3.1469576358795166, + "learning_rate": 4.803949524023976e-07, + "loss": 0.3273, + "step": 10164 + }, + { + "epoch": 4.806146572104019, + "grad_norm": 3.6988110542297363, + "learning_rate": 4.800273322378768e-07, + "loss": 0.3293, + "step": 10165 + }, + { + "epoch": 4.80661938534279, + "grad_norm": 3.6419219970703125, + "learning_rate": 4.79659837848197e-07, + "loss": 0.3696, + "step": 10166 + }, + { + "epoch": 4.80709219858156, + "grad_norm": 3.4860944747924805, + "learning_rate": 4.792924692562398e-07, + "loss": 0.3372, + "step": 10167 + }, + { + "epoch": 4.807565011820331, + "grad_norm": 2.879600763320923, + "learning_rate": 4.789252264848806e-07, + "loss": 0.3192, + "step": 10168 + }, + { + "epoch": 4.808037825059102, + "grad_norm": 3.4475104808807373, + "learning_rate": 4.785581095569855e-07, + "loss": 0.3285, + "step": 10169 + }, + { + "epoch": 4.808510638297872, + "grad_norm": 3.528397560119629, + "learning_rate": 4.78191118495413e-07, + "loss": 0.3612, + "step": 10170 + }, + { + "epoch": 4.808983451536643, + "grad_norm": 3.056796073913574, + "learning_rate": 4.778242533230138e-07, + "loss": 0.3077, + "step": 10171 + }, + { + "epoch": 4.809456264775414, + "grad_norm": 3.302171230316162, + 
"learning_rate": 4.774575140626317e-07, + "loss": 0.2963, + "step": 10172 + }, + { + "epoch": 4.809929078014184, + "grad_norm": 3.1446237564086914, + "learning_rate": 4.770909007371016e-07, + "loss": 0.3438, + "step": 10173 + }, + { + "epoch": 4.810401891252955, + "grad_norm": 2.917919635772705, + "learning_rate": 4.767244133692511e-07, + "loss": 0.3353, + "step": 10174 + }, + { + "epoch": 4.8108747044917255, + "grad_norm": 3.0808987617492676, + "learning_rate": 4.763580519818989e-07, + "loss": 0.3574, + "step": 10175 + }, + { + "epoch": 4.811347517730496, + "grad_norm": 3.2861616611480713, + "learning_rate": 4.75991816597858e-07, + "loss": 0.3891, + "step": 10176 + }, + { + "epoch": 4.811820330969267, + "grad_norm": 2.835925340652466, + "learning_rate": 4.7562570723993116e-07, + "loss": 0.3785, + "step": 10177 + }, + { + "epoch": 4.812293144208038, + "grad_norm": 3.5441393852233887, + "learning_rate": 4.7525972393091534e-07, + "loss": 0.3914, + "step": 10178 + }, + { + "epoch": 4.812765957446809, + "grad_norm": 3.94022798538208, + "learning_rate": 4.748938666935984e-07, + "loss": 0.3564, + "step": 10179 + }, + { + "epoch": 4.8132387706855795, + "grad_norm": 3.1686532497406006, + "learning_rate": 4.7452813555076e-07, + "loss": 0.3348, + "step": 10180 + }, + { + "epoch": 4.81371158392435, + "grad_norm": 3.534032106399536, + "learning_rate": 4.7416253052517374e-07, + "loss": 0.3165, + "step": 10181 + }, + { + "epoch": 4.81418439716312, + "grad_norm": 3.1169021129608154, + "learning_rate": 4.7379705163960317e-07, + "loss": 0.3515, + "step": 10182 + }, + { + "epoch": 4.814657210401891, + "grad_norm": 3.564509391784668, + "learning_rate": 4.7343169891680585e-07, + "loss": 0.4195, + "step": 10183 + }, + { + "epoch": 4.815130023640662, + "grad_norm": 3.2813005447387695, + "learning_rate": 4.7306647237953085e-07, + "loss": 0.3574, + "step": 10184 + }, + { + "epoch": 4.815602836879433, + "grad_norm": 3.053349018096924, + "learning_rate": 4.727013720505177e-07, + "loss": 0.3792, + "step": 10185 + }, + { + "epoch": 4.816075650118203, + "grad_norm": 3.069258689880371, + "learning_rate": 4.723363979525017e-07, + "loss": 0.3377, + "step": 10186 + }, + { + "epoch": 4.816548463356974, + "grad_norm": 2.999802350997925, + "learning_rate": 4.71971550108207e-07, + "loss": 0.3232, + "step": 10187 + }, + { + "epoch": 4.817021276595745, + "grad_norm": 2.941810131072998, + "learning_rate": 4.7160682854035107e-07, + "loss": 0.342, + "step": 10188 + }, + { + "epoch": 4.817494089834515, + "grad_norm": 3.407975196838379, + "learning_rate": 4.71242233271644e-07, + "loss": 0.3697, + "step": 10189 + }, + { + "epoch": 4.817966903073286, + "grad_norm": 3.148359537124634, + "learning_rate": 4.708777643247864e-07, + "loss": 0.3297, + "step": 10190 + }, + { + "epoch": 4.8184397163120565, + "grad_norm": 2.9067797660827637, + "learning_rate": 4.7051342172247354e-07, + "loss": 0.2646, + "step": 10191 + }, + { + "epoch": 4.818912529550827, + "grad_norm": 3.4185385704040527, + "learning_rate": 4.70149205487391e-07, + "loss": 0.3296, + "step": 10192 + }, + { + "epoch": 4.819385342789598, + "grad_norm": 3.363966464996338, + "learning_rate": 4.697851156422162e-07, + "loss": 0.3744, + "step": 10193 + }, + { + "epoch": 4.819858156028369, + "grad_norm": 2.944939613342285, + "learning_rate": 4.6942115220962067e-07, + "loss": 0.3311, + "step": 10194 + }, + { + "epoch": 4.82033096926714, + "grad_norm": 3.2023603916168213, + "learning_rate": 4.6905731521226544e-07, + "loss": 0.3114, + "step": 10195 + }, + { + "epoch": 4.8208037825059105, + 
"grad_norm": 2.9747812747955322, + "learning_rate": 4.686936046728063e-07, + "loss": 0.2891, + "step": 10196 + }, + { + "epoch": 4.821276595744681, + "grad_norm": 3.6693246364593506, + "learning_rate": 4.6833002061388965e-07, + "loss": 0.3758, + "step": 10197 + }, + { + "epoch": 4.821749408983451, + "grad_norm": 3.4812891483306885, + "learning_rate": 4.679665630581534e-07, + "loss": 0.3274, + "step": 10198 + }, + { + "epoch": 4.822222222222222, + "grad_norm": 2.888956308364868, + "learning_rate": 4.676032320282295e-07, + "loss": 0.3304, + "step": 10199 + }, + { + "epoch": 4.822695035460993, + "grad_norm": 3.2659964561462402, + "learning_rate": 4.6724002754674006e-07, + "loss": 0.3267, + "step": 10200 + }, + { + "epoch": 4.823167848699764, + "grad_norm": 3.2733213901519775, + "learning_rate": 4.6687694963630127e-07, + "loss": 0.3067, + "step": 10201 + }, + { + "epoch": 4.823640661938534, + "grad_norm": 3.0957846641540527, + "learning_rate": 4.6651399831951995e-07, + "loss": 0.3586, + "step": 10202 + }, + { + "epoch": 4.824113475177305, + "grad_norm": 2.9597535133361816, + "learning_rate": 4.6615117361899526e-07, + "loss": 0.3409, + "step": 10203 + }, + { + "epoch": 4.824586288416076, + "grad_norm": 3.0622851848602295, + "learning_rate": 4.657884755573189e-07, + "loss": 0.3112, + "step": 10204 + }, + { + "epoch": 4.825059101654846, + "grad_norm": 3.088568925857544, + "learning_rate": 4.6542590415707355e-07, + "loss": 0.3161, + "step": 10205 + }, + { + "epoch": 4.825531914893617, + "grad_norm": 3.2927064895629883, + "learning_rate": 4.650634594408368e-07, + "loss": 0.3368, + "step": 10206 + }, + { + "epoch": 4.8260047281323875, + "grad_norm": 2.9728758335113525, + "learning_rate": 4.647011414311753e-07, + "loss": 0.3615, + "step": 10207 + }, + { + "epoch": 4.826477541371158, + "grad_norm": 3.301173686981201, + "learning_rate": 4.643389501506487e-07, + "loss": 0.3597, + "step": 10208 + }, + { + "epoch": 4.826950354609929, + "grad_norm": 3.421177864074707, + "learning_rate": 4.639768856218102e-07, + "loss": 0.3087, + "step": 10209 + }, + { + "epoch": 4.8274231678487, + "grad_norm": 3.1131463050842285, + "learning_rate": 4.636149478672031e-07, + "loss": 0.3776, + "step": 10210 + }, + { + "epoch": 4.827895981087471, + "grad_norm": 3.5807228088378906, + "learning_rate": 4.6325313690936347e-07, + "loss": 0.3556, + "step": 10211 + }, + { + "epoch": 4.828368794326241, + "grad_norm": 3.2873311042785645, + "learning_rate": 4.6289145277082085e-07, + "loss": 0.377, + "step": 10212 + }, + { + "epoch": 4.828841607565012, + "grad_norm": 3.502228021621704, + "learning_rate": 4.6252989547409423e-07, + "loss": 0.3699, + "step": 10213 + }, + { + "epoch": 4.829314420803782, + "grad_norm": 3.8895792961120605, + "learning_rate": 4.621684650416977e-07, + "loss": 0.3594, + "step": 10214 + }, + { + "epoch": 4.829787234042553, + "grad_norm": 3.11706805229187, + "learning_rate": 4.6180716149613505e-07, + "loss": 0.3404, + "step": 10215 + }, + { + "epoch": 4.830260047281324, + "grad_norm": 3.174584150314331, + "learning_rate": 4.614459848599029e-07, + "loss": 0.3684, + "step": 10216 + }, + { + "epoch": 4.8307328605200945, + "grad_norm": 3.0028135776519775, + "learning_rate": 4.610849351554908e-07, + "loss": 0.3505, + "step": 10217 + }, + { + "epoch": 4.831205673758865, + "grad_norm": 3.053354024887085, + "learning_rate": 4.6072401240537965e-07, + "loss": 0.392, + "step": 10218 + }, + { + "epoch": 4.831678486997636, + "grad_norm": 3.2726800441741943, + "learning_rate": 4.603632166320424e-07, + "loss": 0.3506, + "step": 
10219 + }, + { + "epoch": 4.832151300236407, + "grad_norm": 3.5746219158172607, + "learning_rate": 4.600025478579437e-07, + "loss": 0.3585, + "step": 10220 + }, + { + "epoch": 4.832624113475177, + "grad_norm": 3.0742499828338623, + "learning_rate": 4.596420061055409e-07, + "loss": 0.332, + "step": 10221 + }, + { + "epoch": 4.833096926713948, + "grad_norm": 3.161022186279297, + "learning_rate": 4.5928159139728426e-07, + "loss": 0.3303, + "step": 10222 + }, + { + "epoch": 4.833569739952718, + "grad_norm": 3.2312185764312744, + "learning_rate": 4.5892130375561395e-07, + "loss": 0.3532, + "step": 10223 + }, + { + "epoch": 4.834042553191489, + "grad_norm": 3.501893997192383, + "learning_rate": 4.585611432029649e-07, + "loss": 0.3862, + "step": 10224 + }, + { + "epoch": 4.83451536643026, + "grad_norm": 3.2005560398101807, + "learning_rate": 4.5820110976176194e-07, + "loss": 0.3626, + "step": 10225 + }, + { + "epoch": 4.834988179669031, + "grad_norm": 3.4039556980133057, + "learning_rate": 4.578412034544225e-07, + "loss": 0.3789, + "step": 10226 + }, + { + "epoch": 4.835460992907802, + "grad_norm": 3.4461448192596436, + "learning_rate": 4.574814243033571e-07, + "loss": 0.3714, + "step": 10227 + }, + { + "epoch": 4.835933806146572, + "grad_norm": 3.435886859893799, + "learning_rate": 4.571217723309665e-07, + "loss": 0.3015, + "step": 10228 + }, + { + "epoch": 4.836406619385343, + "grad_norm": 3.612645149230957, + "learning_rate": 4.567622475596462e-07, + "loss": 0.3738, + "step": 10229 + }, + { + "epoch": 4.836879432624113, + "grad_norm": 3.1911067962646484, + "learning_rate": 4.564028500117815e-07, + "loss": 0.2994, + "step": 10230 + }, + { + "epoch": 4.837352245862884, + "grad_norm": 2.9745163917541504, + "learning_rate": 4.5604357970974956e-07, + "loss": 0.3353, + "step": 10231 + }, + { + "epoch": 4.837825059101655, + "grad_norm": 3.4999606609344482, + "learning_rate": 4.556844366759222e-07, + "loss": 0.3796, + "step": 10232 + }, + { + "epoch": 4.8382978723404255, + "grad_norm": 2.8130152225494385, + "learning_rate": 4.553254209326607e-07, + "loss": 0.2964, + "step": 10233 + }, + { + "epoch": 4.838770685579196, + "grad_norm": 3.5461673736572266, + "learning_rate": 4.5496653250232005e-07, + "loss": 0.3626, + "step": 10234 + }, + { + "epoch": 4.839243498817967, + "grad_norm": 3.3498404026031494, + "learning_rate": 4.546077714072458e-07, + "loss": 0.2982, + "step": 10235 + }, + { + "epoch": 4.839716312056738, + "grad_norm": 2.8942501544952393, + "learning_rate": 4.5424913766977635e-07, + "loss": 0.2447, + "step": 10236 + }, + { + "epoch": 4.840189125295508, + "grad_norm": 3.3506743907928467, + "learning_rate": 4.5389063131224346e-07, + "loss": 0.2908, + "step": 10237 + }, + { + "epoch": 4.840661938534279, + "grad_norm": 3.058872699737549, + "learning_rate": 4.535322523569691e-07, + "loss": 0.3275, + "step": 10238 + }, + { + "epoch": 4.841134751773049, + "grad_norm": 3.0573856830596924, + "learning_rate": 4.5317400082626696e-07, + "loss": 0.3096, + "step": 10239 + }, + { + "epoch": 4.84160756501182, + "grad_norm": 3.3260257244110107, + "learning_rate": 4.5281587674244563e-07, + "loss": 0.3334, + "step": 10240 + }, + { + "epoch": 4.842080378250591, + "grad_norm": 3.265740156173706, + "learning_rate": 4.5245788012780234e-07, + "loss": 0.3698, + "step": 10241 + }, + { + "epoch": 4.842553191489362, + "grad_norm": 3.4116036891937256, + "learning_rate": 4.521000110046292e-07, + "loss": 0.4159, + "step": 10242 + }, + { + "epoch": 4.843026004728133, + "grad_norm": 3.3263189792633057, + "learning_rate": 
4.5174226939520865e-07, + "loss": 0.3579, + "step": 10243 + }, + { + "epoch": 4.843498817966903, + "grad_norm": 3.4223177433013916, + "learning_rate": 4.5138465532181514e-07, + "loss": 0.3539, + "step": 10244 + }, + { + "epoch": 4.843971631205674, + "grad_norm": 3.481016159057617, + "learning_rate": 4.5102716880671665e-07, + "loss": 0.3527, + "step": 10245 + }, + { + "epoch": 4.844444444444444, + "grad_norm": 2.830122232437134, + "learning_rate": 4.5066980987217124e-07, + "loss": 0.3339, + "step": 10246 + }, + { + "epoch": 4.844917257683215, + "grad_norm": 2.895792007446289, + "learning_rate": 4.5031257854043163e-07, + "loss": 0.3056, + "step": 10247 + }, + { + "epoch": 4.845390070921986, + "grad_norm": 2.9748036861419678, + "learning_rate": 4.499554748337398e-07, + "loss": 0.2794, + "step": 10248 + }, + { + "epoch": 4.8458628841607565, + "grad_norm": 3.223539113998413, + "learning_rate": 4.49598498774331e-07, + "loss": 0.3756, + "step": 10249 + }, + { + "epoch": 4.846335697399527, + "grad_norm": 3.491365432739258, + "learning_rate": 4.492416503844335e-07, + "loss": 0.378, + "step": 10250 + }, + { + "epoch": 4.846808510638298, + "grad_norm": 2.7236695289611816, + "learning_rate": 4.48884929686266e-07, + "loss": 0.297, + "step": 10251 + }, + { + "epoch": 4.847281323877069, + "grad_norm": 3.3814051151275635, + "learning_rate": 4.4852833670204045e-07, + "loss": 0.2923, + "step": 10252 + }, + { + "epoch": 4.847754137115839, + "grad_norm": 3.168334722518921, + "learning_rate": 4.4817187145395956e-07, + "loss": 0.3455, + "step": 10253 + }, + { + "epoch": 4.84822695035461, + "grad_norm": 3.0346829891204834, + "learning_rate": 4.4781553396421873e-07, + "loss": 0.3416, + "step": 10254 + }, + { + "epoch": 4.84869976359338, + "grad_norm": 3.1232426166534424, + "learning_rate": 4.4745932425500657e-07, + "loss": 0.3494, + "step": 10255 + }, + { + "epoch": 4.849172576832151, + "grad_norm": 3.0737383365631104, + "learning_rate": 4.471032423485017e-07, + "loss": 0.3246, + "step": 10256 + }, + { + "epoch": 4.849645390070922, + "grad_norm": 3.421461582183838, + "learning_rate": 4.467472882668769e-07, + "loss": 0.3807, + "step": 10257 + }, + { + "epoch": 4.850118203309693, + "grad_norm": 3.3846490383148193, + "learning_rate": 4.463914620322951e-07, + "loss": 0.3695, + "step": 10258 + }, + { + "epoch": 4.850591016548464, + "grad_norm": 4.0876007080078125, + "learning_rate": 4.460357636669116e-07, + "loss": 0.3913, + "step": 10259 + }, + { + "epoch": 4.851063829787234, + "grad_norm": 3.2078847885131836, + "learning_rate": 4.456801931928753e-07, + "loss": 0.3424, + "step": 10260 + }, + { + "epoch": 4.851536643026005, + "grad_norm": 3.4163241386413574, + "learning_rate": 4.453247506323255e-07, + "loss": 0.3907, + "step": 10261 + }, + { + "epoch": 4.852009456264775, + "grad_norm": 2.989793539047241, + "learning_rate": 4.449694360073931e-07, + "loss": 0.3313, + "step": 10262 + }, + { + "epoch": 4.852482269503546, + "grad_norm": 3.291537284851074, + "learning_rate": 4.446142493402039e-07, + "loss": 0.3594, + "step": 10263 + }, + { + "epoch": 4.852955082742317, + "grad_norm": 3.6327221393585205, + "learning_rate": 4.4425919065287204e-07, + "loss": 0.3844, + "step": 10264 + }, + { + "epoch": 4.8534278959810875, + "grad_norm": 3.486333131790161, + "learning_rate": 4.439042599675067e-07, + "loss": 0.3666, + "step": 10265 + }, + { + "epoch": 4.853900709219858, + "grad_norm": 3.7585315704345703, + "learning_rate": 4.435494573062074e-07, + "loss": 0.3287, + "step": 10266 + }, + { + "epoch": 4.854373522458629, + 
"grad_norm": 3.3496108055114746, + "learning_rate": 4.4319478269106625e-07, + "loss": 0.4021, + "step": 10267 + }, + { + "epoch": 4.8548463356974, + "grad_norm": 3.4681267738342285, + "learning_rate": 4.428402361441672e-07, + "loss": 0.3119, + "step": 10268 + }, + { + "epoch": 4.85531914893617, + "grad_norm": 2.9935829639434814, + "learning_rate": 4.4248581768758567e-07, + "loss": 0.305, + "step": 10269 + }, + { + "epoch": 4.855791962174941, + "grad_norm": 3.5839056968688965, + "learning_rate": 4.42131527343391e-07, + "loss": 0.4095, + "step": 10270 + }, + { + "epoch": 4.856264775413711, + "grad_norm": 3.088690757751465, + "learning_rate": 4.4177736513364237e-07, + "loss": 0.3391, + "step": 10271 + }, + { + "epoch": 4.856737588652482, + "grad_norm": 3.2721431255340576, + "learning_rate": 4.414233310803917e-07, + "loss": 0.3741, + "step": 10272 + }, + { + "epoch": 4.857210401891253, + "grad_norm": 3.108041524887085, + "learning_rate": 4.4106942520568437e-07, + "loss": 0.4041, + "step": 10273 + }, + { + "epoch": 4.857683215130024, + "grad_norm": 3.0035696029663086, + "learning_rate": 4.407156475315549e-07, + "loss": 0.3408, + "step": 10274 + }, + { + "epoch": 4.858156028368795, + "grad_norm": 3.0572783946990967, + "learning_rate": 4.4036199808003334e-07, + "loss": 0.3207, + "step": 10275 + }, + { + "epoch": 4.858628841607565, + "grad_norm": 3.1695926189422607, + "learning_rate": 4.4000847687313857e-07, + "loss": 0.3605, + "step": 10276 + }, + { + "epoch": 4.859101654846336, + "grad_norm": 3.690382957458496, + "learning_rate": 4.396550839328828e-07, + "loss": 0.4076, + "step": 10277 + }, + { + "epoch": 4.859574468085106, + "grad_norm": 3.271988868713379, + "learning_rate": 4.393018192812712e-07, + "loss": 0.4169, + "step": 10278 + }, + { + "epoch": 4.860047281323877, + "grad_norm": 2.8622982501983643, + "learning_rate": 4.389486829402986e-07, + "loss": 0.3114, + "step": 10279 + }, + { + "epoch": 4.860520094562648, + "grad_norm": 3.3875632286071777, + "learning_rate": 4.385956749319548e-07, + "loss": 0.3664, + "step": 10280 + }, + { + "epoch": 4.8609929078014185, + "grad_norm": 2.98962664604187, + "learning_rate": 4.382427952782195e-07, + "loss": 0.314, + "step": 10281 + }, + { + "epoch": 4.861465721040189, + "grad_norm": 2.899529457092285, + "learning_rate": 4.3789004400106473e-07, + "loss": 0.3588, + "step": 10282 + }, + { + "epoch": 4.86193853427896, + "grad_norm": 3.11767578125, + "learning_rate": 4.3753742112245476e-07, + "loss": 0.3311, + "step": 10283 + }, + { + "epoch": 4.862411347517731, + "grad_norm": 2.9610254764556885, + "learning_rate": 4.3718492666434576e-07, + "loss": 0.3234, + "step": 10284 + }, + { + "epoch": 4.862884160756501, + "grad_norm": 2.9350297451019287, + "learning_rate": 4.368325606486859e-07, + "loss": 0.3086, + "step": 10285 + }, + { + "epoch": 4.863356973995272, + "grad_norm": 3.0126571655273438, + "learning_rate": 4.3648032309741626e-07, + "loss": 0.3033, + "step": 10286 + }, + { + "epoch": 4.863829787234042, + "grad_norm": 3.0580496788024902, + "learning_rate": 4.3612821403246795e-07, + "loss": 0.3631, + "step": 10287 + }, + { + "epoch": 4.864302600472813, + "grad_norm": 2.9186129570007324, + "learning_rate": 4.3577623347576676e-07, + "loss": 0.3449, + "step": 10288 + }, + { + "epoch": 4.864775413711584, + "grad_norm": 3.146562099456787, + "learning_rate": 4.354243814492282e-07, + "loss": 0.369, + "step": 10289 + }, + { + "epoch": 4.865248226950355, + "grad_norm": 2.646812915802002, + "learning_rate": 4.350726579747597e-07, + "loss": 0.331, + "step": 10290 + }, 
+ { + "epoch": 4.8657210401891255, + "grad_norm": 3.2851274013519287, + "learning_rate": 4.3472106307426293e-07, + "loss": 0.3445, + "step": 10291 + }, + { + "epoch": 4.866193853427896, + "grad_norm": 3.144446849822998, + "learning_rate": 4.34369596769629e-07, + "loss": 0.3687, + "step": 10292 + }, + { + "epoch": 4.866666666666667, + "grad_norm": 3.01517915725708, + "learning_rate": 4.3401825908274353e-07, + "loss": 0.3282, + "step": 10293 + }, + { + "epoch": 4.867139479905437, + "grad_norm": 3.171759605407715, + "learning_rate": 4.33667050035482e-07, + "loss": 0.351, + "step": 10294 + }, + { + "epoch": 4.867612293144208, + "grad_norm": 3.5374269485473633, + "learning_rate": 4.333159696497119e-07, + "loss": 0.3586, + "step": 10295 + }, + { + "epoch": 4.868085106382979, + "grad_norm": 3.4506356716156006, + "learning_rate": 4.3296501794729494e-07, + "loss": 0.4076, + "step": 10296 + }, + { + "epoch": 4.868557919621749, + "grad_norm": 3.348048448562622, + "learning_rate": 4.326141949500826e-07, + "loss": 0.3256, + "step": 10297 + }, + { + "epoch": 4.86903073286052, + "grad_norm": 3.235438108444214, + "learning_rate": 4.322635006799192e-07, + "loss": 0.3215, + "step": 10298 + }, + { + "epoch": 4.869503546099291, + "grad_norm": 3.2025554180145264, + "learning_rate": 4.319129351586407e-07, + "loss": 0.335, + "step": 10299 + }, + { + "epoch": 4.869976359338062, + "grad_norm": 3.0318121910095215, + "learning_rate": 4.315624984080749e-07, + "loss": 0.3304, + "step": 10300 + }, + { + "epoch": 4.870449172576832, + "grad_norm": 2.9115359783172607, + "learning_rate": 4.312121904500433e-07, + "loss": 0.3459, + "step": 10301 + }, + { + "epoch": 4.8709219858156025, + "grad_norm": 3.41164493560791, + "learning_rate": 4.3086201130635633e-07, + "loss": 0.3846, + "step": 10302 + }, + { + "epoch": 4.871394799054373, + "grad_norm": 3.5832016468048096, + "learning_rate": 4.305119609988198e-07, + "loss": 0.3422, + "step": 10303 + }, + { + "epoch": 4.871867612293144, + "grad_norm": 3.5244979858398438, + "learning_rate": 4.30162039549229e-07, + "loss": 0.3862, + "step": 10304 + }, + { + "epoch": 4.872340425531915, + "grad_norm": 3.0881710052490234, + "learning_rate": 4.298122469793714e-07, + "loss": 0.358, + "step": 10305 + }, + { + "epoch": 4.872813238770686, + "grad_norm": 3.3237557411193848, + "learning_rate": 4.294625833110283e-07, + "loss": 0.3742, + "step": 10306 + }, + { + "epoch": 4.8732860520094565, + "grad_norm": 3.1959686279296875, + "learning_rate": 4.291130485659711e-07, + "loss": 0.3426, + "step": 10307 + }, + { + "epoch": 4.873758865248227, + "grad_norm": 3.1890714168548584, + "learning_rate": 4.2876364276596333e-07, + "loss": 0.3131, + "step": 10308 + }, + { + "epoch": 4.874231678486998, + "grad_norm": 2.9387660026550293, + "learning_rate": 4.284143659327619e-07, + "loss": 0.3227, + "step": 10309 + }, + { + "epoch": 4.874704491725768, + "grad_norm": 3.6868603229522705, + "learning_rate": 4.2806521808811367e-07, + "loss": 0.3159, + "step": 10310 + }, + { + "epoch": 4.875177304964539, + "grad_norm": 3.1396310329437256, + "learning_rate": 4.277161992537596e-07, + "loss": 0.3757, + "step": 10311 + }, + { + "epoch": 4.87565011820331, + "grad_norm": 3.4745748043060303, + "learning_rate": 4.273673094514313e-07, + "loss": 0.347, + "step": 10312 + }, + { + "epoch": 4.87612293144208, + "grad_norm": 3.1869146823883057, + "learning_rate": 4.270185487028525e-07, + "loss": 0.3364, + "step": 10313 + }, + { + "epoch": 4.876595744680851, + "grad_norm": 2.8646297454833984, + "learning_rate": 4.2666991702973807e-07, + 
"loss": 0.2987, + "step": 10314 + }, + { + "epoch": 4.877068557919622, + "grad_norm": 3.3483452796936035, + "learning_rate": 4.263214144537975e-07, + "loss": 0.307, + "step": 10315 + }, + { + "epoch": 4.877541371158393, + "grad_norm": 2.8557562828063965, + "learning_rate": 4.259730409967294e-07, + "loss": 0.3406, + "step": 10316 + }, + { + "epoch": 4.878014184397163, + "grad_norm": 3.351121664047241, + "learning_rate": 4.256247966802257e-07, + "loss": 0.3571, + "step": 10317 + }, + { + "epoch": 4.8784869976359335, + "grad_norm": 3.1691417694091797, + "learning_rate": 4.252766815259696e-07, + "loss": 0.3686, + "step": 10318 + }, + { + "epoch": 4.878959810874704, + "grad_norm": 2.957632303237915, + "learning_rate": 4.249286955556378e-07, + "loss": 0.3055, + "step": 10319 + }, + { + "epoch": 4.879432624113475, + "grad_norm": 3.234708070755005, + "learning_rate": 4.2458083879089645e-07, + "loss": 0.3733, + "step": 10320 + }, + { + "epoch": 4.879905437352246, + "grad_norm": 3.469207525253296, + "learning_rate": 4.242331112534065e-07, + "loss": 0.3758, + "step": 10321 + }, + { + "epoch": 4.880378250591017, + "grad_norm": 3.2442891597747803, + "learning_rate": 4.2388551296481896e-07, + "loss": 0.3515, + "step": 10322 + }, + { + "epoch": 4.8808510638297875, + "grad_norm": 3.3709537982940674, + "learning_rate": 4.235380439467762e-07, + "loss": 0.421, + "step": 10323 + }, + { + "epoch": 4.881323877068558, + "grad_norm": 2.730891227722168, + "learning_rate": 4.231907042209149e-07, + "loss": 0.3105, + "step": 10324 + }, + { + "epoch": 4.881796690307329, + "grad_norm": 3.6933813095092773, + "learning_rate": 4.228434938088616e-07, + "loss": 0.338, + "step": 10325 + }, + { + "epoch": 4.882269503546099, + "grad_norm": 3.2480294704437256, + "learning_rate": 4.224964127322362e-07, + "loss": 0.3695, + "step": 10326 + }, + { + "epoch": 4.88274231678487, + "grad_norm": 3.229762554168701, + "learning_rate": 4.2214946101264976e-07, + "loss": 0.3768, + "step": 10327 + }, + { + "epoch": 4.883215130023641, + "grad_norm": 3.3844475746154785, + "learning_rate": 4.218026386717047e-07, + "loss": 0.3441, + "step": 10328 + }, + { + "epoch": 4.883687943262411, + "grad_norm": 3.159759283065796, + "learning_rate": 4.2145594573099745e-07, + "loss": 0.3459, + "step": 10329 + }, + { + "epoch": 4.884160756501182, + "grad_norm": 3.5672366619110107, + "learning_rate": 4.21109382212114e-07, + "loss": 0.3908, + "step": 10330 + }, + { + "epoch": 4.884633569739953, + "grad_norm": 3.2481353282928467, + "learning_rate": 4.2076294813663405e-07, + "loss": 0.3778, + "step": 10331 + }, + { + "epoch": 4.885106382978723, + "grad_norm": 3.3311941623687744, + "learning_rate": 4.2041664352612785e-07, + "loss": 0.3171, + "step": 10332 + }, + { + "epoch": 4.885579196217494, + "grad_norm": 3.4712841510772705, + "learning_rate": 4.2007046840215783e-07, + "loss": 0.3858, + "step": 10333 + }, + { + "epoch": 4.8860520094562645, + "grad_norm": 3.1591062545776367, + "learning_rate": 4.197244227862804e-07, + "loss": 0.327, + "step": 10334 + }, + { + "epoch": 4.886524822695035, + "grad_norm": 3.400400400161743, + "learning_rate": 4.1937850670004136e-07, + "loss": 0.3231, + "step": 10335 + }, + { + "epoch": 4.886997635933806, + "grad_norm": 2.9156908988952637, + "learning_rate": 4.190327201649788e-07, + "loss": 0.2834, + "step": 10336 + }, + { + "epoch": 4.887470449172577, + "grad_norm": 3.0125153064727783, + "learning_rate": 4.1868706320262467e-07, + "loss": 0.3143, + "step": 10337 + }, + { + "epoch": 4.887943262411348, + "grad_norm": 2.656107187271118, + 
"learning_rate": 4.183415358345003e-07, + "loss": 0.3348, + "step": 10338 + }, + { + "epoch": 4.8884160756501185, + "grad_norm": 3.0910565853118896, + "learning_rate": 4.17996138082121e-07, + "loss": 0.3212, + "step": 10339 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 3.1303164958953857, + "learning_rate": 4.1765086996699315e-07, + "loss": 0.3573, + "step": 10340 + }, + { + "epoch": 4.889361702127659, + "grad_norm": 3.504901885986328, + "learning_rate": 4.173057315106141e-07, + "loss": 0.3912, + "step": 10341 + }, + { + "epoch": 4.88983451536643, + "grad_norm": 2.994338035583496, + "learning_rate": 4.1696072273447547e-07, + "loss": 0.3896, + "step": 10342 + }, + { + "epoch": 4.890307328605201, + "grad_norm": 3.0409624576568604, + "learning_rate": 4.1661584366005814e-07, + "loss": 0.3109, + "step": 10343 + }, + { + "epoch": 4.890780141843972, + "grad_norm": 3.479952096939087, + "learning_rate": 4.1627109430883743e-07, + "loss": 0.3265, + "step": 10344 + }, + { + "epoch": 4.891252955082742, + "grad_norm": 3.0288894176483154, + "learning_rate": 4.159264747022787e-07, + "loss": 0.3345, + "step": 10345 + }, + { + "epoch": 4.891725768321513, + "grad_norm": 3.7433063983917236, + "learning_rate": 4.1558198486184005e-07, + "loss": 0.3888, + "step": 10346 + }, + { + "epoch": 4.892198581560284, + "grad_norm": 3.431964635848999, + "learning_rate": 4.152376248089715e-07, + "loss": 0.3062, + "step": 10347 + }, + { + "epoch": 4.892671394799054, + "grad_norm": 3.3993113040924072, + "learning_rate": 4.1489339456511376e-07, + "loss": 0.3955, + "step": 10348 + }, + { + "epoch": 4.893144208037825, + "grad_norm": 3.09287428855896, + "learning_rate": 4.145492941517024e-07, + "loss": 0.2857, + "step": 10349 + }, + { + "epoch": 4.8936170212765955, + "grad_norm": 3.355915069580078, + "learning_rate": 4.1420532359016166e-07, + "loss": 0.3403, + "step": 10350 + }, + { + "epoch": 4.894089834515366, + "grad_norm": 4.00920295715332, + "learning_rate": 4.1386148290190915e-07, + "loss": 0.3455, + "step": 10351 + }, + { + "epoch": 4.894562647754137, + "grad_norm": 3.408311605453491, + "learning_rate": 4.1351777210835524e-07, + "loss": 0.3606, + "step": 10352 + }, + { + "epoch": 4.895035460992908, + "grad_norm": 3.031616449356079, + "learning_rate": 4.1317419123090007e-07, + "loss": 0.3696, + "step": 10353 + }, + { + "epoch": 4.895508274231679, + "grad_norm": 3.555751085281372, + "learning_rate": 4.1283074029093814e-07, + "loss": 0.3197, + "step": 10354 + }, + { + "epoch": 4.8959810874704495, + "grad_norm": 3.3839752674102783, + "learning_rate": 4.124874193098541e-07, + "loss": 0.3744, + "step": 10355 + }, + { + "epoch": 4.89645390070922, + "grad_norm": 3.514296531677246, + "learning_rate": 4.1214422830902406e-07, + "loss": 0.29, + "step": 10356 + }, + { + "epoch": 4.89692671394799, + "grad_norm": 3.056325674057007, + "learning_rate": 4.1180116730981905e-07, + "loss": 0.371, + "step": 10357 + }, + { + "epoch": 4.897399527186761, + "grad_norm": 3.7567055225372314, + "learning_rate": 4.1145823633359865e-07, + "loss": 0.4105, + "step": 10358 + }, + { + "epoch": 4.897872340425532, + "grad_norm": 3.0050766468048096, + "learning_rate": 4.111154354017152e-07, + "loss": 0.3262, + "step": 10359 + }, + { + "epoch": 4.898345153664303, + "grad_norm": 3.2767333984375, + "learning_rate": 4.1077276453551476e-07, + "loss": 0.3253, + "step": 10360 + }, + { + "epoch": 4.898817966903073, + "grad_norm": 4.133147239685059, + "learning_rate": 4.1043022375633347e-07, + "loss": 0.4549, + "step": 10361 + }, + { + "epoch": 
4.899290780141844, + "grad_norm": 3.372962236404419, + "learning_rate": 4.1008781308549934e-07, + "loss": 0.315, + "step": 10362 + }, + { + "epoch": 4.899763593380615, + "grad_norm": 3.4167628288269043, + "learning_rate": 4.0974553254433335e-07, + "loss": 0.3832, + "step": 10363 + }, + { + "epoch": 4.900236406619385, + "grad_norm": 3.103311061859131, + "learning_rate": 4.094033821541468e-07, + "loss": 0.3347, + "step": 10364 + }, + { + "epoch": 4.900709219858156, + "grad_norm": 2.95872163772583, + "learning_rate": 4.0906136193624547e-07, + "loss": 0.2861, + "step": 10365 + }, + { + "epoch": 4.9011820330969265, + "grad_norm": 3.1035397052764893, + "learning_rate": 4.087194719119239e-07, + "loss": 0.3089, + "step": 10366 + }, + { + "epoch": 4.901654846335697, + "grad_norm": 3.0228095054626465, + "learning_rate": 4.083777121024715e-07, + "loss": 0.38, + "step": 10367 + }, + { + "epoch": 4.902127659574468, + "grad_norm": 3.3528707027435303, + "learning_rate": 4.080360825291674e-07, + "loss": 0.3614, + "step": 10368 + }, + { + "epoch": 4.902600472813239, + "grad_norm": 3.5866968631744385, + "learning_rate": 4.076945832132828e-07, + "loss": 0.3751, + "step": 10369 + }, + { + "epoch": 4.90307328605201, + "grad_norm": 3.388880729675293, + "learning_rate": 4.0735321417608276e-07, + "loss": 0.3358, + "step": 10370 + }, + { + "epoch": 4.9035460992907804, + "grad_norm": 3.5489447116851807, + "learning_rate": 4.070119754388213e-07, + "loss": 0.3437, + "step": 10371 + }, + { + "epoch": 4.904018912529551, + "grad_norm": 2.8234825134277344, + "learning_rate": 4.0667086702274733e-07, + "loss": 0.3164, + "step": 10372 + }, + { + "epoch": 4.904491725768321, + "grad_norm": 3.337445020675659, + "learning_rate": 4.0632988894909965e-07, + "loss": 0.3213, + "step": 10373 + }, + { + "epoch": 4.904964539007092, + "grad_norm": 3.639477491378784, + "learning_rate": 4.0598904123910847e-07, + "loss": 0.3642, + "step": 10374 + }, + { + "epoch": 4.905437352245863, + "grad_norm": 3.101829767227173, + "learning_rate": 4.0564832391399857e-07, + "loss": 0.3415, + "step": 10375 + }, + { + "epoch": 4.9059101654846335, + "grad_norm": 2.8291256427764893, + "learning_rate": 4.05307736994984e-07, + "loss": 0.3014, + "step": 10376 + }, + { + "epoch": 4.906382978723404, + "grad_norm": 2.8689401149749756, + "learning_rate": 4.049672805032717e-07, + "loss": 0.3151, + "step": 10377 + }, + { + "epoch": 4.906855791962175, + "grad_norm": 3.468038320541382, + "learning_rate": 4.046269544600598e-07, + "loss": 0.3956, + "step": 10378 + }, + { + "epoch": 4.907328605200946, + "grad_norm": 3.5246312618255615, + "learning_rate": 4.042867588865401e-07, + "loss": 0.3003, + "step": 10379 + }, + { + "epoch": 4.907801418439716, + "grad_norm": 3.273010730743408, + "learning_rate": 4.039466938038944e-07, + "loss": 0.3036, + "step": 10380 + }, + { + "epoch": 4.908274231678487, + "grad_norm": 3.064718008041382, + "learning_rate": 4.0360675923329733e-07, + "loss": 0.3353, + "step": 10381 + }, + { + "epoch": 4.908747044917257, + "grad_norm": 3.413242816925049, + "learning_rate": 4.032669551959142e-07, + "loss": 0.354, + "step": 10382 + }, + { + "epoch": 4.909219858156028, + "grad_norm": 3.136293411254883, + "learning_rate": 4.029272817129046e-07, + "loss": 0.331, + "step": 10383 + }, + { + "epoch": 4.909692671394799, + "grad_norm": 3.0966274738311768, + "learning_rate": 4.025877388054172e-07, + "loss": 0.2878, + "step": 10384 + }, + { + "epoch": 4.91016548463357, + "grad_norm": 3.334113836288452, + "learning_rate": 4.022483264945948e-07, + "loss": 0.3276, 
+ "step": 10385 + }, + { + "epoch": 4.910638297872341, + "grad_norm": 3.2662229537963867, + "learning_rate": 4.019090448015711e-07, + "loss": 0.3265, + "step": 10386 + }, + { + "epoch": 4.911111111111111, + "grad_norm": 3.134220838546753, + "learning_rate": 4.0156989374747047e-07, + "loss": 0.3684, + "step": 10387 + }, + { + "epoch": 4.911583924349882, + "grad_norm": 3.803694725036621, + "learning_rate": 4.012308733534118e-07, + "loss": 0.3394, + "step": 10388 + }, + { + "epoch": 4.912056737588652, + "grad_norm": 2.788388252258301, + "learning_rate": 4.008919836405034e-07, + "loss": 0.2835, + "step": 10389 + }, + { + "epoch": 4.912529550827423, + "grad_norm": 3.3408966064453125, + "learning_rate": 4.005532246298474e-07, + "loss": 0.3694, + "step": 10390 + }, + { + "epoch": 4.913002364066194, + "grad_norm": 2.913114547729492, + "learning_rate": 4.0021459634253605e-07, + "loss": 0.3456, + "step": 10391 + }, + { + "epoch": 4.9134751773049645, + "grad_norm": 3.778111457824707, + "learning_rate": 3.9987609879965414e-07, + "loss": 0.3887, + "step": 10392 + }, + { + "epoch": 4.913947990543735, + "grad_norm": 2.871978282928467, + "learning_rate": 3.995377320222796e-07, + "loss": 0.28, + "step": 10393 + }, + { + "epoch": 4.914420803782506, + "grad_norm": 3.5189783573150635, + "learning_rate": 3.9919949603147987e-07, + "loss": 0.3802, + "step": 10394 + }, + { + "epoch": 4.914893617021277, + "grad_norm": 3.381014585494995, + "learning_rate": 3.9886139084831607e-07, + "loss": 0.3661, + "step": 10395 + }, + { + "epoch": 4.915366430260047, + "grad_norm": 2.908207654953003, + "learning_rate": 3.9852341649384006e-07, + "loss": 0.3228, + "step": 10396 + }, + { + "epoch": 4.915839243498818, + "grad_norm": 3.4134814739227295, + "learning_rate": 3.981855729890957e-07, + "loss": 0.3149, + "step": 10397 + }, + { + "epoch": 4.916312056737588, + "grad_norm": 4.496891975402832, + "learning_rate": 3.9784786035512004e-07, + "loss": 0.3516, + "step": 10398 + }, + { + "epoch": 4.916784869976359, + "grad_norm": 3.2910919189453125, + "learning_rate": 3.975102786129398e-07, + "loss": 0.3329, + "step": 10399 + }, + { + "epoch": 4.91725768321513, + "grad_norm": 3.6607260704040527, + "learning_rate": 3.97172827783576e-07, + "loss": 0.3878, + "step": 10400 + }, + { + "epoch": 4.917730496453901, + "grad_norm": 3.1500742435455322, + "learning_rate": 3.9683550788803983e-07, + "loss": 0.3323, + "step": 10401 + }, + { + "epoch": 4.918203309692672, + "grad_norm": 3.263714075088501, + "learning_rate": 3.964983189473337e-07, + "loss": 0.352, + "step": 10402 + }, + { + "epoch": 4.918676122931442, + "grad_norm": 3.433868408203125, + "learning_rate": 3.961612609824542e-07, + "loss": 0.3308, + "step": 10403 + }, + { + "epoch": 4.919148936170213, + "grad_norm": 3.3086423873901367, + "learning_rate": 3.95824334014388e-07, + "loss": 0.3641, + "step": 10404 + }, + { + "epoch": 4.919621749408983, + "grad_norm": 3.2854621410369873, + "learning_rate": 3.954875380641135e-07, + "loss": 0.3405, + "step": 10405 + }, + { + "epoch": 4.920094562647754, + "grad_norm": 3.1408650875091553, + "learning_rate": 3.9515087315260244e-07, + "loss": 0.3103, + "step": 10406 + }, + { + "epoch": 4.920567375886525, + "grad_norm": 2.9340312480926514, + "learning_rate": 3.948143393008164e-07, + "loss": 0.3405, + "step": 10407 + }, + { + "epoch": 4.9210401891252955, + "grad_norm": 3.525876522064209, + "learning_rate": 3.944779365297113e-07, + "loss": 0.3464, + "step": 10408 + }, + { + "epoch": 4.921513002364066, + "grad_norm": 3.26991605758667, + "learning_rate": 
3.9414166486023253e-07, + "loss": 0.3529, + "step": 10409 + }, + { + "epoch": 4.921985815602837, + "grad_norm": 2.7669694423675537, + "learning_rate": 3.938055243133182e-07, + "loss": 0.3242, + "step": 10410 + }, + { + "epoch": 4.922458628841608, + "grad_norm": 2.8268136978149414, + "learning_rate": 3.934695149098988e-07, + "loss": 0.3086, + "step": 10411 + }, + { + "epoch": 4.922931442080378, + "grad_norm": 3.119053602218628, + "learning_rate": 3.931336366708952e-07, + "loss": 0.3065, + "step": 10412 + }, + { + "epoch": 4.923404255319149, + "grad_norm": 3.1537275314331055, + "learning_rate": 3.9279788961722215e-07, + "loss": 0.3325, + "step": 10413 + }, + { + "epoch": 4.923877068557919, + "grad_norm": 3.1365256309509277, + "learning_rate": 3.9246227376978476e-07, + "loss": 0.4139, + "step": 10414 + }, + { + "epoch": 4.92434988179669, + "grad_norm": 3.3495218753814697, + "learning_rate": 3.921267891494798e-07, + "loss": 0.3463, + "step": 10415 + }, + { + "epoch": 4.924822695035461, + "grad_norm": 3.2402634620666504, + "learning_rate": 3.9179143577719736e-07, + "loss": 0.3499, + "step": 10416 + }, + { + "epoch": 4.925295508274232, + "grad_norm": 2.986429452896118, + "learning_rate": 3.914562136738176e-07, + "loss": 0.3326, + "step": 10417 + }, + { + "epoch": 4.925768321513003, + "grad_norm": 3.1276674270629883, + "learning_rate": 3.9112112286021407e-07, + "loss": 0.3087, + "step": 10418 + }, + { + "epoch": 4.926241134751773, + "grad_norm": 3.1767871379852295, + "learning_rate": 3.9078616335725126e-07, + "loss": 0.3804, + "step": 10419 + }, + { + "epoch": 4.926713947990544, + "grad_norm": 3.1657216548919678, + "learning_rate": 3.904513351857847e-07, + "loss": 0.333, + "step": 10420 + }, + { + "epoch": 4.927186761229314, + "grad_norm": 3.005009174346924, + "learning_rate": 3.901166383666641e-07, + "loss": 0.3583, + "step": 10421 + }, + { + "epoch": 4.927659574468085, + "grad_norm": 2.900146722793579, + "learning_rate": 3.897820729207283e-07, + "loss": 0.3061, + "step": 10422 + }, + { + "epoch": 4.928132387706856, + "grad_norm": 3.2418317794799805, + "learning_rate": 3.8944763886881037e-07, + "loss": 0.3822, + "step": 10423 + }, + { + "epoch": 4.9286052009456265, + "grad_norm": 3.1222848892211914, + "learning_rate": 3.8911333623173344e-07, + "loss": 0.3167, + "step": 10424 + }, + { + "epoch": 4.929078014184397, + "grad_norm": 2.727388858795166, + "learning_rate": 3.8877916503031325e-07, + "loss": 0.2977, + "step": 10425 + }, + { + "epoch": 4.929550827423168, + "grad_norm": 3.190159797668457, + "learning_rate": 3.884451252853569e-07, + "loss": 0.3399, + "step": 10426 + }, + { + "epoch": 4.930023640661939, + "grad_norm": 3.253791570663452, + "learning_rate": 3.8811121701766373e-07, + "loss": 0.3806, + "step": 10427 + }, + { + "epoch": 4.930496453900709, + "grad_norm": 3.4284887313842773, + "learning_rate": 3.8777744024802414e-07, + "loss": 0.3437, + "step": 10428 + }, + { + "epoch": 4.93096926713948, + "grad_norm": 3.3665032386779785, + "learning_rate": 3.874437949972221e-07, + "loss": 0.375, + "step": 10429 + }, + { + "epoch": 4.93144208037825, + "grad_norm": 3.1024677753448486, + "learning_rate": 3.8711028128603084e-07, + "loss": 0.3493, + "step": 10430 + }, + { + "epoch": 4.931914893617021, + "grad_norm": 3.599743604660034, + "learning_rate": 3.867768991352186e-07, + "loss": 0.3852, + "step": 10431 + }, + { + "epoch": 4.932387706855792, + "grad_norm": 3.3676376342773438, + "learning_rate": 3.8644364856554236e-07, + "loss": 0.3489, + "step": 10432 + }, + { + "epoch": 4.932860520094563, + 
"grad_norm": 3.2186801433563232, + "learning_rate": 3.861105295977521e-07, + "loss": 0.3413, + "step": 10433 + }, + { + "epoch": 4.933333333333334, + "grad_norm": 3.3672704696655273, + "learning_rate": 3.8577754225259055e-07, + "loss": 0.3392, + "step": 10434 + }, + { + "epoch": 4.933806146572104, + "grad_norm": 3.4285950660705566, + "learning_rate": 3.854446865507902e-07, + "loss": 0.3188, + "step": 10435 + }, + { + "epoch": 4.934278959810875, + "grad_norm": 3.187617063522339, + "learning_rate": 3.8511196251307783e-07, + "loss": 0.3258, + "step": 10436 + }, + { + "epoch": 4.934751773049645, + "grad_norm": 2.9744882583618164, + "learning_rate": 3.847793701601699e-07, + "loss": 0.3879, + "step": 10437 + }, + { + "epoch": 4.935224586288416, + "grad_norm": 2.9745848178863525, + "learning_rate": 3.844469095127751e-07, + "loss": 0.3178, + "step": 10438 + }, + { + "epoch": 4.935697399527187, + "grad_norm": 3.7419471740722656, + "learning_rate": 3.841145805915955e-07, + "loss": 0.3167, + "step": 10439 + }, + { + "epoch": 4.9361702127659575, + "grad_norm": 4.295339107513428, + "learning_rate": 3.837823834173232e-07, + "loss": 0.3209, + "step": 10440 + }, + { + "epoch": 4.936643026004728, + "grad_norm": 3.4612984657287598, + "learning_rate": 3.8345031801064217e-07, + "loss": 0.3351, + "step": 10441 + }, + { + "epoch": 4.937115839243499, + "grad_norm": 3.0626909732818604, + "learning_rate": 3.8311838439222953e-07, + "loss": 0.3395, + "step": 10442 + }, + { + "epoch": 4.93758865248227, + "grad_norm": 3.606682538986206, + "learning_rate": 3.827865825827518e-07, + "loss": 0.4112, + "step": 10443 + }, + { + "epoch": 4.93806146572104, + "grad_norm": 3.3908627033233643, + "learning_rate": 3.8245491260287064e-07, + "loss": 0.3367, + "step": 10444 + }, + { + "epoch": 4.938534278959811, + "grad_norm": 2.8598084449768066, + "learning_rate": 3.821233744732364e-07, + "loss": 0.3408, + "step": 10445 + }, + { + "epoch": 4.939007092198581, + "grad_norm": 3.207010269165039, + "learning_rate": 3.8179196821449354e-07, + "loss": 0.3301, + "step": 10446 + }, + { + "epoch": 4.939479905437352, + "grad_norm": 3.018414258956909, + "learning_rate": 3.8146069384727674e-07, + "loss": 0.3622, + "step": 10447 + }, + { + "epoch": 4.939952718676123, + "grad_norm": 3.399415969848633, + "learning_rate": 3.811295513922125e-07, + "loss": 0.3525, + "step": 10448 + }, + { + "epoch": 4.940425531914894, + "grad_norm": 3.175705671310425, + "learning_rate": 3.807985408699208e-07, + "loss": 0.322, + "step": 10449 + }, + { + "epoch": 4.9408983451536646, + "grad_norm": 2.906064033508301, + "learning_rate": 3.804676623010109e-07, + "loss": 0.3246, + "step": 10450 + }, + { + "epoch": 4.941371158392435, + "grad_norm": 3.1224400997161865, + "learning_rate": 3.8013691570608634e-07, + "loss": 0.3607, + "step": 10451 + }, + { + "epoch": 4.941843971631206, + "grad_norm": 3.4386677742004395, + "learning_rate": 3.7980630110574067e-07, + "loss": 0.3315, + "step": 10452 + }, + { + "epoch": 4.942316784869976, + "grad_norm": 3.432509183883667, + "learning_rate": 3.794758185205594e-07, + "loss": 0.3713, + "step": 10453 + }, + { + "epoch": 4.942789598108747, + "grad_norm": 3.314802646636963, + "learning_rate": 3.7914546797112097e-07, + "loss": 0.3587, + "step": 10454 + }, + { + "epoch": 4.943262411347518, + "grad_norm": 2.9151065349578857, + "learning_rate": 3.788152494779948e-07, + "loss": 0.367, + "step": 10455 + }, + { + "epoch": 4.9437352245862884, + "grad_norm": 3.3444712162017822, + "learning_rate": 3.784851630617414e-07, + "loss": 0.4009, + "step": 
10456 + }, + { + "epoch": 4.944208037825059, + "grad_norm": 3.2677152156829834, + "learning_rate": 3.7815520874291494e-07, + "loss": 0.3553, + "step": 10457 + }, + { + "epoch": 4.94468085106383, + "grad_norm": 3.2326159477233887, + "learning_rate": 3.7782538654205946e-07, + "loss": 0.4008, + "step": 10458 + }, + { + "epoch": 4.945153664302601, + "grad_norm": 3.3304033279418945, + "learning_rate": 3.774956964797119e-07, + "loss": 0.3591, + "step": 10459 + }, + { + "epoch": 4.945626477541371, + "grad_norm": 3.038605213165283, + "learning_rate": 3.7716613857640026e-07, + "loss": 0.2907, + "step": 10460 + }, + { + "epoch": 4.9460992907801415, + "grad_norm": 3.016227960586548, + "learning_rate": 3.768367128526443e-07, + "loss": 0.2898, + "step": 10461 + }, + { + "epoch": 4.946572104018912, + "grad_norm": 3.354973316192627, + "learning_rate": 3.76507419328957e-07, + "loss": 0.3498, + "step": 10462 + }, + { + "epoch": 4.947044917257683, + "grad_norm": 3.5561892986297607, + "learning_rate": 3.761782580258408e-07, + "loss": 0.4384, + "step": 10463 + }, + { + "epoch": 4.947517730496454, + "grad_norm": 3.2498281002044678, + "learning_rate": 3.7584922896379244e-07, + "loss": 0.3289, + "step": 10464 + }, + { + "epoch": 4.947990543735225, + "grad_norm": 3.250598907470703, + "learning_rate": 3.755203321632986e-07, + "loss": 0.4104, + "step": 10465 + }, + { + "epoch": 4.9484633569739955, + "grad_norm": 2.8788363933563232, + "learning_rate": 3.7519156764483727e-07, + "loss": 0.2896, + "step": 10466 + }, + { + "epoch": 4.948936170212766, + "grad_norm": 3.068180561065674, + "learning_rate": 3.7486293542888075e-07, + "loss": 0.3346, + "step": 10467 + }, + { + "epoch": 4.949408983451537, + "grad_norm": 3.4533181190490723, + "learning_rate": 3.7453443553589043e-07, + "loss": 0.3917, + "step": 10468 + }, + { + "epoch": 4.949881796690307, + "grad_norm": 2.8812358379364014, + "learning_rate": 3.7420606798632104e-07, + "loss": 0.3276, + "step": 10469 + }, + { + "epoch": 4.950354609929078, + "grad_norm": 3.0952184200286865, + "learning_rate": 3.7387783280061875e-07, + "loss": 0.3261, + "step": 10470 + }, + { + "epoch": 4.950827423167849, + "grad_norm": 3.2409560680389404, + "learning_rate": 3.735497299992205e-07, + "loss": 0.3504, + "step": 10471 + }, + { + "epoch": 4.951300236406619, + "grad_norm": 3.3790557384490967, + "learning_rate": 3.73221759602557e-07, + "loss": 0.3316, + "step": 10472 + }, + { + "epoch": 4.95177304964539, + "grad_norm": 3.2161364555358887, + "learning_rate": 3.728939216310487e-07, + "loss": 0.3364, + "step": 10473 + }, + { + "epoch": 4.952245862884161, + "grad_norm": 3.3514342308044434, + "learning_rate": 3.7256621610510884e-07, + "loss": 0.3912, + "step": 10474 + }, + { + "epoch": 4.952718676122932, + "grad_norm": 2.7333486080169678, + "learning_rate": 3.722386430451422e-07, + "loss": 0.3145, + "step": 10475 + }, + { + "epoch": 4.953191489361702, + "grad_norm": 3.104905128479004, + "learning_rate": 3.719112024715449e-07, + "loss": 0.3599, + "step": 10476 + }, + { + "epoch": 4.9536643026004725, + "grad_norm": 3.16666579246521, + "learning_rate": 3.715838944047059e-07, + "loss": 0.3462, + "step": 10477 + }, + { + "epoch": 4.954137115839243, + "grad_norm": 3.078171491622925, + "learning_rate": 3.7125671886500514e-07, + "loss": 0.3119, + "step": 10478 + }, + { + "epoch": 4.954609929078014, + "grad_norm": 3.261456251144409, + "learning_rate": 3.709296758728137e-07, + "loss": 0.3959, + "step": 10479 + }, + { + "epoch": 4.955082742316785, + "grad_norm": 3.0302278995513916, + "learning_rate": 
3.706027654484962e-07, + "loss": 0.3526, + "step": 10480 + }, + { + "epoch": 4.955555555555556, + "grad_norm": 3.175342559814453, + "learning_rate": 3.702759876124068e-07, + "loss": 0.3237, + "step": 10481 + }, + { + "epoch": 4.9560283687943265, + "grad_norm": 3.4779844284057617, + "learning_rate": 3.699493423848938e-07, + "loss": 0.3075, + "step": 10482 + }, + { + "epoch": 4.956501182033097, + "grad_norm": 2.809904098510742, + "learning_rate": 3.69622829786295e-07, + "loss": 0.3238, + "step": 10483 + }, + { + "epoch": 4.956973995271868, + "grad_norm": 3.092604875564575, + "learning_rate": 3.692964498369406e-07, + "loss": 0.3344, + "step": 10484 + }, + { + "epoch": 4.957446808510638, + "grad_norm": 3.477560520172119, + "learning_rate": 3.689702025571543e-07, + "loss": 0.3525, + "step": 10485 + }, + { + "epoch": 4.957919621749409, + "grad_norm": 4.119097709655762, + "learning_rate": 3.6864408796724815e-07, + "loss": 0.3953, + "step": 10486 + }, + { + "epoch": 4.95839243498818, + "grad_norm": 3.1418824195861816, + "learning_rate": 3.6831810608752986e-07, + "loss": 0.3689, + "step": 10487 + }, + { + "epoch": 4.95886524822695, + "grad_norm": 3.1947824954986572, + "learning_rate": 3.6799225693829596e-07, + "loss": 0.3427, + "step": 10488 + }, + { + "epoch": 4.959338061465721, + "grad_norm": 3.196894884109497, + "learning_rate": 3.6766654053983554e-07, + "loss": 0.3138, + "step": 10489 + }, + { + "epoch": 4.959810874704492, + "grad_norm": 2.9747161865234375, + "learning_rate": 3.6734095691242975e-07, + "loss": 0.3336, + "step": 10490 + }, + { + "epoch": 4.960283687943263, + "grad_norm": 3.2788970470428467, + "learning_rate": 3.670155060763503e-07, + "loss": 0.3418, + "step": 10491 + }, + { + "epoch": 4.960756501182033, + "grad_norm": 3.1619482040405273, + "learning_rate": 3.6669018805186335e-07, + "loss": 0.3173, + "step": 10492 + }, + { + "epoch": 4.9612293144208035, + "grad_norm": 2.9894869327545166, + "learning_rate": 3.6636500285922386e-07, + "loss": 0.3057, + "step": 10493 + }, + { + "epoch": 4.961702127659574, + "grad_norm": 3.1162378787994385, + "learning_rate": 3.660399505186793e-07, + "loss": 0.3404, + "step": 10494 + }, + { + "epoch": 4.962174940898345, + "grad_norm": 2.811485528945923, + "learning_rate": 3.657150310504706e-07, + "loss": 0.3199, + "step": 10495 + }, + { + "epoch": 4.962647754137116, + "grad_norm": 2.8914854526519775, + "learning_rate": 3.653902444748278e-07, + "loss": 0.3666, + "step": 10496 + }, + { + "epoch": 4.963120567375887, + "grad_norm": 4.075942516326904, + "learning_rate": 3.6506559081197517e-07, + "loss": 0.3737, + "step": 10497 + }, + { + "epoch": 4.9635933806146575, + "grad_norm": 4.395053863525391, + "learning_rate": 3.647410700821266e-07, + "loss": 0.338, + "step": 10498 + }, + { + "epoch": 4.964066193853428, + "grad_norm": 2.89145565032959, + "learning_rate": 3.644166823054884e-07, + "loss": 0.2893, + "step": 10499 + }, + { + "epoch": 4.964539007092198, + "grad_norm": 2.8189663887023926, + "learning_rate": 3.640924275022595e-07, + "loss": 0.289, + "step": 10500 + }, + { + "epoch": 4.965011820330969, + "grad_norm": 3.0912365913391113, + "learning_rate": 3.6376830569262946e-07, + "loss": 0.3474, + "step": 10501 + }, + { + "epoch": 4.96548463356974, + "grad_norm": 3.3087918758392334, + "learning_rate": 3.634443168967797e-07, + "loss": 0.3104, + "step": 10502 + }, + { + "epoch": 4.965957446808511, + "grad_norm": 2.855022430419922, + "learning_rate": 3.6312046113488403e-07, + "loss": 0.3328, + "step": 10503 + }, + { + "epoch": 4.966430260047281, + 
"grad_norm": 3.5445404052734375, + "learning_rate": 3.627967384271072e-07, + "loss": 0.322, + "step": 10504 + }, + { + "epoch": 4.966903073286052, + "grad_norm": 3.526319742202759, + "learning_rate": 3.624731487936065e-07, + "loss": 0.3264, + "step": 10505 + }, + { + "epoch": 4.967375886524823, + "grad_norm": 3.521204948425293, + "learning_rate": 3.621496922545298e-07, + "loss": 0.369, + "step": 10506 + }, + { + "epoch": 4.967848699763593, + "grad_norm": 2.8956806659698486, + "learning_rate": 3.618263688300172e-07, + "loss": 0.3396, + "step": 10507 + }, + { + "epoch": 4.968321513002364, + "grad_norm": 3.155200958251953, + "learning_rate": 3.615031785402015e-07, + "loss": 0.354, + "step": 10508 + }, + { + "epoch": 4.9687943262411345, + "grad_norm": 3.2896533012390137, + "learning_rate": 3.611801214052052e-07, + "loss": 0.3034, + "step": 10509 + }, + { + "epoch": 4.969267139479905, + "grad_norm": 3.0860259532928467, + "learning_rate": 3.608571974451447e-07, + "loss": 0.3354, + "step": 10510 + }, + { + "epoch": 4.969739952718676, + "grad_norm": 3.3194656372070312, + "learning_rate": 3.6053440668012697e-07, + "loss": 0.3714, + "step": 10511 + }, + { + "epoch": 4.970212765957447, + "grad_norm": 2.9831063747406006, + "learning_rate": 3.602117491302498e-07, + "loss": 0.311, + "step": 10512 + }, + { + "epoch": 4.970685579196218, + "grad_norm": 3.175940752029419, + "learning_rate": 3.59889224815605e-07, + "loss": 0.3658, + "step": 10513 + }, + { + "epoch": 4.9711583924349885, + "grad_norm": 3.051496982574463, + "learning_rate": 3.5956683375627324e-07, + "loss": 0.3458, + "step": 10514 + }, + { + "epoch": 4.971631205673759, + "grad_norm": 3.0264453887939453, + "learning_rate": 3.592445759723298e-07, + "loss": 0.2843, + "step": 10515 + }, + { + "epoch": 4.972104018912529, + "grad_norm": 3.404376745223999, + "learning_rate": 3.589224514838399e-07, + "loss": 0.366, + "step": 10516 + }, + { + "epoch": 4.9725768321513, + "grad_norm": 3.640212297439575, + "learning_rate": 3.586004603108598e-07, + "loss": 0.3248, + "step": 10517 + }, + { + "epoch": 4.973049645390071, + "grad_norm": 3.0829873085021973, + "learning_rate": 3.5827860247344e-07, + "loss": 0.3613, + "step": 10518 + }, + { + "epoch": 4.973522458628842, + "grad_norm": 3.6157045364379883, + "learning_rate": 3.5795687799162064e-07, + "loss": 0.3599, + "step": 10519 + }, + { + "epoch": 4.973995271867612, + "grad_norm": 3.150632619857788, + "learning_rate": 3.576352868854335e-07, + "loss": 0.3242, + "step": 10520 + }, + { + "epoch": 4.974468085106383, + "grad_norm": 3.04829740524292, + "learning_rate": 3.5731382917490286e-07, + "loss": 0.3819, + "step": 10521 + }, + { + "epoch": 4.974940898345154, + "grad_norm": 3.216092348098755, + "learning_rate": 3.5699250488004516e-07, + "loss": 0.3538, + "step": 10522 + }, + { + "epoch": 4.975413711583924, + "grad_norm": 3.36538028717041, + "learning_rate": 3.5667131402086717e-07, + "loss": 0.3381, + "step": 10523 + }, + { + "epoch": 4.975886524822695, + "grad_norm": 3.3398420810699463, + "learning_rate": 3.563502566173685e-07, + "loss": 0.3085, + "step": 10524 + }, + { + "epoch": 4.9763593380614655, + "grad_norm": 3.10583233833313, + "learning_rate": 3.5602933268953893e-07, + "loss": 0.3023, + "step": 10525 + }, + { + "epoch": 4.976832151300236, + "grad_norm": 3.422929525375366, + "learning_rate": 3.557085422573625e-07, + "loss": 0.3319, + "step": 10526 + }, + { + "epoch": 4.977304964539007, + "grad_norm": 3.7357773780822754, + "learning_rate": 3.5538788534081214e-07, + "loss": 0.3762, + "step": 10527 + }, + { + 
"epoch": 4.977777777777778, + "grad_norm": 3.0172133445739746, + "learning_rate": 3.550673619598549e-07, + "loss": 0.3292, + "step": 10528 + }, + { + "epoch": 4.978250591016549, + "grad_norm": 3.2497189044952393, + "learning_rate": 3.5474697213444763e-07, + "loss": 0.3292, + "step": 10529 + }, + { + "epoch": 4.9787234042553195, + "grad_norm": 2.8510115146636963, + "learning_rate": 3.544267158845394e-07, + "loss": 0.3717, + "step": 10530 + }, + { + "epoch": 4.97919621749409, + "grad_norm": 3.2559750080108643, + "learning_rate": 3.541065932300719e-07, + "loss": 0.3656, + "step": 10531 + }, + { + "epoch": 4.97966903073286, + "grad_norm": 3.3215935230255127, + "learning_rate": 3.537866041909768e-07, + "loss": 0.364, + "step": 10532 + }, + { + "epoch": 4.980141843971631, + "grad_norm": 3.4923696517944336, + "learning_rate": 3.5346674878717954e-07, + "loss": 0.3464, + "step": 10533 + }, + { + "epoch": 4.980614657210402, + "grad_norm": 3.5320425033569336, + "learning_rate": 3.531470270385959e-07, + "loss": 0.3506, + "step": 10534 + }, + { + "epoch": 4.9810874704491725, + "grad_norm": 3.290199041366577, + "learning_rate": 3.528274389651323e-07, + "loss": 0.4092, + "step": 10535 + }, + { + "epoch": 4.981560283687943, + "grad_norm": 3.108628034591675, + "learning_rate": 3.5250798458668966e-07, + "loss": 0.3522, + "step": 10536 + }, + { + "epoch": 4.982033096926714, + "grad_norm": 3.3015148639678955, + "learning_rate": 3.521886639231584e-07, + "loss": 0.3609, + "step": 10537 + }, + { + "epoch": 4.982505910165485, + "grad_norm": 3.506431818008423, + "learning_rate": 3.518694769944211e-07, + "loss": 0.3458, + "step": 10538 + }, + { + "epoch": 4.982978723404255, + "grad_norm": 3.560453414916992, + "learning_rate": 3.5155042382035236e-07, + "loss": 0.3803, + "step": 10539 + }, + { + "epoch": 4.983451536643026, + "grad_norm": 3.1382486820220947, + "learning_rate": 3.5123150442081757e-07, + "loss": 0.3209, + "step": 10540 + }, + { + "epoch": 4.9839243498817964, + "grad_norm": 4.326927661895752, + "learning_rate": 3.5091271881567523e-07, + "loss": 0.3649, + "step": 10541 + }, + { + "epoch": 4.984397163120567, + "grad_norm": 3.0951757431030273, + "learning_rate": 3.50594067024774e-07, + "loss": 0.3808, + "step": 10542 + }, + { + "epoch": 4.984869976359338, + "grad_norm": 3.264277458190918, + "learning_rate": 3.5027554906795574e-07, + "loss": 0.3408, + "step": 10543 + }, + { + "epoch": 4.985342789598109, + "grad_norm": 3.3679237365722656, + "learning_rate": 3.4995716496505293e-07, + "loss": 0.3746, + "step": 10544 + }, + { + "epoch": 4.98581560283688, + "grad_norm": 3.489201545715332, + "learning_rate": 3.496389147358892e-07, + "loss": 0.3725, + "step": 10545 + }, + { + "epoch": 4.98628841607565, + "grad_norm": 2.8233766555786133, + "learning_rate": 3.4932079840028193e-07, + "loss": 0.3178, + "step": 10546 + }, + { + "epoch": 4.986761229314421, + "grad_norm": 3.1723084449768066, + "learning_rate": 3.490028159780373e-07, + "loss": 0.348, + "step": 10547 + }, + { + "epoch": 4.987234042553191, + "grad_norm": 3.2631607055664062, + "learning_rate": 3.4868496748895616e-07, + "loss": 0.3608, + "step": 10548 + }, + { + "epoch": 4.987706855791962, + "grad_norm": 3.4170608520507812, + "learning_rate": 3.483672529528287e-07, + "loss": 0.3819, + "step": 10549 + }, + { + "epoch": 4.988179669030733, + "grad_norm": 3.002686023712158, + "learning_rate": 3.480496723894375e-07, + "loss": 0.2695, + "step": 10550 + }, + { + "epoch": 4.9886524822695035, + "grad_norm": 3.051232099533081, + "learning_rate": 3.4773222581855753e-07, 
+ "loss": 0.3638, + "step": 10551 + }, + { + "epoch": 4.989125295508274, + "grad_norm": 2.959977149963379, + "learning_rate": 3.474149132599544e-07, + "loss": 0.3338, + "step": 10552 + }, + { + "epoch": 4.989598108747045, + "grad_norm": 2.925457000732422, + "learning_rate": 3.470977347333859e-07, + "loss": 0.3212, + "step": 10553 + }, + { + "epoch": 4.990070921985816, + "grad_norm": 3.0996408462524414, + "learning_rate": 3.4678069025860154e-07, + "loss": 0.3447, + "step": 10554 + }, + { + "epoch": 4.990543735224586, + "grad_norm": 2.8487865924835205, + "learning_rate": 3.4646377985534106e-07, + "loss": 0.3434, + "step": 10555 + }, + { + "epoch": 4.991016548463357, + "grad_norm": 2.8337016105651855, + "learning_rate": 3.461470035433387e-07, + "loss": 0.342, + "step": 10556 + }, + { + "epoch": 4.991489361702127, + "grad_norm": 2.9243876934051514, + "learning_rate": 3.4583036134231805e-07, + "loss": 0.3256, + "step": 10557 + }, + { + "epoch": 4.991962174940898, + "grad_norm": 3.2548747062683105, + "learning_rate": 3.455138532719948e-07, + "loss": 0.3313, + "step": 10558 + }, + { + "epoch": 4.992434988179669, + "grad_norm": 3.03932523727417, + "learning_rate": 3.451974793520771e-07, + "loss": 0.3854, + "step": 10559 + }, + { + "epoch": 4.99290780141844, + "grad_norm": 3.4757370948791504, + "learning_rate": 3.44881239602263e-07, + "loss": 0.3909, + "step": 10560 + }, + { + "epoch": 4.993380614657211, + "grad_norm": 2.9729294776916504, + "learning_rate": 3.4456513404224513e-07, + "loss": 0.3645, + "step": 10561 + }, + { + "epoch": 4.993853427895981, + "grad_norm": 3.2144060134887695, + "learning_rate": 3.4424916269170495e-07, + "loss": 0.3236, + "step": 10562 + }, + { + "epoch": 4.994326241134752, + "grad_norm": 3.742386817932129, + "learning_rate": 3.4393332557031615e-07, + "loss": 0.332, + "step": 10563 + }, + { + "epoch": 4.994799054373522, + "grad_norm": 3.2569401264190674, + "learning_rate": 3.4361762269774557e-07, + "loss": 0.3774, + "step": 10564 + }, + { + "epoch": 4.995271867612293, + "grad_norm": 2.91739821434021, + "learning_rate": 3.433020540936499e-07, + "loss": 0.3061, + "step": 10565 + }, + { + "epoch": 4.995744680851064, + "grad_norm": 3.534137487411499, + "learning_rate": 3.429866197776788e-07, + "loss": 0.357, + "step": 10566 + }, + { + "epoch": 4.9962174940898345, + "grad_norm": 3.215837001800537, + "learning_rate": 3.4267131976947284e-07, + "loss": 0.3395, + "step": 10567 + }, + { + "epoch": 4.996690307328605, + "grad_norm": 3.294857978820801, + "learning_rate": 3.4235615408866384e-07, + "loss": 0.3273, + "step": 10568 + }, + { + "epoch": 4.997163120567376, + "grad_norm": 3.519171953201294, + "learning_rate": 3.4204112275487646e-07, + "loss": 0.3712, + "step": 10569 + }, + { + "epoch": 4.997635933806147, + "grad_norm": 3.037527084350586, + "learning_rate": 3.4172622578772544e-07, + "loss": 0.2949, + "step": 10570 + }, + { + "epoch": 4.998108747044917, + "grad_norm": 3.309682846069336, + "learning_rate": 3.4141146320681913e-07, + "loss": 0.3068, + "step": 10571 + }, + { + "epoch": 4.998581560283688, + "grad_norm": 3.2197179794311523, + "learning_rate": 3.410968350317559e-07, + "loss": 0.3725, + "step": 10572 + }, + { + "epoch": 4.999054373522458, + "grad_norm": 3.0465641021728516, + "learning_rate": 3.4078234128212537e-07, + "loss": 0.3505, + "step": 10573 + }, + { + "epoch": 4.999527186761229, + "grad_norm": 3.066941022872925, + "learning_rate": 3.404679819775114e-07, + "loss": 0.3435, + "step": 10574 + }, + { + "epoch": 5.0, + "grad_norm": 3.3947532176971436, + 
"learning_rate": 3.401537571374869e-07, + "loss": 0.3344, + "step": 10575 + } + ], + "logging_steps": 1, + "max_steps": 12690, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 2115, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6712206559066194e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-10575/training_args.bin b/checkpoint-10575/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc --- /dev/null +++ b/checkpoint-10575/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6 +size 8056 diff --git a/checkpoint-10575/zero_to_fp32.py b/checkpoint-10575/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-10575/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + 
state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + 
frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * 
world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. 
i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-12690/README.md b/checkpoint-12690/README.md new file mode 100644 index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4 --- /dev/null +++ b/checkpoint-12690/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-12690/adapter_config.json b/checkpoint-12690/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a --- /dev/null +++ b/checkpoint-12690/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-12690/adapter_model.safetensors b/checkpoint-12690/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a86dd37eb82fb1d46b236ea471b6db3ec90b904d --- /dev/null +++ b/checkpoint-12690/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f07f4ee71740fb0bcf54ffe982c86734f08671b6b0cb55e4add89e30c744f1b +size 3443586272 diff --git a/checkpoint-12690/global_step12690/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 
b/checkpoint-12690/global_step12690/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..887a519261f8f876fc4769c8738a1e205fa73ddd --- /dev/null +++ b/checkpoint-12690/global_step12690/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04f19a180841f418255309732702e6882ca7ea020dc9a5f21223f035c03e2a0d +size 20661195036 diff --git a/checkpoint-12690/global_step12690/mp_rank_00_model_states.pt b/checkpoint-12690/global_step12690/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..160501f67104b890930394ef272f22d85dca9d2d --- /dev/null +++ b/checkpoint-12690/global_step12690/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc733ade2590ed396f26e68b03ae28b0ac8492cc6c22c4a5ebada21090755b37 +size 3555326841 diff --git a/checkpoint-12690/latest b/checkpoint-12690/latest new file mode 100644 index 0000000000000000000000000000000000000000..44725bf699ccd6f2b841ff307fc5f70037c9c745 --- /dev/null +++ b/checkpoint-12690/latest @@ -0,0 +1 @@ +global_step12690 \ No newline at end of file diff --git a/checkpoint-12690/rng_state.pth b/checkpoint-12690/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..59692a9331087e66e22d50634c54db6e81349d0b --- /dev/null +++ b/checkpoint-12690/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76d547af609f90786b6fc88240881d5ac35a1e7d00f6657cc6b00880840e6bef +size 14244 diff --git a/checkpoint-12690/scheduler.pt b/checkpoint-12690/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5de7097cc8d80d2c14fd06e8fe60f0ba95e6906 --- /dev/null +++ b/checkpoint-12690/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6865648a0e7d98218d3202f30defe46013d7294dd42f0bff20e0d0463dab454 +size 1064 diff --git a/checkpoint-12690/special_tokens_map.json b/checkpoint-12690/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-12690/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-12690/tokenizer.json b/checkpoint-12690/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-12690/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-12690/tokenizer_config.json b/checkpoint-12690/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5 --- /dev/null +++ b/checkpoint-12690/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": 
"<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": 
"<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": 
"<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": 
"<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": 
"<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": 
"<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": 
"<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": 
"<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-12690/trainer_state.json b/checkpoint-12690/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..28fbbd8d7dbf8f2d88e36569a6088362ca746929 --- /dev/null +++ b/checkpoint-12690/trainer_state.json @@ -0,0 +1,88863 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 12690, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00047281323877068556, + "grad_norm": 5.163570880889893, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.4628, + "step": 1 + }, + { + "epoch": 0.0009456264775413711, + "grad_norm": 6.298020839691162, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.5003, + "step": 2 + }, + { + "epoch": 0.0014184397163120568, + "grad_norm": 5.853623390197754, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.4495, + "step": 3 + }, + { + "epoch": 0.0018912529550827422, + "grad_norm": 5.456025123596191, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.3798, + "step": 4 + }, + { + "epoch": 0.002364066193853428, + "grad_norm": 5.757407188415527, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.4515, + "step": 5 + }, + { + "epoch": 0.0028368794326241137, + "grad_norm": 5.872277736663818, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4424, + "step": 6 + }, + { + "epoch": 0.003309692671394799, + "grad_norm": 6.7816009521484375, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.4004, + "step": 7 + }, + { + "epoch": 0.0037825059101654845, + "grad_norm": 6.229667663574219, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.4494, + "step": 8 + }, + { + "epoch": 0.00425531914893617, + "grad_norm": 5.336202621459961, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.3916, + "step": 9 + }, + { + "epoch": 0.004728132387706856, + "grad_norm": 5.589445114135742, + "learning_rate": 5.000000000000001e-07, + "loss": 1.2318, + "step": 10 + }, + { + "epoch": 0.005200945626477541, + "grad_norm": 5.720539569854736, + "learning_rate": 5.5e-07, + "loss": 1.4367, + "step": 11 + }, + { + "epoch": 0.005673758865248227, + "grad_norm": 5.913913726806641, + "learning_rate": 6.000000000000001e-07, + "loss": 1.342, + "step": 12 + }, + { + "epoch": 0.006146572104018913, + "grad_norm": 5.899744987487793, + "learning_rate": 6.5e-07, + "loss": 1.4307, + "step": 13 + }, + { + "epoch": 0.006619385342789598, + "grad_norm": 5.571037292480469, + "learning_rate": 7.000000000000001e-07, + "loss": 1.3372, + "step": 14 + }, + { + "epoch": 0.0070921985815602835, + "grad_norm": 5.480010509490967, + "learning_rate": 7.5e-07, + "loss": 1.3923, + "step": 15 + }, + { + "epoch": 0.007565011820330969, + "grad_norm": 5.254702091217041, + "learning_rate": 8.000000000000001e-07, + "loss": 1.2928, + "step": 16 + }, + { + "epoch": 0.008037825059101654, + "grad_norm": 
6.090312480926514, + "learning_rate": 8.500000000000001e-07, + "loss": 1.4984, + "step": 17 + }, + { + "epoch": 0.00851063829787234, + "grad_norm": 5.689319610595703, + "learning_rate": 9.000000000000001e-07, + "loss": 1.4108, + "step": 18 + }, + { + "epoch": 0.008983451536643027, + "grad_norm": 5.386685848236084, + "learning_rate": 9.500000000000001e-07, + "loss": 1.425, + "step": 19 + }, + { + "epoch": 0.009456264775413711, + "grad_norm": 6.451584815979004, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5507, + "step": 20 + }, + { + "epoch": 0.009929078014184398, + "grad_norm": 5.37647008895874, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.4109, + "step": 21 + }, + { + "epoch": 0.010401891252955082, + "grad_norm": 4.716553211212158, + "learning_rate": 1.1e-06, + "loss": 1.2028, + "step": 22 + }, + { + "epoch": 0.010874704491725768, + "grad_norm": 4.950989723205566, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.3043, + "step": 23 + }, + { + "epoch": 0.011347517730496455, + "grad_norm": 4.688975811004639, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.2708, + "step": 24 + }, + { + "epoch": 0.01182033096926714, + "grad_norm": 4.905868053436279, + "learning_rate": 1.25e-06, + "loss": 1.3268, + "step": 25 + }, + { + "epoch": 0.012293144208037825, + "grad_norm": 4.503395080566406, + "learning_rate": 1.3e-06, + "loss": 1.1799, + "step": 26 + }, + { + "epoch": 0.01276595744680851, + "grad_norm": 4.77382230758667, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.3882, + "step": 27 + }, + { + "epoch": 0.013238770685579196, + "grad_norm": 4.734329700469971, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.3476, + "step": 28 + }, + { + "epoch": 0.013711583924349883, + "grad_norm": 4.775066375732422, + "learning_rate": 1.45e-06, + "loss": 1.2429, + "step": 29 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 4.978334426879883, + "learning_rate": 1.5e-06, + "loss": 1.2119, + "step": 30 + }, + { + "epoch": 0.014657210401891253, + "grad_norm": 4.506785869598389, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.3157, + "step": 31 + }, + { + "epoch": 0.015130023640661938, + "grad_norm": 4.007757186889648, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.1451, + "step": 32 + }, + { + "epoch": 0.015602836879432624, + "grad_norm": 3.6621618270874023, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.093, + "step": 33 + }, + { + "epoch": 0.01607565011820331, + "grad_norm": 3.8733766078948975, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.2289, + "step": 34 + }, + { + "epoch": 0.016548463356973995, + "grad_norm": 4.3391900062561035, + "learning_rate": 1.75e-06, + "loss": 1.1453, + "step": 35 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 3.287623643875122, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.0257, + "step": 36 + }, + { + "epoch": 0.017494089834515367, + "grad_norm": 3.591721773147583, + "learning_rate": 1.85e-06, + "loss": 0.9976, + "step": 37 + }, + { + "epoch": 0.017966903073286054, + "grad_norm": 4.028271675109863, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.0773, + "step": 38 + }, + { + "epoch": 0.018439716312056736, + "grad_norm": 3.3543951511383057, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.1677, + "step": 39 + }, + { + "epoch": 0.018912529550827423, + "grad_norm": 3.807624340057373, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1232, + "step": 40 + }, + { + "epoch": 0.01938534278959811, + "grad_norm": 4.242797374725342, + "learning_rate": 2.05e-06, + "loss": 
1.1819, + "step": 41 + }, + { + "epoch": 0.019858156028368795, + "grad_norm": 3.4574992656707764, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9878, + "step": 42 + }, + { + "epoch": 0.02033096926713948, + "grad_norm": 3.906695604324341, + "learning_rate": 2.15e-06, + "loss": 1.0592, + "step": 43 + }, + { + "epoch": 0.020803782505910164, + "grad_norm": 3.7543163299560547, + "learning_rate": 2.2e-06, + "loss": 1.0309, + "step": 44 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 3.3777148723602295, + "learning_rate": 2.25e-06, + "loss": 1.0664, + "step": 45 + }, + { + "epoch": 0.021749408983451537, + "grad_norm": 3.6003634929656982, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.0482, + "step": 46 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 3.3961377143859863, + "learning_rate": 2.35e-06, + "loss": 1.0252, + "step": 47 + }, + { + "epoch": 0.02269503546099291, + "grad_norm": 3.1601035594940186, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.0435, + "step": 48 + }, + { + "epoch": 0.023167848699763592, + "grad_norm": 3.4192967414855957, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.0935, + "step": 49 + }, + { + "epoch": 0.02364066193853428, + "grad_norm": 3.1225922107696533, + "learning_rate": 2.5e-06, + "loss": 0.8988, + "step": 50 + }, + { + "epoch": 0.024113475177304965, + "grad_norm": 3.1423380374908447, + "learning_rate": 2.55e-06, + "loss": 1.0159, + "step": 51 + }, + { + "epoch": 0.02458628841607565, + "grad_norm": 3.4782402515411377, + "learning_rate": 2.6e-06, + "loss": 1.0231, + "step": 52 + }, + { + "epoch": 0.025059101654846337, + "grad_norm": 3.8362693786621094, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.0725, + "step": 53 + }, + { + "epoch": 0.02553191489361702, + "grad_norm": 3.033294916152954, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.9377, + "step": 54 + }, + { + "epoch": 0.026004728132387706, + "grad_norm": 3.849741220474243, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.0046, + "step": 55 + }, + { + "epoch": 0.026477541371158392, + "grad_norm": 3.141876220703125, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9226, + "step": 56 + }, + { + "epoch": 0.02695035460992908, + "grad_norm": 2.773594856262207, + "learning_rate": 2.85e-06, + "loss": 0.8662, + "step": 57 + }, + { + "epoch": 0.027423167848699765, + "grad_norm": 3.1460225582122803, + "learning_rate": 2.9e-06, + "loss": 0.9304, + "step": 58 + }, + { + "epoch": 0.027895981087470448, + "grad_norm": 3.293583631515503, + "learning_rate": 2.95e-06, + "loss": 1.0374, + "step": 59 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 3.8190863132476807, + "learning_rate": 3e-06, + "loss": 0.971, + "step": 60 + }, + { + "epoch": 0.02884160756501182, + "grad_norm": 3.4566776752471924, + "learning_rate": 3.05e-06, + "loss": 0.9631, + "step": 61 + }, + { + "epoch": 0.029314420803782507, + "grad_norm": 3.355741500854492, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.0097, + "step": 62 + }, + { + "epoch": 0.029787234042553193, + "grad_norm": 3.29746675491333, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.9459, + "step": 63 + }, + { + "epoch": 0.030260047281323876, + "grad_norm": 3.3122968673706055, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.8594, + "step": 64 + }, + { + "epoch": 0.030732860520094562, + "grad_norm": 3.477701187133789, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.9197, + "step": 65 + }, + { + "epoch": 0.031205673758865248, + "grad_norm": 3.3363406658172607, + "learning_rate": 
3.3000000000000006e-06, + "loss": 0.9478, + "step": 66 + }, + { + "epoch": 0.03167848699763593, + "grad_norm": 4.143295764923096, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0534, + "step": 67 + }, + { + "epoch": 0.03215130023640662, + "grad_norm": 3.2363274097442627, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9454, + "step": 68 + }, + { + "epoch": 0.032624113475177303, + "grad_norm": 3.198746681213379, + "learning_rate": 3.45e-06, + "loss": 0.9388, + "step": 69 + }, + { + "epoch": 0.03309692671394799, + "grad_norm": 3.5751023292541504, + "learning_rate": 3.5e-06, + "loss": 0.9444, + "step": 70 + }, + { + "epoch": 0.033569739952718676, + "grad_norm": 3.1745729446411133, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8683, + "step": 71 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 3.3210883140563965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.8811, + "step": 72 + }, + { + "epoch": 0.03451536643026005, + "grad_norm": 3.2502429485321045, + "learning_rate": 3.65e-06, + "loss": 1.0012, + "step": 73 + }, + { + "epoch": 0.034988179669030735, + "grad_norm": 3.44598126411438, + "learning_rate": 3.7e-06, + "loss": 0.9217, + "step": 74 + }, + { + "epoch": 0.03546099290780142, + "grad_norm": 3.439117431640625, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8976, + "step": 75 + }, + { + "epoch": 0.03593380614657211, + "grad_norm": 3.523627758026123, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8996, + "step": 76 + }, + { + "epoch": 0.03640661938534279, + "grad_norm": 3.3716015815734863, + "learning_rate": 3.85e-06, + "loss": 0.9061, + "step": 77 + }, + { + "epoch": 0.03687943262411347, + "grad_norm": 3.33518385887146, + "learning_rate": 3.900000000000001e-06, + "loss": 0.9371, + "step": 78 + }, + { + "epoch": 0.03735224586288416, + "grad_norm": 3.833829879760742, + "learning_rate": 3.95e-06, + "loss": 0.9669, + "step": 79 + }, + { + "epoch": 0.037825059101654845, + "grad_norm": 3.260446786880493, + "learning_rate": 4.000000000000001e-06, + "loss": 0.9449, + "step": 80 + }, + { + "epoch": 0.03829787234042553, + "grad_norm": 3.532451629638672, + "learning_rate": 4.05e-06, + "loss": 0.897, + "step": 81 + }, + { + "epoch": 0.03877068557919622, + "grad_norm": 3.1156492233276367, + "learning_rate": 4.1e-06, + "loss": 0.8463, + "step": 82 + }, + { + "epoch": 0.039243498817966904, + "grad_norm": 2.8801751136779785, + "learning_rate": 4.15e-06, + "loss": 0.8616, + "step": 83 + }, + { + "epoch": 0.03971631205673759, + "grad_norm": 3.072476863861084, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.8387, + "step": 84 + }, + { + "epoch": 0.04018912529550828, + "grad_norm": 2.9601376056671143, + "learning_rate": 4.25e-06, + "loss": 0.8538, + "step": 85 + }, + { + "epoch": 0.04066193853427896, + "grad_norm": 3.521664619445801, + "learning_rate": 4.3e-06, + "loss": 0.8894, + "step": 86 + }, + { + "epoch": 0.04113475177304964, + "grad_norm": 3.2670981884002686, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8387, + "step": 87 + }, + { + "epoch": 0.04160756501182033, + "grad_norm": 3.422089099884033, + "learning_rate": 4.4e-06, + "loss": 0.7728, + "step": 88 + }, + { + "epoch": 0.042080378250591015, + "grad_norm": 3.414034128189087, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7968, + "step": 89 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 4.234285354614258, + "learning_rate": 4.5e-06, + "loss": 0.8502, + "step": 90 + }, + { + "epoch": 0.04302600472813239, + "grad_norm": 3.1446919441223145, + "learning_rate": 
4.5500000000000005e-06, + "loss": 0.8236, + "step": 91 + }, + { + "epoch": 0.043498817966903074, + "grad_norm": 3.683443307876587, + "learning_rate": 4.600000000000001e-06, + "loss": 0.9792, + "step": 92 + }, + { + "epoch": 0.04397163120567376, + "grad_norm": 3.664219617843628, + "learning_rate": 4.65e-06, + "loss": 0.8743, + "step": 93 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 3.369479179382324, + "learning_rate": 4.7e-06, + "loss": 0.8741, + "step": 94 + }, + { + "epoch": 0.04491725768321513, + "grad_norm": 3.694949150085449, + "learning_rate": 4.75e-06, + "loss": 0.7574, + "step": 95 + }, + { + "epoch": 0.04539007092198582, + "grad_norm": 3.5144498348236084, + "learning_rate": 4.800000000000001e-06, + "loss": 0.9934, + "step": 96 + }, + { + "epoch": 0.0458628841607565, + "grad_norm": 3.164451837539673, + "learning_rate": 4.85e-06, + "loss": 0.7463, + "step": 97 + }, + { + "epoch": 0.046335697399527184, + "grad_norm": 3.222785472869873, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7698, + "step": 98 + }, + { + "epoch": 0.04680851063829787, + "grad_norm": 2.9129555225372314, + "learning_rate": 4.95e-06, + "loss": 0.7856, + "step": 99 + }, + { + "epoch": 0.04728132387706856, + "grad_norm": 3.5061235427856445, + "learning_rate": 5e-06, + "loss": 0.8588, + "step": 100 + }, + { + "epoch": 0.04775413711583924, + "grad_norm": 3.2805044651031494, + "learning_rate": 4.999999922167982e-06, + "loss": 0.7643, + "step": 101 + }, + { + "epoch": 0.04822695035460993, + "grad_norm": 3.5461678504943848, + "learning_rate": 4.999999688671929e-06, + "loss": 0.8253, + "step": 102 + }, + { + "epoch": 0.048699763593380616, + "grad_norm": 3.2238264083862305, + "learning_rate": 4.99999929951186e-06, + "loss": 0.7622, + "step": 103 + }, + { + "epoch": 0.0491725768321513, + "grad_norm": 3.818955898284912, + "learning_rate": 4.999998754687795e-06, + "loss": 0.8471, + "step": 104 + }, + { + "epoch": 0.04964539007092199, + "grad_norm": 3.1252424716949463, + "learning_rate": 4.99999805419977e-06, + "loss": 0.8409, + "step": 105 + }, + { + "epoch": 0.050118203309692674, + "grad_norm": 3.604283571243286, + "learning_rate": 4.999997198047828e-06, + "loss": 0.9027, + "step": 106 + }, + { + "epoch": 0.050591016548463354, + "grad_norm": 3.6752424240112305, + "learning_rate": 4.999996186232023e-06, + "loss": 0.9336, + "step": 107 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 3.517557144165039, + "learning_rate": 4.9999950187524184e-06, + "loss": 0.8351, + "step": 108 + }, + { + "epoch": 0.051536643026004726, + "grad_norm": 3.427285671234131, + "learning_rate": 4.999993695609085e-06, + "loss": 0.8457, + "step": 109 + }, + { + "epoch": 0.05200945626477541, + "grad_norm": 3.2792510986328125, + "learning_rate": 4.999992216802107e-06, + "loss": 0.8391, + "step": 110 + }, + { + "epoch": 0.0524822695035461, + "grad_norm": 3.581094741821289, + "learning_rate": 4.999990582331576e-06, + "loss": 0.7533, + "step": 111 + }, + { + "epoch": 0.052955082742316785, + "grad_norm": 3.1667377948760986, + "learning_rate": 4.999988792197593e-06, + "loss": 0.9562, + "step": 112 + }, + { + "epoch": 0.05342789598108747, + "grad_norm": 3.3609890937805176, + "learning_rate": 4.99998684640027e-06, + "loss": 0.8181, + "step": 113 + }, + { + "epoch": 0.05390070921985816, + "grad_norm": 3.260627269744873, + "learning_rate": 4.999984744939729e-06, + "loss": 0.8012, + "step": 114 + }, + { + "epoch": 0.054373522458628844, + "grad_norm": 3.4535653591156006, + "learning_rate": 4.9999824878160985e-06, + "loss": 0.919, + "step": 
115 + }, + { + "epoch": 0.05484633569739953, + "grad_norm": 3.4880740642547607, + "learning_rate": 4.999980075029522e-06, + "loss": 0.8114, + "step": 116 + }, + { + "epoch": 0.05531914893617021, + "grad_norm": 3.2546932697296143, + "learning_rate": 4.999977506580147e-06, + "loss": 0.8274, + "step": 117 + }, + { + "epoch": 0.055791962174940896, + "grad_norm": 3.2762744426727295, + "learning_rate": 4.999974782468136e-06, + "loss": 0.9018, + "step": 118 + }, + { + "epoch": 0.05626477541371158, + "grad_norm": 3.42825984954834, + "learning_rate": 4.999971902693657e-06, + "loss": 0.8262, + "step": 119 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 3.082496404647827, + "learning_rate": 4.99996886725689e-06, + "loss": 0.8181, + "step": 120 + }, + { + "epoch": 0.057210401891252954, + "grad_norm": 3.322869300842285, + "learning_rate": 4.9999656761580225e-06, + "loss": 0.8382, + "step": 121 + }, + { + "epoch": 0.05768321513002364, + "grad_norm": 3.6365339756011963, + "learning_rate": 4.9999623293972555e-06, + "loss": 0.7489, + "step": 122 + }, + { + "epoch": 0.05815602836879433, + "grad_norm": 3.376352548599243, + "learning_rate": 4.999958826974796e-06, + "loss": 0.9012, + "step": 123 + }, + { + "epoch": 0.05862884160756501, + "grad_norm": 3.49088716506958, + "learning_rate": 4.999955168890862e-06, + "loss": 0.8999, + "step": 124 + }, + { + "epoch": 0.0591016548463357, + "grad_norm": 3.3265068531036377, + "learning_rate": 4.999951355145682e-06, + "loss": 0.8161, + "step": 125 + }, + { + "epoch": 0.059574468085106386, + "grad_norm": 3.697282314300537, + "learning_rate": 4.999947385739493e-06, + "loss": 0.9623, + "step": 126 + }, + { + "epoch": 0.06004728132387707, + "grad_norm": 2.7901928424835205, + "learning_rate": 4.999943260672542e-06, + "loss": 0.7371, + "step": 127 + }, + { + "epoch": 0.06052009456264775, + "grad_norm": 3.110319137573242, + "learning_rate": 4.999938979945086e-06, + "loss": 0.715, + "step": 128 + }, + { + "epoch": 0.06099290780141844, + "grad_norm": 3.2211520671844482, + "learning_rate": 4.999934543557392e-06, + "loss": 0.8888, + "step": 129 + }, + { + "epoch": 0.061465721040189124, + "grad_norm": 3.2466187477111816, + "learning_rate": 4.999929951509735e-06, + "loss": 0.9389, + "step": 130 + }, + { + "epoch": 0.06193853427895981, + "grad_norm": 3.3574399948120117, + "learning_rate": 4.999925203802403e-06, + "loss": 0.8263, + "step": 131 + }, + { + "epoch": 0.062411347517730496, + "grad_norm": 3.275601625442505, + "learning_rate": 4.99992030043569e-06, + "loss": 0.8338, + "step": 132 + }, + { + "epoch": 0.06288416075650118, + "grad_norm": 3.6011312007904053, + "learning_rate": 4.999915241409902e-06, + "loss": 0.8351, + "step": 133 + }, + { + "epoch": 0.06335697399527186, + "grad_norm": 2.969011068344116, + "learning_rate": 4.999910026725352e-06, + "loss": 0.79, + "step": 134 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 3.690784454345703, + "learning_rate": 4.999904656382369e-06, + "loss": 0.8209, + "step": 135 + }, + { + "epoch": 0.06430260047281323, + "grad_norm": 3.3363115787506104, + "learning_rate": 4.999899130381283e-06, + "loss": 0.858, + "step": 136 + }, + { + "epoch": 0.06477541371158392, + "grad_norm": 3.206881523132324, + "learning_rate": 4.9998934487224405e-06, + "loss": 0.834, + "step": 137 + }, + { + "epoch": 0.06524822695035461, + "grad_norm": 2.773146152496338, + "learning_rate": 4.999887611406195e-06, + "loss": 0.7576, + "step": 138 + }, + { + "epoch": 0.0657210401891253, + "grad_norm": 3.307725667953491, + "learning_rate": 
4.999881618432908e-06, + "loss": 0.7487, + "step": 139 + }, + { + "epoch": 0.06619385342789598, + "grad_norm": 4.273657321929932, + "learning_rate": 4.999875469802956e-06, + "loss": 0.8176, + "step": 140 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 3.0898005962371826, + "learning_rate": 4.999869165516719e-06, + "loss": 0.7578, + "step": 141 + }, + { + "epoch": 0.06713947990543735, + "grad_norm": 3.25150990486145, + "learning_rate": 4.9998627055745915e-06, + "loss": 0.7873, + "step": 142 + }, + { + "epoch": 0.06761229314420804, + "grad_norm": 2.9705755710601807, + "learning_rate": 4.999856089976974e-06, + "loss": 0.6473, + "step": 143 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 3.5658507347106934, + "learning_rate": 4.9998493187242804e-06, + "loss": 0.855, + "step": 144 + }, + { + "epoch": 0.06855791962174941, + "grad_norm": 3.3994076251983643, + "learning_rate": 4.99984239181693e-06, + "loss": 0.7926, + "step": 145 + }, + { + "epoch": 0.0690307328605201, + "grad_norm": 2.8266260623931885, + "learning_rate": 4.999835309255357e-06, + "loss": 0.7564, + "step": 146 + }, + { + "epoch": 0.06950354609929078, + "grad_norm": 3.1143875122070312, + "learning_rate": 4.999828071039999e-06, + "loss": 0.8398, + "step": 147 + }, + { + "epoch": 0.06997635933806147, + "grad_norm": 2.9364278316497803, + "learning_rate": 4.99982067717131e-06, + "loss": 0.7381, + "step": 148 + }, + { + "epoch": 0.07044917257683216, + "grad_norm": 3.4155616760253906, + "learning_rate": 4.999813127649748e-06, + "loss": 0.7933, + "step": 149 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 4.371236324310303, + "learning_rate": 4.999805422475784e-06, + "loss": 0.8292, + "step": 150 + }, + { + "epoch": 0.07139479905437353, + "grad_norm": 3.3967185020446777, + "learning_rate": 4.999797561649897e-06, + "loss": 0.8712, + "step": 151 + }, + { + "epoch": 0.07186761229314421, + "grad_norm": 3.343303680419922, + "learning_rate": 4.999789545172578e-06, + "loss": 0.8177, + "step": 152 + }, + { + "epoch": 0.07234042553191489, + "grad_norm": 3.040235757827759, + "learning_rate": 4.999781373044325e-06, + "loss": 0.7379, + "step": 153 + }, + { + "epoch": 0.07281323877068557, + "grad_norm": 3.4069204330444336, + "learning_rate": 4.999773045265647e-06, + "loss": 0.7939, + "step": 154 + }, + { + "epoch": 0.07328605200945626, + "grad_norm": 3.1939475536346436, + "learning_rate": 4.999764561837063e-06, + "loss": 0.8037, + "step": 155 + }, + { + "epoch": 0.07375886524822695, + "grad_norm": 4.452004909515381, + "learning_rate": 4.999755922759101e-06, + "loss": 0.8421, + "step": 156 + }, + { + "epoch": 0.07423167848699763, + "grad_norm": 3.2031240463256836, + "learning_rate": 4.999747128032298e-06, + "loss": 0.794, + "step": 157 + }, + { + "epoch": 0.07470449172576832, + "grad_norm": 3.175920009613037, + "learning_rate": 4.999738177657203e-06, + "loss": 0.759, + "step": 158 + }, + { + "epoch": 0.075177304964539, + "grad_norm": 3.7679688930511475, + "learning_rate": 4.9997290716343725e-06, + "loss": 0.8174, + "step": 159 + }, + { + "epoch": 0.07565011820330969, + "grad_norm": 3.7020037174224854, + "learning_rate": 4.999719809964373e-06, + "loss": 0.7116, + "step": 160 + }, + { + "epoch": 0.07612293144208038, + "grad_norm": 4.357471942901611, + "learning_rate": 4.999710392647783e-06, + "loss": 0.7649, + "step": 161 + }, + { + "epoch": 0.07659574468085106, + "grad_norm": 3.3439087867736816, + "learning_rate": 4.999700819685187e-06, + "loss": 0.7907, + "step": 162 + }, + { + "epoch": 0.07706855791962175, + "grad_norm": 
3.210815191268921, + "learning_rate": 4.999691091077182e-06, + "loss": 0.8446, + "step": 163 + }, + { + "epoch": 0.07754137115839244, + "grad_norm": 3.1029553413391113, + "learning_rate": 4.9996812068243735e-06, + "loss": 0.7232, + "step": 164 + }, + { + "epoch": 0.07801418439716312, + "grad_norm": 2.9389400482177734, + "learning_rate": 4.999671166927378e-06, + "loss": 0.7413, + "step": 165 + }, + { + "epoch": 0.07848699763593381, + "grad_norm": 3.7062697410583496, + "learning_rate": 4.9996609713868185e-06, + "loss": 0.8773, + "step": 166 + }, + { + "epoch": 0.0789598108747045, + "grad_norm": 3.2768924236297607, + "learning_rate": 4.999650620203332e-06, + "loss": 0.8046, + "step": 167 + }, + { + "epoch": 0.07943262411347518, + "grad_norm": 3.380373001098633, + "learning_rate": 4.999640113377561e-06, + "loss": 0.7529, + "step": 168 + }, + { + "epoch": 0.07990543735224587, + "grad_norm": 3.520022392272949, + "learning_rate": 4.999629450910162e-06, + "loss": 0.7352, + "step": 169 + }, + { + "epoch": 0.08037825059101655, + "grad_norm": 3.43269419670105, + "learning_rate": 4.999618632801796e-06, + "loss": 0.9371, + "step": 170 + }, + { + "epoch": 0.08085106382978724, + "grad_norm": 3.555877923965454, + "learning_rate": 4.99960765905314e-06, + "loss": 0.8276, + "step": 171 + }, + { + "epoch": 0.08132387706855793, + "grad_norm": 3.597050189971924, + "learning_rate": 4.999596529664874e-06, + "loss": 0.8164, + "step": 172 + }, + { + "epoch": 0.0817966903073286, + "grad_norm": 3.2002956867218018, + "learning_rate": 4.999585244637693e-06, + "loss": 0.7824, + "step": 173 + }, + { + "epoch": 0.08226950354609928, + "grad_norm": 3.527275562286377, + "learning_rate": 4.999573803972299e-06, + "loss": 0.8033, + "step": 174 + }, + { + "epoch": 0.08274231678486997, + "grad_norm": 3.5184452533721924, + "learning_rate": 4.999562207669405e-06, + "loss": 0.724, + "step": 175 + }, + { + "epoch": 0.08321513002364066, + "grad_norm": 3.6635067462921143, + "learning_rate": 4.999550455729732e-06, + "loss": 0.819, + "step": 176 + }, + { + "epoch": 0.08368794326241134, + "grad_norm": 3.192399740219116, + "learning_rate": 4.999538548154012e-06, + "loss": 0.7999, + "step": 177 + }, + { + "epoch": 0.08416075650118203, + "grad_norm": 3.0946953296661377, + "learning_rate": 4.999526484942988e-06, + "loss": 0.7367, + "step": 178 + }, + { + "epoch": 0.08463356973995272, + "grad_norm": 2.847198009490967, + "learning_rate": 4.99951426609741e-06, + "loss": 0.7536, + "step": 179 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.7674827575683594, + "learning_rate": 4.999501891618037e-06, + "loss": 0.701, + "step": 180 + }, + { + "epoch": 0.08557919621749409, + "grad_norm": 3.357933521270752, + "learning_rate": 4.999489361505643e-06, + "loss": 0.8331, + "step": 181 + }, + { + "epoch": 0.08605200945626477, + "grad_norm": 3.1464426517486572, + "learning_rate": 4.999476675761004e-06, + "loss": 0.7931, + "step": 182 + }, + { + "epoch": 0.08652482269503546, + "grad_norm": 3.310697078704834, + "learning_rate": 4.999463834384915e-06, + "loss": 0.753, + "step": 183 + }, + { + "epoch": 0.08699763593380615, + "grad_norm": 2.9794881343841553, + "learning_rate": 4.999450837378171e-06, + "loss": 0.7091, + "step": 184 + }, + { + "epoch": 0.08747044917257683, + "grad_norm": 3.0776889324188232, + "learning_rate": 4.999437684741584e-06, + "loss": 0.7226, + "step": 185 + }, + { + "epoch": 0.08794326241134752, + "grad_norm": 3.6657519340515137, + "learning_rate": 4.999424376475972e-06, + "loss": 0.845, + "step": 186 + }, + { + "epoch": 
0.0884160756501182, + "grad_norm": 3.872718572616577, + "learning_rate": 4.999410912582164e-06, + "loss": 0.812, + "step": 187 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.9184508323669434, + "learning_rate": 4.9993972930609976e-06, + "loss": 0.6823, + "step": 188 + }, + { + "epoch": 0.08936170212765958, + "grad_norm": 3.5567142963409424, + "learning_rate": 4.999383517913321e-06, + "loss": 0.7614, + "step": 189 + }, + { + "epoch": 0.08983451536643026, + "grad_norm": 3.3688533306121826, + "learning_rate": 4.999369587139992e-06, + "loss": 0.858, + "step": 190 + }, + { + "epoch": 0.09030732860520095, + "grad_norm": 2.893223524093628, + "learning_rate": 4.99935550074188e-06, + "loss": 0.6761, + "step": 191 + }, + { + "epoch": 0.09078014184397164, + "grad_norm": 3.400225877761841, + "learning_rate": 4.999341258719859e-06, + "loss": 0.7531, + "step": 192 + }, + { + "epoch": 0.09125295508274232, + "grad_norm": 3.6167714595794678, + "learning_rate": 4.999326861074817e-06, + "loss": 0.8164, + "step": 193 + }, + { + "epoch": 0.091725768321513, + "grad_norm": 4.325016498565674, + "learning_rate": 4.9993123078076506e-06, + "loss": 0.7069, + "step": 194 + }, + { + "epoch": 0.09219858156028368, + "grad_norm": 3.195317029953003, + "learning_rate": 4.999297598919266e-06, + "loss": 0.726, + "step": 195 + }, + { + "epoch": 0.09267139479905437, + "grad_norm": 3.146530866622925, + "learning_rate": 4.999282734410579e-06, + "loss": 0.7888, + "step": 196 + }, + { + "epoch": 0.09314420803782505, + "grad_norm": 3.5166752338409424, + "learning_rate": 4.999267714282515e-06, + "loss": 0.8473, + "step": 197 + }, + { + "epoch": 0.09361702127659574, + "grad_norm": 3.3140196800231934, + "learning_rate": 4.99925253853601e-06, + "loss": 0.7233, + "step": 198 + }, + { + "epoch": 0.09408983451536643, + "grad_norm": 3.0318164825439453, + "learning_rate": 4.999237207172008e-06, + "loss": 0.7543, + "step": 199 + }, + { + "epoch": 0.09456264775413711, + "grad_norm": 3.662214756011963, + "learning_rate": 4.999221720191464e-06, + "loss": 0.7783, + "step": 200 + }, + { + "epoch": 0.0950354609929078, + "grad_norm": 3.452078104019165, + "learning_rate": 4.9992060775953425e-06, + "loss": 0.7868, + "step": 201 + }, + { + "epoch": 0.09550827423167849, + "grad_norm": 3.4051287174224854, + "learning_rate": 4.999190279384617e-06, + "loss": 0.7849, + "step": 202 + }, + { + "epoch": 0.09598108747044917, + "grad_norm": 3.1377196311950684, + "learning_rate": 4.999174325560271e-06, + "loss": 0.8364, + "step": 203 + }, + { + "epoch": 0.09645390070921986, + "grad_norm": 3.129473924636841, + "learning_rate": 4.999158216123299e-06, + "loss": 0.7458, + "step": 204 + }, + { + "epoch": 0.09692671394799054, + "grad_norm": 3.169548749923706, + "learning_rate": 4.999141951074703e-06, + "loss": 0.7256, + "step": 205 + }, + { + "epoch": 0.09739952718676123, + "grad_norm": 3.186009168624878, + "learning_rate": 4.999125530415495e-06, + "loss": 0.783, + "step": 206 + }, + { + "epoch": 0.09787234042553192, + "grad_norm": 3.0995123386383057, + "learning_rate": 4.9991089541467e-06, + "loss": 0.7519, + "step": 207 + }, + { + "epoch": 0.0983451536643026, + "grad_norm": 3.1854088306427, + "learning_rate": 4.999092222269348e-06, + "loss": 0.7444, + "step": 208 + }, + { + "epoch": 0.09881796690307329, + "grad_norm": 3.1512246131896973, + "learning_rate": 4.999075334784482e-06, + "loss": 0.7882, + "step": 209 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 3.6199698448181152, + "learning_rate": 4.999058291693153e-06, + "loss": 0.8048, + "step": 
210 + }, + { + "epoch": 0.09976359338061466, + "grad_norm": 2.956907272338867, + "learning_rate": 4.999041092996422e-06, + "loss": 0.7663, + "step": 211 + }, + { + "epoch": 0.10023640661938535, + "grad_norm": 3.3493971824645996, + "learning_rate": 4.99902373869536e-06, + "loss": 0.7639, + "step": 212 + }, + { + "epoch": 0.10070921985815603, + "grad_norm": 3.144812822341919, + "learning_rate": 4.9990062287910475e-06, + "loss": 0.7953, + "step": 213 + }, + { + "epoch": 0.10118203309692671, + "grad_norm": 3.5986971855163574, + "learning_rate": 4.998988563284576e-06, + "loss": 0.8297, + "step": 214 + }, + { + "epoch": 0.1016548463356974, + "grad_norm": 3.447584867477417, + "learning_rate": 4.998970742177044e-06, + "loss": 0.808, + "step": 215 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 3.791353940963745, + "learning_rate": 4.998952765469562e-06, + "loss": 0.8005, + "step": 216 + }, + { + "epoch": 0.10260047281323877, + "grad_norm": 3.4490807056427, + "learning_rate": 4.998934633163247e-06, + "loss": 0.8135, + "step": 217 + }, + { + "epoch": 0.10307328605200945, + "grad_norm": 3.1053314208984375, + "learning_rate": 4.998916345259232e-06, + "loss": 0.7888, + "step": 218 + }, + { + "epoch": 0.10354609929078014, + "grad_norm": 3.407862663269043, + "learning_rate": 4.9988979017586514e-06, + "loss": 0.7099, + "step": 219 + }, + { + "epoch": 0.10401891252955082, + "grad_norm": 3.116656541824341, + "learning_rate": 4.998879302662658e-06, + "loss": 0.8344, + "step": 220 + }, + { + "epoch": 0.10449172576832151, + "grad_norm": 3.339264154434204, + "learning_rate": 4.998860547972406e-06, + "loss": 0.8496, + "step": 221 + }, + { + "epoch": 0.1049645390070922, + "grad_norm": 3.251892566680908, + "learning_rate": 4.998841637689066e-06, + "loss": 0.7455, + "step": 222 + }, + { + "epoch": 0.10543735224586288, + "grad_norm": 4.098135471343994, + "learning_rate": 4.998822571813814e-06, + "loss": 0.7772, + "step": 223 + }, + { + "epoch": 0.10591016548463357, + "grad_norm": 3.9871134757995605, + "learning_rate": 4.998803350347837e-06, + "loss": 0.8261, + "step": 224 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 3.2822303771972656, + "learning_rate": 4.998783973292333e-06, + "loss": 0.8623, + "step": 225 + }, + { + "epoch": 0.10685579196217494, + "grad_norm": 3.0356857776641846, + "learning_rate": 4.998764440648507e-06, + "loss": 0.7426, + "step": 226 + }, + { + "epoch": 0.10732860520094563, + "grad_norm": 2.8932785987854004, + "learning_rate": 4.998744752417576e-06, + "loss": 0.6741, + "step": 227 + }, + { + "epoch": 0.10780141843971631, + "grad_norm": 3.085820436477661, + "learning_rate": 4.998724908600767e-06, + "loss": 0.6549, + "step": 228 + }, + { + "epoch": 0.108274231678487, + "grad_norm": 3.135829210281372, + "learning_rate": 4.998704909199314e-06, + "loss": 0.6702, + "step": 229 + }, + { + "epoch": 0.10874704491725769, + "grad_norm": 5.016134262084961, + "learning_rate": 4.9986847542144625e-06, + "loss": 0.7852, + "step": 230 + }, + { + "epoch": 0.10921985815602837, + "grad_norm": 3.9056200981140137, + "learning_rate": 4.998664443647468e-06, + "loss": 0.9654, + "step": 231 + }, + { + "epoch": 0.10969267139479906, + "grad_norm": 3.0880749225616455, + "learning_rate": 4.998643977499595e-06, + "loss": 0.7579, + "step": 232 + }, + { + "epoch": 0.11016548463356975, + "grad_norm": 3.6893601417541504, + "learning_rate": 4.998623355772118e-06, + "loss": 0.713, + "step": 233 + }, + { + "epoch": 0.11063829787234042, + "grad_norm": 4.181536674499512, + "learning_rate": 4.998602578466319e-06, + 
"loss": 0.7331, + "step": 234 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 3.036386728286743, + "learning_rate": 4.998581645583496e-06, + "loss": 0.7115, + "step": 235 + }, + { + "epoch": 0.11158392434988179, + "grad_norm": 3.6333255767822266, + "learning_rate": 4.998560557124948e-06, + "loss": 0.7544, + "step": 236 + }, + { + "epoch": 0.11205673758865248, + "grad_norm": 2.926417827606201, + "learning_rate": 4.9985393130919915e-06, + "loss": 0.715, + "step": 237 + }, + { + "epoch": 0.11252955082742316, + "grad_norm": 2.969158172607422, + "learning_rate": 4.998517913485946e-06, + "loss": 0.7304, + "step": 238 + }, + { + "epoch": 0.11300236406619385, + "grad_norm": 3.5254971981048584, + "learning_rate": 4.9984963583081466e-06, + "loss": 0.7725, + "step": 239 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 3.7840335369110107, + "learning_rate": 4.998474647559936e-06, + "loss": 0.8685, + "step": 240 + }, + { + "epoch": 0.11394799054373522, + "grad_norm": 3.0333125591278076, + "learning_rate": 4.9984527812426625e-06, + "loss": 0.7793, + "step": 241 + }, + { + "epoch": 0.11442080378250591, + "grad_norm": 3.290159225463867, + "learning_rate": 4.99843075935769e-06, + "loss": 0.7158, + "step": 242 + }, + { + "epoch": 0.1148936170212766, + "grad_norm": 3.3935494422912598, + "learning_rate": 4.99840858190639e-06, + "loss": 0.7643, + "step": 243 + }, + { + "epoch": 0.11536643026004728, + "grad_norm": 3.333965539932251, + "learning_rate": 4.998386248890142e-06, + "loss": 0.7255, + "step": 244 + }, + { + "epoch": 0.11583924349881797, + "grad_norm": 2.8129613399505615, + "learning_rate": 4.998363760310339e-06, + "loss": 0.768, + "step": 245 + }, + { + "epoch": 0.11631205673758865, + "grad_norm": 2.8678107261657715, + "learning_rate": 4.998341116168378e-06, + "loss": 0.7403, + "step": 246 + }, + { + "epoch": 0.11678486997635934, + "grad_norm": 2.8898239135742188, + "learning_rate": 4.998318316465672e-06, + "loss": 0.6844, + "step": 247 + }, + { + "epoch": 0.11725768321513003, + "grad_norm": 3.139777898788452, + "learning_rate": 4.998295361203637e-06, + "loss": 0.7936, + "step": 248 + }, + { + "epoch": 0.11773049645390071, + "grad_norm": 3.393721103668213, + "learning_rate": 4.998272250383707e-06, + "loss": 0.8173, + "step": 249 + }, + { + "epoch": 0.1182033096926714, + "grad_norm": 3.240973949432373, + "learning_rate": 4.998248984007318e-06, + "loss": 0.8252, + "step": 250 + }, + { + "epoch": 0.11867612293144209, + "grad_norm": 3.384855031967163, + "learning_rate": 4.998225562075918e-06, + "loss": 0.7244, + "step": 251 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 3.1881816387176514, + "learning_rate": 4.9982019845909675e-06, + "loss": 0.6818, + "step": 252 + }, + { + "epoch": 0.11962174940898346, + "grad_norm": 2.888364553451538, + "learning_rate": 4.998178251553934e-06, + "loss": 0.6753, + "step": 253 + }, + { + "epoch": 0.12009456264775414, + "grad_norm": 3.630093812942505, + "learning_rate": 4.9981543629662944e-06, + "loss": 0.7995, + "step": 254 + }, + { + "epoch": 0.12056737588652482, + "grad_norm": 2.9820947647094727, + "learning_rate": 4.998130318829537e-06, + "loss": 0.7478, + "step": 255 + }, + { + "epoch": 0.1210401891252955, + "grad_norm": 2.7094738483428955, + "learning_rate": 4.998106119145159e-06, + "loss": 0.7237, + "step": 256 + }, + { + "epoch": 0.12151300236406619, + "grad_norm": 3.1808104515075684, + "learning_rate": 4.9980817639146665e-06, + "loss": 0.7915, + "step": 257 + }, + { + "epoch": 0.12198581560283688, + "grad_norm": 3.1661291122436523, + 
"learning_rate": 4.998057253139575e-06, + "loss": 0.8053, + "step": 258 + }, + { + "epoch": 0.12245862884160756, + "grad_norm": 3.528749942779541, + "learning_rate": 4.998032586821413e-06, + "loss": 0.7946, + "step": 259 + }, + { + "epoch": 0.12293144208037825, + "grad_norm": 3.125964879989624, + "learning_rate": 4.998007764961716e-06, + "loss": 0.7569, + "step": 260 + }, + { + "epoch": 0.12340425531914893, + "grad_norm": 3.0778942108154297, + "learning_rate": 4.997982787562029e-06, + "loss": 0.7184, + "step": 261 + }, + { + "epoch": 0.12387706855791962, + "grad_norm": 3.3531930446624756, + "learning_rate": 4.997957654623906e-06, + "loss": 0.7586, + "step": 262 + }, + { + "epoch": 0.1243498817966903, + "grad_norm": 3.229278564453125, + "learning_rate": 4.997932366148913e-06, + "loss": 0.6092, + "step": 263 + }, + { + "epoch": 0.12482269503546099, + "grad_norm": 3.7286155223846436, + "learning_rate": 4.997906922138626e-06, + "loss": 0.7965, + "step": 264 + }, + { + "epoch": 0.12529550827423167, + "grad_norm": 3.300311803817749, + "learning_rate": 4.997881322594628e-06, + "loss": 0.7665, + "step": 265 + }, + { + "epoch": 0.12576832151300235, + "grad_norm": 3.411482572555542, + "learning_rate": 4.9978555675185115e-06, + "loss": 0.7253, + "step": 266 + }, + { + "epoch": 0.12624113475177304, + "grad_norm": 3.0884511470794678, + "learning_rate": 4.9978296569118825e-06, + "loss": 0.659, + "step": 267 + }, + { + "epoch": 0.12671394799054372, + "grad_norm": 3.0652925968170166, + "learning_rate": 4.9978035907763535e-06, + "loss": 0.6739, + "step": 268 + }, + { + "epoch": 0.1271867612293144, + "grad_norm": 3.280555009841919, + "learning_rate": 4.997777369113547e-06, + "loss": 0.8003, + "step": 269 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 2.980860948562622, + "learning_rate": 4.997750991925096e-06, + "loss": 0.7097, + "step": 270 + }, + { + "epoch": 0.12813238770685578, + "grad_norm": 3.301760673522949, + "learning_rate": 4.997724459212644e-06, + "loss": 0.7894, + "step": 271 + }, + { + "epoch": 0.12860520094562647, + "grad_norm": 2.9584903717041016, + "learning_rate": 4.997697770977841e-06, + "loss": 0.733, + "step": 272 + }, + { + "epoch": 0.12907801418439716, + "grad_norm": 3.5632214546203613, + "learning_rate": 4.99767092722235e-06, + "loss": 0.7228, + "step": 273 + }, + { + "epoch": 0.12955082742316784, + "grad_norm": 3.5900983810424805, + "learning_rate": 4.997643927947843e-06, + "loss": 0.7634, + "step": 274 + }, + { + "epoch": 0.13002364066193853, + "grad_norm": 3.332650661468506, + "learning_rate": 4.997616773156e-06, + "loss": 0.797, + "step": 275 + }, + { + "epoch": 0.13049645390070921, + "grad_norm": 3.1094167232513428, + "learning_rate": 4.997589462848512e-06, + "loss": 0.7849, + "step": 276 + }, + { + "epoch": 0.1309692671394799, + "grad_norm": 3.5359463691711426, + "learning_rate": 4.99756199702708e-06, + "loss": 0.6871, + "step": 277 + }, + { + "epoch": 0.1314420803782506, + "grad_norm": 3.190441846847534, + "learning_rate": 4.997534375693414e-06, + "loss": 0.6883, + "step": 278 + }, + { + "epoch": 0.13191489361702127, + "grad_norm": 3.063518762588501, + "learning_rate": 4.997506598849234e-06, + "loss": 0.7586, + "step": 279 + }, + { + "epoch": 0.13238770685579196, + "grad_norm": 3.4112050533294678, + "learning_rate": 4.997478666496269e-06, + "loss": 0.796, + "step": 280 + }, + { + "epoch": 0.13286052009456265, + "grad_norm": 3.231886386871338, + "learning_rate": 4.997450578636259e-06, + "loss": 0.7714, + "step": 281 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 
3.279425621032715, + "learning_rate": 4.9974223352709515e-06, + "loss": 0.7793, + "step": 282 + }, + { + "epoch": 0.13380614657210402, + "grad_norm": 3.2154316902160645, + "learning_rate": 4.9973939364021075e-06, + "loss": 0.791, + "step": 283 + }, + { + "epoch": 0.1342789598108747, + "grad_norm": 3.2090768814086914, + "learning_rate": 4.9973653820314925e-06, + "loss": 0.6433, + "step": 284 + }, + { + "epoch": 0.1347517730496454, + "grad_norm": 3.1712026596069336, + "learning_rate": 4.997336672160886e-06, + "loss": 0.8128, + "step": 285 + }, + { + "epoch": 0.13522458628841608, + "grad_norm": 2.929229497909546, + "learning_rate": 4.997307806792076e-06, + "loss": 0.7594, + "step": 286 + }, + { + "epoch": 0.13569739952718676, + "grad_norm": 3.0363314151763916, + "learning_rate": 4.997278785926859e-06, + "loss": 0.7336, + "step": 287 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 3.1352357864379883, + "learning_rate": 4.997249609567042e-06, + "loss": 0.7225, + "step": 288 + }, + { + "epoch": 0.13664302600472814, + "grad_norm": 3.3171157836914062, + "learning_rate": 4.997220277714442e-06, + "loss": 0.7777, + "step": 289 + }, + { + "epoch": 0.13711583924349882, + "grad_norm": 3.050717353820801, + "learning_rate": 4.997190790370885e-06, + "loss": 0.6836, + "step": 290 + }, + { + "epoch": 0.1375886524822695, + "grad_norm": 3.0297694206237793, + "learning_rate": 4.997161147538208e-06, + "loss": 0.6883, + "step": 291 + }, + { + "epoch": 0.1380614657210402, + "grad_norm": 3.0566554069519043, + "learning_rate": 4.997131349218256e-06, + "loss": 0.6674, + "step": 292 + }, + { + "epoch": 0.13853427895981088, + "grad_norm": 3.799111843109131, + "learning_rate": 4.997101395412885e-06, + "loss": 0.8256, + "step": 293 + }, + { + "epoch": 0.13900709219858157, + "grad_norm": 3.1394248008728027, + "learning_rate": 4.9970712861239576e-06, + "loss": 0.7306, + "step": 294 + }, + { + "epoch": 0.13947990543735225, + "grad_norm": 3.0605666637420654, + "learning_rate": 4.997041021353352e-06, + "loss": 0.7212, + "step": 295 + }, + { + "epoch": 0.13995271867612294, + "grad_norm": 3.8813397884368896, + "learning_rate": 4.997010601102951e-06, + "loss": 0.769, + "step": 296 + }, + { + "epoch": 0.14042553191489363, + "grad_norm": 3.0514819622039795, + "learning_rate": 4.996980025374649e-06, + "loss": 0.7422, + "step": 297 + }, + { + "epoch": 0.1408983451536643, + "grad_norm": 2.9544146060943604, + "learning_rate": 4.99694929417035e-06, + "loss": 0.6912, + "step": 298 + }, + { + "epoch": 0.141371158392435, + "grad_norm": 3.2635602951049805, + "learning_rate": 4.996918407491966e-06, + "loss": 0.7395, + "step": 299 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 3.373882532119751, + "learning_rate": 4.996887365341423e-06, + "loss": 0.7799, + "step": 300 + }, + { + "epoch": 0.14231678486997637, + "grad_norm": 3.001128673553467, + "learning_rate": 4.996856167720652e-06, + "loss": 0.7168, + "step": 301 + }, + { + "epoch": 0.14278959810874706, + "grad_norm": 3.1026835441589355, + "learning_rate": 4.996824814631595e-06, + "loss": 0.7492, + "step": 302 + }, + { + "epoch": 0.14326241134751774, + "grad_norm": 3.41947603225708, + "learning_rate": 4.996793306076205e-06, + "loss": 0.6659, + "step": 303 + }, + { + "epoch": 0.14373522458628843, + "grad_norm": 3.2272400856018066, + "learning_rate": 4.996761642056444e-06, + "loss": 0.7184, + "step": 304 + }, + { + "epoch": 0.14420803782505912, + "grad_norm": 2.9488935470581055, + "learning_rate": 4.996729822574284e-06, + "loss": 0.7451, + "step": 305 + }, + { + "epoch": 
0.14468085106382977, + "grad_norm": 3.268231153488159, + "learning_rate": 4.9966978476317065e-06, + "loss": 0.7798, + "step": 306 + }, + { + "epoch": 0.14515366430260046, + "grad_norm": 3.9086556434631348, + "learning_rate": 4.996665717230701e-06, + "loss": 0.7871, + "step": 307 + }, + { + "epoch": 0.14562647754137115, + "grad_norm": 3.3483879566192627, + "learning_rate": 4.996633431373269e-06, + "loss": 0.7415, + "step": 308 + }, + { + "epoch": 0.14609929078014183, + "grad_norm": 2.839400053024292, + "learning_rate": 4.99660099006142e-06, + "loss": 0.7192, + "step": 309 + }, + { + "epoch": 0.14657210401891252, + "grad_norm": 3.177302598953247, + "learning_rate": 4.996568393297175e-06, + "loss": 0.755, + "step": 310 + }, + { + "epoch": 0.1470449172576832, + "grad_norm": 3.5477044582366943, + "learning_rate": 4.996535641082563e-06, + "loss": 0.7531, + "step": 311 + }, + { + "epoch": 0.1475177304964539, + "grad_norm": 3.418576717376709, + "learning_rate": 4.996502733419624e-06, + "loss": 0.8009, + "step": 312 + }, + { + "epoch": 0.14799054373522458, + "grad_norm": 3.711341619491577, + "learning_rate": 4.996469670310407e-06, + "loss": 0.7362, + "step": 313 + }, + { + "epoch": 0.14846335697399526, + "grad_norm": 3.2419373989105225, + "learning_rate": 4.99643645175697e-06, + "loss": 0.7761, + "step": 314 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 3.121858835220337, + "learning_rate": 4.996403077761381e-06, + "loss": 0.6495, + "step": 315 + }, + { + "epoch": 0.14940898345153664, + "grad_norm": 3.123054265975952, + "learning_rate": 4.996369548325719e-06, + "loss": 0.7444, + "step": 316 + }, + { + "epoch": 0.14988179669030732, + "grad_norm": 2.780880928039551, + "learning_rate": 4.996335863452072e-06, + "loss": 0.672, + "step": 317 + }, + { + "epoch": 0.150354609929078, + "grad_norm": 3.3738629817962646, + "learning_rate": 4.996302023142536e-06, + "loss": 0.7972, + "step": 318 + }, + { + "epoch": 0.1508274231678487, + "grad_norm": 3.4874777793884277, + "learning_rate": 4.99626802739922e-06, + "loss": 0.8252, + "step": 319 + }, + { + "epoch": 0.15130023640661938, + "grad_norm": 3.7074787616729736, + "learning_rate": 4.9962338762242395e-06, + "loss": 0.8216, + "step": 320 + }, + { + "epoch": 0.15177304964539007, + "grad_norm": 3.281912326812744, + "learning_rate": 4.996199569619721e-06, + "loss": 0.8175, + "step": 321 + }, + { + "epoch": 0.15224586288416075, + "grad_norm": 2.9485340118408203, + "learning_rate": 4.996165107587801e-06, + "loss": 0.707, + "step": 322 + }, + { + "epoch": 0.15271867612293144, + "grad_norm": 3.3757646083831787, + "learning_rate": 4.996130490130625e-06, + "loss": 0.7955, + "step": 323 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 2.962181568145752, + "learning_rate": 4.996095717250349e-06, + "loss": 0.7067, + "step": 324 + }, + { + "epoch": 0.1536643026004728, + "grad_norm": 3.114272356033325, + "learning_rate": 4.996060788949136e-06, + "loss": 0.7486, + "step": 325 + }, + { + "epoch": 0.1541371158392435, + "grad_norm": 3.0621590614318848, + "learning_rate": 4.996025705229165e-06, + "loss": 0.6547, + "step": 326 + }, + { + "epoch": 0.15460992907801419, + "grad_norm": 2.8745882511138916, + "learning_rate": 4.995990466092616e-06, + "loss": 0.6435, + "step": 327 + }, + { + "epoch": 0.15508274231678487, + "grad_norm": 2.90841007232666, + "learning_rate": 4.995955071541686e-06, + "loss": 0.7331, + "step": 328 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 2.694580316543579, + "learning_rate": 4.9959195215785784e-06, + "loss": 0.6731, + "step": 
329 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 3.158083438873291, + "learning_rate": 4.995883816205507e-06, + "loss": 0.7257, + "step": 330 + }, + { + "epoch": 0.15650118203309693, + "grad_norm": 3.3234715461730957, + "learning_rate": 4.995847955424694e-06, + "loss": 0.7389, + "step": 331 + }, + { + "epoch": 0.15697399527186762, + "grad_norm": 2.9406495094299316, + "learning_rate": 4.995811939238373e-06, + "loss": 0.643, + "step": 332 + }, + { + "epoch": 0.1574468085106383, + "grad_norm": 3.3191726207733154, + "learning_rate": 4.995775767648785e-06, + "loss": 0.7879, + "step": 333 + }, + { + "epoch": 0.157919621749409, + "grad_norm": 3.711925745010376, + "learning_rate": 4.995739440658185e-06, + "loss": 0.7586, + "step": 334 + }, + { + "epoch": 0.15839243498817968, + "grad_norm": 9.573421478271484, + "learning_rate": 4.995702958268833e-06, + "loss": 0.7842, + "step": 335 + }, + { + "epoch": 0.15886524822695036, + "grad_norm": 3.4154508113861084, + "learning_rate": 4.995666320483001e-06, + "loss": 0.6735, + "step": 336 + }, + { + "epoch": 0.15933806146572105, + "grad_norm": 3.4169859886169434, + "learning_rate": 4.995629527302971e-06, + "loss": 0.741, + "step": 337 + }, + { + "epoch": 0.15981087470449173, + "grad_norm": 3.287503242492676, + "learning_rate": 4.9955925787310335e-06, + "loss": 0.7139, + "step": 338 + }, + { + "epoch": 0.16028368794326242, + "grad_norm": 3.288409471511841, + "learning_rate": 4.995555474769488e-06, + "loss": 0.7636, + "step": 339 + }, + { + "epoch": 0.1607565011820331, + "grad_norm": 2.8021693229675293, + "learning_rate": 4.995518215420646e-06, + "loss": 0.5883, + "step": 340 + }, + { + "epoch": 0.1612293144208038, + "grad_norm": 2.7038564682006836, + "learning_rate": 4.995480800686827e-06, + "loss": 0.657, + "step": 341 + }, + { + "epoch": 0.16170212765957448, + "grad_norm": 3.2370235919952393, + "learning_rate": 4.9954432305703615e-06, + "loss": 0.6999, + "step": 342 + }, + { + "epoch": 0.16217494089834517, + "grad_norm": 2.8666412830352783, + "learning_rate": 4.995405505073588e-06, + "loss": 0.7199, + "step": 343 + }, + { + "epoch": 0.16264775413711585, + "grad_norm": 3.6467232704162598, + "learning_rate": 4.995367624198856e-06, + "loss": 0.7317, + "step": 344 + }, + { + "epoch": 0.16312056737588654, + "grad_norm": 2.7576327323913574, + "learning_rate": 4.9953295879485246e-06, + "loss": 0.647, + "step": 345 + }, + { + "epoch": 0.1635933806146572, + "grad_norm": 2.922232151031494, + "learning_rate": 4.995291396324959e-06, + "loss": 0.6686, + "step": 346 + }, + { + "epoch": 0.16406619385342788, + "grad_norm": 2.8693501949310303, + "learning_rate": 4.995253049330542e-06, + "loss": 0.6756, + "step": 347 + }, + { + "epoch": 0.16453900709219857, + "grad_norm": 3.671865701675415, + "learning_rate": 4.995214546967658e-06, + "loss": 0.7347, + "step": 348 + }, + { + "epoch": 0.16501182033096926, + "grad_norm": 3.024219274520874, + "learning_rate": 4.995175889238706e-06, + "loss": 0.7547, + "step": 349 + }, + { + "epoch": 0.16548463356973994, + "grad_norm": 2.8470778465270996, + "learning_rate": 4.995137076146091e-06, + "loss": 0.6764, + "step": 350 + }, + { + "epoch": 0.16595744680851063, + "grad_norm": 2.905057907104492, + "learning_rate": 4.9950981076922324e-06, + "loss": 0.6814, + "step": 351 + }, + { + "epoch": 0.16643026004728131, + "grad_norm": 3.504377841949463, + "learning_rate": 4.995058983879555e-06, + "loss": 0.7145, + "step": 352 + }, + { + "epoch": 0.166903073286052, + "grad_norm": 3.0029661655426025, + "learning_rate": 4.995019704710495e-06, 
+ "loss": 0.7114, + "step": 353 + }, + { + "epoch": 0.1673758865248227, + "grad_norm": 2.8666274547576904, + "learning_rate": 4.994980270187499e-06, + "loss": 0.7416, + "step": 354 + }, + { + "epoch": 0.16784869976359337, + "grad_norm": 3.1644718647003174, + "learning_rate": 4.994940680313021e-06, + "loss": 0.661, + "step": 355 + }, + { + "epoch": 0.16832151300236406, + "grad_norm": 3.050391674041748, + "learning_rate": 4.994900935089527e-06, + "loss": 0.7243, + "step": 356 + }, + { + "epoch": 0.16879432624113475, + "grad_norm": 2.985466480255127, + "learning_rate": 4.994861034519491e-06, + "loss": 0.6917, + "step": 357 + }, + { + "epoch": 0.16926713947990543, + "grad_norm": 2.909342050552368, + "learning_rate": 4.9948209786053995e-06, + "loss": 0.6636, + "step": 358 + }, + { + "epoch": 0.16973995271867612, + "grad_norm": 3.2214784622192383, + "learning_rate": 4.9947807673497435e-06, + "loss": 0.7903, + "step": 359 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 2.5654983520507812, + "learning_rate": 4.994740400755029e-06, + "loss": 0.6129, + "step": 360 + }, + { + "epoch": 0.1706855791962175, + "grad_norm": 3.775646448135376, + "learning_rate": 4.99469987882377e-06, + "loss": 0.7145, + "step": 361 + }, + { + "epoch": 0.17115839243498818, + "grad_norm": 2.8965413570404053, + "learning_rate": 4.994659201558487e-06, + "loss": 0.7177, + "step": 362 + }, + { + "epoch": 0.17163120567375886, + "grad_norm": 3.485597848892212, + "learning_rate": 4.9946183689617146e-06, + "loss": 0.8107, + "step": 363 + }, + { + "epoch": 0.17210401891252955, + "grad_norm": 3.277839183807373, + "learning_rate": 4.994577381035995e-06, + "loss": 0.691, + "step": 364 + }, + { + "epoch": 0.17257683215130024, + "grad_norm": 2.8807685375213623, + "learning_rate": 4.99453623778388e-06, + "loss": 0.7627, + "step": 365 + }, + { + "epoch": 0.17304964539007092, + "grad_norm": 3.0659940242767334, + "learning_rate": 4.994494939207932e-06, + "loss": 0.6858, + "step": 366 + }, + { + "epoch": 0.1735224586288416, + "grad_norm": 3.0881855487823486, + "learning_rate": 4.994453485310723e-06, + "loss": 0.8212, + "step": 367 + }, + { + "epoch": 0.1739952718676123, + "grad_norm": 2.7199201583862305, + "learning_rate": 4.994411876094832e-06, + "loss": 0.6516, + "step": 368 + }, + { + "epoch": 0.17446808510638298, + "grad_norm": 2.955889940261841, + "learning_rate": 4.994370111562851e-06, + "loss": 0.6579, + "step": 369 + }, + { + "epoch": 0.17494089834515367, + "grad_norm": 3.1321663856506348, + "learning_rate": 4.994328191717382e-06, + "loss": 0.6891, + "step": 370 + }, + { + "epoch": 0.17541371158392435, + "grad_norm": 3.0560388565063477, + "learning_rate": 4.994286116561034e-06, + "loss": 0.7243, + "step": 371 + }, + { + "epoch": 0.17588652482269504, + "grad_norm": 3.1560704708099365, + "learning_rate": 4.994243886096425e-06, + "loss": 0.7262, + "step": 372 + }, + { + "epoch": 0.17635933806146573, + "grad_norm": 2.913541316986084, + "learning_rate": 4.994201500326187e-06, + "loss": 0.7318, + "step": 373 + }, + { + "epoch": 0.1768321513002364, + "grad_norm": 3.098376512527466, + "learning_rate": 4.994158959252958e-06, + "loss": 0.6419, + "step": 374 + }, + { + "epoch": 0.1773049645390071, + "grad_norm": 2.977508544921875, + "learning_rate": 4.994116262879387e-06, + "loss": 0.6709, + "step": 375 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 3.168186902999878, + "learning_rate": 4.994073411208133e-06, + "loss": 0.6608, + "step": 376 + }, + { + "epoch": 0.17825059101654847, + "grad_norm": 3.436844825744629, + 
"learning_rate": 4.994030404241864e-06, + "loss": 0.7227, + "step": 377 + }, + { + "epoch": 0.17872340425531916, + "grad_norm": 2.8998289108276367, + "learning_rate": 4.993987241983258e-06, + "loss": 0.6512, + "step": 378 + }, + { + "epoch": 0.17919621749408984, + "grad_norm": 3.407191514968872, + "learning_rate": 4.993943924435002e-06, + "loss": 0.616, + "step": 379 + }, + { + "epoch": 0.17966903073286053, + "grad_norm": 3.744858741760254, + "learning_rate": 4.993900451599793e-06, + "loss": 0.8599, + "step": 380 + }, + { + "epoch": 0.18014184397163122, + "grad_norm": 3.486283779144287, + "learning_rate": 4.993856823480338e-06, + "loss": 0.6634, + "step": 381 + }, + { + "epoch": 0.1806146572104019, + "grad_norm": 2.895719051361084, + "learning_rate": 4.993813040079355e-06, + "loss": 0.6972, + "step": 382 + }, + { + "epoch": 0.1810874704491726, + "grad_norm": 2.814133882522583, + "learning_rate": 4.993769101399569e-06, + "loss": 0.6271, + "step": 383 + }, + { + "epoch": 0.18156028368794327, + "grad_norm": 2.8609800338745117, + "learning_rate": 4.993725007443715e-06, + "loss": 0.6481, + "step": 384 + }, + { + "epoch": 0.18203309692671396, + "grad_norm": 3.2829644680023193, + "learning_rate": 4.99368075821454e-06, + "loss": 0.7999, + "step": 385 + }, + { + "epoch": 0.18250591016548465, + "grad_norm": 3.1417458057403564, + "learning_rate": 4.993636353714798e-06, + "loss": 0.6972, + "step": 386 + }, + { + "epoch": 0.1829787234042553, + "grad_norm": 3.0679385662078857, + "learning_rate": 4.993591793947256e-06, + "loss": 0.667, + "step": 387 + }, + { + "epoch": 0.183451536643026, + "grad_norm": 3.1387410163879395, + "learning_rate": 4.993547078914686e-06, + "loss": 0.7618, + "step": 388 + }, + { + "epoch": 0.18392434988179668, + "grad_norm": 2.9181406497955322, + "learning_rate": 4.993502208619872e-06, + "loss": 0.7391, + "step": 389 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 2.8952157497406006, + "learning_rate": 4.993457183065611e-06, + "loss": 0.6988, + "step": 390 + }, + { + "epoch": 0.18486997635933805, + "grad_norm": 3.2274813652038574, + "learning_rate": 4.993412002254704e-06, + "loss": 0.688, + "step": 391 + }, + { + "epoch": 0.18534278959810874, + "grad_norm": 3.4693779945373535, + "learning_rate": 4.993366666189965e-06, + "loss": 0.6634, + "step": 392 + }, + { + "epoch": 0.18581560283687942, + "grad_norm": 3.5358526706695557, + "learning_rate": 4.993321174874217e-06, + "loss": 0.7343, + "step": 393 + }, + { + "epoch": 0.1862884160756501, + "grad_norm": 3.013338088989258, + "learning_rate": 4.993275528310292e-06, + "loss": 0.7579, + "step": 394 + }, + { + "epoch": 0.1867612293144208, + "grad_norm": 2.694772720336914, + "learning_rate": 4.993229726501033e-06, + "loss": 0.718, + "step": 395 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 3.070612907409668, + "learning_rate": 4.9931837694492915e-06, + "loss": 0.6438, + "step": 396 + }, + { + "epoch": 0.18770685579196217, + "grad_norm": 2.9193027019500732, + "learning_rate": 4.993137657157928e-06, + "loss": 0.6788, + "step": 397 + }, + { + "epoch": 0.18817966903073285, + "grad_norm": 3.047682046890259, + "learning_rate": 4.993091389629816e-06, + "loss": 0.6826, + "step": 398 + }, + { + "epoch": 0.18865248226950354, + "grad_norm": 2.9629905223846436, + "learning_rate": 4.993044966867834e-06, + "loss": 0.7196, + "step": 399 + }, + { + "epoch": 0.18912529550827423, + "grad_norm": 3.0692050457000732, + "learning_rate": 4.992998388874874e-06, + "loss": 0.7015, + "step": 400 + }, + { + "epoch": 0.1895981087470449, + 
"grad_norm": 3.5427212715148926, + "learning_rate": 4.992951655653836e-06, + "loss": 0.8292, + "step": 401 + }, + { + "epoch": 0.1900709219858156, + "grad_norm": 2.643526554107666, + "learning_rate": 4.992904767207629e-06, + "loss": 0.624, + "step": 402 + }, + { + "epoch": 0.19054373522458629, + "grad_norm": 3.1185996532440186, + "learning_rate": 4.992857723539173e-06, + "loss": 0.7354, + "step": 403 + }, + { + "epoch": 0.19101654846335697, + "grad_norm": 3.006856679916382, + "learning_rate": 4.992810524651398e-06, + "loss": 0.7752, + "step": 404 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 2.9913275241851807, + "learning_rate": 4.9927631705472425e-06, + "loss": 0.7306, + "step": 405 + }, + { + "epoch": 0.19196217494089834, + "grad_norm": 2.6794071197509766, + "learning_rate": 4.992715661229655e-06, + "loss": 0.6136, + "step": 406 + }, + { + "epoch": 0.19243498817966903, + "grad_norm": 3.5933966636657715, + "learning_rate": 4.992667996701593e-06, + "loss": 0.7024, + "step": 407 + }, + { + "epoch": 0.19290780141843972, + "grad_norm": 2.862187623977661, + "learning_rate": 4.992620176966025e-06, + "loss": 0.692, + "step": 408 + }, + { + "epoch": 0.1933806146572104, + "grad_norm": 3.076845407485962, + "learning_rate": 4.9925722020259286e-06, + "loss": 0.7475, + "step": 409 + }, + { + "epoch": 0.1938534278959811, + "grad_norm": 3.372919797897339, + "learning_rate": 4.9925240718842895e-06, + "loss": 0.6886, + "step": 410 + }, + { + "epoch": 0.19432624113475178, + "grad_norm": 2.922977924346924, + "learning_rate": 4.992475786544108e-06, + "loss": 0.7049, + "step": 411 + }, + { + "epoch": 0.19479905437352246, + "grad_norm": 2.908034324645996, + "learning_rate": 4.992427346008387e-06, + "loss": 0.6498, + "step": 412 + }, + { + "epoch": 0.19527186761229315, + "grad_norm": 3.096723794937134, + "learning_rate": 4.992378750280144e-06, + "loss": 0.7151, + "step": 413 + }, + { + "epoch": 0.19574468085106383, + "grad_norm": 2.895237684249878, + "learning_rate": 4.992329999362405e-06, + "loss": 0.7277, + "step": 414 + }, + { + "epoch": 0.19621749408983452, + "grad_norm": 2.718230724334717, + "learning_rate": 4.9922810932582065e-06, + "loss": 0.6375, + "step": 415 + }, + { + "epoch": 0.1966903073286052, + "grad_norm": 3.187743663787842, + "learning_rate": 4.992232031970592e-06, + "loss": 0.6528, + "step": 416 + }, + { + "epoch": 0.1971631205673759, + "grad_norm": 2.996406316757202, + "learning_rate": 4.992182815502616e-06, + "loss": 0.6552, + "step": 417 + }, + { + "epoch": 0.19763593380614658, + "grad_norm": 3.301084041595459, + "learning_rate": 4.992133443857345e-06, + "loss": 0.7061, + "step": 418 + }, + { + "epoch": 0.19810874704491727, + "grad_norm": 3.7874677181243896, + "learning_rate": 4.992083917037853e-06, + "loss": 0.7859, + "step": 419 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 3.124253511428833, + "learning_rate": 4.992034235047222e-06, + "loss": 0.7615, + "step": 420 + }, + { + "epoch": 0.19905437352245864, + "grad_norm": 3.0488970279693604, + "learning_rate": 4.991984397888546e-06, + "loss": 0.6916, + "step": 421 + }, + { + "epoch": 0.19952718676122932, + "grad_norm": 3.1241321563720703, + "learning_rate": 4.991934405564929e-06, + "loss": 0.7055, + "step": 422 + }, + { + "epoch": 0.2, + "grad_norm": 3.396632432937622, + "learning_rate": 4.991884258079484e-06, + "loss": 0.7675, + "step": 423 + }, + { + "epoch": 0.2004728132387707, + "grad_norm": 3.7776873111724854, + "learning_rate": 4.9918339554353316e-06, + "loss": 0.7371, + "step": 424 + }, + { + "epoch": 
0.20094562647754138, + "grad_norm": 3.3356032371520996, + "learning_rate": 4.991783497635606e-06, + "loss": 0.6778, + "step": 425 + }, + { + "epoch": 0.20141843971631207, + "grad_norm": 2.988856792449951, + "learning_rate": 4.9917328846834474e-06, + "loss": 0.6795, + "step": 426 + }, + { + "epoch": 0.20189125295508276, + "grad_norm": 3.264183282852173, + "learning_rate": 4.99168211658201e-06, + "loss": 0.7707, + "step": 427 + }, + { + "epoch": 0.20236406619385341, + "grad_norm": 3.878068208694458, + "learning_rate": 4.991631193334451e-06, + "loss": 0.857, + "step": 428 + }, + { + "epoch": 0.2028368794326241, + "grad_norm": 3.6377553939819336, + "learning_rate": 4.991580114943943e-06, + "loss": 0.8033, + "step": 429 + }, + { + "epoch": 0.2033096926713948, + "grad_norm": 2.95393967628479, + "learning_rate": 4.991528881413667e-06, + "loss": 0.6809, + "step": 430 + }, + { + "epoch": 0.20378250591016547, + "grad_norm": 3.058704376220703, + "learning_rate": 4.9914774927468125e-06, + "loss": 0.6664, + "step": 431 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 2.7783217430114746, + "learning_rate": 4.9914259489465795e-06, + "loss": 0.6478, + "step": 432 + }, + { + "epoch": 0.20472813238770685, + "grad_norm": 2.4825217723846436, + "learning_rate": 4.991374250016177e-06, + "loss": 0.6598, + "step": 433 + }, + { + "epoch": 0.20520094562647753, + "grad_norm": 2.8753600120544434, + "learning_rate": 4.991322395958824e-06, + "loss": 0.6947, + "step": 434 + }, + { + "epoch": 0.20567375886524822, + "grad_norm": 3.2339367866516113, + "learning_rate": 4.99127038677775e-06, + "loss": 0.8201, + "step": 435 + }, + { + "epoch": 0.2061465721040189, + "grad_norm": 2.9065537452697754, + "learning_rate": 4.991218222476193e-06, + "loss": 0.6679, + "step": 436 + }, + { + "epoch": 0.2066193853427896, + "grad_norm": 3.283228874206543, + "learning_rate": 4.991165903057401e-06, + "loss": 0.8039, + "step": 437 + }, + { + "epoch": 0.20709219858156028, + "grad_norm": 3.429872751235962, + "learning_rate": 4.991113428524631e-06, + "loss": 0.7392, + "step": 438 + }, + { + "epoch": 0.20756501182033096, + "grad_norm": 3.118943452835083, + "learning_rate": 4.991060798881152e-06, + "loss": 0.6794, + "step": 439 + }, + { + "epoch": 0.20803782505910165, + "grad_norm": 3.395970106124878, + "learning_rate": 4.99100801413024e-06, + "loss": 0.6862, + "step": 440 + }, + { + "epoch": 0.20851063829787234, + "grad_norm": 2.869191884994507, + "learning_rate": 4.99095507427518e-06, + "loss": 0.6076, + "step": 441 + }, + { + "epoch": 0.20898345153664302, + "grad_norm": 3.1934661865234375, + "learning_rate": 4.990901979319272e-06, + "loss": 0.6927, + "step": 442 + }, + { + "epoch": 0.2094562647754137, + "grad_norm": 2.9068603515625, + "learning_rate": 4.990848729265819e-06, + "loss": 0.6864, + "step": 443 + }, + { + "epoch": 0.2099290780141844, + "grad_norm": 3.0535948276519775, + "learning_rate": 4.9907953241181375e-06, + "loss": 0.6396, + "step": 444 + }, + { + "epoch": 0.21040189125295508, + "grad_norm": 2.871511459350586, + "learning_rate": 4.990741763879554e-06, + "loss": 0.6743, + "step": 445 + }, + { + "epoch": 0.21087470449172577, + "grad_norm": 2.9184393882751465, + "learning_rate": 4.9906880485534015e-06, + "loss": 0.6786, + "step": 446 + }, + { + "epoch": 0.21134751773049645, + "grad_norm": 3.0628271102905273, + "learning_rate": 4.990634178143026e-06, + "loss": 0.6326, + "step": 447 + }, + { + "epoch": 0.21182033096926714, + "grad_norm": 3.7878305912017822, + "learning_rate": 4.990580152651782e-06, + "loss": 0.7944, + "step": 
448 + }, + { + "epoch": 0.21229314420803783, + "grad_norm": 2.8577189445495605, + "learning_rate": 4.990525972083031e-06, + "loss": 0.71, + "step": 449 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 3.307769775390625, + "learning_rate": 4.99047163644015e-06, + "loss": 0.6893, + "step": 450 + }, + { + "epoch": 0.2132387706855792, + "grad_norm": 2.7391717433929443, + "learning_rate": 4.990417145726519e-06, + "loss": 0.712, + "step": 451 + }, + { + "epoch": 0.21371158392434988, + "grad_norm": 2.938044786453247, + "learning_rate": 4.990362499945534e-06, + "loss": 0.7516, + "step": 452 + }, + { + "epoch": 0.21418439716312057, + "grad_norm": 2.7831056118011475, + "learning_rate": 4.990307699100595e-06, + "loss": 0.6168, + "step": 453 + }, + { + "epoch": 0.21465721040189126, + "grad_norm": 2.907977342605591, + "learning_rate": 4.990252743195116e-06, + "loss": 0.6706, + "step": 454 + }, + { + "epoch": 0.21513002364066194, + "grad_norm": 3.7882161140441895, + "learning_rate": 4.990197632232517e-06, + "loss": 0.6847, + "step": 455 + }, + { + "epoch": 0.21560283687943263, + "grad_norm": 2.899716854095459, + "learning_rate": 4.990142366216232e-06, + "loss": 0.6699, + "step": 456 + }, + { + "epoch": 0.21607565011820332, + "grad_norm": 2.907003879547119, + "learning_rate": 4.990086945149701e-06, + "loss": 0.6864, + "step": 457 + }, + { + "epoch": 0.216548463356974, + "grad_norm": 3.2407333850860596, + "learning_rate": 4.9900313690363736e-06, + "loss": 0.692, + "step": 458 + }, + { + "epoch": 0.2170212765957447, + "grad_norm": 2.9055583477020264, + "learning_rate": 4.989975637879712e-06, + "loss": 0.7113, + "step": 459 + }, + { + "epoch": 0.21749408983451538, + "grad_norm": 2.9836206436157227, + "learning_rate": 4.989919751683184e-06, + "loss": 0.6673, + "step": 460 + }, + { + "epoch": 0.21796690307328606, + "grad_norm": 3.371035575866699, + "learning_rate": 4.989863710450273e-06, + "loss": 0.7181, + "step": 461 + }, + { + "epoch": 0.21843971631205675, + "grad_norm": 2.9636635780334473, + "learning_rate": 4.989807514184465e-06, + "loss": 0.6082, + "step": 462 + }, + { + "epoch": 0.21891252955082743, + "grad_norm": 2.9634664058685303, + "learning_rate": 4.9897511628892615e-06, + "loss": 0.7086, + "step": 463 + }, + { + "epoch": 0.21938534278959812, + "grad_norm": 3.154763698577881, + "learning_rate": 4.98969465656817e-06, + "loss": 0.7027, + "step": 464 + }, + { + "epoch": 0.2198581560283688, + "grad_norm": 2.9959890842437744, + "learning_rate": 4.98963799522471e-06, + "loss": 0.6498, + "step": 465 + }, + { + "epoch": 0.2203309692671395, + "grad_norm": 3.5470590591430664, + "learning_rate": 4.989581178862408e-06, + "loss": 0.7199, + "step": 466 + }, + { + "epoch": 0.22080378250591018, + "grad_norm": 7.1873369216918945, + "learning_rate": 4.989524207484802e-06, + "loss": 0.6676, + "step": 467 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 3.1099541187286377, + "learning_rate": 4.98946708109544e-06, + "loss": 0.6785, + "step": 468 + }, + { + "epoch": 0.22174940898345152, + "grad_norm": 2.830991506576538, + "learning_rate": 4.9894097996978795e-06, + "loss": 0.6456, + "step": 469 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.0212316513061523, + "learning_rate": 4.989352363295687e-06, + "loss": 0.6048, + "step": 470 + }, + { + "epoch": 0.2226950354609929, + "grad_norm": 3.18776798248291, + "learning_rate": 4.989294771892437e-06, + "loss": 0.7078, + "step": 471 + }, + { + "epoch": 0.22316784869976358, + "grad_norm": 2.9972598552703857, + "learning_rate": 4.989237025491717e-06, + 
"loss": 0.7082, + "step": 472 + }, + { + "epoch": 0.22364066193853427, + "grad_norm": 3.4935688972473145, + "learning_rate": 4.989179124097123e-06, + "loss": 0.8199, + "step": 473 + }, + { + "epoch": 0.22411347517730495, + "grad_norm": 2.6485543251037598, + "learning_rate": 4.9891210677122595e-06, + "loss": 0.6371, + "step": 474 + }, + { + "epoch": 0.22458628841607564, + "grad_norm": 2.969233512878418, + "learning_rate": 4.989062856340742e-06, + "loss": 0.6879, + "step": 475 + }, + { + "epoch": 0.22505910165484633, + "grad_norm": 2.881875514984131, + "learning_rate": 4.989004489986194e-06, + "loss": 0.7415, + "step": 476 + }, + { + "epoch": 0.225531914893617, + "grad_norm": 2.624540090560913, + "learning_rate": 4.98894596865225e-06, + "loss": 0.6522, + "step": 477 + }, + { + "epoch": 0.2260047281323877, + "grad_norm": 3.61075496673584, + "learning_rate": 4.988887292342555e-06, + "loss": 0.7109, + "step": 478 + }, + { + "epoch": 0.2264775413711584, + "grad_norm": 2.9368972778320312, + "learning_rate": 4.988828461060762e-06, + "loss": 0.6843, + "step": 479 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 3.0670197010040283, + "learning_rate": 4.988769474810533e-06, + "loss": 0.6807, + "step": 480 + }, + { + "epoch": 0.22742316784869976, + "grad_norm": 2.9662792682647705, + "learning_rate": 4.988710333595542e-06, + "loss": 0.6796, + "step": 481 + }, + { + "epoch": 0.22789598108747045, + "grad_norm": 2.971235752105713, + "learning_rate": 4.988651037419472e-06, + "loss": 0.696, + "step": 482 + }, + { + "epoch": 0.22836879432624113, + "grad_norm": 2.931884527206421, + "learning_rate": 4.988591586286013e-06, + "loss": 0.7323, + "step": 483 + }, + { + "epoch": 0.22884160756501182, + "grad_norm": 2.8114213943481445, + "learning_rate": 4.988531980198868e-06, + "loss": 0.6584, + "step": 484 + }, + { + "epoch": 0.2293144208037825, + "grad_norm": 3.2785916328430176, + "learning_rate": 4.98847221916175e-06, + "loss": 0.7514, + "step": 485 + }, + { + "epoch": 0.2297872340425532, + "grad_norm": 3.0520215034484863, + "learning_rate": 4.988412303178377e-06, + "loss": 0.7564, + "step": 486 + }, + { + "epoch": 0.23026004728132388, + "grad_norm": 3.181002616882324, + "learning_rate": 4.988352232252483e-06, + "loss": 0.6768, + "step": 487 + }, + { + "epoch": 0.23073286052009456, + "grad_norm": 3.4953625202178955, + "learning_rate": 4.988292006387805e-06, + "loss": 0.7143, + "step": 488 + }, + { + "epoch": 0.23120567375886525, + "grad_norm": 3.326571226119995, + "learning_rate": 4.988231625588096e-06, + "loss": 0.7318, + "step": 489 + }, + { + "epoch": 0.23167848699763594, + "grad_norm": 3.09614634513855, + "learning_rate": 4.988171089857113e-06, + "loss": 0.6574, + "step": 490 + }, + { + "epoch": 0.23215130023640662, + "grad_norm": 2.7439446449279785, + "learning_rate": 4.9881103991986265e-06, + "loss": 0.6637, + "step": 491 + }, + { + "epoch": 0.2326241134751773, + "grad_norm": 3.0681190490722656, + "learning_rate": 4.988049553616416e-06, + "loss": 0.6326, + "step": 492 + }, + { + "epoch": 0.233096926713948, + "grad_norm": 3.0757341384887695, + "learning_rate": 4.98798855311427e-06, + "loss": 0.695, + "step": 493 + }, + { + "epoch": 0.23356973995271868, + "grad_norm": 2.8637635707855225, + "learning_rate": 4.987927397695985e-06, + "loss": 0.6598, + "step": 494 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 3.3641068935394287, + "learning_rate": 4.9878660873653715e-06, + "loss": 0.7435, + "step": 495 + }, + { + "epoch": 0.23451536643026005, + "grad_norm": 3.5025596618652344, + "learning_rate": 
4.987804622126245e-06, + "loss": 0.735, + "step": 496 + }, + { + "epoch": 0.23498817966903074, + "grad_norm": 2.9298837184906006, + "learning_rate": 4.987743001982434e-06, + "loss": 0.7063, + "step": 497 + }, + { + "epoch": 0.23546099290780143, + "grad_norm": 2.70358943939209, + "learning_rate": 4.987681226937774e-06, + "loss": 0.6799, + "step": 498 + }, + { + "epoch": 0.2359338061465721, + "grad_norm": 3.027871608734131, + "learning_rate": 4.9876192969961125e-06, + "loss": 0.6881, + "step": 499 + }, + { + "epoch": 0.2364066193853428, + "grad_norm": 3.362306594848633, + "learning_rate": 4.987557212161304e-06, + "loss": 0.7906, + "step": 500 + }, + { + "epoch": 0.23687943262411348, + "grad_norm": 3.3136050701141357, + "learning_rate": 4.987494972437217e-06, + "loss": 0.6878, + "step": 501 + }, + { + "epoch": 0.23735224586288417, + "grad_norm": 3.017089605331421, + "learning_rate": 4.9874325778277255e-06, + "loss": 0.7279, + "step": 502 + }, + { + "epoch": 0.23782505910165486, + "grad_norm": 2.8300516605377197, + "learning_rate": 4.987370028336714e-06, + "loss": 0.6864, + "step": 503 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 3.201860189437866, + "learning_rate": 4.987307323968077e-06, + "loss": 0.7531, + "step": 504 + }, + { + "epoch": 0.23877068557919623, + "grad_norm": 2.685396194458008, + "learning_rate": 4.987244464725721e-06, + "loss": 0.5849, + "step": 505 + }, + { + "epoch": 0.23924349881796692, + "grad_norm": 2.8715312480926514, + "learning_rate": 4.987181450613557e-06, + "loss": 0.675, + "step": 506 + }, + { + "epoch": 0.2397163120567376, + "grad_norm": 2.813908815383911, + "learning_rate": 4.987118281635511e-06, + "loss": 0.6841, + "step": 507 + }, + { + "epoch": 0.2401891252955083, + "grad_norm": 3.2738473415374756, + "learning_rate": 4.987054957795514e-06, + "loss": 0.7158, + "step": 508 + }, + { + "epoch": 0.24066193853427895, + "grad_norm": 2.896134376525879, + "learning_rate": 4.986991479097511e-06, + "loss": 0.7542, + "step": 509 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 3.0390403270721436, + "learning_rate": 4.986927845545454e-06, + "loss": 0.6733, + "step": 510 + }, + { + "epoch": 0.24160756501182032, + "grad_norm": 3.0300254821777344, + "learning_rate": 4.9868640571433044e-06, + "loss": 0.722, + "step": 511 + }, + { + "epoch": 0.242080378250591, + "grad_norm": 3.3037352561950684, + "learning_rate": 4.986800113895035e-06, + "loss": 0.6811, + "step": 512 + }, + { + "epoch": 0.2425531914893617, + "grad_norm": 3.0358474254608154, + "learning_rate": 4.986736015804627e-06, + "loss": 0.7348, + "step": 513 + }, + { + "epoch": 0.24302600472813238, + "grad_norm": 3.108792304992676, + "learning_rate": 4.986671762876071e-06, + "loss": 0.6096, + "step": 514 + }, + { + "epoch": 0.24349881796690306, + "grad_norm": 3.1316237449645996, + "learning_rate": 4.986607355113367e-06, + "loss": 0.6357, + "step": 515 + }, + { + "epoch": 0.24397163120567375, + "grad_norm": 3.3095219135284424, + "learning_rate": 4.986542792520528e-06, + "loss": 0.7515, + "step": 516 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 3.4775984287261963, + "learning_rate": 4.986478075101572e-06, + "loss": 0.7104, + "step": 517 + }, + { + "epoch": 0.24491725768321512, + "grad_norm": 3.341708183288574, + "learning_rate": 4.986413202860528e-06, + "loss": 0.7339, + "step": 518 + }, + { + "epoch": 0.2453900709219858, + "grad_norm": 2.9646966457366943, + "learning_rate": 4.986348175801438e-06, + "loss": 0.6032, + "step": 519 + }, + { + "epoch": 0.2458628841607565, + "grad_norm": 
3.1853902339935303, + "learning_rate": 4.986282993928349e-06, + "loss": 0.6925, + "step": 520 + }, + { + "epoch": 0.24633569739952718, + "grad_norm": 3.286909818649292, + "learning_rate": 4.98621765724532e-06, + "loss": 0.7447, + "step": 521 + }, + { + "epoch": 0.24680851063829787, + "grad_norm": 3.2255051136016846, + "learning_rate": 4.986152165756419e-06, + "loss": 0.7747, + "step": 522 + }, + { + "epoch": 0.24728132387706855, + "grad_norm": 3.002352237701416, + "learning_rate": 4.986086519465724e-06, + "loss": 0.6472, + "step": 523 + }, + { + "epoch": 0.24775413711583924, + "grad_norm": 3.4738974571228027, + "learning_rate": 4.986020718377322e-06, + "loss": 0.7381, + "step": 524 + }, + { + "epoch": 0.24822695035460993, + "grad_norm": 3.4470200538635254, + "learning_rate": 4.985954762495312e-06, + "loss": 0.6878, + "step": 525 + }, + { + "epoch": 0.2486997635933806, + "grad_norm": 2.9219350814819336, + "learning_rate": 4.985888651823799e-06, + "loss": 0.6317, + "step": 526 + }, + { + "epoch": 0.2491725768321513, + "grad_norm": 3.061767101287842, + "learning_rate": 4.985822386366899e-06, + "loss": 0.6842, + "step": 527 + }, + { + "epoch": 0.24964539007092199, + "grad_norm": 3.0291247367858887, + "learning_rate": 4.985755966128742e-06, + "loss": 0.6852, + "step": 528 + }, + { + "epoch": 0.25011820330969264, + "grad_norm": 2.964280843734741, + "learning_rate": 4.985689391113457e-06, + "loss": 0.7738, + "step": 529 + }, + { + "epoch": 0.25059101654846333, + "grad_norm": 3.058302164077759, + "learning_rate": 4.9856226613251955e-06, + "loss": 0.6677, + "step": 530 + }, + { + "epoch": 0.251063829787234, + "grad_norm": 3.345141649246216, + "learning_rate": 4.985555776768109e-06, + "loss": 0.7837, + "step": 531 + }, + { + "epoch": 0.2515366430260047, + "grad_norm": 3.565031051635742, + "learning_rate": 4.9854887374463636e-06, + "loss": 0.7231, + "step": 532 + }, + { + "epoch": 0.2520094562647754, + "grad_norm": 2.7953789234161377, + "learning_rate": 4.985421543364132e-06, + "loss": 0.6102, + "step": 533 + }, + { + "epoch": 0.2524822695035461, + "grad_norm": 2.887606620788574, + "learning_rate": 4.9853541945256e-06, + "loss": 0.6289, + "step": 534 + }, + { + "epoch": 0.25295508274231676, + "grad_norm": 3.1480495929718018, + "learning_rate": 4.985286690934961e-06, + "loss": 0.6348, + "step": 535 + }, + { + "epoch": 0.25342789598108745, + "grad_norm": 2.8912761211395264, + "learning_rate": 4.985219032596416e-06, + "loss": 0.595, + "step": 536 + }, + { + "epoch": 0.25390070921985813, + "grad_norm": 2.947936534881592, + "learning_rate": 4.98515121951418e-06, + "loss": 0.6196, + "step": 537 + }, + { + "epoch": 0.2543735224586288, + "grad_norm": 3.1085827350616455, + "learning_rate": 4.985083251692474e-06, + "loss": 0.6387, + "step": 538 + }, + { + "epoch": 0.2548463356973995, + "grad_norm": 3.1688334941864014, + "learning_rate": 4.985015129135531e-06, + "loss": 0.7055, + "step": 539 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 3.075042963027954, + "learning_rate": 4.984946851847593e-06, + "loss": 0.7515, + "step": 540 + }, + { + "epoch": 0.2557919621749409, + "grad_norm": 3.1933093070983887, + "learning_rate": 4.98487841983291e-06, + "loss": 0.7054, + "step": 541 + }, + { + "epoch": 0.25626477541371157, + "grad_norm": 3.043473958969116, + "learning_rate": 4.984809833095744e-06, + "loss": 0.6281, + "step": 542 + }, + { + "epoch": 0.25673758865248225, + "grad_norm": 3.0532584190368652, + "learning_rate": 4.9847410916403645e-06, + "loss": 0.6155, + "step": 543 + }, + { + "epoch": 
0.25721040189125294, + "grad_norm": 3.608480215072632, + "learning_rate": 4.984672195471053e-06, + "loss": 0.7363, + "step": 544 + }, + { + "epoch": 0.2576832151300236, + "grad_norm": 2.7491862773895264, + "learning_rate": 4.9846031445921e-06, + "loss": 0.6594, + "step": 545 + }, + { + "epoch": 0.2581560283687943, + "grad_norm": 2.8602418899536133, + "learning_rate": 4.984533939007802e-06, + "loss": 0.6742, + "step": 546 + }, + { + "epoch": 0.258628841607565, + "grad_norm": 3.1782007217407227, + "learning_rate": 4.98446457872247e-06, + "loss": 0.731, + "step": 547 + }, + { + "epoch": 0.2591016548463357, + "grad_norm": 2.796147584915161, + "learning_rate": 4.984395063740423e-06, + "loss": 0.6617, + "step": 548 + }, + { + "epoch": 0.25957446808510637, + "grad_norm": 2.8392202854156494, + "learning_rate": 4.984325394065991e-06, + "loss": 0.6753, + "step": 549 + }, + { + "epoch": 0.26004728132387706, + "grad_norm": 3.134672164916992, + "learning_rate": 4.984255569703508e-06, + "loss": 0.7222, + "step": 550 + }, + { + "epoch": 0.26052009456264774, + "grad_norm": 2.734330177307129, + "learning_rate": 4.984185590657325e-06, + "loss": 0.6098, + "step": 551 + }, + { + "epoch": 0.26099290780141843, + "grad_norm": 3.739010810852051, + "learning_rate": 4.984115456931798e-06, + "loss": 0.7457, + "step": 552 + }, + { + "epoch": 0.2614657210401891, + "grad_norm": 2.8412528038024902, + "learning_rate": 4.9840451685312925e-06, + "loss": 0.6972, + "step": 553 + }, + { + "epoch": 0.2619385342789598, + "grad_norm": 3.017395496368408, + "learning_rate": 4.983974725460188e-06, + "loss": 0.6887, + "step": 554 + }, + { + "epoch": 0.2624113475177305, + "grad_norm": 3.2746949195861816, + "learning_rate": 4.98390412772287e-06, + "loss": 0.7047, + "step": 555 + }, + { + "epoch": 0.2628841607565012, + "grad_norm": 3.1561965942382812, + "learning_rate": 4.983833375323732e-06, + "loss": 0.7726, + "step": 556 + }, + { + "epoch": 0.26335697399527186, + "grad_norm": 3.2367217540740967, + "learning_rate": 4.9837624682671816e-06, + "loss": 0.6348, + "step": 557 + }, + { + "epoch": 0.26382978723404255, + "grad_norm": 2.8195858001708984, + "learning_rate": 4.983691406557633e-06, + "loss": 0.6387, + "step": 558 + }, + { + "epoch": 0.26430260047281323, + "grad_norm": 3.349820852279663, + "learning_rate": 4.983620190199511e-06, + "loss": 0.6776, + "step": 559 + }, + { + "epoch": 0.2647754137115839, + "grad_norm": 2.8025588989257812, + "learning_rate": 4.98354881919725e-06, + "loss": 0.6512, + "step": 560 + }, + { + "epoch": 0.2652482269503546, + "grad_norm": 2.9125499725341797, + "learning_rate": 4.983477293555295e-06, + "loss": 0.7024, + "step": 561 + }, + { + "epoch": 0.2657210401891253, + "grad_norm": 3.3479275703430176, + "learning_rate": 4.983405613278098e-06, + "loss": 0.688, + "step": 562 + }, + { + "epoch": 0.266193853427896, + "grad_norm": 3.123971462249756, + "learning_rate": 4.983333778370123e-06, + "loss": 0.6743, + "step": 563 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.891625165939331, + "learning_rate": 4.983261788835843e-06, + "loss": 0.5971, + "step": 564 + }, + { + "epoch": 0.26713947990543735, + "grad_norm": 3.5066864490509033, + "learning_rate": 4.98318964467974e-06, + "loss": 0.6958, + "step": 565 + }, + { + "epoch": 0.26761229314420804, + "grad_norm": 2.570547342300415, + "learning_rate": 4.983117345906306e-06, + "loss": 0.609, + "step": 566 + }, + { + "epoch": 0.2680851063829787, + "grad_norm": 3.005106210708618, + "learning_rate": 4.983044892520044e-06, + "loss": 0.6791, + "step": 567 + }, + 
{ + "epoch": 0.2685579196217494, + "grad_norm": 3.429675340652466, + "learning_rate": 4.982972284525463e-06, + "loss": 0.6625, + "step": 568 + }, + { + "epoch": 0.2690307328605201, + "grad_norm": 3.825657367706299, + "learning_rate": 4.982899521927086e-06, + "loss": 0.6368, + "step": 569 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 2.8699095249176025, + "learning_rate": 4.982826604729443e-06, + "loss": 0.6425, + "step": 570 + }, + { + "epoch": 0.26997635933806147, + "grad_norm": 3.1688714027404785, + "learning_rate": 4.982753532937074e-06, + "loss": 0.6904, + "step": 571 + }, + { + "epoch": 0.27044917257683215, + "grad_norm": 3.3889992237091064, + "learning_rate": 4.98268030655453e-06, + "loss": 0.7575, + "step": 572 + }, + { + "epoch": 0.27092198581560284, + "grad_norm": 3.108315944671631, + "learning_rate": 4.982606925586367e-06, + "loss": 0.6648, + "step": 573 + }, + { + "epoch": 0.2713947990543735, + "grad_norm": 3.209831953048706, + "learning_rate": 4.982533390037159e-06, + "loss": 0.657, + "step": 574 + }, + { + "epoch": 0.2718676122931442, + "grad_norm": 3.1740927696228027, + "learning_rate": 4.982459699911482e-06, + "loss": 0.7262, + "step": 575 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 3.0190417766571045, + "learning_rate": 4.982385855213924e-06, + "loss": 0.6368, + "step": 576 + }, + { + "epoch": 0.2728132387706856, + "grad_norm": 3.05049467086792, + "learning_rate": 4.982311855949084e-06, + "loss": 0.72, + "step": 577 + }, + { + "epoch": 0.27328605200945627, + "grad_norm": 2.984816551208496, + "learning_rate": 4.98223770212157e-06, + "loss": 0.6856, + "step": 578 + }, + { + "epoch": 0.27375886524822696, + "grad_norm": 2.744969606399536, + "learning_rate": 4.982163393735998e-06, + "loss": 0.6023, + "step": 579 + }, + { + "epoch": 0.27423167848699764, + "grad_norm": 3.170564889907837, + "learning_rate": 4.982088930796996e-06, + "loss": 0.6678, + "step": 580 + }, + { + "epoch": 0.27470449172576833, + "grad_norm": 2.8686118125915527, + "learning_rate": 4.982014313309199e-06, + "loss": 0.6157, + "step": 581 + }, + { + "epoch": 0.275177304964539, + "grad_norm": 2.8768694400787354, + "learning_rate": 4.981939541277254e-06, + "loss": 0.6566, + "step": 582 + }, + { + "epoch": 0.2756501182033097, + "grad_norm": 2.621481418609619, + "learning_rate": 4.981864614705818e-06, + "loss": 0.7372, + "step": 583 + }, + { + "epoch": 0.2761229314420804, + "grad_norm": 3.527374267578125, + "learning_rate": 4.981789533599554e-06, + "loss": 0.6485, + "step": 584 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 3.3141074180603027, + "learning_rate": 4.981714297963138e-06, + "loss": 0.6816, + "step": 585 + }, + { + "epoch": 0.27706855791962176, + "grad_norm": 2.9247069358825684, + "learning_rate": 4.981638907801255e-06, + "loss": 0.7217, + "step": 586 + }, + { + "epoch": 0.27754137115839245, + "grad_norm": 2.875236749649048, + "learning_rate": 4.981563363118599e-06, + "loss": 0.6662, + "step": 587 + }, + { + "epoch": 0.27801418439716313, + "grad_norm": 2.9540364742279053, + "learning_rate": 4.981487663919874e-06, + "loss": 0.7225, + "step": 588 + }, + { + "epoch": 0.2784869976359338, + "grad_norm": 2.90889310836792, + "learning_rate": 4.981411810209793e-06, + "loss": 0.6054, + "step": 589 + }, + { + "epoch": 0.2789598108747045, + "grad_norm": 2.8541409969329834, + "learning_rate": 4.981335801993078e-06, + "loss": 0.6539, + "step": 590 + }, + { + "epoch": 0.2794326241134752, + "grad_norm": 3.1600730419158936, + "learning_rate": 4.981259639274465e-06, + "loss": 0.6415, + "step": 
591 + }, + { + "epoch": 0.2799054373522459, + "grad_norm": 3.569376230239868, + "learning_rate": 4.981183322058693e-06, + "loss": 0.6944, + "step": 592 + }, + { + "epoch": 0.28037825059101656, + "grad_norm": 3.067667007446289, + "learning_rate": 4.981106850350515e-06, + "loss": 0.7378, + "step": 593 + }, + { + "epoch": 0.28085106382978725, + "grad_norm": 3.082073450088501, + "learning_rate": 4.981030224154693e-06, + "loss": 0.693, + "step": 594 + }, + { + "epoch": 0.28132387706855794, + "grad_norm": 2.902932643890381, + "learning_rate": 4.980953443475998e-06, + "loss": 0.6549, + "step": 595 + }, + { + "epoch": 0.2817966903073286, + "grad_norm": 2.6821181774139404, + "learning_rate": 4.980876508319211e-06, + "loss": 0.6231, + "step": 596 + }, + { + "epoch": 0.2822695035460993, + "grad_norm": 3.1747355461120605, + "learning_rate": 4.9807994186891215e-06, + "loss": 0.6826, + "step": 597 + }, + { + "epoch": 0.28274231678487, + "grad_norm": 2.6975860595703125, + "learning_rate": 4.980722174590531e-06, + "loss": 0.6669, + "step": 598 + }, + { + "epoch": 0.2832151300236407, + "grad_norm": 2.924285650253296, + "learning_rate": 4.9806447760282486e-06, + "loss": 0.689, + "step": 599 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 2.941417694091797, + "learning_rate": 4.980567223007093e-06, + "loss": 0.6672, + "step": 600 + }, + { + "epoch": 0.28416075650118205, + "grad_norm": 2.8582186698913574, + "learning_rate": 4.980489515531892e-06, + "loss": 0.6229, + "step": 601 + }, + { + "epoch": 0.28463356973995274, + "grad_norm": 2.6462013721466064, + "learning_rate": 4.9804116536074865e-06, + "loss": 0.606, + "step": 602 + }, + { + "epoch": 0.2851063829787234, + "grad_norm": 2.9029998779296875, + "learning_rate": 4.980333637238723e-06, + "loss": 0.5915, + "step": 603 + }, + { + "epoch": 0.2855791962174941, + "grad_norm": 3.9359042644500732, + "learning_rate": 4.980255466430462e-06, + "loss": 0.7035, + "step": 604 + }, + { + "epoch": 0.2860520094562648, + "grad_norm": 3.200524091720581, + "learning_rate": 4.980177141187566e-06, + "loss": 0.7156, + "step": 605 + }, + { + "epoch": 0.2865248226950355, + "grad_norm": 3.1708686351776123, + "learning_rate": 4.980098661514916e-06, + "loss": 0.746, + "step": 606 + }, + { + "epoch": 0.28699763593380617, + "grad_norm": 2.8926830291748047, + "learning_rate": 4.980020027417397e-06, + "loss": 0.6282, + "step": 607 + }, + { + "epoch": 0.28747044917257686, + "grad_norm": 3.0526294708251953, + "learning_rate": 4.979941238899906e-06, + "loss": 0.6594, + "step": 608 + }, + { + "epoch": 0.28794326241134754, + "grad_norm": 2.9869306087493896, + "learning_rate": 4.9798622959673486e-06, + "loss": 0.7771, + "step": 609 + }, + { + "epoch": 0.28841607565011823, + "grad_norm": 2.7894513607025146, + "learning_rate": 4.979783198624638e-06, + "loss": 0.6819, + "step": 610 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 2.958575963973999, + "learning_rate": 4.9797039468767025e-06, + "loss": 0.6474, + "step": 611 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 3.423748016357422, + "learning_rate": 4.979624540728475e-06, + "loss": 0.7389, + "step": 612 + }, + { + "epoch": 0.28983451536643023, + "grad_norm": 2.9641635417938232, + "learning_rate": 4.9795449801849e-06, + "loss": 0.6005, + "step": 613 + }, + { + "epoch": 0.2903073286052009, + "grad_norm": 3.02274227142334, + "learning_rate": 4.979465265250933e-06, + "loss": 0.6358, + "step": 614 + }, + { + "epoch": 0.2907801418439716, + "grad_norm": 3.0562758445739746, + "learning_rate": 4.979385395931534e-06, + 
"loss": 0.6313, + "step": 615 + }, + { + "epoch": 0.2912529550827423, + "grad_norm": 3.301816701889038, + "learning_rate": 4.97930537223168e-06, + "loss": 0.7264, + "step": 616 + }, + { + "epoch": 0.291725768321513, + "grad_norm": 2.975360870361328, + "learning_rate": 4.979225194156351e-06, + "loss": 0.613, + "step": 617 + }, + { + "epoch": 0.29219858156028367, + "grad_norm": 2.9245030879974365, + "learning_rate": 4.97914486171054e-06, + "loss": 0.6646, + "step": 618 + }, + { + "epoch": 0.29267139479905435, + "grad_norm": 3.1336188316345215, + "learning_rate": 4.979064374899249e-06, + "loss": 0.6421, + "step": 619 + }, + { + "epoch": 0.29314420803782504, + "grad_norm": 3.6298763751983643, + "learning_rate": 4.978983733727491e-06, + "loss": 0.6433, + "step": 620 + }, + { + "epoch": 0.2936170212765957, + "grad_norm": 2.919597625732422, + "learning_rate": 4.9789029382002845e-06, + "loss": 0.6288, + "step": 621 + }, + { + "epoch": 0.2940898345153664, + "grad_norm": 3.2206127643585205, + "learning_rate": 4.978821988322662e-06, + "loss": 0.7102, + "step": 622 + }, + { + "epoch": 0.2945626477541371, + "grad_norm": 3.1767101287841797, + "learning_rate": 4.978740884099664e-06, + "loss": 0.6722, + "step": 623 + }, + { + "epoch": 0.2950354609929078, + "grad_norm": 3.3425452709198, + "learning_rate": 4.97865962553634e-06, + "loss": 0.6492, + "step": 624 + }, + { + "epoch": 0.29550827423167847, + "grad_norm": 3.0408358573913574, + "learning_rate": 4.97857821263775e-06, + "loss": 0.6522, + "step": 625 + }, + { + "epoch": 0.29598108747044916, + "grad_norm": 2.8144783973693848, + "learning_rate": 4.978496645408963e-06, + "loss": 0.7237, + "step": 626 + }, + { + "epoch": 0.29645390070921984, + "grad_norm": 3.7010560035705566, + "learning_rate": 4.978414923855057e-06, + "loss": 0.7509, + "step": 627 + }, + { + "epoch": 0.29692671394799053, + "grad_norm": 2.9438371658325195, + "learning_rate": 4.978333047981122e-06, + "loss": 0.6244, + "step": 628 + }, + { + "epoch": 0.2973995271867612, + "grad_norm": 3.285982370376587, + "learning_rate": 4.978251017792255e-06, + "loss": 0.7553, + "step": 629 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 3.7021138668060303, + "learning_rate": 4.978168833293564e-06, + "loss": 0.7859, + "step": 630 + }, + { + "epoch": 0.2983451536643026, + "grad_norm": 3.481858730316162, + "learning_rate": 4.9780864944901654e-06, + "loss": 0.7146, + "step": 631 + }, + { + "epoch": 0.2988179669030733, + "grad_norm": 3.693824529647827, + "learning_rate": 4.978004001387188e-06, + "loss": 0.6608, + "step": 632 + }, + { + "epoch": 0.29929078014184396, + "grad_norm": 3.0069146156311035, + "learning_rate": 4.9779213539897665e-06, + "loss": 0.6506, + "step": 633 + }, + { + "epoch": 0.29976359338061465, + "grad_norm": 3.037644147872925, + "learning_rate": 4.977838552303048e-06, + "loss": 0.6487, + "step": 634 + }, + { + "epoch": 0.30023640661938533, + "grad_norm": 3.018554449081421, + "learning_rate": 4.977755596332188e-06, + "loss": 0.6128, + "step": 635 + }, + { + "epoch": 0.300709219858156, + "grad_norm": 3.000312089920044, + "learning_rate": 4.977672486082351e-06, + "loss": 0.6431, + "step": 636 + }, + { + "epoch": 0.3011820330969267, + "grad_norm": 2.836803913116455, + "learning_rate": 4.977589221558713e-06, + "loss": 0.5914, + "step": 637 + }, + { + "epoch": 0.3016548463356974, + "grad_norm": 3.080469846725464, + "learning_rate": 4.977505802766457e-06, + "loss": 0.7265, + "step": 638 + }, + { + "epoch": 0.3021276595744681, + "grad_norm": 3.2245471477508545, + "learning_rate": 
4.97742222971078e-06, + "loss": 0.6895, + "step": 639 + }, + { + "epoch": 0.30260047281323876, + "grad_norm": 3.559006452560425, + "learning_rate": 4.977338502396882e-06, + "loss": 0.7439, + "step": 640 + }, + { + "epoch": 0.30307328605200945, + "grad_norm": 2.9116289615631104, + "learning_rate": 4.9772546208299795e-06, + "loss": 0.6907, + "step": 641 + }, + { + "epoch": 0.30354609929078014, + "grad_norm": 3.3645524978637695, + "learning_rate": 4.977170585015295e-06, + "loss": 0.6983, + "step": 642 + }, + { + "epoch": 0.3040189125295508, + "grad_norm": 3.080148458480835, + "learning_rate": 4.977086394958058e-06, + "loss": 0.7016, + "step": 643 + }, + { + "epoch": 0.3044917257683215, + "grad_norm": 2.9276750087738037, + "learning_rate": 4.977002050663515e-06, + "loss": 0.6509, + "step": 644 + }, + { + "epoch": 0.3049645390070922, + "grad_norm": 3.183609962463379, + "learning_rate": 4.976917552136914e-06, + "loss": 0.6814, + "step": 645 + }, + { + "epoch": 0.3054373522458629, + "grad_norm": 3.0980000495910645, + "learning_rate": 4.976832899383519e-06, + "loss": 0.6319, + "step": 646 + }, + { + "epoch": 0.30591016548463357, + "grad_norm": 3.211376190185547, + "learning_rate": 4.9767480924086e-06, + "loss": 0.6365, + "step": 647 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 3.214430093765259, + "learning_rate": 4.976663131217437e-06, + "loss": 0.6006, + "step": 648 + }, + { + "epoch": 0.30685579196217494, + "grad_norm": 3.0914318561553955, + "learning_rate": 4.976578015815321e-06, + "loss": 0.7162, + "step": 649 + }, + { + "epoch": 0.3073286052009456, + "grad_norm": 2.7644500732421875, + "learning_rate": 4.976492746207551e-06, + "loss": 0.6045, + "step": 650 + }, + { + "epoch": 0.3078014184397163, + "grad_norm": 3.1913280487060547, + "learning_rate": 4.9764073223994374e-06, + "loss": 0.6796, + "step": 651 + }, + { + "epoch": 0.308274231678487, + "grad_norm": 2.8919692039489746, + "learning_rate": 4.976321744396299e-06, + "loss": 0.6683, + "step": 652 + }, + { + "epoch": 0.3087470449172577, + "grad_norm": 2.862234115600586, + "learning_rate": 4.976236012203463e-06, + "loss": 0.6631, + "step": 653 + }, + { + "epoch": 0.30921985815602837, + "grad_norm": 2.9708092212677, + "learning_rate": 4.976150125826268e-06, + "loss": 0.6326, + "step": 654 + }, + { + "epoch": 0.30969267139479906, + "grad_norm": 2.892465353012085, + "learning_rate": 4.976064085270063e-06, + "loss": 0.6574, + "step": 655 + }, + { + "epoch": 0.31016548463356974, + "grad_norm": 3.9215126037597656, + "learning_rate": 4.975977890540205e-06, + "loss": 0.7351, + "step": 656 + }, + { + "epoch": 0.31063829787234043, + "grad_norm": 2.9544081687927246, + "learning_rate": 4.975891541642059e-06, + "loss": 0.7264, + "step": 657 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 2.995035409927368, + "learning_rate": 4.975805038581005e-06, + "loss": 0.7405, + "step": 658 + }, + { + "epoch": 0.3115839243498818, + "grad_norm": 2.9653120040893555, + "learning_rate": 4.975718381362427e-06, + "loss": 0.679, + "step": 659 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 2.93976092338562, + "learning_rate": 4.9756315699917205e-06, + "loss": 0.627, + "step": 660 + }, + { + "epoch": 0.3125295508274232, + "grad_norm": 3.106522560119629, + "learning_rate": 4.9755446044742915e-06, + "loss": 0.6329, + "step": 661 + }, + { + "epoch": 0.31300236406619386, + "grad_norm": 3.0238280296325684, + "learning_rate": 4.975457484815554e-06, + "loss": 0.6643, + "step": 662 + }, + { + "epoch": 0.31347517730496455, + "grad_norm": 2.943528175354004, + 
"learning_rate": 4.9753702110209356e-06, + "loss": 0.668, + "step": 663 + }, + { + "epoch": 0.31394799054373523, + "grad_norm": 2.6840121746063232, + "learning_rate": 4.9752827830958676e-06, + "loss": 0.5482, + "step": 664 + }, + { + "epoch": 0.3144208037825059, + "grad_norm": 2.823875904083252, + "learning_rate": 4.975195201045794e-06, + "loss": 0.7017, + "step": 665 + }, + { + "epoch": 0.3148936170212766, + "grad_norm": 3.148181200027466, + "learning_rate": 4.975107464876168e-06, + "loss": 0.747, + "step": 666 + }, + { + "epoch": 0.3153664302600473, + "grad_norm": 2.630584478378296, + "learning_rate": 4.9750195745924545e-06, + "loss": 0.5987, + "step": 667 + }, + { + "epoch": 0.315839243498818, + "grad_norm": 3.075866460800171, + "learning_rate": 4.974931530200124e-06, + "loss": 0.664, + "step": 668 + }, + { + "epoch": 0.31631205673758866, + "grad_norm": 2.947197914123535, + "learning_rate": 4.974843331704659e-06, + "loss": 0.631, + "step": 669 + }, + { + "epoch": 0.31678486997635935, + "grad_norm": 3.519646644592285, + "learning_rate": 4.974754979111552e-06, + "loss": 0.7154, + "step": 670 + }, + { + "epoch": 0.31725768321513004, + "grad_norm": 2.8687186241149902, + "learning_rate": 4.974666472426305e-06, + "loss": 0.6366, + "step": 671 + }, + { + "epoch": 0.3177304964539007, + "grad_norm": 2.6966612339019775, + "learning_rate": 4.974577811654426e-06, + "loss": 0.7112, + "step": 672 + }, + { + "epoch": 0.3182033096926714, + "grad_norm": 3.1390228271484375, + "learning_rate": 4.974488996801439e-06, + "loss": 0.6882, + "step": 673 + }, + { + "epoch": 0.3186761229314421, + "grad_norm": 3.4667599201202393, + "learning_rate": 4.974400027872871e-06, + "loss": 0.7153, + "step": 674 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 2.9632184505462646, + "learning_rate": 4.974310904874265e-06, + "loss": 0.7081, + "step": 675 + }, + { + "epoch": 0.31962174940898347, + "grad_norm": 3.46150279045105, + "learning_rate": 4.9742216278111666e-06, + "loss": 0.6242, + "step": 676 + }, + { + "epoch": 0.32009456264775416, + "grad_norm": 3.380403757095337, + "learning_rate": 4.974132196689137e-06, + "loss": 0.6863, + "step": 677 + }, + { + "epoch": 0.32056737588652484, + "grad_norm": 3.4279606342315674, + "learning_rate": 4.974042611513746e-06, + "loss": 0.6388, + "step": 678 + }, + { + "epoch": 0.3210401891252955, + "grad_norm": 2.634523391723633, + "learning_rate": 4.973952872290568e-06, + "loss": 0.6038, + "step": 679 + }, + { + "epoch": 0.3215130023640662, + "grad_norm": 3.19693922996521, + "learning_rate": 4.973862979025194e-06, + "loss": 0.6383, + "step": 680 + }, + { + "epoch": 0.3219858156028369, + "grad_norm": 3.437692165374756, + "learning_rate": 4.973772931723218e-06, + "loss": 0.7288, + "step": 681 + }, + { + "epoch": 0.3224586288416076, + "grad_norm": 2.506301164627075, + "learning_rate": 4.97368273039025e-06, + "loss": 0.5707, + "step": 682 + }, + { + "epoch": 0.3229314420803783, + "grad_norm": 3.0942845344543457, + "learning_rate": 4.9735923750319044e-06, + "loss": 0.6348, + "step": 683 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 3.0889835357666016, + "learning_rate": 4.973501865653809e-06, + "loss": 0.6697, + "step": 684 + }, + { + "epoch": 0.32387706855791965, + "grad_norm": 3.0391931533813477, + "learning_rate": 4.973411202261598e-06, + "loss": 0.7091, + "step": 685 + }, + { + "epoch": 0.32434988179669033, + "grad_norm": 3.0333497524261475, + "learning_rate": 4.973320384860917e-06, + "loss": 0.6403, + "step": 686 + }, + { + "epoch": 0.324822695035461, + "grad_norm": 
2.9714622497558594, + "learning_rate": 4.973229413457421e-06, + "loss": 0.6977, + "step": 687 + }, + { + "epoch": 0.3252955082742317, + "grad_norm": 3.057558298110962, + "learning_rate": 4.973138288056774e-06, + "loss": 0.7236, + "step": 688 + }, + { + "epoch": 0.3257683215130024, + "grad_norm": 2.921093463897705, + "learning_rate": 4.97304700866465e-06, + "loss": 0.576, + "step": 689 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 3.0287256240844727, + "learning_rate": 4.972955575286732e-06, + "loss": 0.7077, + "step": 690 + }, + { + "epoch": 0.32671394799054376, + "grad_norm": 2.8621346950531006, + "learning_rate": 4.972863987928716e-06, + "loss": 0.6952, + "step": 691 + }, + { + "epoch": 0.3271867612293144, + "grad_norm": 2.631359100341797, + "learning_rate": 4.9727722465963006e-06, + "loss": 0.6931, + "step": 692 + }, + { + "epoch": 0.3276595744680851, + "grad_norm": 2.8484320640563965, + "learning_rate": 4.972680351295201e-06, + "loss": 0.6292, + "step": 693 + }, + { + "epoch": 0.32813238770685577, + "grad_norm": 2.593001365661621, + "learning_rate": 4.972588302031138e-06, + "loss": 0.5942, + "step": 694 + }, + { + "epoch": 0.32860520094562645, + "grad_norm": 2.6321065425872803, + "learning_rate": 4.972496098809844e-06, + "loss": 0.65, + "step": 695 + }, + { + "epoch": 0.32907801418439714, + "grad_norm": 3.2516732215881348, + "learning_rate": 4.972403741637059e-06, + "loss": 0.7385, + "step": 696 + }, + { + "epoch": 0.3295508274231678, + "grad_norm": 3.180854320526123, + "learning_rate": 4.972311230518535e-06, + "loss": 0.6569, + "step": 697 + }, + { + "epoch": 0.3300236406619385, + "grad_norm": 4.161016941070557, + "learning_rate": 4.972218565460031e-06, + "loss": 0.6416, + "step": 698 + }, + { + "epoch": 0.3304964539007092, + "grad_norm": 3.153897762298584, + "learning_rate": 4.972125746467317e-06, + "loss": 0.7196, + "step": 699 + }, + { + "epoch": 0.3309692671394799, + "grad_norm": 2.9595556259155273, + "learning_rate": 4.972032773546173e-06, + "loss": 0.7093, + "step": 700 + }, + { + "epoch": 0.33144208037825057, + "grad_norm": 3.1086833477020264, + "learning_rate": 4.9719396467023875e-06, + "loss": 0.6963, + "step": 701 + }, + { + "epoch": 0.33191489361702126, + "grad_norm": 2.958921432495117, + "learning_rate": 4.971846365941759e-06, + "loss": 0.6518, + "step": 702 + }, + { + "epoch": 0.33238770685579194, + "grad_norm": 2.8745479583740234, + "learning_rate": 4.971752931270096e-06, + "loss": 0.696, + "step": 703 + }, + { + "epoch": 0.33286052009456263, + "grad_norm": 3.224358558654785, + "learning_rate": 4.971659342693217e-06, + "loss": 0.6769, + "step": 704 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.696319580078125, + "learning_rate": 4.9715656002169486e-06, + "loss": 0.6833, + "step": 705 + }, + { + "epoch": 0.333806146572104, + "grad_norm": 2.9283502101898193, + "learning_rate": 4.971471703847127e-06, + "loss": 0.6784, + "step": 706 + }, + { + "epoch": 0.3342789598108747, + "grad_norm": 2.654914140701294, + "learning_rate": 4.9713776535896e-06, + "loss": 0.6337, + "step": 707 + }, + { + "epoch": 0.3347517730496454, + "grad_norm": 3.041555643081665, + "learning_rate": 4.971283449450224e-06, + "loss": 0.6227, + "step": 708 + }, + { + "epoch": 0.33522458628841606, + "grad_norm": 2.893008232116699, + "learning_rate": 4.971189091434863e-06, + "loss": 0.655, + "step": 709 + }, + { + "epoch": 0.33569739952718675, + "grad_norm": 2.8806653022766113, + "learning_rate": 4.971094579549393e-06, + "loss": 0.7077, + "step": 710 + }, + { + "epoch": 0.33617021276595743, + 
"grad_norm": 3.4830048084259033, + "learning_rate": 4.9709999137996986e-06, + "loss": 0.7461, + "step": 711 + }, + { + "epoch": 0.3366430260047281, + "grad_norm": 3.155444860458374, + "learning_rate": 4.970905094191674e-06, + "loss": 0.652, + "step": 712 + }, + { + "epoch": 0.3371158392434988, + "grad_norm": 2.7608706951141357, + "learning_rate": 4.970810120731225e-06, + "loss": 0.684, + "step": 713 + }, + { + "epoch": 0.3375886524822695, + "grad_norm": 2.8209474086761475, + "learning_rate": 4.970714993424265e-06, + "loss": 0.6009, + "step": 714 + }, + { + "epoch": 0.3380614657210402, + "grad_norm": 3.6532654762268066, + "learning_rate": 4.9706197122767145e-06, + "loss": 0.702, + "step": 715 + }, + { + "epoch": 0.33853427895981086, + "grad_norm": 2.6276566982269287, + "learning_rate": 4.970524277294508e-06, + "loss": 0.6338, + "step": 716 + }, + { + "epoch": 0.33900709219858155, + "grad_norm": 3.509871482849121, + "learning_rate": 4.970428688483589e-06, + "loss": 0.6853, + "step": 717 + }, + { + "epoch": 0.33947990543735224, + "grad_norm": 5.332682132720947, + "learning_rate": 4.970332945849906e-06, + "loss": 0.6684, + "step": 718 + }, + { + "epoch": 0.3399527186761229, + "grad_norm": 2.718801975250244, + "learning_rate": 4.970237049399424e-06, + "loss": 0.6676, + "step": 719 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 3.891003131866455, + "learning_rate": 4.970140999138112e-06, + "loss": 0.7043, + "step": 720 + }, + { + "epoch": 0.3408983451536643, + "grad_norm": 2.8863155841827393, + "learning_rate": 4.970044795071951e-06, + "loss": 0.6563, + "step": 721 + }, + { + "epoch": 0.341371158392435, + "grad_norm": 3.2527518272399902, + "learning_rate": 4.969948437206932e-06, + "loss": 0.7244, + "step": 722 + }, + { + "epoch": 0.34184397163120567, + "grad_norm": 2.9726758003234863, + "learning_rate": 4.969851925549054e-06, + "loss": 0.6548, + "step": 723 + }, + { + "epoch": 0.34231678486997635, + "grad_norm": 3.118309497833252, + "learning_rate": 4.969755260104327e-06, + "loss": 0.7293, + "step": 724 + }, + { + "epoch": 0.34278959810874704, + "grad_norm": 3.373068332672119, + "learning_rate": 4.969658440878769e-06, + "loss": 0.6444, + "step": 725 + }, + { + "epoch": 0.3432624113475177, + "grad_norm": 2.7157437801361084, + "learning_rate": 4.969561467878409e-06, + "loss": 0.642, + "step": 726 + }, + { + "epoch": 0.3437352245862884, + "grad_norm": 2.58929705619812, + "learning_rate": 4.969464341109285e-06, + "loss": 0.6165, + "step": 727 + }, + { + "epoch": 0.3442080378250591, + "grad_norm": 2.8811306953430176, + "learning_rate": 4.969367060577445e-06, + "loss": 0.7127, + "step": 728 + }, + { + "epoch": 0.3446808510638298, + "grad_norm": 3.494358539581299, + "learning_rate": 4.969269626288946e-06, + "loss": 0.7103, + "step": 729 + }, + { + "epoch": 0.34515366430260047, + "grad_norm": 2.9753928184509277, + "learning_rate": 4.969172038249855e-06, + "loss": 0.6911, + "step": 730 + }, + { + "epoch": 0.34562647754137116, + "grad_norm": 3.2885913848876953, + "learning_rate": 4.969074296466247e-06, + "loss": 0.6968, + "step": 731 + }, + { + "epoch": 0.34609929078014184, + "grad_norm": 2.7564568519592285, + "learning_rate": 4.968976400944211e-06, + "loss": 0.6843, + "step": 732 + }, + { + "epoch": 0.34657210401891253, + "grad_norm": 2.9255006313323975, + "learning_rate": 4.96887835168984e-06, + "loss": 0.6024, + "step": 733 + }, + { + "epoch": 0.3470449172576832, + "grad_norm": 3.1808290481567383, + "learning_rate": 4.968780148709239e-06, + "loss": 0.7377, + "step": 734 + }, + { + "epoch": 
0.3475177304964539, + "grad_norm": 2.956666946411133, + "learning_rate": 4.968681792008523e-06, + "loss": 0.65, + "step": 735 + }, + { + "epoch": 0.3479905437352246, + "grad_norm": 2.9631855487823486, + "learning_rate": 4.9685832815938175e-06, + "loss": 0.677, + "step": 736 + }, + { + "epoch": 0.3484633569739953, + "grad_norm": 2.501917600631714, + "learning_rate": 4.968484617471256e-06, + "loss": 0.6282, + "step": 737 + }, + { + "epoch": 0.34893617021276596, + "grad_norm": 2.750779628753662, + "learning_rate": 4.968385799646981e-06, + "loss": 0.6507, + "step": 738 + }, + { + "epoch": 0.34940898345153665, + "grad_norm": 2.872300624847412, + "learning_rate": 4.968286828127146e-06, + "loss": 0.5949, + "step": 739 + }, + { + "epoch": 0.34988179669030733, + "grad_norm": 2.6316142082214355, + "learning_rate": 4.9681877029179124e-06, + "loss": 0.6328, + "step": 740 + }, + { + "epoch": 0.350354609929078, + "grad_norm": 3.244364023208618, + "learning_rate": 4.968088424025454e-06, + "loss": 0.7393, + "step": 741 + }, + { + "epoch": 0.3508274231678487, + "grad_norm": 2.620465040206909, + "learning_rate": 4.967988991455951e-06, + "loss": 0.6797, + "step": 742 + }, + { + "epoch": 0.3513002364066194, + "grad_norm": 2.854513645172119, + "learning_rate": 4.967889405215596e-06, + "loss": 0.6368, + "step": 743 + }, + { + "epoch": 0.3517730496453901, + "grad_norm": 2.579854726791382, + "learning_rate": 4.9677896653105886e-06, + "loss": 0.6489, + "step": 744 + }, + { + "epoch": 0.35224586288416077, + "grad_norm": 3.0697381496429443, + "learning_rate": 4.96768977174714e-06, + "loss": 0.6313, + "step": 745 + }, + { + "epoch": 0.35271867612293145, + "grad_norm": 3.369338035583496, + "learning_rate": 4.96758972453147e-06, + "loss": 0.7416, + "step": 746 + }, + { + "epoch": 0.35319148936170214, + "grad_norm": 2.836221933364868, + "learning_rate": 4.967489523669807e-06, + "loss": 0.6422, + "step": 747 + }, + { + "epoch": 0.3536643026004728, + "grad_norm": 2.929579496383667, + "learning_rate": 4.967389169168392e-06, + "loss": 0.6482, + "step": 748 + }, + { + "epoch": 0.3541371158392435, + "grad_norm": 2.9243831634521484, + "learning_rate": 4.967288661033472e-06, + "loss": 0.5813, + "step": 749 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 3.7555336952209473, + "learning_rate": 4.967187999271306e-06, + "loss": 0.6501, + "step": 750 + }, + { + "epoch": 0.3550827423167849, + "grad_norm": 3.4279143810272217, + "learning_rate": 4.9670871838881615e-06, + "loss": 0.6326, + "step": 751 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 2.875066041946411, + "learning_rate": 4.9669862148903166e-06, + "loss": 0.664, + "step": 752 + }, + { + "epoch": 0.35602836879432626, + "grad_norm": 3.130394697189331, + "learning_rate": 4.966885092284057e-06, + "loss": 0.706, + "step": 753 + }, + { + "epoch": 0.35650118203309694, + "grad_norm": 2.9606287479400635, + "learning_rate": 4.96678381607568e-06, + "loss": 0.693, + "step": 754 + }, + { + "epoch": 0.35697399527186763, + "grad_norm": 3.0584909915924072, + "learning_rate": 4.966682386271491e-06, + "loss": 0.6034, + "step": 755 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 2.8215200901031494, + "learning_rate": 4.966580802877805e-06, + "loss": 0.6217, + "step": 756 + }, + { + "epoch": 0.357919621749409, + "grad_norm": 2.7348055839538574, + "learning_rate": 4.966479065900949e-06, + "loss": 0.6194, + "step": 757 + }, + { + "epoch": 0.3583924349881797, + "grad_norm": 3.2347466945648193, + "learning_rate": 4.966377175347257e-06, + "loss": 0.6377, + "step": 758 + }, + 
{ + "epoch": 0.3588652482269504, + "grad_norm": 3.311845302581787, + "learning_rate": 4.966275131223072e-06, + "loss": 0.6234, + "step": 759 + }, + { + "epoch": 0.35933806146572106, + "grad_norm": 3.0384368896484375, + "learning_rate": 4.96617293353475e-06, + "loss": 0.609, + "step": 760 + }, + { + "epoch": 0.35981087470449175, + "grad_norm": 3.516854763031006, + "learning_rate": 4.966070582288653e-06, + "loss": 0.6627, + "step": 761 + }, + { + "epoch": 0.36028368794326243, + "grad_norm": 3.2425215244293213, + "learning_rate": 4.9659680774911534e-06, + "loss": 0.7355, + "step": 762 + }, + { + "epoch": 0.3607565011820331, + "grad_norm": 3.2665750980377197, + "learning_rate": 4.965865419148636e-06, + "loss": 0.6787, + "step": 763 + }, + { + "epoch": 0.3612293144208038, + "grad_norm": 2.729428291320801, + "learning_rate": 4.96576260726749e-06, + "loss": 0.6272, + "step": 764 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 3.299969434738159, + "learning_rate": 4.965659641854119e-06, + "loss": 0.6552, + "step": 765 + }, + { + "epoch": 0.3621749408983452, + "grad_norm": 2.7090916633605957, + "learning_rate": 4.965556522914934e-06, + "loss": 0.6661, + "step": 766 + }, + { + "epoch": 0.36264775413711586, + "grad_norm": 2.488846778869629, + "learning_rate": 4.965453250456355e-06, + "loss": 0.5821, + "step": 767 + }, + { + "epoch": 0.36312056737588655, + "grad_norm": 2.5267233848571777, + "learning_rate": 4.965349824484813e-06, + "loss": 0.5593, + "step": 768 + }, + { + "epoch": 0.36359338061465724, + "grad_norm": 3.0646679401397705, + "learning_rate": 4.965246245006748e-06, + "loss": 0.6341, + "step": 769 + }, + { + "epoch": 0.3640661938534279, + "grad_norm": 2.9877712726593018, + "learning_rate": 4.965142512028609e-06, + "loss": 0.7202, + "step": 770 + }, + { + "epoch": 0.3645390070921986, + "grad_norm": 3.7494113445281982, + "learning_rate": 4.965038625556854e-06, + "loss": 0.7643, + "step": 771 + }, + { + "epoch": 0.3650118203309693, + "grad_norm": 2.8382890224456787, + "learning_rate": 4.964934585597954e-06, + "loss": 0.6522, + "step": 772 + }, + { + "epoch": 0.3654846335697399, + "grad_norm": 3.091655731201172, + "learning_rate": 4.9648303921583854e-06, + "loss": 0.7117, + "step": 773 + }, + { + "epoch": 0.3659574468085106, + "grad_norm": 3.0608325004577637, + "learning_rate": 4.964726045244635e-06, + "loss": 0.6538, + "step": 774 + }, + { + "epoch": 0.3664302600472813, + "grad_norm": 2.8492867946624756, + "learning_rate": 4.964621544863203e-06, + "loss": 0.6079, + "step": 775 + }, + { + "epoch": 0.366903073286052, + "grad_norm": 3.0669894218444824, + "learning_rate": 4.964516891020594e-06, + "loss": 0.6223, + "step": 776 + }, + { + "epoch": 0.36737588652482267, + "grad_norm": 3.089984893798828, + "learning_rate": 4.964412083723325e-06, + "loss": 0.671, + "step": 777 + }, + { + "epoch": 0.36784869976359336, + "grad_norm": 2.905242443084717, + "learning_rate": 4.964307122977921e-06, + "loss": 0.62, + "step": 778 + }, + { + "epoch": 0.36832151300236404, + "grad_norm": 3.954436779022217, + "learning_rate": 4.964202008790918e-06, + "loss": 0.6535, + "step": 779 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 2.6026058197021484, + "learning_rate": 4.9640967411688615e-06, + "loss": 0.5865, + "step": 780 + }, + { + "epoch": 0.3692671394799054, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.963991320118306e-06, + "loss": 0.6698, + "step": 781 + }, + { + "epoch": 0.3697399527186761, + "grad_norm": 2.9411263465881348, + "learning_rate": 4.963885745645815e-06, + "loss": 0.6173, + 
"step": 782 + }, + { + "epoch": 0.3702127659574468, + "grad_norm": 2.5679805278778076, + "learning_rate": 4.963780017757962e-06, + "loss": 0.6285, + "step": 783 + }, + { + "epoch": 0.3706855791962175, + "grad_norm": 3.3100640773773193, + "learning_rate": 4.963674136461332e-06, + "loss": 0.5968, + "step": 784 + }, + { + "epoch": 0.37115839243498816, + "grad_norm": 3.1293699741363525, + "learning_rate": 4.963568101762515e-06, + "loss": 0.697, + "step": 785 + }, + { + "epoch": 0.37163120567375885, + "grad_norm": 3.043853759765625, + "learning_rate": 4.963461913668115e-06, + "loss": 0.5881, + "step": 786 + }, + { + "epoch": 0.37210401891252953, + "grad_norm": 3.07351016998291, + "learning_rate": 4.963355572184744e-06, + "loss": 0.6307, + "step": 787 + }, + { + "epoch": 0.3725768321513002, + "grad_norm": 2.7381317615509033, + "learning_rate": 4.9632490773190225e-06, + "loss": 0.716, + "step": 788 + }, + { + "epoch": 0.3730496453900709, + "grad_norm": 2.892221450805664, + "learning_rate": 4.963142429077582e-06, + "loss": 0.6867, + "step": 789 + }, + { + "epoch": 0.3735224586288416, + "grad_norm": 3.133122205734253, + "learning_rate": 4.963035627467064e-06, + "loss": 0.659, + "step": 790 + }, + { + "epoch": 0.3739952718676123, + "grad_norm": 3.032599925994873, + "learning_rate": 4.962928672494116e-06, + "loss": 0.6848, + "step": 791 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 3.0076355934143066, + "learning_rate": 4.9628215641654e-06, + "loss": 0.6549, + "step": 792 + }, + { + "epoch": 0.37494089834515365, + "grad_norm": 2.8904454708099365, + "learning_rate": 4.962714302487585e-06, + "loss": 0.6484, + "step": 793 + }, + { + "epoch": 0.37541371158392434, + "grad_norm": 2.881364107131958, + "learning_rate": 4.9626068874673486e-06, + "loss": 0.721, + "step": 794 + }, + { + "epoch": 0.375886524822695, + "grad_norm": 3.11668062210083, + "learning_rate": 4.962499319111379e-06, + "loss": 0.7824, + "step": 795 + }, + { + "epoch": 0.3763593380614657, + "grad_norm": 2.9201436042785645, + "learning_rate": 4.962391597426374e-06, + "loss": 0.6911, + "step": 796 + }, + { + "epoch": 0.3768321513002364, + "grad_norm": 2.926598072052002, + "learning_rate": 4.962283722419043e-06, + "loss": 0.6715, + "step": 797 + }, + { + "epoch": 0.3773049645390071, + "grad_norm": 2.7267675399780273, + "learning_rate": 4.962175694096101e-06, + "loss": 0.6111, + "step": 798 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 3.194031000137329, + "learning_rate": 4.962067512464275e-06, + "loss": 0.6558, + "step": 799 + }, + { + "epoch": 0.37825059101654845, + "grad_norm": 2.6249136924743652, + "learning_rate": 4.9619591775303e-06, + "loss": 0.6166, + "step": 800 + }, + { + "epoch": 0.37872340425531914, + "grad_norm": 2.6356167793273926, + "learning_rate": 4.961850689300923e-06, + "loss": 0.6112, + "step": 801 + }, + { + "epoch": 0.3791962174940898, + "grad_norm": 3.030724287033081, + "learning_rate": 4.961742047782898e-06, + "loss": 0.6511, + "step": 802 + }, + { + "epoch": 0.3796690307328605, + "grad_norm": 3.4987757205963135, + "learning_rate": 4.96163325298299e-06, + "loss": 0.5888, + "step": 803 + }, + { + "epoch": 0.3801418439716312, + "grad_norm": 3.0371780395507812, + "learning_rate": 4.961524304907974e-06, + "loss": 0.6385, + "step": 804 + }, + { + "epoch": 0.3806146572104019, + "grad_norm": 3.302570104598999, + "learning_rate": 4.961415203564632e-06, + "loss": 0.6515, + "step": 805 + }, + { + "epoch": 0.38108747044917257, + "grad_norm": 2.7597038745880127, + "learning_rate": 4.961305948959759e-06, + "loss": 
0.6126, + "step": 806 + }, + { + "epoch": 0.38156028368794326, + "grad_norm": 2.789811849594116, + "learning_rate": 4.9611965411001575e-06, + "loss": 0.6601, + "step": 807 + }, + { + "epoch": 0.38203309692671394, + "grad_norm": 3.0403921604156494, + "learning_rate": 4.961086979992639e-06, + "loss": 0.6947, + "step": 808 + }, + { + "epoch": 0.38250591016548463, + "grad_norm": 3.2139980792999268, + "learning_rate": 4.960977265644026e-06, + "loss": 0.6876, + "step": 809 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.918515205383301, + "learning_rate": 4.960867398061149e-06, + "loss": 0.5997, + "step": 810 + }, + { + "epoch": 0.383451536643026, + "grad_norm": 3.197636604309082, + "learning_rate": 4.9607573772508495e-06, + "loss": 0.5754, + "step": 811 + }, + { + "epoch": 0.3839243498817967, + "grad_norm": 2.8848466873168945, + "learning_rate": 4.960647203219979e-06, + "loss": 0.6424, + "step": 812 + }, + { + "epoch": 0.3843971631205674, + "grad_norm": 3.4810187816619873, + "learning_rate": 4.960536875975397e-06, + "loss": 0.6851, + "step": 813 + }, + { + "epoch": 0.38486997635933806, + "grad_norm": 3.713934898376465, + "learning_rate": 4.960426395523972e-06, + "loss": 0.6122, + "step": 814 + }, + { + "epoch": 0.38534278959810875, + "grad_norm": 2.862600803375244, + "learning_rate": 4.960315761872585e-06, + "loss": 0.6493, + "step": 815 + }, + { + "epoch": 0.38581560283687943, + "grad_norm": 3.133882522583008, + "learning_rate": 4.960204975028123e-06, + "loss": 0.7535, + "step": 816 + }, + { + "epoch": 0.3862884160756501, + "grad_norm": 3.1526732444763184, + "learning_rate": 4.960094034997485e-06, + "loss": 0.6512, + "step": 817 + }, + { + "epoch": 0.3867612293144208, + "grad_norm": 2.7213544845581055, + "learning_rate": 4.959982941787579e-06, + "loss": 0.6121, + "step": 818 + }, + { + "epoch": 0.3872340425531915, + "grad_norm": 3.4935851097106934, + "learning_rate": 4.9598716954053214e-06, + "loss": 0.7852, + "step": 819 + }, + { + "epoch": 0.3877068557919622, + "grad_norm": 2.691016435623169, + "learning_rate": 4.9597602958576395e-06, + "loss": 0.6861, + "step": 820 + }, + { + "epoch": 0.38817966903073287, + "grad_norm": 2.8621015548706055, + "learning_rate": 4.959648743151469e-06, + "loss": 0.6262, + "step": 821 + }, + { + "epoch": 0.38865248226950355, + "grad_norm": 3.3887462615966797, + "learning_rate": 4.959537037293758e-06, + "loss": 0.7103, + "step": 822 + }, + { + "epoch": 0.38912529550827424, + "grad_norm": 2.7565438747406006, + "learning_rate": 4.95942517829146e-06, + "loss": 0.6471, + "step": 823 + }, + { + "epoch": 0.3895981087470449, + "grad_norm": 2.7920358180999756, + "learning_rate": 4.959313166151541e-06, + "loss": 0.6239, + "step": 824 + }, + { + "epoch": 0.3900709219858156, + "grad_norm": 3.18904185295105, + "learning_rate": 4.959201000880973e-06, + "loss": 0.7461, + "step": 825 + }, + { + "epoch": 0.3905437352245863, + "grad_norm": 2.727872371673584, + "learning_rate": 4.959088682486743e-06, + "loss": 0.6333, + "step": 826 + }, + { + "epoch": 0.391016548463357, + "grad_norm": 2.906378746032715, + "learning_rate": 4.958976210975844e-06, + "loss": 0.7547, + "step": 827 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 2.96482515335083, + "learning_rate": 4.958863586355278e-06, + "loss": 0.6312, + "step": 828 + }, + { + "epoch": 0.39196217494089836, + "grad_norm": 3.2890889644622803, + "learning_rate": 4.958750808632059e-06, + "loss": 0.6943, + "step": 829 + }, + { + "epoch": 0.39243498817966904, + "grad_norm": 2.7004311084747314, + "learning_rate": 
4.958637877813207e-06, + "loss": 0.5918, + "step": 830 + }, + { + "epoch": 0.39290780141843973, + "grad_norm": 2.7487950325012207, + "learning_rate": 4.9585247939057566e-06, + "loss": 0.6201, + "step": 831 + }, + { + "epoch": 0.3933806146572104, + "grad_norm": 2.7873897552490234, + "learning_rate": 4.958411556916747e-06, + "loss": 0.6268, + "step": 832 + }, + { + "epoch": 0.3938534278959811, + "grad_norm": 2.8501343727111816, + "learning_rate": 4.958298166853229e-06, + "loss": 0.7119, + "step": 833 + }, + { + "epoch": 0.3943262411347518, + "grad_norm": 3.0391547679901123, + "learning_rate": 4.958184623722265e-06, + "loss": 0.6375, + "step": 834 + }, + { + "epoch": 0.3947990543735225, + "grad_norm": 2.850520133972168, + "learning_rate": 4.958070927530922e-06, + "loss": 0.5962, + "step": 835 + }, + { + "epoch": 0.39527186761229316, + "grad_norm": 3.351914644241333, + "learning_rate": 4.957957078286281e-06, + "loss": 0.7247, + "step": 836 + }, + { + "epoch": 0.39574468085106385, + "grad_norm": 2.9559543132781982, + "learning_rate": 4.957843075995431e-06, + "loss": 0.6571, + "step": 837 + }, + { + "epoch": 0.39621749408983453, + "grad_norm": 3.225785255432129, + "learning_rate": 4.95772892066547e-06, + "loss": 0.7074, + "step": 838 + }, + { + "epoch": 0.3966903073286052, + "grad_norm": 2.7842373847961426, + "learning_rate": 4.957614612303505e-06, + "loss": 0.6469, + "step": 839 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 4.249724864959717, + "learning_rate": 4.957500150916655e-06, + "loss": 0.741, + "step": 840 + }, + { + "epoch": 0.3976359338061466, + "grad_norm": 3.138221263885498, + "learning_rate": 4.957385536512046e-06, + "loss": 0.6676, + "step": 841 + }, + { + "epoch": 0.3981087470449173, + "grad_norm": 3.456423759460449, + "learning_rate": 4.957270769096816e-06, + "loss": 0.6877, + "step": 842 + }, + { + "epoch": 0.39858156028368796, + "grad_norm": 2.8676278591156006, + "learning_rate": 4.957155848678109e-06, + "loss": 0.5986, + "step": 843 + }, + { + "epoch": 0.39905437352245865, + "grad_norm": 2.705324411392212, + "learning_rate": 4.957040775263082e-06, + "loss": 0.6356, + "step": 844 + }, + { + "epoch": 0.39952718676122934, + "grad_norm": 3.0767486095428467, + "learning_rate": 4.9569255488589e-06, + "loss": 0.6844, + "step": 845 + }, + { + "epoch": 0.4, + "grad_norm": 2.7787704467773438, + "learning_rate": 4.956810169472736e-06, + "loss": 0.6641, + "step": 846 + }, + { + "epoch": 0.4004728132387707, + "grad_norm": 2.584277868270874, + "learning_rate": 4.956694637111777e-06, + "loss": 0.6256, + "step": 847 + }, + { + "epoch": 0.4009456264775414, + "grad_norm": 2.751641273498535, + "learning_rate": 4.956578951783215e-06, + "loss": 0.5954, + "step": 848 + }, + { + "epoch": 0.4014184397163121, + "grad_norm": 3.0181658267974854, + "learning_rate": 4.956463113494253e-06, + "loss": 0.6569, + "step": 849 + }, + { + "epoch": 0.40189125295508277, + "grad_norm": 3.0933220386505127, + "learning_rate": 4.956347122252104e-06, + "loss": 0.6248, + "step": 850 + }, + { + "epoch": 0.40236406619385345, + "grad_norm": 3.3767428398132324, + "learning_rate": 4.956230978063991e-06, + "loss": 0.719, + "step": 851 + }, + { + "epoch": 0.40283687943262414, + "grad_norm": 3.7666573524475098, + "learning_rate": 4.956114680937145e-06, + "loss": 0.6467, + "step": 852 + }, + { + "epoch": 0.4033096926713948, + "grad_norm": 2.9836843013763428, + "learning_rate": 4.955998230878808e-06, + "loss": 0.6993, + "step": 853 + }, + { + "epoch": 0.4037825059101655, + "grad_norm": 2.981497049331665, + 
"learning_rate": 4.955881627896229e-06, + "loss": 0.6578, + "step": 854 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 3.1369056701660156, + "learning_rate": 4.955764871996672e-06, + "loss": 0.6763, + "step": 855 + }, + { + "epoch": 0.40472813238770683, + "grad_norm": 2.7675817012786865, + "learning_rate": 4.9556479631874036e-06, + "loss": 0.6488, + "step": 856 + }, + { + "epoch": 0.4052009456264775, + "grad_norm": 3.035334825515747, + "learning_rate": 4.9555309014757034e-06, + "loss": 0.7076, + "step": 857 + }, + { + "epoch": 0.4056737588652482, + "grad_norm": 3.493704319000244, + "learning_rate": 4.955413686868862e-06, + "loss": 0.6773, + "step": 858 + }, + { + "epoch": 0.4061465721040189, + "grad_norm": 3.245487928390503, + "learning_rate": 4.9552963193741765e-06, + "loss": 0.6915, + "step": 859 + }, + { + "epoch": 0.4066193853427896, + "grad_norm": 3.189969539642334, + "learning_rate": 4.955178798998956e-06, + "loss": 0.7318, + "step": 860 + }, + { + "epoch": 0.40709219858156026, + "grad_norm": 2.7987146377563477, + "learning_rate": 4.955061125750517e-06, + "loss": 0.6162, + "step": 861 + }, + { + "epoch": 0.40756501182033095, + "grad_norm": 3.020118474960327, + "learning_rate": 4.954943299636187e-06, + "loss": 0.6678, + "step": 862 + }, + { + "epoch": 0.40803782505910163, + "grad_norm": 2.715463876724243, + "learning_rate": 4.954825320663302e-06, + "loss": 0.668, + "step": 863 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 2.595050096511841, + "learning_rate": 4.9547071888392085e-06, + "loss": 0.6557, + "step": 864 + }, + { + "epoch": 0.408983451536643, + "grad_norm": 3.131596088409424, + "learning_rate": 4.954588904171261e-06, + "loss": 0.6548, + "step": 865 + }, + { + "epoch": 0.4094562647754137, + "grad_norm": 2.5742313861846924, + "learning_rate": 4.954470466666827e-06, + "loss": 0.6592, + "step": 866 + }, + { + "epoch": 0.4099290780141844, + "grad_norm": 2.8612802028656006, + "learning_rate": 4.9543518763332785e-06, + "loss": 0.5391, + "step": 867 + }, + { + "epoch": 0.41040189125295506, + "grad_norm": 2.8973186016082764, + "learning_rate": 4.954233133178001e-06, + "loss": 0.6649, + "step": 868 + }, + { + "epoch": 0.41087470449172575, + "grad_norm": 2.802525043487549, + "learning_rate": 4.954114237208388e-06, + "loss": 0.6212, + "step": 869 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 2.5919506549835205, + "learning_rate": 4.953995188431843e-06, + "loss": 0.6596, + "step": 870 + }, + { + "epoch": 0.4118203309692671, + "grad_norm": 3.139169454574585, + "learning_rate": 4.953875986855777e-06, + "loss": 0.6799, + "step": 871 + }, + { + "epoch": 0.4122931442080378, + "grad_norm": 3.99727725982666, + "learning_rate": 4.953756632487614e-06, + "loss": 0.6519, + "step": 872 + }, + { + "epoch": 0.4127659574468085, + "grad_norm": 3.238706350326538, + "learning_rate": 4.953637125334784e-06, + "loss": 0.7361, + "step": 873 + }, + { + "epoch": 0.4132387706855792, + "grad_norm": 2.780019998550415, + "learning_rate": 4.9535174654047295e-06, + "loss": 0.6406, + "step": 874 + }, + { + "epoch": 0.41371158392434987, + "grad_norm": 2.7629551887512207, + "learning_rate": 4.953397652704901e-06, + "loss": 0.6131, + "step": 875 + }, + { + "epoch": 0.41418439716312055, + "grad_norm": 2.8008246421813965, + "learning_rate": 4.9532776872427585e-06, + "loss": 0.6464, + "step": 876 + }, + { + "epoch": 0.41465721040189124, + "grad_norm": 3.0970115661621094, + "learning_rate": 4.953157569025772e-06, + "loss": 0.7066, + "step": 877 + }, + { + "epoch": 0.4151300236406619, + 
"grad_norm": 2.8375589847564697, + "learning_rate": 4.9530372980614195e-06, + "loss": 0.6551, + "step": 878 + }, + { + "epoch": 0.4156028368794326, + "grad_norm": 2.718843936920166, + "learning_rate": 4.952916874357191e-06, + "loss": 0.5947, + "step": 879 + }, + { + "epoch": 0.4160756501182033, + "grad_norm": 2.7104697227478027, + "learning_rate": 4.952796297920585e-06, + "loss": 0.6708, + "step": 880 + }, + { + "epoch": 0.416548463356974, + "grad_norm": 2.8223445415496826, + "learning_rate": 4.952675568759108e-06, + "loss": 0.6214, + "step": 881 + }, + { + "epoch": 0.41702127659574467, + "grad_norm": 2.6598153114318848, + "learning_rate": 4.952554686880279e-06, + "loss": 0.6116, + "step": 882 + }, + { + "epoch": 0.41749408983451536, + "grad_norm": 2.8639824390411377, + "learning_rate": 4.952433652291623e-06, + "loss": 0.5971, + "step": 883 + }, + { + "epoch": 0.41796690307328604, + "grad_norm": 2.9578304290771484, + "learning_rate": 4.952312465000677e-06, + "loss": 0.6785, + "step": 884 + }, + { + "epoch": 0.41843971631205673, + "grad_norm": 2.872144937515259, + "learning_rate": 4.952191125014987e-06, + "loss": 0.6772, + "step": 885 + }, + { + "epoch": 0.4189125295508274, + "grad_norm": 2.7513675689697266, + "learning_rate": 4.952069632342108e-06, + "loss": 0.702, + "step": 886 + }, + { + "epoch": 0.4193853427895981, + "grad_norm": 2.9275078773498535, + "learning_rate": 4.951947986989606e-06, + "loss": 0.589, + "step": 887 + }, + { + "epoch": 0.4198581560283688, + "grad_norm": 2.740549325942993, + "learning_rate": 4.951826188965053e-06, + "loss": 0.5942, + "step": 888 + }, + { + "epoch": 0.4203309692671395, + "grad_norm": 2.92452073097229, + "learning_rate": 4.951704238276035e-06, + "loss": 0.6819, + "step": 889 + }, + { + "epoch": 0.42080378250591016, + "grad_norm": 2.842491865158081, + "learning_rate": 4.951582134930144e-06, + "loss": 0.6304, + "step": 890 + }, + { + "epoch": 0.42127659574468085, + "grad_norm": 2.613478422164917, + "learning_rate": 4.951459878934983e-06, + "loss": 0.6912, + "step": 891 + }, + { + "epoch": 0.42174940898345153, + "grad_norm": 3.2408607006073, + "learning_rate": 4.951337470298165e-06, + "loss": 0.6755, + "step": 892 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 3.1022439002990723, + "learning_rate": 4.9512149090273125e-06, + "loss": 0.6138, + "step": 893 + }, + { + "epoch": 0.4226950354609929, + "grad_norm": 2.6418895721435547, + "learning_rate": 4.951092195130055e-06, + "loss": 0.639, + "step": 894 + }, + { + "epoch": 0.4231678486997636, + "grad_norm": 3.010744333267212, + "learning_rate": 4.950969328614035e-06, + "loss": 0.7102, + "step": 895 + }, + { + "epoch": 0.4236406619385343, + "grad_norm": 2.673292636871338, + "learning_rate": 4.950846309486901e-06, + "loss": 0.5676, + "step": 896 + }, + { + "epoch": 0.42411347517730497, + "grad_norm": 3.6974737644195557, + "learning_rate": 4.950723137756314e-06, + "loss": 0.5722, + "step": 897 + }, + { + "epoch": 0.42458628841607565, + "grad_norm": 3.69028902053833, + "learning_rate": 4.9505998134299435e-06, + "loss": 0.6337, + "step": 898 + }, + { + "epoch": 0.42505910165484634, + "grad_norm": 3.2136125564575195, + "learning_rate": 4.950476336515469e-06, + "loss": 0.6469, + "step": 899 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.7396016120910645, + "learning_rate": 4.950352707020577e-06, + "loss": 0.6656, + "step": 900 + }, + { + "epoch": 0.4260047281323877, + "grad_norm": 2.825416088104248, + "learning_rate": 4.950228924952967e-06, + "loss": 0.6298, + "step": 901 + }, + { + "epoch": 
0.4264775413711584, + "grad_norm": 3.401658535003662, + "learning_rate": 4.950104990320345e-06, + "loss": 0.778, + "step": 902 + }, + { + "epoch": 0.4269503546099291, + "grad_norm": 2.7002272605895996, + "learning_rate": 4.9499809031304294e-06, + "loss": 0.6536, + "step": 903 + }, + { + "epoch": 0.42742316784869977, + "grad_norm": 2.62386417388916, + "learning_rate": 4.949856663390945e-06, + "loss": 0.6629, + "step": 904 + }, + { + "epoch": 0.42789598108747046, + "grad_norm": 2.584247589111328, + "learning_rate": 4.94973227110963e-06, + "loss": 0.5813, + "step": 905 + }, + { + "epoch": 0.42836879432624114, + "grad_norm": 3.4365768432617188, + "learning_rate": 4.9496077262942265e-06, + "loss": 0.7648, + "step": 906 + }, + { + "epoch": 0.42884160756501183, + "grad_norm": 2.8993639945983887, + "learning_rate": 4.949483028952492e-06, + "loss": 0.6696, + "step": 907 + }, + { + "epoch": 0.4293144208037825, + "grad_norm": 2.922809362411499, + "learning_rate": 4.94935817909219e-06, + "loss": 0.6892, + "step": 908 + }, + { + "epoch": 0.4297872340425532, + "grad_norm": 2.85478138923645, + "learning_rate": 4.9492331767210944e-06, + "loss": 0.536, + "step": 909 + }, + { + "epoch": 0.4302600472813239, + "grad_norm": 2.8639259338378906, + "learning_rate": 4.949108021846988e-06, + "loss": 0.634, + "step": 910 + }, + { + "epoch": 0.4307328605200946, + "grad_norm": 3.0533697605133057, + "learning_rate": 4.948982714477664e-06, + "loss": 0.6318, + "step": 911 + }, + { + "epoch": 0.43120567375886526, + "grad_norm": 2.331674814224243, + "learning_rate": 4.9488572546209255e-06, + "loss": 0.6562, + "step": 912 + }, + { + "epoch": 0.43167848699763595, + "grad_norm": 3.0154623985290527, + "learning_rate": 4.9487316422845835e-06, + "loss": 0.6675, + "step": 913 + }, + { + "epoch": 0.43215130023640663, + "grad_norm": 2.7354514598846436, + "learning_rate": 4.948605877476459e-06, + "loss": 0.6012, + "step": 914 + }, + { + "epoch": 0.4326241134751773, + "grad_norm": 2.863736629486084, + "learning_rate": 4.948479960204383e-06, + "loss": 0.6062, + "step": 915 + }, + { + "epoch": 0.433096926713948, + "grad_norm": 3.01998233795166, + "learning_rate": 4.948353890476197e-06, + "loss": 0.6749, + "step": 916 + }, + { + "epoch": 0.4335697399527187, + "grad_norm": 2.7550456523895264, + "learning_rate": 4.94822766829975e-06, + "loss": 0.6507, + "step": 917 + }, + { + "epoch": 0.4340425531914894, + "grad_norm": 3.370572805404663, + "learning_rate": 4.948101293682901e-06, + "loss": 0.714, + "step": 918 + }, + { + "epoch": 0.43451536643026006, + "grad_norm": 2.9736790657043457, + "learning_rate": 4.947974766633519e-06, + "loss": 0.729, + "step": 919 + }, + { + "epoch": 0.43498817966903075, + "grad_norm": 3.1036548614501953, + "learning_rate": 4.947848087159483e-06, + "loss": 0.7547, + "step": 920 + }, + { + "epoch": 0.43546099290780144, + "grad_norm": 2.895094871520996, + "learning_rate": 4.947721255268679e-06, + "loss": 0.6089, + "step": 921 + }, + { + "epoch": 0.4359338061465721, + "grad_norm": 2.798476219177246, + "learning_rate": 4.947594270969005e-06, + "loss": 0.5432, + "step": 922 + }, + { + "epoch": 0.4364066193853428, + "grad_norm": 2.7675702571868896, + "learning_rate": 4.94746713426837e-06, + "loss": 0.5693, + "step": 923 + }, + { + "epoch": 0.4368794326241135, + "grad_norm": 2.6851553916931152, + "learning_rate": 4.947339845174687e-06, + "loss": 0.6503, + "step": 924 + }, + { + "epoch": 0.4373522458628842, + "grad_norm": 2.909635543823242, + "learning_rate": 4.947212403695883e-06, + "loss": 0.6494, + "step": 925 + }, + { 
+ "epoch": 0.43782505910165487, + "grad_norm": 2.604526996612549, + "learning_rate": 4.947084809839894e-06, + "loss": 0.6349, + "step": 926 + }, + { + "epoch": 0.43829787234042555, + "grad_norm": 3.118149518966675, + "learning_rate": 4.946957063614664e-06, + "loss": 0.6219, + "step": 927 + }, + { + "epoch": 0.43877068557919624, + "grad_norm": 2.7452616691589355, + "learning_rate": 4.9468291650281465e-06, + "loss": 0.6096, + "step": 928 + }, + { + "epoch": 0.4392434988179669, + "grad_norm": 3.30098819732666, + "learning_rate": 4.946701114088307e-06, + "loss": 0.6277, + "step": 929 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 2.789482593536377, + "learning_rate": 4.946572910803116e-06, + "loss": 0.7, + "step": 930 + }, + { + "epoch": 0.4401891252955083, + "grad_norm": 2.7283935546875, + "learning_rate": 4.946444555180559e-06, + "loss": 0.5375, + "step": 931 + }, + { + "epoch": 0.440661938534279, + "grad_norm": 3.101304054260254, + "learning_rate": 4.946316047228627e-06, + "loss": 0.6131, + "step": 932 + }, + { + "epoch": 0.44113475177304967, + "grad_norm": 3.573908805847168, + "learning_rate": 4.946187386955321e-06, + "loss": 0.7073, + "step": 933 + }, + { + "epoch": 0.44160756501182036, + "grad_norm": 3.214979648590088, + "learning_rate": 4.946058574368653e-06, + "loss": 0.6508, + "step": 934 + }, + { + "epoch": 0.44208037825059104, + "grad_norm": 3.145082712173462, + "learning_rate": 4.945929609476643e-06, + "loss": 0.64, + "step": 935 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 2.991780996322632, + "learning_rate": 4.945800492287321e-06, + "loss": 0.6315, + "step": 936 + }, + { + "epoch": 0.44302600472813236, + "grad_norm": 3.2441139221191406, + "learning_rate": 4.945671222808727e-06, + "loss": 0.7144, + "step": 937 + }, + { + "epoch": 0.44349881796690305, + "grad_norm": 2.9397029876708984, + "learning_rate": 4.94554180104891e-06, + "loss": 0.6818, + "step": 938 + }, + { + "epoch": 0.44397163120567373, + "grad_norm": 3.2471461296081543, + "learning_rate": 4.945412227015929e-06, + "loss": 0.6921, + "step": 939 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 3.0882487297058105, + "learning_rate": 4.945282500717851e-06, + "loss": 0.718, + "step": 940 + }, + { + "epoch": 0.4449172576832151, + "grad_norm": 2.6035783290863037, + "learning_rate": 4.945152622162753e-06, + "loss": 0.621, + "step": 941 + }, + { + "epoch": 0.4453900709219858, + "grad_norm": 2.83659029006958, + "learning_rate": 4.945022591358724e-06, + "loss": 0.6403, + "step": 942 + }, + { + "epoch": 0.4458628841607565, + "grad_norm": 2.824463129043579, + "learning_rate": 4.944892408313859e-06, + "loss": 0.6594, + "step": 943 + }, + { + "epoch": 0.44633569739952716, + "grad_norm": 2.753735065460205, + "learning_rate": 4.9447620730362645e-06, + "loss": 0.6116, + "step": 944 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 3.0659725666046143, + "learning_rate": 4.944631585534056e-06, + "loss": 0.5983, + "step": 945 + }, + { + "epoch": 0.44728132387706854, + "grad_norm": 2.969113349914551, + "learning_rate": 4.944500945815357e-06, + "loss": 0.6859, + "step": 946 + }, + { + "epoch": 0.4477541371158392, + "grad_norm": 2.810303211212158, + "learning_rate": 4.944370153888303e-06, + "loss": 0.7025, + "step": 947 + }, + { + "epoch": 0.4482269503546099, + "grad_norm": 3.027721643447876, + "learning_rate": 4.944239209761038e-06, + "loss": 0.7268, + "step": 948 + }, + { + "epoch": 0.4486997635933806, + "grad_norm": 2.661503314971924, + "learning_rate": 4.944108113441716e-06, + "loss": 0.6702, + "step": 949 + 
}, + { + "epoch": 0.4491725768321513, + "grad_norm": 2.738591432571411, + "learning_rate": 4.943976864938498e-06, + "loss": 0.6728, + "step": 950 + }, + { + "epoch": 0.44964539007092197, + "grad_norm": 3.447505474090576, + "learning_rate": 4.943845464259557e-06, + "loss": 0.6586, + "step": 951 + }, + { + "epoch": 0.45011820330969265, + "grad_norm": 3.0968854427337646, + "learning_rate": 4.943713911413075e-06, + "loss": 0.7666, + "step": 952 + }, + { + "epoch": 0.45059101654846334, + "grad_norm": 2.4113779067993164, + "learning_rate": 4.943582206407244e-06, + "loss": 0.6173, + "step": 953 + }, + { + "epoch": 0.451063829787234, + "grad_norm": 2.6357979774475098, + "learning_rate": 4.943450349250263e-06, + "loss": 0.5589, + "step": 954 + }, + { + "epoch": 0.4515366430260047, + "grad_norm": 2.9182233810424805, + "learning_rate": 4.9433183399503425e-06, + "loss": 0.6252, + "step": 955 + }, + { + "epoch": 0.4520094562647754, + "grad_norm": 2.832740306854248, + "learning_rate": 4.943186178515703e-06, + "loss": 0.6882, + "step": 956 + }, + { + "epoch": 0.4524822695035461, + "grad_norm": 2.9508981704711914, + "learning_rate": 4.943053864954574e-06, + "loss": 0.5722, + "step": 957 + }, + { + "epoch": 0.4529550827423168, + "grad_norm": 3.044729471206665, + "learning_rate": 4.9429213992751925e-06, + "loss": 0.6772, + "step": 958 + }, + { + "epoch": 0.45342789598108746, + "grad_norm": 2.606003522872925, + "learning_rate": 4.9427887814858075e-06, + "loss": 0.6445, + "step": 959 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 2.4634225368499756, + "learning_rate": 4.942656011594676e-06, + "loss": 0.6151, + "step": 960 + }, + { + "epoch": 0.45437352245862883, + "grad_norm": 2.8872334957122803, + "learning_rate": 4.942523089610066e-06, + "loss": 0.6255, + "step": 961 + }, + { + "epoch": 0.4548463356973995, + "grad_norm": 2.870605707168579, + "learning_rate": 4.942390015540253e-06, + "loss": 0.7481, + "step": 962 + }, + { + "epoch": 0.4553191489361702, + "grad_norm": 2.952680826187134, + "learning_rate": 4.942256789393524e-06, + "loss": 0.5556, + "step": 963 + }, + { + "epoch": 0.4557919621749409, + "grad_norm": 2.623680353164673, + "learning_rate": 4.9421234111781725e-06, + "loss": 0.6115, + "step": 964 + }, + { + "epoch": 0.4562647754137116, + "grad_norm": 2.6933600902557373, + "learning_rate": 4.941989880902505e-06, + "loss": 0.6102, + "step": 965 + }, + { + "epoch": 0.45673758865248226, + "grad_norm": 2.6047189235687256, + "learning_rate": 4.941856198574836e-06, + "loss": 0.612, + "step": 966 + }, + { + "epoch": 0.45721040189125295, + "grad_norm": 2.779186725616455, + "learning_rate": 4.9417223642034885e-06, + "loss": 0.5424, + "step": 967 + }, + { + "epoch": 0.45768321513002364, + "grad_norm": 2.6177165508270264, + "learning_rate": 4.941588377796795e-06, + "loss": 0.4661, + "step": 968 + }, + { + "epoch": 0.4581560283687943, + "grad_norm": 2.959676742553711, + "learning_rate": 4.941454239363101e-06, + "loss": 0.6966, + "step": 969 + }, + { + "epoch": 0.458628841607565, + "grad_norm": 2.9788379669189453, + "learning_rate": 4.941319948910756e-06, + "loss": 0.6181, + "step": 970 + }, + { + "epoch": 0.4591016548463357, + "grad_norm": 4.642750263214111, + "learning_rate": 4.941185506448122e-06, + "loss": 0.5602, + "step": 971 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 2.793002128601074, + "learning_rate": 4.941050911983572e-06, + "loss": 0.602, + "step": 972 + }, + { + "epoch": 0.46004728132387707, + "grad_norm": 2.6833035945892334, + "learning_rate": 4.9409161655254845e-06, + "loss": 
0.5549, + "step": 973 + }, + { + "epoch": 0.46052009456264775, + "grad_norm": 3.905032157897949, + "learning_rate": 4.94078126708225e-06, + "loss": 0.6335, + "step": 974 + }, + { + "epoch": 0.46099290780141844, + "grad_norm": 2.922609329223633, + "learning_rate": 4.94064621666227e-06, + "loss": 0.5839, + "step": 975 + }, + { + "epoch": 0.4614657210401891, + "grad_norm": 2.8277416229248047, + "learning_rate": 4.940511014273952e-06, + "loss": 0.629, + "step": 976 + }, + { + "epoch": 0.4619385342789598, + "grad_norm": 3.07511043548584, + "learning_rate": 4.940375659925714e-06, + "loss": 0.7058, + "step": 977 + }, + { + "epoch": 0.4624113475177305, + "grad_norm": 3.65043044090271, + "learning_rate": 4.940240153625984e-06, + "loss": 0.7174, + "step": 978 + }, + { + "epoch": 0.4628841607565012, + "grad_norm": 2.755167245864868, + "learning_rate": 4.9401044953832e-06, + "loss": 0.6548, + "step": 979 + }, + { + "epoch": 0.46335697399527187, + "grad_norm": 2.9881057739257812, + "learning_rate": 4.939968685205808e-06, + "loss": 0.6245, + "step": 980 + }, + { + "epoch": 0.46382978723404256, + "grad_norm": 2.9484212398529053, + "learning_rate": 4.939832723102266e-06, + "loss": 0.655, + "step": 981 + }, + { + "epoch": 0.46430260047281324, + "grad_norm": 2.898918628692627, + "learning_rate": 4.939696609081038e-06, + "loss": 0.6178, + "step": 982 + }, + { + "epoch": 0.46477541371158393, + "grad_norm": 2.7052435874938965, + "learning_rate": 4.9395603431506e-06, + "loss": 0.6393, + "step": 983 + }, + { + "epoch": 0.4652482269503546, + "grad_norm": 2.5610013008117676, + "learning_rate": 4.939423925319436e-06, + "loss": 0.4847, + "step": 984 + }, + { + "epoch": 0.4657210401891253, + "grad_norm": 3.229083299636841, + "learning_rate": 4.939287355596042e-06, + "loss": 0.6473, + "step": 985 + }, + { + "epoch": 0.466193853427896, + "grad_norm": 2.907097816467285, + "learning_rate": 4.9391506339889195e-06, + "loss": 0.652, + "step": 986 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 2.6929478645324707, + "learning_rate": 4.939013760506582e-06, + "loss": 0.6175, + "step": 987 + }, + { + "epoch": 0.46713947990543736, + "grad_norm": 3.414813280105591, + "learning_rate": 4.938876735157554e-06, + "loss": 0.7597, + "step": 988 + }, + { + "epoch": 0.46761229314420805, + "grad_norm": 3.297360420227051, + "learning_rate": 4.938739557950365e-06, + "loss": 0.6824, + "step": 989 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 3.083155393600464, + "learning_rate": 4.938602228893557e-06, + "loss": 0.6505, + "step": 990 + }, + { + "epoch": 0.4685579196217494, + "grad_norm": 2.9781153202056885, + "learning_rate": 4.938464747995681e-06, + "loss": 0.666, + "step": 991 + }, + { + "epoch": 0.4690307328605201, + "grad_norm": 3.1494534015655518, + "learning_rate": 4.9383271152652975e-06, + "loss": 0.6422, + "step": 992 + }, + { + "epoch": 0.4695035460992908, + "grad_norm": 2.547868490219116, + "learning_rate": 4.938189330710976e-06, + "loss": 0.5766, + "step": 993 + }, + { + "epoch": 0.4699763593380615, + "grad_norm": 2.684736967086792, + "learning_rate": 4.938051394341297e-06, + "loss": 0.6407, + "step": 994 + }, + { + "epoch": 0.47044917257683216, + "grad_norm": 2.9619693756103516, + "learning_rate": 4.937913306164847e-06, + "loss": 0.6936, + "step": 995 + }, + { + "epoch": 0.47092198581560285, + "grad_norm": 2.9698498249053955, + "learning_rate": 4.937775066190227e-06, + "loss": 0.6464, + "step": 996 + }, + { + "epoch": 0.47139479905437354, + "grad_norm": 3.121049642562866, + "learning_rate": 4.937636674426042e-06, + 
"loss": 0.6383, + "step": 997 + }, + { + "epoch": 0.4718676122931442, + "grad_norm": 3.113672971725464, + "learning_rate": 4.93749813088091e-06, + "loss": 0.6892, + "step": 998 + }, + { + "epoch": 0.4723404255319149, + "grad_norm": 3.126113176345825, + "learning_rate": 4.937359435563458e-06, + "loss": 0.6728, + "step": 999 + }, + { + "epoch": 0.4728132387706856, + "grad_norm": 3.353966236114502, + "learning_rate": 4.937220588482321e-06, + "loss": 0.6041, + "step": 1000 + }, + { + "epoch": 0.4732860520094563, + "grad_norm": 2.8860628604888916, + "learning_rate": 4.937081589646144e-06, + "loss": 0.6798, + "step": 1001 + }, + { + "epoch": 0.47375886524822697, + "grad_norm": 3.0510590076446533, + "learning_rate": 4.936942439063584e-06, + "loss": 0.5841, + "step": 1002 + }, + { + "epoch": 0.47423167848699765, + "grad_norm": 2.6998369693756104, + "learning_rate": 4.936803136743303e-06, + "loss": 0.6403, + "step": 1003 + }, + { + "epoch": 0.47470449172576834, + "grad_norm": 2.875347137451172, + "learning_rate": 4.9366636826939765e-06, + "loss": 0.5811, + "step": 1004 + }, + { + "epoch": 0.475177304964539, + "grad_norm": 2.9122262001037598, + "learning_rate": 4.936524076924287e-06, + "loss": 0.6852, + "step": 1005 + }, + { + "epoch": 0.4756501182033097, + "grad_norm": 2.5167057514190674, + "learning_rate": 4.9363843194429265e-06, + "loss": 0.5367, + "step": 1006 + }, + { + "epoch": 0.4761229314420804, + "grad_norm": 2.5745551586151123, + "learning_rate": 4.9362444102585985e-06, + "loss": 0.6241, + "step": 1007 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 2.5024216175079346, + "learning_rate": 4.9361043493800125e-06, + "loss": 0.6133, + "step": 1008 + }, + { + "epoch": 0.47706855791962177, + "grad_norm": 2.7281384468078613, + "learning_rate": 4.935964136815892e-06, + "loss": 0.6834, + "step": 1009 + }, + { + "epoch": 0.47754137115839246, + "grad_norm": 3.0118913650512695, + "learning_rate": 4.935823772574965e-06, + "loss": 0.6922, + "step": 1010 + }, + { + "epoch": 0.47801418439716314, + "grad_norm": 3.016216993331909, + "learning_rate": 4.935683256665973e-06, + "loss": 0.6653, + "step": 1011 + }, + { + "epoch": 0.47848699763593383, + "grad_norm": 2.9526784420013428, + "learning_rate": 4.9355425890976636e-06, + "loss": 0.6423, + "step": 1012 + }, + { + "epoch": 0.4789598108747045, + "grad_norm": 6.222797393798828, + "learning_rate": 4.9354017698787985e-06, + "loss": 0.5884, + "step": 1013 + }, + { + "epoch": 0.4794326241134752, + "grad_norm": 2.6553597450256348, + "learning_rate": 4.935260799018143e-06, + "loss": 0.6624, + "step": 1014 + }, + { + "epoch": 0.4799054373522459, + "grad_norm": 3.0942065715789795, + "learning_rate": 4.935119676524475e-06, + "loss": 0.6623, + "step": 1015 + }, + { + "epoch": 0.4803782505910166, + "grad_norm": 2.626359224319458, + "learning_rate": 4.934978402406585e-06, + "loss": 0.6195, + "step": 1016 + }, + { + "epoch": 0.4808510638297872, + "grad_norm": 2.7954699993133545, + "learning_rate": 4.934836976673265e-06, + "loss": 0.5545, + "step": 1017 + }, + { + "epoch": 0.4813238770685579, + "grad_norm": 2.913557291030884, + "learning_rate": 4.934695399333324e-06, + "loss": 0.6288, + "step": 1018 + }, + { + "epoch": 0.4817966903073286, + "grad_norm": 3.1043739318847656, + "learning_rate": 4.9345536703955746e-06, + "loss": 0.6771, + "step": 1019 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 2.789357900619507, + "learning_rate": 4.934411789868845e-06, + "loss": 0.6227, + "step": 1020 + }, + { + "epoch": 0.48274231678486995, + "grad_norm": 
2.480609655380249, + "learning_rate": 4.934269757761967e-06, + "loss": 0.5779, + "step": 1021 + }, + { + "epoch": 0.48321513002364064, + "grad_norm": 2.7946252822875977, + "learning_rate": 4.934127574083785e-06, + "loss": 0.6166, + "step": 1022 + }, + { + "epoch": 0.4836879432624113, + "grad_norm": 3.0670509338378906, + "learning_rate": 4.933985238843153e-06, + "loss": 0.7766, + "step": 1023 + }, + { + "epoch": 0.484160756501182, + "grad_norm": 2.8567559719085693, + "learning_rate": 4.933842752048932e-06, + "loss": 0.5088, + "step": 1024 + }, + { + "epoch": 0.4846335697399527, + "grad_norm": 2.5674657821655273, + "learning_rate": 4.933700113709996e-06, + "loss": 0.6036, + "step": 1025 + }, + { + "epoch": 0.4851063829787234, + "grad_norm": 2.782339096069336, + "learning_rate": 4.933557323835224e-06, + "loss": 0.5335, + "step": 1026 + }, + { + "epoch": 0.48557919621749407, + "grad_norm": 2.6334071159362793, + "learning_rate": 4.93341438243351e-06, + "loss": 0.6327, + "step": 1027 + }, + { + "epoch": 0.48605200945626476, + "grad_norm": 3.0853965282440186, + "learning_rate": 4.933271289513751e-06, + "loss": 0.7102, + "step": 1028 + }, + { + "epoch": 0.48652482269503544, + "grad_norm": 2.619997501373291, + "learning_rate": 4.933128045084859e-06, + "loss": 0.6138, + "step": 1029 + }, + { + "epoch": 0.48699763593380613, + "grad_norm": 2.8316116333007812, + "learning_rate": 4.932984649155753e-06, + "loss": 0.6346, + "step": 1030 + }, + { + "epoch": 0.4874704491725768, + "grad_norm": 3.153486490249634, + "learning_rate": 4.932841101735361e-06, + "loss": 0.7626, + "step": 1031 + }, + { + "epoch": 0.4879432624113475, + "grad_norm": 3.1831274032592773, + "learning_rate": 4.9326974028326214e-06, + "loss": 0.6607, + "step": 1032 + }, + { + "epoch": 0.4884160756501182, + "grad_norm": 2.791078567504883, + "learning_rate": 4.932553552456481e-06, + "loss": 0.6141, + "step": 1033 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 2.627263307571411, + "learning_rate": 4.932409550615898e-06, + "loss": 0.6777, + "step": 1034 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.8550007343292236, + "learning_rate": 4.932265397319838e-06, + "loss": 0.6379, + "step": 1035 + }, + { + "epoch": 0.48983451536643025, + "grad_norm": 4.505824089050293, + "learning_rate": 4.932121092577276e-06, + "loss": 0.5892, + "step": 1036 + }, + { + "epoch": 0.49030732860520093, + "grad_norm": 3.100191116333008, + "learning_rate": 4.931976636397199e-06, + "loss": 0.6443, + "step": 1037 + }, + { + "epoch": 0.4907801418439716, + "grad_norm": 2.921494245529175, + "learning_rate": 4.9318320287886e-06, + "loss": 0.6821, + "step": 1038 + }, + { + "epoch": 0.4912529550827423, + "grad_norm": 4.577807903289795, + "learning_rate": 4.931687269760485e-06, + "loss": 0.5946, + "step": 1039 + }, + { + "epoch": 0.491725768321513, + "grad_norm": 2.7347636222839355, + "learning_rate": 4.931542359321865e-06, + "loss": 0.5689, + "step": 1040 + }, + { + "epoch": 0.4921985815602837, + "grad_norm": 2.5289158821105957, + "learning_rate": 4.931397297481765e-06, + "loss": 0.5632, + "step": 1041 + }, + { + "epoch": 0.49267139479905436, + "grad_norm": 3.3518471717834473, + "learning_rate": 4.9312520842492165e-06, + "loss": 0.6349, + "step": 1042 + }, + { + "epoch": 0.49314420803782505, + "grad_norm": 3.0469748973846436, + "learning_rate": 4.931106719633261e-06, + "loss": 0.5734, + "step": 1043 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 3.104682445526123, + "learning_rate": 4.930961203642951e-06, + "loss": 0.6101, + "step": 1044 + }, + { + 
"epoch": 0.4940898345153664, + "grad_norm": 2.776705503463745, + "learning_rate": 4.930815536287346e-06, + "loss": 0.6397, + "step": 1045 + }, + { + "epoch": 0.4945626477541371, + "grad_norm": 2.760380983352661, + "learning_rate": 4.930669717575516e-06, + "loss": 0.668, + "step": 1046 + }, + { + "epoch": 0.4950354609929078, + "grad_norm": 2.70084547996521, + "learning_rate": 4.930523747516541e-06, + "loss": 0.5729, + "step": 1047 + }, + { + "epoch": 0.4955082742316785, + "grad_norm": 2.7319583892822266, + "learning_rate": 4.930377626119511e-06, + "loss": 0.6258, + "step": 1048 + }, + { + "epoch": 0.49598108747044917, + "grad_norm": 3.2515223026275635, + "learning_rate": 4.930231353393521e-06, + "loss": 0.7412, + "step": 1049 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 3.0646486282348633, + "learning_rate": 4.930084929347682e-06, + "loss": 0.5809, + "step": 1050 + }, + { + "epoch": 0.49692671394799054, + "grad_norm": 3.1621921062469482, + "learning_rate": 4.9299383539911096e-06, + "loss": 0.6282, + "step": 1051 + }, + { + "epoch": 0.4973995271867612, + "grad_norm": 2.864713191986084, + "learning_rate": 4.929791627332931e-06, + "loss": 0.6263, + "step": 1052 + }, + { + "epoch": 0.4978723404255319, + "grad_norm": 3.181016683578491, + "learning_rate": 4.929644749382283e-06, + "loss": 0.5697, + "step": 1053 + }, + { + "epoch": 0.4983451536643026, + "grad_norm": 2.9064836502075195, + "learning_rate": 4.929497720148309e-06, + "loss": 0.6161, + "step": 1054 + }, + { + "epoch": 0.4988179669030733, + "grad_norm": 3.058112859725952, + "learning_rate": 4.9293505396401655e-06, + "loss": 0.6477, + "step": 1055 + }, + { + "epoch": 0.49929078014184397, + "grad_norm": 2.5227596759796143, + "learning_rate": 4.929203207867016e-06, + "loss": 0.5819, + "step": 1056 + }, + { + "epoch": 0.49976359338061466, + "grad_norm": 3.386862277984619, + "learning_rate": 4.929055724838035e-06, + "loss": 0.7342, + "step": 1057 + }, + { + "epoch": 0.5002364066193853, + "grad_norm": 3.368346929550171, + "learning_rate": 4.928908090562404e-06, + "loss": 0.6622, + "step": 1058 + }, + { + "epoch": 0.500709219858156, + "grad_norm": 2.9108314514160156, + "learning_rate": 4.928760305049317e-06, + "loss": 0.6598, + "step": 1059 + }, + { + "epoch": 0.5011820330969267, + "grad_norm": 2.822305917739868, + "learning_rate": 4.928612368307977e-06, + "loss": 0.5841, + "step": 1060 + }, + { + "epoch": 0.5016548463356973, + "grad_norm": 2.689131259918213, + "learning_rate": 4.928464280347592e-06, + "loss": 0.6631, + "step": 1061 + }, + { + "epoch": 0.502127659574468, + "grad_norm": 3.337214946746826, + "learning_rate": 4.9283160411773864e-06, + "loss": 0.6105, + "step": 1062 + }, + { + "epoch": 0.5026004728132387, + "grad_norm": 3.035911798477173, + "learning_rate": 4.928167650806588e-06, + "loss": 0.6981, + "step": 1063 + }, + { + "epoch": 0.5030732860520094, + "grad_norm": 2.8820855617523193, + "learning_rate": 4.9280191092444375e-06, + "loss": 0.6408, + "step": 1064 + }, + { + "epoch": 0.5035460992907801, + "grad_norm": 3.080432415008545, + "learning_rate": 4.927870416500183e-06, + "loss": 0.6398, + "step": 1065 + }, + { + "epoch": 0.5040189125295508, + "grad_norm": 2.761612892150879, + "learning_rate": 4.927721572583084e-06, + "loss": 0.6126, + "step": 1066 + }, + { + "epoch": 0.5044917257683215, + "grad_norm": 2.8561882972717285, + "learning_rate": 4.927572577502408e-06, + "loss": 0.584, + "step": 1067 + }, + { + "epoch": 0.5049645390070922, + "grad_norm": 3.3386311531066895, + "learning_rate": 4.927423431267432e-06, + 
"loss": 0.6666, + "step": 1068 + }, + { + "epoch": 0.5054373522458628, + "grad_norm": 2.632906675338745, + "learning_rate": 4.927274133887443e-06, + "loss": 0.632, + "step": 1069 + }, + { + "epoch": 0.5059101654846335, + "grad_norm": 2.8737308979034424, + "learning_rate": 4.927124685371737e-06, + "loss": 0.6051, + "step": 1070 + }, + { + "epoch": 0.5063829787234042, + "grad_norm": 3.042222738265991, + "learning_rate": 4.926975085729619e-06, + "loss": 0.6954, + "step": 1071 + }, + { + "epoch": 0.5068557919621749, + "grad_norm": 3.3341481685638428, + "learning_rate": 4.926825334970404e-06, + "loss": 0.7148, + "step": 1072 + }, + { + "epoch": 0.5073286052009456, + "grad_norm": 2.7415387630462646, + "learning_rate": 4.926675433103418e-06, + "loss": 0.5456, + "step": 1073 + }, + { + "epoch": 0.5078014184397163, + "grad_norm": 2.7545325756073, + "learning_rate": 4.926525380137993e-06, + "loss": 0.6213, + "step": 1074 + }, + { + "epoch": 0.508274231678487, + "grad_norm": 2.9153690338134766, + "learning_rate": 4.926375176083472e-06, + "loss": 0.6466, + "step": 1075 + }, + { + "epoch": 0.5087470449172576, + "grad_norm": 4.210638523101807, + "learning_rate": 4.926224820949209e-06, + "loss": 0.6192, + "step": 1076 + }, + { + "epoch": 0.5092198581560283, + "grad_norm": 2.4357898235321045, + "learning_rate": 4.926074314744565e-06, + "loss": 0.594, + "step": 1077 + }, + { + "epoch": 0.509692671394799, + "grad_norm": 2.8004701137542725, + "learning_rate": 4.92592365747891e-06, + "loss": 0.6276, + "step": 1078 + }, + { + "epoch": 0.5101654846335697, + "grad_norm": 2.920675039291382, + "learning_rate": 4.925772849161628e-06, + "loss": 0.6043, + "step": 1079 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.791555404663086, + "learning_rate": 4.9256218898021055e-06, + "loss": 0.6837, + "step": 1080 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 3.1702463626861572, + "learning_rate": 4.925470779409746e-06, + "loss": 0.668, + "step": 1081 + }, + { + "epoch": 0.5115839243498818, + "grad_norm": 2.7149479389190674, + "learning_rate": 4.925319517993955e-06, + "loss": 0.5842, + "step": 1082 + }, + { + "epoch": 0.5120567375886524, + "grad_norm": 2.916311025619507, + "learning_rate": 4.925168105564153e-06, + "loss": 0.6893, + "step": 1083 + }, + { + "epoch": 0.5125295508274231, + "grad_norm": 2.917654514312744, + "learning_rate": 4.925016542129767e-06, + "loss": 0.6513, + "step": 1084 + }, + { + "epoch": 0.5130023640661938, + "grad_norm": 2.5568928718566895, + "learning_rate": 4.924864827700234e-06, + "loss": 0.6177, + "step": 1085 + }, + { + "epoch": 0.5134751773049645, + "grad_norm": 2.816720485687256, + "learning_rate": 4.924712962285001e-06, + "loss": 0.5833, + "step": 1086 + }, + { + "epoch": 0.5139479905437352, + "grad_norm": 2.6989188194274902, + "learning_rate": 4.9245609458935235e-06, + "loss": 0.6332, + "step": 1087 + }, + { + "epoch": 0.5144208037825059, + "grad_norm": 2.959599494934082, + "learning_rate": 4.924408778535268e-06, + "loss": 0.626, + "step": 1088 + }, + { + "epoch": 0.5148936170212766, + "grad_norm": 2.872814416885376, + "learning_rate": 4.924256460219708e-06, + "loss": 0.6407, + "step": 1089 + }, + { + "epoch": 0.5153664302600472, + "grad_norm": 2.6989097595214844, + "learning_rate": 4.924103990956329e-06, + "loss": 0.6391, + "step": 1090 + }, + { + "epoch": 0.5158392434988179, + "grad_norm": 2.986492156982422, + "learning_rate": 4.9239513707546235e-06, + "loss": 0.6911, + "step": 1091 + }, + { + "epoch": 0.5163120567375886, + "grad_norm": 3.069920301437378, + 
"learning_rate": 4.9237985996240954e-06, + "loss": 0.671, + "step": 1092 + }, + { + "epoch": 0.5167848699763593, + "grad_norm": 2.8214917182922363, + "learning_rate": 4.9236456775742555e-06, + "loss": 0.5885, + "step": 1093 + }, + { + "epoch": 0.51725768321513, + "grad_norm": 2.9416961669921875, + "learning_rate": 4.923492604614627e-06, + "loss": 0.6293, + "step": 1094 + }, + { + "epoch": 0.5177304964539007, + "grad_norm": 2.761780023574829, + "learning_rate": 4.923339380754741e-06, + "loss": 0.649, + "step": 1095 + }, + { + "epoch": 0.5182033096926714, + "grad_norm": 2.7648792266845703, + "learning_rate": 4.923186006004138e-06, + "loss": 0.5906, + "step": 1096 + }, + { + "epoch": 0.518676122931442, + "grad_norm": 3.5535428524017334, + "learning_rate": 4.923032480372367e-06, + "loss": 0.7138, + "step": 1097 + }, + { + "epoch": 0.5191489361702127, + "grad_norm": 2.6252479553222656, + "learning_rate": 4.922878803868988e-06, + "loss": 0.5499, + "step": 1098 + }, + { + "epoch": 0.5196217494089834, + "grad_norm": 2.901002883911133, + "learning_rate": 4.9227249765035715e-06, + "loss": 0.6991, + "step": 1099 + }, + { + "epoch": 0.5200945626477541, + "grad_norm": 2.621877431869507, + "learning_rate": 4.9225709982856925e-06, + "loss": 0.6269, + "step": 1100 + }, + { + "epoch": 0.5205673758865248, + "grad_norm": 2.872483015060425, + "learning_rate": 4.92241686922494e-06, + "loss": 0.6657, + "step": 1101 + }, + { + "epoch": 0.5210401891252955, + "grad_norm": 2.730447769165039, + "learning_rate": 4.922262589330912e-06, + "loss": 0.6061, + "step": 1102 + }, + { + "epoch": 0.5215130023640662, + "grad_norm": 2.646247386932373, + "learning_rate": 4.922108158613213e-06, + "loss": 0.5923, + "step": 1103 + }, + { + "epoch": 0.5219858156028369, + "grad_norm": 2.6488895416259766, + "learning_rate": 4.92195357708146e-06, + "loss": 0.6293, + "step": 1104 + }, + { + "epoch": 0.5224586288416075, + "grad_norm": 2.756338357925415, + "learning_rate": 4.921798844745278e-06, + "loss": 0.6374, + "step": 1105 + }, + { + "epoch": 0.5229314420803782, + "grad_norm": 3.1441280841827393, + "learning_rate": 4.921643961614301e-06, + "loss": 0.6652, + "step": 1106 + }, + { + "epoch": 0.5234042553191489, + "grad_norm": 3.050002098083496, + "learning_rate": 4.921488927698172e-06, + "loss": 0.6809, + "step": 1107 + }, + { + "epoch": 0.5238770685579196, + "grad_norm": 2.71750807762146, + "learning_rate": 4.921333743006547e-06, + "loss": 0.6266, + "step": 1108 + }, + { + "epoch": 0.5243498817966903, + "grad_norm": 2.8439245223999023, + "learning_rate": 4.921178407549086e-06, + "loss": 0.5663, + "step": 1109 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 3.0722241401672363, + "learning_rate": 4.921022921335464e-06, + "loss": 0.6791, + "step": 1110 + }, + { + "epoch": 0.5252955082742317, + "grad_norm": 3.4381656646728516, + "learning_rate": 4.920867284375358e-06, + "loss": 0.6687, + "step": 1111 + }, + { + "epoch": 0.5257683215130023, + "grad_norm": 2.819812774658203, + "learning_rate": 4.920711496678463e-06, + "loss": 0.6299, + "step": 1112 + }, + { + "epoch": 0.526241134751773, + "grad_norm": 3.6587414741516113, + "learning_rate": 4.9205555582544765e-06, + "loss": 0.7392, + "step": 1113 + }, + { + "epoch": 0.5267139479905437, + "grad_norm": 2.774296522140503, + "learning_rate": 4.920399469113109e-06, + "loss": 0.6652, + "step": 1114 + }, + { + "epoch": 0.5271867612293144, + "grad_norm": 2.7480580806732178, + "learning_rate": 4.920243229264081e-06, + "loss": 0.596, + "step": 1115 + }, + { + "epoch": 0.5276595744680851, + 
"grad_norm": 3.213057518005371, + "learning_rate": 4.920086838717119e-06, + "loss": 0.6986, + "step": 1116 + }, + { + "epoch": 0.5281323877068558, + "grad_norm": 2.940546989440918, + "learning_rate": 4.919930297481962e-06, + "loss": 0.6481, + "step": 1117 + }, + { + "epoch": 0.5286052009456265, + "grad_norm": 2.5970494747161865, + "learning_rate": 4.9197736055683555e-06, + "loss": 0.5658, + "step": 1118 + }, + { + "epoch": 0.5290780141843971, + "grad_norm": 4.49385404586792, + "learning_rate": 4.919616762986057e-06, + "loss": 0.605, + "step": 1119 + }, + { + "epoch": 0.5295508274231678, + "grad_norm": 2.971857786178589, + "learning_rate": 4.919459769744833e-06, + "loss": 0.6539, + "step": 1120 + }, + { + "epoch": 0.5300236406619385, + "grad_norm": 2.6192965507507324, + "learning_rate": 4.919302625854457e-06, + "loss": 0.6226, + "step": 1121 + }, + { + "epoch": 0.5304964539007092, + "grad_norm": 2.665088176727295, + "learning_rate": 4.919145331324716e-06, + "loss": 0.6647, + "step": 1122 + }, + { + "epoch": 0.5309692671394799, + "grad_norm": 2.612126111984253, + "learning_rate": 4.918987886165403e-06, + "loss": 0.6965, + "step": 1123 + }, + { + "epoch": 0.5314420803782506, + "grad_norm": 3.80017352104187, + "learning_rate": 4.9188302903863205e-06, + "loss": 0.7396, + "step": 1124 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 2.781752824783325, + "learning_rate": 4.918672543997282e-06, + "loss": 0.5985, + "step": 1125 + }, + { + "epoch": 0.532387706855792, + "grad_norm": 2.6067914962768555, + "learning_rate": 4.91851464700811e-06, + "loss": 0.6159, + "step": 1126 + }, + { + "epoch": 0.5328605200945626, + "grad_norm": 2.670807123184204, + "learning_rate": 4.918356599428636e-06, + "loss": 0.5958, + "step": 1127 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 2.608611822128296, + "learning_rate": 4.9181984012687e-06, + "loss": 0.5768, + "step": 1128 + }, + { + "epoch": 0.533806146572104, + "grad_norm": 2.586764097213745, + "learning_rate": 4.918040052538154e-06, + "loss": 0.661, + "step": 1129 + }, + { + "epoch": 0.5342789598108747, + "grad_norm": 3.1317451000213623, + "learning_rate": 4.917881553246856e-06, + "loss": 0.6626, + "step": 1130 + }, + { + "epoch": 0.5347517730496454, + "grad_norm": 2.7135281562805176, + "learning_rate": 4.917722903404676e-06, + "loss": 0.6572, + "step": 1131 + }, + { + "epoch": 0.5352245862884161, + "grad_norm": 3.4546358585357666, + "learning_rate": 4.917564103021493e-06, + "loss": 0.5597, + "step": 1132 + }, + { + "epoch": 0.5356973995271868, + "grad_norm": 3.0943493843078613, + "learning_rate": 4.917405152107193e-06, + "loss": 0.7258, + "step": 1133 + }, + { + "epoch": 0.5361702127659574, + "grad_norm": 2.6069352626800537, + "learning_rate": 4.917246050671674e-06, + "loss": 0.6209, + "step": 1134 + }, + { + "epoch": 0.5366430260047281, + "grad_norm": 2.584883689880371, + "learning_rate": 4.917086798724844e-06, + "loss": 0.658, + "step": 1135 + }, + { + "epoch": 0.5371158392434988, + "grad_norm": 3.001976490020752, + "learning_rate": 4.9169273962766166e-06, + "loss": 0.6306, + "step": 1136 + }, + { + "epoch": 0.5375886524822695, + "grad_norm": 2.5013928413391113, + "learning_rate": 4.916767843336918e-06, + "loss": 0.572, + "step": 1137 + }, + { + "epoch": 0.5380614657210402, + "grad_norm": 2.9114553928375244, + "learning_rate": 4.916608139915684e-06, + "loss": 0.5841, + "step": 1138 + }, + { + "epoch": 0.5385342789598109, + "grad_norm": 2.8878467082977295, + "learning_rate": 4.9164482860228564e-06, + "loss": 0.6654, + "step": 1139 + }, + { + 
"epoch": 0.5390070921985816, + "grad_norm": 2.9827866554260254, + "learning_rate": 4.91628828166839e-06, + "loss": 0.6674, + "step": 1140 + }, + { + "epoch": 0.5394799054373522, + "grad_norm": 3.8696281909942627, + "learning_rate": 4.916128126862248e-06, + "loss": 0.6241, + "step": 1141 + }, + { + "epoch": 0.5399527186761229, + "grad_norm": 2.9556291103363037, + "learning_rate": 4.915967821614402e-06, + "loss": 0.6478, + "step": 1142 + }, + { + "epoch": 0.5404255319148936, + "grad_norm": 2.392942428588867, + "learning_rate": 4.915807365934834e-06, + "loss": 0.6097, + "step": 1143 + }, + { + "epoch": 0.5408983451536643, + "grad_norm": 3.032235860824585, + "learning_rate": 4.915646759833534e-06, + "loss": 0.7193, + "step": 1144 + }, + { + "epoch": 0.541371158392435, + "grad_norm": 2.840416193008423, + "learning_rate": 4.915486003320501e-06, + "loss": 0.5506, + "step": 1145 + }, + { + "epoch": 0.5418439716312057, + "grad_norm": 2.5438895225524902, + "learning_rate": 4.915325096405747e-06, + "loss": 0.6487, + "step": 1146 + }, + { + "epoch": 0.5423167848699764, + "grad_norm": 2.544334650039673, + "learning_rate": 4.9151640390992905e-06, + "loss": 0.6168, + "step": 1147 + }, + { + "epoch": 0.542789598108747, + "grad_norm": 2.8535678386688232, + "learning_rate": 4.91500283141116e-06, + "loss": 0.678, + "step": 1148 + }, + { + "epoch": 0.5432624113475177, + "grad_norm": 2.8086955547332764, + "learning_rate": 4.9148414733513915e-06, + "loss": 0.6473, + "step": 1149 + }, + { + "epoch": 0.5437352245862884, + "grad_norm": 2.4709885120391846, + "learning_rate": 4.914679964930034e-06, + "loss": 0.6797, + "step": 1150 + }, + { + "epoch": 0.5442080378250591, + "grad_norm": 2.8546934127807617, + "learning_rate": 4.9145183061571435e-06, + "loss": 0.6247, + "step": 1151 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 2.991184711456299, + "learning_rate": 4.9143564970427844e-06, + "loss": 0.5977, + "step": 1152 + }, + { + "epoch": 0.5451536643026005, + "grad_norm": 3.011216402053833, + "learning_rate": 4.914194537597033e-06, + "loss": 0.7005, + "step": 1153 + }, + { + "epoch": 0.5456264775413712, + "grad_norm": 2.807521343231201, + "learning_rate": 4.9140324278299744e-06, + "loss": 0.5412, + "step": 1154 + }, + { + "epoch": 0.5460992907801419, + "grad_norm": 3.0401229858398438, + "learning_rate": 4.913870167751701e-06, + "loss": 0.6394, + "step": 1155 + }, + { + "epoch": 0.5465721040189125, + "grad_norm": 2.853914976119995, + "learning_rate": 4.913707757372317e-06, + "loss": 0.6745, + "step": 1156 + }, + { + "epoch": 0.5470449172576832, + "grad_norm": 4.505620956420898, + "learning_rate": 4.913545196701935e-06, + "loss": 0.6668, + "step": 1157 + }, + { + "epoch": 0.5475177304964539, + "grad_norm": 3.0505781173706055, + "learning_rate": 4.913382485750676e-06, + "loss": 0.6926, + "step": 1158 + }, + { + "epoch": 0.5479905437352246, + "grad_norm": 2.798435688018799, + "learning_rate": 4.913219624528672e-06, + "loss": 0.605, + "step": 1159 + }, + { + "epoch": 0.5484633569739953, + "grad_norm": 2.7814908027648926, + "learning_rate": 4.913056613046065e-06, + "loss": 0.6678, + "step": 1160 + }, + { + "epoch": 0.548936170212766, + "grad_norm": 3.2089321613311768, + "learning_rate": 4.9128934513130025e-06, + "loss": 0.5995, + "step": 1161 + }, + { + "epoch": 0.5494089834515367, + "grad_norm": 2.7699952125549316, + "learning_rate": 4.9127301393396455e-06, + "loss": 0.7062, + "step": 1162 + }, + { + "epoch": 0.5498817966903073, + "grad_norm": 2.859368324279785, + "learning_rate": 4.912566677136162e-06, + "loss": 
0.6063, + "step": 1163 + }, + { + "epoch": 0.550354609929078, + "grad_norm": 2.727334499359131, + "learning_rate": 4.91240306471273e-06, + "loss": 0.6848, + "step": 1164 + }, + { + "epoch": 0.5508274231678487, + "grad_norm": 2.6017510890960693, + "learning_rate": 4.912239302079537e-06, + "loss": 0.5808, + "step": 1165 + }, + { + "epoch": 0.5513002364066194, + "grad_norm": 3.539583206176758, + "learning_rate": 4.912075389246781e-06, + "loss": 0.7053, + "step": 1166 + }, + { + "epoch": 0.5517730496453901, + "grad_norm": 2.918280601501465, + "learning_rate": 4.911911326224666e-06, + "loss": 0.5904, + "step": 1167 + }, + { + "epoch": 0.5522458628841608, + "grad_norm": 3.0067362785339355, + "learning_rate": 4.9117471130234095e-06, + "loss": 0.6392, + "step": 1168 + }, + { + "epoch": 0.5527186761229315, + "grad_norm": 2.4374797344207764, + "learning_rate": 4.911582749653236e-06, + "loss": 0.5793, + "step": 1169 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 3.121182918548584, + "learning_rate": 4.911418236124378e-06, + "loss": 0.6636, + "step": 1170 + }, + { + "epoch": 0.5536643026004728, + "grad_norm": 3.1289851665496826, + "learning_rate": 4.91125357244708e-06, + "loss": 0.656, + "step": 1171 + }, + { + "epoch": 0.5541371158392435, + "grad_norm": 2.7034592628479004, + "learning_rate": 4.911088758631596e-06, + "loss": 0.6001, + "step": 1172 + }, + { + "epoch": 0.5546099290780142, + "grad_norm": 2.710146188735962, + "learning_rate": 4.910923794688187e-06, + "loss": 0.6007, + "step": 1173 + }, + { + "epoch": 0.5550827423167849, + "grad_norm": 2.5424487590789795, + "learning_rate": 4.910758680627124e-06, + "loss": 0.5193, + "step": 1174 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.615893602371216, + "learning_rate": 4.91059341645869e-06, + "loss": 0.5525, + "step": 1175 + }, + { + "epoch": 0.5560283687943263, + "grad_norm": 3.3179728984832764, + "learning_rate": 4.910428002193174e-06, + "loss": 0.7285, + "step": 1176 + }, + { + "epoch": 0.556501182033097, + "grad_norm": 2.7234175205230713, + "learning_rate": 4.910262437840875e-06, + "loss": 0.574, + "step": 1177 + }, + { + "epoch": 0.5569739952718676, + "grad_norm": 3.0416605472564697, + "learning_rate": 4.9100967234121034e-06, + "loss": 0.5623, + "step": 1178 + }, + { + "epoch": 0.5574468085106383, + "grad_norm": 3.067786455154419, + "learning_rate": 4.909930858917177e-06, + "loss": 0.6491, + "step": 1179 + }, + { + "epoch": 0.557919621749409, + "grad_norm": 3.0037379264831543, + "learning_rate": 4.909764844366422e-06, + "loss": 0.5696, + "step": 1180 + }, + { + "epoch": 0.5583924349881797, + "grad_norm": 2.966179609298706, + "learning_rate": 4.909598679770178e-06, + "loss": 0.6042, + "step": 1181 + }, + { + "epoch": 0.5588652482269504, + "grad_norm": 2.6000657081604004, + "learning_rate": 4.909432365138789e-06, + "loss": 0.5883, + "step": 1182 + }, + { + "epoch": 0.5593380614657211, + "grad_norm": 2.6794495582580566, + "learning_rate": 4.909265900482612e-06, + "loss": 0.6809, + "step": 1183 + }, + { + "epoch": 0.5598108747044918, + "grad_norm": 2.6765122413635254, + "learning_rate": 4.9090992858120115e-06, + "loss": 0.6601, + "step": 1184 + }, + { + "epoch": 0.5602836879432624, + "grad_norm": 2.6051928997039795, + "learning_rate": 4.908932521137363e-06, + "loss": 0.5946, + "step": 1185 + }, + { + "epoch": 0.5607565011820331, + "grad_norm": 3.0405542850494385, + "learning_rate": 4.908765606469048e-06, + "loss": 0.6998, + "step": 1186 + }, + { + "epoch": 0.5612293144208038, + "grad_norm": 2.7975668907165527, + "learning_rate": 
4.908598541817462e-06, + "loss": 0.6218, + "step": 1187 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 2.5367627143859863, + "learning_rate": 4.908431327193005e-06, + "loss": 0.6354, + "step": 1188 + }, + { + "epoch": 0.5621749408983452, + "grad_norm": 3.7939631938934326, + "learning_rate": 4.908263962606091e-06, + "loss": 0.6376, + "step": 1189 + }, + { + "epoch": 0.5626477541371159, + "grad_norm": 2.864079475402832, + "learning_rate": 4.908096448067139e-06, + "loss": 0.5485, + "step": 1190 + }, + { + "epoch": 0.5631205673758866, + "grad_norm": 2.7855563163757324, + "learning_rate": 4.9079287835865804e-06, + "loss": 0.6645, + "step": 1191 + }, + { + "epoch": 0.5635933806146572, + "grad_norm": 2.6156625747680664, + "learning_rate": 4.9077609691748556e-06, + "loss": 0.5751, + "step": 1192 + }, + { + "epoch": 0.5640661938534279, + "grad_norm": 3.0475659370422363, + "learning_rate": 4.907593004842412e-06, + "loss": 0.6739, + "step": 1193 + }, + { + "epoch": 0.5645390070921986, + "grad_norm": 2.9176738262176514, + "learning_rate": 4.9074248905997104e-06, + "loss": 0.6493, + "step": 1194 + }, + { + "epoch": 0.5650118203309693, + "grad_norm": 2.6168384552001953, + "learning_rate": 4.907256626457216e-06, + "loss": 0.6154, + "step": 1195 + }, + { + "epoch": 0.56548463356974, + "grad_norm": 2.893980026245117, + "learning_rate": 4.907088212425408e-06, + "loss": 0.5808, + "step": 1196 + }, + { + "epoch": 0.5659574468085107, + "grad_norm": 3.3832836151123047, + "learning_rate": 4.90691964851477e-06, + "loss": 0.7888, + "step": 1197 + }, + { + "epoch": 0.5664302600472814, + "grad_norm": 3.088932752609253, + "learning_rate": 4.906750934735801e-06, + "loss": 0.6516, + "step": 1198 + }, + { + "epoch": 0.566903073286052, + "grad_norm": 2.494471549987793, + "learning_rate": 4.906582071099004e-06, + "loss": 0.6286, + "step": 1199 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 2.716550588607788, + "learning_rate": 4.906413057614895e-06, + "loss": 0.5939, + "step": 1200 + }, + { + "epoch": 0.5678486997635934, + "grad_norm": 2.5821073055267334, + "learning_rate": 4.906243894293995e-06, + "loss": 0.6668, + "step": 1201 + }, + { + "epoch": 0.5683215130023641, + "grad_norm": 3.651787042617798, + "learning_rate": 4.90607458114684e-06, + "loss": 0.6124, + "step": 1202 + }, + { + "epoch": 0.5687943262411348, + "grad_norm": 2.7567858695983887, + "learning_rate": 4.9059051181839705e-06, + "loss": 0.6656, + "step": 1203 + }, + { + "epoch": 0.5692671394799055, + "grad_norm": 2.8067586421966553, + "learning_rate": 4.90573550541594e-06, + "loss": 0.6306, + "step": 1204 + }, + { + "epoch": 0.5697399527186762, + "grad_norm": 2.6136393547058105, + "learning_rate": 4.905565742853307e-06, + "loss": 0.5992, + "step": 1205 + }, + { + "epoch": 0.5702127659574469, + "grad_norm": 2.899049758911133, + "learning_rate": 4.905395830506644e-06, + "loss": 0.621, + "step": 1206 + }, + { + "epoch": 0.5706855791962175, + "grad_norm": 3.036583185195923, + "learning_rate": 4.9052257683865294e-06, + "loss": 0.652, + "step": 1207 + }, + { + "epoch": 0.5711583924349882, + "grad_norm": 2.7947216033935547, + "learning_rate": 4.905055556503553e-06, + "loss": 0.6636, + "step": 1208 + }, + { + "epoch": 0.5716312056737589, + "grad_norm": 3.1646955013275146, + "learning_rate": 4.9048851948683135e-06, + "loss": 0.6376, + "step": 1209 + }, + { + "epoch": 0.5721040189125296, + "grad_norm": 2.8175766468048096, + "learning_rate": 4.904714683491417e-06, + "loss": 0.5929, + "step": 1210 + }, + { + "epoch": 0.5725768321513003, + "grad_norm": 
2.923923969268799, + "learning_rate": 4.904544022383483e-06, + "loss": 0.6633, + "step": 1211 + }, + { + "epoch": 0.573049645390071, + "grad_norm": 2.7471134662628174, + "learning_rate": 4.9043732115551356e-06, + "loss": 0.6551, + "step": 1212 + }, + { + "epoch": 0.5735224586288417, + "grad_norm": 2.8660807609558105, + "learning_rate": 4.90420225101701e-06, + "loss": 0.6423, + "step": 1213 + }, + { + "epoch": 0.5739952718676123, + "grad_norm": 2.769247531890869, + "learning_rate": 4.904031140779754e-06, + "loss": 0.5982, + "step": 1214 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.9043145179748535, + "learning_rate": 4.90385988085402e-06, + "loss": 0.5843, + "step": 1215 + }, + { + "epoch": 0.5749408983451537, + "grad_norm": 2.6639609336853027, + "learning_rate": 4.903688471250471e-06, + "loss": 0.5858, + "step": 1216 + }, + { + "epoch": 0.5754137115839244, + "grad_norm": 2.6967573165893555, + "learning_rate": 4.903516911979781e-06, + "loss": 0.5755, + "step": 1217 + }, + { + "epoch": 0.5758865248226951, + "grad_norm": 2.8865857124328613, + "learning_rate": 4.903345203052633e-06, + "loss": 0.6051, + "step": 1218 + }, + { + "epoch": 0.5763593380614658, + "grad_norm": 2.381979465484619, + "learning_rate": 4.903173344479717e-06, + "loss": 0.5727, + "step": 1219 + }, + { + "epoch": 0.5768321513002365, + "grad_norm": 2.7717981338500977, + "learning_rate": 4.903001336271734e-06, + "loss": 0.6406, + "step": 1220 + }, + { + "epoch": 0.577304964539007, + "grad_norm": 2.6431570053100586, + "learning_rate": 4.902829178439395e-06, + "loss": 0.6226, + "step": 1221 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 2.8090415000915527, + "learning_rate": 4.902656870993419e-06, + "loss": 0.5761, + "step": 1222 + }, + { + "epoch": 0.5782505910165484, + "grad_norm": 2.4769368171691895, + "learning_rate": 4.902484413944535e-06, + "loss": 0.5602, + "step": 1223 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 2.693316698074341, + "learning_rate": 4.902311807303481e-06, + "loss": 0.5222, + "step": 1224 + }, + { + "epoch": 0.5791962174940898, + "grad_norm": 2.7623913288116455, + "learning_rate": 4.902139051081004e-06, + "loss": 0.6978, + "step": 1225 + }, + { + "epoch": 0.5796690307328605, + "grad_norm": 2.6133766174316406, + "learning_rate": 4.901966145287863e-06, + "loss": 0.5802, + "step": 1226 + }, + { + "epoch": 0.5801418439716312, + "grad_norm": 2.7345972061157227, + "learning_rate": 4.901793089934821e-06, + "loss": 0.6294, + "step": 1227 + }, + { + "epoch": 0.5806146572104018, + "grad_norm": 2.7545835971832275, + "learning_rate": 4.9016198850326555e-06, + "loss": 0.6085, + "step": 1228 + }, + { + "epoch": 0.5810874704491725, + "grad_norm": 2.6947758197784424, + "learning_rate": 4.90144653059215e-06, + "loss": 0.6025, + "step": 1229 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 2.692967414855957, + "learning_rate": 4.901273026624099e-06, + "loss": 0.5715, + "step": 1230 + }, + { + "epoch": 0.5820330969267139, + "grad_norm": 2.78347110748291, + "learning_rate": 4.901099373139307e-06, + "loss": 0.6063, + "step": 1231 + }, + { + "epoch": 0.5825059101654846, + "grad_norm": 2.346496343612671, + "learning_rate": 4.900925570148585e-06, + "loss": 0.5869, + "step": 1232 + }, + { + "epoch": 0.5829787234042553, + "grad_norm": 2.606639862060547, + "learning_rate": 4.900751617662755e-06, + "loss": 0.6197, + "step": 1233 + }, + { + "epoch": 0.583451536643026, + "grad_norm": 2.5825929641723633, + "learning_rate": 4.900577515692649e-06, + "loss": 0.6721, + "step": 1234 + }, + { + "epoch": 
0.5839243498817966, + "grad_norm": 2.731349468231201, + "learning_rate": 4.900403264249107e-06, + "loss": 0.6273, + "step": 1235 + }, + { + "epoch": 0.5843971631205673, + "grad_norm": 3.2133874893188477, + "learning_rate": 4.90022886334298e-06, + "loss": 0.6231, + "step": 1236 + }, + { + "epoch": 0.584869976359338, + "grad_norm": 2.9213852882385254, + "learning_rate": 4.900054312985127e-06, + "loss": 0.6677, + "step": 1237 + }, + { + "epoch": 0.5853427895981087, + "grad_norm": 2.815425157546997, + "learning_rate": 4.899879613186414e-06, + "loss": 0.6405, + "step": 1238 + }, + { + "epoch": 0.5858156028368794, + "grad_norm": 2.730782985687256, + "learning_rate": 4.899704763957721e-06, + "loss": 0.6233, + "step": 1239 + }, + { + "epoch": 0.5862884160756501, + "grad_norm": 2.6432766914367676, + "learning_rate": 4.899529765309936e-06, + "loss": 0.6267, + "step": 1240 + }, + { + "epoch": 0.5867612293144208, + "grad_norm": 2.616215229034424, + "learning_rate": 4.899354617253953e-06, + "loss": 0.6268, + "step": 1241 + }, + { + "epoch": 0.5872340425531914, + "grad_norm": 2.7630255222320557, + "learning_rate": 4.899179319800679e-06, + "loss": 0.6348, + "step": 1242 + }, + { + "epoch": 0.5877068557919621, + "grad_norm": 2.785095453262329, + "learning_rate": 4.899003872961029e-06, + "loss": 0.5839, + "step": 1243 + }, + { + "epoch": 0.5881796690307328, + "grad_norm": 2.9050328731536865, + "learning_rate": 4.898828276745927e-06, + "loss": 0.651, + "step": 1244 + }, + { + "epoch": 0.5886524822695035, + "grad_norm": 2.958092212677002, + "learning_rate": 4.8986525311663065e-06, + "loss": 0.6395, + "step": 1245 + }, + { + "epoch": 0.5891252955082742, + "grad_norm": 2.952310800552368, + "learning_rate": 4.898476636233111e-06, + "loss": 0.6731, + "step": 1246 + }, + { + "epoch": 0.5895981087470449, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.898300591957293e-06, + "loss": 0.7015, + "step": 1247 + }, + { + "epoch": 0.5900709219858156, + "grad_norm": 2.8941752910614014, + "learning_rate": 4.898124398349813e-06, + "loss": 0.6452, + "step": 1248 + }, + { + "epoch": 0.5905437352245863, + "grad_norm": 2.9809536933898926, + "learning_rate": 4.897948055421642e-06, + "loss": 0.5736, + "step": 1249 + }, + { + "epoch": 0.5910165484633569, + "grad_norm": 2.927046775817871, + "learning_rate": 4.897771563183761e-06, + "loss": 0.5918, + "step": 1250 + }, + { + "epoch": 0.5914893617021276, + "grad_norm": 2.865020275115967, + "learning_rate": 4.897594921647158e-06, + "loss": 0.6924, + "step": 1251 + }, + { + "epoch": 0.5919621749408983, + "grad_norm": 2.7406699657440186, + "learning_rate": 4.897418130822832e-06, + "loss": 0.509, + "step": 1252 + }, + { + "epoch": 0.592434988179669, + "grad_norm": 2.781606912612915, + "learning_rate": 4.897241190721791e-06, + "loss": 0.5555, + "step": 1253 + }, + { + "epoch": 0.5929078014184397, + "grad_norm": 2.79209303855896, + "learning_rate": 4.8970641013550535e-06, + "loss": 0.6722, + "step": 1254 + }, + { + "epoch": 0.5933806146572104, + "grad_norm": 3.0672268867492676, + "learning_rate": 4.896886862733645e-06, + "loss": 0.6366, + "step": 1255 + }, + { + "epoch": 0.5938534278959811, + "grad_norm": 2.7456953525543213, + "learning_rate": 4.896709474868602e-06, + "loss": 0.6246, + "step": 1256 + }, + { + "epoch": 0.5943262411347517, + "grad_norm": 3.6731202602386475, + "learning_rate": 4.896531937770968e-06, + "loss": 0.668, + "step": 1257 + }, + { + "epoch": 0.5947990543735224, + "grad_norm": 2.6056087017059326, + "learning_rate": 4.8963542514518e-06, + "loss": 0.5815, + 
"step": 1258 + }, + { + "epoch": 0.5952718676122931, + "grad_norm": 2.719698905944824, + "learning_rate": 4.89617641592216e-06, + "loss": 0.6058, + "step": 1259 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 2.625838279724121, + "learning_rate": 4.895998431193121e-06, + "loss": 0.6143, + "step": 1260 + }, + { + "epoch": 0.5962174940898345, + "grad_norm": 2.7166085243225098, + "learning_rate": 4.895820297275767e-06, + "loss": 0.5187, + "step": 1261 + }, + { + "epoch": 0.5966903073286052, + "grad_norm": 2.7544102668762207, + "learning_rate": 4.8956420141811875e-06, + "loss": 0.5928, + "step": 1262 + }, + { + "epoch": 0.5971631205673759, + "grad_norm": 2.6678333282470703, + "learning_rate": 4.895463581920484e-06, + "loss": 0.611, + "step": 1263 + }, + { + "epoch": 0.5976359338061465, + "grad_norm": 2.853384494781494, + "learning_rate": 4.895285000504768e-06, + "loss": 0.642, + "step": 1264 + }, + { + "epoch": 0.5981087470449172, + "grad_norm": 2.637852430343628, + "learning_rate": 4.895106269945158e-06, + "loss": 0.6308, + "step": 1265 + }, + { + "epoch": 0.5985815602836879, + "grad_norm": 2.9880387783050537, + "learning_rate": 4.8949273902527826e-06, + "loss": 0.5781, + "step": 1266 + }, + { + "epoch": 0.5990543735224586, + "grad_norm": 3.5984015464782715, + "learning_rate": 4.89474836143878e-06, + "loss": 0.5865, + "step": 1267 + }, + { + "epoch": 0.5995271867612293, + "grad_norm": 2.719855546951294, + "learning_rate": 4.8945691835142975e-06, + "loss": 0.6393, + "step": 1268 + }, + { + "epoch": 0.6, + "grad_norm": 2.7885141372680664, + "learning_rate": 4.894389856490492e-06, + "loss": 0.66, + "step": 1269 + }, + { + "epoch": 0.6004728132387707, + "grad_norm": 2.698819875717163, + "learning_rate": 4.894210380378529e-06, + "loss": 0.6144, + "step": 1270 + }, + { + "epoch": 0.6009456264775414, + "grad_norm": 2.278045654296875, + "learning_rate": 4.894030755189584e-06, + "loss": 0.5609, + "step": 1271 + }, + { + "epoch": 0.601418439716312, + "grad_norm": 2.8729357719421387, + "learning_rate": 4.893850980934841e-06, + "loss": 0.6715, + "step": 1272 + }, + { + "epoch": 0.6018912529550827, + "grad_norm": 2.8541221618652344, + "learning_rate": 4.893671057625495e-06, + "loss": 0.6787, + "step": 1273 + }, + { + "epoch": 0.6023640661938534, + "grad_norm": 2.4561476707458496, + "learning_rate": 4.893490985272748e-06, + "loss": 0.6331, + "step": 1274 + }, + { + "epoch": 0.6028368794326241, + "grad_norm": 2.565739154815674, + "learning_rate": 4.893310763887812e-06, + "loss": 0.587, + "step": 1275 + }, + { + "epoch": 0.6033096926713948, + "grad_norm": 2.384951591491699, + "learning_rate": 4.8931303934819095e-06, + "loss": 0.5358, + "step": 1276 + }, + { + "epoch": 0.6037825059101655, + "grad_norm": 2.380808115005493, + "learning_rate": 4.89294987406627e-06, + "loss": 0.5402, + "step": 1277 + }, + { + "epoch": 0.6042553191489362, + "grad_norm": 2.764815092086792, + "learning_rate": 4.892769205652136e-06, + "loss": 0.6103, + "step": 1278 + }, + { + "epoch": 0.6047281323877068, + "grad_norm": 2.463350296020508, + "learning_rate": 4.892588388250754e-06, + "loss": 0.5937, + "step": 1279 + }, + { + "epoch": 0.6052009456264775, + "grad_norm": 3.099689245223999, + "learning_rate": 4.8924074218733855e-06, + "loss": 0.6354, + "step": 1280 + }, + { + "epoch": 0.6056737588652482, + "grad_norm": 2.804450035095215, + "learning_rate": 4.892226306531297e-06, + "loss": 0.6595, + "step": 1281 + }, + { + "epoch": 0.6061465721040189, + "grad_norm": 3.1559767723083496, + "learning_rate": 4.892045042235765e-06, + 
"loss": 0.6664, + "step": 1282 + }, + { + "epoch": 0.6066193853427896, + "grad_norm": 2.844341993331909, + "learning_rate": 4.891863628998079e-06, + "loss": 0.7454, + "step": 1283 + }, + { + "epoch": 0.6070921985815603, + "grad_norm": 2.686602830886841, + "learning_rate": 4.891682066829532e-06, + "loss": 0.6755, + "step": 1284 + }, + { + "epoch": 0.607565011820331, + "grad_norm": 2.736457347869873, + "learning_rate": 4.8915003557414285e-06, + "loss": 0.6305, + "step": 1285 + }, + { + "epoch": 0.6080378250591016, + "grad_norm": 2.661362409591675, + "learning_rate": 4.891318495745086e-06, + "loss": 0.5958, + "step": 1286 + }, + { + "epoch": 0.6085106382978723, + "grad_norm": 2.707348108291626, + "learning_rate": 4.8911364868518255e-06, + "loss": 0.5824, + "step": 1287 + }, + { + "epoch": 0.608983451536643, + "grad_norm": 2.9798858165740967, + "learning_rate": 4.890954329072981e-06, + "loss": 0.5981, + "step": 1288 + }, + { + "epoch": 0.6094562647754137, + "grad_norm": 2.6285455226898193, + "learning_rate": 4.890772022419895e-06, + "loss": 0.6194, + "step": 1289 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 2.9254322052001953, + "learning_rate": 4.890589566903917e-06, + "loss": 0.6002, + "step": 1290 + }, + { + "epoch": 0.6104018912529551, + "grad_norm": 2.6458325386047363, + "learning_rate": 4.89040696253641e-06, + "loss": 0.5457, + "step": 1291 + }, + { + "epoch": 0.6108747044917258, + "grad_norm": 2.508242607116699, + "learning_rate": 4.890224209328743e-06, + "loss": 0.6168, + "step": 1292 + }, + { + "epoch": 0.6113475177304964, + "grad_norm": 3.034785509109497, + "learning_rate": 4.890041307292296e-06, + "loss": 0.664, + "step": 1293 + }, + { + "epoch": 0.6118203309692671, + "grad_norm": 3.52469539642334, + "learning_rate": 4.889858256438455e-06, + "loss": 0.7301, + "step": 1294 + }, + { + "epoch": 0.6122931442080378, + "grad_norm": 2.9145348072052, + "learning_rate": 4.889675056778622e-06, + "loss": 0.6494, + "step": 1295 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 2.831829071044922, + "learning_rate": 4.8894917083242e-06, + "loss": 0.6064, + "step": 1296 + }, + { + "epoch": 0.6132387706855792, + "grad_norm": 2.6883130073547363, + "learning_rate": 4.889308211086608e-06, + "loss": 0.5642, + "step": 1297 + }, + { + "epoch": 0.6137115839243499, + "grad_norm": 3.0605485439300537, + "learning_rate": 4.889124565077269e-06, + "loss": 0.6695, + "step": 1298 + }, + { + "epoch": 0.6141843971631206, + "grad_norm": 3.44062876701355, + "learning_rate": 4.88894077030762e-06, + "loss": 0.6415, + "step": 1299 + }, + { + "epoch": 0.6146572104018913, + "grad_norm": 2.5970818996429443, + "learning_rate": 4.888756826789105e-06, + "loss": 0.6518, + "step": 1300 + }, + { + "epoch": 0.6151300236406619, + "grad_norm": 4.2233567237854, + "learning_rate": 4.8885727345331755e-06, + "loss": 0.6555, + "step": 1301 + }, + { + "epoch": 0.6156028368794326, + "grad_norm": 2.645385503768921, + "learning_rate": 4.888388493551297e-06, + "loss": 0.6762, + "step": 1302 + }, + { + "epoch": 0.6160756501182033, + "grad_norm": 2.907954454421997, + "learning_rate": 4.8882041038549385e-06, + "loss": 0.6526, + "step": 1303 + }, + { + "epoch": 0.616548463356974, + "grad_norm": 2.482771873474121, + "learning_rate": 4.888019565455583e-06, + "loss": 0.628, + "step": 1304 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 2.7165915966033936, + "learning_rate": 4.88783487836472e-06, + "loss": 0.5743, + "step": 1305 + }, + { + "epoch": 0.6174940898345154, + "grad_norm": 3.095627546310425, + "learning_rate": 
4.88765004259385e-06, + "loss": 0.627, + "step": 1306 + }, + { + "epoch": 0.6179669030732861, + "grad_norm": 2.5018465518951416, + "learning_rate": 4.8874650581544805e-06, + "loss": 0.5215, + "step": 1307 + }, + { + "epoch": 0.6184397163120567, + "grad_norm": 3.094337224960327, + "learning_rate": 4.8872799250581316e-06, + "loss": 0.6979, + "step": 1308 + }, + { + "epoch": 0.6189125295508274, + "grad_norm": 3.1002209186553955, + "learning_rate": 4.887094643316329e-06, + "loss": 0.6565, + "step": 1309 + }, + { + "epoch": 0.6193853427895981, + "grad_norm": 2.551431894302368, + "learning_rate": 4.88690921294061e-06, + "loss": 0.5748, + "step": 1310 + }, + { + "epoch": 0.6198581560283688, + "grad_norm": 2.8282904624938965, + "learning_rate": 4.886723633942521e-06, + "loss": 0.676, + "step": 1311 + }, + { + "epoch": 0.6203309692671395, + "grad_norm": 2.8887810707092285, + "learning_rate": 4.886537906333617e-06, + "loss": 0.5971, + "step": 1312 + }, + { + "epoch": 0.6208037825059102, + "grad_norm": 2.9989118576049805, + "learning_rate": 4.886352030125462e-06, + "loss": 0.6341, + "step": 1313 + }, + { + "epoch": 0.6212765957446809, + "grad_norm": 2.8042776584625244, + "learning_rate": 4.886166005329629e-06, + "loss": 0.6578, + "step": 1314 + }, + { + "epoch": 0.6217494089834515, + "grad_norm": 2.4980967044830322, + "learning_rate": 4.8859798319577026e-06, + "loss": 0.6711, + "step": 1315 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 2.762369155883789, + "learning_rate": 4.885793510021274e-06, + "loss": 0.5747, + "step": 1316 + }, + { + "epoch": 0.6226950354609929, + "grad_norm": 3.136327028274536, + "learning_rate": 4.885607039531945e-06, + "loss": 0.7544, + "step": 1317 + }, + { + "epoch": 0.6231678486997636, + "grad_norm": 2.8736963272094727, + "learning_rate": 4.885420420501327e-06, + "loss": 0.6603, + "step": 1318 + }, + { + "epoch": 0.6236406619385343, + "grad_norm": 2.766237497329712, + "learning_rate": 4.885233652941039e-06, + "loss": 0.581, + "step": 1319 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 2.4740939140319824, + "learning_rate": 4.88504673686271e-06, + "loss": 0.6335, + "step": 1320 + }, + { + "epoch": 0.6245862884160757, + "grad_norm": 3.324795961380005, + "learning_rate": 4.884859672277978e-06, + "loss": 0.6019, + "step": 1321 + }, + { + "epoch": 0.6250591016548463, + "grad_norm": 3.521327257156372, + "learning_rate": 4.884672459198493e-06, + "loss": 0.6104, + "step": 1322 + }, + { + "epoch": 0.625531914893617, + "grad_norm": 2.7728071212768555, + "learning_rate": 4.884485097635909e-06, + "loss": 0.6714, + "step": 1323 + }, + { + "epoch": 0.6260047281323877, + "grad_norm": 3.0738155841827393, + "learning_rate": 4.884297587601895e-06, + "loss": 0.604, + "step": 1324 + }, + { + "epoch": 0.6264775413711584, + "grad_norm": 2.719240427017212, + "learning_rate": 4.884109929108124e-06, + "loss": 0.6795, + "step": 1325 + }, + { + "epoch": 0.6269503546099291, + "grad_norm": 2.4108200073242188, + "learning_rate": 4.883922122166282e-06, + "loss": 0.5846, + "step": 1326 + }, + { + "epoch": 0.6274231678486998, + "grad_norm": 2.393899917602539, + "learning_rate": 4.883734166788063e-06, + "loss": 0.6188, + "step": 1327 + }, + { + "epoch": 0.6278959810874705, + "grad_norm": 4.555255889892578, + "learning_rate": 4.883546062985169e-06, + "loss": 0.5962, + "step": 1328 + }, + { + "epoch": 0.6283687943262412, + "grad_norm": 2.571075439453125, + "learning_rate": 4.883357810769315e-06, + "loss": 0.6165, + "step": 1329 + }, + { + "epoch": 0.6288416075650118, + "grad_norm": 
2.553115129470825, + "learning_rate": 4.8831694101522185e-06, + "loss": 0.6787, + "step": 1330 + }, + { + "epoch": 0.6293144208037825, + "grad_norm": 3.2564642429351807, + "learning_rate": 4.882980861145614e-06, + "loss": 0.659, + "step": 1331 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 2.535216808319092, + "learning_rate": 4.882792163761241e-06, + "loss": 0.6176, + "step": 1332 + }, + { + "epoch": 0.6302600472813239, + "grad_norm": 3.097921848297119, + "learning_rate": 4.882603318010847e-06, + "loss": 0.6822, + "step": 1333 + }, + { + "epoch": 0.6307328605200946, + "grad_norm": 2.8135175704956055, + "learning_rate": 4.882414323906192e-06, + "loss": 0.6782, + "step": 1334 + }, + { + "epoch": 0.6312056737588653, + "grad_norm": 2.724634885787964, + "learning_rate": 4.882225181459044e-06, + "loss": 0.6545, + "step": 1335 + }, + { + "epoch": 0.631678486997636, + "grad_norm": 2.9585227966308594, + "learning_rate": 4.882035890681179e-06, + "loss": 0.6218, + "step": 1336 + }, + { + "epoch": 0.6321513002364066, + "grad_norm": 2.6952011585235596, + "learning_rate": 4.881846451584385e-06, + "loss": 0.6, + "step": 1337 + }, + { + "epoch": 0.6326241134751773, + "grad_norm": 3.1400704383850098, + "learning_rate": 4.881656864180455e-06, + "loss": 0.6687, + "step": 1338 + }, + { + "epoch": 0.633096926713948, + "grad_norm": 2.8382487297058105, + "learning_rate": 4.881467128481197e-06, + "loss": 0.574, + "step": 1339 + }, + { + "epoch": 0.6335697399527187, + "grad_norm": 2.8520095348358154, + "learning_rate": 4.881277244498422e-06, + "loss": 0.6582, + "step": 1340 + }, + { + "epoch": 0.6340425531914894, + "grad_norm": 2.703498363494873, + "learning_rate": 4.881087212243956e-06, + "loss": 0.7224, + "step": 1341 + }, + { + "epoch": 0.6345153664302601, + "grad_norm": 3.697205066680908, + "learning_rate": 4.880897031729629e-06, + "loss": 0.6582, + "step": 1342 + }, + { + "epoch": 0.6349881796690308, + "grad_norm": 2.7625808715820312, + "learning_rate": 4.880706702967284e-06, + "loss": 0.574, + "step": 1343 + }, + { + "epoch": 0.6354609929078014, + "grad_norm": 2.949984073638916, + "learning_rate": 4.880516225968771e-06, + "loss": 0.66, + "step": 1344 + }, + { + "epoch": 0.6359338061465721, + "grad_norm": 2.548269748687744, + "learning_rate": 4.8803256007459525e-06, + "loss": 0.642, + "step": 1345 + }, + { + "epoch": 0.6364066193853428, + "grad_norm": 2.5102174282073975, + "learning_rate": 4.8801348273106945e-06, + "loss": 0.6238, + "step": 1346 + }, + { + "epoch": 0.6368794326241135, + "grad_norm": 2.9847946166992188, + "learning_rate": 4.8799439056748786e-06, + "loss": 0.5416, + "step": 1347 + }, + { + "epoch": 0.6373522458628842, + "grad_norm": 2.8711049556732178, + "learning_rate": 4.879752835850391e-06, + "loss": 0.6427, + "step": 1348 + }, + { + "epoch": 0.6378250591016549, + "grad_norm": 2.7901716232299805, + "learning_rate": 4.879561617849129e-06, + "loss": 0.6026, + "step": 1349 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 2.659778356552124, + "learning_rate": 4.879370251682999e-06, + "loss": 0.6623, + "step": 1350 + }, + { + "epoch": 0.6387706855791963, + "grad_norm": 3.224386692047119, + "learning_rate": 4.879178737363917e-06, + "loss": 0.6485, + "step": 1351 + }, + { + "epoch": 0.6392434988179669, + "grad_norm": 2.6385605335235596, + "learning_rate": 4.8789870749038076e-06, + "loss": 0.5866, + "step": 1352 + }, + { + "epoch": 0.6397163120567376, + "grad_norm": 2.807713270187378, + "learning_rate": 4.8787952643146045e-06, + "loss": 0.6537, + "step": 1353 + }, + { + "epoch": 
0.6401891252955083, + "grad_norm": 2.5689280033111572, + "learning_rate": 4.878603305608251e-06, + "loss": 0.6216, + "step": 1354 + }, + { + "epoch": 0.640661938534279, + "grad_norm": 2.7347843647003174, + "learning_rate": 4.8784111987967e-06, + "loss": 0.6318, + "step": 1355 + }, + { + "epoch": 0.6411347517730497, + "grad_norm": 2.5210378170013428, + "learning_rate": 4.878218943891911e-06, + "loss": 0.5472, + "step": 1356 + }, + { + "epoch": 0.6416075650118204, + "grad_norm": 2.866785764694214, + "learning_rate": 4.878026540905858e-06, + "loss": 0.7108, + "step": 1357 + }, + { + "epoch": 0.642080378250591, + "grad_norm": 2.923314332962036, + "learning_rate": 4.877833989850519e-06, + "loss": 0.5557, + "step": 1358 + }, + { + "epoch": 0.6425531914893617, + "grad_norm": 2.925463914871216, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.6382, + "step": 1359 + }, + { + "epoch": 0.6430260047281324, + "grad_norm": 2.909644365310669, + "learning_rate": 4.877448443579952e-06, + "loss": 0.5603, + "step": 1360 + }, + { + "epoch": 0.6434988179669031, + "grad_norm": 3.501148223876953, + "learning_rate": 4.8772554483887306e-06, + "loss": 0.6722, + "step": 1361 + }, + { + "epoch": 0.6439716312056738, + "grad_norm": 2.823765516281128, + "learning_rate": 4.877062305176235e-06, + "loss": 0.6408, + "step": 1362 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 2.9807584285736084, + "learning_rate": 4.8768690139544935e-06, + "loss": 0.5984, + "step": 1363 + }, + { + "epoch": 0.6449172576832152, + "grad_norm": 2.8411378860473633, + "learning_rate": 4.8766755747355405e-06, + "loss": 0.6231, + "step": 1364 + }, + { + "epoch": 0.6453900709219859, + "grad_norm": 3.158952236175537, + "learning_rate": 4.8764819875314215e-06, + "loss": 0.6441, + "step": 1365 + }, + { + "epoch": 0.6458628841607565, + "grad_norm": 2.9614369869232178, + "learning_rate": 4.876288252354189e-06, + "loss": 0.6308, + "step": 1366 + }, + { + "epoch": 0.6463356973995272, + "grad_norm": 3.073805570602417, + "learning_rate": 4.876094369215907e-06, + "loss": 0.6046, + "step": 1367 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 2.719189405441284, + "learning_rate": 4.875900338128648e-06, + "loss": 0.6082, + "step": 1368 + }, + { + "epoch": 0.6472813238770686, + "grad_norm": 2.676726818084717, + "learning_rate": 4.8757061591044914e-06, + "loss": 0.6344, + "step": 1369 + }, + { + "epoch": 0.6477541371158393, + "grad_norm": 2.955256938934326, + "learning_rate": 4.87551183215553e-06, + "loss": 0.6506, + "step": 1370 + }, + { + "epoch": 0.64822695035461, + "grad_norm": 2.5672218799591064, + "learning_rate": 4.875317357293864e-06, + "loss": 0.5284, + "step": 1371 + }, + { + "epoch": 0.6486997635933807, + "grad_norm": 2.5860238075256348, + "learning_rate": 4.875122734531602e-06, + "loss": 0.667, + "step": 1372 + }, + { + "epoch": 0.6491725768321513, + "grad_norm": 3.1037003993988037, + "learning_rate": 4.8749279638808605e-06, + "loss": 0.6902, + "step": 1373 + }, + { + "epoch": 0.649645390070922, + "grad_norm": 2.7715282440185547, + "learning_rate": 4.874733045353769e-06, + "loss": 0.6291, + "step": 1374 + }, + { + "epoch": 0.6501182033096927, + "grad_norm": 2.527071475982666, + "learning_rate": 4.874537978962463e-06, + "loss": 0.5565, + "step": 1375 + }, + { + "epoch": 0.6505910165484634, + "grad_norm": 2.722092628479004, + "learning_rate": 4.874342764719091e-06, + "loss": 0.5724, + "step": 1376 + }, + { + "epoch": 0.6510638297872341, + "grad_norm": 2.6342411041259766, + "learning_rate": 4.874147402635805e-06, + "loss": 0.6308, + 
"step": 1377 + }, + { + "epoch": 0.6515366430260048, + "grad_norm": 2.3850719928741455, + "learning_rate": 4.8739518927247695e-06, + "loss": 0.5692, + "step": 1378 + }, + { + "epoch": 0.6520094562647755, + "grad_norm": 2.9787259101867676, + "learning_rate": 4.873756234998161e-06, + "loss": 0.6953, + "step": 1379 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 2.634141683578491, + "learning_rate": 4.873560429468159e-06, + "loss": 0.6077, + "step": 1380 + }, + { + "epoch": 0.6529550827423168, + "grad_norm": 2.803046941757202, + "learning_rate": 4.873364476146958e-06, + "loss": 0.6657, + "step": 1381 + }, + { + "epoch": 0.6534278959810875, + "grad_norm": 2.762827157974243, + "learning_rate": 4.8731683750467574e-06, + "loss": 0.6061, + "step": 1382 + }, + { + "epoch": 0.6539007092198581, + "grad_norm": 2.6654391288757324, + "learning_rate": 4.872972126179768e-06, + "loss": 0.6387, + "step": 1383 + }, + { + "epoch": 0.6543735224586288, + "grad_norm": 2.4363625049591064, + "learning_rate": 4.872775729558209e-06, + "loss": 0.5623, + "step": 1384 + }, + { + "epoch": 0.6548463356973995, + "grad_norm": 2.528959035873413, + "learning_rate": 4.87257918519431e-06, + "loss": 0.5609, + "step": 1385 + }, + { + "epoch": 0.6553191489361702, + "grad_norm": 2.718383312225342, + "learning_rate": 4.872382493100309e-06, + "loss": 0.5575, + "step": 1386 + }, + { + "epoch": 0.6557919621749408, + "grad_norm": 2.660841226577759, + "learning_rate": 4.872185653288453e-06, + "loss": 0.6106, + "step": 1387 + }, + { + "epoch": 0.6562647754137115, + "grad_norm": 2.508753538131714, + "learning_rate": 4.871988665770997e-06, + "loss": 0.5705, + "step": 1388 + }, + { + "epoch": 0.6567375886524822, + "grad_norm": 2.5134334564208984, + "learning_rate": 4.871791530560208e-06, + "loss": 0.5592, + "step": 1389 + }, + { + "epoch": 0.6572104018912529, + "grad_norm": 2.7475597858428955, + "learning_rate": 4.871594247668361e-06, + "loss": 0.6277, + "step": 1390 + }, + { + "epoch": 0.6576832151300236, + "grad_norm": 2.793616533279419, + "learning_rate": 4.871396817107739e-06, + "loss": 0.595, + "step": 1391 + }, + { + "epoch": 0.6581560283687943, + "grad_norm": 2.8285086154937744, + "learning_rate": 4.871199238890635e-06, + "loss": 0.6094, + "step": 1392 + }, + { + "epoch": 0.658628841607565, + "grad_norm": 2.74124813079834, + "learning_rate": 4.871001513029352e-06, + "loss": 0.6296, + "step": 1393 + }, + { + "epoch": 0.6591016548463356, + "grad_norm": 2.761237621307373, + "learning_rate": 4.870803639536202e-06, + "loss": 0.5702, + "step": 1394 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 2.761038064956665, + "learning_rate": 4.870605618423504e-06, + "loss": 0.6195, + "step": 1395 + }, + { + "epoch": 0.660047281323877, + "grad_norm": 2.8812482357025146, + "learning_rate": 4.870407449703589e-06, + "loss": 0.616, + "step": 1396 + }, + { + "epoch": 0.6605200945626477, + "grad_norm": 2.9966578483581543, + "learning_rate": 4.870209133388797e-06, + "loss": 0.6547, + "step": 1397 + }, + { + "epoch": 0.6609929078014184, + "grad_norm": 2.7969017028808594, + "learning_rate": 4.870010669491474e-06, + "loss": 0.5762, + "step": 1398 + }, + { + "epoch": 0.6614657210401891, + "grad_norm": 2.557783842086792, + "learning_rate": 4.86981205802398e-06, + "loss": 0.6184, + "step": 1399 + }, + { + "epoch": 0.6619385342789598, + "grad_norm": 2.5393927097320557, + "learning_rate": 4.86961329899868e-06, + "loss": 0.5953, + "step": 1400 + }, + { + "epoch": 0.6624113475177305, + "grad_norm": 2.7745981216430664, + "learning_rate": 
4.86941439242795e-06, + "loss": 0.5967, + "step": 1401 + }, + { + "epoch": 0.6628841607565011, + "grad_norm": 2.650381326675415, + "learning_rate": 4.869215338324176e-06, + "loss": 0.5667, + "step": 1402 + }, + { + "epoch": 0.6633569739952718, + "grad_norm": 2.583169937133789, + "learning_rate": 4.869016136699751e-06, + "loss": 0.549, + "step": 1403 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 2.984978437423706, + "learning_rate": 4.868816787567079e-06, + "loss": 0.5931, + "step": 1404 + }, + { + "epoch": 0.6643026004728132, + "grad_norm": 3.1947181224823, + "learning_rate": 4.868617290938573e-06, + "loss": 0.5473, + "step": 1405 + }, + { + "epoch": 0.6647754137115839, + "grad_norm": 2.562927007675171, + "learning_rate": 4.868417646826654e-06, + "loss": 0.6878, + "step": 1406 + }, + { + "epoch": 0.6652482269503546, + "grad_norm": 2.8741261959075928, + "learning_rate": 4.868217855243754e-06, + "loss": 0.6312, + "step": 1407 + }, + { + "epoch": 0.6657210401891253, + "grad_norm": 2.9834797382354736, + "learning_rate": 4.868017916202312e-06, + "loss": 0.5624, + "step": 1408 + }, + { + "epoch": 0.6661938534278959, + "grad_norm": 2.6935982704162598, + "learning_rate": 4.8678178297147785e-06, + "loss": 0.5857, + "step": 1409 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.8200576305389404, + "learning_rate": 4.86761759579361e-06, + "loss": 0.6153, + "step": 1410 + }, + { + "epoch": 0.6671394799054373, + "grad_norm": 2.831425189971924, + "learning_rate": 4.867417214451276e-06, + "loss": 0.6495, + "step": 1411 + }, + { + "epoch": 0.667612293144208, + "grad_norm": 2.733565092086792, + "learning_rate": 4.867216685700253e-06, + "loss": 0.6036, + "step": 1412 + }, + { + "epoch": 0.6680851063829787, + "grad_norm": 3.0609400272369385, + "learning_rate": 4.867016009553027e-06, + "loss": 0.6773, + "step": 1413 + }, + { + "epoch": 0.6685579196217494, + "grad_norm": 2.665452241897583, + "learning_rate": 4.866815186022093e-06, + "loss": 0.6256, + "step": 1414 + }, + { + "epoch": 0.6690307328605201, + "grad_norm": 2.9480721950531006, + "learning_rate": 4.866614215119956e-06, + "loss": 0.535, + "step": 1415 + }, + { + "epoch": 0.6695035460992907, + "grad_norm": 2.5514180660247803, + "learning_rate": 4.866413096859128e-06, + "loss": 0.6588, + "step": 1416 + }, + { + "epoch": 0.6699763593380614, + "grad_norm": 3.3442373275756836, + "learning_rate": 4.866211831252134e-06, + "loss": 0.5754, + "step": 1417 + }, + { + "epoch": 0.6704491725768321, + "grad_norm": 2.521467685699463, + "learning_rate": 4.866010418311504e-06, + "loss": 0.5546, + "step": 1418 + }, + { + "epoch": 0.6709219858156028, + "grad_norm": 2.930706262588501, + "learning_rate": 4.865808858049781e-06, + "loss": 0.589, + "step": 1419 + }, + { + "epoch": 0.6713947990543735, + "grad_norm": 2.6298375129699707, + "learning_rate": 4.865607150479513e-06, + "loss": 0.5915, + "step": 1420 + }, + { + "epoch": 0.6718676122931442, + "grad_norm": 2.9554293155670166, + "learning_rate": 4.8654052956132615e-06, + "loss": 0.6654, + "step": 1421 + }, + { + "epoch": 0.6723404255319149, + "grad_norm": 3.2706902027130127, + "learning_rate": 4.865203293463593e-06, + "loss": 0.7115, + "step": 1422 + }, + { + "epoch": 0.6728132387706856, + "grad_norm": 3.041539430618286, + "learning_rate": 4.865001144043088e-06, + "loss": 0.5818, + "step": 1423 + }, + { + "epoch": 0.6732860520094562, + "grad_norm": 3.1314544677734375, + "learning_rate": 4.864798847364331e-06, + "loss": 0.5822, + "step": 1424 + }, + { + "epoch": 0.6737588652482269, + "grad_norm": 
2.5301461219787598, + "learning_rate": 4.86459640343992e-06, + "loss": 0.5525, + "step": 1425 + }, + { + "epoch": 0.6742316784869976, + "grad_norm": 2.809295892715454, + "learning_rate": 4.864393812282458e-06, + "loss": 0.6768, + "step": 1426 + }, + { + "epoch": 0.6747044917257683, + "grad_norm": 2.794664144515991, + "learning_rate": 4.864191073904562e-06, + "loss": 0.5793, + "step": 1427 + }, + { + "epoch": 0.675177304964539, + "grad_norm": 2.7771105766296387, + "learning_rate": 4.863988188318854e-06, + "loss": 0.6453, + "step": 1428 + }, + { + "epoch": 0.6756501182033097, + "grad_norm": 2.6431946754455566, + "learning_rate": 4.863785155537967e-06, + "loss": 0.5877, + "step": 1429 + }, + { + "epoch": 0.6761229314420804, + "grad_norm": 2.951353073120117, + "learning_rate": 4.863581975574544e-06, + "loss": 0.6793, + "step": 1430 + }, + { + "epoch": 0.676595744680851, + "grad_norm": 3.1336071491241455, + "learning_rate": 4.863378648441235e-06, + "loss": 0.6695, + "step": 1431 + }, + { + "epoch": 0.6770685579196217, + "grad_norm": 2.735982656478882, + "learning_rate": 4.8631751741507e-06, + "loss": 0.5239, + "step": 1432 + }, + { + "epoch": 0.6775413711583924, + "grad_norm": 2.7085206508636475, + "learning_rate": 4.862971552715611e-06, + "loss": 0.6837, + "step": 1433 + }, + { + "epoch": 0.6780141843971631, + "grad_norm": 3.136528730392456, + "learning_rate": 4.8627677841486436e-06, + "loss": 0.683, + "step": 1434 + }, + { + "epoch": 0.6784869976359338, + "grad_norm": 2.7879369258880615, + "learning_rate": 4.862563868462486e-06, + "loss": 0.608, + "step": 1435 + }, + { + "epoch": 0.6789598108747045, + "grad_norm": 2.7937729358673096, + "learning_rate": 4.862359805669837e-06, + "loss": 0.6131, + "step": 1436 + }, + { + "epoch": 0.6794326241134752, + "grad_norm": 2.5988364219665527, + "learning_rate": 4.862155595783401e-06, + "loss": 0.6303, + "step": 1437 + }, + { + "epoch": 0.6799054373522458, + "grad_norm": 3.251070499420166, + "learning_rate": 4.861951238815894e-06, + "loss": 0.7246, + "step": 1438 + }, + { + "epoch": 0.6803782505910165, + "grad_norm": 2.646759271621704, + "learning_rate": 4.861746734780039e-06, + "loss": 0.6313, + "step": 1439 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 2.773866891860962, + "learning_rate": 4.861542083688573e-06, + "loss": 0.6463, + "step": 1440 + }, + { + "epoch": 0.6813238770685579, + "grad_norm": 2.759965658187866, + "learning_rate": 4.861337285554235e-06, + "loss": 0.5428, + "step": 1441 + }, + { + "epoch": 0.6817966903073286, + "grad_norm": 3.3250818252563477, + "learning_rate": 4.861132340389779e-06, + "loss": 0.6522, + "step": 1442 + }, + { + "epoch": 0.6822695035460993, + "grad_norm": 2.661797523498535, + "learning_rate": 4.860927248207965e-06, + "loss": 0.5871, + "step": 1443 + }, + { + "epoch": 0.68274231678487, + "grad_norm": 2.706289052963257, + "learning_rate": 4.860722009021563e-06, + "loss": 0.6651, + "step": 1444 + }, + { + "epoch": 0.6832151300236406, + "grad_norm": 2.8459298610687256, + "learning_rate": 4.860516622843354e-06, + "loss": 0.5827, + "step": 1445 + }, + { + "epoch": 0.6836879432624113, + "grad_norm": 3.1041831970214844, + "learning_rate": 4.860311089686125e-06, + "loss": 0.6727, + "step": 1446 + }, + { + "epoch": 0.684160756501182, + "grad_norm": 2.9382801055908203, + "learning_rate": 4.8601054095626746e-06, + "loss": 0.6002, + "step": 1447 + }, + { + "epoch": 0.6846335697399527, + "grad_norm": 2.782475471496582, + "learning_rate": 4.859899582485808e-06, + "loss": 0.6951, + "step": 1448 + }, + { + "epoch": 
0.6851063829787234, + "grad_norm": 3.313894510269165, + "learning_rate": 4.859693608468343e-06, + "loss": 0.6363, + "step": 1449 + }, + { + "epoch": 0.6855791962174941, + "grad_norm": 3.1639695167541504, + "learning_rate": 4.8594874875231045e-06, + "loss": 0.7002, + "step": 1450 + }, + { + "epoch": 0.6860520094562648, + "grad_norm": 2.6762218475341797, + "learning_rate": 4.859281219662926e-06, + "loss": 0.6246, + "step": 1451 + }, + { + "epoch": 0.6865248226950355, + "grad_norm": 2.8368663787841797, + "learning_rate": 4.85907480490065e-06, + "loss": 0.5906, + "step": 1452 + }, + { + "epoch": 0.6869976359338061, + "grad_norm": 2.887373208999634, + "learning_rate": 4.858868243249131e-06, + "loss": 0.5931, + "step": 1453 + }, + { + "epoch": 0.6874704491725768, + "grad_norm": 2.8115322589874268, + "learning_rate": 4.858661534721229e-06, + "loss": 0.6337, + "step": 1454 + }, + { + "epoch": 0.6879432624113475, + "grad_norm": 2.8470499515533447, + "learning_rate": 4.8584546793298174e-06, + "loss": 0.632, + "step": 1455 + }, + { + "epoch": 0.6884160756501182, + "grad_norm": 2.8229613304138184, + "learning_rate": 4.8582476770877725e-06, + "loss": 0.6494, + "step": 1456 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 2.4235479831695557, + "learning_rate": 4.858040528007987e-06, + "loss": 0.5709, + "step": 1457 + }, + { + "epoch": 0.6893617021276596, + "grad_norm": 2.9348199367523193, + "learning_rate": 4.857833232103356e-06, + "loss": 0.5404, + "step": 1458 + }, + { + "epoch": 0.6898345153664303, + "grad_norm": 2.8274219036102295, + "learning_rate": 4.857625789386789e-06, + "loss": 0.701, + "step": 1459 + }, + { + "epoch": 0.6903073286052009, + "grad_norm": 3.136929988861084, + "learning_rate": 4.857418199871203e-06, + "loss": 0.6971, + "step": 1460 + }, + { + "epoch": 0.6907801418439716, + "grad_norm": 2.8987185955047607, + "learning_rate": 4.8572104635695214e-06, + "loss": 0.6613, + "step": 1461 + }, + { + "epoch": 0.6912529550827423, + "grad_norm": 2.5073442459106445, + "learning_rate": 4.857002580494681e-06, + "loss": 0.6032, + "step": 1462 + }, + { + "epoch": 0.691725768321513, + "grad_norm": 2.7019522190093994, + "learning_rate": 4.856794550659625e-06, + "loss": 0.567, + "step": 1463 + }, + { + "epoch": 0.6921985815602837, + "grad_norm": 2.4795594215393066, + "learning_rate": 4.8565863740773054e-06, + "loss": 0.5777, + "step": 1464 + }, + { + "epoch": 0.6926713947990544, + "grad_norm": 3.032506227493286, + "learning_rate": 4.856378050760687e-06, + "loss": 0.607, + "step": 1465 + }, + { + "epoch": 0.6931442080378251, + "grad_norm": 3.052091121673584, + "learning_rate": 4.85616958072274e-06, + "loss": 0.591, + "step": 1466 + }, + { + "epoch": 0.6936170212765957, + "grad_norm": 2.704831838607788, + "learning_rate": 4.855960963976443e-06, + "loss": 0.6528, + "step": 1467 + }, + { + "epoch": 0.6940898345153664, + "grad_norm": 2.680995225906372, + "learning_rate": 4.855752200534788e-06, + "loss": 0.6294, + "step": 1468 + }, + { + "epoch": 0.6945626477541371, + "grad_norm": 2.3948659896850586, + "learning_rate": 4.855543290410774e-06, + "loss": 0.6091, + "step": 1469 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 2.6407411098480225, + "learning_rate": 4.855334233617407e-06, + "loss": 0.5572, + "step": 1470 + }, + { + "epoch": 0.6955082742316785, + "grad_norm": 2.5526835918426514, + "learning_rate": 4.8551250301677064e-06, + "loss": 0.5432, + "step": 1471 + }, + { + "epoch": 0.6959810874704492, + "grad_norm": 3.1237430572509766, + "learning_rate": 4.8549156800746965e-06, + "loss": 
0.5944, + "step": 1472 + }, + { + "epoch": 0.6964539007092199, + "grad_norm": 2.8112540245056152, + "learning_rate": 4.854706183351412e-06, + "loss": 0.604, + "step": 1473 + }, + { + "epoch": 0.6969267139479906, + "grad_norm": 2.664644479751587, + "learning_rate": 4.8544965400109e-06, + "loss": 0.5647, + "step": 1474 + }, + { + "epoch": 0.6973995271867612, + "grad_norm": 3.26310133934021, + "learning_rate": 4.854286750066212e-06, + "loss": 0.6999, + "step": 1475 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 2.9717442989349365, + "learning_rate": 4.8540768135304115e-06, + "loss": 0.6655, + "step": 1476 + }, + { + "epoch": 0.6983451536643026, + "grad_norm": 2.5302982330322266, + "learning_rate": 4.85386673041657e-06, + "loss": 0.6384, + "step": 1477 + }, + { + "epoch": 0.6988179669030733, + "grad_norm": 2.864877700805664, + "learning_rate": 4.853656500737769e-06, + "loss": 0.6834, + "step": 1478 + }, + { + "epoch": 0.699290780141844, + "grad_norm": 2.5522031784057617, + "learning_rate": 4.853446124507098e-06, + "loss": 0.5929, + "step": 1479 + }, + { + "epoch": 0.6997635933806147, + "grad_norm": 3.096477746963501, + "learning_rate": 4.853235601737656e-06, + "loss": 0.5737, + "step": 1480 + }, + { + "epoch": 0.7002364066193854, + "grad_norm": 2.884779214859009, + "learning_rate": 4.853024932442552e-06, + "loss": 0.6362, + "step": 1481 + }, + { + "epoch": 0.700709219858156, + "grad_norm": 3.368558406829834, + "learning_rate": 4.852814116634903e-06, + "loss": 0.6721, + "step": 1482 + }, + { + "epoch": 0.7011820330969267, + "grad_norm": 2.742414951324463, + "learning_rate": 4.852603154327837e-06, + "loss": 0.6212, + "step": 1483 + }, + { + "epoch": 0.7016548463356974, + "grad_norm": 2.53454852104187, + "learning_rate": 4.8523920455344864e-06, + "loss": 0.6675, + "step": 1484 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 2.9354238510131836, + "learning_rate": 4.852180790267999e-06, + "loss": 0.6692, + "step": 1485 + }, + { + "epoch": 0.7026004728132388, + "grad_norm": 2.585070848464966, + "learning_rate": 4.8519693885415274e-06, + "loss": 0.6215, + "step": 1486 + }, + { + "epoch": 0.7030732860520095, + "grad_norm": 2.9047999382019043, + "learning_rate": 4.851757840368235e-06, + "loss": 0.6231, + "step": 1487 + }, + { + "epoch": 0.7035460992907802, + "grad_norm": 3.0930933952331543, + "learning_rate": 4.851546145761295e-06, + "loss": 0.7267, + "step": 1488 + }, + { + "epoch": 0.7040189125295508, + "grad_norm": 3.0224719047546387, + "learning_rate": 4.8513343047338875e-06, + "loss": 0.6293, + "step": 1489 + }, + { + "epoch": 0.7044917257683215, + "grad_norm": 2.5758471488952637, + "learning_rate": 4.851122317299203e-06, + "loss": 0.5855, + "step": 1490 + }, + { + "epoch": 0.7049645390070922, + "grad_norm": 2.579272508621216, + "learning_rate": 4.850910183470441e-06, + "loss": 0.582, + "step": 1491 + }, + { + "epoch": 0.7054373522458629, + "grad_norm": 2.8148300647735596, + "learning_rate": 4.85069790326081e-06, + "loss": 0.6396, + "step": 1492 + }, + { + "epoch": 0.7059101654846336, + "grad_norm": 2.6380527019500732, + "learning_rate": 4.850485476683528e-06, + "loss": 0.6114, + "step": 1493 + }, + { + "epoch": 0.7063829787234043, + "grad_norm": 2.7736263275146484, + "learning_rate": 4.850272903751823e-06, + "loss": 0.6683, + "step": 1494 + }, + { + "epoch": 0.706855791962175, + "grad_norm": 3.1958179473876953, + "learning_rate": 4.8500601844789285e-06, + "loss": 0.6265, + "step": 1495 + }, + { + "epoch": 0.7073286052009456, + "grad_norm": 3.783212423324585, + "learning_rate": 
4.8498473188780916e-06, + "loss": 0.6078, + "step": 1496 + }, + { + "epoch": 0.7078014184397163, + "grad_norm": 2.6656646728515625, + "learning_rate": 4.849634306962566e-06, + "loss": 0.5756, + "step": 1497 + }, + { + "epoch": 0.708274231678487, + "grad_norm": 2.757141590118408, + "learning_rate": 4.849421148745615e-06, + "loss": 0.5596, + "step": 1498 + }, + { + "epoch": 0.7087470449172577, + "grad_norm": 3.0391886234283447, + "learning_rate": 4.849207844240511e-06, + "loss": 0.5293, + "step": 1499 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 2.981912851333618, + "learning_rate": 4.848994393460535e-06, + "loss": 0.598, + "step": 1500 + }, + { + "epoch": 0.7096926713947991, + "grad_norm": 2.5470798015594482, + "learning_rate": 4.848780796418978e-06, + "loss": 0.6266, + "step": 1501 + }, + { + "epoch": 0.7101654846335698, + "grad_norm": 2.8394415378570557, + "learning_rate": 4.8485670531291415e-06, + "loss": 0.6844, + "step": 1502 + }, + { + "epoch": 0.7106382978723405, + "grad_norm": 3.2023508548736572, + "learning_rate": 4.848353163604331e-06, + "loss": 0.6134, + "step": 1503 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 2.98245906829834, + "learning_rate": 4.848139127857867e-06, + "loss": 0.7084, + "step": 1504 + }, + { + "epoch": 0.7115839243498818, + "grad_norm": 2.5917441844940186, + "learning_rate": 4.847924945903076e-06, + "loss": 0.5676, + "step": 1505 + }, + { + "epoch": 0.7120567375886525, + "grad_norm": 2.8736681938171387, + "learning_rate": 4.847710617753294e-06, + "loss": 0.6304, + "step": 1506 + }, + { + "epoch": 0.7125295508274232, + "grad_norm": 2.7832682132720947, + "learning_rate": 4.847496143421866e-06, + "loss": 0.5705, + "step": 1507 + }, + { + "epoch": 0.7130023640661939, + "grad_norm": 2.480560779571533, + "learning_rate": 4.847281522922147e-06, + "loss": 0.5595, + "step": 1508 + }, + { + "epoch": 0.7134751773049646, + "grad_norm": 2.357675313949585, + "learning_rate": 4.847066756267499e-06, + "loss": 0.5065, + "step": 1509 + }, + { + "epoch": 0.7139479905437353, + "grad_norm": 2.632669448852539, + "learning_rate": 4.846851843471296e-06, + "loss": 0.6949, + "step": 1510 + }, + { + "epoch": 0.7144208037825059, + "grad_norm": 2.7691073417663574, + "learning_rate": 4.84663678454692e-06, + "loss": 0.6638, + "step": 1511 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 2.5647685527801514, + "learning_rate": 4.846421579507761e-06, + "loss": 0.6098, + "step": 1512 + }, + { + "epoch": 0.7153664302600473, + "grad_norm": 2.476701021194458, + "learning_rate": 4.846206228367218e-06, + "loss": 0.592, + "step": 1513 + }, + { + "epoch": 0.715839243498818, + "grad_norm": 2.805727958679199, + "learning_rate": 4.845990731138702e-06, + "loss": 0.5466, + "step": 1514 + }, + { + "epoch": 0.7163120567375887, + "grad_norm": 2.551392078399658, + "learning_rate": 4.84577508783563e-06, + "loss": 0.6039, + "step": 1515 + }, + { + "epoch": 0.7167848699763594, + "grad_norm": 2.6861350536346436, + "learning_rate": 4.845559298471429e-06, + "loss": 0.6427, + "step": 1516 + }, + { + "epoch": 0.7172576832151301, + "grad_norm": 3.1908371448516846, + "learning_rate": 4.845343363059535e-06, + "loss": 0.5447, + "step": 1517 + }, + { + "epoch": 0.7177304964539007, + "grad_norm": 2.9021761417388916, + "learning_rate": 4.845127281613394e-06, + "loss": 0.5836, + "step": 1518 + }, + { + "epoch": 0.7182033096926714, + "grad_norm": 2.476670742034912, + "learning_rate": 4.844911054146461e-06, + "loss": 0.5863, + "step": 1519 + }, + { + "epoch": 0.7186761229314421, + "grad_norm": 
2.662935495376587, + "learning_rate": 4.844694680672198e-06, + "loss": 0.5678, + "step": 1520 + }, + { + "epoch": 0.7191489361702128, + "grad_norm": 2.677896738052368, + "learning_rate": 4.844478161204079e-06, + "loss": 0.6195, + "step": 1521 + }, + { + "epoch": 0.7196217494089835, + "grad_norm": 2.781921863555908, + "learning_rate": 4.844261495755585e-06, + "loss": 0.643, + "step": 1522 + }, + { + "epoch": 0.7200945626477542, + "grad_norm": 3.0157392024993896, + "learning_rate": 4.844044684340206e-06, + "loss": 0.7559, + "step": 1523 + }, + { + "epoch": 0.7205673758865249, + "grad_norm": 2.8109354972839355, + "learning_rate": 4.843827726971444e-06, + "loss": 0.6264, + "step": 1524 + }, + { + "epoch": 0.7210401891252955, + "grad_norm": 3.0953569412231445, + "learning_rate": 4.8436106236628064e-06, + "loss": 0.6429, + "step": 1525 + }, + { + "epoch": 0.7215130023640662, + "grad_norm": 2.6850643157958984, + "learning_rate": 4.843393374427812e-06, + "loss": 0.6598, + "step": 1526 + }, + { + "epoch": 0.7219858156028369, + "grad_norm": 3.043480634689331, + "learning_rate": 4.8431759792799874e-06, + "loss": 0.6331, + "step": 1527 + }, + { + "epoch": 0.7224586288416076, + "grad_norm": 2.723870038986206, + "learning_rate": 4.842958438232868e-06, + "loss": 0.6259, + "step": 1528 + }, + { + "epoch": 0.7229314420803783, + "grad_norm": 2.822492837905884, + "learning_rate": 4.842740751300002e-06, + "loss": 0.6554, + "step": 1529 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.7866315841674805, + "learning_rate": 4.842522918494941e-06, + "loss": 0.6991, + "step": 1530 + }, + { + "epoch": 0.7238770685579197, + "grad_norm": 2.8881826400756836, + "learning_rate": 4.84230493983125e-06, + "loss": 0.5876, + "step": 1531 + }, + { + "epoch": 0.7243498817966904, + "grad_norm": 2.7456939220428467, + "learning_rate": 4.8420868153225e-06, + "loss": 0.6188, + "step": 1532 + }, + { + "epoch": 0.724822695035461, + "grad_norm": 3.0257532596588135, + "learning_rate": 4.841868544982274e-06, + "loss": 0.63, + "step": 1533 + }, + { + "epoch": 0.7252955082742317, + "grad_norm": 3.1581954956054688, + "learning_rate": 4.841650128824164e-06, + "loss": 0.7214, + "step": 1534 + }, + { + "epoch": 0.7257683215130024, + "grad_norm": 2.9174306392669678, + "learning_rate": 4.841431566861767e-06, + "loss": 0.704, + "step": 1535 + }, + { + "epoch": 0.7262411347517731, + "grad_norm": 2.5019054412841797, + "learning_rate": 4.8412128591086935e-06, + "loss": 0.6298, + "step": 1536 + }, + { + "epoch": 0.7267139479905438, + "grad_norm": 2.724285125732422, + "learning_rate": 4.840994005578562e-06, + "loss": 0.6289, + "step": 1537 + }, + { + "epoch": 0.7271867612293145, + "grad_norm": 2.5882341861724854, + "learning_rate": 4.840775006284998e-06, + "loss": 0.6355, + "step": 1538 + }, + { + "epoch": 0.7276595744680852, + "grad_norm": 3.1281991004943848, + "learning_rate": 4.840555861241638e-06, + "loss": 0.5551, + "step": 1539 + }, + { + "epoch": 0.7281323877068558, + "grad_norm": 2.6064817905426025, + "learning_rate": 4.840336570462127e-06, + "loss": 0.5543, + "step": 1540 + }, + { + "epoch": 0.7286052009456265, + "grad_norm": 2.67112398147583, + "learning_rate": 4.840117133960122e-06, + "loss": 0.6044, + "step": 1541 + }, + { + "epoch": 0.7290780141843972, + "grad_norm": 2.838022232055664, + "learning_rate": 4.839897551749282e-06, + "loss": 0.6814, + "step": 1542 + }, + { + "epoch": 0.7295508274231679, + "grad_norm": 2.8897151947021484, + "learning_rate": 4.839677823843283e-06, + "loss": 0.593, + "step": 1543 + }, + { + "epoch": 
0.7300236406619386, + "grad_norm": 2.9238014221191406, + "learning_rate": 4.839457950255805e-06, + "loss": 0.5544, + "step": 1544 + }, + { + "epoch": 0.7304964539007093, + "grad_norm": 3.016876459121704, + "learning_rate": 4.839237931000538e-06, + "loss": 0.6099, + "step": 1545 + }, + { + "epoch": 0.7309692671394799, + "grad_norm": 2.9415392875671387, + "learning_rate": 4.839017766091182e-06, + "loss": 0.6413, + "step": 1546 + }, + { + "epoch": 0.7314420803782505, + "grad_norm": 2.658067226409912, + "learning_rate": 4.838797455541446e-06, + "loss": 0.6534, + "step": 1547 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 2.460358142852783, + "learning_rate": 4.838576999365049e-06, + "loss": 0.5307, + "step": 1548 + }, + { + "epoch": 0.7323877068557919, + "grad_norm": 2.5818674564361572, + "learning_rate": 4.838356397575716e-06, + "loss": 0.6265, + "step": 1549 + }, + { + "epoch": 0.7328605200945626, + "grad_norm": 3.009197473526001, + "learning_rate": 4.838135650187183e-06, + "loss": 0.6957, + "step": 1550 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 2.738543748855591, + "learning_rate": 4.837914757213196e-06, + "loss": 0.646, + "step": 1551 + }, + { + "epoch": 0.733806146572104, + "grad_norm": 2.8208494186401367, + "learning_rate": 4.837693718667508e-06, + "loss": 0.5936, + "step": 1552 + }, + { + "epoch": 0.7342789598108747, + "grad_norm": 3.1574649810791016, + "learning_rate": 4.837472534563883e-06, + "loss": 0.6455, + "step": 1553 + }, + { + "epoch": 0.7347517730496453, + "grad_norm": 2.6737420558929443, + "learning_rate": 4.837251204916093e-06, + "loss": 0.5921, + "step": 1554 + }, + { + "epoch": 0.735224586288416, + "grad_norm": 2.424983024597168, + "learning_rate": 4.837029729737918e-06, + "loss": 0.6346, + "step": 1555 + }, + { + "epoch": 0.7356973995271867, + "grad_norm": 2.5163493156433105, + "learning_rate": 4.836808109043151e-06, + "loss": 0.6061, + "step": 1556 + }, + { + "epoch": 0.7361702127659574, + "grad_norm": 2.8377044200897217, + "learning_rate": 4.836586342845588e-06, + "loss": 0.611, + "step": 1557 + }, + { + "epoch": 0.7366430260047281, + "grad_norm": 2.5929181575775146, + "learning_rate": 4.83636443115904e-06, + "loss": 0.5496, + "step": 1558 + }, + { + "epoch": 0.7371158392434988, + "grad_norm": 2.5017223358154297, + "learning_rate": 4.836142373997323e-06, + "loss": 0.6235, + "step": 1559 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 2.822500228881836, + "learning_rate": 4.835920171374265e-06, + "loss": 0.6147, + "step": 1560 + }, + { + "epoch": 0.7380614657210401, + "grad_norm": 2.7234230041503906, + "learning_rate": 4.8356978233037e-06, + "loss": 0.6228, + "step": 1561 + }, + { + "epoch": 0.7385342789598108, + "grad_norm": 2.9565515518188477, + "learning_rate": 4.835475329799472e-06, + "loss": 0.5728, + "step": 1562 + }, + { + "epoch": 0.7390070921985815, + "grad_norm": 2.4356038570404053, + "learning_rate": 4.835252690875438e-06, + "loss": 0.6723, + "step": 1563 + }, + { + "epoch": 0.7394799054373522, + "grad_norm": 2.765913248062134, + "learning_rate": 4.835029906545458e-06, + "loss": 0.5805, + "step": 1564 + }, + { + "epoch": 0.7399527186761229, + "grad_norm": 2.4481914043426514, + "learning_rate": 4.834806976823405e-06, + "loss": 0.599, + "step": 1565 + }, + { + "epoch": 0.7404255319148936, + "grad_norm": 2.620779514312744, + "learning_rate": 4.834583901723158e-06, + "loss": 0.63, + "step": 1566 + }, + { + "epoch": 0.7408983451536643, + "grad_norm": 2.654426097869873, + "learning_rate": 4.83436068125861e-06, + "loss": 0.6544, + "step": 
1567 + }, + { + "epoch": 0.741371158392435, + "grad_norm": 2.589623212814331, + "learning_rate": 4.834137315443656e-06, + "loss": 0.5596, + "step": 1568 + }, + { + "epoch": 0.7418439716312056, + "grad_norm": 2.572883129119873, + "learning_rate": 4.833913804292209e-06, + "loss": 0.5974, + "step": 1569 + }, + { + "epoch": 0.7423167848699763, + "grad_norm": 2.8744914531707764, + "learning_rate": 4.833690147818181e-06, + "loss": 0.5364, + "step": 1570 + }, + { + "epoch": 0.742789598108747, + "grad_norm": 2.9800851345062256, + "learning_rate": 4.833466346035502e-06, + "loss": 0.6287, + "step": 1571 + }, + { + "epoch": 0.7432624113475177, + "grad_norm": 2.627784490585327, + "learning_rate": 4.833242398958105e-06, + "loss": 0.621, + "step": 1572 + }, + { + "epoch": 0.7437352245862884, + "grad_norm": 2.5187721252441406, + "learning_rate": 4.833018306599933e-06, + "loss": 0.5901, + "step": 1573 + }, + { + "epoch": 0.7442080378250591, + "grad_norm": 2.4843688011169434, + "learning_rate": 4.832794068974944e-06, + "loss": 0.6336, + "step": 1574 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 2.774911880493164, + "learning_rate": 4.832569686097096e-06, + "loss": 0.6091, + "step": 1575 + }, + { + "epoch": 0.7451536643026004, + "grad_norm": 3.2562527656555176, + "learning_rate": 4.8323451579803615e-06, + "loss": 0.7686, + "step": 1576 + }, + { + "epoch": 0.7456264775413711, + "grad_norm": 2.799570083618164, + "learning_rate": 4.832120484638721e-06, + "loss": 0.6233, + "step": 1577 + }, + { + "epoch": 0.7460992907801418, + "grad_norm": 2.661893367767334, + "learning_rate": 4.831895666086164e-06, + "loss": 0.5841, + "step": 1578 + }, + { + "epoch": 0.7465721040189125, + "grad_norm": 3.0382652282714844, + "learning_rate": 4.831670702336689e-06, + "loss": 0.5769, + "step": 1579 + }, + { + "epoch": 0.7470449172576832, + "grad_norm": 2.676398515701294, + "learning_rate": 4.831445593404304e-06, + "loss": 0.619, + "step": 1580 + }, + { + "epoch": 0.7475177304964539, + "grad_norm": 2.717916965484619, + "learning_rate": 4.831220339303024e-06, + "loss": 0.5787, + "step": 1581 + }, + { + "epoch": 0.7479905437352246, + "grad_norm": 2.3918066024780273, + "learning_rate": 4.830994940046876e-06, + "loss": 0.5108, + "step": 1582 + }, + { + "epoch": 0.7484633569739952, + "grad_norm": 2.709144115447998, + "learning_rate": 4.830769395649895e-06, + "loss": 0.6875, + "step": 1583 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 2.8711116313934326, + "learning_rate": 4.830543706126123e-06, + "loss": 0.6745, + "step": 1584 + }, + { + "epoch": 0.7494089834515366, + "grad_norm": 2.612339496612549, + "learning_rate": 4.830317871489614e-06, + "loss": 0.5738, + "step": 1585 + }, + { + "epoch": 0.7498817966903073, + "grad_norm": 2.4355857372283936, + "learning_rate": 4.830091891754429e-06, + "loss": 0.5907, + "step": 1586 + }, + { + "epoch": 0.750354609929078, + "grad_norm": 2.676051378250122, + "learning_rate": 4.829865766934638e-06, + "loss": 0.6628, + "step": 1587 + }, + { + "epoch": 0.7508274231678487, + "grad_norm": 2.66489839553833, + "learning_rate": 4.829639497044323e-06, + "loss": 0.5984, + "step": 1588 + }, + { + "epoch": 0.7513002364066194, + "grad_norm": 2.5358035564422607, + "learning_rate": 4.829413082097572e-06, + "loss": 0.5867, + "step": 1589 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 2.6530144214630127, + "learning_rate": 4.8291865221084815e-06, + "loss": 0.5917, + "step": 1590 + }, + { + "epoch": 0.7522458628841607, + "grad_norm": 2.5160958766937256, + "learning_rate": 4.82895981709116e-06, + 
"loss": 0.6347, + "step": 1591 + }, + { + "epoch": 0.7527186761229314, + "grad_norm": 2.61592698097229, + "learning_rate": 4.8287329670597225e-06, + "loss": 0.5472, + "step": 1592 + }, + { + "epoch": 0.7531914893617021, + "grad_norm": 2.7528622150421143, + "learning_rate": 4.828505972028296e-06, + "loss": 0.5842, + "step": 1593 + }, + { + "epoch": 0.7536643026004728, + "grad_norm": 2.8154072761535645, + "learning_rate": 4.828278832011011e-06, + "loss": 0.5757, + "step": 1594 + }, + { + "epoch": 0.7541371158392435, + "grad_norm": 3.118515729904175, + "learning_rate": 4.828051547022013e-06, + "loss": 0.6472, + "step": 1595 + }, + { + "epoch": 0.7546099290780142, + "grad_norm": 2.452033758163452, + "learning_rate": 4.827824117075453e-06, + "loss": 0.5571, + "step": 1596 + }, + { + "epoch": 0.7550827423167848, + "grad_norm": 2.984388828277588, + "learning_rate": 4.827596542185492e-06, + "loss": 0.6656, + "step": 1597 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 2.61356782913208, + "learning_rate": 4.8273688223663014e-06, + "loss": 0.6444, + "step": 1598 + }, + { + "epoch": 0.7560283687943262, + "grad_norm": 2.8967196941375732, + "learning_rate": 4.8271409576320595e-06, + "loss": 0.6457, + "step": 1599 + }, + { + "epoch": 0.7565011820330969, + "grad_norm": 2.852367639541626, + "learning_rate": 4.826912947996954e-06, + "loss": 0.5629, + "step": 1600 + }, + { + "epoch": 0.7569739952718676, + "grad_norm": 2.905280590057373, + "learning_rate": 4.826684793475182e-06, + "loss": 0.6245, + "step": 1601 + }, + { + "epoch": 0.7574468085106383, + "grad_norm": 2.6156530380249023, + "learning_rate": 4.826456494080951e-06, + "loss": 0.5869, + "step": 1602 + }, + { + "epoch": 0.757919621749409, + "grad_norm": 2.6490228176116943, + "learning_rate": 4.826228049828475e-06, + "loss": 0.5461, + "step": 1603 + }, + { + "epoch": 0.7583924349881797, + "grad_norm": 2.9626693725585938, + "learning_rate": 4.825999460731978e-06, + "loss": 0.6842, + "step": 1604 + }, + { + "epoch": 0.7588652482269503, + "grad_norm": 2.6866023540496826, + "learning_rate": 4.825770726805695e-06, + "loss": 0.5726, + "step": 1605 + }, + { + "epoch": 0.759338061465721, + "grad_norm": 2.5525858402252197, + "learning_rate": 4.825541848063866e-06, + "loss": 0.6061, + "step": 1606 + }, + { + "epoch": 0.7598108747044917, + "grad_norm": 2.703977584838867, + "learning_rate": 4.825312824520743e-06, + "loss": 0.6726, + "step": 1607 + }, + { + "epoch": 0.7602836879432624, + "grad_norm": 2.856534957885742, + "learning_rate": 4.825083656190588e-06, + "loss": 0.625, + "step": 1608 + }, + { + "epoch": 0.7607565011820331, + "grad_norm": 2.8564887046813965, + "learning_rate": 4.824854343087668e-06, + "loss": 0.7251, + "step": 1609 + }, + { + "epoch": 0.7612293144208038, + "grad_norm": 2.327650308609009, + "learning_rate": 4.824624885226262e-06, + "loss": 0.526, + "step": 1610 + }, + { + "epoch": 0.7617021276595745, + "grad_norm": 3.0025737285614014, + "learning_rate": 4.824395282620659e-06, + "loss": 0.6043, + "step": 1611 + }, + { + "epoch": 0.7621749408983451, + "grad_norm": 2.5441737174987793, + "learning_rate": 4.824165535285152e-06, + "loss": 0.6276, + "step": 1612 + }, + { + "epoch": 0.7626477541371158, + "grad_norm": 2.4177372455596924, + "learning_rate": 4.823935643234049e-06, + "loss": 0.6419, + "step": 1613 + }, + { + "epoch": 0.7631205673758865, + "grad_norm": 2.9210550785064697, + "learning_rate": 4.823705606481664e-06, + "loss": 0.5663, + "step": 1614 + }, + { + "epoch": 0.7635933806146572, + "grad_norm": 2.6353724002838135, + 
"learning_rate": 4.82347542504232e-06, + "loss": 0.5669, + "step": 1615 + }, + { + "epoch": 0.7640661938534279, + "grad_norm": 2.419081926345825, + "learning_rate": 4.823245098930349e-06, + "loss": 0.5777, + "step": 1616 + }, + { + "epoch": 0.7645390070921986, + "grad_norm": 2.5077571868896484, + "learning_rate": 4.823014628160093e-06, + "loss": 0.5924, + "step": 1617 + }, + { + "epoch": 0.7650118203309693, + "grad_norm": 2.816056251525879, + "learning_rate": 4.822784012745902e-06, + "loss": 0.7273, + "step": 1618 + }, + { + "epoch": 0.76548463356974, + "grad_norm": 2.7163147926330566, + "learning_rate": 4.8225532527021366e-06, + "loss": 0.5545, + "step": 1619 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 2.4784302711486816, + "learning_rate": 4.822322348043164e-06, + "loss": 0.556, + "step": 1620 + }, + { + "epoch": 0.7664302600472813, + "grad_norm": 2.712467670440674, + "learning_rate": 4.822091298783361e-06, + "loss": 0.6501, + "step": 1621 + }, + { + "epoch": 0.766903073286052, + "grad_norm": 2.7217724323272705, + "learning_rate": 4.821860104937115e-06, + "loss": 0.5989, + "step": 1622 + }, + { + "epoch": 0.7673758865248227, + "grad_norm": 2.5622854232788086, + "learning_rate": 4.821628766518821e-06, + "loss": 0.5263, + "step": 1623 + }, + { + "epoch": 0.7678486997635934, + "grad_norm": 3.230923891067505, + "learning_rate": 4.821397283542884e-06, + "loss": 0.6707, + "step": 1624 + }, + { + "epoch": 0.7683215130023641, + "grad_norm": 2.37929105758667, + "learning_rate": 4.821165656023718e-06, + "loss": 0.6124, + "step": 1625 + }, + { + "epoch": 0.7687943262411348, + "grad_norm": 2.9811325073242188, + "learning_rate": 4.820933883975745e-06, + "loss": 0.6435, + "step": 1626 + }, + { + "epoch": 0.7692671394799054, + "grad_norm": 2.887380838394165, + "learning_rate": 4.820701967413395e-06, + "loss": 0.621, + "step": 1627 + }, + { + "epoch": 0.7697399527186761, + "grad_norm": 2.6762876510620117, + "learning_rate": 4.820469906351109e-06, + "loss": 0.5713, + "step": 1628 + }, + { + "epoch": 0.7702127659574468, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.820237700803337e-06, + "loss": 0.6136, + "step": 1629 + }, + { + "epoch": 0.7706855791962175, + "grad_norm": 2.7244746685028076, + "learning_rate": 4.820005350784539e-06, + "loss": 0.5816, + "step": 1630 + }, + { + "epoch": 0.7711583924349882, + "grad_norm": 2.9293999671936035, + "learning_rate": 4.8197728563091795e-06, + "loss": 0.6649, + "step": 1631 + }, + { + "epoch": 0.7716312056737589, + "grad_norm": 2.4402127265930176, + "learning_rate": 4.819540217391736e-06, + "loss": 0.6481, + "step": 1632 + }, + { + "epoch": 0.7721040189125296, + "grad_norm": 3.083941698074341, + "learning_rate": 4.819307434046694e-06, + "loss": 0.6951, + "step": 1633 + }, + { + "epoch": 0.7725768321513002, + "grad_norm": 2.544952392578125, + "learning_rate": 4.819074506288548e-06, + "loss": 0.539, + "step": 1634 + }, + { + "epoch": 0.7730496453900709, + "grad_norm": 2.7791268825531006, + "learning_rate": 4.818841434131801e-06, + "loss": 0.5827, + "step": 1635 + }, + { + "epoch": 0.7735224586288416, + "grad_norm": 2.7349796295166016, + "learning_rate": 4.818608217590967e-06, + "loss": 0.5584, + "step": 1636 + }, + { + "epoch": 0.7739952718676123, + "grad_norm": 2.637652635574341, + "learning_rate": 4.818374856680565e-06, + "loss": 0.6386, + "step": 1637 + }, + { + "epoch": 0.774468085106383, + "grad_norm": 2.9821584224700928, + "learning_rate": 4.818141351415127e-06, + "loss": 0.6734, + "step": 1638 + }, + { + "epoch": 0.7749408983451537, + 
"grad_norm": 2.992938995361328, + "learning_rate": 4.817907701809192e-06, + "loss": 0.5899, + "step": 1639 + }, + { + "epoch": 0.7754137115839244, + "grad_norm": 4.35719633102417, + "learning_rate": 4.8176739078773076e-06, + "loss": 0.6281, + "step": 1640 + }, + { + "epoch": 0.775886524822695, + "grad_norm": 2.838146209716797, + "learning_rate": 4.8174399696340315e-06, + "loss": 0.5766, + "step": 1641 + }, + { + "epoch": 0.7763593380614657, + "grad_norm": 3.3116989135742188, + "learning_rate": 4.81720588709393e-06, + "loss": 0.6409, + "step": 1642 + }, + { + "epoch": 0.7768321513002364, + "grad_norm": 2.9843590259552, + "learning_rate": 4.816971660271579e-06, + "loss": 0.6108, + "step": 1643 + }, + { + "epoch": 0.7773049645390071, + "grad_norm": 2.843770742416382, + "learning_rate": 4.816737289181562e-06, + "loss": 0.6053, + "step": 1644 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 2.7608556747436523, + "learning_rate": 4.816502773838473e-06, + "loss": 0.5854, + "step": 1645 + }, + { + "epoch": 0.7782505910165485, + "grad_norm": 3.343682289123535, + "learning_rate": 4.816268114256914e-06, + "loss": 0.6329, + "step": 1646 + }, + { + "epoch": 0.7787234042553192, + "grad_norm": 2.769768476486206, + "learning_rate": 4.816033310451496e-06, + "loss": 0.6242, + "step": 1647 + }, + { + "epoch": 0.7791962174940898, + "grad_norm": 2.989851713180542, + "learning_rate": 4.815798362436838e-06, + "loss": 0.6493, + "step": 1648 + }, + { + "epoch": 0.7796690307328605, + "grad_norm": 3.170736312866211, + "learning_rate": 4.8155632702275716e-06, + "loss": 0.6341, + "step": 1649 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 2.7372522354125977, + "learning_rate": 4.815328033838334e-06, + "loss": 0.5445, + "step": 1650 + }, + { + "epoch": 0.7806146572104019, + "grad_norm": 2.6947238445281982, + "learning_rate": 4.8150926532837715e-06, + "loss": 0.6437, + "step": 1651 + }, + { + "epoch": 0.7810874704491726, + "grad_norm": 2.472323179244995, + "learning_rate": 4.81485712857854e-06, + "loss": 0.5751, + "step": 1652 + }, + { + "epoch": 0.7815602836879433, + "grad_norm": 2.791114091873169, + "learning_rate": 4.814621459737308e-06, + "loss": 0.5996, + "step": 1653 + }, + { + "epoch": 0.782033096926714, + "grad_norm": 3.1957521438598633, + "learning_rate": 4.814385646774745e-06, + "loss": 0.5803, + "step": 1654 + }, + { + "epoch": 0.7825059101654847, + "grad_norm": 2.4120798110961914, + "learning_rate": 4.8141496897055364e-06, + "loss": 0.5814, + "step": 1655 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 2.9262423515319824, + "learning_rate": 4.813913588544374e-06, + "loss": 0.6292, + "step": 1656 + }, + { + "epoch": 0.783451536643026, + "grad_norm": 2.8251047134399414, + "learning_rate": 4.813677343305959e-06, + "loss": 0.6787, + "step": 1657 + }, + { + "epoch": 0.7839243498817967, + "grad_norm": 2.931659698486328, + "learning_rate": 4.8134409540050005e-06, + "loss": 0.6163, + "step": 1658 + }, + { + "epoch": 0.7843971631205674, + "grad_norm": 2.7160706520080566, + "learning_rate": 4.813204420656219e-06, + "loss": 0.6831, + "step": 1659 + }, + { + "epoch": 0.7848699763593381, + "grad_norm": 3.2134454250335693, + "learning_rate": 4.81296774327434e-06, + "loss": 0.6002, + "step": 1660 + }, + { + "epoch": 0.7853427895981088, + "grad_norm": 2.4002513885498047, + "learning_rate": 4.812730921874103e-06, + "loss": 0.5488, + "step": 1661 + }, + { + "epoch": 0.7858156028368795, + "grad_norm": 2.5559282302856445, + "learning_rate": 4.812493956470251e-06, + "loss": 0.5802, + "step": 1662 + }, + { + 
"epoch": 0.7862884160756501, + "grad_norm": 2.57478404045105, + "learning_rate": 4.812256847077541e-06, + "loss": 0.646, + "step": 1663 + }, + { + "epoch": 0.7867612293144208, + "grad_norm": 2.811851978302002, + "learning_rate": 4.812019593710736e-06, + "loss": 0.6245, + "step": 1664 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 2.5228829383850098, + "learning_rate": 4.811782196384609e-06, + "loss": 0.5949, + "step": 1665 + }, + { + "epoch": 0.7877068557919622, + "grad_norm": 2.744096040725708, + "learning_rate": 4.8115446551139415e-06, + "loss": 0.6006, + "step": 1666 + }, + { + "epoch": 0.7881796690307329, + "grad_norm": 3.129242420196533, + "learning_rate": 4.811306969913524e-06, + "loss": 0.7251, + "step": 1667 + }, + { + "epoch": 0.7886524822695036, + "grad_norm": 2.7855660915374756, + "learning_rate": 4.811069140798156e-06, + "loss": 0.6534, + "step": 1668 + }, + { + "epoch": 0.7891252955082743, + "grad_norm": 2.836603879928589, + "learning_rate": 4.810831167782647e-06, + "loss": 0.6661, + "step": 1669 + }, + { + "epoch": 0.789598108747045, + "grad_norm": 2.5339887142181396, + "learning_rate": 4.810593050881813e-06, + "loss": 0.5354, + "step": 1670 + }, + { + "epoch": 0.7900709219858156, + "grad_norm": 2.9553709030151367, + "learning_rate": 4.810354790110482e-06, + "loss": 0.6001, + "step": 1671 + }, + { + "epoch": 0.7905437352245863, + "grad_norm": 2.6581788063049316, + "learning_rate": 4.8101163854834885e-06, + "loss": 0.6802, + "step": 1672 + }, + { + "epoch": 0.791016548463357, + "grad_norm": 3.2002551555633545, + "learning_rate": 4.809877837015677e-06, + "loss": 0.6641, + "step": 1673 + }, + { + "epoch": 0.7914893617021277, + "grad_norm": 2.918792963027954, + "learning_rate": 4.809639144721902e-06, + "loss": 0.6758, + "step": 1674 + }, + { + "epoch": 0.7919621749408984, + "grad_norm": 2.7993946075439453, + "learning_rate": 4.8094003086170245e-06, + "loss": 0.5889, + "step": 1675 + }, + { + "epoch": 0.7924349881796691, + "grad_norm": 2.3698952198028564, + "learning_rate": 4.809161328715916e-06, + "loss": 0.6244, + "step": 1676 + }, + { + "epoch": 0.7929078014184398, + "grad_norm": 2.8891594409942627, + "learning_rate": 4.808922205033458e-06, + "loss": 0.5835, + "step": 1677 + }, + { + "epoch": 0.7933806146572104, + "grad_norm": 2.838345766067505, + "learning_rate": 4.808682937584537e-06, + "loss": 0.6907, + "step": 1678 + }, + { + "epoch": 0.7938534278959811, + "grad_norm": 2.8443174362182617, + "learning_rate": 4.808443526384053e-06, + "loss": 0.6692, + "step": 1679 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 2.7355034351348877, + "learning_rate": 4.808203971446913e-06, + "loss": 0.5799, + "step": 1680 + }, + { + "epoch": 0.7947990543735225, + "grad_norm": 2.7108020782470703, + "learning_rate": 4.807964272788033e-06, + "loss": 0.652, + "step": 1681 + }, + { + "epoch": 0.7952718676122932, + "grad_norm": 2.397650957107544, + "learning_rate": 4.807724430422338e-06, + "loss": 0.5418, + "step": 1682 + }, + { + "epoch": 0.7957446808510639, + "grad_norm": 2.4981582164764404, + "learning_rate": 4.807484444364762e-06, + "loss": 0.5731, + "step": 1683 + }, + { + "epoch": 0.7962174940898346, + "grad_norm": 2.7943713665008545, + "learning_rate": 4.8072443146302475e-06, + "loss": 0.5913, + "step": 1684 + }, + { + "epoch": 0.7966903073286052, + "grad_norm": 2.5691423416137695, + "learning_rate": 4.807004041233746e-06, + "loss": 0.6475, + "step": 1685 + }, + { + "epoch": 0.7971631205673759, + "grad_norm": 3.2367498874664307, + "learning_rate": 4.8067636241902195e-06, + 
"loss": 0.675, + "step": 1686 + }, + { + "epoch": 0.7976359338061466, + "grad_norm": 3.000595808029175, + "learning_rate": 4.806523063514637e-06, + "loss": 0.5481, + "step": 1687 + }, + { + "epoch": 0.7981087470449173, + "grad_norm": 2.702014207839966, + "learning_rate": 4.806282359221976e-06, + "loss": 0.5993, + "step": 1688 + }, + { + "epoch": 0.798581560283688, + "grad_norm": 2.383671998977661, + "learning_rate": 4.806041511327226e-06, + "loss": 0.562, + "step": 1689 + }, + { + "epoch": 0.7990543735224587, + "grad_norm": 2.6965041160583496, + "learning_rate": 4.8058005198453834e-06, + "loss": 0.5955, + "step": 1690 + }, + { + "epoch": 0.7995271867612294, + "grad_norm": 2.5906765460968018, + "learning_rate": 4.805559384791453e-06, + "loss": 0.5151, + "step": 1691 + }, + { + "epoch": 0.8, + "grad_norm": 2.5454652309417725, + "learning_rate": 4.8053181061804475e-06, + "loss": 0.5843, + "step": 1692 + }, + { + "epoch": 0.8004728132387707, + "grad_norm": 2.661343812942505, + "learning_rate": 4.8050766840273935e-06, + "loss": 0.5995, + "step": 1693 + }, + { + "epoch": 0.8009456264775414, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.8048351183473215e-06, + "loss": 0.5676, + "step": 1694 + }, + { + "epoch": 0.8014184397163121, + "grad_norm": 2.5936667919158936, + "learning_rate": 4.804593409155274e-06, + "loss": 0.6291, + "step": 1695 + }, + { + "epoch": 0.8018912529550828, + "grad_norm": 2.6902432441711426, + "learning_rate": 4.804351556466299e-06, + "loss": 0.6114, + "step": 1696 + }, + { + "epoch": 0.8023640661938535, + "grad_norm": 2.7764673233032227, + "learning_rate": 4.804109560295457e-06, + "loss": 0.5768, + "step": 1697 + }, + { + "epoch": 0.8028368794326242, + "grad_norm": 2.9587221145629883, + "learning_rate": 4.803867420657816e-06, + "loss": 0.6048, + "step": 1698 + }, + { + "epoch": 0.8033096926713948, + "grad_norm": 2.9238998889923096, + "learning_rate": 4.803625137568453e-06, + "loss": 0.6329, + "step": 1699 + }, + { + "epoch": 0.8037825059101655, + "grad_norm": 2.70473313331604, + "learning_rate": 4.803382711042455e-06, + "loss": 0.5427, + "step": 1700 + }, + { + "epoch": 0.8042553191489362, + "grad_norm": 3.1604979038238525, + "learning_rate": 4.803140141094914e-06, + "loss": 0.626, + "step": 1701 + }, + { + "epoch": 0.8047281323877069, + "grad_norm": 2.9567699432373047, + "learning_rate": 4.802897427740936e-06, + "loss": 0.5319, + "step": 1702 + }, + { + "epoch": 0.8052009456264776, + "grad_norm": 2.90983247756958, + "learning_rate": 4.802654570995632e-06, + "loss": 0.586, + "step": 1703 + }, + { + "epoch": 0.8056737588652483, + "grad_norm": 2.783480167388916, + "learning_rate": 4.8024115708741255e-06, + "loss": 0.5773, + "step": 1704 + }, + { + "epoch": 0.806146572104019, + "grad_norm": 3.3307793140411377, + "learning_rate": 4.802168427391547e-06, + "loss": 0.6257, + "step": 1705 + }, + { + "epoch": 0.8066193853427897, + "grad_norm": 3.0475001335144043, + "learning_rate": 4.801925140563034e-06, + "loss": 0.6612, + "step": 1706 + }, + { + "epoch": 0.8070921985815603, + "grad_norm": 2.8278894424438477, + "learning_rate": 4.8016817104037375e-06, + "loss": 0.6449, + "step": 1707 + }, + { + "epoch": 0.807565011820331, + "grad_norm": 2.760244369506836, + "learning_rate": 4.801438136928812e-06, + "loss": 0.7007, + "step": 1708 + }, + { + "epoch": 0.8080378250591016, + "grad_norm": 2.827634572982788, + "learning_rate": 4.801194420153427e-06, + "loss": 0.6418, + "step": 1709 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.8655009269714355, + "learning_rate": 
4.800950560092754e-06, + "loss": 0.6231, + "step": 1710 + }, + { + "epoch": 0.808983451536643, + "grad_norm": 2.738112688064575, + "learning_rate": 4.800706556761981e-06, + "loss": 0.6463, + "step": 1711 + }, + { + "epoch": 0.8094562647754137, + "grad_norm": 2.4781179428100586, + "learning_rate": 4.800462410176296e-06, + "loss": 0.5365, + "step": 1712 + }, + { + "epoch": 0.8099290780141843, + "grad_norm": 2.6049838066101074, + "learning_rate": 4.800218120350906e-06, + "loss": 0.6035, + "step": 1713 + }, + { + "epoch": 0.810401891252955, + "grad_norm": 2.9089980125427246, + "learning_rate": 4.79997368730102e-06, + "loss": 0.5828, + "step": 1714 + }, + { + "epoch": 0.8108747044917257, + "grad_norm": 2.831871747970581, + "learning_rate": 4.799729111041857e-06, + "loss": 0.5953, + "step": 1715 + }, + { + "epoch": 0.8113475177304964, + "grad_norm": 2.5611300468444824, + "learning_rate": 4.799484391588647e-06, + "loss": 0.6302, + "step": 1716 + }, + { + "epoch": 0.8118203309692671, + "grad_norm": 2.744070053100586, + "learning_rate": 4.799239528956625e-06, + "loss": 0.5561, + "step": 1717 + }, + { + "epoch": 0.8122931442080378, + "grad_norm": 2.7344231605529785, + "learning_rate": 4.798994523161041e-06, + "loss": 0.6317, + "step": 1718 + }, + { + "epoch": 0.8127659574468085, + "grad_norm": 2.3420889377593994, + "learning_rate": 4.798749374217149e-06, + "loss": 0.5415, + "step": 1719 + }, + { + "epoch": 0.8132387706855791, + "grad_norm": 2.57384991645813, + "learning_rate": 4.798504082140212e-06, + "loss": 0.6383, + "step": 1720 + }, + { + "epoch": 0.8137115839243498, + "grad_norm": 2.8819844722747803, + "learning_rate": 4.798258646945505e-06, + "loss": 0.6355, + "step": 1721 + }, + { + "epoch": 0.8141843971631205, + "grad_norm": 2.908123254776001, + "learning_rate": 4.79801306864831e-06, + "loss": 0.701, + "step": 1722 + }, + { + "epoch": 0.8146572104018912, + "grad_norm": 2.6500701904296875, + "learning_rate": 4.797767347263917e-06, + "loss": 0.6152, + "step": 1723 + }, + { + "epoch": 0.8151300236406619, + "grad_norm": 2.5513017177581787, + "learning_rate": 4.797521482807628e-06, + "loss": 0.6241, + "step": 1724 + }, + { + "epoch": 0.8156028368794326, + "grad_norm": 2.6239185333251953, + "learning_rate": 4.7972754752947495e-06, + "loss": 0.6072, + "step": 1725 + }, + { + "epoch": 0.8160756501182033, + "grad_norm": 2.673436403274536, + "learning_rate": 4.797029324740601e-06, + "loss": 0.5802, + "step": 1726 + }, + { + "epoch": 0.816548463356974, + "grad_norm": 2.533831834793091, + "learning_rate": 4.796783031160508e-06, + "loss": 0.5566, + "step": 1727 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 2.9806582927703857, + "learning_rate": 4.796536594569807e-06, + "loss": 0.6945, + "step": 1728 + }, + { + "epoch": 0.8174940898345153, + "grad_norm": 2.7093560695648193, + "learning_rate": 4.796290014983842e-06, + "loss": 0.7143, + "step": 1729 + }, + { + "epoch": 0.817966903073286, + "grad_norm": 2.814507246017456, + "learning_rate": 4.796043292417967e-06, + "loss": 0.6122, + "step": 1730 + }, + { + "epoch": 0.8184397163120567, + "grad_norm": 2.537156820297241, + "learning_rate": 4.795796426887543e-06, + "loss": 0.6229, + "step": 1731 + }, + { + "epoch": 0.8189125295508274, + "grad_norm": 2.4878013134002686, + "learning_rate": 4.795549418407944e-06, + "loss": 0.5442, + "step": 1732 + }, + { + "epoch": 0.8193853427895981, + "grad_norm": 2.839383363723755, + "learning_rate": 4.795302266994548e-06, + "loss": 0.6717, + "step": 1733 + }, + { + "epoch": 0.8198581560283688, + "grad_norm": 
3.1981801986694336, + "learning_rate": 4.795054972662744e-06, + "loss": 0.6596, + "step": 1734 + }, + { + "epoch": 0.8203309692671394, + "grad_norm": 2.781730890274048, + "learning_rate": 4.79480753542793e-06, + "loss": 0.5845, + "step": 1735 + }, + { + "epoch": 0.8208037825059101, + "grad_norm": 2.689948558807373, + "learning_rate": 4.794559955305513e-06, + "loss": 0.5928, + "step": 1736 + }, + { + "epoch": 0.8212765957446808, + "grad_norm": 2.7267637252807617, + "learning_rate": 4.7943122323109105e-06, + "loss": 0.5224, + "step": 1737 + }, + { + "epoch": 0.8217494089834515, + "grad_norm": 2.4346601963043213, + "learning_rate": 4.794064366459544e-06, + "loss": 0.6431, + "step": 1738 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 2.7440176010131836, + "learning_rate": 4.793816357766849e-06, + "loss": 0.6083, + "step": 1739 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 2.6558027267456055, + "learning_rate": 4.793568206248268e-06, + "loss": 0.698, + "step": 1740 + }, + { + "epoch": 0.8231678486997636, + "grad_norm": 2.591658353805542, + "learning_rate": 4.793319911919251e-06, + "loss": 0.6601, + "step": 1741 + }, + { + "epoch": 0.8236406619385342, + "grad_norm": 2.5431172847747803, + "learning_rate": 4.79307147479526e-06, + "loss": 0.5917, + "step": 1742 + }, + { + "epoch": 0.8241134751773049, + "grad_norm": 2.7335588932037354, + "learning_rate": 4.792822894891762e-06, + "loss": 0.5925, + "step": 1743 + }, + { + "epoch": 0.8245862884160756, + "grad_norm": 2.2500839233398438, + "learning_rate": 4.792574172224237e-06, + "loss": 0.4984, + "step": 1744 + }, + { + "epoch": 0.8250591016548463, + "grad_norm": 2.691343069076538, + "learning_rate": 4.79232530680817e-06, + "loss": 0.6262, + "step": 1745 + }, + { + "epoch": 0.825531914893617, + "grad_norm": 2.612204074859619, + "learning_rate": 4.792076298659058e-06, + "loss": 0.5822, + "step": 1746 + }, + { + "epoch": 0.8260047281323877, + "grad_norm": 3.0163519382476807, + "learning_rate": 4.791827147792406e-06, + "loss": 0.6263, + "step": 1747 + }, + { + "epoch": 0.8264775413711584, + "grad_norm": 2.742183208465576, + "learning_rate": 4.791577854223727e-06, + "loss": 0.6628, + "step": 1748 + }, + { + "epoch": 0.826950354609929, + "grad_norm": 2.872213840484619, + "learning_rate": 4.791328417968542e-06, + "loss": 0.6332, + "step": 1749 + }, + { + "epoch": 0.8274231678486997, + "grad_norm": 2.725006580352783, + "learning_rate": 4.7910788390423844e-06, + "loss": 0.6266, + "step": 1750 + }, + { + "epoch": 0.8278959810874704, + "grad_norm": 3.0366697311401367, + "learning_rate": 4.790829117460793e-06, + "loss": 0.6403, + "step": 1751 + }, + { + "epoch": 0.8283687943262411, + "grad_norm": 2.594881772994995, + "learning_rate": 4.790579253239318e-06, + "loss": 0.521, + "step": 1752 + }, + { + "epoch": 0.8288416075650118, + "grad_norm": 2.4496347904205322, + "learning_rate": 4.790329246393517e-06, + "loss": 0.54, + "step": 1753 + }, + { + "epoch": 0.8293144208037825, + "grad_norm": 3.102278470993042, + "learning_rate": 4.790079096938956e-06, + "loss": 0.6142, + "step": 1754 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 2.4645912647247314, + "learning_rate": 4.789828804891212e-06, + "loss": 0.5212, + "step": 1755 + }, + { + "epoch": 0.8302600472813239, + "grad_norm": 2.7482516765594482, + "learning_rate": 4.789578370265868e-06, + "loss": 0.6712, + "step": 1756 + }, + { + "epoch": 0.8307328605200945, + "grad_norm": 2.61360502243042, + "learning_rate": 4.7893277930785195e-06, + "loss": 0.6367, + "step": 1757 + }, + { + "epoch": 
0.8312056737588652, + "grad_norm": 2.79028058052063, + "learning_rate": 4.789077073344767e-06, + "loss": 0.5099, + "step": 1758 + }, + { + "epoch": 0.8316784869976359, + "grad_norm": 2.647662401199341, + "learning_rate": 4.788826211080222e-06, + "loss": 0.6698, + "step": 1759 + }, + { + "epoch": 0.8321513002364066, + "grad_norm": 3.0214831829071045, + "learning_rate": 4.7885752063005055e-06, + "loss": 0.6121, + "step": 1760 + }, + { + "epoch": 0.8326241134751773, + "grad_norm": 2.8244032859802246, + "learning_rate": 4.788324059021247e-06, + "loss": 0.6921, + "step": 1761 + }, + { + "epoch": 0.833096926713948, + "grad_norm": 3.1501076221466064, + "learning_rate": 4.788072769258082e-06, + "loss": 0.6872, + "step": 1762 + }, + { + "epoch": 0.8335697399527187, + "grad_norm": 2.6989903450012207, + "learning_rate": 4.7878213370266594e-06, + "loss": 0.5884, + "step": 1763 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 2.6982665061950684, + "learning_rate": 4.787569762342633e-06, + "loss": 0.6112, + "step": 1764 + }, + { + "epoch": 0.83451536643026, + "grad_norm": 2.6918323040008545, + "learning_rate": 4.7873180452216685e-06, + "loss": 0.5315, + "step": 1765 + }, + { + "epoch": 0.8349881796690307, + "grad_norm": 2.5494401454925537, + "learning_rate": 4.78706618567944e-06, + "loss": 0.5909, + "step": 1766 + }, + { + "epoch": 0.8354609929078014, + "grad_norm": 2.7532095909118652, + "learning_rate": 4.786814183731627e-06, + "loss": 0.5566, + "step": 1767 + }, + { + "epoch": 0.8359338061465721, + "grad_norm": 2.550865888595581, + "learning_rate": 4.786562039393923e-06, + "loss": 0.555, + "step": 1768 + }, + { + "epoch": 0.8364066193853428, + "grad_norm": 2.4477791786193848, + "learning_rate": 4.786309752682028e-06, + "loss": 0.5844, + "step": 1769 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 2.6982262134552, + "learning_rate": 4.7860573236116485e-06, + "loss": 0.6136, + "step": 1770 + }, + { + "epoch": 0.8373522458628841, + "grad_norm": 2.456263542175293, + "learning_rate": 4.785804752198503e-06, + "loss": 0.5055, + "step": 1771 + }, + { + "epoch": 0.8378250591016548, + "grad_norm": 2.428544521331787, + "learning_rate": 4.78555203845832e-06, + "loss": 0.5859, + "step": 1772 + }, + { + "epoch": 0.8382978723404255, + "grad_norm": 2.1782307624816895, + "learning_rate": 4.785299182406833e-06, + "loss": 0.5325, + "step": 1773 + }, + { + "epoch": 0.8387706855791962, + "grad_norm": 3.137956142425537, + "learning_rate": 4.785046184059786e-06, + "loss": 0.6097, + "step": 1774 + }, + { + "epoch": 0.8392434988179669, + "grad_norm": 2.6269001960754395, + "learning_rate": 4.7847930434329336e-06, + "loss": 0.5972, + "step": 1775 + }, + { + "epoch": 0.8397163120567376, + "grad_norm": 2.732659339904785, + "learning_rate": 4.784539760542037e-06, + "loss": 0.6054, + "step": 1776 + }, + { + "epoch": 0.8401891252955083, + "grad_norm": 2.5346736907958984, + "learning_rate": 4.784286335402866e-06, + "loss": 0.5521, + "step": 1777 + }, + { + "epoch": 0.840661938534279, + "grad_norm": 3.1420228481292725, + "learning_rate": 4.784032768031202e-06, + "loss": 0.6165, + "step": 1778 + }, + { + "epoch": 0.8411347517730496, + "grad_norm": 3.073793411254883, + "learning_rate": 4.783779058442831e-06, + "loss": 0.6414, + "step": 1779 + }, + { + "epoch": 0.8416075650118203, + "grad_norm": 2.6621336936950684, + "learning_rate": 4.783525206653554e-06, + "loss": 0.5836, + "step": 1780 + }, + { + "epoch": 0.842080378250591, + "grad_norm": 2.7029049396514893, + "learning_rate": 4.7832712126791745e-06, + "loss": 0.5897, + 
"step": 1781 + }, + { + "epoch": 0.8425531914893617, + "grad_norm": 2.4733822345733643, + "learning_rate": 4.783017076535509e-06, + "loss": 0.5913, + "step": 1782 + }, + { + "epoch": 0.8430260047281324, + "grad_norm": 2.8119473457336426, + "learning_rate": 4.782762798238381e-06, + "loss": 0.6105, + "step": 1783 + }, + { + "epoch": 0.8434988179669031, + "grad_norm": 2.5290818214416504, + "learning_rate": 4.782508377803622e-06, + "loss": 0.6119, + "step": 1784 + }, + { + "epoch": 0.8439716312056738, + "grad_norm": 3.193472385406494, + "learning_rate": 4.782253815247076e-06, + "loss": 0.6665, + "step": 1785 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 3.206759452819824, + "learning_rate": 4.781999110584592e-06, + "loss": 0.6012, + "step": 1786 + }, + { + "epoch": 0.8449172576832151, + "grad_norm": 2.6227457523345947, + "learning_rate": 4.781744263832029e-06, + "loss": 0.5845, + "step": 1787 + }, + { + "epoch": 0.8453900709219858, + "grad_norm": 2.838365316390991, + "learning_rate": 4.781489275005257e-06, + "loss": 0.5695, + "step": 1788 + }, + { + "epoch": 0.8458628841607565, + "grad_norm": 2.8348326683044434, + "learning_rate": 4.78123414412015e-06, + "loss": 0.6136, + "step": 1789 + }, + { + "epoch": 0.8463356973995272, + "grad_norm": 2.5698344707489014, + "learning_rate": 4.780978871192597e-06, + "loss": 0.6576, + "step": 1790 + }, + { + "epoch": 0.8468085106382979, + "grad_norm": 2.5198330879211426, + "learning_rate": 4.780723456238492e-06, + "loss": 0.5521, + "step": 1791 + }, + { + "epoch": 0.8472813238770686, + "grad_norm": 3.001325845718384, + "learning_rate": 4.780467899273737e-06, + "loss": 0.6075, + "step": 1792 + }, + { + "epoch": 0.8477541371158392, + "grad_norm": 2.7732746601104736, + "learning_rate": 4.780212200314247e-06, + "loss": 0.6245, + "step": 1793 + }, + { + "epoch": 0.8482269503546099, + "grad_norm": 2.6950337886810303, + "learning_rate": 4.77995635937594e-06, + "loss": 0.5723, + "step": 1794 + }, + { + "epoch": 0.8486997635933806, + "grad_norm": 2.82051420211792, + "learning_rate": 4.779700376474749e-06, + "loss": 0.6184, + "step": 1795 + }, + { + "epoch": 0.8491725768321513, + "grad_norm": 2.757791757583618, + "learning_rate": 4.779444251626611e-06, + "loss": 0.608, + "step": 1796 + }, + { + "epoch": 0.849645390070922, + "grad_norm": 2.394108533859253, + "learning_rate": 4.779187984847475e-06, + "loss": 0.6174, + "step": 1797 + }, + { + "epoch": 0.8501182033096927, + "grad_norm": 2.427562713623047, + "learning_rate": 4.778931576153296e-06, + "loss": 0.5618, + "step": 1798 + }, + { + "epoch": 0.8505910165484634, + "grad_norm": 2.891268491744995, + "learning_rate": 4.778675025560042e-06, + "loss": 0.6865, + "step": 1799 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.665534257888794, + "learning_rate": 4.778418333083685e-06, + "loss": 0.5852, + "step": 1800 + }, + { + "epoch": 0.8515366430260047, + "grad_norm": 2.5492889881134033, + "learning_rate": 4.7781614987402095e-06, + "loss": 0.5161, + "step": 1801 + }, + { + "epoch": 0.8520094562647754, + "grad_norm": 2.400177001953125, + "learning_rate": 4.777904522545607e-06, + "loss": 0.5128, + "step": 1802 + }, + { + "epoch": 0.8524822695035461, + "grad_norm": 2.3949809074401855, + "learning_rate": 4.777647404515878e-06, + "loss": 0.571, + "step": 1803 + }, + { + "epoch": 0.8529550827423168, + "grad_norm": 2.3624472618103027, + "learning_rate": 4.7773901446670325e-06, + "loss": 0.5486, + "step": 1804 + }, + { + "epoch": 0.8534278959810875, + "grad_norm": 2.711366891860962, + "learning_rate": 
4.7771327430150885e-06, + "loss": 0.5667, + "step": 1805 + }, + { + "epoch": 0.8539007092198582, + "grad_norm": 2.7681493759155273, + "learning_rate": 4.776875199576073e-06, + "loss": 0.5686, + "step": 1806 + }, + { + "epoch": 0.8543735224586289, + "grad_norm": 3.0369436740875244, + "learning_rate": 4.776617514366023e-06, + "loss": 0.6635, + "step": 1807 + }, + { + "epoch": 0.8548463356973995, + "grad_norm": 2.919649600982666, + "learning_rate": 4.776359687400983e-06, + "loss": 0.5749, + "step": 1808 + }, + { + "epoch": 0.8553191489361702, + "grad_norm": 2.7986185550689697, + "learning_rate": 4.776101718697007e-06, + "loss": 0.559, + "step": 1809 + }, + { + "epoch": 0.8557919621749409, + "grad_norm": 2.5951223373413086, + "learning_rate": 4.775843608270158e-06, + "loss": 0.5654, + "step": 1810 + }, + { + "epoch": 0.8562647754137116, + "grad_norm": 2.674138069152832, + "learning_rate": 4.775585356136505e-06, + "loss": 0.5286, + "step": 1811 + }, + { + "epoch": 0.8567375886524823, + "grad_norm": 3.045437812805176, + "learning_rate": 4.775326962312131e-06, + "loss": 0.6185, + "step": 1812 + }, + { + "epoch": 0.857210401891253, + "grad_norm": 2.6145293712615967, + "learning_rate": 4.775068426813124e-06, + "loss": 0.6075, + "step": 1813 + }, + { + "epoch": 0.8576832151300237, + "grad_norm": 2.6320106983184814, + "learning_rate": 4.7748097496555824e-06, + "loss": 0.561, + "step": 1814 + }, + { + "epoch": 0.8581560283687943, + "grad_norm": 2.5038623809814453, + "learning_rate": 4.774550930855612e-06, + "loss": 0.593, + "step": 1815 + }, + { + "epoch": 0.858628841607565, + "grad_norm": 2.8168089389801025, + "learning_rate": 4.774291970429329e-06, + "loss": 0.5196, + "step": 1816 + }, + { + "epoch": 0.8591016548463357, + "grad_norm": 2.778130292892456, + "learning_rate": 4.774032868392858e-06, + "loss": 0.5984, + "step": 1817 + }, + { + "epoch": 0.8595744680851064, + "grad_norm": 2.536458730697632, + "learning_rate": 4.7737736247623305e-06, + "loss": 0.568, + "step": 1818 + }, + { + "epoch": 0.8600472813238771, + "grad_norm": 2.6669719219207764, + "learning_rate": 4.77351423955389e-06, + "loss": 0.6233, + "step": 1819 + }, + { + "epoch": 0.8605200945626478, + "grad_norm": 2.578242540359497, + "learning_rate": 4.773254712783687e-06, + "loss": 0.579, + "step": 1820 + }, + { + "epoch": 0.8609929078014185, + "grad_norm": 2.816664457321167, + "learning_rate": 4.772995044467881e-06, + "loss": 0.6635, + "step": 1821 + }, + { + "epoch": 0.8614657210401891, + "grad_norm": 3.1111979484558105, + "learning_rate": 4.77273523462264e-06, + "loss": 0.6372, + "step": 1822 + }, + { + "epoch": 0.8619385342789598, + "grad_norm": 2.764552354812622, + "learning_rate": 4.772475283264142e-06, + "loss": 0.6216, + "step": 1823 + }, + { + "epoch": 0.8624113475177305, + "grad_norm": 2.9126830101013184, + "learning_rate": 4.772215190408572e-06, + "loss": 0.6396, + "step": 1824 + }, + { + "epoch": 0.8628841607565012, + "grad_norm": 2.7502307891845703, + "learning_rate": 4.7719549560721264e-06, + "loss": 0.6186, + "step": 1825 + }, + { + "epoch": 0.8633569739952719, + "grad_norm": 2.6279006004333496, + "learning_rate": 4.771694580271007e-06, + "loss": 0.5557, + "step": 1826 + }, + { + "epoch": 0.8638297872340426, + "grad_norm": 2.996563196182251, + "learning_rate": 4.7714340630214276e-06, + "loss": 0.6259, + "step": 1827 + }, + { + "epoch": 0.8643026004728133, + "grad_norm": 3.231323480606079, + "learning_rate": 4.771173404339609e-06, + "loss": 0.5473, + "step": 1828 + }, + { + "epoch": 0.864775413711584, + "grad_norm": 
3.143519878387451, + "learning_rate": 4.770912604241781e-06, + "loss": 0.593, + "step": 1829 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 2.515484094619751, + "learning_rate": 4.770651662744184e-06, + "loss": 0.538, + "step": 1830 + }, + { + "epoch": 0.8657210401891253, + "grad_norm": 2.629058837890625, + "learning_rate": 4.770390579863064e-06, + "loss": 0.5745, + "step": 1831 + }, + { + "epoch": 0.866193853427896, + "grad_norm": 2.5826802253723145, + "learning_rate": 4.770129355614677e-06, + "loss": 0.6397, + "step": 1832 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 2.954623222351074, + "learning_rate": 4.769867990015289e-06, + "loss": 0.6106, + "step": 1833 + }, + { + "epoch": 0.8671394799054374, + "grad_norm": 2.742192268371582, + "learning_rate": 4.769606483081175e-06, + "loss": 0.6902, + "step": 1834 + }, + { + "epoch": 0.8676122931442081, + "grad_norm": 2.2619097232818604, + "learning_rate": 4.769344834828618e-06, + "loss": 0.5414, + "step": 1835 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 2.7384188175201416, + "learning_rate": 4.769083045273908e-06, + "loss": 0.5787, + "step": 1836 + }, + { + "epoch": 0.8685579196217494, + "grad_norm": 2.6734485626220703, + "learning_rate": 4.768821114433346e-06, + "loss": 0.5923, + "step": 1837 + }, + { + "epoch": 0.8690307328605201, + "grad_norm": 2.286140203475952, + "learning_rate": 4.768559042323243e-06, + "loss": 0.5822, + "step": 1838 + }, + { + "epoch": 0.8695035460992908, + "grad_norm": 3.0243725776672363, + "learning_rate": 4.768296828959915e-06, + "loss": 0.6623, + "step": 1839 + }, + { + "epoch": 0.8699763593380615, + "grad_norm": 2.4026312828063965, + "learning_rate": 4.768034474359689e-06, + "loss": 0.5554, + "step": 1840 + }, + { + "epoch": 0.8704491725768322, + "grad_norm": 2.7469029426574707, + "learning_rate": 4.767771978538903e-06, + "loss": 0.6316, + "step": 1841 + }, + { + "epoch": 0.8709219858156029, + "grad_norm": 2.729659080505371, + "learning_rate": 4.767509341513899e-06, + "loss": 0.5807, + "step": 1842 + }, + { + "epoch": 0.8713947990543736, + "grad_norm": 2.5336945056915283, + "learning_rate": 4.76724656330103e-06, + "loss": 0.6109, + "step": 1843 + }, + { + "epoch": 0.8718676122931442, + "grad_norm": 2.519880533218384, + "learning_rate": 4.76698364391666e-06, + "loss": 0.5313, + "step": 1844 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 2.698862075805664, + "learning_rate": 4.766720583377159e-06, + "loss": 0.5953, + "step": 1845 + }, + { + "epoch": 0.8728132387706856, + "grad_norm": 3.0195560455322266, + "learning_rate": 4.766457381698907e-06, + "loss": 0.5965, + "step": 1846 + }, + { + "epoch": 0.8732860520094563, + "grad_norm": 2.5972697734832764, + "learning_rate": 4.766194038898291e-06, + "loss": 0.6014, + "step": 1847 + }, + { + "epoch": 0.873758865248227, + "grad_norm": 2.7132294178009033, + "learning_rate": 4.76593055499171e-06, + "loss": 0.5638, + "step": 1848 + }, + { + "epoch": 0.8742316784869977, + "grad_norm": 2.7134575843811035, + "learning_rate": 4.765666929995568e-06, + "loss": 0.52, + "step": 1849 + }, + { + "epoch": 0.8747044917257684, + "grad_norm": 2.3804993629455566, + "learning_rate": 4.765403163926282e-06, + "loss": 0.5435, + "step": 1850 + }, + { + "epoch": 0.875177304964539, + "grad_norm": 2.8782761096954346, + "learning_rate": 4.765139256800274e-06, + "loss": 0.5843, + "step": 1851 + }, + { + "epoch": 0.8756501182033097, + "grad_norm": 2.836209774017334, + "learning_rate": 4.764875208633977e-06, + "loss": 0.6667, + "step": 1852 + }, + { + "epoch": 
0.8761229314420804, + "grad_norm": 2.608851194381714, + "learning_rate": 4.764611019443831e-06, + "loss": 0.5436, + "step": 1853 + }, + { + "epoch": 0.8765957446808511, + "grad_norm": 2.788738965988159, + "learning_rate": 4.764346689246288e-06, + "loss": 0.7331, + "step": 1854 + }, + { + "epoch": 0.8770685579196218, + "grad_norm": 2.524277687072754, + "learning_rate": 4.764082218057805e-06, + "loss": 0.5067, + "step": 1855 + }, + { + "epoch": 0.8775413711583925, + "grad_norm": 3.7559316158294678, + "learning_rate": 4.763817605894851e-06, + "loss": 0.6809, + "step": 1856 + }, + { + "epoch": 0.8780141843971632, + "grad_norm": 2.9070613384246826, + "learning_rate": 4.763552852773899e-06, + "loss": 0.5913, + "step": 1857 + }, + { + "epoch": 0.8784869976359339, + "grad_norm": 2.7050609588623047, + "learning_rate": 4.7632879587114386e-06, + "loss": 0.6074, + "step": 1858 + }, + { + "epoch": 0.8789598108747045, + "grad_norm": 2.891134262084961, + "learning_rate": 4.76302292372396e-06, + "loss": 0.5939, + "step": 1859 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 2.8581702709198, + "learning_rate": 4.762757747827968e-06, + "loss": 0.5972, + "step": 1860 + }, + { + "epoch": 0.8799054373522459, + "grad_norm": 2.8266196250915527, + "learning_rate": 4.762492431039971e-06, + "loss": 0.5993, + "step": 1861 + }, + { + "epoch": 0.8803782505910166, + "grad_norm": 2.4853954315185547, + "learning_rate": 4.762226973376493e-06, + "loss": 0.6388, + "step": 1862 + }, + { + "epoch": 0.8808510638297873, + "grad_norm": 3.2212886810302734, + "learning_rate": 4.761961374854059e-06, + "loss": 0.6698, + "step": 1863 + }, + { + "epoch": 0.881323877068558, + "grad_norm": 3.1254501342773438, + "learning_rate": 4.761695635489211e-06, + "loss": 0.5263, + "step": 1864 + }, + { + "epoch": 0.8817966903073287, + "grad_norm": 2.6891462802886963, + "learning_rate": 4.761429755298491e-06, + "loss": 0.5359, + "step": 1865 + }, + { + "epoch": 0.8822695035460993, + "grad_norm": 2.8557538986206055, + "learning_rate": 4.761163734298457e-06, + "loss": 0.5933, + "step": 1866 + }, + { + "epoch": 0.88274231678487, + "grad_norm": 2.53548264503479, + "learning_rate": 4.7608975725056724e-06, + "loss": 0.6397, + "step": 1867 + }, + { + "epoch": 0.8832151300236407, + "grad_norm": 3.0237956047058105, + "learning_rate": 4.76063126993671e-06, + "loss": 0.6845, + "step": 1868 + }, + { + "epoch": 0.8836879432624114, + "grad_norm": 3.222886800765991, + "learning_rate": 4.76036482660815e-06, + "loss": 0.6055, + "step": 1869 + }, + { + "epoch": 0.8841607565011821, + "grad_norm": 3.1867551803588867, + "learning_rate": 4.760098242536584e-06, + "loss": 0.6592, + "step": 1870 + }, + { + "epoch": 0.8846335697399527, + "grad_norm": 2.782209873199463, + "learning_rate": 4.7598315177386115e-06, + "loss": 0.5833, + "step": 1871 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 2.899871587753296, + "learning_rate": 4.759564652230838e-06, + "loss": 0.6129, + "step": 1872 + }, + { + "epoch": 0.885579196217494, + "grad_norm": 2.5690579414367676, + "learning_rate": 4.759297646029882e-06, + "loss": 0.5827, + "step": 1873 + }, + { + "epoch": 0.8860520094562647, + "grad_norm": 2.666130304336548, + "learning_rate": 4.759030499152368e-06, + "loss": 0.5272, + "step": 1874 + }, + { + "epoch": 0.8865248226950354, + "grad_norm": 2.7030911445617676, + "learning_rate": 4.758763211614932e-06, + "loss": 0.6415, + "step": 1875 + }, + { + "epoch": 0.8869976359338061, + "grad_norm": 2.717512845993042, + "learning_rate": 4.7584957834342135e-06, + "loss": 0.5827, + 
"step": 1876 + }, + { + "epoch": 0.8874704491725768, + "grad_norm": 2.665823459625244, + "learning_rate": 4.758228214626867e-06, + "loss": 0.6209, + "step": 1877 + }, + { + "epoch": 0.8879432624113475, + "grad_norm": 2.636653184890747, + "learning_rate": 4.75796050520955e-06, + "loss": 0.6413, + "step": 1878 + }, + { + "epoch": 0.8884160756501182, + "grad_norm": 2.585115671157837, + "learning_rate": 4.7576926551989345e-06, + "loss": 0.5518, + "step": 1879 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.808526039123535, + "learning_rate": 4.757424664611697e-06, + "loss": 0.5717, + "step": 1880 + }, + { + "epoch": 0.8893617021276595, + "grad_norm": 3.5957939624786377, + "learning_rate": 4.757156533464524e-06, + "loss": 0.6323, + "step": 1881 + }, + { + "epoch": 0.8898345153664302, + "grad_norm": 2.5003883838653564, + "learning_rate": 4.756888261774111e-06, + "loss": 0.5937, + "step": 1882 + }, + { + "epoch": 0.8903073286052009, + "grad_norm": 2.749061346054077, + "learning_rate": 4.756619849557161e-06, + "loss": 0.6642, + "step": 1883 + }, + { + "epoch": 0.8907801418439716, + "grad_norm": 2.6757891178131104, + "learning_rate": 4.756351296830389e-06, + "loss": 0.5887, + "step": 1884 + }, + { + "epoch": 0.8912529550827423, + "grad_norm": 2.811925172805786, + "learning_rate": 4.756082603610516e-06, + "loss": 0.6571, + "step": 1885 + }, + { + "epoch": 0.891725768321513, + "grad_norm": 2.5054616928100586, + "learning_rate": 4.755813769914271e-06, + "loss": 0.6312, + "step": 1886 + }, + { + "epoch": 0.8921985815602836, + "grad_norm": 2.7518467903137207, + "learning_rate": 4.755544795758395e-06, + "loss": 0.6685, + "step": 1887 + }, + { + "epoch": 0.8926713947990543, + "grad_norm": 2.7527287006378174, + "learning_rate": 4.755275681159634e-06, + "loss": 0.5886, + "step": 1888 + }, + { + "epoch": 0.893144208037825, + "grad_norm": 2.6162452697753906, + "learning_rate": 4.755006426134745e-06, + "loss": 0.546, + "step": 1889 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 2.4016737937927246, + "learning_rate": 4.754737030700495e-06, + "loss": 0.5726, + "step": 1890 + }, + { + "epoch": 0.8940898345153664, + "grad_norm": 2.528327703475952, + "learning_rate": 4.754467494873656e-06, + "loss": 0.5682, + "step": 1891 + }, + { + "epoch": 0.8945626477541371, + "grad_norm": 2.3139286041259766, + "learning_rate": 4.7541978186710115e-06, + "loss": 0.6108, + "step": 1892 + }, + { + "epoch": 0.8950354609929078, + "grad_norm": 2.7269136905670166, + "learning_rate": 4.753928002109354e-06, + "loss": 0.5875, + "step": 1893 + }, + { + "epoch": 0.8955082742316784, + "grad_norm": 4.425495147705078, + "learning_rate": 4.753658045205482e-06, + "loss": 0.5572, + "step": 1894 + }, + { + "epoch": 0.8959810874704491, + "grad_norm": 2.535409927368164, + "learning_rate": 4.753387947976206e-06, + "loss": 0.5868, + "step": 1895 + }, + { + "epoch": 0.8964539007092198, + "grad_norm": 2.722458600997925, + "learning_rate": 4.753117710438343e-06, + "loss": 0.5935, + "step": 1896 + }, + { + "epoch": 0.8969267139479905, + "grad_norm": 2.743861436843872, + "learning_rate": 4.75284733260872e-06, + "loss": 0.572, + "step": 1897 + }, + { + "epoch": 0.8973995271867612, + "grad_norm": 2.60640549659729, + "learning_rate": 4.752576814504173e-06, + "loss": 0.567, + "step": 1898 + }, + { + "epoch": 0.8978723404255319, + "grad_norm": 2.7486042976379395, + "learning_rate": 4.7523061561415435e-06, + "loss": 0.5768, + "step": 1899 + }, + { + "epoch": 0.8983451536643026, + "grad_norm": 3.8410251140594482, + "learning_rate": 
4.752035357537686e-06, + "loss": 0.6034, + "step": 1900 + }, + { + "epoch": 0.8988179669030733, + "grad_norm": 3.0935890674591064, + "learning_rate": 4.751764418709462e-06, + "loss": 0.5644, + "step": 1901 + }, + { + "epoch": 0.8992907801418439, + "grad_norm": 2.7989892959594727, + "learning_rate": 4.751493339673742e-06, + "loss": 0.656, + "step": 1902 + }, + { + "epoch": 0.8997635933806146, + "grad_norm": 3.6940557956695557, + "learning_rate": 4.751222120447403e-06, + "loss": 0.6632, + "step": 1903 + }, + { + "epoch": 0.9002364066193853, + "grad_norm": 2.3428797721862793, + "learning_rate": 4.750950761047335e-06, + "loss": 0.4485, + "step": 1904 + }, + { + "epoch": 0.900709219858156, + "grad_norm": 2.622544050216675, + "learning_rate": 4.750679261490432e-06, + "loss": 0.5857, + "step": 1905 + }, + { + "epoch": 0.9011820330969267, + "grad_norm": 2.4911322593688965, + "learning_rate": 4.750407621793601e-06, + "loss": 0.5618, + "step": 1906 + }, + { + "epoch": 0.9016548463356974, + "grad_norm": 2.6434662342071533, + "learning_rate": 4.750135841973755e-06, + "loss": 0.6057, + "step": 1907 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 3.115443706512451, + "learning_rate": 4.749863922047817e-06, + "loss": 0.6064, + "step": 1908 + }, + { + "epoch": 0.9026004728132387, + "grad_norm": 2.5671091079711914, + "learning_rate": 4.749591862032718e-06, + "loss": 0.5625, + "step": 1909 + }, + { + "epoch": 0.9030732860520094, + "grad_norm": 3.2008655071258545, + "learning_rate": 4.749319661945398e-06, + "loss": 0.5547, + "step": 1910 + }, + { + "epoch": 0.9035460992907801, + "grad_norm": 2.905987024307251, + "learning_rate": 4.749047321802805e-06, + "loss": 0.6033, + "step": 1911 + }, + { + "epoch": 0.9040189125295508, + "grad_norm": 3.1456053256988525, + "learning_rate": 4.748774841621897e-06, + "loss": 0.5651, + "step": 1912 + }, + { + "epoch": 0.9044917257683215, + "grad_norm": 2.8116416931152344, + "learning_rate": 4.748502221419641e-06, + "loss": 0.5853, + "step": 1913 + }, + { + "epoch": 0.9049645390070922, + "grad_norm": 3.123835325241089, + "learning_rate": 4.748229461213011e-06, + "loss": 0.5427, + "step": 1914 + }, + { + "epoch": 0.9054373522458629, + "grad_norm": 2.4750146865844727, + "learning_rate": 4.747956561018989e-06, + "loss": 0.6517, + "step": 1915 + }, + { + "epoch": 0.9059101654846335, + "grad_norm": 2.6174299716949463, + "learning_rate": 4.7476835208545705e-06, + "loss": 0.6119, + "step": 1916 + }, + { + "epoch": 0.9063829787234042, + "grad_norm": 2.7390382289886475, + "learning_rate": 4.747410340736755e-06, + "loss": 0.5664, + "step": 1917 + }, + { + "epoch": 0.9068557919621749, + "grad_norm": 2.7940444946289062, + "learning_rate": 4.747137020682552e-06, + "loss": 0.5628, + "step": 1918 + }, + { + "epoch": 0.9073286052009456, + "grad_norm": 2.477365016937256, + "learning_rate": 4.7468635607089795e-06, + "loss": 0.5261, + "step": 1919 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 2.7016685009002686, + "learning_rate": 4.746589960833066e-06, + "loss": 0.5576, + "step": 1920 + }, + { + "epoch": 0.908274231678487, + "grad_norm": 2.8806519508361816, + "learning_rate": 4.746316221071846e-06, + "loss": 0.5925, + "step": 1921 + }, + { + "epoch": 0.9087470449172577, + "grad_norm": 3.0315234661102295, + "learning_rate": 4.746042341442365e-06, + "loss": 0.6142, + "step": 1922 + }, + { + "epoch": 0.9092198581560283, + "grad_norm": 4.2446160316467285, + "learning_rate": 4.745768321961676e-06, + "loss": 0.5352, + "step": 1923 + }, + { + "epoch": 0.909692671394799, + "grad_norm": 
2.6517012119293213, + "learning_rate": 4.745494162646841e-06, + "loss": 0.6118, + "step": 1924 + }, + { + "epoch": 0.9101654846335697, + "grad_norm": 2.774900197982788, + "learning_rate": 4.7452198635149304e-06, + "loss": 0.572, + "step": 1925 + }, + { + "epoch": 0.9106382978723404, + "grad_norm": 3.0133683681488037, + "learning_rate": 4.744945424583024e-06, + "loss": 0.5897, + "step": 1926 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 2.7344839572906494, + "learning_rate": 4.744670845868211e-06, + "loss": 0.6207, + "step": 1927 + }, + { + "epoch": 0.9115839243498818, + "grad_norm": 2.636578321456909, + "learning_rate": 4.744396127387586e-06, + "loss": 0.6687, + "step": 1928 + }, + { + "epoch": 0.9120567375886525, + "grad_norm": 2.8663458824157715, + "learning_rate": 4.744121269158255e-06, + "loss": 0.5002, + "step": 1929 + }, + { + "epoch": 0.9125295508274232, + "grad_norm": 2.661079168319702, + "learning_rate": 4.743846271197333e-06, + "loss": 0.5848, + "step": 1930 + }, + { + "epoch": 0.9130023640661938, + "grad_norm": 2.881256341934204, + "learning_rate": 4.743571133521943e-06, + "loss": 0.5911, + "step": 1931 + }, + { + "epoch": 0.9134751773049645, + "grad_norm": 2.5540573596954346, + "learning_rate": 4.743295856149217e-06, + "loss": 0.5647, + "step": 1932 + }, + { + "epoch": 0.9139479905437352, + "grad_norm": 2.7060387134552, + "learning_rate": 4.743020439096293e-06, + "loss": 0.6267, + "step": 1933 + }, + { + "epoch": 0.9144208037825059, + "grad_norm": 2.694481372833252, + "learning_rate": 4.742744882380323e-06, + "loss": 0.6283, + "step": 1934 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 2.711555242538452, + "learning_rate": 4.7424691860184625e-06, + "loss": 0.5784, + "step": 1935 + }, + { + "epoch": 0.9153664302600473, + "grad_norm": 2.9077224731445312, + "learning_rate": 4.742193350027879e-06, + "loss": 0.5948, + "step": 1936 + }, + { + "epoch": 0.915839243498818, + "grad_norm": 2.9824187755584717, + "learning_rate": 4.7419173744257476e-06, + "loss": 0.6115, + "step": 1937 + }, + { + "epoch": 0.9163120567375886, + "grad_norm": 2.5127830505371094, + "learning_rate": 4.7416412592292515e-06, + "loss": 0.5803, + "step": 1938 + }, + { + "epoch": 0.9167848699763593, + "grad_norm": 3.1307175159454346, + "learning_rate": 4.741365004455583e-06, + "loss": 0.5657, + "step": 1939 + }, + { + "epoch": 0.91725768321513, + "grad_norm": 2.8205273151397705, + "learning_rate": 4.741088610121944e-06, + "loss": 0.6145, + "step": 1940 + }, + { + "epoch": 0.9177304964539007, + "grad_norm": 2.6119720935821533, + "learning_rate": 4.7408120762455444e-06, + "loss": 0.6058, + "step": 1941 + }, + { + "epoch": 0.9182033096926714, + "grad_norm": 2.421276092529297, + "learning_rate": 4.7405354028436025e-06, + "loss": 0.5973, + "step": 1942 + }, + { + "epoch": 0.9186761229314421, + "grad_norm": 2.9846808910369873, + "learning_rate": 4.740258589933346e-06, + "loss": 0.6892, + "step": 1943 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 2.6899871826171875, + "learning_rate": 4.739981637532009e-06, + "loss": 0.5705, + "step": 1944 + }, + { + "epoch": 0.9196217494089834, + "grad_norm": 2.8636131286621094, + "learning_rate": 4.739704545656839e-06, + "loss": 0.5775, + "step": 1945 + }, + { + "epoch": 0.9200945626477541, + "grad_norm": 2.7659449577331543, + "learning_rate": 4.739427314325087e-06, + "loss": 0.5823, + "step": 1946 + }, + { + "epoch": 0.9205673758865248, + "grad_norm": 4.71295166015625, + "learning_rate": 4.739149943554016e-06, + "loss": 0.5601, + "step": 1947 + }, + { + "epoch": 
0.9210401891252955, + "grad_norm": 2.642636775970459, + "learning_rate": 4.738872433360896e-06, + "loss": 0.5278, + "step": 1948 + }, + { + "epoch": 0.9215130023640662, + "grad_norm": 2.4658217430114746, + "learning_rate": 4.7385947837630065e-06, + "loss": 0.6392, + "step": 1949 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 2.851602792739868, + "learning_rate": 4.738316994777636e-06, + "loss": 0.6164, + "step": 1950 + }, + { + "epoch": 0.9224586288416076, + "grad_norm": 2.394226551055908, + "learning_rate": 4.738039066422081e-06, + "loss": 0.5556, + "step": 1951 + }, + { + "epoch": 0.9229314420803783, + "grad_norm": 2.7985100746154785, + "learning_rate": 4.737760998713647e-06, + "loss": 0.5799, + "step": 1952 + }, + { + "epoch": 0.9234042553191489, + "grad_norm": 2.5974674224853516, + "learning_rate": 4.737482791669648e-06, + "loss": 0.6984, + "step": 1953 + }, + { + "epoch": 0.9238770685579196, + "grad_norm": 2.707636594772339, + "learning_rate": 4.737204445307406e-06, + "loss": 0.5548, + "step": 1954 + }, + { + "epoch": 0.9243498817966903, + "grad_norm": 2.7882707118988037, + "learning_rate": 4.736925959644254e-06, + "loss": 0.6026, + "step": 1955 + }, + { + "epoch": 0.924822695035461, + "grad_norm": 2.474482774734497, + "learning_rate": 4.7366473346975304e-06, + "loss": 0.5832, + "step": 1956 + }, + { + "epoch": 0.9252955082742317, + "grad_norm": 2.6196324825286865, + "learning_rate": 4.736368570484585e-06, + "loss": 0.5861, + "step": 1957 + }, + { + "epoch": 0.9257683215130024, + "grad_norm": 2.826864004135132, + "learning_rate": 4.736089667022775e-06, + "loss": 0.6173, + "step": 1958 + }, + { + "epoch": 0.926241134751773, + "grad_norm": 2.414473056793213, + "learning_rate": 4.735810624329466e-06, + "loss": 0.5753, + "step": 1959 + }, + { + "epoch": 0.9267139479905437, + "grad_norm": 2.8037970066070557, + "learning_rate": 4.7355314424220335e-06, + "loss": 0.6207, + "step": 1960 + }, + { + "epoch": 0.9271867612293144, + "grad_norm": 2.645458698272705, + "learning_rate": 4.735252121317861e-06, + "loss": 0.5959, + "step": 1961 + }, + { + "epoch": 0.9276595744680851, + "grad_norm": 2.7983884811401367, + "learning_rate": 4.734972661034339e-06, + "loss": 0.5696, + "step": 1962 + }, + { + "epoch": 0.9281323877068558, + "grad_norm": 3.0568997859954834, + "learning_rate": 4.73469306158887e-06, + "loss": 0.6194, + "step": 1963 + }, + { + "epoch": 0.9286052009456265, + "grad_norm": 2.7205135822296143, + "learning_rate": 4.734413322998863e-06, + "loss": 0.5292, + "step": 1964 + }, + { + "epoch": 0.9290780141843972, + "grad_norm": 3.3168489933013916, + "learning_rate": 4.734133445281735e-06, + "loss": 0.5654, + "step": 1965 + }, + { + "epoch": 0.9295508274231679, + "grad_norm": 3.0095653533935547, + "learning_rate": 4.733853428454916e-06, + "loss": 0.6508, + "step": 1966 + }, + { + "epoch": 0.9300236406619385, + "grad_norm": 2.7726712226867676, + "learning_rate": 4.733573272535838e-06, + "loss": 0.644, + "step": 1967 + }, + { + "epoch": 0.9304964539007092, + "grad_norm": 2.474397659301758, + "learning_rate": 4.7332929775419456e-06, + "loss": 0.5479, + "step": 1968 + }, + { + "epoch": 0.9309692671394799, + "grad_norm": 2.4518635272979736, + "learning_rate": 4.733012543490693e-06, + "loss": 0.6, + "step": 1969 + }, + { + "epoch": 0.9314420803782506, + "grad_norm": 2.9292192459106445, + "learning_rate": 4.73273197039954e-06, + "loss": 0.6647, + "step": 1970 + }, + { + "epoch": 0.9319148936170213, + "grad_norm": 2.425004720687866, + "learning_rate": 4.732451258285958e-06, + "loss": 0.6338, + 
"step": 1971 + }, + { + "epoch": 0.932387706855792, + "grad_norm": 2.904479503631592, + "learning_rate": 4.7321704071674255e-06, + "loss": 0.5923, + "step": 1972 + }, + { + "epoch": 0.9328605200945627, + "grad_norm": 2.477085590362549, + "learning_rate": 4.731889417061428e-06, + "loss": 0.5984, + "step": 1973 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 2.585240364074707, + "learning_rate": 4.731608287985465e-06, + "loss": 0.558, + "step": 1974 + }, + { + "epoch": 0.933806146572104, + "grad_norm": 2.658714532852173, + "learning_rate": 4.731327019957039e-06, + "loss": 0.5567, + "step": 1975 + }, + { + "epoch": 0.9342789598108747, + "grad_norm": 2.7593026161193848, + "learning_rate": 4.731045612993662e-06, + "loss": 0.5772, + "step": 1976 + }, + { + "epoch": 0.9347517730496454, + "grad_norm": 2.4386026859283447, + "learning_rate": 4.7307640671128585e-06, + "loss": 0.6199, + "step": 1977 + }, + { + "epoch": 0.9352245862884161, + "grad_norm": 2.681910514831543, + "learning_rate": 4.730482382332158e-06, + "loss": 0.5971, + "step": 1978 + }, + { + "epoch": 0.9356973995271868, + "grad_norm": 3.7593860626220703, + "learning_rate": 4.7302005586691e-06, + "loss": 0.6346, + "step": 1979 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.5789096355438232, + "learning_rate": 4.729918596141232e-06, + "loss": 0.5684, + "step": 1980 + }, + { + "epoch": 0.9366430260047282, + "grad_norm": 3.0607335567474365, + "learning_rate": 4.729636494766111e-06, + "loss": 0.6223, + "step": 1981 + }, + { + "epoch": 0.9371158392434988, + "grad_norm": 2.906643867492676, + "learning_rate": 4.729354254561303e-06, + "loss": 0.6513, + "step": 1982 + }, + { + "epoch": 0.9375886524822695, + "grad_norm": 3.192430019378662, + "learning_rate": 4.7290718755443795e-06, + "loss": 0.5095, + "step": 1983 + }, + { + "epoch": 0.9380614657210402, + "grad_norm": 2.661536931991577, + "learning_rate": 4.7287893577329255e-06, + "loss": 0.5525, + "step": 1984 + }, + { + "epoch": 0.9385342789598109, + "grad_norm": 2.8436734676361084, + "learning_rate": 4.728506701144531e-06, + "loss": 0.6323, + "step": 1985 + }, + { + "epoch": 0.9390070921985816, + "grad_norm": 2.75544810295105, + "learning_rate": 4.728223905796796e-06, + "loss": 0.6018, + "step": 1986 + }, + { + "epoch": 0.9394799054373523, + "grad_norm": 3.0652759075164795, + "learning_rate": 4.727940971707329e-06, + "loss": 0.62, + "step": 1987 + }, + { + "epoch": 0.939952718676123, + "grad_norm": 2.802567720413208, + "learning_rate": 4.727657898893747e-06, + "loss": 0.5809, + "step": 1988 + }, + { + "epoch": 0.9404255319148936, + "grad_norm": 2.6208512783050537, + "learning_rate": 4.7273746873736745e-06, + "loss": 0.5762, + "step": 1989 + }, + { + "epoch": 0.9408983451536643, + "grad_norm": 2.5901873111724854, + "learning_rate": 4.727091337164748e-06, + "loss": 0.6111, + "step": 1990 + }, + { + "epoch": 0.941371158392435, + "grad_norm": 3.002347707748413, + "learning_rate": 4.726807848284609e-06, + "loss": 0.6419, + "step": 1991 + }, + { + "epoch": 0.9418439716312057, + "grad_norm": 2.522151470184326, + "learning_rate": 4.72652422075091e-06, + "loss": 0.642, + "step": 1992 + }, + { + "epoch": 0.9423167848699764, + "grad_norm": 2.5571532249450684, + "learning_rate": 4.726240454581311e-06, + "loss": 0.5729, + "step": 1993 + }, + { + "epoch": 0.9427895981087471, + "grad_norm": 2.7704918384552, + "learning_rate": 4.72595654979348e-06, + "loss": 0.6816, + "step": 1994 + }, + { + "epoch": 0.9432624113475178, + "grad_norm": 2.517040491104126, + "learning_rate": 
4.7256725064050955e-06, + "loss": 0.5782, + "step": 1995 + }, + { + "epoch": 0.9437352245862884, + "grad_norm": 2.613955020904541, + "learning_rate": 4.725388324433843e-06, + "loss": 0.6291, + "step": 1996 + }, + { + "epoch": 0.9442080378250591, + "grad_norm": 2.848891258239746, + "learning_rate": 4.725104003897418e-06, + "loss": 0.6544, + "step": 1997 + }, + { + "epoch": 0.9446808510638298, + "grad_norm": 3.0162429809570312, + "learning_rate": 4.724819544813523e-06, + "loss": 0.6301, + "step": 1998 + }, + { + "epoch": 0.9451536643026005, + "grad_norm": 2.613614559173584, + "learning_rate": 4.72453494719987e-06, + "loss": 0.5829, + "step": 1999 + }, + { + "epoch": 0.9456264775413712, + "grad_norm": 2.4838767051696777, + "learning_rate": 4.724250211074182e-06, + "loss": 0.6042, + "step": 2000 + }, + { + "epoch": 0.9460992907801419, + "grad_norm": 2.526470899581909, + "learning_rate": 4.723965336454185e-06, + "loss": 0.6167, + "step": 2001 + }, + { + "epoch": 0.9465721040189126, + "grad_norm": 2.504506826400757, + "learning_rate": 4.723680323357618e-06, + "loss": 0.6061, + "step": 2002 + }, + { + "epoch": 0.9470449172576832, + "grad_norm": 3.0547544956207275, + "learning_rate": 4.723395171802228e-06, + "loss": 0.6619, + "step": 2003 + }, + { + "epoch": 0.9475177304964539, + "grad_norm": 2.8692407608032227, + "learning_rate": 4.723109881805771e-06, + "loss": 0.5985, + "step": 2004 + }, + { + "epoch": 0.9479905437352246, + "grad_norm": 2.7929654121398926, + "learning_rate": 4.7228244533860094e-06, + "loss": 0.5869, + "step": 2005 + }, + { + "epoch": 0.9484633569739953, + "grad_norm": 2.764869451522827, + "learning_rate": 4.7225388865607146e-06, + "loss": 0.6288, + "step": 2006 + }, + { + "epoch": 0.948936170212766, + "grad_norm": 2.7656404972076416, + "learning_rate": 4.722253181347671e-06, + "loss": 0.5831, + "step": 2007 + }, + { + "epoch": 0.9494089834515367, + "grad_norm": 2.6698336601257324, + "learning_rate": 4.7219673377646635e-06, + "loss": 0.6087, + "step": 2008 + }, + { + "epoch": 0.9498817966903074, + "grad_norm": 2.524935722351074, + "learning_rate": 4.7216813558294946e-06, + "loss": 0.5675, + "step": 2009 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 2.5998785495758057, + "learning_rate": 4.721395235559969e-06, + "loss": 0.5667, + "step": 2010 + }, + { + "epoch": 0.9508274231678487, + "grad_norm": 2.758021354675293, + "learning_rate": 4.721108976973902e-06, + "loss": 0.4931, + "step": 2011 + }, + { + "epoch": 0.9513002364066194, + "grad_norm": 2.767695903778076, + "learning_rate": 4.72082258008912e-06, + "loss": 0.5778, + "step": 2012 + }, + { + "epoch": 0.9517730496453901, + "grad_norm": 2.982314348220825, + "learning_rate": 4.720536044923453e-06, + "loss": 0.6096, + "step": 2013 + }, + { + "epoch": 0.9522458628841608, + "grad_norm": 2.7608799934387207, + "learning_rate": 4.720249371494743e-06, + "loss": 0.6242, + "step": 2014 + }, + { + "epoch": 0.9527186761229315, + "grad_norm": 2.60054349899292, + "learning_rate": 4.71996255982084e-06, + "loss": 0.6249, + "step": 2015 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 2.654355764389038, + "learning_rate": 4.719675609919603e-06, + "loss": 0.6327, + "step": 2016 + }, + { + "epoch": 0.9536643026004729, + "grad_norm": 2.589404582977295, + "learning_rate": 4.719388521808899e-06, + "loss": 0.6357, + "step": 2017 + }, + { + "epoch": 0.9541371158392435, + "grad_norm": 2.8016581535339355, + "learning_rate": 4.719101295506603e-06, + "loss": 0.5901, + "step": 2018 + }, + { + "epoch": 0.9546099290780142, + "grad_norm": 
3.1408045291900635, + "learning_rate": 4.7188139310306e-06, + "loss": 0.598, + "step": 2019 + }, + { + "epoch": 0.9550827423167849, + "grad_norm": 2.7432665824890137, + "learning_rate": 4.718526428398783e-06, + "loss": 0.5508, + "step": 2020 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 2.947800874710083, + "learning_rate": 4.718238787629053e-06, + "loss": 0.6439, + "step": 2021 + }, + { + "epoch": 0.9560283687943263, + "grad_norm": 2.50828218460083, + "learning_rate": 4.71795100873932e-06, + "loss": 0.5441, + "step": 2022 + }, + { + "epoch": 0.956501182033097, + "grad_norm": 2.8558974266052246, + "learning_rate": 4.717663091747503e-06, + "loss": 0.5416, + "step": 2023 + }, + { + "epoch": 0.9569739952718677, + "grad_norm": 2.4803316593170166, + "learning_rate": 4.71737503667153e-06, + "loss": 0.5317, + "step": 2024 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 4.36754035949707, + "learning_rate": 4.717086843529336e-06, + "loss": 0.5808, + "step": 2025 + }, + { + "epoch": 0.957919621749409, + "grad_norm": 2.730185031890869, + "learning_rate": 4.7167985123388665e-06, + "loss": 0.5257, + "step": 2026 + }, + { + "epoch": 0.9583924349881797, + "grad_norm": 2.8136069774627686, + "learning_rate": 4.716510043118074e-06, + "loss": 0.5836, + "step": 2027 + }, + { + "epoch": 0.9588652482269504, + "grad_norm": 2.793975353240967, + "learning_rate": 4.71622143588492e-06, + "loss": 0.5706, + "step": 2028 + }, + { + "epoch": 0.9593380614657211, + "grad_norm": 2.3883821964263916, + "learning_rate": 4.7159326906573745e-06, + "loss": 0.5291, + "step": 2029 + }, + { + "epoch": 0.9598108747044918, + "grad_norm": 2.6135976314544678, + "learning_rate": 4.715643807453417e-06, + "loss": 0.6199, + "step": 2030 + }, + { + "epoch": 0.9602836879432625, + "grad_norm": 2.6245670318603516, + "learning_rate": 4.715354786291035e-06, + "loss": 0.5585, + "step": 2031 + }, + { + "epoch": 0.9607565011820332, + "grad_norm": 2.7870967388153076, + "learning_rate": 4.715065627188225e-06, + "loss": 0.6196, + "step": 2032 + }, + { + "epoch": 0.9612293144208038, + "grad_norm": 2.6983911991119385, + "learning_rate": 4.714776330162991e-06, + "loss": 0.6424, + "step": 2033 + }, + { + "epoch": 0.9617021276595744, + "grad_norm": 2.3221919536590576, + "learning_rate": 4.7144868952333465e-06, + "loss": 0.568, + "step": 2034 + }, + { + "epoch": 0.9621749408983451, + "grad_norm": 2.9408178329467773, + "learning_rate": 4.714197322417314e-06, + "loss": 0.6175, + "step": 2035 + }, + { + "epoch": 0.9626477541371158, + "grad_norm": 2.404057264328003, + "learning_rate": 4.713907611732921e-06, + "loss": 0.4943, + "step": 2036 + }, + { + "epoch": 0.9631205673758865, + "grad_norm": 3.547607660293579, + "learning_rate": 4.71361776319821e-06, + "loss": 0.5488, + "step": 2037 + }, + { + "epoch": 0.9635933806146572, + "grad_norm": 2.679614543914795, + "learning_rate": 4.713327776831227e-06, + "loss": 0.6234, + "step": 2038 + }, + { + "epoch": 0.9640661938534278, + "grad_norm": 2.526914119720459, + "learning_rate": 4.7130376526500286e-06, + "loss": 0.5891, + "step": 2039 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 2.6953470706939697, + "learning_rate": 4.71274739067268e-06, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 0.9650118203309692, + "grad_norm": 2.546660900115967, + "learning_rate": 4.712456990917254e-06, + "loss": 0.6185, + "step": 2041 + }, + { + "epoch": 0.9654846335697399, + "grad_norm": 3.3920490741729736, + "learning_rate": 4.712166453401832e-06, + "loss": 0.587, + "step": 2042 + }, + { + "epoch": 
0.9659574468085106, + "grad_norm": 2.5961573123931885, + "learning_rate": 4.711875778144504e-06, + "loss": 0.6105, + "step": 2043 + }, + { + "epoch": 0.9664302600472813, + "grad_norm": 2.5111498832702637, + "learning_rate": 4.711584965163372e-06, + "loss": 0.5533, + "step": 2044 + }, + { + "epoch": 0.966903073286052, + "grad_norm": 2.4878132343292236, + "learning_rate": 4.7112940144765405e-06, + "loss": 0.5604, + "step": 2045 + }, + { + "epoch": 0.9673758865248226, + "grad_norm": 2.5714077949523926, + "learning_rate": 4.711002926102128e-06, + "loss": 0.5794, + "step": 2046 + }, + { + "epoch": 0.9678486997635933, + "grad_norm": 2.7069091796875, + "learning_rate": 4.710711700058257e-06, + "loss": 0.594, + "step": 2047 + }, + { + "epoch": 0.968321513002364, + "grad_norm": 2.8104631900787354, + "learning_rate": 4.710420336363063e-06, + "loss": 0.6247, + "step": 2048 + }, + { + "epoch": 0.9687943262411347, + "grad_norm": 2.8464386463165283, + "learning_rate": 4.7101288350346865e-06, + "loss": 0.6162, + "step": 2049 + }, + { + "epoch": 0.9692671394799054, + "grad_norm": 2.7187976837158203, + "learning_rate": 4.709837196091279e-06, + "loss": 0.6109, + "step": 2050 + }, + { + "epoch": 0.9697399527186761, + "grad_norm": 2.556734085083008, + "learning_rate": 4.709545419550999e-06, + "loss": 0.6297, + "step": 2051 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 2.937195062637329, + "learning_rate": 4.709253505432014e-06, + "loss": 0.6862, + "step": 2052 + }, + { + "epoch": 0.9706855791962175, + "grad_norm": 2.792175531387329, + "learning_rate": 4.7089614537525015e-06, + "loss": 0.6105, + "step": 2053 + }, + { + "epoch": 0.9711583924349881, + "grad_norm": 2.625636100769043, + "learning_rate": 4.708669264530644e-06, + "loss": 0.5849, + "step": 2054 + }, + { + "epoch": 0.9716312056737588, + "grad_norm": 2.6752610206604004, + "learning_rate": 4.708376937784637e-06, + "loss": 0.5949, + "step": 2055 + }, + { + "epoch": 0.9721040189125295, + "grad_norm": 2.6072793006896973, + "learning_rate": 4.708084473532681e-06, + "loss": 0.5776, + "step": 2056 + }, + { + "epoch": 0.9725768321513002, + "grad_norm": 2.728632926940918, + "learning_rate": 4.707791871792988e-06, + "loss": 0.6352, + "step": 2057 + }, + { + "epoch": 0.9730496453900709, + "grad_norm": 2.5841758251190186, + "learning_rate": 4.707499132583775e-06, + "loss": 0.5488, + "step": 2058 + }, + { + "epoch": 0.9735224586288416, + "grad_norm": 2.8464293479919434, + "learning_rate": 4.707206255923271e-06, + "loss": 0.7051, + "step": 2059 + }, + { + "epoch": 0.9739952718676123, + "grad_norm": 2.547297239303589, + "learning_rate": 4.706913241829712e-06, + "loss": 0.5937, + "step": 2060 + }, + { + "epoch": 0.9744680851063829, + "grad_norm": 2.6572306156158447, + "learning_rate": 4.706620090321341e-06, + "loss": 0.6041, + "step": 2061 + }, + { + "epoch": 0.9749408983451536, + "grad_norm": 2.3262805938720703, + "learning_rate": 4.706326801416414e-06, + "loss": 0.5144, + "step": 2062 + }, + { + "epoch": 0.9754137115839243, + "grad_norm": 2.9693965911865234, + "learning_rate": 4.706033375133191e-06, + "loss": 0.551, + "step": 2063 + }, + { + "epoch": 0.975886524822695, + "grad_norm": 2.5993731021881104, + "learning_rate": 4.7057398114899435e-06, + "loss": 0.6143, + "step": 2064 + }, + { + "epoch": 0.9763593380614657, + "grad_norm": 2.453336477279663, + "learning_rate": 4.70544611050495e-06, + "loss": 0.6093, + "step": 2065 + }, + { + "epoch": 0.9768321513002364, + "grad_norm": 2.898629665374756, + "learning_rate": 4.705152272196497e-06, + "loss": 0.6007, + 
"step": 2066 + }, + { + "epoch": 0.9773049645390071, + "grad_norm": 2.7990612983703613, + "learning_rate": 4.7048582965828815e-06, + "loss": 0.6687, + "step": 2067 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 2.635284423828125, + "learning_rate": 4.704564183682408e-06, + "loss": 0.5564, + "step": 2068 + }, + { + "epoch": 0.9782505910165484, + "grad_norm": 3.014547109603882, + "learning_rate": 4.704269933513389e-06, + "loss": 0.6084, + "step": 2069 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 2.659357786178589, + "learning_rate": 4.703975546094147e-06, + "loss": 0.6031, + "step": 2070 + }, + { + "epoch": 0.9791962174940898, + "grad_norm": 2.326932668685913, + "learning_rate": 4.703681021443013e-06, + "loss": 0.5859, + "step": 2071 + }, + { + "epoch": 0.9796690307328605, + "grad_norm": 2.958803653717041, + "learning_rate": 4.7033863595783235e-06, + "loss": 0.5586, + "step": 2072 + }, + { + "epoch": 0.9801418439716312, + "grad_norm": 2.921386957168579, + "learning_rate": 4.703091560518427e-06, + "loss": 0.6126, + "step": 2073 + }, + { + "epoch": 0.9806146572104019, + "grad_norm": 2.6500775814056396, + "learning_rate": 4.702796624281679e-06, + "loss": 0.5678, + "step": 2074 + }, + { + "epoch": 0.9810874704491725, + "grad_norm": 2.7740228176116943, + "learning_rate": 4.702501550886445e-06, + "loss": 0.6067, + "step": 2075 + }, + { + "epoch": 0.9815602836879432, + "grad_norm": 2.3296213150024414, + "learning_rate": 4.702206340351096e-06, + "loss": 0.5247, + "step": 2076 + }, + { + "epoch": 0.9820330969267139, + "grad_norm": 2.748300790786743, + "learning_rate": 4.701910992694016e-06, + "loss": 0.5197, + "step": 2077 + }, + { + "epoch": 0.9825059101654846, + "grad_norm": 2.250985622406006, + "learning_rate": 4.7016155079335926e-06, + "loss": 0.5214, + "step": 2078 + }, + { + "epoch": 0.9829787234042553, + "grad_norm": 2.389845848083496, + "learning_rate": 4.701319886088226e-06, + "loss": 0.519, + "step": 2079 + }, + { + "epoch": 0.983451536643026, + "grad_norm": 2.818220853805542, + "learning_rate": 4.701024127176322e-06, + "loss": 0.607, + "step": 2080 + }, + { + "epoch": 0.9839243498817967, + "grad_norm": 3.4058034420013428, + "learning_rate": 4.700728231216297e-06, + "loss": 0.5711, + "step": 2081 + }, + { + "epoch": 0.9843971631205674, + "grad_norm": 2.5297787189483643, + "learning_rate": 4.700432198226575e-06, + "loss": 0.5979, + "step": 2082 + }, + { + "epoch": 0.984869976359338, + "grad_norm": 3.0548105239868164, + "learning_rate": 4.7001360282255885e-06, + "loss": 0.6041, + "step": 2083 + }, + { + "epoch": 0.9853427895981087, + "grad_norm": 2.8983733654022217, + "learning_rate": 4.699839721231779e-06, + "loss": 0.5926, + "step": 2084 + }, + { + "epoch": 0.9858156028368794, + "grad_norm": 3.2717764377593994, + "learning_rate": 4.699543277263596e-06, + "loss": 0.6477, + "step": 2085 + }, + { + "epoch": 0.9862884160756501, + "grad_norm": 3.03729248046875, + "learning_rate": 4.699246696339497e-06, + "loss": 0.6786, + "step": 2086 + }, + { + "epoch": 0.9867612293144208, + "grad_norm": 2.852301597595215, + "learning_rate": 4.698949978477951e-06, + "loss": 0.6565, + "step": 2087 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 2.843485116958618, + "learning_rate": 4.698653123697431e-06, + "loss": 0.6627, + "step": 2088 + }, + { + "epoch": 0.9877068557919622, + "grad_norm": 2.6315064430236816, + "learning_rate": 4.698356132016423e-06, + "loss": 0.6577, + "step": 2089 + }, + { + "epoch": 0.9881796690307328, + "grad_norm": 2.7482151985168457, + "learning_rate": 
4.698059003453417e-06, + "loss": 0.5514, + "step": 2090 + }, + { + "epoch": 0.9886524822695035, + "grad_norm": 2.826673746109009, + "learning_rate": 4.6977617380269145e-06, + "loss": 0.565, + "step": 2091 + }, + { + "epoch": 0.9891252955082742, + "grad_norm": 3.0273752212524414, + "learning_rate": 4.697464335755427e-06, + "loss": 0.6331, + "step": 2092 + }, + { + "epoch": 0.9895981087470449, + "grad_norm": 2.7551653385162354, + "learning_rate": 4.6971667966574695e-06, + "loss": 0.6486, + "step": 2093 + }, + { + "epoch": 0.9900709219858156, + "grad_norm": 2.656299114227295, + "learning_rate": 4.696869120751571e-06, + "loss": 0.6562, + "step": 2094 + }, + { + "epoch": 0.9905437352245863, + "grad_norm": 2.785322904586792, + "learning_rate": 4.696571308056265e-06, + "loss": 0.5892, + "step": 2095 + }, + { + "epoch": 0.991016548463357, + "grad_norm": 2.9334635734558105, + "learning_rate": 4.696273358590095e-06, + "loss": 0.6346, + "step": 2096 + }, + { + "epoch": 0.9914893617021276, + "grad_norm": 2.7944300174713135, + "learning_rate": 4.695975272371613e-06, + "loss": 0.5859, + "step": 2097 + }, + { + "epoch": 0.9919621749408983, + "grad_norm": 2.5416972637176514, + "learning_rate": 4.695677049419381e-06, + "loss": 0.5658, + "step": 2098 + }, + { + "epoch": 0.992434988179669, + "grad_norm": 2.4056856632232666, + "learning_rate": 4.695378689751966e-06, + "loss": 0.5121, + "step": 2099 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 2.614548683166504, + "learning_rate": 4.695080193387948e-06, + "loss": 0.5961, + "step": 2100 + }, + { + "epoch": 0.9933806146572104, + "grad_norm": 2.8966517448425293, + "learning_rate": 4.69478156034591e-06, + "loss": 0.5985, + "step": 2101 + }, + { + "epoch": 0.9938534278959811, + "grad_norm": 2.9514098167419434, + "learning_rate": 4.694482790644448e-06, + "loss": 0.5677, + "step": 2102 + }, + { + "epoch": 0.9943262411347518, + "grad_norm": 2.4326791763305664, + "learning_rate": 4.694183884302165e-06, + "loss": 0.5698, + "step": 2103 + }, + { + "epoch": 0.9947990543735225, + "grad_norm": 2.9242892265319824, + "learning_rate": 4.6938848413376735e-06, + "loss": 0.6245, + "step": 2104 + }, + { + "epoch": 0.9952718676122931, + "grad_norm": 2.9134104251861572, + "learning_rate": 4.693585661769593e-06, + "loss": 0.6164, + "step": 2105 + }, + { + "epoch": 0.9957446808510638, + "grad_norm": 2.472564458847046, + "learning_rate": 4.693286345616551e-06, + "loss": 0.5616, + "step": 2106 + }, + { + "epoch": 0.9962174940898345, + "grad_norm": 3.2456448078155518, + "learning_rate": 4.692986892897186e-06, + "loss": 0.6977, + "step": 2107 + }, + { + "epoch": 0.9966903073286052, + "grad_norm": 3.4032769203186035, + "learning_rate": 4.692687303630143e-06, + "loss": 0.643, + "step": 2108 + }, + { + "epoch": 0.9971631205673759, + "grad_norm": 2.722200870513916, + "learning_rate": 4.692387577834076e-06, + "loss": 0.5873, + "step": 2109 + }, + { + "epoch": 0.9976359338061466, + "grad_norm": 2.687532663345337, + "learning_rate": 4.692087715527648e-06, + "loss": 0.5423, + "step": 2110 + }, + { + "epoch": 0.9981087470449173, + "grad_norm": 2.578613042831421, + "learning_rate": 4.6917877167295305e-06, + "loss": 0.5689, + "step": 2111 + }, + { + "epoch": 0.9985815602836879, + "grad_norm": 3.1806094646453857, + "learning_rate": 4.691487581458402e-06, + "loss": 0.6133, + "step": 2112 + }, + { + "epoch": 0.9990543735224586, + "grad_norm": 2.4449520111083984, + "learning_rate": 4.691187309732952e-06, + "loss": 0.5841, + "step": 2113 + }, + { + "epoch": 0.9995271867612293, + "grad_norm": 
2.908749580383301, + "learning_rate": 4.690886901571875e-06, + "loss": 0.534, + "step": 2114 + }, + { + "epoch": 1.0, + "grad_norm": 4.019968032836914, + "learning_rate": 4.6905863569938785e-06, + "loss": 0.596, + "step": 2115 + }, + { + "epoch": 1.0004728132387706, + "grad_norm": 2.4319307804107666, + "learning_rate": 4.690285676017675e-06, + "loss": 0.4973, + "step": 2116 + }, + { + "epoch": 1.0009456264775414, + "grad_norm": 2.6366477012634277, + "learning_rate": 4.689984858661986e-06, + "loss": 0.5682, + "step": 2117 + }, + { + "epoch": 1.001418439716312, + "grad_norm": 2.815114974975586, + "learning_rate": 4.689683904945542e-06, + "loss": 0.5616, + "step": 2118 + }, + { + "epoch": 1.0018912529550827, + "grad_norm": 2.6680490970611572, + "learning_rate": 4.689382814887084e-06, + "loss": 0.5161, + "step": 2119 + }, + { + "epoch": 1.0023640661938533, + "grad_norm": 2.7406351566314697, + "learning_rate": 4.689081588505358e-06, + "loss": 0.4937, + "step": 2120 + }, + { + "epoch": 1.0028368794326241, + "grad_norm": 2.2832298278808594, + "learning_rate": 4.68878022581912e-06, + "loss": 0.4986, + "step": 2121 + }, + { + "epoch": 1.0033096926713947, + "grad_norm": 2.5525307655334473, + "learning_rate": 4.688478726847136e-06, + "loss": 0.4909, + "step": 2122 + }, + { + "epoch": 1.0037825059101655, + "grad_norm": 2.9843199253082275, + "learning_rate": 4.688177091608176e-06, + "loss": 0.6046, + "step": 2123 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 2.5231106281280518, + "learning_rate": 4.687875320121024e-06, + "loss": 0.5423, + "step": 2124 + }, + { + "epoch": 1.0047281323877069, + "grad_norm": 2.567599058151245, + "learning_rate": 4.68757341240447e-06, + "loss": 0.5092, + "step": 2125 + }, + { + "epoch": 1.0052009456264774, + "grad_norm": 2.768111228942871, + "learning_rate": 4.687271368477311e-06, + "loss": 0.5175, + "step": 2126 + }, + { + "epoch": 1.0056737588652482, + "grad_norm": 2.7223286628723145, + "learning_rate": 4.686969188358355e-06, + "loss": 0.5412, + "step": 2127 + }, + { + "epoch": 1.0061465721040188, + "grad_norm": 2.488299608230591, + "learning_rate": 4.686666872066418e-06, + "loss": 0.5288, + "step": 2128 + }, + { + "epoch": 1.0066193853427896, + "grad_norm": 2.882981777191162, + "learning_rate": 4.6863644196203215e-06, + "loss": 0.6117, + "step": 2129 + }, + { + "epoch": 1.0070921985815602, + "grad_norm": 3.0019447803497314, + "learning_rate": 4.686061831038901e-06, + "loss": 0.5308, + "step": 2130 + }, + { + "epoch": 1.007565011820331, + "grad_norm": 3.0056138038635254, + "learning_rate": 4.685759106340996e-06, + "loss": 0.5833, + "step": 2131 + }, + { + "epoch": 1.0080378250591016, + "grad_norm": 2.5709075927734375, + "learning_rate": 4.685456245545454e-06, + "loss": 0.5071, + "step": 2132 + }, + { + "epoch": 1.0085106382978724, + "grad_norm": 2.4641504287719727, + "learning_rate": 4.685153248671136e-06, + "loss": 0.4813, + "step": 2133 + }, + { + "epoch": 1.008983451536643, + "grad_norm": 2.374413013458252, + "learning_rate": 4.684850115736906e-06, + "loss": 0.5179, + "step": 2134 + }, + { + "epoch": 1.0094562647754137, + "grad_norm": 2.6504571437835693, + "learning_rate": 4.684546846761641e-06, + "loss": 0.437, + "step": 2135 + }, + { + "epoch": 1.0099290780141843, + "grad_norm": 2.5977871417999268, + "learning_rate": 4.684243441764221e-06, + "loss": 0.497, + "step": 2136 + }, + { + "epoch": 1.010401891252955, + "grad_norm": 2.4950785636901855, + "learning_rate": 4.683939900763541e-06, + "loss": 0.5624, + "step": 2137 + }, + { + "epoch": 1.0108747044917257, + 
"grad_norm": 3.065718412399292, + "learning_rate": 4.6836362237785e-06, + "loss": 0.512, + "step": 2138 + }, + { + "epoch": 1.0113475177304965, + "grad_norm": 2.7419207096099854, + "learning_rate": 4.6833324108280045e-06, + "loss": 0.5585, + "step": 2139 + }, + { + "epoch": 1.011820330969267, + "grad_norm": 2.623610496520996, + "learning_rate": 4.6830284619309744e-06, + "loss": 0.5163, + "step": 2140 + }, + { + "epoch": 1.0122931442080378, + "grad_norm": 2.774322986602783, + "learning_rate": 4.682724377106334e-06, + "loss": 0.527, + "step": 2141 + }, + { + "epoch": 1.0127659574468084, + "grad_norm": 2.959935188293457, + "learning_rate": 4.682420156373017e-06, + "loss": 0.6166, + "step": 2142 + }, + { + "epoch": 1.0132387706855792, + "grad_norm": 2.584026336669922, + "learning_rate": 4.682115799749968e-06, + "loss": 0.5086, + "step": 2143 + }, + { + "epoch": 1.0137115839243498, + "grad_norm": 2.6039700508117676, + "learning_rate": 4.6818113072561346e-06, + "loss": 0.49, + "step": 2144 + }, + { + "epoch": 1.0141843971631206, + "grad_norm": 2.466381072998047, + "learning_rate": 4.681506678910479e-06, + "loss": 0.4959, + "step": 2145 + }, + { + "epoch": 1.0146572104018912, + "grad_norm": 2.432636260986328, + "learning_rate": 4.681201914731969e-06, + "loss": 0.5057, + "step": 2146 + }, + { + "epoch": 1.015130023640662, + "grad_norm": 2.6134090423583984, + "learning_rate": 4.680897014739579e-06, + "loss": 0.4874, + "step": 2147 + }, + { + "epoch": 1.0156028368794325, + "grad_norm": 2.774481773376465, + "learning_rate": 4.680591978952295e-06, + "loss": 0.4967, + "step": 2148 + }, + { + "epoch": 1.0160756501182033, + "grad_norm": 2.66050124168396, + "learning_rate": 4.68028680738911e-06, + "loss": 0.4932, + "step": 2149 + }, + { + "epoch": 1.016548463356974, + "grad_norm": 3.020594835281372, + "learning_rate": 4.679981500069026e-06, + "loss": 0.5788, + "step": 2150 + }, + { + "epoch": 1.0170212765957447, + "grad_norm": 2.697758436203003, + "learning_rate": 4.679676057011053e-06, + "loss": 0.5441, + "step": 2151 + }, + { + "epoch": 1.0174940898345153, + "grad_norm": 6.986445903778076, + "learning_rate": 4.679370478234209e-06, + "loss": 0.6483, + "step": 2152 + }, + { + "epoch": 1.017966903073286, + "grad_norm": 2.6637115478515625, + "learning_rate": 4.679064763757522e-06, + "loss": 0.5859, + "step": 2153 + }, + { + "epoch": 1.0184397163120567, + "grad_norm": 2.7501862049102783, + "learning_rate": 4.678758913600027e-06, + "loss": 0.5745, + "step": 2154 + }, + { + "epoch": 1.0189125295508275, + "grad_norm": 2.7959372997283936, + "learning_rate": 4.678452927780768e-06, + "loss": 0.5076, + "step": 2155 + }, + { + "epoch": 1.019385342789598, + "grad_norm": 2.4377388954162598, + "learning_rate": 4.678146806318798e-06, + "loss": 0.5061, + "step": 2156 + }, + { + "epoch": 1.0198581560283688, + "grad_norm": 2.5478947162628174, + "learning_rate": 4.677840549233176e-06, + "loss": 0.4941, + "step": 2157 + }, + { + "epoch": 1.0203309692671394, + "grad_norm": 3.0956528186798096, + "learning_rate": 4.677534156542973e-06, + "loss": 0.5879, + "step": 2158 + }, + { + "epoch": 1.0208037825059102, + "grad_norm": 2.5247607231140137, + "learning_rate": 4.6772276282672666e-06, + "loss": 0.5532, + "step": 2159 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 3.1972787380218506, + "learning_rate": 4.676920964425143e-06, + "loss": 0.6081, + "step": 2160 + }, + { + "epoch": 1.0217494089834516, + "grad_norm": 2.6173388957977295, + "learning_rate": 4.6766141650356955e-06, + "loss": 0.5001, + "step": 2161 + }, + { + 
"epoch": 1.0222222222222221, + "grad_norm": 2.9914398193359375, + "learning_rate": 4.676307230118029e-06, + "loss": 0.5566, + "step": 2162 + }, + { + "epoch": 1.022695035460993, + "grad_norm": 2.8011834621429443, + "learning_rate": 4.676000159691254e-06, + "loss": 0.4909, + "step": 2163 + }, + { + "epoch": 1.0231678486997635, + "grad_norm": 2.6049559116363525, + "learning_rate": 4.67569295377449e-06, + "loss": 0.5018, + "step": 2164 + }, + { + "epoch": 1.0236406619385343, + "grad_norm": 2.8175013065338135, + "learning_rate": 4.675385612386866e-06, + "loss": 0.5309, + "step": 2165 + }, + { + "epoch": 1.0241134751773049, + "grad_norm": 2.854696750640869, + "learning_rate": 4.675078135547519e-06, + "loss": 0.5627, + "step": 2166 + }, + { + "epoch": 1.0245862884160757, + "grad_norm": 3.1856436729431152, + "learning_rate": 4.674770523275594e-06, + "loss": 0.5475, + "step": 2167 + }, + { + "epoch": 1.0250591016548463, + "grad_norm": 2.8289129734039307, + "learning_rate": 4.674462775590244e-06, + "loss": 0.5878, + "step": 2168 + }, + { + "epoch": 1.025531914893617, + "grad_norm": 2.8824517726898193, + "learning_rate": 4.6741548925106325e-06, + "loss": 0.4392, + "step": 2169 + }, + { + "epoch": 1.0260047281323876, + "grad_norm": 2.7044589519500732, + "learning_rate": 4.673846874055928e-06, + "loss": 0.5264, + "step": 2170 + }, + { + "epoch": 1.0264775413711584, + "grad_norm": 2.575035810470581, + "learning_rate": 4.673538720245312e-06, + "loss": 0.4615, + "step": 2171 + }, + { + "epoch": 1.026950354609929, + "grad_norm": 2.48168683052063, + "learning_rate": 4.67323043109797e-06, + "loss": 0.4404, + "step": 2172 + }, + { + "epoch": 1.0274231678486998, + "grad_norm": 2.926593065261841, + "learning_rate": 4.672922006633098e-06, + "loss": 0.54, + "step": 2173 + }, + { + "epoch": 1.0278959810874704, + "grad_norm": 2.4610698223114014, + "learning_rate": 4.672613446869901e-06, + "loss": 0.5555, + "step": 2174 + }, + { + "epoch": 1.0283687943262412, + "grad_norm": 3.026901960372925, + "learning_rate": 4.672304751827592e-06, + "loss": 0.62, + "step": 2175 + }, + { + "epoch": 1.0288416075650118, + "grad_norm": 2.3946213722229004, + "learning_rate": 4.671995921525391e-06, + "loss": 0.5228, + "step": 2176 + }, + { + "epoch": 1.0293144208037825, + "grad_norm": 2.985020399093628, + "learning_rate": 4.671686955982528e-06, + "loss": 0.6256, + "step": 2177 + }, + { + "epoch": 1.0297872340425531, + "grad_norm": 3.0910139083862305, + "learning_rate": 4.671377855218239e-06, + "loss": 0.5893, + "step": 2178 + }, + { + "epoch": 1.030260047281324, + "grad_norm": 2.507805109024048, + "learning_rate": 4.6710686192517744e-06, + "loss": 0.5329, + "step": 2179 + }, + { + "epoch": 1.0307328605200945, + "grad_norm": 2.4514641761779785, + "learning_rate": 4.670759248102386e-06, + "loss": 0.4585, + "step": 2180 + }, + { + "epoch": 1.0312056737588653, + "grad_norm": 2.742838144302368, + "learning_rate": 4.670449741789337e-06, + "loss": 0.6255, + "step": 2181 + }, + { + "epoch": 1.0316784869976359, + "grad_norm": 2.374349594116211, + "learning_rate": 4.670140100331901e-06, + "loss": 0.5049, + "step": 2182 + }, + { + "epoch": 1.0321513002364067, + "grad_norm": 2.78894305229187, + "learning_rate": 4.669830323749356e-06, + "loss": 0.6061, + "step": 2183 + }, + { + "epoch": 1.0326241134751772, + "grad_norm": 2.7195091247558594, + "learning_rate": 4.6695204120609905e-06, + "loss": 0.592, + "step": 2184 + }, + { + "epoch": 1.033096926713948, + "grad_norm": 2.824411630630493, + "learning_rate": 4.6692103652861035e-06, + "loss": 0.5666, + 
"step": 2185 + }, + { + "epoch": 1.0335697399527186, + "grad_norm": 2.4981014728546143, + "learning_rate": 4.6689001834439975e-06, + "loss": 0.5045, + "step": 2186 + }, + { + "epoch": 1.0340425531914894, + "grad_norm": 2.7375214099884033, + "learning_rate": 4.668589866553988e-06, + "loss": 0.5305, + "step": 2187 + }, + { + "epoch": 1.03451536643026, + "grad_norm": 2.625345468521118, + "learning_rate": 4.668279414635396e-06, + "loss": 0.4819, + "step": 2188 + }, + { + "epoch": 1.0349881796690308, + "grad_norm": 2.60479736328125, + "learning_rate": 4.667968827707553e-06, + "loss": 0.55, + "step": 2189 + }, + { + "epoch": 1.0354609929078014, + "grad_norm": 2.642014741897583, + "learning_rate": 4.667658105789797e-06, + "loss": 0.5264, + "step": 2190 + }, + { + "epoch": 1.0359338061465722, + "grad_norm": 2.5439083576202393, + "learning_rate": 4.667347248901476e-06, + "loss": 0.4657, + "step": 2191 + }, + { + "epoch": 1.0364066193853427, + "grad_norm": 2.5537586212158203, + "learning_rate": 4.667036257061945e-06, + "loss": 0.527, + "step": 2192 + }, + { + "epoch": 1.0368794326241135, + "grad_norm": 2.595466375350952, + "learning_rate": 4.666725130290569e-06, + "loss": 0.5336, + "step": 2193 + }, + { + "epoch": 1.037352245862884, + "grad_norm": 3.5106313228607178, + "learning_rate": 4.666413868606719e-06, + "loss": 0.5176, + "step": 2194 + }, + { + "epoch": 1.037825059101655, + "grad_norm": 2.931553363800049, + "learning_rate": 4.666102472029778e-06, + "loss": 0.549, + "step": 2195 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 2.4325125217437744, + "learning_rate": 4.665790940579133e-06, + "loss": 0.5095, + "step": 2196 + }, + { + "epoch": 1.0387706855791963, + "grad_norm": 2.708477258682251, + "learning_rate": 4.665479274274184e-06, + "loss": 0.5264, + "step": 2197 + }, + { + "epoch": 1.0392434988179668, + "grad_norm": 2.905977487564087, + "learning_rate": 4.665167473134335e-06, + "loss": 0.5575, + "step": 2198 + }, + { + "epoch": 1.0397163120567376, + "grad_norm": 2.428938865661621, + "learning_rate": 4.664855537179003e-06, + "loss": 0.5099, + "step": 2199 + }, + { + "epoch": 1.0401891252955082, + "grad_norm": 2.8432137966156006, + "learning_rate": 4.6645434664276075e-06, + "loss": 0.5331, + "step": 2200 + }, + { + "epoch": 1.040661938534279, + "grad_norm": 2.5185136795043945, + "learning_rate": 4.6642312608995825e-06, + "loss": 0.5217, + "step": 2201 + }, + { + "epoch": 1.0411347517730496, + "grad_norm": 2.556607723236084, + "learning_rate": 4.663918920614366e-06, + "loss": 0.4431, + "step": 2202 + }, + { + "epoch": 1.0416075650118204, + "grad_norm": 3.1271166801452637, + "learning_rate": 4.663606445591407e-06, + "loss": 0.5398, + "step": 2203 + }, + { + "epoch": 1.042080378250591, + "grad_norm": 2.573680877685547, + "learning_rate": 4.663293835850162e-06, + "loss": 0.4713, + "step": 2204 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 2.5230324268341064, + "learning_rate": 4.662981091410096e-06, + "loss": 0.5571, + "step": 2205 + }, + { + "epoch": 1.0430260047281323, + "grad_norm": 2.552182912826538, + "learning_rate": 4.662668212290681e-06, + "loss": 0.5173, + "step": 2206 + }, + { + "epoch": 1.0434988179669031, + "grad_norm": 2.832345724105835, + "learning_rate": 4.6623551985113995e-06, + "loss": 0.525, + "step": 2207 + }, + { + "epoch": 1.0439716312056737, + "grad_norm": 2.9729080200195312, + "learning_rate": 4.6620420500917416e-06, + "loss": 0.6308, + "step": 2208 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 2.618187665939331, + "learning_rate": 
4.661728767051206e-06, + "loss": 0.4942, + "step": 2209 + }, + { + "epoch": 1.044917257683215, + "grad_norm": 2.515566349029541, + "learning_rate": 4.661415349409299e-06, + "loss": 0.5229, + "step": 2210 + }, + { + "epoch": 1.0453900709219859, + "grad_norm": 2.8651459217071533, + "learning_rate": 4.6611017971855356e-06, + "loss": 0.5029, + "step": 2211 + }, + { + "epoch": 1.0458628841607565, + "grad_norm": 2.502405881881714, + "learning_rate": 4.660788110399439e-06, + "loss": 0.4732, + "step": 2212 + }, + { + "epoch": 1.0463356973995273, + "grad_norm": 2.540668249130249, + "learning_rate": 4.660474289070541e-06, + "loss": 0.547, + "step": 2213 + }, + { + "epoch": 1.0468085106382978, + "grad_norm": 2.803469181060791, + "learning_rate": 4.660160333218384e-06, + "loss": 0.5441, + "step": 2214 + }, + { + "epoch": 1.0472813238770686, + "grad_norm": 3.233325481414795, + "learning_rate": 4.659846242862514e-06, + "loss": 0.4457, + "step": 2215 + }, + { + "epoch": 1.0477541371158392, + "grad_norm": 2.549548387527466, + "learning_rate": 4.659532018022489e-06, + "loss": 0.5684, + "step": 2216 + }, + { + "epoch": 1.04822695035461, + "grad_norm": 2.6112852096557617, + "learning_rate": 4.659217658717875e-06, + "loss": 0.5323, + "step": 2217 + }, + { + "epoch": 1.0486997635933806, + "grad_norm": 2.347418785095215, + "learning_rate": 4.658903164968245e-06, + "loss": 0.5349, + "step": 2218 + }, + { + "epoch": 1.0491725768321514, + "grad_norm": 2.695502281188965, + "learning_rate": 4.658588536793182e-06, + "loss": 0.4883, + "step": 2219 + }, + { + "epoch": 1.049645390070922, + "grad_norm": 2.7575674057006836, + "learning_rate": 4.658273774212275e-06, + "loss": 0.5517, + "step": 2220 + }, + { + "epoch": 1.0501182033096927, + "grad_norm": 2.787855386734009, + "learning_rate": 4.6579588772451245e-06, + "loss": 0.5744, + "step": 2221 + }, + { + "epoch": 1.0505910165484633, + "grad_norm": 3.0699398517608643, + "learning_rate": 4.657643845911337e-06, + "loss": 0.5258, + "step": 2222 + }, + { + "epoch": 1.0510638297872341, + "grad_norm": 2.652040719985962, + "learning_rate": 4.657328680230527e-06, + "loss": 0.5141, + "step": 2223 + }, + { + "epoch": 1.0515366430260047, + "grad_norm": 2.6896369457244873, + "learning_rate": 4.657013380222322e-06, + "loss": 0.5139, + "step": 2224 + }, + { + "epoch": 1.0520094562647755, + "grad_norm": 2.551839590072632, + "learning_rate": 4.65669794590635e-06, + "loss": 0.5099, + "step": 2225 + }, + { + "epoch": 1.052482269503546, + "grad_norm": 2.8543262481689453, + "learning_rate": 4.656382377302255e-06, + "loss": 0.6085, + "step": 2226 + }, + { + "epoch": 1.0529550827423169, + "grad_norm": 2.871469259262085, + "learning_rate": 4.656066674429685e-06, + "loss": 0.6108, + "step": 2227 + }, + { + "epoch": 1.0534278959810874, + "grad_norm": 2.4840824604034424, + "learning_rate": 4.655750837308296e-06, + "loss": 0.4994, + "step": 2228 + }, + { + "epoch": 1.0539007092198582, + "grad_norm": 2.5203280448913574, + "learning_rate": 4.6554348659577555e-06, + "loss": 0.4928, + "step": 2229 + }, + { + "epoch": 1.0543735224586288, + "grad_norm": 2.9327683448791504, + "learning_rate": 4.655118760397737e-06, + "loss": 0.6324, + "step": 2230 + }, + { + "epoch": 1.0548463356973996, + "grad_norm": 2.6766855716705322, + "learning_rate": 4.654802520647924e-06, + "loss": 0.5178, + "step": 2231 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 2.8438873291015625, + "learning_rate": 4.654486146728006e-06, + "loss": 0.509, + "step": 2232 + }, + { + "epoch": 1.055791962174941, + "grad_norm": 
2.538661241531372, + "learning_rate": 4.6541696386576826e-06, + "loss": 0.5463, + "step": 2233 + }, + { + "epoch": 1.0562647754137116, + "grad_norm": 2.829030990600586, + "learning_rate": 4.653852996456662e-06, + "loss": 0.5404, + "step": 2234 + }, + { + "epoch": 1.0567375886524824, + "grad_norm": 2.5657269954681396, + "learning_rate": 4.653536220144659e-06, + "loss": 0.5479, + "step": 2235 + }, + { + "epoch": 1.057210401891253, + "grad_norm": 2.6641297340393066, + "learning_rate": 4.653219309741399e-06, + "loss": 0.5503, + "step": 2236 + }, + { + "epoch": 1.0576832151300237, + "grad_norm": 2.966350555419922, + "learning_rate": 4.652902265266615e-06, + "loss": 0.6404, + "step": 2237 + }, + { + "epoch": 1.0581560283687943, + "grad_norm": 2.462430000305176, + "learning_rate": 4.6525850867400455e-06, + "loss": 0.4885, + "step": 2238 + }, + { + "epoch": 1.058628841607565, + "grad_norm": 2.1791880130767822, + "learning_rate": 4.652267774181443e-06, + "loss": 0.4405, + "step": 2239 + }, + { + "epoch": 1.0591016548463357, + "grad_norm": 2.5473732948303223, + "learning_rate": 4.651950327610563e-06, + "loss": 0.5295, + "step": 2240 + }, + { + "epoch": 1.0595744680851065, + "grad_norm": 2.70904803276062, + "learning_rate": 4.651632747047172e-06, + "loss": 0.5169, + "step": 2241 + }, + { + "epoch": 1.060047281323877, + "grad_norm": 3.8442928791046143, + "learning_rate": 4.651315032511045e-06, + "loss": 0.5473, + "step": 2242 + }, + { + "epoch": 1.0605200945626478, + "grad_norm": 2.8613383769989014, + "learning_rate": 4.650997184021963e-06, + "loss": 0.5445, + "step": 2243 + }, + { + "epoch": 1.0609929078014184, + "grad_norm": 2.5995829105377197, + "learning_rate": 4.6506792015997184e-06, + "loss": 0.5525, + "step": 2244 + }, + { + "epoch": 1.0614657210401892, + "grad_norm": 2.5465996265411377, + "learning_rate": 4.650361085264111e-06, + "loss": 0.5093, + "step": 2245 + }, + { + "epoch": 1.0619385342789598, + "grad_norm": 2.46553111076355, + "learning_rate": 4.650042835034948e-06, + "loss": 0.5375, + "step": 2246 + }, + { + "epoch": 1.0624113475177306, + "grad_norm": 2.6907830238342285, + "learning_rate": 4.649724450932045e-06, + "loss": 0.572, + "step": 2247 + }, + { + "epoch": 1.0628841607565012, + "grad_norm": 3.0671346187591553, + "learning_rate": 4.649405932975226e-06, + "loss": 0.4974, + "step": 2248 + }, + { + "epoch": 1.063356973995272, + "grad_norm": 2.5392491817474365, + "learning_rate": 4.649087281184325e-06, + "loss": 0.524, + "step": 2249 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 2.7498562335968018, + "learning_rate": 4.648768495579183e-06, + "loss": 0.5801, + "step": 2250 + }, + { + "epoch": 1.0643026004728133, + "grad_norm": 2.8536248207092285, + "learning_rate": 4.648449576179649e-06, + "loss": 0.5384, + "step": 2251 + }, + { + "epoch": 1.064775413711584, + "grad_norm": 2.7062792778015137, + "learning_rate": 4.64813052300558e-06, + "loss": 0.5262, + "step": 2252 + }, + { + "epoch": 1.0652482269503547, + "grad_norm": 2.798650026321411, + "learning_rate": 4.647811336076841e-06, + "loss": 0.5719, + "step": 2253 + }, + { + "epoch": 1.0657210401891253, + "grad_norm": 2.9793951511383057, + "learning_rate": 4.647492015413311e-06, + "loss": 0.5377, + "step": 2254 + }, + { + "epoch": 1.066193853427896, + "grad_norm": 2.572129011154175, + "learning_rate": 4.647172561034868e-06, + "loss": 0.4791, + "step": 2255 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 3.7490930557250977, + "learning_rate": 4.646852972961405e-06, + "loss": 0.5423, + "step": 2256 + }, + { + "epoch": 
1.0671394799054374, + "grad_norm": 2.626255750656128, + "learning_rate": 4.646533251212821e-06, + "loss": 0.5558, + "step": 2257 + }, + { + "epoch": 1.067612293144208, + "grad_norm": 2.8408126831054688, + "learning_rate": 4.646213395809023e-06, + "loss": 0.55, + "step": 2258 + }, + { + "epoch": 1.0680851063829788, + "grad_norm": 3.255606174468994, + "learning_rate": 4.645893406769929e-06, + "loss": 0.547, + "step": 2259 + }, + { + "epoch": 1.0685579196217494, + "grad_norm": 2.4352102279663086, + "learning_rate": 4.645573284115461e-06, + "loss": 0.4898, + "step": 2260 + }, + { + "epoch": 1.0690307328605202, + "grad_norm": 2.408634662628174, + "learning_rate": 4.6452530278655535e-06, + "loss": 0.5264, + "step": 2261 + }, + { + "epoch": 1.0695035460992908, + "grad_norm": 2.4220449924468994, + "learning_rate": 4.644932638040146e-06, + "loss": 0.5166, + "step": 2262 + }, + { + "epoch": 1.0699763593380616, + "grad_norm": 2.9188082218170166, + "learning_rate": 4.644612114659188e-06, + "loss": 0.5611, + "step": 2263 + }, + { + "epoch": 1.0704491725768321, + "grad_norm": 2.906557083129883, + "learning_rate": 4.644291457742638e-06, + "loss": 0.5515, + "step": 2264 + }, + { + "epoch": 1.070921985815603, + "grad_norm": 2.9039015769958496, + "learning_rate": 4.643970667310462e-06, + "loss": 0.5732, + "step": 2265 + }, + { + "epoch": 1.0713947990543735, + "grad_norm": 2.9985480308532715, + "learning_rate": 4.643649743382632e-06, + "loss": 0.563, + "step": 2266 + }, + { + "epoch": 1.0718676122931443, + "grad_norm": 2.5780906677246094, + "learning_rate": 4.6433286859791335e-06, + "loss": 0.502, + "step": 2267 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 2.590209722518921, + "learning_rate": 4.643007495119955e-06, + "loss": 0.4995, + "step": 2268 + }, + { + "epoch": 1.0728132387706855, + "grad_norm": 2.378894805908203, + "learning_rate": 4.642686170825097e-06, + "loss": 0.4886, + "step": 2269 + }, + { + "epoch": 1.0732860520094563, + "grad_norm": 2.6826229095458984, + "learning_rate": 4.642364713114567e-06, + "loss": 0.465, + "step": 2270 + }, + { + "epoch": 1.073758865248227, + "grad_norm": 2.627819538116455, + "learning_rate": 4.64204312200838e-06, + "loss": 0.4954, + "step": 2271 + }, + { + "epoch": 1.0742316784869976, + "grad_norm": 2.993021249771118, + "learning_rate": 4.641721397526561e-06, + "loss": 0.5073, + "step": 2272 + }, + { + "epoch": 1.0747044917257682, + "grad_norm": 2.719052791595459, + "learning_rate": 4.64139953968914e-06, + "loss": 0.538, + "step": 2273 + }, + { + "epoch": 1.075177304964539, + "grad_norm": 2.729252576828003, + "learning_rate": 4.6410775485161605e-06, + "loss": 0.552, + "step": 2274 + }, + { + "epoch": 1.0756501182033098, + "grad_norm": 2.924142599105835, + "learning_rate": 4.640755424027671e-06, + "loss": 0.522, + "step": 2275 + }, + { + "epoch": 1.0761229314420804, + "grad_norm": 3.329162120819092, + "learning_rate": 4.640433166243728e-06, + "loss": 0.5965, + "step": 2276 + }, + { + "epoch": 1.076595744680851, + "grad_norm": 2.9810245037078857, + "learning_rate": 4.640110775184396e-06, + "loss": 0.5653, + "step": 2277 + }, + { + "epoch": 1.0770685579196217, + "grad_norm": 2.61772084236145, + "learning_rate": 4.639788250869751e-06, + "loss": 0.5382, + "step": 2278 + }, + { + "epoch": 1.0775413711583925, + "grad_norm": 2.741225004196167, + "learning_rate": 4.639465593319874e-06, + "loss": 0.4866, + "step": 2279 + }, + { + "epoch": 1.0780141843971631, + "grad_norm": 2.7945218086242676, + "learning_rate": 4.639142802554856e-06, + "loss": 0.4711, + "step": 2280 + 
}, + { + "epoch": 1.0784869976359337, + "grad_norm": 2.4282329082489014, + "learning_rate": 4.638819878594795e-06, + "loss": 0.4911, + "step": 2281 + }, + { + "epoch": 1.0789598108747045, + "grad_norm": 2.551741361618042, + "learning_rate": 4.638496821459799e-06, + "loss": 0.453, + "step": 2282 + }, + { + "epoch": 1.0794326241134753, + "grad_norm": 2.5622754096984863, + "learning_rate": 4.638173631169983e-06, + "loss": 0.5983, + "step": 2283 + }, + { + "epoch": 1.0799054373522459, + "grad_norm": 2.7748284339904785, + "learning_rate": 4.6378503077454715e-06, + "loss": 0.5143, + "step": 2284 + }, + { + "epoch": 1.0803782505910164, + "grad_norm": 2.7693238258361816, + "learning_rate": 4.637526851206394e-06, + "loss": 0.5929, + "step": 2285 + }, + { + "epoch": 1.0808510638297872, + "grad_norm": 2.705548048019409, + "learning_rate": 4.637203261572893e-06, + "loss": 0.5577, + "step": 2286 + }, + { + "epoch": 1.081323877068558, + "grad_norm": 2.739307165145874, + "learning_rate": 4.636879538865117e-06, + "loss": 0.5676, + "step": 2287 + }, + { + "epoch": 1.0817966903073286, + "grad_norm": 2.514059543609619, + "learning_rate": 4.636555683103221e-06, + "loss": 0.5001, + "step": 2288 + }, + { + "epoch": 1.0822695035460992, + "grad_norm": 2.7166874408721924, + "learning_rate": 4.636231694307372e-06, + "loss": 0.5411, + "step": 2289 + }, + { + "epoch": 1.08274231678487, + "grad_norm": 2.7661683559417725, + "learning_rate": 4.635907572497741e-06, + "loss": 0.6353, + "step": 2290 + }, + { + "epoch": 1.0832151300236406, + "grad_norm": 2.598381996154785, + "learning_rate": 4.635583317694512e-06, + "loss": 0.5213, + "step": 2291 + }, + { + "epoch": 1.0836879432624114, + "grad_norm": 2.821491003036499, + "learning_rate": 4.6352589299178744e-06, + "loss": 0.6172, + "step": 2292 + }, + { + "epoch": 1.084160756501182, + "grad_norm": 2.5422823429107666, + "learning_rate": 4.634934409188025e-06, + "loss": 0.5245, + "step": 2293 + }, + { + "epoch": 1.0846335697399527, + "grad_norm": 2.8264620304107666, + "learning_rate": 4.634609755525173e-06, + "loss": 0.5004, + "step": 2294 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 2.3286643028259277, + "learning_rate": 4.63428496894953e-06, + "loss": 0.4561, + "step": 2295 + }, + { + "epoch": 1.085579196217494, + "grad_norm": 2.462005376815796, + "learning_rate": 4.633960049481321e-06, + "loss": 0.4948, + "step": 2296 + }, + { + "epoch": 1.0860520094562647, + "grad_norm": 2.760258913040161, + "learning_rate": 4.633634997140777e-06, + "loss": 0.5407, + "step": 2297 + }, + { + "epoch": 1.0865248226950355, + "grad_norm": 3.0234217643737793, + "learning_rate": 4.633309811948138e-06, + "loss": 0.4914, + "step": 2298 + }, + { + "epoch": 1.086997635933806, + "grad_norm": 2.8380849361419678, + "learning_rate": 4.63298449392365e-06, + "loss": 0.5562, + "step": 2299 + }, + { + "epoch": 1.0874704491725768, + "grad_norm": 2.6201648712158203, + "learning_rate": 4.632659043087572e-06, + "loss": 0.5882, + "step": 2300 + }, + { + "epoch": 1.0879432624113474, + "grad_norm": 2.586339235305786, + "learning_rate": 4.632333459460165e-06, + "loss": 0.4991, + "step": 2301 + }, + { + "epoch": 1.0884160756501182, + "grad_norm": 2.500115394592285, + "learning_rate": 4.632007743061705e-06, + "loss": 0.552, + "step": 2302 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 2.816390037536621, + "learning_rate": 4.63168189391247e-06, + "loss": 0.5301, + "step": 2303 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 2.975400924682617, + "learning_rate": 4.631355912032753e-06, + "loss": 
0.6056, + "step": 2304 + }, + { + "epoch": 1.0898345153664302, + "grad_norm": 2.747985363006592, + "learning_rate": 4.631029797442846e-06, + "loss": 0.5335, + "step": 2305 + }, + { + "epoch": 1.090307328605201, + "grad_norm": 2.609281539916992, + "learning_rate": 4.630703550163059e-06, + "loss": 0.5189, + "step": 2306 + }, + { + "epoch": 1.0907801418439715, + "grad_norm": 2.624131202697754, + "learning_rate": 4.630377170213705e-06, + "loss": 0.5646, + "step": 2307 + }, + { + "epoch": 1.0912529550827423, + "grad_norm": 2.6186959743499756, + "learning_rate": 4.630050657615107e-06, + "loss": 0.5187, + "step": 2308 + }, + { + "epoch": 1.091725768321513, + "grad_norm": 2.9961764812469482, + "learning_rate": 4.629724012387594e-06, + "loss": 0.6207, + "step": 2309 + }, + { + "epoch": 1.0921985815602837, + "grad_norm": 2.665799140930176, + "learning_rate": 4.629397234551505e-06, + "loss": 0.5046, + "step": 2310 + }, + { + "epoch": 1.0926713947990543, + "grad_norm": 2.6154725551605225, + "learning_rate": 4.629070324127187e-06, + "loss": 0.5553, + "step": 2311 + }, + { + "epoch": 1.093144208037825, + "grad_norm": 2.702967643737793, + "learning_rate": 4.628743281134996e-06, + "loss": 0.5159, + "step": 2312 + }, + { + "epoch": 1.0936170212765957, + "grad_norm": 2.578080177307129, + "learning_rate": 4.628416105595295e-06, + "loss": 0.4934, + "step": 2313 + }, + { + "epoch": 1.0940898345153665, + "grad_norm": 2.8763060569763184, + "learning_rate": 4.628088797528456e-06, + "loss": 0.5404, + "step": 2314 + }, + { + "epoch": 1.094562647754137, + "grad_norm": 2.5301198959350586, + "learning_rate": 4.6277613569548585e-06, + "loss": 0.524, + "step": 2315 + }, + { + "epoch": 1.0950354609929078, + "grad_norm": 2.559903144836426, + "learning_rate": 4.627433783894892e-06, + "loss": 0.5177, + "step": 2316 + }, + { + "epoch": 1.0955082742316784, + "grad_norm": 2.430863380432129, + "learning_rate": 4.627106078368952e-06, + "loss": 0.5368, + "step": 2317 + }, + { + "epoch": 1.0959810874704492, + "grad_norm": 2.687567949295044, + "learning_rate": 4.626778240397444e-06, + "loss": 0.5385, + "step": 2318 + }, + { + "epoch": 1.0964539007092198, + "grad_norm": 3.053466558456421, + "learning_rate": 4.62645027000078e-06, + "loss": 0.5814, + "step": 2319 + }, + { + "epoch": 1.0969267139479906, + "grad_norm": 2.4612979888916016, + "learning_rate": 4.6261221671993815e-06, + "loss": 0.5069, + "step": 2320 + }, + { + "epoch": 1.0973995271867611, + "grad_norm": 2.6153628826141357, + "learning_rate": 4.625793932013679e-06, + "loss": 0.5422, + "step": 2321 + }, + { + "epoch": 1.097872340425532, + "grad_norm": 2.8918874263763428, + "learning_rate": 4.62546556446411e-06, + "loss": 0.5326, + "step": 2322 + }, + { + "epoch": 1.0983451536643025, + "grad_norm": 3.62565279006958, + "learning_rate": 4.625137064571119e-06, + "loss": 0.5164, + "step": 2323 + }, + { + "epoch": 1.0988179669030733, + "grad_norm": 2.4285085201263428, + "learning_rate": 4.624808432355164e-06, + "loss": 0.5084, + "step": 2324 + }, + { + "epoch": 1.099290780141844, + "grad_norm": 2.593979835510254, + "learning_rate": 4.624479667836702e-06, + "loss": 0.4986, + "step": 2325 + }, + { + "epoch": 1.0997635933806147, + "grad_norm": 2.490752935409546, + "learning_rate": 4.624150771036208e-06, + "loss": 0.5296, + "step": 2326 + }, + { + "epoch": 1.1002364066193853, + "grad_norm": 2.67694091796875, + "learning_rate": 4.6238217419741595e-06, + "loss": 0.5229, + "step": 2327 + }, + { + "epoch": 1.100709219858156, + "grad_norm": 2.594147205352783, + "learning_rate": 
4.623492580671044e-06, + "loss": 0.4916, + "step": 2328 + }, + { + "epoch": 1.1011820330969266, + "grad_norm": 2.943472385406494, + "learning_rate": 4.623163287147356e-06, + "loss": 0.5591, + "step": 2329 + }, + { + "epoch": 1.1016548463356974, + "grad_norm": 2.569410562515259, + "learning_rate": 4.622833861423601e-06, + "loss": 0.4648, + "step": 2330 + }, + { + "epoch": 1.102127659574468, + "grad_norm": 2.5490405559539795, + "learning_rate": 4.6225043035202886e-06, + "loss": 0.5493, + "step": 2331 + }, + { + "epoch": 1.1026004728132388, + "grad_norm": 2.5964598655700684, + "learning_rate": 4.622174613457941e-06, + "loss": 0.5358, + "step": 2332 + }, + { + "epoch": 1.1030732860520094, + "grad_norm": 2.6456820964813232, + "learning_rate": 4.621844791257085e-06, + "loss": 0.5864, + "step": 2333 + }, + { + "epoch": 1.1035460992907802, + "grad_norm": 2.861180067062378, + "learning_rate": 4.621514836938259e-06, + "loss": 0.6064, + "step": 2334 + }, + { + "epoch": 1.1040189125295508, + "grad_norm": 2.8199548721313477, + "learning_rate": 4.621184750522005e-06, + "loss": 0.5244, + "step": 2335 + }, + { + "epoch": 1.1044917257683216, + "grad_norm": 2.7398853302001953, + "learning_rate": 4.6208545320288795e-06, + "loss": 0.5496, + "step": 2336 + }, + { + "epoch": 1.1049645390070921, + "grad_norm": 2.7941031455993652, + "learning_rate": 4.620524181479441e-06, + "loss": 0.5496, + "step": 2337 + }, + { + "epoch": 1.105437352245863, + "grad_norm": 2.973785161972046, + "learning_rate": 4.620193698894259e-06, + "loss": 0.5492, + "step": 2338 + }, + { + "epoch": 1.1059101654846335, + "grad_norm": 2.650355815887451, + "learning_rate": 4.6198630842939144e-06, + "loss": 0.5392, + "step": 2339 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 2.9092214107513428, + "learning_rate": 4.61953233769899e-06, + "loss": 0.5305, + "step": 2340 + }, + { + "epoch": 1.1068557919621749, + "grad_norm": 2.6329731941223145, + "learning_rate": 4.61920145913008e-06, + "loss": 0.5031, + "step": 2341 + }, + { + "epoch": 1.1073286052009457, + "grad_norm": 2.7214207649230957, + "learning_rate": 4.618870448607788e-06, + "loss": 0.5536, + "step": 2342 + }, + { + "epoch": 1.1078014184397162, + "grad_norm": 2.873119592666626, + "learning_rate": 4.618539306152724e-06, + "loss": 0.4531, + "step": 2343 + }, + { + "epoch": 1.108274231678487, + "grad_norm": 2.701042413711548, + "learning_rate": 4.618208031785507e-06, + "loss": 0.5217, + "step": 2344 + }, + { + "epoch": 1.1087470449172576, + "grad_norm": 2.7189881801605225, + "learning_rate": 4.6178766255267635e-06, + "loss": 0.6205, + "step": 2345 + }, + { + "epoch": 1.1092198581560284, + "grad_norm": 2.546382188796997, + "learning_rate": 4.61754508739713e-06, + "loss": 0.5475, + "step": 2346 + }, + { + "epoch": 1.109692671394799, + "grad_norm": 2.8429276943206787, + "learning_rate": 4.617213417417249e-06, + "loss": 0.4809, + "step": 2347 + }, + { + "epoch": 1.1101654846335698, + "grad_norm": 2.9515812397003174, + "learning_rate": 4.616881615607772e-06, + "loss": 0.5067, + "step": 2348 + }, + { + "epoch": 1.1106382978723404, + "grad_norm": 2.5910723209381104, + "learning_rate": 4.616549681989358e-06, + "loss": 0.5368, + "step": 2349 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 2.80855655670166, + "learning_rate": 4.616217616582678e-06, + "loss": 0.5827, + "step": 2350 + }, + { + "epoch": 1.1115839243498817, + "grad_norm": 2.604383945465088, + "learning_rate": 4.6158854194084044e-06, + "loss": 0.5716, + "step": 2351 + }, + { + "epoch": 1.1120567375886525, + "grad_norm": 
3.0585904121398926, + "learning_rate": 4.6155530904872246e-06, + "loss": 0.4998, + "step": 2352 + }, + { + "epoch": 1.112529550827423, + "grad_norm": 2.660961627960205, + "learning_rate": 4.61522062983983e-06, + "loss": 0.4533, + "step": 2353 + }, + { + "epoch": 1.113002364066194, + "grad_norm": 2.8042070865631104, + "learning_rate": 4.614888037486923e-06, + "loss": 0.5592, + "step": 2354 + }, + { + "epoch": 1.1134751773049645, + "grad_norm": 2.681664228439331, + "learning_rate": 4.61455531344921e-06, + "loss": 0.5439, + "step": 2355 + }, + { + "epoch": 1.1139479905437353, + "grad_norm": 2.905054807662964, + "learning_rate": 4.61422245774741e-06, + "loss": 0.5497, + "step": 2356 + }, + { + "epoch": 1.1144208037825059, + "grad_norm": 2.7979753017425537, + "learning_rate": 4.6138894704022484e-06, + "loss": 0.5374, + "step": 2357 + }, + { + "epoch": 1.1148936170212767, + "grad_norm": 2.965611696243286, + "learning_rate": 4.613556351434458e-06, + "loss": 0.5145, + "step": 2358 + }, + { + "epoch": 1.1153664302600472, + "grad_norm": 2.583134889602661, + "learning_rate": 4.613223100864782e-06, + "loss": 0.535, + "step": 2359 + }, + { + "epoch": 1.115839243498818, + "grad_norm": 2.5979621410369873, + "learning_rate": 4.61288971871397e-06, + "loss": 0.5514, + "step": 2360 + }, + { + "epoch": 1.1163120567375886, + "grad_norm": 3.0117669105529785, + "learning_rate": 4.612556205002779e-06, + "loss": 0.5266, + "step": 2361 + }, + { + "epoch": 1.1167848699763594, + "grad_norm": 2.425133466720581, + "learning_rate": 4.612222559751976e-06, + "loss": 0.4838, + "step": 2362 + }, + { + "epoch": 1.11725768321513, + "grad_norm": 2.5102691650390625, + "learning_rate": 4.611888782982337e-06, + "loss": 0.3947, + "step": 2363 + }, + { + "epoch": 1.1177304964539008, + "grad_norm": 3.0327367782592773, + "learning_rate": 4.611554874714645e-06, + "loss": 0.5753, + "step": 2364 + }, + { + "epoch": 1.1182033096926713, + "grad_norm": 2.4561009407043457, + "learning_rate": 4.6112208349696875e-06, + "loss": 0.5054, + "step": 2365 + }, + { + "epoch": 1.1186761229314421, + "grad_norm": 3.3898050785064697, + "learning_rate": 4.610886663768267e-06, + "loss": 0.5946, + "step": 2366 + }, + { + "epoch": 1.1191489361702127, + "grad_norm": 2.8112242221832275, + "learning_rate": 4.61055236113119e-06, + "loss": 0.5475, + "step": 2367 + }, + { + "epoch": 1.1196217494089835, + "grad_norm": 3.152946710586548, + "learning_rate": 4.610217927079272e-06, + "loss": 0.5165, + "step": 2368 + }, + { + "epoch": 1.120094562647754, + "grad_norm": 2.7847867012023926, + "learning_rate": 4.609883361633336e-06, + "loss": 0.5533, + "step": 2369 + }, + { + "epoch": 1.1205673758865249, + "grad_norm": 2.6376686096191406, + "learning_rate": 4.6095486648142155e-06, + "loss": 0.4942, + "step": 2370 + }, + { + "epoch": 1.1210401891252955, + "grad_norm": 3.123072862625122, + "learning_rate": 4.609213836642749e-06, + "loss": 0.616, + "step": 2371 + }, + { + "epoch": 1.1215130023640663, + "grad_norm": 2.802694320678711, + "learning_rate": 4.608878877139786e-06, + "loss": 0.5323, + "step": 2372 + }, + { + "epoch": 1.1219858156028368, + "grad_norm": 2.3567938804626465, + "learning_rate": 4.6085437863261825e-06, + "loss": 0.4822, + "step": 2373 + }, + { + "epoch": 1.1224586288416076, + "grad_norm": 2.553112030029297, + "learning_rate": 4.608208564222804e-06, + "loss": 0.5447, + "step": 2374 + }, + { + "epoch": 1.1229314420803782, + "grad_norm": 3.0020132064819336, + "learning_rate": 4.607873210850521e-06, + "loss": 0.6486, + "step": 2375 + }, + { + "epoch": 
1.123404255319149, + "grad_norm": 2.832442045211792, + "learning_rate": 4.607537726230216e-06, + "loss": 0.5257, + "step": 2376 + }, + { + "epoch": 1.1238770685579196, + "grad_norm": 2.471527099609375, + "learning_rate": 4.607202110382778e-06, + "loss": 0.4816, + "step": 2377 + }, + { + "epoch": 1.1243498817966904, + "grad_norm": 2.4232118129730225, + "learning_rate": 4.606866363329105e-06, + "loss": 0.5533, + "step": 2378 + }, + { + "epoch": 1.124822695035461, + "grad_norm": 2.477506637573242, + "learning_rate": 4.6065304850901025e-06, + "loss": 0.5223, + "step": 2379 + }, + { + "epoch": 1.1252955082742317, + "grad_norm": 3.54127836227417, + "learning_rate": 4.6061944756866824e-06, + "loss": 0.6514, + "step": 2380 + }, + { + "epoch": 1.1257683215130023, + "grad_norm": 2.5148677825927734, + "learning_rate": 4.605858335139768e-06, + "loss": 0.4864, + "step": 2381 + }, + { + "epoch": 1.1262411347517731, + "grad_norm": 2.8363659381866455, + "learning_rate": 4.605522063470289e-06, + "loss": 0.5034, + "step": 2382 + }, + { + "epoch": 1.1267139479905437, + "grad_norm": 2.4996654987335205, + "learning_rate": 4.605185660699184e-06, + "loss": 0.4126, + "step": 2383 + }, + { + "epoch": 1.1271867612293145, + "grad_norm": 2.352543830871582, + "learning_rate": 4.604849126847398e-06, + "loss": 0.5224, + "step": 2384 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 2.60101056098938, + "learning_rate": 4.6045124619358875e-06, + "loss": 0.4867, + "step": 2385 + }, + { + "epoch": 1.1281323877068559, + "grad_norm": 2.9471068382263184, + "learning_rate": 4.604175665985613e-06, + "loss": 0.6474, + "step": 2386 + }, + { + "epoch": 1.1286052009456264, + "grad_norm": 2.5933351516723633, + "learning_rate": 4.603838739017546e-06, + "loss": 0.5081, + "step": 2387 + }, + { + "epoch": 1.1290780141843972, + "grad_norm": 2.3740346431732178, + "learning_rate": 4.6035016810526665e-06, + "loss": 0.4438, + "step": 2388 + }, + { + "epoch": 1.1295508274231678, + "grad_norm": 2.675020217895508, + "learning_rate": 4.6031644921119614e-06, + "loss": 0.4968, + "step": 2389 + }, + { + "epoch": 1.1300236406619386, + "grad_norm": 2.599472999572754, + "learning_rate": 4.602827172216424e-06, + "loss": 0.5131, + "step": 2390 + }, + { + "epoch": 1.1304964539007092, + "grad_norm": 2.8176097869873047, + "learning_rate": 4.602489721387061e-06, + "loss": 0.5549, + "step": 2391 + }, + { + "epoch": 1.13096926713948, + "grad_norm": 2.466914176940918, + "learning_rate": 4.602152139644881e-06, + "loss": 0.5052, + "step": 2392 + }, + { + "epoch": 1.1314420803782506, + "grad_norm": 2.8938796520233154, + "learning_rate": 4.601814427010905e-06, + "loss": 0.6181, + "step": 2393 + }, + { + "epoch": 1.1319148936170214, + "grad_norm": 2.7390825748443604, + "learning_rate": 4.601476583506161e-06, + "loss": 0.5178, + "step": 2394 + }, + { + "epoch": 1.132387706855792, + "grad_norm": 3.180112838745117, + "learning_rate": 4.601138609151685e-06, + "loss": 0.6071, + "step": 2395 + }, + { + "epoch": 1.1328605200945627, + "grad_norm": 2.9282350540161133, + "learning_rate": 4.600800503968521e-06, + "loss": 0.5557, + "step": 2396 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 2.6689717769622803, + "learning_rate": 4.6004622679777215e-06, + "loss": 0.4679, + "step": 2397 + }, + { + "epoch": 1.133806146572104, + "grad_norm": 2.651582956314087, + "learning_rate": 4.600123901200347e-06, + "loss": 0.4907, + "step": 2398 + }, + { + "epoch": 1.1342789598108747, + "grad_norm": 2.5702924728393555, + "learning_rate": 4.599785403657464e-06, + "loss": 0.4919, + 
"step": 2399 + }, + { + "epoch": 1.1347517730496455, + "grad_norm": 2.636812448501587, + "learning_rate": 4.599446775370153e-06, + "loss": 0.5091, + "step": 2400 + }, + { + "epoch": 1.135224586288416, + "grad_norm": 2.5965442657470703, + "learning_rate": 4.599108016359497e-06, + "loss": 0.5035, + "step": 2401 + }, + { + "epoch": 1.1356973995271868, + "grad_norm": 2.689732313156128, + "learning_rate": 4.5987691266465885e-06, + "loss": 0.5307, + "step": 2402 + }, + { + "epoch": 1.1361702127659574, + "grad_norm": 2.7256956100463867, + "learning_rate": 4.59843010625253e-06, + "loss": 0.5066, + "step": 2403 + }, + { + "epoch": 1.1366430260047282, + "grad_norm": 2.726020574569702, + "learning_rate": 4.59809095519843e-06, + "loss": 0.4805, + "step": 2404 + }, + { + "epoch": 1.1371158392434988, + "grad_norm": 2.703339099884033, + "learning_rate": 4.597751673505406e-06, + "loss": 0.4992, + "step": 2405 + }, + { + "epoch": 1.1375886524822696, + "grad_norm": 2.54455304145813, + "learning_rate": 4.5974122611945835e-06, + "loss": 0.5251, + "step": 2406 + }, + { + "epoch": 1.1380614657210402, + "grad_norm": 2.623507022857666, + "learning_rate": 4.597072718287096e-06, + "loss": 0.4831, + "step": 2407 + }, + { + "epoch": 1.138534278959811, + "grad_norm": 2.653590202331543, + "learning_rate": 4.596733044804086e-06, + "loss": 0.5646, + "step": 2408 + }, + { + "epoch": 1.1390070921985815, + "grad_norm": 2.8230600357055664, + "learning_rate": 4.5963932407667035e-06, + "loss": 0.514, + "step": 2409 + }, + { + "epoch": 1.1394799054373523, + "grad_norm": 2.6077451705932617, + "learning_rate": 4.5960533061961065e-06, + "loss": 0.4713, + "step": 2410 + }, + { + "epoch": 1.139952718676123, + "grad_norm": 2.3945798873901367, + "learning_rate": 4.595713241113461e-06, + "loss": 0.466, + "step": 2411 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 2.8100006580352783, + "learning_rate": 4.595373045539941e-06, + "loss": 0.5365, + "step": 2412 + }, + { + "epoch": 1.1408983451536643, + "grad_norm": 2.6825881004333496, + "learning_rate": 4.59503271949673e-06, + "loss": 0.4457, + "step": 2413 + }, + { + "epoch": 1.141371158392435, + "grad_norm": 2.969435691833496, + "learning_rate": 4.594692263005016e-06, + "loss": 0.5459, + "step": 2414 + }, + { + "epoch": 1.1418439716312057, + "grad_norm": 2.4103164672851562, + "learning_rate": 4.594351676086002e-06, + "loss": 0.4573, + "step": 2415 + }, + { + "epoch": 1.1423167848699765, + "grad_norm": 2.9450128078460693, + "learning_rate": 4.594010958760892e-06, + "loss": 0.5529, + "step": 2416 + }, + { + "epoch": 1.142789598108747, + "grad_norm": 2.6416335105895996, + "learning_rate": 4.593670111050901e-06, + "loss": 0.5153, + "step": 2417 + }, + { + "epoch": 1.1432624113475178, + "grad_norm": 2.473177194595337, + "learning_rate": 4.593329132977253e-06, + "loss": 0.4962, + "step": 2418 + }, + { + "epoch": 1.1437352245862884, + "grad_norm": 2.4494502544403076, + "learning_rate": 4.592988024561179e-06, + "loss": 0.5182, + "step": 2419 + }, + { + "epoch": 1.1442080378250592, + "grad_norm": 2.773930311203003, + "learning_rate": 4.592646785823918e-06, + "loss": 0.4442, + "step": 2420 + }, + { + "epoch": 1.1446808510638298, + "grad_norm": 2.4733314514160156, + "learning_rate": 4.592305416786718e-06, + "loss": 0.5106, + "step": 2421 + }, + { + "epoch": 1.1451536643026006, + "grad_norm": 2.6870038509368896, + "learning_rate": 4.591963917470834e-06, + "loss": 0.5316, + "step": 2422 + }, + { + "epoch": 1.1456264775413711, + "grad_norm": 2.8989531993865967, + "learning_rate": 
4.591622287897529e-06, + "loss": 0.5906, + "step": 2423 + }, + { + "epoch": 1.1460992907801417, + "grad_norm": 2.6349124908447266, + "learning_rate": 4.591280528088077e-06, + "loss": 0.6225, + "step": 2424 + }, + { + "epoch": 1.1465721040189125, + "grad_norm": 3.19022274017334, + "learning_rate": 4.5909386380637555e-06, + "loss": 0.555, + "step": 2425 + }, + { + "epoch": 1.1470449172576833, + "grad_norm": 3.1473541259765625, + "learning_rate": 4.5905966178458535e-06, + "loss": 0.537, + "step": 2426 + }, + { + "epoch": 1.147517730496454, + "grad_norm": 2.6996145248413086, + "learning_rate": 4.590254467455667e-06, + "loss": 0.565, + "step": 2427 + }, + { + "epoch": 1.1479905437352245, + "grad_norm": 2.830188274383545, + "learning_rate": 4.5899121869145015e-06, + "loss": 0.6773, + "step": 2428 + }, + { + "epoch": 1.1484633569739953, + "grad_norm": 2.4937260150909424, + "learning_rate": 4.589569776243667e-06, + "loss": 0.5484, + "step": 2429 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 2.54011869430542, + "learning_rate": 4.589227235464486e-06, + "loss": 0.5307, + "step": 2430 + }, + { + "epoch": 1.1494089834515366, + "grad_norm": 2.8764214515686035, + "learning_rate": 4.5888845645982845e-06, + "loss": 0.5296, + "step": 2431 + }, + { + "epoch": 1.1498817966903072, + "grad_norm": 2.637033462524414, + "learning_rate": 4.588541763666402e-06, + "loss": 0.5975, + "step": 2432 + }, + { + "epoch": 1.150354609929078, + "grad_norm": 2.8534255027770996, + "learning_rate": 4.5881988326901815e-06, + "loss": 0.5431, + "step": 2433 + }, + { + "epoch": 1.1508274231678488, + "grad_norm": 2.8546559810638428, + "learning_rate": 4.587855771690976e-06, + "loss": 0.469, + "step": 2434 + }, + { + "epoch": 1.1513002364066194, + "grad_norm": 2.9084973335266113, + "learning_rate": 4.587512580690146e-06, + "loss": 0.5566, + "step": 2435 + }, + { + "epoch": 1.15177304964539, + "grad_norm": 3.0993130207061768, + "learning_rate": 4.587169259709063e-06, + "loss": 0.5612, + "step": 2436 + }, + { + "epoch": 1.1522458628841608, + "grad_norm": 10.847400665283203, + "learning_rate": 4.5868258087691e-06, + "loss": 0.4678, + "step": 2437 + }, + { + "epoch": 1.1527186761229316, + "grad_norm": 2.6648571491241455, + "learning_rate": 4.586482227891645e-06, + "loss": 0.5951, + "step": 2438 + }, + { + "epoch": 1.1531914893617021, + "grad_norm": 2.529043197631836, + "learning_rate": 4.586138517098091e-06, + "loss": 0.5048, + "step": 2439 + }, + { + "epoch": 1.1536643026004727, + "grad_norm": 2.833904504776001, + "learning_rate": 4.585794676409839e-06, + "loss": 0.536, + "step": 2440 + }, + { + "epoch": 1.1541371158392435, + "grad_norm": 3.507657766342163, + "learning_rate": 4.585450705848298e-06, + "loss": 0.5954, + "step": 2441 + }, + { + "epoch": 1.1546099290780143, + "grad_norm": 2.6108388900756836, + "learning_rate": 4.585106605434887e-06, + "loss": 0.5684, + "step": 2442 + }, + { + "epoch": 1.1550827423167849, + "grad_norm": 2.490708589553833, + "learning_rate": 4.58476237519103e-06, + "loss": 0.4678, + "step": 2443 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 2.8192343711853027, + "learning_rate": 4.584418015138161e-06, + "loss": 0.5291, + "step": 2444 + }, + { + "epoch": 1.1560283687943262, + "grad_norm": 3.0878679752349854, + "learning_rate": 4.584073525297722e-06, + "loss": 0.5691, + "step": 2445 + }, + { + "epoch": 1.156501182033097, + "grad_norm": 3.1444318294525146, + "learning_rate": 4.583728905691163e-06, + "loss": 0.5643, + "step": 2446 + }, + { + "epoch": 1.1569739952718676, + "grad_norm": 
3.02382230758667, + "learning_rate": 4.583384156339942e-06, + "loss": 0.6008, + "step": 2447 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 2.5942490100860596, + "learning_rate": 4.583039277265525e-06, + "loss": 0.5105, + "step": 2448 + }, + { + "epoch": 1.157919621749409, + "grad_norm": 2.938608407974243, + "learning_rate": 4.582694268489386e-06, + "loss": 0.5123, + "step": 2449 + }, + { + "epoch": 1.1583924349881798, + "grad_norm": 2.4622268676757812, + "learning_rate": 4.5823491300330075e-06, + "loss": 0.4538, + "step": 2450 + }, + { + "epoch": 1.1588652482269504, + "grad_norm": 2.4380505084991455, + "learning_rate": 4.5820038619178795e-06, + "loss": 0.4682, + "step": 2451 + }, + { + "epoch": 1.159338061465721, + "grad_norm": 2.479896068572998, + "learning_rate": 4.581658464165501e-06, + "loss": 0.4877, + "step": 2452 + }, + { + "epoch": 1.1598108747044917, + "grad_norm": 2.3373546600341797, + "learning_rate": 4.5813129367973765e-06, + "loss": 0.445, + "step": 2453 + }, + { + "epoch": 1.1602836879432625, + "grad_norm": 2.8586013317108154, + "learning_rate": 4.5809672798350214e-06, + "loss": 0.5232, + "step": 2454 + }, + { + "epoch": 1.160756501182033, + "grad_norm": 3.2302439212799072, + "learning_rate": 4.5806214932999595e-06, + "loss": 0.5336, + "step": 2455 + }, + { + "epoch": 1.1612293144208037, + "grad_norm": 3.1005783081054688, + "learning_rate": 4.580275577213721e-06, + "loss": 0.5123, + "step": 2456 + }, + { + "epoch": 1.1617021276595745, + "grad_norm": 2.7131073474884033, + "learning_rate": 4.579929531597842e-06, + "loss": 0.5648, + "step": 2457 + }, + { + "epoch": 1.1621749408983453, + "grad_norm": 2.5067050457000732, + "learning_rate": 4.579583356473874e-06, + "loss": 0.5324, + "step": 2458 + }, + { + "epoch": 1.1626477541371159, + "grad_norm": 2.7870543003082275, + "learning_rate": 4.579237051863366e-06, + "loss": 0.5094, + "step": 2459 + }, + { + "epoch": 1.1631205673758864, + "grad_norm": 2.739196300506592, + "learning_rate": 4.578890617787887e-06, + "loss": 0.5103, + "step": 2460 + }, + { + "epoch": 1.1635933806146572, + "grad_norm": 2.7108185291290283, + "learning_rate": 4.578544054269003e-06, + "loss": 0.533, + "step": 2461 + }, + { + "epoch": 1.1640661938534278, + "grad_norm": 3.028005361557007, + "learning_rate": 4.578197361328295e-06, + "loss": 0.636, + "step": 2462 + }, + { + "epoch": 1.1645390070921986, + "grad_norm": 2.4855129718780518, + "learning_rate": 4.5778505389873505e-06, + "loss": 0.501, + "step": 2463 + }, + { + "epoch": 1.1650118203309692, + "grad_norm": 2.6314198970794678, + "learning_rate": 4.577503587267764e-06, + "loss": 0.5812, + "step": 2464 + }, + { + "epoch": 1.16548463356974, + "grad_norm": 2.4209671020507812, + "learning_rate": 4.5771565061911385e-06, + "loss": 0.5168, + "step": 2465 + }, + { + "epoch": 1.1659574468085105, + "grad_norm": 2.526388645172119, + "learning_rate": 4.576809295779085e-06, + "loss": 0.5047, + "step": 2466 + }, + { + "epoch": 1.1664302600472813, + "grad_norm": 2.8278191089630127, + "learning_rate": 4.576461956053224e-06, + "loss": 0.4759, + "step": 2467 + }, + { + "epoch": 1.166903073286052, + "grad_norm": 2.7862167358398438, + "learning_rate": 4.576114487035182e-06, + "loss": 0.5492, + "step": 2468 + }, + { + "epoch": 1.1673758865248227, + "grad_norm": 2.6303019523620605, + "learning_rate": 4.575766888746594e-06, + "loss": 0.5538, + "step": 2469 + }, + { + "epoch": 1.1678486997635933, + "grad_norm": 2.613104820251465, + "learning_rate": 4.5754191612091034e-06, + "loss": 0.5114, + "step": 2470 + }, + { + "epoch": 
1.168321513002364, + "grad_norm": 2.653958320617676, + "learning_rate": 4.5750713044443625e-06, + "loss": 0.5858, + "step": 2471 + }, + { + "epoch": 1.1687943262411347, + "grad_norm": 3.1143975257873535, + "learning_rate": 4.574723318474031e-06, + "loss": 0.5193, + "step": 2472 + }, + { + "epoch": 1.1692671394799055, + "grad_norm": 3.05454421043396, + "learning_rate": 4.574375203319775e-06, + "loss": 0.464, + "step": 2473 + }, + { + "epoch": 1.169739952718676, + "grad_norm": 2.66626238822937, + "learning_rate": 4.574026959003272e-06, + "loss": 0.4988, + "step": 2474 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 2.8871963024139404, + "learning_rate": 4.573678585546203e-06, + "loss": 0.5557, + "step": 2475 + }, + { + "epoch": 1.1706855791962174, + "grad_norm": 2.592949628829956, + "learning_rate": 4.573330082970262e-06, + "loss": 0.5178, + "step": 2476 + }, + { + "epoch": 1.1711583924349882, + "grad_norm": 2.9111456871032715, + "learning_rate": 4.572981451297148e-06, + "loss": 0.5712, + "step": 2477 + }, + { + "epoch": 1.1716312056737588, + "grad_norm": 2.8152248859405518, + "learning_rate": 4.57263269054857e-06, + "loss": 0.5548, + "step": 2478 + }, + { + "epoch": 1.1721040189125296, + "grad_norm": 3.0292418003082275, + "learning_rate": 4.572283800746241e-06, + "loss": 0.5937, + "step": 2479 + }, + { + "epoch": 1.1725768321513002, + "grad_norm": 3.454618215560913, + "learning_rate": 4.571934781911886e-06, + "loss": 0.5537, + "step": 2480 + }, + { + "epoch": 1.173049645390071, + "grad_norm": 2.7817866802215576, + "learning_rate": 4.571585634067239e-06, + "loss": 0.5649, + "step": 2481 + }, + { + "epoch": 1.1735224586288415, + "grad_norm": 2.7989349365234375, + "learning_rate": 4.571236357234037e-06, + "loss": 0.5448, + "step": 2482 + }, + { + "epoch": 1.1739952718676123, + "grad_norm": 2.8863933086395264, + "learning_rate": 4.57088695143403e-06, + "loss": 0.63, + "step": 2483 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 2.5738039016723633, + "learning_rate": 4.570537416688972e-06, + "loss": 0.4702, + "step": 2484 + }, + { + "epoch": 1.1749408983451537, + "grad_norm": 3.003643274307251, + "learning_rate": 4.570187753020629e-06, + "loss": 0.5918, + "step": 2485 + }, + { + "epoch": 1.1754137115839243, + "grad_norm": 2.8619167804718018, + "learning_rate": 4.569837960450772e-06, + "loss": 0.5268, + "step": 2486 + }, + { + "epoch": 1.175886524822695, + "grad_norm": 2.876077175140381, + "learning_rate": 4.569488039001181e-06, + "loss": 0.4915, + "step": 2487 + }, + { + "epoch": 1.1763593380614656, + "grad_norm": 3.407115936279297, + "learning_rate": 4.569137988693644e-06, + "loss": 0.5761, + "step": 2488 + }, + { + "epoch": 1.1768321513002364, + "grad_norm": 2.7292826175689697, + "learning_rate": 4.568787809549958e-06, + "loss": 0.541, + "step": 2489 + }, + { + "epoch": 1.177304964539007, + "grad_norm": 2.8805999755859375, + "learning_rate": 4.568437501591926e-06, + "loss": 0.6223, + "step": 2490 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 2.9264373779296875, + "learning_rate": 4.56808706484136e-06, + "loss": 0.6081, + "step": 2491 + }, + { + "epoch": 1.1782505910165484, + "grad_norm": 2.5167033672332764, + "learning_rate": 4.567736499320082e-06, + "loss": 0.5393, + "step": 2492 + }, + { + "epoch": 1.1787234042553192, + "grad_norm": 3.4647862911224365, + "learning_rate": 4.567385805049918e-06, + "loss": 0.4826, + "step": 2493 + }, + { + "epoch": 1.1791962174940898, + "grad_norm": 2.9824202060699463, + "learning_rate": 4.5670349820527055e-06, + "loss": 0.541, + "step": 
2494 + }, + { + "epoch": 1.1796690307328606, + "grad_norm": 2.997105836868286, + "learning_rate": 4.5666840303502885e-06, + "loss": 0.5771, + "step": 2495 + }, + { + "epoch": 1.1801418439716311, + "grad_norm": 2.8728017807006836, + "learning_rate": 4.56633294996452e-06, + "loss": 0.4877, + "step": 2496 + }, + { + "epoch": 1.180614657210402, + "grad_norm": 2.626498222351074, + "learning_rate": 4.5659817409172565e-06, + "loss": 0.5296, + "step": 2497 + }, + { + "epoch": 1.1810874704491725, + "grad_norm": 2.87037992477417, + "learning_rate": 4.565630403230371e-06, + "loss": 0.539, + "step": 2498 + }, + { + "epoch": 1.1815602836879433, + "grad_norm": 2.5719685554504395, + "learning_rate": 4.5652789369257375e-06, + "loss": 0.5653, + "step": 2499 + }, + { + "epoch": 1.1820330969267139, + "grad_norm": 2.4842135906219482, + "learning_rate": 4.56492734202524e-06, + "loss": 0.515, + "step": 2500 + }, + { + "epoch": 1.1825059101654847, + "grad_norm": 2.640951156616211, + "learning_rate": 4.564575618550773e-06, + "loss": 0.5601, + "step": 2501 + }, + { + "epoch": 1.1829787234042553, + "grad_norm": 2.624394655227661, + "learning_rate": 4.564223766524234e-06, + "loss": 0.5551, + "step": 2502 + }, + { + "epoch": 1.183451536643026, + "grad_norm": 3.014537811279297, + "learning_rate": 4.563871785967533e-06, + "loss": 0.5212, + "step": 2503 + }, + { + "epoch": 1.1839243498817966, + "grad_norm": 2.8756890296936035, + "learning_rate": 4.563519676902585e-06, + "loss": 0.5132, + "step": 2504 + }, + { + "epoch": 1.1843971631205674, + "grad_norm": 2.636781692504883, + "learning_rate": 4.5631674393513145e-06, + "loss": 0.5323, + "step": 2505 + }, + { + "epoch": 1.184869976359338, + "grad_norm": 2.7233786582946777, + "learning_rate": 4.562815073335655e-06, + "loss": 0.5608, + "step": 2506 + }, + { + "epoch": 1.1853427895981088, + "grad_norm": 2.7158713340759277, + "learning_rate": 4.562462578877546e-06, + "loss": 0.5373, + "step": 2507 + }, + { + "epoch": 1.1858156028368794, + "grad_norm": 2.9754762649536133, + "learning_rate": 4.562109955998936e-06, + "loss": 0.5712, + "step": 2508 + }, + { + "epoch": 1.1862884160756502, + "grad_norm": 2.8815054893493652, + "learning_rate": 4.561757204721781e-06, + "loss": 0.6126, + "step": 2509 + }, + { + "epoch": 1.1867612293144207, + "grad_norm": 2.866319417953491, + "learning_rate": 4.561404325068045e-06, + "loss": 0.506, + "step": 2510 + }, + { + "epoch": 1.1872340425531915, + "grad_norm": 2.6187376976013184, + "learning_rate": 4.561051317059701e-06, + "loss": 0.4674, + "step": 2511 + }, + { + "epoch": 1.1877068557919621, + "grad_norm": 2.642552137374878, + "learning_rate": 4.560698180718729e-06, + "loss": 0.4793, + "step": 2512 + }, + { + "epoch": 1.188179669030733, + "grad_norm": 2.7815041542053223, + "learning_rate": 4.560344916067117e-06, + "loss": 0.5034, + "step": 2513 + }, + { + "epoch": 1.1886524822695035, + "grad_norm": 2.70853590965271, + "learning_rate": 4.559991523126862e-06, + "loss": 0.4811, + "step": 2514 + }, + { + "epoch": 1.1891252955082743, + "grad_norm": 2.7049436569213867, + "learning_rate": 4.559638001919967e-06, + "loss": 0.547, + "step": 2515 + }, + { + "epoch": 1.1895981087470449, + "grad_norm": 2.766773223876953, + "learning_rate": 4.559284352468445e-06, + "loss": 0.5362, + "step": 2516 + }, + { + "epoch": 1.1900709219858157, + "grad_norm": 3.0064334869384766, + "learning_rate": 4.558930574794316e-06, + "loss": 0.5915, + "step": 2517 + }, + { + "epoch": 1.1905437352245862, + "grad_norm": 2.4899885654449463, + "learning_rate": 4.558576668919609e-06, + 
"loss": 0.4379, + "step": 2518 + }, + { + "epoch": 1.191016548463357, + "grad_norm": 2.925963878631592, + "learning_rate": 4.558222634866358e-06, + "loss": 0.5389, + "step": 2519 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 6.087667465209961, + "learning_rate": 4.55786847265661e-06, + "loss": 0.4777, + "step": 2520 + }, + { + "epoch": 1.1919621749408984, + "grad_norm": 2.4560582637786865, + "learning_rate": 4.5575141823124145e-06, + "loss": 0.5576, + "step": 2521 + }, + { + "epoch": 1.192434988179669, + "grad_norm": 3.184252977371216, + "learning_rate": 4.557159763855834e-06, + "loss": 0.5151, + "step": 2522 + }, + { + "epoch": 1.1929078014184398, + "grad_norm": 2.359722137451172, + "learning_rate": 4.556805217308935e-06, + "loss": 0.478, + "step": 2523 + }, + { + "epoch": 1.1933806146572103, + "grad_norm": 3.0821568965911865, + "learning_rate": 4.5564505426937935e-06, + "loss": 0.5784, + "step": 2524 + }, + { + "epoch": 1.1938534278959811, + "grad_norm": 2.9905128479003906, + "learning_rate": 4.5560957400324936e-06, + "loss": 0.6087, + "step": 2525 + }, + { + "epoch": 1.1943262411347517, + "grad_norm": 2.462102174758911, + "learning_rate": 4.555740809347128e-06, + "loss": 0.4739, + "step": 2526 + }, + { + "epoch": 1.1947990543735225, + "grad_norm": 2.7931067943573, + "learning_rate": 4.555385750659796e-06, + "loss": 0.4961, + "step": 2527 + }, + { + "epoch": 1.195271867612293, + "grad_norm": 2.660320997238159, + "learning_rate": 4.555030563992607e-06, + "loss": 0.487, + "step": 2528 + }, + { + "epoch": 1.195744680851064, + "grad_norm": 2.8135557174682617, + "learning_rate": 4.554675249367675e-06, + "loss": 0.5269, + "step": 2529 + }, + { + "epoch": 1.1962174940898345, + "grad_norm": 2.661933422088623, + "learning_rate": 4.554319806807126e-06, + "loss": 0.4723, + "step": 2530 + }, + { + "epoch": 1.1966903073286053, + "grad_norm": 2.568176507949829, + "learning_rate": 4.553964236333089e-06, + "loss": 0.5258, + "step": 2531 + }, + { + "epoch": 1.1971631205673758, + "grad_norm": 2.6890947818756104, + "learning_rate": 4.553608537967705e-06, + "loss": 0.4965, + "step": 2532 + }, + { + "epoch": 1.1976359338061466, + "grad_norm": 3.133470058441162, + "learning_rate": 4.553252711733124e-06, + "loss": 0.5423, + "step": 2533 + }, + { + "epoch": 1.1981087470449172, + "grad_norm": 2.7086687088012695, + "learning_rate": 4.552896757651498e-06, + "loss": 0.5326, + "step": 2534 + }, + { + "epoch": 1.198581560283688, + "grad_norm": 2.8411715030670166, + "learning_rate": 4.552540675744994e-06, + "loss": 0.5793, + "step": 2535 + }, + { + "epoch": 1.1990543735224586, + "grad_norm": 3.041077136993408, + "learning_rate": 4.552184466035782e-06, + "loss": 0.5068, + "step": 2536 + }, + { + "epoch": 1.1995271867612294, + "grad_norm": 2.5921192169189453, + "learning_rate": 4.551828128546041e-06, + "loss": 0.5189, + "step": 2537 + }, + { + "epoch": 1.2, + "grad_norm": 2.923305034637451, + "learning_rate": 4.5514716632979605e-06, + "loss": 0.516, + "step": 2538 + }, + { + "epoch": 1.2004728132387708, + "grad_norm": 2.7083024978637695, + "learning_rate": 4.551115070313734e-06, + "loss": 0.4825, + "step": 2539 + }, + { + "epoch": 1.2009456264775413, + "grad_norm": 2.746842384338379, + "learning_rate": 4.550758349615567e-06, + "loss": 0.5691, + "step": 2540 + }, + { + "epoch": 1.2014184397163121, + "grad_norm": 2.6596429347991943, + "learning_rate": 4.550401501225669e-06, + "loss": 0.5983, + "step": 2541 + }, + { + "epoch": 1.2018912529550827, + "grad_norm": 2.9057931900024414, + "learning_rate": 
4.550044525166261e-06, + "loss": 0.5069, + "step": 2542 + }, + { + "epoch": 1.2023640661938535, + "grad_norm": 2.6139039993286133, + "learning_rate": 4.5496874214595686e-06, + "loss": 0.5102, + "step": 2543 + }, + { + "epoch": 1.202836879432624, + "grad_norm": 2.630286455154419, + "learning_rate": 4.5493301901278285e-06, + "loss": 0.4902, + "step": 2544 + }, + { + "epoch": 1.2033096926713949, + "grad_norm": 2.639174222946167, + "learning_rate": 4.548972831193284e-06, + "loss": 0.4566, + "step": 2545 + }, + { + "epoch": 1.2037825059101654, + "grad_norm": 2.9569664001464844, + "learning_rate": 4.548615344678186e-06, + "loss": 0.5636, + "step": 2546 + }, + { + "epoch": 1.2042553191489362, + "grad_norm": 2.981734037399292, + "learning_rate": 4.5482577306047924e-06, + "loss": 0.4884, + "step": 2547 + }, + { + "epoch": 1.2047281323877068, + "grad_norm": 2.6760342121124268, + "learning_rate": 4.547899988995371e-06, + "loss": 0.5426, + "step": 2548 + }, + { + "epoch": 1.2052009456264776, + "grad_norm": 2.825805902481079, + "learning_rate": 4.547542119872198e-06, + "loss": 0.4989, + "step": 2549 + }, + { + "epoch": 1.2056737588652482, + "grad_norm": 2.856426954269409, + "learning_rate": 4.547184123257555e-06, + "loss": 0.5734, + "step": 2550 + }, + { + "epoch": 1.206146572104019, + "grad_norm": 2.555682420730591, + "learning_rate": 4.5468259991737334e-06, + "loss": 0.5299, + "step": 2551 + }, + { + "epoch": 1.2066193853427896, + "grad_norm": 2.6324024200439453, + "learning_rate": 4.546467747643032e-06, + "loss": 0.5906, + "step": 2552 + }, + { + "epoch": 1.2070921985815604, + "grad_norm": 3.4145350456237793, + "learning_rate": 4.546109368687757e-06, + "loss": 0.5153, + "step": 2553 + }, + { + "epoch": 1.207565011820331, + "grad_norm": 2.658691644668579, + "learning_rate": 4.545750862330225e-06, + "loss": 0.5759, + "step": 2554 + }, + { + "epoch": 1.2080378250591017, + "grad_norm": 3.162605047225952, + "learning_rate": 4.545392228592755e-06, + "loss": 0.5379, + "step": 2555 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 2.8631198406219482, + "learning_rate": 4.545033467497681e-06, + "loss": 0.5959, + "step": 2556 + }, + { + "epoch": 1.208983451536643, + "grad_norm": 2.457109212875366, + "learning_rate": 4.54467457906734e-06, + "loss": 0.4864, + "step": 2557 + }, + { + "epoch": 1.2094562647754137, + "grad_norm": 2.5307061672210693, + "learning_rate": 4.544315563324078e-06, + "loss": 0.5308, + "step": 2558 + }, + { + "epoch": 1.2099290780141845, + "grad_norm": 2.8482773303985596, + "learning_rate": 4.543956420290251e-06, + "loss": 0.5126, + "step": 2559 + }, + { + "epoch": 1.210401891252955, + "grad_norm": 2.4990832805633545, + "learning_rate": 4.5435971499882195e-06, + "loss": 0.4534, + "step": 2560 + }, + { + "epoch": 1.2108747044917259, + "grad_norm": 2.6292665004730225, + "learning_rate": 4.543237752440354e-06, + "loss": 0.4434, + "step": 2561 + }, + { + "epoch": 1.2113475177304964, + "grad_norm": 2.865983247756958, + "learning_rate": 4.542878227669033e-06, + "loss": 0.5667, + "step": 2562 + }, + { + "epoch": 1.2118203309692672, + "grad_norm": 2.745614528656006, + "learning_rate": 4.542518575696644e-06, + "loss": 0.4724, + "step": 2563 + }, + { + "epoch": 1.2122931442080378, + "grad_norm": 2.8562581539154053, + "learning_rate": 4.5421587965455785e-06, + "loss": 0.5405, + "step": 2564 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 2.6670095920562744, + "learning_rate": 4.5417988902382385e-06, + "loss": 0.5432, + "step": 2565 + }, + { + "epoch": 1.2132387706855792, + "grad_norm": 
2.9320743083953857, + "learning_rate": 4.541438856797036e-06, + "loss": 0.5862, + "step": 2566 + }, + { + "epoch": 1.21371158392435, + "grad_norm": 2.577505588531494, + "learning_rate": 4.541078696244386e-06, + "loss": 0.4742, + "step": 2567 + }, + { + "epoch": 1.2141843971631205, + "grad_norm": 3.4476120471954346, + "learning_rate": 4.540718408602717e-06, + "loss": 0.5903, + "step": 2568 + }, + { + "epoch": 1.2146572104018913, + "grad_norm": 2.816210985183716, + "learning_rate": 4.540357993894459e-06, + "loss": 0.5033, + "step": 2569 + }, + { + "epoch": 1.215130023640662, + "grad_norm": 3.0806639194488525, + "learning_rate": 4.539997452142058e-06, + "loss": 0.6064, + "step": 2570 + }, + { + "epoch": 1.2156028368794327, + "grad_norm": 2.563060760498047, + "learning_rate": 4.5396367833679586e-06, + "loss": 0.5597, + "step": 2571 + }, + { + "epoch": 1.2160756501182033, + "grad_norm": 3.1014397144317627, + "learning_rate": 4.5392759875946215e-06, + "loss": 0.54, + "step": 2572 + }, + { + "epoch": 1.216548463356974, + "grad_norm": 3.124190330505371, + "learning_rate": 4.53891506484451e-06, + "loss": 0.5122, + "step": 2573 + }, + { + "epoch": 1.2170212765957447, + "grad_norm": 2.6688716411590576, + "learning_rate": 4.538554015140097e-06, + "loss": 0.5615, + "step": 2574 + }, + { + "epoch": 1.2174940898345155, + "grad_norm": 2.775543689727783, + "learning_rate": 4.538192838503866e-06, + "loss": 0.496, + "step": 2575 + }, + { + "epoch": 1.217966903073286, + "grad_norm": 2.7877283096313477, + "learning_rate": 4.537831534958303e-06, + "loss": 0.4995, + "step": 2576 + }, + { + "epoch": 1.2184397163120568, + "grad_norm": 2.824810028076172, + "learning_rate": 4.537470104525906e-06, + "loss": 0.5481, + "step": 2577 + }, + { + "epoch": 1.2189125295508274, + "grad_norm": 2.801269292831421, + "learning_rate": 4.53710854722918e-06, + "loss": 0.5628, + "step": 2578 + }, + { + "epoch": 1.2193853427895982, + "grad_norm": 2.7780683040618896, + "learning_rate": 4.536746863090637e-06, + "loss": 0.4845, + "step": 2579 + }, + { + "epoch": 1.2198581560283688, + "grad_norm": 2.536010265350342, + "learning_rate": 4.536385052132798e-06, + "loss": 0.4771, + "step": 2580 + }, + { + "epoch": 1.2203309692671396, + "grad_norm": 2.768775701522827, + "learning_rate": 4.536023114378191e-06, + "loss": 0.5366, + "step": 2581 + }, + { + "epoch": 1.2208037825059102, + "grad_norm": 2.658125877380371, + "learning_rate": 4.535661049849352e-06, + "loss": 0.524, + "step": 2582 + }, + { + "epoch": 1.2212765957446807, + "grad_norm": 2.558696746826172, + "learning_rate": 4.535298858568825e-06, + "loss": 0.5482, + "step": 2583 + }, + { + "epoch": 1.2217494089834515, + "grad_norm": 2.5284535884857178, + "learning_rate": 4.534936540559164e-06, + "loss": 0.4454, + "step": 2584 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 7.617330074310303, + "learning_rate": 4.534574095842927e-06, + "loss": 0.5615, + "step": 2585 + }, + { + "epoch": 1.222695035460993, + "grad_norm": 2.9120311737060547, + "learning_rate": 4.534211524442682e-06, + "loss": 0.5624, + "step": 2586 + }, + { + "epoch": 1.2231678486997635, + "grad_norm": 2.5004289150238037, + "learning_rate": 4.533848826381005e-06, + "loss": 0.4743, + "step": 2587 + }, + { + "epoch": 1.2236406619385343, + "grad_norm": 2.8395533561706543, + "learning_rate": 4.53348600168048e-06, + "loss": 0.4457, + "step": 2588 + }, + { + "epoch": 1.224113475177305, + "grad_norm": 2.832211494445801, + "learning_rate": 4.533123050363699e-06, + "loss": 0.5559, + "step": 2589 + }, + { + "epoch": 
1.2245862884160756, + "grad_norm": 2.6318583488464355, + "learning_rate": 4.53275997245326e-06, + "loss": 0.5281, + "step": 2590 + }, + { + "epoch": 1.2250591016548462, + "grad_norm": 3.0509233474731445, + "learning_rate": 4.532396767971771e-06, + "loss": 0.6003, + "step": 2591 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 2.6863620281219482, + "learning_rate": 4.532033436941847e-06, + "loss": 0.5219, + "step": 2592 + }, + { + "epoch": 1.2260047281323878, + "grad_norm": 2.401463747024536, + "learning_rate": 4.5316699793861104e-06, + "loss": 0.5994, + "step": 2593 + }, + { + "epoch": 1.2264775413711584, + "grad_norm": 2.613517999649048, + "learning_rate": 4.531306395327194e-06, + "loss": 0.5785, + "step": 2594 + }, + { + "epoch": 1.226950354609929, + "grad_norm": 2.5016374588012695, + "learning_rate": 4.530942684787735e-06, + "loss": 0.5695, + "step": 2595 + }, + { + "epoch": 1.2274231678486998, + "grad_norm": 2.576464891433716, + "learning_rate": 4.53057884779038e-06, + "loss": 0.4427, + "step": 2596 + }, + { + "epoch": 1.2278959810874706, + "grad_norm": 2.5688700675964355, + "learning_rate": 4.530214884357785e-06, + "loss": 0.4966, + "step": 2597 + }, + { + "epoch": 1.2283687943262411, + "grad_norm": 3.179013729095459, + "learning_rate": 4.52985079451261e-06, + "loss": 0.5239, + "step": 2598 + }, + { + "epoch": 1.2288416075650117, + "grad_norm": 2.6015284061431885, + "learning_rate": 4.529486578277527e-06, + "loss": 0.5135, + "step": 2599 + }, + { + "epoch": 1.2293144208037825, + "grad_norm": 2.3029589653015137, + "learning_rate": 4.529122235675214e-06, + "loss": 0.4044, + "step": 2600 + }, + { + "epoch": 1.2297872340425533, + "grad_norm": 2.994093656539917, + "learning_rate": 4.528757766728357e-06, + "loss": 0.5419, + "step": 2601 + }, + { + "epoch": 1.2302600472813239, + "grad_norm": 2.6297390460968018, + "learning_rate": 4.52839317145965e-06, + "loss": 0.488, + "step": 2602 + }, + { + "epoch": 1.2307328605200945, + "grad_norm": 2.4814043045043945, + "learning_rate": 4.528028449891793e-06, + "loss": 0.4917, + "step": 2603 + }, + { + "epoch": 1.2312056737588652, + "grad_norm": 3.6052863597869873, + "learning_rate": 4.527663602047499e-06, + "loss": 0.5301, + "step": 2604 + }, + { + "epoch": 1.231678486997636, + "grad_norm": 2.6984751224517822, + "learning_rate": 4.5272986279494825e-06, + "loss": 0.5253, + "step": 2605 + }, + { + "epoch": 1.2321513002364066, + "grad_norm": 2.514000415802002, + "learning_rate": 4.526933527620469e-06, + "loss": 0.5661, + "step": 2606 + }, + { + "epoch": 1.2326241134751772, + "grad_norm": 2.890921115875244, + "learning_rate": 4.526568301083195e-06, + "loss": 0.5585, + "step": 2607 + }, + { + "epoch": 1.233096926713948, + "grad_norm": 2.6390011310577393, + "learning_rate": 4.526202948360397e-06, + "loss": 0.5168, + "step": 2608 + }, + { + "epoch": 1.2335697399527188, + "grad_norm": 2.7370636463165283, + "learning_rate": 4.5258374694748266e-06, + "loss": 0.5453, + "step": 2609 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 2.8203976154327393, + "learning_rate": 4.52547186444924e-06, + "loss": 0.5763, + "step": 2610 + }, + { + "epoch": 1.23451536643026, + "grad_norm": 2.7567849159240723, + "learning_rate": 4.5251061333064025e-06, + "loss": 0.5194, + "step": 2611 + }, + { + "epoch": 1.2349881796690307, + "grad_norm": 2.767519474029541, + "learning_rate": 4.524740276069085e-06, + "loss": 0.5355, + "step": 2612 + }, + { + "epoch": 1.2354609929078015, + "grad_norm": 3.072035312652588, + "learning_rate": 4.5243742927600695e-06, + "loss": 0.5391, + 
"step": 2613 + }, + { + "epoch": 1.2359338061465721, + "grad_norm": 2.5957462787628174, + "learning_rate": 4.524008183402143e-06, + "loss": 0.5645, + "step": 2614 + }, + { + "epoch": 1.2364066193853427, + "grad_norm": 2.774897575378418, + "learning_rate": 4.523641948018101e-06, + "loss": 0.5576, + "step": 2615 + }, + { + "epoch": 1.2368794326241135, + "grad_norm": 2.635887622833252, + "learning_rate": 4.5232755866307496e-06, + "loss": 0.5254, + "step": 2616 + }, + { + "epoch": 1.2373522458628843, + "grad_norm": 2.4860997200012207, + "learning_rate": 4.522909099262899e-06, + "loss": 0.4692, + "step": 2617 + }, + { + "epoch": 1.2378250591016549, + "grad_norm": 2.595513105392456, + "learning_rate": 4.522542485937369e-06, + "loss": 0.5166, + "step": 2618 + }, + { + "epoch": 1.2382978723404254, + "grad_norm": 2.961474895477295, + "learning_rate": 4.522175746676986e-06, + "loss": 0.5455, + "step": 2619 + }, + { + "epoch": 1.2387706855791962, + "grad_norm": 2.813889741897583, + "learning_rate": 4.521808881504588e-06, + "loss": 0.5249, + "step": 2620 + }, + { + "epoch": 1.239243498817967, + "grad_norm": 2.8434813022613525, + "learning_rate": 4.521441890443015e-06, + "loss": 0.472, + "step": 2621 + }, + { + "epoch": 1.2397163120567376, + "grad_norm": 2.4264845848083496, + "learning_rate": 4.521074773515119e-06, + "loss": 0.4783, + "step": 2622 + }, + { + "epoch": 1.2401891252955082, + "grad_norm": 2.615169048309326, + "learning_rate": 4.520707530743761e-06, + "loss": 0.5324, + "step": 2623 + }, + { + "epoch": 1.240661938534279, + "grad_norm": 2.6772537231445312, + "learning_rate": 4.520340162151803e-06, + "loss": 0.5224, + "step": 2624 + }, + { + "epoch": 1.2411347517730495, + "grad_norm": 2.683393955230713, + "learning_rate": 4.519972667762124e-06, + "loss": 0.4863, + "step": 2625 + }, + { + "epoch": 1.2416075650118203, + "grad_norm": 3.0335750579833984, + "learning_rate": 4.519605047597603e-06, + "loss": 0.544, + "step": 2626 + }, + { + "epoch": 1.242080378250591, + "grad_norm": 2.8694353103637695, + "learning_rate": 4.519237301681132e-06, + "loss": 0.5576, + "step": 2627 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 3.217808246612549, + "learning_rate": 4.518869430035609e-06, + "loss": 0.5459, + "step": 2628 + }, + { + "epoch": 1.2430260047281323, + "grad_norm": 2.7700083255767822, + "learning_rate": 4.518501432683937e-06, + "loss": 0.5579, + "step": 2629 + }, + { + "epoch": 1.243498817966903, + "grad_norm": 2.4759175777435303, + "learning_rate": 4.5181333096490335e-06, + "loss": 0.5049, + "step": 2630 + }, + { + "epoch": 1.2439716312056737, + "grad_norm": 2.8652584552764893, + "learning_rate": 4.517765060953818e-06, + "loss": 0.5366, + "step": 2631 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 2.776334524154663, + "learning_rate": 4.517396686621218e-06, + "loss": 0.5677, + "step": 2632 + }, + { + "epoch": 1.244917257683215, + "grad_norm": 2.676708221435547, + "learning_rate": 4.517028186674174e-06, + "loss": 0.5055, + "step": 2633 + }, + { + "epoch": 1.2453900709219858, + "grad_norm": 2.6851537227630615, + "learning_rate": 4.516659561135629e-06, + "loss": 0.5537, + "step": 2634 + }, + { + "epoch": 1.2458628841607564, + "grad_norm": 2.619971513748169, + "learning_rate": 4.516290810028536e-06, + "loss": 0.5765, + "step": 2635 + }, + { + "epoch": 1.2463356973995272, + "grad_norm": 2.7302334308624268, + "learning_rate": 4.515921933375855e-06, + "loss": 0.5611, + "step": 2636 + }, + { + "epoch": 1.2468085106382978, + "grad_norm": 2.5005829334259033, + "learning_rate": 
4.5155529312005554e-06, + "loss": 0.442, + "step": 2637 + }, + { + "epoch": 1.2472813238770686, + "grad_norm": 2.713587522506714, + "learning_rate": 4.515183803525612e-06, + "loss": 0.5023, + "step": 2638 + }, + { + "epoch": 1.2477541371158392, + "grad_norm": 2.5146236419677734, + "learning_rate": 4.514814550374009e-06, + "loss": 0.5195, + "step": 2639 + }, + { + "epoch": 1.24822695035461, + "grad_norm": 2.761060953140259, + "learning_rate": 4.51444517176874e-06, + "loss": 0.5138, + "step": 2640 + }, + { + "epoch": 1.2486997635933805, + "grad_norm": 3.082329273223877, + "learning_rate": 4.5140756677328026e-06, + "loss": 0.6105, + "step": 2641 + }, + { + "epoch": 1.2491725768321513, + "grad_norm": 2.6933493614196777, + "learning_rate": 4.513706038289205e-06, + "loss": 0.5185, + "step": 2642 + }, + { + "epoch": 1.249645390070922, + "grad_norm": 2.515856981277466, + "learning_rate": 4.513336283460962e-06, + "loss": 0.5375, + "step": 2643 + }, + { + "epoch": 1.2501182033096927, + "grad_norm": 2.8553731441497803, + "learning_rate": 4.512966403271096e-06, + "loss": 0.5582, + "step": 2644 + }, + { + "epoch": 1.2505910165484633, + "grad_norm": 2.640880823135376, + "learning_rate": 4.5125963977426405e-06, + "loss": 0.5125, + "step": 2645 + }, + { + "epoch": 1.251063829787234, + "grad_norm": 2.9845943450927734, + "learning_rate": 4.512226266898631e-06, + "loss": 0.4749, + "step": 2646 + }, + { + "epoch": 1.2515366430260046, + "grad_norm": 2.5131032466888428, + "learning_rate": 4.511856010762116e-06, + "loss": 0.4764, + "step": 2647 + }, + { + "epoch": 1.2520094562647754, + "grad_norm": 2.370638370513916, + "learning_rate": 4.511485629356148e-06, + "loss": 0.5153, + "step": 2648 + }, + { + "epoch": 1.252482269503546, + "grad_norm": 2.912461996078491, + "learning_rate": 4.511115122703791e-06, + "loss": 0.6117, + "step": 2649 + }, + { + "epoch": 1.2529550827423168, + "grad_norm": 2.7308082580566406, + "learning_rate": 4.510744490828113e-06, + "loss": 0.5076, + "step": 2650 + }, + { + "epoch": 1.2534278959810874, + "grad_norm": 2.8524296283721924, + "learning_rate": 4.510373733752193e-06, + "loss": 0.542, + "step": 2651 + }, + { + "epoch": 1.2539007092198582, + "grad_norm": 2.799377202987671, + "learning_rate": 4.5100028514991145e-06, + "loss": 0.486, + "step": 2652 + }, + { + "epoch": 1.2543735224586288, + "grad_norm": 2.7248027324676514, + "learning_rate": 4.509631844091973e-06, + "loss": 0.4972, + "step": 2653 + }, + { + "epoch": 1.2548463356973996, + "grad_norm": 2.8041458129882812, + "learning_rate": 4.5092607115538686e-06, + "loss": 0.588, + "step": 2654 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 2.679417133331299, + "learning_rate": 4.50888945390791e-06, + "loss": 0.4639, + "step": 2655 + }, + { + "epoch": 1.255791962174941, + "grad_norm": 3.1049270629882812, + "learning_rate": 4.508518071177214e-06, + "loss": 0.5857, + "step": 2656 + }, + { + "epoch": 1.2562647754137115, + "grad_norm": 2.8590362071990967, + "learning_rate": 4.508146563384904e-06, + "loss": 0.5451, + "step": 2657 + }, + { + "epoch": 1.2567375886524823, + "grad_norm": 2.9774081707000732, + "learning_rate": 4.507774930554114e-06, + "loss": 0.5493, + "step": 2658 + }, + { + "epoch": 1.2572104018912529, + "grad_norm": 2.617643356323242, + "learning_rate": 4.507403172707983e-06, + "loss": 0.5472, + "step": 2659 + }, + { + "epoch": 1.2576832151300237, + "grad_norm": 2.9195587635040283, + "learning_rate": 4.507031289869658e-06, + "loss": 0.5403, + "step": 2660 + }, + { + "epoch": 1.2581560283687943, + "grad_norm": 
2.706089496612549, + "learning_rate": 4.506659282062295e-06, + "loss": 0.4899, + "step": 2661 + }, + { + "epoch": 1.258628841607565, + "grad_norm": 2.8229358196258545, + "learning_rate": 4.506287149309057e-06, + "loss": 0.5336, + "step": 2662 + }, + { + "epoch": 1.2591016548463356, + "grad_norm": 2.5295674800872803, + "learning_rate": 4.505914891633117e-06, + "loss": 0.4806, + "step": 2663 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 3.098208427429199, + "learning_rate": 4.505542509057651e-06, + "loss": 0.6039, + "step": 2664 + }, + { + "epoch": 1.260047281323877, + "grad_norm": 2.5118041038513184, + "learning_rate": 4.5051700016058475e-06, + "loss": 0.5279, + "step": 2665 + }, + { + "epoch": 1.2605200945626478, + "grad_norm": 2.6901369094848633, + "learning_rate": 4.5047973693009005e-06, + "loss": 0.5515, + "step": 2666 + }, + { + "epoch": 1.2609929078014184, + "grad_norm": 2.5622377395629883, + "learning_rate": 4.504424612166012e-06, + "loss": 0.5405, + "step": 2667 + }, + { + "epoch": 1.2614657210401892, + "grad_norm": 2.685751438140869, + "learning_rate": 4.5040517302243915e-06, + "loss": 0.5797, + "step": 2668 + }, + { + "epoch": 1.2619385342789597, + "grad_norm": 2.8525350093841553, + "learning_rate": 4.503678723499259e-06, + "loss": 0.5561, + "step": 2669 + }, + { + "epoch": 1.2624113475177305, + "grad_norm": 2.803386926651001, + "learning_rate": 4.503305592013836e-06, + "loss": 0.5376, + "step": 2670 + }, + { + "epoch": 1.2628841607565011, + "grad_norm": 2.78633189201355, + "learning_rate": 4.502932335791359e-06, + "loss": 0.4739, + "step": 2671 + }, + { + "epoch": 1.263356973995272, + "grad_norm": 2.8337297439575195, + "learning_rate": 4.502558954855069e-06, + "loss": 0.5406, + "step": 2672 + }, + { + "epoch": 1.2638297872340425, + "grad_norm": 2.610275983810425, + "learning_rate": 4.502185449228213e-06, + "loss": 0.5343, + "step": 2673 + }, + { + "epoch": 1.2643026004728133, + "grad_norm": 2.7842252254486084, + "learning_rate": 4.501811818934048e-06, + "loss": 0.532, + "step": 2674 + }, + { + "epoch": 1.2647754137115839, + "grad_norm": 2.4472389221191406, + "learning_rate": 4.501438063995839e-06, + "loss": 0.4976, + "step": 2675 + }, + { + "epoch": 1.2652482269503547, + "grad_norm": 3.076580762863159, + "learning_rate": 4.501064184436858e-06, + "loss": 0.507, + "step": 2676 + }, + { + "epoch": 1.2657210401891252, + "grad_norm": 2.5952908992767334, + "learning_rate": 4.500690180280384e-06, + "loss": 0.5498, + "step": 2677 + }, + { + "epoch": 1.266193853427896, + "grad_norm": 2.476943016052246, + "learning_rate": 4.500316051549706e-06, + "loss": 0.557, + "step": 2678 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 2.730579376220703, + "learning_rate": 4.499941798268118e-06, + "loss": 0.4975, + "step": 2679 + }, + { + "epoch": 1.2671394799054374, + "grad_norm": 2.7916698455810547, + "learning_rate": 4.499567420458924e-06, + "loss": 0.5673, + "step": 2680 + }, + { + "epoch": 1.267612293144208, + "grad_norm": 2.4249091148376465, + "learning_rate": 4.4991929181454355e-06, + "loss": 0.4836, + "step": 2681 + }, + { + "epoch": 1.2680851063829788, + "grad_norm": 2.661911725997925, + "learning_rate": 4.498818291350969e-06, + "loss": 0.5332, + "step": 2682 + }, + { + "epoch": 1.2685579196217494, + "grad_norm": 2.693657875061035, + "learning_rate": 4.498443540098852e-06, + "loss": 0.5257, + "step": 2683 + }, + { + "epoch": 1.2690307328605201, + "grad_norm": 2.609386682510376, + "learning_rate": 4.4980686644124195e-06, + "loss": 0.4918, + "step": 2684 + }, + { + "epoch": 
1.2695035460992907, + "grad_norm": 3.2104930877685547, + "learning_rate": 4.4976936643150124e-06, + "loss": 0.6097, + "step": 2685 + }, + { + "epoch": 1.2699763593380615, + "grad_norm": 2.707860231399536, + "learning_rate": 4.49731853982998e-06, + "loss": 0.5109, + "step": 2686 + }, + { + "epoch": 1.270449172576832, + "grad_norm": 3.5046379566192627, + "learning_rate": 4.49694329098068e-06, + "loss": 0.5883, + "step": 2687 + }, + { + "epoch": 1.270921985815603, + "grad_norm": 2.5362324714660645, + "learning_rate": 4.496567917790477e-06, + "loss": 0.5301, + "step": 2688 + }, + { + "epoch": 1.2713947990543735, + "grad_norm": 2.7095518112182617, + "learning_rate": 4.496192420282746e-06, + "loss": 0.4772, + "step": 2689 + }, + { + "epoch": 1.2718676122931443, + "grad_norm": 2.416433095932007, + "learning_rate": 4.495816798480865e-06, + "loss": 0.5012, + "step": 2690 + }, + { + "epoch": 1.2723404255319148, + "grad_norm": 2.5362391471862793, + "learning_rate": 4.495441052408224e-06, + "loss": 0.5197, + "step": 2691 + }, + { + "epoch": 1.2728132387706856, + "grad_norm": 2.9093947410583496, + "learning_rate": 4.495065182088218e-06, + "loss": 0.4893, + "step": 2692 + }, + { + "epoch": 1.2732860520094562, + "grad_norm": 2.520470142364502, + "learning_rate": 4.494689187544251e-06, + "loss": 0.5072, + "step": 2693 + }, + { + "epoch": 1.273758865248227, + "grad_norm": 2.4385125637054443, + "learning_rate": 4.494313068799735e-06, + "loss": 0.4923, + "step": 2694 + }, + { + "epoch": 1.2742316784869976, + "grad_norm": 2.636852502822876, + "learning_rate": 4.493936825878089e-06, + "loss": 0.5409, + "step": 2695 + }, + { + "epoch": 1.2747044917257684, + "grad_norm": 2.7027053833007812, + "learning_rate": 4.493560458802741e-06, + "loss": 0.5906, + "step": 2696 + }, + { + "epoch": 1.275177304964539, + "grad_norm": 2.58752179145813, + "learning_rate": 4.493183967597123e-06, + "loss": 0.5292, + "step": 2697 + }, + { + "epoch": 1.2756501182033098, + "grad_norm": 2.7658379077911377, + "learning_rate": 4.49280735228468e-06, + "loss": 0.5613, + "step": 2698 + }, + { + "epoch": 1.2761229314420803, + "grad_norm": 3.272688388824463, + "learning_rate": 4.492430612888861e-06, + "loss": 0.5654, + "step": 2699 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 2.806819438934326, + "learning_rate": 4.492053749433125e-06, + "loss": 0.5388, + "step": 2700 + }, + { + "epoch": 1.2770685579196217, + "grad_norm": 2.879727602005005, + "learning_rate": 4.491676761940936e-06, + "loss": 0.5033, + "step": 2701 + }, + { + "epoch": 1.2775413711583925, + "grad_norm": 2.733347177505493, + "learning_rate": 4.4912996504357695e-06, + "loss": 0.5113, + "step": 2702 + }, + { + "epoch": 1.278014184397163, + "grad_norm": 2.7431252002716064, + "learning_rate": 4.490922414941104e-06, + "loss": 0.5417, + "step": 2703 + }, + { + "epoch": 1.2784869976359339, + "grad_norm": 2.9287240505218506, + "learning_rate": 4.490545055480431e-06, + "loss": 0.5875, + "step": 2704 + }, + { + "epoch": 1.2789598108747045, + "grad_norm": 2.576775550842285, + "learning_rate": 4.490167572077244e-06, + "loss": 0.5176, + "step": 2705 + }, + { + "epoch": 1.2794326241134752, + "grad_norm": 2.4335594177246094, + "learning_rate": 4.4897899647550505e-06, + "loss": 0.4749, + "step": 2706 + }, + { + "epoch": 1.2799054373522458, + "grad_norm": 2.6798062324523926, + "learning_rate": 4.489412233537361e-06, + "loss": 0.5439, + "step": 2707 + }, + { + "epoch": 1.2803782505910166, + "grad_norm": 2.8440675735473633, + "learning_rate": 4.489034378447693e-06, + "loss": 0.552, + 
"step": 2708 + }, + { + "epoch": 1.2808510638297872, + "grad_norm": 2.9059503078460693, + "learning_rate": 4.488656399509577e-06, + "loss": 0.5667, + "step": 2709 + }, + { + "epoch": 1.281323877068558, + "grad_norm": 2.7415006160736084, + "learning_rate": 4.488278296746548e-06, + "loss": 0.5676, + "step": 2710 + }, + { + "epoch": 1.2817966903073286, + "grad_norm": 2.4584875106811523, + "learning_rate": 4.487900070182147e-06, + "loss": 0.4787, + "step": 2711 + }, + { + "epoch": 1.2822695035460994, + "grad_norm": 2.990940809249878, + "learning_rate": 4.487521719839924e-06, + "loss": 0.5239, + "step": 2712 + }, + { + "epoch": 1.28274231678487, + "grad_norm": 3.075201988220215, + "learning_rate": 4.487143245743441e-06, + "loss": 0.5103, + "step": 2713 + }, + { + "epoch": 1.2832151300236407, + "grad_norm": 2.543341875076294, + "learning_rate": 4.486764647916259e-06, + "loss": 0.5475, + "step": 2714 + }, + { + "epoch": 1.2836879432624113, + "grad_norm": 2.9927213191986084, + "learning_rate": 4.486385926381957e-06, + "loss": 0.4923, + "step": 2715 + }, + { + "epoch": 1.284160756501182, + "grad_norm": 2.4220657348632812, + "learning_rate": 4.486007081164111e-06, + "loss": 0.543, + "step": 2716 + }, + { + "epoch": 1.2846335697399527, + "grad_norm": 2.468214988708496, + "learning_rate": 4.4856281122863134e-06, + "loss": 0.5248, + "step": 2717 + }, + { + "epoch": 1.2851063829787235, + "grad_norm": 2.633711099624634, + "learning_rate": 4.48524901977216e-06, + "loss": 0.4764, + "step": 2718 + }, + { + "epoch": 1.285579196217494, + "grad_norm": 2.8399546146392822, + "learning_rate": 4.484869803645254e-06, + "loss": 0.5503, + "step": 2719 + }, + { + "epoch": 1.2860520094562649, + "grad_norm": 2.769063949584961, + "learning_rate": 4.484490463929209e-06, + "loss": 0.5468, + "step": 2720 + }, + { + "epoch": 1.2865248226950354, + "grad_norm": 2.617863893508911, + "learning_rate": 4.4841110006476465e-06, + "loss": 0.5906, + "step": 2721 + }, + { + "epoch": 1.2869976359338062, + "grad_norm": 2.7639541625976562, + "learning_rate": 4.4837314138241905e-06, + "loss": 0.552, + "step": 2722 + }, + { + "epoch": 1.2874704491725768, + "grad_norm": 2.7711129188537598, + "learning_rate": 4.483351703482478e-06, + "loss": 0.5229, + "step": 2723 + }, + { + "epoch": 1.2879432624113476, + "grad_norm": 2.611205577850342, + "learning_rate": 4.482971869646152e-06, + "loss": 0.5055, + "step": 2724 + }, + { + "epoch": 1.2884160756501182, + "grad_norm": 2.8602211475372314, + "learning_rate": 4.482591912338862e-06, + "loss": 0.5561, + "step": 2725 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 2.5882298946380615, + "learning_rate": 4.4822118315842675e-06, + "loss": 0.5555, + "step": 2726 + }, + { + "epoch": 1.2893617021276595, + "grad_norm": 2.7533531188964844, + "learning_rate": 4.481831627406033e-06, + "loss": 0.5346, + "step": 2727 + }, + { + "epoch": 1.2898345153664303, + "grad_norm": 2.4296958446502686, + "learning_rate": 4.481451299827835e-06, + "loss": 0.4915, + "step": 2728 + }, + { + "epoch": 1.290307328605201, + "grad_norm": 2.4403445720672607, + "learning_rate": 4.481070848873352e-06, + "loss": 0.5648, + "step": 2729 + }, + { + "epoch": 1.2907801418439715, + "grad_norm": 2.473224401473999, + "learning_rate": 4.480690274566274e-06, + "loss": 0.4849, + "step": 2730 + }, + { + "epoch": 1.2912529550827423, + "grad_norm": 2.637899875640869, + "learning_rate": 4.480309576930297e-06, + "loss": 0.4968, + "step": 2731 + }, + { + "epoch": 1.291725768321513, + "grad_norm": 2.7156927585601807, + "learning_rate": 
4.479928755989127e-06, + "loss": 0.4759, + "step": 2732 + }, + { + "epoch": 1.2921985815602837, + "grad_norm": 2.632786989212036, + "learning_rate": 4.479547811766475e-06, + "loss": 0.5468, + "step": 2733 + }, + { + "epoch": 1.2926713947990542, + "grad_norm": 2.529218912124634, + "learning_rate": 4.479166744286061e-06, + "loss": 0.4852, + "step": 2734 + }, + { + "epoch": 1.293144208037825, + "grad_norm": 2.561978340148926, + "learning_rate": 4.4787855535716115e-06, + "loss": 0.546, + "step": 2735 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 2.3684909343719482, + "learning_rate": 4.478404239646862e-06, + "loss": 0.5369, + "step": 2736 + }, + { + "epoch": 1.2940898345153664, + "grad_norm": 2.8940367698669434, + "learning_rate": 4.4780228025355566e-06, + "loss": 0.568, + "step": 2737 + }, + { + "epoch": 1.294562647754137, + "grad_norm": 2.6950316429138184, + "learning_rate": 4.477641242261445e-06, + "loss": 0.4576, + "step": 2738 + }, + { + "epoch": 1.2950354609929078, + "grad_norm": 2.4211716651916504, + "learning_rate": 4.4772595588482835e-06, + "loss": 0.4341, + "step": 2739 + }, + { + "epoch": 1.2955082742316786, + "grad_norm": 3.141097068786621, + "learning_rate": 4.47687775231984e-06, + "loss": 0.5944, + "step": 2740 + }, + { + "epoch": 1.2959810874704492, + "grad_norm": 3.077522039413452, + "learning_rate": 4.476495822699887e-06, + "loss": 0.5786, + "step": 2741 + }, + { + "epoch": 1.2964539007092197, + "grad_norm": 2.708139419555664, + "learning_rate": 4.476113770012206e-06, + "loss": 0.5014, + "step": 2742 + }, + { + "epoch": 1.2969267139479905, + "grad_norm": 2.7572035789489746, + "learning_rate": 4.475731594280586e-06, + "loss": 0.594, + "step": 2743 + }, + { + "epoch": 1.2973995271867613, + "grad_norm": 2.673126459121704, + "learning_rate": 4.475349295528822e-06, + "loss": 0.5317, + "step": 2744 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 2.6757819652557373, + "learning_rate": 4.4749668737807195e-06, + "loss": 0.5614, + "step": 2745 + }, + { + "epoch": 1.2983451536643025, + "grad_norm": 2.7077620029449463, + "learning_rate": 4.47458432906009e-06, + "loss": 0.4916, + "step": 2746 + }, + { + "epoch": 1.2988179669030733, + "grad_norm": 2.446570873260498, + "learning_rate": 4.474201661390752e-06, + "loss": 0.5005, + "step": 2747 + }, + { + "epoch": 1.299290780141844, + "grad_norm": 2.642695665359497, + "learning_rate": 4.473818870796533e-06, + "loss": 0.5048, + "step": 2748 + }, + { + "epoch": 1.2997635933806146, + "grad_norm": 2.519824743270874, + "learning_rate": 4.4734359573012686e-06, + "loss": 0.5131, + "step": 2749 + }, + { + "epoch": 1.3002364066193852, + "grad_norm": 2.5901925563812256, + "learning_rate": 4.4730529209287995e-06, + "loss": 0.4582, + "step": 2750 + }, + { + "epoch": 1.300709219858156, + "grad_norm": 2.6789121627807617, + "learning_rate": 4.472669761702978e-06, + "loss": 0.5685, + "step": 2751 + }, + { + "epoch": 1.3011820330969268, + "grad_norm": 2.408003807067871, + "learning_rate": 4.472286479647659e-06, + "loss": 0.4329, + "step": 2752 + }, + { + "epoch": 1.3016548463356974, + "grad_norm": 2.681403398513794, + "learning_rate": 4.47190307478671e-06, + "loss": 0.4853, + "step": 2753 + }, + { + "epoch": 1.302127659574468, + "grad_norm": 2.9923183917999268, + "learning_rate": 4.4715195471440025e-06, + "loss": 0.5184, + "step": 2754 + }, + { + "epoch": 1.3026004728132388, + "grad_norm": 2.5100321769714355, + "learning_rate": 4.471135896743418e-06, + "loss": 0.5148, + "step": 2755 + }, + { + "epoch": 1.3030732860520096, + "grad_norm": 
2.267881393432617, + "learning_rate": 4.4707521236088444e-06, + "loss": 0.5028, + "step": 2756 + }, + { + "epoch": 1.3035460992907801, + "grad_norm": 2.7779829502105713, + "learning_rate": 4.4703682277641775e-06, + "loss": 0.5724, + "step": 2757 + }, + { + "epoch": 1.3040189125295507, + "grad_norm": 2.4262194633483887, + "learning_rate": 4.4699842092333205e-06, + "loss": 0.5341, + "step": 2758 + }, + { + "epoch": 1.3044917257683215, + "grad_norm": 2.8682050704956055, + "learning_rate": 4.469600068040185e-06, + "loss": 0.6114, + "step": 2759 + }, + { + "epoch": 1.3049645390070923, + "grad_norm": 2.647853374481201, + "learning_rate": 4.46921580420869e-06, + "loss": 0.5107, + "step": 2760 + }, + { + "epoch": 1.3054373522458629, + "grad_norm": 2.561998128890991, + "learning_rate": 4.468831417762762e-06, + "loss": 0.6019, + "step": 2761 + }, + { + "epoch": 1.3059101654846335, + "grad_norm": 2.763425350189209, + "learning_rate": 4.468446908726334e-06, + "loss": 0.572, + "step": 2762 + }, + { + "epoch": 1.3063829787234043, + "grad_norm": 2.7052934169769287, + "learning_rate": 4.468062277123348e-06, + "loss": 0.4876, + "step": 2763 + }, + { + "epoch": 1.306855791962175, + "grad_norm": 2.997845411300659, + "learning_rate": 4.467677522977755e-06, + "loss": 0.5683, + "step": 2764 + }, + { + "epoch": 1.3073286052009456, + "grad_norm": 2.503129005432129, + "learning_rate": 4.46729264631351e-06, + "loss": 0.4951, + "step": 2765 + }, + { + "epoch": 1.3078014184397162, + "grad_norm": 2.617492437362671, + "learning_rate": 4.466907647154578e-06, + "loss": 0.5054, + "step": 2766 + }, + { + "epoch": 1.308274231678487, + "grad_norm": 2.934967279434204, + "learning_rate": 4.4665225255249315e-06, + "loss": 0.5299, + "step": 2767 + }, + { + "epoch": 1.3087470449172578, + "grad_norm": 2.787252187728882, + "learning_rate": 4.46613728144855e-06, + "loss": 0.4652, + "step": 2768 + }, + { + "epoch": 1.3092198581560284, + "grad_norm": 2.567439556121826, + "learning_rate": 4.465751914949422e-06, + "loss": 0.538, + "step": 2769 + }, + { + "epoch": 1.309692671394799, + "grad_norm": 2.6386024951934814, + "learning_rate": 4.4653664260515416e-06, + "loss": 0.464, + "step": 2770 + }, + { + "epoch": 1.3101654846335697, + "grad_norm": 2.966848134994507, + "learning_rate": 4.464980814778912e-06, + "loss": 0.4889, + "step": 2771 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 2.571256637573242, + "learning_rate": 4.464595081155542e-06, + "loss": 0.4979, + "step": 2772 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 2.774203062057495, + "learning_rate": 4.4642092252054515e-06, + "loss": 0.5366, + "step": 2773 + }, + { + "epoch": 1.3115839243498817, + "grad_norm": 2.682969331741333, + "learning_rate": 4.463823246952666e-06, + "loss": 0.5118, + "step": 2774 + }, + { + "epoch": 1.3120567375886525, + "grad_norm": 2.4873905181884766, + "learning_rate": 4.463437146421217e-06, + "loss": 0.5548, + "step": 2775 + }, + { + "epoch": 1.3125295508274233, + "grad_norm": 2.6769661903381348, + "learning_rate": 4.463050923635147e-06, + "loss": 0.5023, + "step": 2776 + }, + { + "epoch": 1.3130023640661939, + "grad_norm": 2.7190892696380615, + "learning_rate": 4.462664578618503e-06, + "loss": 0.5546, + "step": 2777 + }, + { + "epoch": 1.3134751773049644, + "grad_norm": 2.8193624019622803, + "learning_rate": 4.462278111395343e-06, + "loss": 0.5265, + "step": 2778 + }, + { + "epoch": 1.3139479905437352, + "grad_norm": 2.7324538230895996, + "learning_rate": 4.461891521989728e-06, + "loss": 0.5449, + "step": 2779 + }, + { + "epoch": 
1.314420803782506, + "grad_norm": 2.87320876121521, + "learning_rate": 4.4615048104257305e-06, + "loss": 0.5367, + "step": 2780 + }, + { + "epoch": 1.3148936170212766, + "grad_norm": 2.6777031421661377, + "learning_rate": 4.4611179767274306e-06, + "loss": 0.5026, + "step": 2781 + }, + { + "epoch": 1.3153664302600472, + "grad_norm": 3.714524269104004, + "learning_rate": 4.460731020918913e-06, + "loss": 0.569, + "step": 2782 + }, + { + "epoch": 1.315839243498818, + "grad_norm": 2.7493600845336914, + "learning_rate": 4.460343943024273e-06, + "loss": 0.5826, + "step": 2783 + }, + { + "epoch": 1.3163120567375888, + "grad_norm": 2.6544079780578613, + "learning_rate": 4.459956743067609e-06, + "loss": 0.5399, + "step": 2784 + }, + { + "epoch": 1.3167848699763594, + "grad_norm": 2.4338037967681885, + "learning_rate": 4.459569421073036e-06, + "loss": 0.5186, + "step": 2785 + }, + { + "epoch": 1.31725768321513, + "grad_norm": 2.9312374591827393, + "learning_rate": 4.459181977064665e-06, + "loss": 0.5571, + "step": 2786 + }, + { + "epoch": 1.3177304964539007, + "grad_norm": 2.5988922119140625, + "learning_rate": 4.458794411066624e-06, + "loss": 0.5926, + "step": 2787 + }, + { + "epoch": 1.3182033096926715, + "grad_norm": 2.5193772315979004, + "learning_rate": 4.458406723103044e-06, + "loss": 0.5243, + "step": 2788 + }, + { + "epoch": 1.318676122931442, + "grad_norm": 2.8653743267059326, + "learning_rate": 4.458018913198066e-06, + "loss": 0.5421, + "step": 2789 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 2.486245632171631, + "learning_rate": 4.457630981375834e-06, + "loss": 0.4862, + "step": 2790 + }, + { + "epoch": 1.3196217494089835, + "grad_norm": 3.155435800552368, + "learning_rate": 4.457242927660506e-06, + "loss": 0.5386, + "step": 2791 + }, + { + "epoch": 1.3200945626477543, + "grad_norm": 3.102023124694824, + "learning_rate": 4.456854752076242e-06, + "loss": 0.5527, + "step": 2792 + }, + { + "epoch": 1.3205673758865248, + "grad_norm": 2.7995986938476562, + "learning_rate": 4.456466454647215e-06, + "loss": 0.4364, + "step": 2793 + }, + { + "epoch": 1.3210401891252954, + "grad_norm": 2.8328311443328857, + "learning_rate": 4.456078035397599e-06, + "loss": 0.5516, + "step": 2794 + }, + { + "epoch": 1.3215130023640662, + "grad_norm": 2.606161594390869, + "learning_rate": 4.455689494351581e-06, + "loss": 0.5042, + "step": 2795 + }, + { + "epoch": 1.321985815602837, + "grad_norm": 2.6344757080078125, + "learning_rate": 4.455300831533354e-06, + "loss": 0.4807, + "step": 2796 + }, + { + "epoch": 1.3224586288416076, + "grad_norm": 2.8539786338806152, + "learning_rate": 4.454912046967118e-06, + "loss": 0.4694, + "step": 2797 + }, + { + "epoch": 1.3229314420803782, + "grad_norm": 2.849066734313965, + "learning_rate": 4.454523140677081e-06, + "loss": 0.5037, + "step": 2798 + }, + { + "epoch": 1.323404255319149, + "grad_norm": 2.6803371906280518, + "learning_rate": 4.454134112687458e-06, + "loss": 0.4959, + "step": 2799 + }, + { + "epoch": 1.3238770685579198, + "grad_norm": 3.0546066761016846, + "learning_rate": 4.453744963022473e-06, + "loss": 0.5935, + "step": 2800 + }, + { + "epoch": 1.3243498817966903, + "grad_norm": 2.625602960586548, + "learning_rate": 4.453355691706356e-06, + "loss": 0.5349, + "step": 2801 + }, + { + "epoch": 1.324822695035461, + "grad_norm": 2.7568554878234863, + "learning_rate": 4.452966298763345e-06, + "loss": 0.5012, + "step": 2802 + }, + { + "epoch": 1.3252955082742317, + "grad_norm": 2.940427303314209, + "learning_rate": 4.452576784217686e-06, + "loss": 0.5246, + 
"step": 2803 + }, + { + "epoch": 1.3257683215130025, + "grad_norm": 2.5485289096832275, + "learning_rate": 4.452187148093633e-06, + "loss": 0.5282, + "step": 2804 + }, + { + "epoch": 1.326241134751773, + "grad_norm": 2.8152987957000732, + "learning_rate": 4.4517973904154455e-06, + "loss": 0.5468, + "step": 2805 + }, + { + "epoch": 1.3267139479905437, + "grad_norm": 2.9399688243865967, + "learning_rate": 4.451407511207393e-06, + "loss": 0.5586, + "step": 2806 + }, + { + "epoch": 1.3271867612293144, + "grad_norm": 2.3870036602020264, + "learning_rate": 4.451017510493751e-06, + "loss": 0.4807, + "step": 2807 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 3.4667887687683105, + "learning_rate": 4.450627388298805e-06, + "loss": 0.5571, + "step": 2808 + }, + { + "epoch": 1.3281323877068558, + "grad_norm": 2.685986042022705, + "learning_rate": 4.450237144646844e-06, + "loss": 0.5525, + "step": 2809 + }, + { + "epoch": 1.3286052009456264, + "grad_norm": 2.8529131412506104, + "learning_rate": 4.449846779562168e-06, + "loss": 0.491, + "step": 2810 + }, + { + "epoch": 1.3290780141843972, + "grad_norm": 2.7360332012176514, + "learning_rate": 4.449456293069082e-06, + "loss": 0.5574, + "step": 2811 + }, + { + "epoch": 1.3295508274231678, + "grad_norm": 2.4656026363372803, + "learning_rate": 4.4490656851919015e-06, + "loss": 0.4678, + "step": 2812 + }, + { + "epoch": 1.3300236406619386, + "grad_norm": 2.602651357650757, + "learning_rate": 4.448674955954947e-06, + "loss": 0.5118, + "step": 2813 + }, + { + "epoch": 1.3304964539007091, + "grad_norm": 3.0129756927490234, + "learning_rate": 4.448284105382548e-06, + "loss": 0.6136, + "step": 2814 + }, + { + "epoch": 1.33096926713948, + "grad_norm": 2.8499927520751953, + "learning_rate": 4.447893133499039e-06, + "loss": 0.5286, + "step": 2815 + }, + { + "epoch": 1.3314420803782505, + "grad_norm": 2.8320744037628174, + "learning_rate": 4.447502040328767e-06, + "loss": 0.5186, + "step": 2816 + }, + { + "epoch": 1.3319148936170213, + "grad_norm": 2.499950885772705, + "learning_rate": 4.447110825896084e-06, + "loss": 0.5338, + "step": 2817 + }, + { + "epoch": 1.3323877068557919, + "grad_norm": 2.530895233154297, + "learning_rate": 4.446719490225346e-06, + "loss": 0.5151, + "step": 2818 + }, + { + "epoch": 1.3328605200945627, + "grad_norm": 2.5276098251342773, + "learning_rate": 4.446328033340921e-06, + "loss": 0.5424, + "step": 2819 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.90218186378479, + "learning_rate": 4.4459364552671845e-06, + "loss": 0.5747, + "step": 2820 + }, + { + "epoch": 1.333806146572104, + "grad_norm": 2.500943183898926, + "learning_rate": 4.445544756028518e-06, + "loss": 0.5459, + "step": 2821 + }, + { + "epoch": 1.3342789598108746, + "grad_norm": 2.960374355316162, + "learning_rate": 4.44515293564931e-06, + "loss": 0.6092, + "step": 2822 + }, + { + "epoch": 1.3347517730496454, + "grad_norm": 2.813671827316284, + "learning_rate": 4.444760994153958e-06, + "loss": 0.5536, + "step": 2823 + }, + { + "epoch": 1.335224586288416, + "grad_norm": 2.7147483825683594, + "learning_rate": 4.444368931566867e-06, + "loss": 0.5291, + "step": 2824 + }, + { + "epoch": 1.3356973995271868, + "grad_norm": 2.710101842880249, + "learning_rate": 4.443976747912447e-06, + "loss": 0.5138, + "step": 2825 + }, + { + "epoch": 1.3361702127659574, + "grad_norm": 2.711419105529785, + "learning_rate": 4.443584443215121e-06, + "loss": 0.5223, + "step": 2826 + }, + { + "epoch": 1.3366430260047282, + "grad_norm": 2.887472152709961, + "learning_rate": 
4.443192017499313e-06, + "loss": 0.5464, + "step": 2827 + }, + { + "epoch": 1.3371158392434987, + "grad_norm": 2.8867223262786865, + "learning_rate": 4.4427994707894585e-06, + "loss": 0.5748, + "step": 2828 + }, + { + "epoch": 1.3375886524822695, + "grad_norm": 2.407247543334961, + "learning_rate": 4.44240680311e-06, + "loss": 0.4727, + "step": 2829 + }, + { + "epoch": 1.3380614657210401, + "grad_norm": 2.578420877456665, + "learning_rate": 4.4420140144853865e-06, + "loss": 0.5129, + "step": 2830 + }, + { + "epoch": 1.338534278959811, + "grad_norm": 2.884373426437378, + "learning_rate": 4.441621104940077e-06, + "loss": 0.5366, + "step": 2831 + }, + { + "epoch": 1.3390070921985815, + "grad_norm": 2.8652374744415283, + "learning_rate": 4.441228074498534e-06, + "loss": 0.5045, + "step": 2832 + }, + { + "epoch": 1.3394799054373523, + "grad_norm": 2.5380210876464844, + "learning_rate": 4.440834923185231e-06, + "loss": 0.509, + "step": 2833 + }, + { + "epoch": 1.3399527186761229, + "grad_norm": 2.415734052658081, + "learning_rate": 4.440441651024648e-06, + "loss": 0.5066, + "step": 2834 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 2.503051996231079, + "learning_rate": 4.440048258041272e-06, + "loss": 0.5118, + "step": 2835 + }, + { + "epoch": 1.3408983451536642, + "grad_norm": 3.351001024246216, + "learning_rate": 4.439654744259598e-06, + "loss": 0.5758, + "step": 2836 + }, + { + "epoch": 1.341371158392435, + "grad_norm": 2.7368781566619873, + "learning_rate": 4.439261109704129e-06, + "loss": 0.5674, + "step": 2837 + }, + { + "epoch": 1.3418439716312056, + "grad_norm": 3.008199453353882, + "learning_rate": 4.438867354399372e-06, + "loss": 0.5891, + "step": 2838 + }, + { + "epoch": 1.3423167848699764, + "grad_norm": 2.538907766342163, + "learning_rate": 4.438473478369847e-06, + "loss": 0.5102, + "step": 2839 + }, + { + "epoch": 1.342789598108747, + "grad_norm": 2.7169063091278076, + "learning_rate": 4.438079481640079e-06, + "loss": 0.6131, + "step": 2840 + }, + { + "epoch": 1.3432624113475178, + "grad_norm": 2.7411608695983887, + "learning_rate": 4.437685364234601e-06, + "loss": 0.5337, + "step": 2841 + }, + { + "epoch": 1.3437352245862884, + "grad_norm": 3.2374939918518066, + "learning_rate": 4.43729112617795e-06, + "loss": 0.5401, + "step": 2842 + }, + { + "epoch": 1.3442080378250592, + "grad_norm": 2.4712226390838623, + "learning_rate": 4.436896767494676e-06, + "loss": 0.5365, + "step": 2843 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 2.661619186401367, + "learning_rate": 4.436502288209334e-06, + "loss": 0.4919, + "step": 2844 + }, + { + "epoch": 1.3451536643026005, + "grad_norm": 2.5943779945373535, + "learning_rate": 4.4361076883464845e-06, + "loss": 0.5253, + "step": 2845 + }, + { + "epoch": 1.345626477541371, + "grad_norm": 2.672297477722168, + "learning_rate": 4.4357129679307e-06, + "loss": 0.541, + "step": 2846 + }, + { + "epoch": 1.346099290780142, + "grad_norm": 2.6830925941467285, + "learning_rate": 4.435318126986557e-06, + "loss": 0.5641, + "step": 2847 + }, + { + "epoch": 1.3465721040189125, + "grad_norm": 2.7394626140594482, + "learning_rate": 4.434923165538639e-06, + "loss": 0.5591, + "step": 2848 + }, + { + "epoch": 1.3470449172576833, + "grad_norm": 2.9656317234039307, + "learning_rate": 4.434528083611541e-06, + "loss": 0.515, + "step": 2849 + }, + { + "epoch": 1.3475177304964538, + "grad_norm": 3.30155086517334, + "learning_rate": 4.434132881229861e-06, + "loss": 0.5871, + "step": 2850 + }, + { + "epoch": 1.3479905437352246, + "grad_norm": 
2.6222476959228516, + "learning_rate": 4.433737558418209e-06, + "loss": 0.5143, + "step": 2851 + }, + { + "epoch": 1.3484633569739952, + "grad_norm": 2.903158187866211, + "learning_rate": 4.4333421152011965e-06, + "loss": 0.4484, + "step": 2852 + }, + { + "epoch": 1.348936170212766, + "grad_norm": 2.863116979598999, + "learning_rate": 4.432946551603449e-06, + "loss": 0.5213, + "step": 2853 + }, + { + "epoch": 1.3494089834515366, + "grad_norm": 2.8253962993621826, + "learning_rate": 4.432550867649596e-06, + "loss": 0.5713, + "step": 2854 + }, + { + "epoch": 1.3498817966903074, + "grad_norm": 2.652493953704834, + "learning_rate": 4.432155063364273e-06, + "loss": 0.5559, + "step": 2855 + }, + { + "epoch": 1.350354609929078, + "grad_norm": 2.4289376735687256, + "learning_rate": 4.431759138772127e-06, + "loss": 0.5122, + "step": 2856 + }, + { + "epoch": 1.3508274231678488, + "grad_norm": 2.6329853534698486, + "learning_rate": 4.43136309389781e-06, + "loss": 0.5332, + "step": 2857 + }, + { + "epoch": 1.3513002364066193, + "grad_norm": 2.431103229522705, + "learning_rate": 4.430966928765982e-06, + "loss": 0.4863, + "step": 2858 + }, + { + "epoch": 1.3517730496453901, + "grad_norm": 2.7529025077819824, + "learning_rate": 4.4305706434013106e-06, + "loss": 0.5263, + "step": 2859 + }, + { + "epoch": 1.3522458628841607, + "grad_norm": 2.884605646133423, + "learning_rate": 4.43017423782847e-06, + "loss": 0.564, + "step": 2860 + }, + { + "epoch": 1.3527186761229315, + "grad_norm": 3.027771234512329, + "learning_rate": 4.4297777120721435e-06, + "loss": 0.5846, + "step": 2861 + }, + { + "epoch": 1.353191489361702, + "grad_norm": 3.0140626430511475, + "learning_rate": 4.4293810661570205e-06, + "loss": 0.6621, + "step": 2862 + }, + { + "epoch": 1.3536643026004729, + "grad_norm": 2.721799612045288, + "learning_rate": 4.428984300107799e-06, + "loss": 0.5566, + "step": 2863 + }, + { + "epoch": 1.3541371158392435, + "grad_norm": 3.0016496181488037, + "learning_rate": 4.428587413949183e-06, + "loss": 0.5525, + "step": 2864 + }, + { + "epoch": 1.3546099290780143, + "grad_norm": 2.77138614654541, + "learning_rate": 4.428190407705886e-06, + "loss": 0.6016, + "step": 2865 + }, + { + "epoch": 1.3550827423167848, + "grad_norm": 2.9783477783203125, + "learning_rate": 4.427793281402627e-06, + "loss": 0.5556, + "step": 2866 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 2.2490382194519043, + "learning_rate": 4.427396035064132e-06, + "loss": 0.5138, + "step": 2867 + }, + { + "epoch": 1.3560283687943262, + "grad_norm": 2.442225217819214, + "learning_rate": 4.426998668715139e-06, + "loss": 0.4843, + "step": 2868 + }, + { + "epoch": 1.356501182033097, + "grad_norm": 2.74040150642395, + "learning_rate": 4.426601182380388e-06, + "loss": 0.54, + "step": 2869 + }, + { + "epoch": 1.3569739952718676, + "grad_norm": 2.4434332847595215, + "learning_rate": 4.426203576084629e-06, + "loss": 0.5199, + "step": 2870 + }, + { + "epoch": 1.3574468085106384, + "grad_norm": 2.6380388736724854, + "learning_rate": 4.42580584985262e-06, + "loss": 0.5049, + "step": 2871 + }, + { + "epoch": 1.357919621749409, + "grad_norm": 2.7324254512786865, + "learning_rate": 4.425408003709125e-06, + "loss": 0.5036, + "step": 2872 + }, + { + "epoch": 1.3583924349881797, + "grad_norm": 2.661012649536133, + "learning_rate": 4.425010037678916e-06, + "loss": 0.4965, + "step": 2873 + }, + { + "epoch": 1.3588652482269503, + "grad_norm": 2.5380208492279053, + "learning_rate": 4.424611951786773e-06, + "loss": 0.4293, + "step": 2874 + }, + { + "epoch": 
1.3593380614657211, + "grad_norm": 2.6060714721679688, + "learning_rate": 4.424213746057483e-06, + "loss": 0.5335, + "step": 2875 + }, + { + "epoch": 1.3598108747044917, + "grad_norm": 2.98282527923584, + "learning_rate": 4.423815420515841e-06, + "loss": 0.5626, + "step": 2876 + }, + { + "epoch": 1.3602836879432625, + "grad_norm": 2.779371500015259, + "learning_rate": 4.423416975186647e-06, + "loss": 0.5353, + "step": 2877 + }, + { + "epoch": 1.360756501182033, + "grad_norm": 2.8033530712127686, + "learning_rate": 4.423018410094713e-06, + "loss": 0.538, + "step": 2878 + }, + { + "epoch": 1.3612293144208039, + "grad_norm": 3.225177764892578, + "learning_rate": 4.422619725264855e-06, + "loss": 0.5441, + "step": 2879 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 2.959135055541992, + "learning_rate": 4.422220920721896e-06, + "loss": 0.5293, + "step": 2880 + }, + { + "epoch": 1.3621749408983452, + "grad_norm": 2.5558884143829346, + "learning_rate": 4.4218219964906704e-06, + "loss": 0.442, + "step": 2881 + }, + { + "epoch": 1.3626477541371158, + "grad_norm": 2.694899797439575, + "learning_rate": 4.421422952596015e-06, + "loss": 0.5318, + "step": 2882 + }, + { + "epoch": 1.3631205673758866, + "grad_norm": 2.7909531593322754, + "learning_rate": 4.421023789062777e-06, + "loss": 0.6648, + "step": 2883 + }, + { + "epoch": 1.3635933806146572, + "grad_norm": 2.421995162963867, + "learning_rate": 4.420624505915813e-06, + "loss": 0.4644, + "step": 2884 + }, + { + "epoch": 1.364066193853428, + "grad_norm": 2.5876688957214355, + "learning_rate": 4.420225103179981e-06, + "loss": 0.5743, + "step": 2885 + }, + { + "epoch": 1.3645390070921986, + "grad_norm": 2.89341139793396, + "learning_rate": 4.419825580880152e-06, + "loss": 0.5454, + "step": 2886 + }, + { + "epoch": 1.3650118203309693, + "grad_norm": 2.534708261489868, + "learning_rate": 4.419425939041203e-06, + "loss": 0.5572, + "step": 2887 + }, + { + "epoch": 1.36548463356974, + "grad_norm": 2.6052141189575195, + "learning_rate": 4.419026177688017e-06, + "loss": 0.4763, + "step": 2888 + }, + { + "epoch": 1.3659574468085105, + "grad_norm": 2.723720073699951, + "learning_rate": 4.4186262968454854e-06, + "loss": 0.5659, + "step": 2889 + }, + { + "epoch": 1.3664302600472813, + "grad_norm": 2.8909599781036377, + "learning_rate": 4.418226296538507e-06, + "loss": 0.4996, + "step": 2890 + }, + { + "epoch": 1.366903073286052, + "grad_norm": 2.551375389099121, + "learning_rate": 4.417826176791988e-06, + "loss": 0.5259, + "step": 2891 + }, + { + "epoch": 1.3673758865248227, + "grad_norm": 3.360267162322998, + "learning_rate": 4.417425937630843e-06, + "loss": 0.5381, + "step": 2892 + }, + { + "epoch": 1.3678486997635932, + "grad_norm": 2.7611942291259766, + "learning_rate": 4.417025579079992e-06, + "loss": 0.6022, + "step": 2893 + }, + { + "epoch": 1.368321513002364, + "grad_norm": 2.5931224822998047, + "learning_rate": 4.416625101164365e-06, + "loss": 0.5102, + "step": 2894 + }, + { + "epoch": 1.3687943262411348, + "grad_norm": 2.5888102054595947, + "learning_rate": 4.416224503908897e-06, + "loss": 0.4955, + "step": 2895 + }, + { + "epoch": 1.3692671394799054, + "grad_norm": 2.6262896060943604, + "learning_rate": 4.41582378733853e-06, + "loss": 0.5101, + "step": 2896 + }, + { + "epoch": 1.369739952718676, + "grad_norm": 3.339170217514038, + "learning_rate": 4.415422951478218e-06, + "loss": 0.4939, + "step": 2897 + }, + { + "epoch": 1.3702127659574468, + "grad_norm": 2.940866708755493, + "learning_rate": 4.415021996352917e-06, + "loss": 0.5157, + "step": 
2898 + }, + { + "epoch": 1.3706855791962176, + "grad_norm": 2.7423818111419678, + "learning_rate": 4.414620921987594e-06, + "loss": 0.5308, + "step": 2899 + }, + { + "epoch": 1.3711583924349882, + "grad_norm": 2.7177040576934814, + "learning_rate": 4.414219728407221e-06, + "loss": 0.5429, + "step": 2900 + }, + { + "epoch": 1.3716312056737587, + "grad_norm": 2.560774087905884, + "learning_rate": 4.4138184156367794e-06, + "loss": 0.5266, + "step": 2901 + }, + { + "epoch": 1.3721040189125295, + "grad_norm": 2.5649116039276123, + "learning_rate": 4.413416983701256e-06, + "loss": 0.4718, + "step": 2902 + }, + { + "epoch": 1.3725768321513003, + "grad_norm": 2.8547167778015137, + "learning_rate": 4.413015432625648e-06, + "loss": 0.5129, + "step": 2903 + }, + { + "epoch": 1.373049645390071, + "grad_norm": 2.5413618087768555, + "learning_rate": 4.412613762434958e-06, + "loss": 0.5738, + "step": 2904 + }, + { + "epoch": 1.3735224586288415, + "grad_norm": 3.3252241611480713, + "learning_rate": 4.412211973154195e-06, + "loss": 0.5639, + "step": 2905 + }, + { + "epoch": 1.3739952718676123, + "grad_norm": 2.869102954864502, + "learning_rate": 4.411810064808376e-06, + "loss": 0.5384, + "step": 2906 + }, + { + "epoch": 1.374468085106383, + "grad_norm": 2.703199863433838, + "learning_rate": 4.411408037422529e-06, + "loss": 0.5742, + "step": 2907 + }, + { + "epoch": 1.3749408983451537, + "grad_norm": 2.685450792312622, + "learning_rate": 4.411005891021684e-06, + "loss": 0.5121, + "step": 2908 + }, + { + "epoch": 1.3754137115839242, + "grad_norm": 2.9572203159332275, + "learning_rate": 4.410603625630882e-06, + "loss": 0.5444, + "step": 2909 + }, + { + "epoch": 1.375886524822695, + "grad_norm": 2.707002878189087, + "learning_rate": 4.410201241275169e-06, + "loss": 0.5125, + "step": 2910 + }, + { + "epoch": 1.3763593380614658, + "grad_norm": 3.0158939361572266, + "learning_rate": 4.409798737979602e-06, + "loss": 0.5299, + "step": 2911 + }, + { + "epoch": 1.3768321513002364, + "grad_norm": 2.7932698726654053, + "learning_rate": 4.4093961157692415e-06, + "loss": 0.5437, + "step": 2912 + }, + { + "epoch": 1.377304964539007, + "grad_norm": 2.459510326385498, + "learning_rate": 4.408993374669156e-06, + "loss": 0.5548, + "step": 2913 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 2.7500696182250977, + "learning_rate": 4.408590514704425e-06, + "loss": 0.5186, + "step": 2914 + }, + { + "epoch": 1.3782505910165486, + "grad_norm": 2.7824268341064453, + "learning_rate": 4.4081875359001315e-06, + "loss": 0.4762, + "step": 2915 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 2.4202158451080322, + "learning_rate": 4.4077844382813675e-06, + "loss": 0.5005, + "step": 2916 + }, + { + "epoch": 1.3791962174940897, + "grad_norm": 2.5566670894622803, + "learning_rate": 4.4073812218732316e-06, + "loss": 0.5377, + "step": 2917 + }, + { + "epoch": 1.3796690307328605, + "grad_norm": 3.400874376296997, + "learning_rate": 4.406977886700831e-06, + "loss": 0.6637, + "step": 2918 + }, + { + "epoch": 1.3801418439716313, + "grad_norm": 2.8187878131866455, + "learning_rate": 4.406574432789278e-06, + "loss": 0.5033, + "step": 2919 + }, + { + "epoch": 1.3806146572104019, + "grad_norm": 2.5578041076660156, + "learning_rate": 4.406170860163697e-06, + "loss": 0.5293, + "step": 2920 + }, + { + "epoch": 1.3810874704491725, + "grad_norm": 2.6709718704223633, + "learning_rate": 4.405767168849213e-06, + "loss": 0.5144, + "step": 2921 + }, + { + "epoch": 1.3815602836879433, + "grad_norm": 3.049365997314453, + "learning_rate": 
4.405363358870965e-06, + "loss": 0.4894, + "step": 2922 + }, + { + "epoch": 1.382033096926714, + "grad_norm": 2.5569891929626465, + "learning_rate": 4.404959430254095e-06, + "loss": 0.4929, + "step": 2923 + }, + { + "epoch": 1.3825059101654846, + "grad_norm": 2.8288230895996094, + "learning_rate": 4.404555383023754e-06, + "loss": 0.5438, + "step": 2924 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 2.8363358974456787, + "learning_rate": 4.404151217205102e-06, + "loss": 0.545, + "step": 2925 + }, + { + "epoch": 1.383451536643026, + "grad_norm": 2.720972776412964, + "learning_rate": 4.403746932823302e-06, + "loss": 0.5732, + "step": 2926 + }, + { + "epoch": 1.3839243498817968, + "grad_norm": 2.728043794631958, + "learning_rate": 4.403342529903528e-06, + "loss": 0.4944, + "step": 2927 + }, + { + "epoch": 1.3843971631205674, + "grad_norm": 2.4366135597229004, + "learning_rate": 4.402938008470961e-06, + "loss": 0.4441, + "step": 2928 + }, + { + "epoch": 1.384869976359338, + "grad_norm": 2.858454704284668, + "learning_rate": 4.402533368550788e-06, + "loss": 0.5359, + "step": 2929 + }, + { + "epoch": 1.3853427895981087, + "grad_norm": 2.805795907974243, + "learning_rate": 4.402128610168205e-06, + "loss": 0.4954, + "step": 2930 + }, + { + "epoch": 1.3858156028368795, + "grad_norm": 3.3514177799224854, + "learning_rate": 4.401723733348413e-06, + "loss": 0.579, + "step": 2931 + }, + { + "epoch": 1.3862884160756501, + "grad_norm": 2.6255125999450684, + "learning_rate": 4.401318738116624e-06, + "loss": 0.5002, + "step": 2932 + }, + { + "epoch": 1.3867612293144207, + "grad_norm": 2.3480796813964844, + "learning_rate": 4.400913624498054e-06, + "loss": 0.4688, + "step": 2933 + }, + { + "epoch": 1.3872340425531915, + "grad_norm": 2.710165023803711, + "learning_rate": 4.400508392517927e-06, + "loss": 0.5099, + "step": 2934 + }, + { + "epoch": 1.3877068557919623, + "grad_norm": 2.5820295810699463, + "learning_rate": 4.400103042201477e-06, + "loss": 0.512, + "step": 2935 + }, + { + "epoch": 1.3881796690307329, + "grad_norm": 2.750596523284912, + "learning_rate": 4.399697573573942e-06, + "loss": 0.463, + "step": 2936 + }, + { + "epoch": 1.3886524822695034, + "grad_norm": 3.497537612915039, + "learning_rate": 4.399291986660569e-06, + "loss": 0.5676, + "step": 2937 + }, + { + "epoch": 1.3891252955082742, + "grad_norm": 2.4046003818511963, + "learning_rate": 4.398886281486612e-06, + "loss": 0.5408, + "step": 2938 + }, + { + "epoch": 1.389598108747045, + "grad_norm": 2.941606283187866, + "learning_rate": 4.398480458077332e-06, + "loss": 0.5734, + "step": 2939 + }, + { + "epoch": 1.3900709219858156, + "grad_norm": 3.030214309692383, + "learning_rate": 4.398074516458e-06, + "loss": 0.5353, + "step": 2940 + }, + { + "epoch": 1.3905437352245862, + "grad_norm": 2.9991626739501953, + "learning_rate": 4.397668456653889e-06, + "loss": 0.5989, + "step": 2941 + }, + { + "epoch": 1.391016548463357, + "grad_norm": 4.163141250610352, + "learning_rate": 4.397262278690285e-06, + "loss": 0.5436, + "step": 2942 + }, + { + "epoch": 1.3914893617021278, + "grad_norm": 2.6576037406921387, + "learning_rate": 4.396855982592478e-06, + "loss": 0.5206, + "step": 2943 + }, + { + "epoch": 1.3919621749408984, + "grad_norm": 2.7729203701019287, + "learning_rate": 4.396449568385768e-06, + "loss": 0.5403, + "step": 2944 + }, + { + "epoch": 1.392434988179669, + "grad_norm": 2.4560446739196777, + "learning_rate": 4.396043036095457e-06, + "loss": 0.4924, + "step": 2945 + }, + { + "epoch": 1.3929078014184397, + "grad_norm": 
2.6370556354522705, + "learning_rate": 4.39563638574686e-06, + "loss": 0.5543, + "step": 2946 + }, + { + "epoch": 1.3933806146572105, + "grad_norm": 2.593914270401001, + "learning_rate": 4.395229617365298e-06, + "loss": 0.5133, + "step": 2947 + }, + { + "epoch": 1.393853427895981, + "grad_norm": 2.3583998680114746, + "learning_rate": 4.394822730976099e-06, + "loss": 0.4436, + "step": 2948 + }, + { + "epoch": 1.3943262411347517, + "grad_norm": 3.2768537998199463, + "learning_rate": 4.394415726604596e-06, + "loss": 0.5489, + "step": 2949 + }, + { + "epoch": 1.3947990543735225, + "grad_norm": 2.88662052154541, + "learning_rate": 4.394008604276133e-06, + "loss": 0.5194, + "step": 2950 + }, + { + "epoch": 1.3952718676122933, + "grad_norm": 2.46610426902771, + "learning_rate": 4.393601364016059e-06, + "loss": 0.5255, + "step": 2951 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 3.122509241104126, + "learning_rate": 4.393194005849731e-06, + "loss": 0.6046, + "step": 2952 + }, + { + "epoch": 1.3962174940898344, + "grad_norm": 2.724926471710205, + "learning_rate": 4.392786529802513e-06, + "loss": 0.4958, + "step": 2953 + }, + { + "epoch": 1.3966903073286052, + "grad_norm": 2.491485595703125, + "learning_rate": 4.3923789358997785e-06, + "loss": 0.5209, + "step": 2954 + }, + { + "epoch": 1.397163120567376, + "grad_norm": 2.61110520362854, + "learning_rate": 4.3919712241669056e-06, + "loss": 0.5202, + "step": 2955 + }, + { + "epoch": 1.3976359338061466, + "grad_norm": 2.3814501762390137, + "learning_rate": 4.39156339462928e-06, + "loss": 0.4966, + "step": 2956 + }, + { + "epoch": 1.3981087470449172, + "grad_norm": 2.762498617172241, + "learning_rate": 4.391155447312296e-06, + "loss": 0.6025, + "step": 2957 + }, + { + "epoch": 1.398581560283688, + "grad_norm": 2.964975595474243, + "learning_rate": 4.390747382241355e-06, + "loss": 0.4845, + "step": 2958 + }, + { + "epoch": 1.3990543735224588, + "grad_norm": 3.0117249488830566, + "learning_rate": 4.3903391994418655e-06, + "loss": 0.5326, + "step": 2959 + }, + { + "epoch": 1.3995271867612293, + "grad_norm": 2.578626871109009, + "learning_rate": 4.389930898939243e-06, + "loss": 0.5271, + "step": 2960 + }, + { + "epoch": 1.4, + "grad_norm": 2.747441053390503, + "learning_rate": 4.38952248075891e-06, + "loss": 0.5553, + "step": 2961 + }, + { + "epoch": 1.4004728132387707, + "grad_norm": 2.8273086547851562, + "learning_rate": 4.389113944926297e-06, + "loss": 0.5475, + "step": 2962 + }, + { + "epoch": 1.4009456264775415, + "grad_norm": 2.55238676071167, + "learning_rate": 4.388705291466843e-06, + "loss": 0.4864, + "step": 2963 + }, + { + "epoch": 1.401418439716312, + "grad_norm": 2.597214460372925, + "learning_rate": 4.388296520405992e-06, + "loss": 0.4845, + "step": 2964 + }, + { + "epoch": 1.4018912529550827, + "grad_norm": 2.608962297439575, + "learning_rate": 4.387887631769196e-06, + "loss": 0.5544, + "step": 2965 + }, + { + "epoch": 1.4023640661938535, + "grad_norm": 2.2754876613616943, + "learning_rate": 4.3874786255819165e-06, + "loss": 0.5045, + "step": 2966 + }, + { + "epoch": 1.4028368794326243, + "grad_norm": 2.9900264739990234, + "learning_rate": 4.387069501869618e-06, + "loss": 0.562, + "step": 2967 + }, + { + "epoch": 1.4033096926713948, + "grad_norm": 2.8069417476654053, + "learning_rate": 4.386660260657778e-06, + "loss": 0.5284, + "step": 2968 + }, + { + "epoch": 1.4037825059101654, + "grad_norm": 2.68894624710083, + "learning_rate": 4.386250901971875e-06, + "loss": 0.5879, + "step": 2969 + }, + { + "epoch": 1.4042553191489362, + 
"grad_norm": 2.614485025405884, + "learning_rate": 4.385841425837399e-06, + "loss": 0.4771, + "step": 2970 + }, + { + "epoch": 1.4047281323877068, + "grad_norm": 2.487950325012207, + "learning_rate": 4.385431832279848e-06, + "loss": 0.5552, + "step": 2971 + }, + { + "epoch": 1.4052009456264776, + "grad_norm": 2.5098392963409424, + "learning_rate": 4.385022121324723e-06, + "loss": 0.5267, + "step": 2972 + }, + { + "epoch": 1.4056737588652481, + "grad_norm": 2.825838565826416, + "learning_rate": 4.384612292997537e-06, + "loss": 0.5336, + "step": 2973 + }, + { + "epoch": 1.406146572104019, + "grad_norm": 2.898188829421997, + "learning_rate": 4.384202347323806e-06, + "loss": 0.5685, + "step": 2974 + }, + { + "epoch": 1.4066193853427895, + "grad_norm": 2.8722569942474365, + "learning_rate": 4.383792284329057e-06, + "loss": 0.5977, + "step": 2975 + }, + { + "epoch": 1.4070921985815603, + "grad_norm": 2.832951307296753, + "learning_rate": 4.3833821040388235e-06, + "loss": 0.5766, + "step": 2976 + }, + { + "epoch": 1.407565011820331, + "grad_norm": 2.7353670597076416, + "learning_rate": 4.3829718064786446e-06, + "loss": 0.5461, + "step": 2977 + }, + { + "epoch": 1.4080378250591017, + "grad_norm": 2.6050429344177246, + "learning_rate": 4.3825613916740675e-06, + "loss": 0.5501, + "step": 2978 + }, + { + "epoch": 1.4085106382978723, + "grad_norm": 2.79719877243042, + "learning_rate": 4.382150859650647e-06, + "loss": 0.502, + "step": 2979 + }, + { + "epoch": 1.408983451536643, + "grad_norm": 2.5538079738616943, + "learning_rate": 4.381740210433946e-06, + "loss": 0.4762, + "step": 2980 + }, + { + "epoch": 1.4094562647754136, + "grad_norm": 2.7256062030792236, + "learning_rate": 4.381329444049533e-06, + "loss": 0.4692, + "step": 2981 + }, + { + "epoch": 1.4099290780141844, + "grad_norm": 2.7778146266937256, + "learning_rate": 4.3809185605229855e-06, + "loss": 0.5366, + "step": 2982 + }, + { + "epoch": 1.410401891252955, + "grad_norm": 2.6289451122283936, + "learning_rate": 4.380507559879887e-06, + "loss": 0.5412, + "step": 2983 + }, + { + "epoch": 1.4108747044917258, + "grad_norm": 2.697204828262329, + "learning_rate": 4.380096442145827e-06, + "loss": 0.5065, + "step": 2984 + }, + { + "epoch": 1.4113475177304964, + "grad_norm": 2.4709219932556152, + "learning_rate": 4.379685207346407e-06, + "loss": 0.568, + "step": 2985 + }, + { + "epoch": 1.4118203309692672, + "grad_norm": 2.9740655422210693, + "learning_rate": 4.379273855507231e-06, + "loss": 0.5512, + "step": 2986 + }, + { + "epoch": 1.4122931442080378, + "grad_norm": 3.0090627670288086, + "learning_rate": 4.378862386653911e-06, + "loss": 0.5459, + "step": 2987 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 2.8835368156433105, + "learning_rate": 4.378450800812071e-06, + "loss": 0.5357, + "step": 2988 + }, + { + "epoch": 1.4132387706855791, + "grad_norm": 2.558824062347412, + "learning_rate": 4.378039098007335e-06, + "loss": 0.536, + "step": 2989 + }, + { + "epoch": 1.41371158392435, + "grad_norm": 2.5572092533111572, + "learning_rate": 4.377627278265339e-06, + "loss": 0.5183, + "step": 2990 + }, + { + "epoch": 1.4141843971631205, + "grad_norm": 2.7356579303741455, + "learning_rate": 4.377215341611727e-06, + "loss": 0.5087, + "step": 2991 + }, + { + "epoch": 1.4146572104018913, + "grad_norm": 2.7541024684906006, + "learning_rate": 4.376803288072146e-06, + "loss": 0.4509, + "step": 2992 + }, + { + "epoch": 1.4151300236406619, + "grad_norm": 2.7548446655273438, + "learning_rate": 4.376391117672254e-06, + "loss": 0.5532, + "step": 2993 + }, + { + 
"epoch": 1.4156028368794327, + "grad_norm": 2.9107465744018555, + "learning_rate": 4.375978830437715e-06, + "loss": 0.5719, + "step": 2994 + }, + { + "epoch": 1.4160756501182032, + "grad_norm": 2.7077393531799316, + "learning_rate": 4.3755664263942e-06, + "loss": 0.5084, + "step": 2995 + }, + { + "epoch": 1.416548463356974, + "grad_norm": 2.764209270477295, + "learning_rate": 4.375153905567388e-06, + "loss": 0.5976, + "step": 2996 + }, + { + "epoch": 1.4170212765957446, + "grad_norm": 2.7792932987213135, + "learning_rate": 4.374741267982964e-06, + "loss": 0.5358, + "step": 2997 + }, + { + "epoch": 1.4174940898345154, + "grad_norm": 2.459212064743042, + "learning_rate": 4.374328513666622e-06, + "loss": 0.5181, + "step": 2998 + }, + { + "epoch": 1.417966903073286, + "grad_norm": 2.548546552658081, + "learning_rate": 4.373915642644062e-06, + "loss": 0.528, + "step": 2999 + }, + { + "epoch": 1.4184397163120568, + "grad_norm": 2.998138189315796, + "learning_rate": 4.373502654940992e-06, + "loss": 0.5233, + "step": 3000 + }, + { + "epoch": 1.4189125295508274, + "grad_norm": 2.604341983795166, + "learning_rate": 4.373089550583126e-06, + "loss": 0.5274, + "step": 3001 + }, + { + "epoch": 1.4193853427895982, + "grad_norm": 2.6792588233947754, + "learning_rate": 4.372676329596188e-06, + "loss": 0.5061, + "step": 3002 + }, + { + "epoch": 1.4198581560283687, + "grad_norm": 2.5182368755340576, + "learning_rate": 4.372262992005906e-06, + "loss": 0.541, + "step": 3003 + }, + { + "epoch": 1.4203309692671395, + "grad_norm": 2.690718173980713, + "learning_rate": 4.371849537838018e-06, + "loss": 0.5308, + "step": 3004 + }, + { + "epoch": 1.42080378250591, + "grad_norm": 2.6797590255737305, + "learning_rate": 4.371435967118266e-06, + "loss": 0.5728, + "step": 3005 + }, + { + "epoch": 1.421276595744681, + "grad_norm": 2.847900152206421, + "learning_rate": 4.371022279872403e-06, + "loss": 0.5053, + "step": 3006 + }, + { + "epoch": 1.4217494089834515, + "grad_norm": 2.497810125350952, + "learning_rate": 4.370608476126186e-06, + "loss": 0.5057, + "step": 3007 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 2.5259225368499756, + "learning_rate": 4.370194555905382e-06, + "loss": 0.5508, + "step": 3008 + }, + { + "epoch": 1.4226950354609929, + "grad_norm": 2.774118423461914, + "learning_rate": 4.369780519235763e-06, + "loss": 0.5419, + "step": 3009 + }, + { + "epoch": 1.4231678486997636, + "grad_norm": 2.2764663696289062, + "learning_rate": 4.369366366143111e-06, + "loss": 0.5032, + "step": 3010 + }, + { + "epoch": 1.4236406619385342, + "grad_norm": 2.736347198486328, + "learning_rate": 4.368952096653211e-06, + "loss": 0.5184, + "step": 3011 + }, + { + "epoch": 1.424113475177305, + "grad_norm": 2.476762056350708, + "learning_rate": 4.36853771079186e-06, + "loss": 0.5331, + "step": 3012 + }, + { + "epoch": 1.4245862884160756, + "grad_norm": 2.8006162643432617, + "learning_rate": 4.3681232085848585e-06, + "loss": 0.5331, + "step": 3013 + }, + { + "epoch": 1.4250591016548464, + "grad_norm": 2.509143590927124, + "learning_rate": 4.367708590058016e-06, + "loss": 0.5127, + "step": 3014 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 3.030137538909912, + "learning_rate": 4.3672938552371505e-06, + "loss": 0.5555, + "step": 3015 + }, + { + "epoch": 1.4260047281323878, + "grad_norm": 3.0536904335021973, + "learning_rate": 4.3668790041480835e-06, + "loss": 0.5241, + "step": 3016 + }, + { + "epoch": 1.4264775413711583, + "grad_norm": 2.6400439739227295, + "learning_rate": 4.366464036816647e-06, + "loss": 0.4946, + 
"step": 3017 + }, + { + "epoch": 1.4269503546099291, + "grad_norm": 2.7302589416503906, + "learning_rate": 4.366048953268679e-06, + "loss": 0.5105, + "step": 3018 + }, + { + "epoch": 1.4274231678486997, + "grad_norm": 2.504549264907837, + "learning_rate": 4.365633753530026e-06, + "loss": 0.4844, + "step": 3019 + }, + { + "epoch": 1.4278959810874705, + "grad_norm": 2.3872320652008057, + "learning_rate": 4.365218437626539e-06, + "loss": 0.4402, + "step": 3020 + }, + { + "epoch": 1.428368794326241, + "grad_norm": 2.531649351119995, + "learning_rate": 4.364803005584078e-06, + "loss": 0.4913, + "step": 3021 + }, + { + "epoch": 1.4288416075650119, + "grad_norm": 2.4683783054351807, + "learning_rate": 4.364387457428512e-06, + "loss": 0.515, + "step": 3022 + }, + { + "epoch": 1.4293144208037825, + "grad_norm": 2.632336378097534, + "learning_rate": 4.363971793185713e-06, + "loss": 0.5398, + "step": 3023 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 2.7456719875335693, + "learning_rate": 4.363556012881565e-06, + "loss": 0.5254, + "step": 3024 + }, + { + "epoch": 1.4302600472813238, + "grad_norm": 2.607177972793579, + "learning_rate": 4.363140116541955e-06, + "loss": 0.5266, + "step": 3025 + }, + { + "epoch": 1.4307328605200946, + "grad_norm": 2.640127420425415, + "learning_rate": 4.3627241041927796e-06, + "loss": 0.5157, + "step": 3026 + }, + { + "epoch": 1.4312056737588652, + "grad_norm": 2.4210736751556396, + "learning_rate": 4.362307975859941e-06, + "loss": 0.4599, + "step": 3027 + }, + { + "epoch": 1.431678486997636, + "grad_norm": 2.6007790565490723, + "learning_rate": 4.361891731569352e-06, + "loss": 0.5298, + "step": 3028 + }, + { + "epoch": 1.4321513002364066, + "grad_norm": 2.5352046489715576, + "learning_rate": 4.361475371346928e-06, + "loss": 0.5128, + "step": 3029 + }, + { + "epoch": 1.4326241134751774, + "grad_norm": 2.4204049110412598, + "learning_rate": 4.361058895218596e-06, + "loss": 0.4669, + "step": 3030 + }, + { + "epoch": 1.433096926713948, + "grad_norm": 2.525240182876587, + "learning_rate": 4.360642303210286e-06, + "loss": 0.4925, + "step": 3031 + }, + { + "epoch": 1.4335697399527187, + "grad_norm": 2.839646339416504, + "learning_rate": 4.360225595347939e-06, + "loss": 0.5868, + "step": 3032 + }, + { + "epoch": 1.4340425531914893, + "grad_norm": 2.5043296813964844, + "learning_rate": 4.359808771657501e-06, + "loss": 0.4951, + "step": 3033 + }, + { + "epoch": 1.4345153664302601, + "grad_norm": 2.9082300662994385, + "learning_rate": 4.359391832164927e-06, + "loss": 0.5259, + "step": 3034 + }, + { + "epoch": 1.4349881796690307, + "grad_norm": 2.6651999950408936, + "learning_rate": 4.3589747768961745e-06, + "loss": 0.537, + "step": 3035 + }, + { + "epoch": 1.4354609929078015, + "grad_norm": 2.577077865600586, + "learning_rate": 4.358557605877216e-06, + "loss": 0.5186, + "step": 3036 + }, + { + "epoch": 1.435933806146572, + "grad_norm": 2.7445287704467773, + "learning_rate": 4.3581403191340236e-06, + "loss": 0.5573, + "step": 3037 + }, + { + "epoch": 1.4364066193853429, + "grad_norm": 2.502086639404297, + "learning_rate": 4.357722916692582e-06, + "loss": 0.5039, + "step": 3038 + }, + { + "epoch": 1.4368794326241134, + "grad_norm": 2.4476163387298584, + "learning_rate": 4.357305398578879e-06, + "loss": 0.5638, + "step": 3039 + }, + { + "epoch": 1.4373522458628842, + "grad_norm": 2.7705588340759277, + "learning_rate": 4.356887764818915e-06, + "loss": 0.5485, + "step": 3040 + }, + { + "epoch": 1.4378250591016548, + "grad_norm": 2.498225450515747, + "learning_rate": 
4.356470015438691e-06, + "loss": 0.5486, + "step": 3041 + }, + { + "epoch": 1.4382978723404256, + "grad_norm": 2.394320011138916, + "learning_rate": 4.356052150464219e-06, + "loss": 0.512, + "step": 3042 + }, + { + "epoch": 1.4387706855791962, + "grad_norm": 2.8725767135620117, + "learning_rate": 4.3556341699215185e-06, + "loss": 0.5202, + "step": 3043 + }, + { + "epoch": 1.439243498817967, + "grad_norm": 3.1707918643951416, + "learning_rate": 4.355216073836615e-06, + "loss": 0.5229, + "step": 3044 + }, + { + "epoch": 1.4397163120567376, + "grad_norm": 2.532578468322754, + "learning_rate": 4.3547978622355415e-06, + "loss": 0.4569, + "step": 3045 + }, + { + "epoch": 1.4401891252955084, + "grad_norm": 3.0111029148101807, + "learning_rate": 4.354379535144338e-06, + "loss": 0.5801, + "step": 3046 + }, + { + "epoch": 1.440661938534279, + "grad_norm": 2.9554224014282227, + "learning_rate": 4.353961092589052e-06, + "loss": 0.5968, + "step": 3047 + }, + { + "epoch": 1.4411347517730497, + "grad_norm": 2.7562637329101562, + "learning_rate": 4.353542534595738e-06, + "loss": 0.5005, + "step": 3048 + }, + { + "epoch": 1.4416075650118203, + "grad_norm": 3.083254337310791, + "learning_rate": 4.3531238611904595e-06, + "loss": 0.5389, + "step": 3049 + }, + { + "epoch": 1.442080378250591, + "grad_norm": 2.7778005599975586, + "learning_rate": 4.352705072399282e-06, + "loss": 0.5342, + "step": 3050 + }, + { + "epoch": 1.4425531914893617, + "grad_norm": 2.6673996448516846, + "learning_rate": 4.3522861682482845e-06, + "loss": 0.5213, + "step": 3051 + }, + { + "epoch": 1.4430260047281322, + "grad_norm": 2.637605905532837, + "learning_rate": 4.351867148763548e-06, + "loss": 0.4893, + "step": 3052 + }, + { + "epoch": 1.443498817966903, + "grad_norm": 2.834469795227051, + "learning_rate": 4.351448013971166e-06, + "loss": 0.5391, + "step": 3053 + }, + { + "epoch": 1.4439716312056738, + "grad_norm": 2.824153184890747, + "learning_rate": 4.351028763897234e-06, + "loss": 0.6403, + "step": 3054 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 2.558966875076294, + "learning_rate": 4.350609398567857e-06, + "loss": 0.4912, + "step": 3055 + }, + { + "epoch": 1.444917257683215, + "grad_norm": 2.281726360321045, + "learning_rate": 4.3501899180091475e-06, + "loss": 0.4655, + "step": 3056 + }, + { + "epoch": 1.4453900709219858, + "grad_norm": 2.499472141265869, + "learning_rate": 4.349770322247225e-06, + "loss": 0.4878, + "step": 3057 + }, + { + "epoch": 1.4458628841607566, + "grad_norm": 2.578615188598633, + "learning_rate": 4.349350611308215e-06, + "loss": 0.4855, + "step": 3058 + }, + { + "epoch": 1.4463356973995272, + "grad_norm": 2.7111165523529053, + "learning_rate": 4.348930785218252e-06, + "loss": 0.5415, + "step": 3059 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 2.8081610202789307, + "learning_rate": 4.348510844003476e-06, + "loss": 0.4881, + "step": 3060 + }, + { + "epoch": 1.4472813238770685, + "grad_norm": 2.9439868927001953, + "learning_rate": 4.348090787690036e-06, + "loss": 0.5485, + "step": 3061 + }, + { + "epoch": 1.4477541371158393, + "grad_norm": 2.592532157897949, + "learning_rate": 4.347670616304085e-06, + "loss": 0.4912, + "step": 3062 + }, + { + "epoch": 1.44822695035461, + "grad_norm": 2.960592746734619, + "learning_rate": 4.347250329871787e-06, + "loss": 0.5473, + "step": 3063 + }, + { + "epoch": 1.4486997635933805, + "grad_norm": 2.5786688327789307, + "learning_rate": 4.3468299284193116e-06, + "loss": 0.5348, + "step": 3064 + }, + { + "epoch": 1.4491725768321513, + "grad_norm": 
2.6084046363830566, + "learning_rate": 4.346409411972834e-06, + "loss": 0.527, + "step": 3065 + }, + { + "epoch": 1.449645390070922, + "grad_norm": 2.489748239517212, + "learning_rate": 4.3459887805585385e-06, + "loss": 0.4943, + "step": 3066 + }, + { + "epoch": 1.4501182033096927, + "grad_norm": 2.452131986618042, + "learning_rate": 4.345568034202617e-06, + "loss": 0.4886, + "step": 3067 + }, + { + "epoch": 1.4505910165484632, + "grad_norm": 2.4034671783447266, + "learning_rate": 4.345147172931266e-06, + "loss": 0.4689, + "step": 3068 + }, + { + "epoch": 1.451063829787234, + "grad_norm": 2.6045448780059814, + "learning_rate": 4.344726196770691e-06, + "loss": 0.5842, + "step": 3069 + }, + { + "epoch": 1.4515366430260048, + "grad_norm": 2.697593927383423, + "learning_rate": 4.3443051057471045e-06, + "loss": 0.5358, + "step": 3070 + }, + { + "epoch": 1.4520094562647754, + "grad_norm": 2.6080820560455322, + "learning_rate": 4.343883899886727e-06, + "loss": 0.5361, + "step": 3071 + }, + { + "epoch": 1.452482269503546, + "grad_norm": 2.4605307579040527, + "learning_rate": 4.343462579215783e-06, + "loss": 0.4941, + "step": 3072 + }, + { + "epoch": 1.4529550827423168, + "grad_norm": 2.8025355339050293, + "learning_rate": 4.343041143760509e-06, + "loss": 0.5116, + "step": 3073 + }, + { + "epoch": 1.4534278959810876, + "grad_norm": 2.432515859603882, + "learning_rate": 4.3426195935471434e-06, + "loss": 0.4991, + "step": 3074 + }, + { + "epoch": 1.4539007092198581, + "grad_norm": 2.5838661193847656, + "learning_rate": 4.342197928601935e-06, + "loss": 0.4994, + "step": 3075 + }, + { + "epoch": 1.4543735224586287, + "grad_norm": 2.421692371368408, + "learning_rate": 4.341776148951141e-06, + "loss": 0.4945, + "step": 3076 + }, + { + "epoch": 1.4548463356973995, + "grad_norm": 2.5354676246643066, + "learning_rate": 4.341354254621021e-06, + "loss": 0.4859, + "step": 3077 + }, + { + "epoch": 1.4553191489361703, + "grad_norm": 2.7316789627075195, + "learning_rate": 4.340932245637846e-06, + "loss": 0.5136, + "step": 3078 + }, + { + "epoch": 1.455791962174941, + "grad_norm": 3.5903496742248535, + "learning_rate": 4.340510122027891e-06, + "loss": 0.6451, + "step": 3079 + }, + { + "epoch": 1.4562647754137115, + "grad_norm": 2.95190167427063, + "learning_rate": 4.340087883817442e-06, + "loss": 0.6354, + "step": 3080 + }, + { + "epoch": 1.4567375886524823, + "grad_norm": 2.8659214973449707, + "learning_rate": 4.339665531032789e-06, + "loss": 0.5514, + "step": 3081 + }, + { + "epoch": 1.457210401891253, + "grad_norm": 2.5681674480438232, + "learning_rate": 4.339243063700231e-06, + "loss": 0.5135, + "step": 3082 + }, + { + "epoch": 1.4576832151300236, + "grad_norm": 2.7353906631469727, + "learning_rate": 4.338820481846072e-06, + "loss": 0.4608, + "step": 3083 + }, + { + "epoch": 1.4581560283687942, + "grad_norm": 2.6116466522216797, + "learning_rate": 4.3383977854966245e-06, + "loss": 0.4924, + "step": 3084 + }, + { + "epoch": 1.458628841607565, + "grad_norm": 2.6676487922668457, + "learning_rate": 4.337974974678207e-06, + "loss": 0.5747, + "step": 3085 + }, + { + "epoch": 1.4591016548463358, + "grad_norm": 2.909031629562378, + "learning_rate": 4.337552049417147e-06, + "loss": 0.4618, + "step": 3086 + }, + { + "epoch": 1.4595744680851064, + "grad_norm": 2.7614190578460693, + "learning_rate": 4.33712900973978e-06, + "loss": 0.5154, + "step": 3087 + }, + { + "epoch": 1.460047281323877, + "grad_norm": 2.452188014984131, + "learning_rate": 4.336705855672444e-06, + "loss": 0.542, + "step": 3088 + }, + { + "epoch": 
1.4605200945626478, + "grad_norm": 3.0004117488861084, + "learning_rate": 4.336282587241488e-06, + "loss": 0.5857, + "step": 3089 + }, + { + "epoch": 1.4609929078014185, + "grad_norm": 2.870783567428589, + "learning_rate": 4.335859204473268e-06, + "loss": 0.5506, + "step": 3090 + }, + { + "epoch": 1.4614657210401891, + "grad_norm": 3.1078689098358154, + "learning_rate": 4.335435707394145e-06, + "loss": 0.5138, + "step": 3091 + }, + { + "epoch": 1.4619385342789597, + "grad_norm": 2.8516197204589844, + "learning_rate": 4.335012096030488e-06, + "loss": 0.5842, + "step": 3092 + }, + { + "epoch": 1.4624113475177305, + "grad_norm": 2.615922212600708, + "learning_rate": 4.334588370408675e-06, + "loss": 0.4896, + "step": 3093 + }, + { + "epoch": 1.4628841607565013, + "grad_norm": 3.1911802291870117, + "learning_rate": 4.334164530555088e-06, + "loss": 0.4974, + "step": 3094 + }, + { + "epoch": 1.4633569739952719, + "grad_norm": 3.075051784515381, + "learning_rate": 4.3337405764961186e-06, + "loss": 0.567, + "step": 3095 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 2.550625801086426, + "learning_rate": 4.333316508258163e-06, + "loss": 0.4887, + "step": 3096 + }, + { + "epoch": 1.4643026004728132, + "grad_norm": 2.3986475467681885, + "learning_rate": 4.332892325867629e-06, + "loss": 0.5047, + "step": 3097 + }, + { + "epoch": 1.464775413711584, + "grad_norm": 2.5045125484466553, + "learning_rate": 4.332468029350926e-06, + "loss": 0.4721, + "step": 3098 + }, + { + "epoch": 1.4652482269503546, + "grad_norm": 2.347365617752075, + "learning_rate": 4.332043618734474e-06, + "loss": 0.4913, + "step": 3099 + }, + { + "epoch": 1.4657210401891252, + "grad_norm": 2.459928512573242, + "learning_rate": 4.331619094044699e-06, + "loss": 0.523, + "step": 3100 + }, + { + "epoch": 1.466193853427896, + "grad_norm": 2.5771310329437256, + "learning_rate": 4.331194455308035e-06, + "loss": 0.593, + "step": 3101 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 3.1351823806762695, + "learning_rate": 4.330769702550921e-06, + "loss": 0.5852, + "step": 3102 + }, + { + "epoch": 1.4671394799054374, + "grad_norm": 2.589817523956299, + "learning_rate": 4.330344835799806e-06, + "loss": 0.508, + "step": 3103 + }, + { + "epoch": 1.467612293144208, + "grad_norm": 3.1140341758728027, + "learning_rate": 4.329919855081144e-06, + "loss": 0.469, + "step": 3104 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 2.8186635971069336, + "learning_rate": 4.329494760421396e-06, + "loss": 0.5088, + "step": 3105 + }, + { + "epoch": 1.4685579196217495, + "grad_norm": 2.676077365875244, + "learning_rate": 4.329069551847031e-06, + "loss": 0.52, + "step": 3106 + }, + { + "epoch": 1.46903073286052, + "grad_norm": 2.5543313026428223, + "learning_rate": 4.328644229384526e-06, + "loss": 0.5066, + "step": 3107 + }, + { + "epoch": 1.4695035460992907, + "grad_norm": 2.8176217079162598, + "learning_rate": 4.328218793060362e-06, + "loss": 0.6404, + "step": 3108 + }, + { + "epoch": 1.4699763593380615, + "grad_norm": 2.485217332839966, + "learning_rate": 4.3277932429010314e-06, + "loss": 0.4578, + "step": 3109 + }, + { + "epoch": 1.4704491725768323, + "grad_norm": 2.6741621494293213, + "learning_rate": 4.327367578933031e-06, + "loss": 0.5068, + "step": 3110 + }, + { + "epoch": 1.4709219858156029, + "grad_norm": 2.377242088317871, + "learning_rate": 4.326941801182863e-06, + "loss": 0.5249, + "step": 3111 + }, + { + "epoch": 1.4713947990543734, + "grad_norm": 2.790046215057373, + "learning_rate": 4.32651590967704e-06, + "loss": 0.5532, + "step": 
3112 + }, + { + "epoch": 1.4718676122931442, + "grad_norm": 2.78019642829895, + "learning_rate": 4.326089904442081e-06, + "loss": 0.5362, + "step": 3113 + }, + { + "epoch": 1.472340425531915, + "grad_norm": 2.5661380290985107, + "learning_rate": 4.32566378550451e-06, + "loss": 0.5041, + "step": 3114 + }, + { + "epoch": 1.4728132387706856, + "grad_norm": 2.522153615951538, + "learning_rate": 4.3252375528908605e-06, + "loss": 0.5074, + "step": 3115 + }, + { + "epoch": 1.4732860520094562, + "grad_norm": 2.874688148498535, + "learning_rate": 4.3248112066276725e-06, + "loss": 0.59, + "step": 3116 + }, + { + "epoch": 1.473758865248227, + "grad_norm": 3.067866802215576, + "learning_rate": 4.324384746741492e-06, + "loss": 0.5924, + "step": 3117 + }, + { + "epoch": 1.4742316784869978, + "grad_norm": 3.359463930130005, + "learning_rate": 4.323958173258873e-06, + "loss": 0.6346, + "step": 3118 + }, + { + "epoch": 1.4747044917257683, + "grad_norm": 2.193024158477783, + "learning_rate": 4.323531486206376e-06, + "loss": 0.4594, + "step": 3119 + }, + { + "epoch": 1.475177304964539, + "grad_norm": 2.886889934539795, + "learning_rate": 4.323104685610569e-06, + "loss": 0.523, + "step": 3120 + }, + { + "epoch": 1.4756501182033097, + "grad_norm": 2.7558681964874268, + "learning_rate": 4.322677771498028e-06, + "loss": 0.5387, + "step": 3121 + }, + { + "epoch": 1.4761229314420805, + "grad_norm": 2.639277935028076, + "learning_rate": 4.322250743895335e-06, + "loss": 0.5599, + "step": 3122 + }, + { + "epoch": 1.476595744680851, + "grad_norm": 2.786198616027832, + "learning_rate": 4.321823602829078e-06, + "loss": 0.5405, + "step": 3123 + }, + { + "epoch": 1.4770685579196217, + "grad_norm": 2.582315683364868, + "learning_rate": 4.321396348325853e-06, + "loss": 0.4452, + "step": 3124 + }, + { + "epoch": 1.4775413711583925, + "grad_norm": 2.8574297428131104, + "learning_rate": 4.320968980412265e-06, + "loss": 0.4846, + "step": 3125 + }, + { + "epoch": 1.4780141843971633, + "grad_norm": 2.705281972885132, + "learning_rate": 4.320541499114922e-06, + "loss": 0.5548, + "step": 3126 + }, + { + "epoch": 1.4784869976359338, + "grad_norm": 2.3152754306793213, + "learning_rate": 4.320113904460444e-06, + "loss": 0.5216, + "step": 3127 + }, + { + "epoch": 1.4789598108747044, + "grad_norm": 3.230764150619507, + "learning_rate": 4.319686196475453e-06, + "loss": 0.6192, + "step": 3128 + }, + { + "epoch": 1.4794326241134752, + "grad_norm": 2.463380813598633, + "learning_rate": 4.319258375186583e-06, + "loss": 0.4872, + "step": 3129 + }, + { + "epoch": 1.479905437352246, + "grad_norm": 2.8477656841278076, + "learning_rate": 4.31883044062047e-06, + "loss": 0.5371, + "step": 3130 + }, + { + "epoch": 1.4803782505910166, + "grad_norm": 2.393911123275757, + "learning_rate": 4.318402392803762e-06, + "loss": 0.5334, + "step": 3131 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 2.6113736629486084, + "learning_rate": 4.317974231763109e-06, + "loss": 0.5572, + "step": 3132 + }, + { + "epoch": 1.481323877068558, + "grad_norm": 2.3941731452941895, + "learning_rate": 4.317545957525173e-06, + "loss": 0.4849, + "step": 3133 + }, + { + "epoch": 1.4817966903073285, + "grad_norm": 2.9536755084991455, + "learning_rate": 4.317117570116619e-06, + "loss": 0.6058, + "step": 3134 + }, + { + "epoch": 1.4822695035460993, + "grad_norm": 2.595754623413086, + "learning_rate": 4.316689069564123e-06, + "loss": 0.5193, + "step": 3135 + }, + { + "epoch": 1.48274231678487, + "grad_norm": 2.569833993911743, + "learning_rate": 4.316260455894364e-06, + "loss": 
0.543, + "step": 3136 + }, + { + "epoch": 1.4832151300236407, + "grad_norm": 2.5137455463409424, + "learning_rate": 4.315831729134031e-06, + "loss": 0.5415, + "step": 3137 + }, + { + "epoch": 1.4836879432624113, + "grad_norm": 2.5582292079925537, + "learning_rate": 4.3154028893098176e-06, + "loss": 0.5338, + "step": 3138 + }, + { + "epoch": 1.484160756501182, + "grad_norm": 2.666426181793213, + "learning_rate": 4.3149739364484265e-06, + "loss": 0.5435, + "step": 3139 + }, + { + "epoch": 1.4846335697399526, + "grad_norm": 2.790851354598999, + "learning_rate": 4.314544870576568e-06, + "loss": 0.5746, + "step": 3140 + }, + { + "epoch": 1.4851063829787234, + "grad_norm": 2.620326042175293, + "learning_rate": 4.314115691720956e-06, + "loss": 0.5076, + "step": 3141 + }, + { + "epoch": 1.485579196217494, + "grad_norm": 3.075674533843994, + "learning_rate": 4.313686399908314e-06, + "loss": 0.5486, + "step": 3142 + }, + { + "epoch": 1.4860520094562648, + "grad_norm": 3.1347315311431885, + "learning_rate": 4.3132569951653745e-06, + "loss": 0.531, + "step": 3143 + }, + { + "epoch": 1.4865248226950354, + "grad_norm": 2.5783653259277344, + "learning_rate": 4.312827477518871e-06, + "loss": 0.5818, + "step": 3144 + }, + { + "epoch": 1.4869976359338062, + "grad_norm": 3.0247137546539307, + "learning_rate": 4.3123978469955505e-06, + "loss": 0.5347, + "step": 3145 + }, + { + "epoch": 1.4874704491725768, + "grad_norm": 2.4789345264434814, + "learning_rate": 4.311968103622163e-06, + "loss": 0.5, + "step": 3146 + }, + { + "epoch": 1.4879432624113476, + "grad_norm": 2.663341522216797, + "learning_rate": 4.311538247425466e-06, + "loss": 0.4825, + "step": 3147 + }, + { + "epoch": 1.4884160756501181, + "grad_norm": 2.633711099624634, + "learning_rate": 4.311108278432226e-06, + "loss": 0.5244, + "step": 3148 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 2.51312518119812, + "learning_rate": 4.310678196669216e-06, + "loss": 0.513, + "step": 3149 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 2.5263755321502686, + "learning_rate": 4.310248002163214e-06, + "loss": 0.5236, + "step": 3150 + }, + { + "epoch": 1.4898345153664303, + "grad_norm": 2.559216260910034, + "learning_rate": 4.309817694941007e-06, + "loss": 0.5107, + "step": 3151 + }, + { + "epoch": 1.4903073286052009, + "grad_norm": 2.5023303031921387, + "learning_rate": 4.309387275029386e-06, + "loss": 0.4685, + "step": 3152 + }, + { + "epoch": 1.4907801418439717, + "grad_norm": 3.0314254760742188, + "learning_rate": 4.308956742455155e-06, + "loss": 0.5462, + "step": 3153 + }, + { + "epoch": 1.4912529550827422, + "grad_norm": 2.675295114517212, + "learning_rate": 4.308526097245119e-06, + "loss": 0.5398, + "step": 3154 + }, + { + "epoch": 1.491725768321513, + "grad_norm": 2.6613399982452393, + "learning_rate": 4.308095339426094e-06, + "loss": 0.5376, + "step": 3155 + }, + { + "epoch": 1.4921985815602836, + "grad_norm": 2.58937668800354, + "learning_rate": 4.307664469024899e-06, + "loss": 0.5385, + "step": 3156 + }, + { + "epoch": 1.4926713947990544, + "grad_norm": 2.583631992340088, + "learning_rate": 4.3072334860683655e-06, + "loss": 0.4927, + "step": 3157 + }, + { + "epoch": 1.493144208037825, + "grad_norm": 2.5889222621917725, + "learning_rate": 4.306802390583327e-06, + "loss": 0.47, + "step": 3158 + }, + { + "epoch": 1.4936170212765958, + "grad_norm": 2.9362716674804688, + "learning_rate": 4.3063711825966244e-06, + "loss": 0.4902, + "step": 3159 + }, + { + "epoch": 1.4940898345153664, + "grad_norm": 2.5385425090789795, + "learning_rate": 
4.305939862135111e-06, + "loss": 0.5396, + "step": 3160 + }, + { + "epoch": 1.4945626477541372, + "grad_norm": 2.776326894760132, + "learning_rate": 4.305508429225641e-06, + "loss": 0.5169, + "step": 3161 + }, + { + "epoch": 1.4950354609929077, + "grad_norm": 2.575063467025757, + "learning_rate": 4.305076883895076e-06, + "loss": 0.4938, + "step": 3162 + }, + { + "epoch": 1.4955082742316785, + "grad_norm": 2.7552313804626465, + "learning_rate": 4.304645226170291e-06, + "loss": 0.6211, + "step": 3163 + }, + { + "epoch": 1.4959810874704491, + "grad_norm": 2.57149338722229, + "learning_rate": 4.30421345607816e-06, + "loss": 0.5241, + "step": 3164 + }, + { + "epoch": 1.49645390070922, + "grad_norm": 2.8142426013946533, + "learning_rate": 4.303781573645568e-06, + "loss": 0.5699, + "step": 3165 + }, + { + "epoch": 1.4969267139479905, + "grad_norm": 2.6344845294952393, + "learning_rate": 4.303349578899407e-06, + "loss": 0.5049, + "step": 3166 + }, + { + "epoch": 1.4973995271867613, + "grad_norm": 2.554410934448242, + "learning_rate": 4.302917471866575e-06, + "loss": 0.4404, + "step": 3167 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 2.896240711212158, + "learning_rate": 4.302485252573978e-06, + "loss": 0.602, + "step": 3168 + }, + { + "epoch": 1.4983451536643027, + "grad_norm": 2.4044477939605713, + "learning_rate": 4.302052921048527e-06, + "loss": 0.4857, + "step": 3169 + }, + { + "epoch": 1.4988179669030732, + "grad_norm": 2.7447879314422607, + "learning_rate": 4.301620477317144e-06, + "loss": 0.5438, + "step": 3170 + }, + { + "epoch": 1.499290780141844, + "grad_norm": 2.851820945739746, + "learning_rate": 4.301187921406752e-06, + "loss": 0.5245, + "step": 3171 + }, + { + "epoch": 1.4997635933806146, + "grad_norm": 3.247114419937134, + "learning_rate": 4.300755253344287e-06, + "loss": 0.504, + "step": 3172 + }, + { + "epoch": 1.5002364066193854, + "grad_norm": 3.117490291595459, + "learning_rate": 4.300322473156688e-06, + "loss": 0.4627, + "step": 3173 + }, + { + "epoch": 1.500709219858156, + "grad_norm": 2.558319330215454, + "learning_rate": 4.299889580870904e-06, + "loss": 0.5721, + "step": 3174 + }, + { + "epoch": 1.5011820330969265, + "grad_norm": 2.8983113765716553, + "learning_rate": 4.2994565765138865e-06, + "loss": 0.5257, + "step": 3175 + }, + { + "epoch": 1.5016548463356973, + "grad_norm": 2.744056463241577, + "learning_rate": 4.299023460112599e-06, + "loss": 0.4892, + "step": 3176 + }, + { + "epoch": 1.5021276595744681, + "grad_norm": 2.5506751537323, + "learning_rate": 4.29859023169401e-06, + "loss": 0.4933, + "step": 3177 + }, + { + "epoch": 1.5026004728132387, + "grad_norm": 2.842615842819214, + "learning_rate": 4.298156891285092e-06, + "loss": 0.6124, + "step": 3178 + }, + { + "epoch": 1.5030732860520093, + "grad_norm": 2.5355329513549805, + "learning_rate": 4.2977234389128305e-06, + "loss": 0.641, + "step": 3179 + }, + { + "epoch": 1.50354609929078, + "grad_norm": 2.674781084060669, + "learning_rate": 4.297289874604213e-06, + "loss": 0.475, + "step": 3180 + }, + { + "epoch": 1.5040189125295509, + "grad_norm": 2.6845548152923584, + "learning_rate": 4.296856198386235e-06, + "loss": 0.5328, + "step": 3181 + }, + { + "epoch": 1.5044917257683215, + "grad_norm": 2.9686241149902344, + "learning_rate": 4.296422410285902e-06, + "loss": 0.6216, + "step": 3182 + }, + { + "epoch": 1.504964539007092, + "grad_norm": 2.5095980167388916, + "learning_rate": 4.295988510330222e-06, + "loss": 0.4993, + "step": 3183 + }, + { + "epoch": 1.5054373522458628, + "grad_norm": 2.4906392097473145, 
+ "learning_rate": 4.2955544985462125e-06, + "loss": 0.4795, + "step": 3184 + }, + { + "epoch": 1.5059101654846336, + "grad_norm": 2.5593366622924805, + "learning_rate": 4.295120374960897e-06, + "loss": 0.5527, + "step": 3185 + }, + { + "epoch": 1.5063829787234042, + "grad_norm": 2.691495180130005, + "learning_rate": 4.294686139601308e-06, + "loss": 0.5646, + "step": 3186 + }, + { + "epoch": 1.5068557919621748, + "grad_norm": 2.74320387840271, + "learning_rate": 4.294251792494483e-06, + "loss": 0.6149, + "step": 3187 + }, + { + "epoch": 1.5073286052009456, + "grad_norm": 2.8827052116394043, + "learning_rate": 4.293817333667465e-06, + "loss": 0.5414, + "step": 3188 + }, + { + "epoch": 1.5078014184397164, + "grad_norm": 2.5652425289154053, + "learning_rate": 4.293382763147308e-06, + "loss": 0.5006, + "step": 3189 + }, + { + "epoch": 1.508274231678487, + "grad_norm": 2.729295253753662, + "learning_rate": 4.29294808096107e-06, + "loss": 0.522, + "step": 3190 + }, + { + "epoch": 1.5087470449172575, + "grad_norm": 2.348118305206299, + "learning_rate": 4.292513287135817e-06, + "loss": 0.4125, + "step": 3191 + }, + { + "epoch": 1.5092198581560283, + "grad_norm": 2.809551954269409, + "learning_rate": 4.292078381698621e-06, + "loss": 0.5577, + "step": 3192 + }, + { + "epoch": 1.5096926713947991, + "grad_norm": 2.6925361156463623, + "learning_rate": 4.291643364676563e-06, + "loss": 0.62, + "step": 3193 + }, + { + "epoch": 1.5101654846335697, + "grad_norm": 2.4200620651245117, + "learning_rate": 4.291208236096729e-06, + "loss": 0.5464, + "step": 3194 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 2.5659191608428955, + "learning_rate": 4.290772995986211e-06, + "loss": 0.5402, + "step": 3195 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 2.3877315521240234, + "learning_rate": 4.290337644372113e-06, + "loss": 0.463, + "step": 3196 + }, + { + "epoch": 1.5115839243498819, + "grad_norm": 2.7063233852386475, + "learning_rate": 4.289902181281538e-06, + "loss": 0.5253, + "step": 3197 + }, + { + "epoch": 1.5120567375886524, + "grad_norm": 2.56788969039917, + "learning_rate": 4.289466606741603e-06, + "loss": 0.5012, + "step": 3198 + }, + { + "epoch": 1.512529550827423, + "grad_norm": 2.637164831161499, + "learning_rate": 4.28903092077943e-06, + "loss": 0.5236, + "step": 3199 + }, + { + "epoch": 1.5130023640661938, + "grad_norm": 2.767526865005493, + "learning_rate": 4.288595123422146e-06, + "loss": 0.5832, + "step": 3200 + }, + { + "epoch": 1.5134751773049646, + "grad_norm": 2.33365535736084, + "learning_rate": 4.2881592146968866e-06, + "loss": 0.4548, + "step": 3201 + }, + { + "epoch": 1.5139479905437352, + "grad_norm": 2.544189453125, + "learning_rate": 4.287723194630793e-06, + "loss": 0.5115, + "step": 3202 + }, + { + "epoch": 1.5144208037825058, + "grad_norm": 2.588793992996216, + "learning_rate": 4.2872870632510155e-06, + "loss": 0.4766, + "step": 3203 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 2.5382184982299805, + "learning_rate": 4.286850820584709e-06, + "loss": 0.5401, + "step": 3204 + }, + { + "epoch": 1.5153664302600474, + "grad_norm": 2.597930669784546, + "learning_rate": 4.286414466659038e-06, + "loss": 0.5346, + "step": 3205 + }, + { + "epoch": 1.515839243498818, + "grad_norm": 2.8522393703460693, + "learning_rate": 4.28597800150117e-06, + "loss": 0.486, + "step": 3206 + }, + { + "epoch": 1.5163120567375885, + "grad_norm": 2.4801454544067383, + "learning_rate": 4.285541425138285e-06, + "loss": 0.5162, + "step": 3207 + }, + { + "epoch": 1.5167848699763593, + "grad_norm": 
2.353665351867676, + "learning_rate": 4.285104737597563e-06, + "loss": 0.5066, + "step": 3208 + }, + { + "epoch": 1.51725768321513, + "grad_norm": 2.767976760864258, + "learning_rate": 4.2846679389061975e-06, + "loss": 0.5331, + "step": 3209 + }, + { + "epoch": 1.5177304964539007, + "grad_norm": 2.9307682514190674, + "learning_rate": 4.284231029091385e-06, + "loss": 0.5291, + "step": 3210 + }, + { + "epoch": 1.5182033096926713, + "grad_norm": 2.39719820022583, + "learning_rate": 4.283794008180329e-06, + "loss": 0.4759, + "step": 3211 + }, + { + "epoch": 1.518676122931442, + "grad_norm": 2.452244758605957, + "learning_rate": 4.283356876200242e-06, + "loss": 0.4283, + "step": 3212 + }, + { + "epoch": 1.5191489361702128, + "grad_norm": 2.4911608695983887, + "learning_rate": 4.282919633178343e-06, + "loss": 0.4812, + "step": 3213 + }, + { + "epoch": 1.5196217494089834, + "grad_norm": 2.5813944339752197, + "learning_rate": 4.282482279141856e-06, + "loss": 0.4911, + "step": 3214 + }, + { + "epoch": 1.520094562647754, + "grad_norm": 2.503542184829712, + "learning_rate": 4.282044814118013e-06, + "loss": 0.4969, + "step": 3215 + }, + { + "epoch": 1.5205673758865248, + "grad_norm": 2.5090713500976562, + "learning_rate": 4.281607238134053e-06, + "loss": 0.5293, + "step": 3216 + }, + { + "epoch": 1.5210401891252956, + "grad_norm": 2.425994396209717, + "learning_rate": 4.281169551217223e-06, + "loss": 0.5365, + "step": 3217 + }, + { + "epoch": 1.5215130023640662, + "grad_norm": 2.637655258178711, + "learning_rate": 4.2807317533947765e-06, + "loss": 0.5589, + "step": 3218 + }, + { + "epoch": 1.5219858156028367, + "grad_norm": 2.9335296154022217, + "learning_rate": 4.28029384469397e-06, + "loss": 0.6071, + "step": 3219 + }, + { + "epoch": 1.5224586288416075, + "grad_norm": 2.898683547973633, + "learning_rate": 4.279855825142073e-06, + "loss": 0.5392, + "step": 3220 + }, + { + "epoch": 1.5229314420803783, + "grad_norm": 2.613914966583252, + "learning_rate": 4.279417694766359e-06, + "loss": 0.4968, + "step": 3221 + }, + { + "epoch": 1.523404255319149, + "grad_norm": 2.500682830810547, + "learning_rate": 4.278979453594106e-06, + "loss": 0.471, + "step": 3222 + }, + { + "epoch": 1.5238770685579195, + "grad_norm": 2.5269598960876465, + "learning_rate": 4.278541101652605e-06, + "loss": 0.471, + "step": 3223 + }, + { + "epoch": 1.5243498817966903, + "grad_norm": 2.8153114318847656, + "learning_rate": 4.2781026389691465e-06, + "loss": 0.5742, + "step": 3224 + }, + { + "epoch": 1.524822695035461, + "grad_norm": 2.5648019313812256, + "learning_rate": 4.277664065571034e-06, + "loss": 0.5315, + "step": 3225 + }, + { + "epoch": 1.5252955082742317, + "grad_norm": 2.778355836868286, + "learning_rate": 4.277225381485575e-06, + "loss": 0.5543, + "step": 3226 + }, + { + "epoch": 1.5257683215130022, + "grad_norm": 2.6736745834350586, + "learning_rate": 4.2767865867400846e-06, + "loss": 0.4947, + "step": 3227 + }, + { + "epoch": 1.526241134751773, + "grad_norm": 2.9560294151306152, + "learning_rate": 4.276347681361884e-06, + "loss": 0.5835, + "step": 3228 + }, + { + "epoch": 1.5267139479905438, + "grad_norm": 2.5580296516418457, + "learning_rate": 4.275908665378302e-06, + "loss": 0.4751, + "step": 3229 + }, + { + "epoch": 1.5271867612293144, + "grad_norm": 3.0705175399780273, + "learning_rate": 4.2754695388166755e-06, + "loss": 0.5327, + "step": 3230 + }, + { + "epoch": 1.527659574468085, + "grad_norm": 2.664652109146118, + "learning_rate": 4.275030301704346e-06, + "loss": 0.4934, + "step": 3231 + }, + { + "epoch": 
1.5281323877068558, + "grad_norm": 2.308499813079834, + "learning_rate": 4.274590954068663e-06, + "loss": 0.4412, + "step": 3232 + }, + { + "epoch": 1.5286052009456266, + "grad_norm": 2.871189594268799, + "learning_rate": 4.2741514959369815e-06, + "loss": 0.5001, + "step": 3233 + }, + { + "epoch": 1.5290780141843971, + "grad_norm": 2.5274453163146973, + "learning_rate": 4.273711927336666e-06, + "loss": 0.4938, + "step": 3234 + }, + { + "epoch": 1.5295508274231677, + "grad_norm": 2.8848133087158203, + "learning_rate": 4.273272248295087e-06, + "loss": 0.5397, + "step": 3235 + }, + { + "epoch": 1.5300236406619385, + "grad_norm": 2.3927090167999268, + "learning_rate": 4.27283245883962e-06, + "loss": 0.5497, + "step": 3236 + }, + { + "epoch": 1.5304964539007093, + "grad_norm": 2.5413873195648193, + "learning_rate": 4.27239255899765e-06, + "loss": 0.5108, + "step": 3237 + }, + { + "epoch": 1.53096926713948, + "grad_norm": 2.7692389488220215, + "learning_rate": 4.271952548796567e-06, + "loss": 0.5768, + "step": 3238 + }, + { + "epoch": 1.5314420803782505, + "grad_norm": 2.4621126651763916, + "learning_rate": 4.271512428263768e-06, + "loss": 0.4698, + "step": 3239 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 2.6423375606536865, + "learning_rate": 4.271072197426659e-06, + "loss": 0.4929, + "step": 3240 + }, + { + "epoch": 1.532387706855792, + "grad_norm": 2.7097692489624023, + "learning_rate": 4.270631856312649e-06, + "loss": 0.4836, + "step": 3241 + }, + { + "epoch": 1.5328605200945626, + "grad_norm": 2.545706272125244, + "learning_rate": 4.270191404949158e-06, + "loss": 0.4636, + "step": 3242 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 3.138781785964966, + "learning_rate": 4.26975084336361e-06, + "loss": 0.5988, + "step": 3243 + }, + { + "epoch": 1.533806146572104, + "grad_norm": 2.492715835571289, + "learning_rate": 4.269310171583438e-06, + "loss": 0.5095, + "step": 3244 + }, + { + "epoch": 1.5342789598108748, + "grad_norm": 2.5705838203430176, + "learning_rate": 4.268869389636077e-06, + "loss": 0.4818, + "step": 3245 + }, + { + "epoch": 1.5347517730496454, + "grad_norm": 2.7633554935455322, + "learning_rate": 4.268428497548979e-06, + "loss": 0.547, + "step": 3246 + }, + { + "epoch": 1.535224586288416, + "grad_norm": 2.654528856277466, + "learning_rate": 4.2679874953495905e-06, + "loss": 0.5261, + "step": 3247 + }, + { + "epoch": 1.5356973995271868, + "grad_norm": 2.5039751529693604, + "learning_rate": 4.2675463830653744e-06, + "loss": 0.4941, + "step": 3248 + }, + { + "epoch": 1.5361702127659576, + "grad_norm": 2.897268295288086, + "learning_rate": 4.267105160723794e-06, + "loss": 0.5404, + "step": 3249 + }, + { + "epoch": 1.5366430260047281, + "grad_norm": 2.500732421875, + "learning_rate": 4.266663828352324e-06, + "loss": 0.5375, + "step": 3250 + }, + { + "epoch": 1.5371158392434987, + "grad_norm": 2.6310064792633057, + "learning_rate": 4.266222385978444e-06, + "loss": 0.5217, + "step": 3251 + }, + { + "epoch": 1.5375886524822695, + "grad_norm": 2.7440476417541504, + "learning_rate": 4.265780833629642e-06, + "loss": 0.5419, + "step": 3252 + }, + { + "epoch": 1.5380614657210403, + "grad_norm": 2.7037577629089355, + "learning_rate": 4.2653391713334095e-06, + "loss": 0.5634, + "step": 3253 + }, + { + "epoch": 1.5385342789598109, + "grad_norm": 2.548525810241699, + "learning_rate": 4.264897399117248e-06, + "loss": 0.535, + "step": 3254 + }, + { + "epoch": 1.5390070921985815, + "grad_norm": 2.6127355098724365, + "learning_rate": 4.264455517008663e-06, + "loss": 0.4619, + 
"step": 3255 + }, + { + "epoch": 1.5394799054373522, + "grad_norm": 2.5597004890441895, + "learning_rate": 4.264013525035171e-06, + "loss": 0.4477, + "step": 3256 + }, + { + "epoch": 1.539952718676123, + "grad_norm": 2.642432689666748, + "learning_rate": 4.263571423224292e-06, + "loss": 0.4749, + "step": 3257 + }, + { + "epoch": 1.5404255319148936, + "grad_norm": 2.5121877193450928, + "learning_rate": 4.2631292116035526e-06, + "loss": 0.4693, + "step": 3258 + }, + { + "epoch": 1.5408983451536642, + "grad_norm": 2.390292167663574, + "learning_rate": 4.262686890200489e-06, + "loss": 0.4872, + "step": 3259 + }, + { + "epoch": 1.541371158392435, + "grad_norm": 2.5898337364196777, + "learning_rate": 4.2622444590426405e-06, + "loss": 0.5193, + "step": 3260 + }, + { + "epoch": 1.5418439716312058, + "grad_norm": 2.508821487426758, + "learning_rate": 4.261801918157558e-06, + "loss": 0.511, + "step": 3261 + }, + { + "epoch": 1.5423167848699764, + "grad_norm": 2.6992101669311523, + "learning_rate": 4.261359267572795e-06, + "loss": 0.5069, + "step": 3262 + }, + { + "epoch": 1.542789598108747, + "grad_norm": 2.6011030673980713, + "learning_rate": 4.2609165073159145e-06, + "loss": 0.5887, + "step": 3263 + }, + { + "epoch": 1.5432624113475177, + "grad_norm": 2.887053966522217, + "learning_rate": 4.260473637414483e-06, + "loss": 0.5556, + "step": 3264 + }, + { + "epoch": 1.5437352245862885, + "grad_norm": 2.6433887481689453, + "learning_rate": 4.260030657896079e-06, + "loss": 0.4728, + "step": 3265 + }, + { + "epoch": 1.544208037825059, + "grad_norm": 2.6134607791900635, + "learning_rate": 4.259587568788282e-06, + "loss": 0.483, + "step": 3266 + }, + { + "epoch": 1.5446808510638297, + "grad_norm": 2.5308640003204346, + "learning_rate": 4.259144370118684e-06, + "loss": 0.5115, + "step": 3267 + }, + { + "epoch": 1.5451536643026005, + "grad_norm": 2.8256733417510986, + "learning_rate": 4.258701061914879e-06, + "loss": 0.5414, + "step": 3268 + }, + { + "epoch": 1.5456264775413713, + "grad_norm": 2.8648319244384766, + "learning_rate": 4.258257644204471e-06, + "loss": 0.5695, + "step": 3269 + }, + { + "epoch": 1.5460992907801419, + "grad_norm": 2.8568081855773926, + "learning_rate": 4.257814117015069e-06, + "loss": 0.5264, + "step": 3270 + }, + { + "epoch": 1.5465721040189124, + "grad_norm": 2.6065011024475098, + "learning_rate": 4.257370480374289e-06, + "loss": 0.5646, + "step": 3271 + }, + { + "epoch": 1.5470449172576832, + "grad_norm": 2.7840216159820557, + "learning_rate": 4.256926734309756e-06, + "loss": 0.5191, + "step": 3272 + }, + { + "epoch": 1.547517730496454, + "grad_norm": 2.85906982421875, + "learning_rate": 4.256482878849099e-06, + "loss": 0.5911, + "step": 3273 + }, + { + "epoch": 1.5479905437352246, + "grad_norm": 2.916029930114746, + "learning_rate": 4.256038914019954e-06, + "loss": 0.5589, + "step": 3274 + }, + { + "epoch": 1.5484633569739952, + "grad_norm": 2.6748716831207275, + "learning_rate": 4.255594839849967e-06, + "loss": 0.5323, + "step": 3275 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 2.717212200164795, + "learning_rate": 4.255150656366787e-06, + "loss": 0.453, + "step": 3276 + }, + { + "epoch": 1.5494089834515368, + "grad_norm": 2.4974849224090576, + "learning_rate": 4.254706363598072e-06, + "loss": 0.4516, + "step": 3277 + }, + { + "epoch": 1.5498817966903073, + "grad_norm": 2.648151397705078, + "learning_rate": 4.254261961571485e-06, + "loss": 0.5452, + "step": 3278 + }, + { + "epoch": 1.550354609929078, + "grad_norm": 2.932905435562134, + "learning_rate": 
4.253817450314699e-06, + "loss": 0.4813, + "step": 3279 + }, + { + "epoch": 1.5508274231678487, + "grad_norm": 2.862912178039551, + "learning_rate": 4.25337282985539e-06, + "loss": 0.5689, + "step": 3280 + }, + { + "epoch": 1.5513002364066195, + "grad_norm": 2.532156467437744, + "learning_rate": 4.2529281002212436e-06, + "loss": 0.485, + "step": 3281 + }, + { + "epoch": 1.55177304964539, + "grad_norm": 2.583299160003662, + "learning_rate": 4.25248326143995e-06, + "loss": 0.4661, + "step": 3282 + }, + { + "epoch": 1.5522458628841607, + "grad_norm": 2.5790653228759766, + "learning_rate": 4.252038313539209e-06, + "loss": 0.5455, + "step": 3283 + }, + { + "epoch": 1.5527186761229315, + "grad_norm": 2.872864007949829, + "learning_rate": 4.251593256546724e-06, + "loss": 0.5317, + "step": 3284 + }, + { + "epoch": 1.5531914893617023, + "grad_norm": 3.0382463932037354, + "learning_rate": 4.251148090490208e-06, + "loss": 0.5131, + "step": 3285 + }, + { + "epoch": 1.5536643026004728, + "grad_norm": 2.574399709701538, + "learning_rate": 4.250702815397379e-06, + "loss": 0.5399, + "step": 3286 + }, + { + "epoch": 1.5541371158392434, + "grad_norm": 2.9784770011901855, + "learning_rate": 4.250257431295962e-06, + "loss": 0.5209, + "step": 3287 + }, + { + "epoch": 1.5546099290780142, + "grad_norm": 2.6482062339782715, + "learning_rate": 4.249811938213689e-06, + "loss": 0.5416, + "step": 3288 + }, + { + "epoch": 1.555082742316785, + "grad_norm": 2.82142972946167, + "learning_rate": 4.2493663361783e-06, + "loss": 0.594, + "step": 3289 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 2.815595865249634, + "learning_rate": 4.24892062521754e-06, + "loss": 0.5381, + "step": 3290 + }, + { + "epoch": 1.5560283687943262, + "grad_norm": 2.689764976501465, + "learning_rate": 4.248474805359161e-06, + "loss": 0.5141, + "step": 3291 + }, + { + "epoch": 1.556501182033097, + "grad_norm": 2.7718515396118164, + "learning_rate": 4.248028876630922e-06, + "loss": 0.5324, + "step": 3292 + }, + { + "epoch": 1.5569739952718678, + "grad_norm": 3.0196774005889893, + "learning_rate": 4.247582839060591e-06, + "loss": 0.4971, + "step": 3293 + }, + { + "epoch": 1.5574468085106383, + "grad_norm": 2.608475923538208, + "learning_rate": 4.247136692675939e-06, + "loss": 0.5795, + "step": 3294 + }, + { + "epoch": 1.557919621749409, + "grad_norm": 2.4912326335906982, + "learning_rate": 4.246690437504746e-06, + "loss": 0.5348, + "step": 3295 + }, + { + "epoch": 1.5583924349881797, + "grad_norm": 2.519303560256958, + "learning_rate": 4.246244073574799e-06, + "loss": 0.4953, + "step": 3296 + }, + { + "epoch": 1.5588652482269505, + "grad_norm": 2.5667171478271484, + "learning_rate": 4.24579760091389e-06, + "loss": 0.5353, + "step": 3297 + }, + { + "epoch": 1.559338061465721, + "grad_norm": 2.8835761547088623, + "learning_rate": 4.24535101954982e-06, + "loss": 0.578, + "step": 3298 + }, + { + "epoch": 1.5598108747044916, + "grad_norm": 3.0506930351257324, + "learning_rate": 4.244904329510395e-06, + "loss": 0.6418, + "step": 3299 + }, + { + "epoch": 1.5602836879432624, + "grad_norm": 2.579446315765381, + "learning_rate": 4.244457530823428e-06, + "loss": 0.5027, + "step": 3300 + }, + { + "epoch": 1.5607565011820332, + "grad_norm": 2.72012996673584, + "learning_rate": 4.24401062351674e-06, + "loss": 0.5438, + "step": 3301 + }, + { + "epoch": 1.5612293144208038, + "grad_norm": 2.527007818222046, + "learning_rate": 4.243563607618158e-06, + "loss": 0.5303, + "step": 3302 + }, + { + "epoch": 1.5617021276595744, + "grad_norm": 2.4415159225463867, + 
"learning_rate": 4.243116483155516e-06, + "loss": 0.4893, + "step": 3303 + }, + { + "epoch": 1.5621749408983452, + "grad_norm": 2.462256669998169, + "learning_rate": 4.242669250156653e-06, + "loss": 0.5671, + "step": 3304 + }, + { + "epoch": 1.562647754137116, + "grad_norm": 2.479865074157715, + "learning_rate": 4.242221908649418e-06, + "loss": 0.5038, + "step": 3305 + }, + { + "epoch": 1.5631205673758866, + "grad_norm": 2.74670672416687, + "learning_rate": 4.241774458661662e-06, + "loss": 0.5689, + "step": 3306 + }, + { + "epoch": 1.5635933806146571, + "grad_norm": 2.55938982963562, + "learning_rate": 4.24132690022125e-06, + "loss": 0.492, + "step": 3307 + }, + { + "epoch": 1.564066193853428, + "grad_norm": 2.634956121444702, + "learning_rate": 4.240879233356048e-06, + "loss": 0.503, + "step": 3308 + }, + { + "epoch": 1.5645390070921987, + "grad_norm": 2.381775140762329, + "learning_rate": 4.240431458093928e-06, + "loss": 0.4939, + "step": 3309 + }, + { + "epoch": 1.5650118203309693, + "grad_norm": 2.8176610469818115, + "learning_rate": 4.239983574462774e-06, + "loss": 0.5609, + "step": 3310 + }, + { + "epoch": 1.5654846335697399, + "grad_norm": 3.0268442630767822, + "learning_rate": 4.239535582490471e-06, + "loss": 0.5427, + "step": 3311 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 2.5881481170654297, + "learning_rate": 4.239087482204916e-06, + "loss": 0.5538, + "step": 3312 + }, + { + "epoch": 1.5664302600472815, + "grad_norm": 2.5317704677581787, + "learning_rate": 4.238639273634008e-06, + "loss": 0.4915, + "step": 3313 + }, + { + "epoch": 1.566903073286052, + "grad_norm": 2.9608731269836426, + "learning_rate": 4.238190956805658e-06, + "loss": 0.564, + "step": 3314 + }, + { + "epoch": 1.5673758865248226, + "grad_norm": 3.022686243057251, + "learning_rate": 4.237742531747777e-06, + "loss": 0.5503, + "step": 3315 + }, + { + "epoch": 1.5678486997635934, + "grad_norm": 2.763622283935547, + "learning_rate": 4.23729399848829e-06, + "loss": 0.5241, + "step": 3316 + }, + { + "epoch": 1.5683215130023642, + "grad_norm": 2.6112794876098633, + "learning_rate": 4.236845357055122e-06, + "loss": 0.4919, + "step": 3317 + }, + { + "epoch": 1.5687943262411348, + "grad_norm": 2.649829149246216, + "learning_rate": 4.23639660747621e-06, + "loss": 0.5472, + "step": 3318 + }, + { + "epoch": 1.5692671394799054, + "grad_norm": 2.8888115882873535, + "learning_rate": 4.2359477497794955e-06, + "loss": 0.5077, + "step": 3319 + }, + { + "epoch": 1.5697399527186762, + "grad_norm": 2.5666911602020264, + "learning_rate": 4.235498783992927e-06, + "loss": 0.5365, + "step": 3320 + }, + { + "epoch": 1.570212765957447, + "grad_norm": 2.448758363723755, + "learning_rate": 4.2350497101444575e-06, + "loss": 0.5043, + "step": 3321 + }, + { + "epoch": 1.5706855791962175, + "grad_norm": 2.595207691192627, + "learning_rate": 4.234600528262052e-06, + "loss": 0.5303, + "step": 3322 + }, + { + "epoch": 1.5711583924349881, + "grad_norm": 2.7814228534698486, + "learning_rate": 4.234151238373676e-06, + "loss": 0.4521, + "step": 3323 + }, + { + "epoch": 1.571631205673759, + "grad_norm": 2.781538724899292, + "learning_rate": 4.233701840507308e-06, + "loss": 0.5193, + "step": 3324 + }, + { + "epoch": 1.5721040189125297, + "grad_norm": 2.771907329559326, + "learning_rate": 4.233252334690928e-06, + "loss": 0.497, + "step": 3325 + }, + { + "epoch": 1.5725768321513003, + "grad_norm": 2.5557498931884766, + "learning_rate": 4.232802720952525e-06, + "loss": 0.4913, + "step": 3326 + }, + { + "epoch": 1.5730496453900709, + "grad_norm": 
2.478267192840576, + "learning_rate": 4.232352999320094e-06, + "loss": 0.4967, + "step": 3327 + }, + { + "epoch": 1.5735224586288417, + "grad_norm": 3.1548502445220947, + "learning_rate": 4.231903169821639e-06, + "loss": 0.5009, + "step": 3328 + }, + { + "epoch": 1.5739952718676125, + "grad_norm": 2.634824275970459, + "learning_rate": 4.231453232485168e-06, + "loss": 0.5223, + "step": 3329 + }, + { + "epoch": 1.574468085106383, + "grad_norm": 2.579102039337158, + "learning_rate": 4.231003187338695e-06, + "loss": 0.5513, + "step": 3330 + }, + { + "epoch": 1.5749408983451536, + "grad_norm": 2.8477070331573486, + "learning_rate": 4.230553034410245e-06, + "loss": 0.561, + "step": 3331 + }, + { + "epoch": 1.5754137115839244, + "grad_norm": 2.6714725494384766, + "learning_rate": 4.2301027737278446e-06, + "loss": 0.4687, + "step": 3332 + }, + { + "epoch": 1.5758865248226952, + "grad_norm": 2.6562764644622803, + "learning_rate": 4.229652405319532e-06, + "loss": 0.5925, + "step": 3333 + }, + { + "epoch": 1.5763593380614658, + "grad_norm": 2.750946283340454, + "learning_rate": 4.229201929213348e-06, + "loss": 0.4748, + "step": 3334 + }, + { + "epoch": 1.5768321513002364, + "grad_norm": 2.760470151901245, + "learning_rate": 4.228751345437342e-06, + "loss": 0.5989, + "step": 3335 + }, + { + "epoch": 1.5773049645390071, + "grad_norm": 3.1451845169067383, + "learning_rate": 4.2283006540195706e-06, + "loss": 0.562, + "step": 3336 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 2.563011407852173, + "learning_rate": 4.227849854988095e-06, + "loss": 0.5473, + "step": 3337 + }, + { + "epoch": 1.5782505910165483, + "grad_norm": 2.310469388961792, + "learning_rate": 4.2273989483709856e-06, + "loss": 0.5033, + "step": 3338 + }, + { + "epoch": 1.578723404255319, + "grad_norm": 2.677978754043579, + "learning_rate": 4.226947934196318e-06, + "loss": 0.5291, + "step": 3339 + }, + { + "epoch": 1.57919621749409, + "grad_norm": 3.0423545837402344, + "learning_rate": 4.226496812492176e-06, + "loss": 0.5201, + "step": 3340 + }, + { + "epoch": 1.5796690307328605, + "grad_norm": 2.357513904571533, + "learning_rate": 4.226045583286647e-06, + "loss": 0.4421, + "step": 3341 + }, + { + "epoch": 1.580141843971631, + "grad_norm": 2.719860315322876, + "learning_rate": 4.225594246607828e-06, + "loss": 0.4855, + "step": 3342 + }, + { + "epoch": 1.5806146572104018, + "grad_norm": 3.2645058631896973, + "learning_rate": 4.2251428024838215e-06, + "loss": 0.6654, + "step": 3343 + }, + { + "epoch": 1.5810874704491726, + "grad_norm": 2.2997004985809326, + "learning_rate": 4.224691250942737e-06, + "loss": 0.4565, + "step": 3344 + }, + { + "epoch": 1.5815602836879432, + "grad_norm": 2.8103034496307373, + "learning_rate": 4.2242395920126926e-06, + "loss": 0.5543, + "step": 3345 + }, + { + "epoch": 1.5820330969267138, + "grad_norm": 2.720254898071289, + "learning_rate": 4.223787825721808e-06, + "loss": 0.5028, + "step": 3346 + }, + { + "epoch": 1.5825059101654846, + "grad_norm": 2.735544204711914, + "learning_rate": 4.223335952098214e-06, + "loss": 0.5169, + "step": 3347 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 2.784254550933838, + "learning_rate": 4.222883971170047e-06, + "loss": 0.4989, + "step": 3348 + }, + { + "epoch": 1.583451536643026, + "grad_norm": 2.7192094326019287, + "learning_rate": 4.22243188296545e-06, + "loss": 0.502, + "step": 3349 + }, + { + "epoch": 1.5839243498817965, + "grad_norm": 2.716501474380493, + "learning_rate": 4.221979687512573e-06, + "loss": 0.5687, + "step": 3350 + }, + { + "epoch": 
1.5843971631205673, + "grad_norm": 2.8420114517211914, + "learning_rate": 4.22152738483957e-06, + "loss": 0.5903, + "step": 3351 + }, + { + "epoch": 1.5848699763593381, + "grad_norm": 2.734872579574585, + "learning_rate": 4.2210749749746065e-06, + "loss": 0.5397, + "step": 3352 + }, + { + "epoch": 1.5853427895981087, + "grad_norm": 2.4343836307525635, + "learning_rate": 4.220622457945851e-06, + "loss": 0.436, + "step": 3353 + }, + { + "epoch": 1.5858156028368793, + "grad_norm": 2.728177547454834, + "learning_rate": 4.2201698337814785e-06, + "loss": 0.5703, + "step": 3354 + }, + { + "epoch": 1.58628841607565, + "grad_norm": 2.502098560333252, + "learning_rate": 4.219717102509674e-06, + "loss": 0.5275, + "step": 3355 + }, + { + "epoch": 1.5867612293144209, + "grad_norm": 2.6595494747161865, + "learning_rate": 4.219264264158627e-06, + "loss": 0.4659, + "step": 3356 + }, + { + "epoch": 1.5872340425531914, + "grad_norm": 2.5307185649871826, + "learning_rate": 4.218811318756532e-06, + "loss": 0.5048, + "step": 3357 + }, + { + "epoch": 1.587706855791962, + "grad_norm": 2.9300129413604736, + "learning_rate": 4.218358266331593e-06, + "loss": 0.5137, + "step": 3358 + }, + { + "epoch": 1.5881796690307328, + "grad_norm": 2.686586618423462, + "learning_rate": 4.21790510691202e-06, + "loss": 0.4529, + "step": 3359 + }, + { + "epoch": 1.5886524822695036, + "grad_norm": 2.9981517791748047, + "learning_rate": 4.217451840526029e-06, + "loss": 0.6054, + "step": 3360 + }, + { + "epoch": 1.5891252955082742, + "grad_norm": 2.6943674087524414, + "learning_rate": 4.216998467201841e-06, + "loss": 0.5153, + "step": 3361 + }, + { + "epoch": 1.5895981087470448, + "grad_norm": 2.707084894180298, + "learning_rate": 4.216544986967689e-06, + "loss": 0.5235, + "step": 3362 + }, + { + "epoch": 1.5900709219858156, + "grad_norm": 2.6553728580474854, + "learning_rate": 4.216091399851808e-06, + "loss": 0.5275, + "step": 3363 + }, + { + "epoch": 1.5905437352245864, + "grad_norm": 2.9136953353881836, + "learning_rate": 4.215637705882439e-06, + "loss": 0.5834, + "step": 3364 + }, + { + "epoch": 1.591016548463357, + "grad_norm": 2.7647159099578857, + "learning_rate": 4.2151839050878325e-06, + "loss": 0.5641, + "step": 3365 + }, + { + "epoch": 1.5914893617021275, + "grad_norm": 2.4556827545166016, + "learning_rate": 4.214729997496246e-06, + "loss": 0.5636, + "step": 3366 + }, + { + "epoch": 1.5919621749408983, + "grad_norm": 2.6111652851104736, + "learning_rate": 4.2142759831359414e-06, + "loss": 0.5097, + "step": 3367 + }, + { + "epoch": 1.592434988179669, + "grad_norm": 2.4886903762817383, + "learning_rate": 4.213821862035189e-06, + "loss": 0.531, + "step": 3368 + }, + { + "epoch": 1.5929078014184397, + "grad_norm": 2.5245840549468994, + "learning_rate": 4.213367634222263e-06, + "loss": 0.5085, + "step": 3369 + }, + { + "epoch": 1.5933806146572103, + "grad_norm": 2.970214605331421, + "learning_rate": 4.212913299725447e-06, + "loss": 0.5851, + "step": 3370 + }, + { + "epoch": 1.593853427895981, + "grad_norm": 2.5433361530303955, + "learning_rate": 4.212458858573032e-06, + "loss": 0.48, + "step": 3371 + }, + { + "epoch": 1.5943262411347519, + "grad_norm": 2.3550102710723877, + "learning_rate": 4.212004310793312e-06, + "loss": 0.4405, + "step": 3372 + }, + { + "epoch": 1.5947990543735224, + "grad_norm": 2.4824719429016113, + "learning_rate": 4.2115496564145896e-06, + "loss": 0.4634, + "step": 3373 + }, + { + "epoch": 1.595271867612293, + "grad_norm": 2.4751930236816406, + "learning_rate": 4.211094895465176e-06, + "loss": 0.5662, + 
"step": 3374 + }, + { + "epoch": 1.5957446808510638, + "grad_norm": 2.4193356037139893, + "learning_rate": 4.210640027973386e-06, + "loss": 0.4441, + "step": 3375 + }, + { + "epoch": 1.5962174940898346, + "grad_norm": 2.4477498531341553, + "learning_rate": 4.210185053967543e-06, + "loss": 0.5205, + "step": 3376 + }, + { + "epoch": 1.5966903073286052, + "grad_norm": 2.7954161167144775, + "learning_rate": 4.209729973475976e-06, + "loss": 0.4951, + "step": 3377 + }, + { + "epoch": 1.5971631205673757, + "grad_norm": 3.1907570362091064, + "learning_rate": 4.209274786527019e-06, + "loss": 0.6024, + "step": 3378 + }, + { + "epoch": 1.5976359338061465, + "grad_norm": 2.485245704650879, + "learning_rate": 4.2088194931490165e-06, + "loss": 0.5652, + "step": 3379 + }, + { + "epoch": 1.5981087470449173, + "grad_norm": 2.589310884475708, + "learning_rate": 4.208364093370317e-06, + "loss": 0.5085, + "step": 3380 + }, + { + "epoch": 1.598581560283688, + "grad_norm": 2.8941214084625244, + "learning_rate": 4.207908587219276e-06, + "loss": 0.53, + "step": 3381 + }, + { + "epoch": 1.5990543735224585, + "grad_norm": 2.480509042739868, + "learning_rate": 4.207452974724258e-06, + "loss": 0.4543, + "step": 3382 + }, + { + "epoch": 1.5995271867612293, + "grad_norm": 2.7884905338287354, + "learning_rate": 4.206997255913629e-06, + "loss": 0.5483, + "step": 3383 + }, + { + "epoch": 1.6, + "grad_norm": 2.7976696491241455, + "learning_rate": 4.206541430815766e-06, + "loss": 0.4734, + "step": 3384 + }, + { + "epoch": 1.6004728132387707, + "grad_norm": 2.5463132858276367, + "learning_rate": 4.206085499459051e-06, + "loss": 0.4931, + "step": 3385 + }, + { + "epoch": 1.6009456264775412, + "grad_norm": 2.8384251594543457, + "learning_rate": 4.205629461871871e-06, + "loss": 0.5066, + "step": 3386 + }, + { + "epoch": 1.601418439716312, + "grad_norm": 2.8578574657440186, + "learning_rate": 4.205173318082626e-06, + "loss": 0.458, + "step": 3387 + }, + { + "epoch": 1.6018912529550828, + "grad_norm": 2.7779932022094727, + "learning_rate": 4.204717068119715e-06, + "loss": 0.5293, + "step": 3388 + }, + { + "epoch": 1.6023640661938534, + "grad_norm": 2.9123778343200684, + "learning_rate": 4.204260712011546e-06, + "loss": 0.4866, + "step": 3389 + }, + { + "epoch": 1.602836879432624, + "grad_norm": 2.757922887802124, + "learning_rate": 4.203804249786537e-06, + "loss": 0.4925, + "step": 3390 + }, + { + "epoch": 1.6033096926713948, + "grad_norm": 3.287733316421509, + "learning_rate": 4.203347681473107e-06, + "loss": 0.6694, + "step": 3391 + }, + { + "epoch": 1.6037825059101656, + "grad_norm": 3.2117912769317627, + "learning_rate": 4.202891007099687e-06, + "loss": 0.5269, + "step": 3392 + }, + { + "epoch": 1.6042553191489362, + "grad_norm": 2.8489456176757812, + "learning_rate": 4.20243422669471e-06, + "loss": 0.5073, + "step": 3393 + }, + { + "epoch": 1.6047281323877067, + "grad_norm": 2.7660224437713623, + "learning_rate": 4.201977340286619e-06, + "loss": 0.5014, + "step": 3394 + }, + { + "epoch": 1.6052009456264775, + "grad_norm": 2.68182110786438, + "learning_rate": 4.201520347903862e-06, + "loss": 0.4542, + "step": 3395 + }, + { + "epoch": 1.6056737588652483, + "grad_norm": 2.7546045780181885, + "learning_rate": 4.2010632495748934e-06, + "loss": 0.516, + "step": 3396 + }, + { + "epoch": 1.606146572104019, + "grad_norm": 2.744668483734131, + "learning_rate": 4.200606045328176e-06, + "loss": 0.5243, + "step": 3397 + }, + { + "epoch": 1.6066193853427895, + "grad_norm": 2.935343027114868, + "learning_rate": 4.200148735192177e-06, + 
"loss": 0.5624, + "step": 3398 + }, + { + "epoch": 1.6070921985815603, + "grad_norm": 2.7392852306365967, + "learning_rate": 4.19969131919537e-06, + "loss": 0.5796, + "step": 3399 + }, + { + "epoch": 1.607565011820331, + "grad_norm": 2.864750385284424, + "learning_rate": 4.199233797366239e-06, + "loss": 0.549, + "step": 3400 + }, + { + "epoch": 1.6080378250591016, + "grad_norm": 2.684157371520996, + "learning_rate": 4.198776169733269e-06, + "loss": 0.5532, + "step": 3401 + }, + { + "epoch": 1.6085106382978722, + "grad_norm": 2.4717135429382324, + "learning_rate": 4.198318436324957e-06, + "loss": 0.5174, + "step": 3402 + }, + { + "epoch": 1.608983451536643, + "grad_norm": 2.640242338180542, + "learning_rate": 4.197860597169802e-06, + "loss": 0.5117, + "step": 3403 + }, + { + "epoch": 1.6094562647754138, + "grad_norm": 2.4957473278045654, + "learning_rate": 4.197402652296313e-06, + "loss": 0.474, + "step": 3404 + }, + { + "epoch": 1.6099290780141844, + "grad_norm": 2.416138172149658, + "learning_rate": 4.196944601733004e-06, + "loss": 0.4858, + "step": 3405 + }, + { + "epoch": 1.610401891252955, + "grad_norm": 2.4498109817504883, + "learning_rate": 4.196486445508395e-06, + "loss": 0.5048, + "step": 3406 + }, + { + "epoch": 1.6108747044917258, + "grad_norm": 2.415895938873291, + "learning_rate": 4.196028183651014e-06, + "loss": 0.4745, + "step": 3407 + }, + { + "epoch": 1.6113475177304966, + "grad_norm": 2.843665838241577, + "learning_rate": 4.195569816189395e-06, + "loss": 0.5219, + "step": 3408 + }, + { + "epoch": 1.6118203309692671, + "grad_norm": 2.608579158782959, + "learning_rate": 4.195111343152079e-06, + "loss": 0.4941, + "step": 3409 + }, + { + "epoch": 1.6122931442080377, + "grad_norm": 2.643789529800415, + "learning_rate": 4.194652764567611e-06, + "loss": 0.515, + "step": 3410 + }, + { + "epoch": 1.6127659574468085, + "grad_norm": 2.8099429607391357, + "learning_rate": 4.194194080464547e-06, + "loss": 0.4935, + "step": 3411 + }, + { + "epoch": 1.6132387706855793, + "grad_norm": 2.595628261566162, + "learning_rate": 4.193735290871446e-06, + "loss": 0.5571, + "step": 3412 + }, + { + "epoch": 1.6137115839243499, + "grad_norm": 2.7903778553009033, + "learning_rate": 4.193276395816876e-06, + "loss": 0.5228, + "step": 3413 + }, + { + "epoch": 1.6141843971631205, + "grad_norm": 2.83910870552063, + "learning_rate": 4.192817395329409e-06, + "loss": 0.6124, + "step": 3414 + }, + { + "epoch": 1.6146572104018913, + "grad_norm": 2.6155734062194824, + "learning_rate": 4.192358289437626e-06, + "loss": 0.552, + "step": 3415 + }, + { + "epoch": 1.615130023640662, + "grad_norm": 2.795832872390747, + "learning_rate": 4.191899078170113e-06, + "loss": 0.5561, + "step": 3416 + }, + { + "epoch": 1.6156028368794326, + "grad_norm": 2.3402161598205566, + "learning_rate": 4.191439761555464e-06, + "loss": 0.4889, + "step": 3417 + }, + { + "epoch": 1.6160756501182032, + "grad_norm": 3.1183433532714844, + "learning_rate": 4.190980339622276e-06, + "loss": 0.5337, + "step": 3418 + }, + { + "epoch": 1.616548463356974, + "grad_norm": 2.6262872219085693, + "learning_rate": 4.190520812399158e-06, + "loss": 0.525, + "step": 3419 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 2.578340530395508, + "learning_rate": 4.190061179914722e-06, + "loss": 0.4975, + "step": 3420 + }, + { + "epoch": 1.6174940898345154, + "grad_norm": 3.19482159614563, + "learning_rate": 4.189601442197586e-06, + "loss": 0.5832, + "step": 3421 + }, + { + "epoch": 1.617966903073286, + "grad_norm": 2.6398792266845703, + "learning_rate": 
4.189141599276378e-06, + "loss": 0.4676, + "step": 3422 + }, + { + "epoch": 1.6184397163120567, + "grad_norm": 2.624865770339966, + "learning_rate": 4.1886816511797275e-06, + "loss": 0.4507, + "step": 3423 + }, + { + "epoch": 1.6189125295508275, + "grad_norm": 2.4136857986450195, + "learning_rate": 4.1882215979362775e-06, + "loss": 0.4616, + "step": 3424 + }, + { + "epoch": 1.6193853427895981, + "grad_norm": 2.6906614303588867, + "learning_rate": 4.18776143957467e-06, + "loss": 0.5142, + "step": 3425 + }, + { + "epoch": 1.6198581560283687, + "grad_norm": 2.5149154663085938, + "learning_rate": 4.187301176123558e-06, + "loss": 0.5252, + "step": 3426 + }, + { + "epoch": 1.6203309692671395, + "grad_norm": 2.677405834197998, + "learning_rate": 4.186840807611602e-06, + "loss": 0.4635, + "step": 3427 + }, + { + "epoch": 1.6208037825059103, + "grad_norm": 2.7164649963378906, + "learning_rate": 4.186380334067464e-06, + "loss": 0.5634, + "step": 3428 + }, + { + "epoch": 1.6212765957446809, + "grad_norm": 2.8299832344055176, + "learning_rate": 4.185919755519817e-06, + "loss": 0.5166, + "step": 3429 + }, + { + "epoch": 1.6217494089834514, + "grad_norm": 2.465848207473755, + "learning_rate": 4.18545907199734e-06, + "loss": 0.4696, + "step": 3430 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 2.407616376876831, + "learning_rate": 4.1849982835287175e-06, + "loss": 0.5111, + "step": 3431 + }, + { + "epoch": 1.622695035460993, + "grad_norm": 2.452146291732788, + "learning_rate": 4.184537390142639e-06, + "loss": 0.4574, + "step": 3432 + }, + { + "epoch": 1.6231678486997636, + "grad_norm": 2.653071165084839, + "learning_rate": 4.1840763918678055e-06, + "loss": 0.5611, + "step": 3433 + }, + { + "epoch": 1.6236406619385342, + "grad_norm": 2.5920350551605225, + "learning_rate": 4.183615288732919e-06, + "loss": 0.5437, + "step": 3434 + }, + { + "epoch": 1.624113475177305, + "grad_norm": 2.782900810241699, + "learning_rate": 4.18315408076669e-06, + "loss": 0.5824, + "step": 3435 + }, + { + "epoch": 1.6245862884160758, + "grad_norm": 2.8769774436950684, + "learning_rate": 4.1826927679978365e-06, + "loss": 0.5271, + "step": 3436 + }, + { + "epoch": 1.6250591016548463, + "grad_norm": 2.488598585128784, + "learning_rate": 4.182231350455084e-06, + "loss": 0.4684, + "step": 3437 + }, + { + "epoch": 1.625531914893617, + "grad_norm": 2.6472036838531494, + "learning_rate": 4.181769828167161e-06, + "loss": 0.5372, + "step": 3438 + }, + { + "epoch": 1.6260047281323877, + "grad_norm": 2.6498794555664062, + "learning_rate": 4.1813082011628045e-06, + "loss": 0.4805, + "step": 3439 + }, + { + "epoch": 1.6264775413711585, + "grad_norm": 2.5386533737182617, + "learning_rate": 4.1808464694707595e-06, + "loss": 0.5015, + "step": 3440 + }, + { + "epoch": 1.626950354609929, + "grad_norm": 2.8812551498413086, + "learning_rate": 4.180384633119775e-06, + "loss": 0.5225, + "step": 3441 + }, + { + "epoch": 1.6274231678486997, + "grad_norm": 2.870124578475952, + "learning_rate": 4.179922692138609e-06, + "loss": 0.537, + "step": 3442 + }, + { + "epoch": 1.6278959810874705, + "grad_norm": 2.5759785175323486, + "learning_rate": 4.179460646556021e-06, + "loss": 0.5142, + "step": 3443 + }, + { + "epoch": 1.6283687943262413, + "grad_norm": 2.629347324371338, + "learning_rate": 4.1789984964007836e-06, + "loss": 0.5007, + "step": 3444 + }, + { + "epoch": 1.6288416075650118, + "grad_norm": 2.751128673553467, + "learning_rate": 4.178536241701672e-06, + "loss": 0.5677, + "step": 3445 + }, + { + "epoch": 1.6293144208037824, + "grad_norm": 
2.7582364082336426, + "learning_rate": 4.178073882487469e-06, + "loss": 0.499, + "step": 3446 + }, + { + "epoch": 1.6297872340425532, + "grad_norm": 3.136711359024048, + "learning_rate": 4.177611418786963e-06, + "loss": 0.5294, + "step": 3447 + }, + { + "epoch": 1.630260047281324, + "grad_norm": 2.7363100051879883, + "learning_rate": 4.17714885062895e-06, + "loss": 0.5264, + "step": 3448 + }, + { + "epoch": 1.6307328605200946, + "grad_norm": 2.7305946350097656, + "learning_rate": 4.176686178042233e-06, + "loss": 0.5235, + "step": 3449 + }, + { + "epoch": 1.6312056737588652, + "grad_norm": 2.6500556468963623, + "learning_rate": 4.176223401055619e-06, + "loss": 0.5463, + "step": 3450 + }, + { + "epoch": 1.631678486997636, + "grad_norm": 2.756321907043457, + "learning_rate": 4.175760519697924e-06, + "loss": 0.545, + "step": 3451 + }, + { + "epoch": 1.6321513002364068, + "grad_norm": 2.6234960556030273, + "learning_rate": 4.17529753399797e-06, + "loss": 0.4927, + "step": 3452 + }, + { + "epoch": 1.6326241134751773, + "grad_norm": 2.6358842849731445, + "learning_rate": 4.174834443984584e-06, + "loss": 0.5445, + "step": 3453 + }, + { + "epoch": 1.633096926713948, + "grad_norm": 2.541147470474243, + "learning_rate": 4.174371249686601e-06, + "loss": 0.4691, + "step": 3454 + }, + { + "epoch": 1.6335697399527187, + "grad_norm": 2.566981077194214, + "learning_rate": 4.173907951132863e-06, + "loss": 0.4932, + "step": 3455 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 2.670940399169922, + "learning_rate": 4.173444548352216e-06, + "loss": 0.4979, + "step": 3456 + }, + { + "epoch": 1.63451536643026, + "grad_norm": 2.5440268516540527, + "learning_rate": 4.172981041373515e-06, + "loss": 0.4716, + "step": 3457 + }, + { + "epoch": 1.6349881796690307, + "grad_norm": 2.3801631927490234, + "learning_rate": 4.17251743022562e-06, + "loss": 0.5126, + "step": 3458 + }, + { + "epoch": 1.6354609929078014, + "grad_norm": 2.5051121711730957, + "learning_rate": 4.1720537149373985e-06, + "loss": 0.4964, + "step": 3459 + }, + { + "epoch": 1.6359338061465722, + "grad_norm": 3.5521697998046875, + "learning_rate": 4.171589895537724e-06, + "loss": 0.5447, + "step": 3460 + }, + { + "epoch": 1.6364066193853428, + "grad_norm": 2.6041572093963623, + "learning_rate": 4.171125972055477e-06, + "loss": 0.4637, + "step": 3461 + }, + { + "epoch": 1.6368794326241134, + "grad_norm": 2.2297258377075195, + "learning_rate": 4.170661944519543e-06, + "loss": 0.4702, + "step": 3462 + }, + { + "epoch": 1.6373522458628842, + "grad_norm": 2.6764535903930664, + "learning_rate": 4.170197812958815e-06, + "loss": 0.5111, + "step": 3463 + }, + { + "epoch": 1.637825059101655, + "grad_norm": 2.86892032623291, + "learning_rate": 4.169733577402193e-06, + "loss": 0.5437, + "step": 3464 + }, + { + "epoch": 1.6382978723404256, + "grad_norm": 2.9007070064544678, + "learning_rate": 4.1692692378785825e-06, + "loss": 0.5425, + "step": 3465 + }, + { + "epoch": 1.6387706855791961, + "grad_norm": 2.5902905464172363, + "learning_rate": 4.168804794416896e-06, + "loss": 0.5252, + "step": 3466 + }, + { + "epoch": 1.639243498817967, + "grad_norm": 2.821183681488037, + "learning_rate": 4.168340247046053e-06, + "loss": 0.5265, + "step": 3467 + }, + { + "epoch": 1.6397163120567377, + "grad_norm": 2.7928314208984375, + "learning_rate": 4.167875595794978e-06, + "loss": 0.5151, + "step": 3468 + }, + { + "epoch": 1.6401891252955083, + "grad_norm": 2.3130412101745605, + "learning_rate": 4.167410840692603e-06, + "loss": 0.4941, + "step": 3469 + }, + { + "epoch": 
1.6406619385342789, + "grad_norm": 2.6078619956970215, + "learning_rate": 4.1669459817678655e-06, + "loss": 0.493, + "step": 3470 + }, + { + "epoch": 1.6411347517730497, + "grad_norm": 2.5335731506347656, + "learning_rate": 4.166481019049712e-06, + "loss": 0.4969, + "step": 3471 + }, + { + "epoch": 1.6416075650118205, + "grad_norm": 2.8181469440460205, + "learning_rate": 4.166015952567093e-06, + "loss": 0.5062, + "step": 3472 + }, + { + "epoch": 1.642080378250591, + "grad_norm": 2.7256782054901123, + "learning_rate": 4.165550782348966e-06, + "loss": 0.5397, + "step": 3473 + }, + { + "epoch": 1.6425531914893616, + "grad_norm": 2.284345865249634, + "learning_rate": 4.1650855084242946e-06, + "loss": 0.4448, + "step": 3474 + }, + { + "epoch": 1.6430260047281324, + "grad_norm": 3.0383145809173584, + "learning_rate": 4.164620130822049e-06, + "loss": 0.5873, + "step": 3475 + }, + { + "epoch": 1.6434988179669032, + "grad_norm": 2.754448652267456, + "learning_rate": 4.1641546495712085e-06, + "loss": 0.4852, + "step": 3476 + }, + { + "epoch": 1.6439716312056738, + "grad_norm": 2.6820101737976074, + "learning_rate": 4.1636890647007535e-06, + "loss": 0.5325, + "step": 3477 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 2.6396398544311523, + "learning_rate": 4.163223376239676e-06, + "loss": 0.466, + "step": 3478 + }, + { + "epoch": 1.6449172576832152, + "grad_norm": 2.395049810409546, + "learning_rate": 4.162757584216972e-06, + "loss": 0.4531, + "step": 3479 + }, + { + "epoch": 1.645390070921986, + "grad_norm": 2.596670627593994, + "learning_rate": 4.162291688661645e-06, + "loss": 0.5207, + "step": 3480 + }, + { + "epoch": 1.6458628841607565, + "grad_norm": 2.4391872882843018, + "learning_rate": 4.161825689602703e-06, + "loss": 0.5133, + "step": 3481 + }, + { + "epoch": 1.6463356973995271, + "grad_norm": 2.6169841289520264, + "learning_rate": 4.161359587069162e-06, + "loss": 0.5096, + "step": 3482 + }, + { + "epoch": 1.646808510638298, + "grad_norm": 2.634089946746826, + "learning_rate": 4.1608933810900445e-06, + "loss": 0.4921, + "step": 3483 + }, + { + "epoch": 1.6472813238770687, + "grad_norm": 2.815877914428711, + "learning_rate": 4.160427071694379e-06, + "loss": 0.5045, + "step": 3484 + }, + { + "epoch": 1.6477541371158393, + "grad_norm": 2.417525053024292, + "learning_rate": 4.159960658911199e-06, + "loss": 0.4997, + "step": 3485 + }, + { + "epoch": 1.6482269503546099, + "grad_norm": 2.5713605880737305, + "learning_rate": 4.15949414276955e-06, + "loss": 0.5246, + "step": 3486 + }, + { + "epoch": 1.6486997635933807, + "grad_norm": 3.49833607673645, + "learning_rate": 4.159027523298475e-06, + "loss": 0.4901, + "step": 3487 + }, + { + "epoch": 1.6491725768321515, + "grad_norm": 2.985464334487915, + "learning_rate": 4.158560800527033e-06, + "loss": 0.5726, + "step": 3488 + }, + { + "epoch": 1.649645390070922, + "grad_norm": 2.72745680809021, + "learning_rate": 4.158093974484282e-06, + "loss": 0.5119, + "step": 3489 + }, + { + "epoch": 1.6501182033096926, + "grad_norm": 2.4885571002960205, + "learning_rate": 4.157627045199289e-06, + "loss": 0.4838, + "step": 3490 + }, + { + "epoch": 1.6505910165484634, + "grad_norm": 2.7622628211975098, + "learning_rate": 4.157160012701128e-06, + "loss": 0.5269, + "step": 3491 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 2.615122079849243, + "learning_rate": 4.156692877018879e-06, + "loss": 0.5501, + "step": 3492 + }, + { + "epoch": 1.6515366430260048, + "grad_norm": 2.827753782272339, + "learning_rate": 4.156225638181631e-06, + "loss": 0.5452, + 
"step": 3493 + }, + { + "epoch": 1.6520094562647754, + "grad_norm": 2.724820137023926, + "learning_rate": 4.155758296218474e-06, + "loss": 0.5155, + "step": 3494 + }, + { + "epoch": 1.6524822695035462, + "grad_norm": 2.5806174278259277, + "learning_rate": 4.155290851158508e-06, + "loss": 0.5292, + "step": 3495 + }, + { + "epoch": 1.652955082742317, + "grad_norm": 2.5655179023742676, + "learning_rate": 4.154823303030838e-06, + "loss": 0.4959, + "step": 3496 + }, + { + "epoch": 1.6534278959810875, + "grad_norm": 2.656548261642456, + "learning_rate": 4.154355651864579e-06, + "loss": 0.5703, + "step": 3497 + }, + { + "epoch": 1.653900709219858, + "grad_norm": 2.9085004329681396, + "learning_rate": 4.153887897688847e-06, + "loss": 0.5061, + "step": 3498 + }, + { + "epoch": 1.654373522458629, + "grad_norm": 2.608010768890381, + "learning_rate": 4.1534200405327665e-06, + "loss": 0.5165, + "step": 3499 + }, + { + "epoch": 1.6548463356973995, + "grad_norm": 2.600463628768921, + "learning_rate": 4.152952080425471e-06, + "loss": 0.4946, + "step": 3500 + }, + { + "epoch": 1.65531914893617, + "grad_norm": 2.5561563968658447, + "learning_rate": 4.152484017396098e-06, + "loss": 0.4804, + "step": 3501 + }, + { + "epoch": 1.6557919621749408, + "grad_norm": 2.788594961166382, + "learning_rate": 4.152015851473791e-06, + "loss": 0.5635, + "step": 3502 + }, + { + "epoch": 1.6562647754137116, + "grad_norm": 2.693302631378174, + "learning_rate": 4.151547582687699e-06, + "loss": 0.5139, + "step": 3503 + }, + { + "epoch": 1.6567375886524822, + "grad_norm": 2.7887485027313232, + "learning_rate": 4.1510792110669825e-06, + "loss": 0.4952, + "step": 3504 + }, + { + "epoch": 1.6572104018912528, + "grad_norm": 2.8982298374176025, + "learning_rate": 4.150610736640803e-06, + "loss": 0.4136, + "step": 3505 + }, + { + "epoch": 1.6576832151300236, + "grad_norm": 2.7569408416748047, + "learning_rate": 4.150142159438331e-06, + "loss": 0.5272, + "step": 3506 + }, + { + "epoch": 1.6581560283687944, + "grad_norm": 2.531648874282837, + "learning_rate": 4.149673479488742e-06, + "loss": 0.5016, + "step": 3507 + }, + { + "epoch": 1.658628841607565, + "grad_norm": 2.7706353664398193, + "learning_rate": 4.149204696821219e-06, + "loss": 0.5512, + "step": 3508 + }, + { + "epoch": 1.6591016548463355, + "grad_norm": 2.7307450771331787, + "learning_rate": 4.148735811464951e-06, + "loss": 0.4968, + "step": 3509 + }, + { + "epoch": 1.6595744680851063, + "grad_norm": 3.0097429752349854, + "learning_rate": 4.1482668234491335e-06, + "loss": 0.4797, + "step": 3510 + }, + { + "epoch": 1.6600472813238771, + "grad_norm": 2.6045308113098145, + "learning_rate": 4.147797732802969e-06, + "loss": 0.5496, + "step": 3511 + }, + { + "epoch": 1.6605200945626477, + "grad_norm": 2.702061176300049, + "learning_rate": 4.147328539555664e-06, + "loss": 0.5302, + "step": 3512 + }, + { + "epoch": 1.6609929078014183, + "grad_norm": 3.3724892139434814, + "learning_rate": 4.1468592437364356e-06, + "loss": 0.5124, + "step": 3513 + }, + { + "epoch": 1.661465721040189, + "grad_norm": 2.5117242336273193, + "learning_rate": 4.146389845374502e-06, + "loss": 0.4953, + "step": 3514 + }, + { + "epoch": 1.6619385342789599, + "grad_norm": 2.86547589302063, + "learning_rate": 4.145920344499092e-06, + "loss": 0.5337, + "step": 3515 + }, + { + "epoch": 1.6624113475177305, + "grad_norm": 2.745149850845337, + "learning_rate": 4.14545074113944e-06, + "loss": 0.5187, + "step": 3516 + }, + { + "epoch": 1.662884160756501, + "grad_norm": 2.5560994148254395, + "learning_rate": 
4.1449810353247855e-06, + "loss": 0.5183, + "step": 3517 + }, + { + "epoch": 1.6633569739952718, + "grad_norm": 2.2318122386932373, + "learning_rate": 4.144511227084374e-06, + "loss": 0.4452, + "step": 3518 + }, + { + "epoch": 1.6638297872340426, + "grad_norm": 2.6980903148651123, + "learning_rate": 4.14404131644746e-06, + "loss": 0.4974, + "step": 3519 + }, + { + "epoch": 1.6643026004728132, + "grad_norm": 2.6875357627868652, + "learning_rate": 4.1435713034433025e-06, + "loss": 0.4582, + "step": 3520 + }, + { + "epoch": 1.6647754137115838, + "grad_norm": 2.9430019855499268, + "learning_rate": 4.143101188101166e-06, + "loss": 0.5004, + "step": 3521 + }, + { + "epoch": 1.6652482269503546, + "grad_norm": 2.4447221755981445, + "learning_rate": 4.142630970450323e-06, + "loss": 0.5436, + "step": 3522 + }, + { + "epoch": 1.6657210401891254, + "grad_norm": 2.571023941040039, + "learning_rate": 4.142160650520053e-06, + "loss": 0.5307, + "step": 3523 + }, + { + "epoch": 1.666193853427896, + "grad_norm": 2.9725306034088135, + "learning_rate": 4.14169022833964e-06, + "loss": 0.5918, + "step": 3524 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.5958926677703857, + "learning_rate": 4.141219703938375e-06, + "loss": 0.5036, + "step": 3525 + }, + { + "epoch": 1.6671394799054373, + "grad_norm": 2.935788631439209, + "learning_rate": 4.140749077345556e-06, + "loss": 0.5773, + "step": 3526 + }, + { + "epoch": 1.6676122931442081, + "grad_norm": 2.5460526943206787, + "learning_rate": 4.140278348590485e-06, + "loss": 0.4762, + "step": 3527 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 2.5729143619537354, + "learning_rate": 4.139807517702475e-06, + "loss": 0.5515, + "step": 3528 + }, + { + "epoch": 1.6685579196217493, + "grad_norm": 2.4377381801605225, + "learning_rate": 4.13933658471084e-06, + "loss": 0.5383, + "step": 3529 + }, + { + "epoch": 1.66903073286052, + "grad_norm": 2.6284425258636475, + "learning_rate": 4.138865549644905e-06, + "loss": 0.5396, + "step": 3530 + }, + { + "epoch": 1.6695035460992909, + "grad_norm": 2.857250928878784, + "learning_rate": 4.138394412533998e-06, + "loss": 0.5861, + "step": 3531 + }, + { + "epoch": 1.6699763593380614, + "grad_norm": 2.9226012229919434, + "learning_rate": 4.137923173407456e-06, + "loss": 0.5262, + "step": 3532 + }, + { + "epoch": 1.670449172576832, + "grad_norm": 4.839131832122803, + "learning_rate": 4.137451832294619e-06, + "loss": 0.651, + "step": 3533 + }, + { + "epoch": 1.6709219858156028, + "grad_norm": 2.4727771282196045, + "learning_rate": 4.1369803892248375e-06, + "loss": 0.5149, + "step": 3534 + }, + { + "epoch": 1.6713947990543736, + "grad_norm": 2.5391688346862793, + "learning_rate": 4.1365088442274635e-06, + "loss": 0.4907, + "step": 3535 + }, + { + "epoch": 1.6718676122931442, + "grad_norm": 2.5168209075927734, + "learning_rate": 4.136037197331862e-06, + "loss": 0.5091, + "step": 3536 + }, + { + "epoch": 1.6723404255319148, + "grad_norm": 2.6278600692749023, + "learning_rate": 4.135565448567396e-06, + "loss": 0.4357, + "step": 3537 + }, + { + "epoch": 1.6728132387706856, + "grad_norm": 2.835184097290039, + "learning_rate": 4.135093597963441e-06, + "loss": 0.4786, + "step": 3538 + }, + { + "epoch": 1.6732860520094563, + "grad_norm": 2.385328531265259, + "learning_rate": 4.134621645549379e-06, + "loss": 0.4849, + "step": 3539 + }, + { + "epoch": 1.673758865248227, + "grad_norm": 2.6504149436950684, + "learning_rate": 4.134149591354593e-06, + "loss": 0.6037, + "step": 3540 + }, + { + "epoch": 1.6742316784869975, + "grad_norm": 
2.945634126663208, + "learning_rate": 4.1336774354084786e-06, + "loss": 0.532, + "step": 3541 + }, + { + "epoch": 1.6747044917257683, + "grad_norm": 2.8373215198516846, + "learning_rate": 4.133205177740434e-06, + "loss": 0.5138, + "step": 3542 + }, + { + "epoch": 1.675177304964539, + "grad_norm": 2.6616621017456055, + "learning_rate": 4.1327328183798634e-06, + "loss": 0.5543, + "step": 3543 + }, + { + "epoch": 1.6756501182033097, + "grad_norm": 3.0843071937561035, + "learning_rate": 4.13226035735618e-06, + "loss": 0.6585, + "step": 3544 + }, + { + "epoch": 1.6761229314420802, + "grad_norm": 2.2214272022247314, + "learning_rate": 4.131787794698802e-06, + "loss": 0.5413, + "step": 3545 + }, + { + "epoch": 1.676595744680851, + "grad_norm": 2.4515018463134766, + "learning_rate": 4.131315130437152e-06, + "loss": 0.4966, + "step": 3546 + }, + { + "epoch": 1.6770685579196218, + "grad_norm": 2.647414207458496, + "learning_rate": 4.130842364600663e-06, + "loss": 0.5401, + "step": 3547 + }, + { + "epoch": 1.6775413711583924, + "grad_norm": 2.648941993713379, + "learning_rate": 4.13036949721877e-06, + "loss": 0.4796, + "step": 3548 + }, + { + "epoch": 1.678014184397163, + "grad_norm": 2.7835679054260254, + "learning_rate": 4.129896528320919e-06, + "loss": 0.5653, + "step": 3549 + }, + { + "epoch": 1.6784869976359338, + "grad_norm": 2.995964288711548, + "learning_rate": 4.129423457936556e-06, + "loss": 0.4999, + "step": 3550 + }, + { + "epoch": 1.6789598108747046, + "grad_norm": 2.5980007648468018, + "learning_rate": 4.1289502860951405e-06, + "loss": 0.5177, + "step": 3551 + }, + { + "epoch": 1.6794326241134752, + "grad_norm": 2.442254066467285, + "learning_rate": 4.128477012826133e-06, + "loss": 0.5062, + "step": 3552 + }, + { + "epoch": 1.6799054373522457, + "grad_norm": 2.3007538318634033, + "learning_rate": 4.1280036381590025e-06, + "loss": 0.5029, + "step": 3553 + }, + { + "epoch": 1.6803782505910165, + "grad_norm": 2.4169347286224365, + "learning_rate": 4.1275301621232245e-06, + "loss": 0.515, + "step": 3554 + }, + { + "epoch": 1.6808510638297873, + "grad_norm": 2.6456379890441895, + "learning_rate": 4.127056584748279e-06, + "loss": 0.5343, + "step": 3555 + }, + { + "epoch": 1.681323877068558, + "grad_norm": 2.6406595706939697, + "learning_rate": 4.1265829060636546e-06, + "loss": 0.5047, + "step": 3556 + }, + { + "epoch": 1.6817966903073285, + "grad_norm": 2.9344475269317627, + "learning_rate": 4.126109126098846e-06, + "loss": 0.5501, + "step": 3557 + }, + { + "epoch": 1.6822695035460993, + "grad_norm": 2.3292455673217773, + "learning_rate": 4.125635244883351e-06, + "loss": 0.463, + "step": 3558 + }, + { + "epoch": 1.68274231678487, + "grad_norm": 2.4150657653808594, + "learning_rate": 4.125161262446677e-06, + "loss": 0.4802, + "step": 3559 + }, + { + "epoch": 1.6832151300236406, + "grad_norm": 2.604292392730713, + "learning_rate": 4.124687178818339e-06, + "loss": 0.5683, + "step": 3560 + }, + { + "epoch": 1.6836879432624112, + "grad_norm": 2.5676791667938232, + "learning_rate": 4.1242129940278544e-06, + "loss": 0.5519, + "step": 3561 + }, + { + "epoch": 1.684160756501182, + "grad_norm": 3.078514814376831, + "learning_rate": 4.123738708104748e-06, + "loss": 0.5194, + "step": 3562 + }, + { + "epoch": 1.6846335697399528, + "grad_norm": 2.893577814102173, + "learning_rate": 4.123264321078552e-06, + "loss": 0.5107, + "step": 3563 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 2.772413730621338, + "learning_rate": 4.122789832978804e-06, + "loss": 0.6147, + "step": 3564 + }, + { + "epoch": 
1.685579196217494, + "grad_norm": 2.5804643630981445, + "learning_rate": 4.12231524383505e-06, + "loss": 0.5057, + "step": 3565 + }, + { + "epoch": 1.6860520094562648, + "grad_norm": 2.599571466445923, + "learning_rate": 4.121840553676839e-06, + "loss": 0.5591, + "step": 3566 + }, + { + "epoch": 1.6865248226950356, + "grad_norm": 2.9124577045440674, + "learning_rate": 4.1213657625337275e-06, + "loss": 0.565, + "step": 3567 + }, + { + "epoch": 1.6869976359338061, + "grad_norm": 2.6582155227661133, + "learning_rate": 4.120890870435281e-06, + "loss": 0.4607, + "step": 3568 + }, + { + "epoch": 1.6874704491725767, + "grad_norm": 2.929227590560913, + "learning_rate": 4.120415877411066e-06, + "loss": 0.5705, + "step": 3569 + }, + { + "epoch": 1.6879432624113475, + "grad_norm": 2.4443247318267822, + "learning_rate": 4.11994078349066e-06, + "loss": 0.4592, + "step": 3570 + }, + { + "epoch": 1.6884160756501183, + "grad_norm": 2.4799163341522217, + "learning_rate": 4.119465588703645e-06, + "loss": 0.5361, + "step": 3571 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 2.9408936500549316, + "learning_rate": 4.1189902930796085e-06, + "loss": 0.5347, + "step": 3572 + }, + { + "epoch": 1.6893617021276595, + "grad_norm": 3.3348076343536377, + "learning_rate": 4.118514896648146e-06, + "loss": 0.5612, + "step": 3573 + }, + { + "epoch": 1.6898345153664303, + "grad_norm": 2.764889717102051, + "learning_rate": 4.118039399438857e-06, + "loss": 0.4745, + "step": 3574 + }, + { + "epoch": 1.690307328605201, + "grad_norm": 2.7023751735687256, + "learning_rate": 4.11756380148135e-06, + "loss": 0.5106, + "step": 3575 + }, + { + "epoch": 1.6907801418439716, + "grad_norm": 2.8816208839416504, + "learning_rate": 4.117088102805238e-06, + "loss": 0.6016, + "step": 3576 + }, + { + "epoch": 1.6912529550827422, + "grad_norm": 2.215733289718628, + "learning_rate": 4.11661230344014e-06, + "loss": 0.4404, + "step": 3577 + }, + { + "epoch": 1.691725768321513, + "grad_norm": 2.8190999031066895, + "learning_rate": 4.116136403415683e-06, + "loss": 0.5038, + "step": 3578 + }, + { + "epoch": 1.6921985815602838, + "grad_norm": 2.616424083709717, + "learning_rate": 4.115660402761499e-06, + "loss": 0.5493, + "step": 3579 + }, + { + "epoch": 1.6926713947990544, + "grad_norm": 2.7738113403320312, + "learning_rate": 4.115184301507226e-06, + "loss": 0.5416, + "step": 3580 + }, + { + "epoch": 1.693144208037825, + "grad_norm": 2.4793593883514404, + "learning_rate": 4.114708099682509e-06, + "loss": 0.4526, + "step": 3581 + }, + { + "epoch": 1.6936170212765957, + "grad_norm": 2.390652894973755, + "learning_rate": 4.114231797316999e-06, + "loss": 0.4908, + "step": 3582 + }, + { + "epoch": 1.6940898345153665, + "grad_norm": 2.513197660446167, + "learning_rate": 4.113755394440352e-06, + "loss": 0.4738, + "step": 3583 + }, + { + "epoch": 1.6945626477541371, + "grad_norm": 2.504497766494751, + "learning_rate": 4.113278891082234e-06, + "loss": 0.4661, + "step": 3584 + }, + { + "epoch": 1.6950354609929077, + "grad_norm": 2.4966917037963867, + "learning_rate": 4.112802287272314e-06, + "loss": 0.4979, + "step": 3585 + }, + { + "epoch": 1.6955082742316785, + "grad_norm": 2.3129689693450928, + "learning_rate": 4.112325583040265e-06, + "loss": 0.4933, + "step": 3586 + }, + { + "epoch": 1.6959810874704493, + "grad_norm": 2.822136878967285, + "learning_rate": 4.111848778415774e-06, + "loss": 0.5087, + "step": 3587 + }, + { + "epoch": 1.6964539007092199, + "grad_norm": 2.5181210041046143, + "learning_rate": 4.111371873428527e-06, + "loss": 0.4836, + 
"step": 3588 + }, + { + "epoch": 1.6969267139479904, + "grad_norm": 2.7564687728881836, + "learning_rate": 4.110894868108218e-06, + "loss": 0.5224, + "step": 3589 + }, + { + "epoch": 1.6973995271867612, + "grad_norm": 2.424421787261963, + "learning_rate": 4.11041776248455e-06, + "loss": 0.4552, + "step": 3590 + }, + { + "epoch": 1.697872340425532, + "grad_norm": 2.7013823986053467, + "learning_rate": 4.10994055658723e-06, + "loss": 0.5535, + "step": 3591 + }, + { + "epoch": 1.6983451536643026, + "grad_norm": 2.5660946369171143, + "learning_rate": 4.10946325044597e-06, + "loss": 0.5351, + "step": 3592 + }, + { + "epoch": 1.6988179669030732, + "grad_norm": 2.5598108768463135, + "learning_rate": 4.10898584409049e-06, + "loss": 0.5246, + "step": 3593 + }, + { + "epoch": 1.699290780141844, + "grad_norm": 2.6318907737731934, + "learning_rate": 4.108508337550518e-06, + "loss": 0.5002, + "step": 3594 + }, + { + "epoch": 1.6997635933806148, + "grad_norm": 2.527099132537842, + "learning_rate": 4.108030730855784e-06, + "loss": 0.5366, + "step": 3595 + }, + { + "epoch": 1.7002364066193854, + "grad_norm": 2.8629603385925293, + "learning_rate": 4.107553024036029e-06, + "loss": 0.5742, + "step": 3596 + }, + { + "epoch": 1.700709219858156, + "grad_norm": 2.8084018230438232, + "learning_rate": 4.107075217120994e-06, + "loss": 0.5618, + "step": 3597 + }, + { + "epoch": 1.7011820330969267, + "grad_norm": 3.6470065116882324, + "learning_rate": 4.1065973101404325e-06, + "loss": 0.508, + "step": 3598 + }, + { + "epoch": 1.7016548463356975, + "grad_norm": 3.0332422256469727, + "learning_rate": 4.106119303124102e-06, + "loss": 0.51, + "step": 3599 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 2.4887590408325195, + "learning_rate": 4.105641196101765e-06, + "loss": 0.5109, + "step": 3600 + }, + { + "epoch": 1.7026004728132387, + "grad_norm": 2.6102066040039062, + "learning_rate": 4.105162989103191e-06, + "loss": 0.5278, + "step": 3601 + }, + { + "epoch": 1.7030732860520095, + "grad_norm": 2.771578073501587, + "learning_rate": 4.104684682158156e-06, + "loss": 0.498, + "step": 3602 + }, + { + "epoch": 1.7035460992907803, + "grad_norm": 2.5452702045440674, + "learning_rate": 4.1042062752964425e-06, + "loss": 0.4939, + "step": 3603 + }, + { + "epoch": 1.7040189125295508, + "grad_norm": 2.4287021160125732, + "learning_rate": 4.103727768547838e-06, + "loss": 0.4819, + "step": 3604 + }, + { + "epoch": 1.7044917257683214, + "grad_norm": 2.412280321121216, + "learning_rate": 4.103249161942138e-06, + "loss": 0.5196, + "step": 3605 + }, + { + "epoch": 1.7049645390070922, + "grad_norm": 2.8850717544555664, + "learning_rate": 4.102770455509142e-06, + "loss": 0.5724, + "step": 3606 + }, + { + "epoch": 1.705437352245863, + "grad_norm": 2.7979609966278076, + "learning_rate": 4.102291649278659e-06, + "loss": 0.5295, + "step": 3607 + }, + { + "epoch": 1.7059101654846336, + "grad_norm": 2.762238025665283, + "learning_rate": 4.1018127432805e-06, + "loss": 0.5166, + "step": 3608 + }, + { + "epoch": 1.7063829787234042, + "grad_norm": 2.921586513519287, + "learning_rate": 4.101333737544485e-06, + "loss": 0.5607, + "step": 3609 + }, + { + "epoch": 1.706855791962175, + "grad_norm": 3.001929998397827, + "learning_rate": 4.100854632100439e-06, + "loss": 0.6255, + "step": 3610 + }, + { + "epoch": 1.7073286052009458, + "grad_norm": 2.752713918685913, + "learning_rate": 4.100375426978196e-06, + "loss": 0.5732, + "step": 3611 + }, + { + "epoch": 1.7078014184397163, + "grad_norm": 2.6496472358703613, + "learning_rate": 
4.099896122207593e-06, + "loss": 0.5138, + "step": 3612 + }, + { + "epoch": 1.708274231678487, + "grad_norm": 3.0079452991485596, + "learning_rate": 4.099416717818473e-06, + "loss": 0.5746, + "step": 3613 + }, + { + "epoch": 1.7087470449172577, + "grad_norm": 2.5762360095977783, + "learning_rate": 4.098937213840687e-06, + "loss": 0.5308, + "step": 3614 + }, + { + "epoch": 1.7092198581560285, + "grad_norm": 2.6026158332824707, + "learning_rate": 4.098457610304092e-06, + "loss": 0.4857, + "step": 3615 + }, + { + "epoch": 1.709692671394799, + "grad_norm": 2.587583541870117, + "learning_rate": 4.097977907238551e-06, + "loss": 0.4591, + "step": 3616 + }, + { + "epoch": 1.7101654846335697, + "grad_norm": 2.6996991634368896, + "learning_rate": 4.097498104673932e-06, + "loss": 0.5298, + "step": 3617 + }, + { + "epoch": 1.7106382978723405, + "grad_norm": 2.600029945373535, + "learning_rate": 4.097018202640111e-06, + "loss": 0.4726, + "step": 3618 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 2.8261220455169678, + "learning_rate": 4.096538201166969e-06, + "loss": 0.5242, + "step": 3619 + }, + { + "epoch": 1.7115839243498818, + "grad_norm": 3.053027629852295, + "learning_rate": 4.096058100284394e-06, + "loss": 0.5568, + "step": 3620 + }, + { + "epoch": 1.7120567375886524, + "grad_norm": 2.9638442993164062, + "learning_rate": 4.0955779000222805e-06, + "loss": 0.5325, + "step": 3621 + }, + { + "epoch": 1.7125295508274232, + "grad_norm": 2.731095790863037, + "learning_rate": 4.095097600410527e-06, + "loss": 0.4733, + "step": 3622 + }, + { + "epoch": 1.713002364066194, + "grad_norm": 2.632490873336792, + "learning_rate": 4.09461720147904e-06, + "loss": 0.5253, + "step": 3623 + }, + { + "epoch": 1.7134751773049646, + "grad_norm": 2.847689390182495, + "learning_rate": 4.094136703257732e-06, + "loss": 0.57, + "step": 3624 + }, + { + "epoch": 1.7139479905437351, + "grad_norm": 3.1078696250915527, + "learning_rate": 4.0936561057765215e-06, + "loss": 0.5368, + "step": 3625 + }, + { + "epoch": 1.714420803782506, + "grad_norm": 2.696349620819092, + "learning_rate": 4.0931754090653334e-06, + "loss": 0.491, + "step": 3626 + }, + { + "epoch": 1.7148936170212767, + "grad_norm": 2.712958812713623, + "learning_rate": 4.092694613154099e-06, + "loss": 0.5768, + "step": 3627 + }, + { + "epoch": 1.7153664302600473, + "grad_norm": 2.5421478748321533, + "learning_rate": 4.092213718072754e-06, + "loss": 0.4839, + "step": 3628 + }, + { + "epoch": 1.715839243498818, + "grad_norm": 2.5176162719726562, + "learning_rate": 4.091732723851243e-06, + "loss": 0.5049, + "step": 3629 + }, + { + "epoch": 1.7163120567375887, + "grad_norm": 2.642185926437378, + "learning_rate": 4.091251630519514e-06, + "loss": 0.589, + "step": 3630 + }, + { + "epoch": 1.7167848699763595, + "grad_norm": 2.587348461151123, + "learning_rate": 4.0907704381075245e-06, + "loss": 0.5281, + "step": 3631 + }, + { + "epoch": 1.71725768321513, + "grad_norm": 2.4628195762634277, + "learning_rate": 4.090289146645234e-06, + "loss": 0.5592, + "step": 3632 + }, + { + "epoch": 1.7177304964539006, + "grad_norm": 2.2751028537750244, + "learning_rate": 4.0898077561626125e-06, + "loss": 0.502, + "step": 3633 + }, + { + "epoch": 1.7182033096926714, + "grad_norm": 2.7712769508361816, + "learning_rate": 4.089326266689632e-06, + "loss": 0.5143, + "step": 3634 + }, + { + "epoch": 1.7186761229314422, + "grad_norm": 2.5297727584838867, + "learning_rate": 4.088844678256275e-06, + "loss": 0.5035, + "step": 3635 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 
2.739130735397339, + "learning_rate": 4.088362990892527e-06, + "loss": 0.5959, + "step": 3636 + }, + { + "epoch": 1.7196217494089834, + "grad_norm": 2.3708314895629883, + "learning_rate": 4.08788120462838e-06, + "loss": 0.4796, + "step": 3637 + }, + { + "epoch": 1.7200945626477542, + "grad_norm": 2.7664241790771484, + "learning_rate": 4.087399319493832e-06, + "loss": 0.6052, + "step": 3638 + }, + { + "epoch": 1.720567375886525, + "grad_norm": 2.5900204181671143, + "learning_rate": 4.0869173355188895e-06, + "loss": 0.4955, + "step": 3639 + }, + { + "epoch": 1.7210401891252955, + "grad_norm": 2.6771862506866455, + "learning_rate": 4.0864352527335635e-06, + "loss": 0.4889, + "step": 3640 + }, + { + "epoch": 1.7215130023640661, + "grad_norm": 2.888479471206665, + "learning_rate": 4.085953071167871e-06, + "loss": 0.5719, + "step": 3641 + }, + { + "epoch": 1.721985815602837, + "grad_norm": 2.5967187881469727, + "learning_rate": 4.085470790851833e-06, + "loss": 0.4959, + "step": 3642 + }, + { + "epoch": 1.7224586288416077, + "grad_norm": 2.5317695140838623, + "learning_rate": 4.084988411815483e-06, + "loss": 0.4596, + "step": 3643 + }, + { + "epoch": 1.7229314420803783, + "grad_norm": 2.6531455516815186, + "learning_rate": 4.084505934088853e-06, + "loss": 0.5346, + "step": 3644 + }, + { + "epoch": 1.7234042553191489, + "grad_norm": 2.6525208950042725, + "learning_rate": 4.084023357701987e-06, + "loss": 0.5178, + "step": 3645 + }, + { + "epoch": 1.7238770685579197, + "grad_norm": 2.461954116821289, + "learning_rate": 4.083540682684932e-06, + "loss": 0.4802, + "step": 3646 + }, + { + "epoch": 1.7243498817966905, + "grad_norm": 2.794696807861328, + "learning_rate": 4.083057909067743e-06, + "loss": 0.5148, + "step": 3647 + }, + { + "epoch": 1.724822695035461, + "grad_norm": 2.867572546005249, + "learning_rate": 4.082575036880479e-06, + "loss": 0.5352, + "step": 3648 + }, + { + "epoch": 1.7252955082742316, + "grad_norm": 2.642820358276367, + "learning_rate": 4.082092066153207e-06, + "loss": 0.4652, + "step": 3649 + }, + { + "epoch": 1.7257683215130024, + "grad_norm": 2.782142400741577, + "learning_rate": 4.081608996915999e-06, + "loss": 0.5591, + "step": 3650 + }, + { + "epoch": 1.7262411347517732, + "grad_norm": 2.327331304550171, + "learning_rate": 4.081125829198934e-06, + "loss": 0.4339, + "step": 3651 + }, + { + "epoch": 1.7267139479905438, + "grad_norm": 2.7959988117218018, + "learning_rate": 4.0806425630320965e-06, + "loss": 0.5783, + "step": 3652 + }, + { + "epoch": 1.7271867612293144, + "grad_norm": 2.595053195953369, + "learning_rate": 4.080159198445578e-06, + "loss": 0.4602, + "step": 3653 + }, + { + "epoch": 1.7276595744680852, + "grad_norm": 3.0968129634857178, + "learning_rate": 4.079675735469475e-06, + "loss": 0.5775, + "step": 3654 + }, + { + "epoch": 1.728132387706856, + "grad_norm": 2.628044605255127, + "learning_rate": 4.07919217413389e-06, + "loss": 0.486, + "step": 3655 + }, + { + "epoch": 1.7286052009456265, + "grad_norm": 2.782799005508423, + "learning_rate": 4.078708514468933e-06, + "loss": 0.5282, + "step": 3656 + }, + { + "epoch": 1.729078014184397, + "grad_norm": 2.655365467071533, + "learning_rate": 4.0782247565047205e-06, + "loss": 0.4873, + "step": 3657 + }, + { + "epoch": 1.729550827423168, + "grad_norm": 2.9461584091186523, + "learning_rate": 4.077740900271371e-06, + "loss": 0.548, + "step": 3658 + }, + { + "epoch": 1.7300236406619387, + "grad_norm": 2.5094761848449707, + "learning_rate": 4.077256945799015e-06, + "loss": 0.5437, + "step": 3659 + }, + { + "epoch": 
1.7304964539007093, + "grad_norm": 2.555793285369873, + "learning_rate": 4.0767728931177845e-06, + "loss": 0.5268, + "step": 3660 + }, + { + "epoch": 1.7309692671394799, + "grad_norm": 2.4433486461639404, + "learning_rate": 4.07628874225782e-06, + "loss": 0.5211, + "step": 3661 + }, + { + "epoch": 1.7314420803782506, + "grad_norm": 2.365206003189087, + "learning_rate": 4.075804493249267e-06, + "loss": 0.5084, + "step": 3662 + }, + { + "epoch": 1.7319148936170212, + "grad_norm": 2.514305830001831, + "learning_rate": 4.075320146122278e-06, + "loss": 0.4693, + "step": 3663 + }, + { + "epoch": 1.7323877068557918, + "grad_norm": 2.9270083904266357, + "learning_rate": 4.074835700907012e-06, + "loss": 0.5724, + "step": 3664 + }, + { + "epoch": 1.7328605200945626, + "grad_norm": 2.938692569732666, + "learning_rate": 4.0743511576336315e-06, + "loss": 0.5361, + "step": 3665 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 3.1978867053985596, + "learning_rate": 4.073866516332307e-06, + "loss": 0.6277, + "step": 3666 + }, + { + "epoch": 1.733806146572104, + "grad_norm": 2.3477370738983154, + "learning_rate": 4.073381777033217e-06, + "loss": 0.5139, + "step": 3667 + }, + { + "epoch": 1.7342789598108745, + "grad_norm": 2.5954184532165527, + "learning_rate": 4.072896939766543e-06, + "loss": 0.537, + "step": 3668 + }, + { + "epoch": 1.7347517730496453, + "grad_norm": 2.8999998569488525, + "learning_rate": 4.072412004562472e-06, + "loss": 0.5486, + "step": 3669 + }, + { + "epoch": 1.7352245862884161, + "grad_norm": 2.7320556640625, + "learning_rate": 4.071926971451201e-06, + "loss": 0.6025, + "step": 3670 + }, + { + "epoch": 1.7356973995271867, + "grad_norm": 2.499234676361084, + "learning_rate": 4.0714418404629304e-06, + "loss": 0.456, + "step": 3671 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 2.485924243927002, + "learning_rate": 4.070956611627867e-06, + "loss": 0.5097, + "step": 3672 + }, + { + "epoch": 1.736643026004728, + "grad_norm": 2.513723373413086, + "learning_rate": 4.070471284976225e-06, + "loss": 0.4744, + "step": 3673 + }, + { + "epoch": 1.7371158392434989, + "grad_norm": 2.281977653503418, + "learning_rate": 4.06998586053822e-06, + "loss": 0.5124, + "step": 3674 + }, + { + "epoch": 1.7375886524822695, + "grad_norm": 2.3683905601501465, + "learning_rate": 4.069500338344081e-06, + "loss": 0.4816, + "step": 3675 + }, + { + "epoch": 1.73806146572104, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.069014718424038e-06, + "loss": 0.5665, + "step": 3676 + }, + { + "epoch": 1.7385342789598108, + "grad_norm": 2.7308456897735596, + "learning_rate": 4.068529000808328e-06, + "loss": 0.534, + "step": 3677 + }, + { + "epoch": 1.7390070921985816, + "grad_norm": 2.788452625274658, + "learning_rate": 4.068043185527196e-06, + "loss": 0.5609, + "step": 3678 + }, + { + "epoch": 1.7394799054373522, + "grad_norm": 2.832368850708008, + "learning_rate": 4.067557272610889e-06, + "loss": 0.553, + "step": 3679 + }, + { + "epoch": 1.7399527186761228, + "grad_norm": 2.9987435340881348, + "learning_rate": 4.067071262089665e-06, + "loss": 0.5, + "step": 3680 + }, + { + "epoch": 1.7404255319148936, + "grad_norm": 3.04913067817688, + "learning_rate": 4.066585153993785e-06, + "loss": 0.5158, + "step": 3681 + }, + { + "epoch": 1.7408983451536644, + "grad_norm": 2.5177130699157715, + "learning_rate": 4.066098948353516e-06, + "loss": 0.4508, + "step": 3682 + }, + { + "epoch": 1.741371158392435, + "grad_norm": 2.8991222381591797, + "learning_rate": 4.065612645199133e-06, + "loss": 0.5268, + "step": 3683 
+ }, + { + "epoch": 1.7418439716312055, + "grad_norm": 2.4928159713745117, + "learning_rate": 4.0651262445609156e-06, + "loss": 0.5024, + "step": 3684 + }, + { + "epoch": 1.7423167848699763, + "grad_norm": 2.9737319946289062, + "learning_rate": 4.06463974646915e-06, + "loss": 0.5429, + "step": 3685 + }, + { + "epoch": 1.7427895981087471, + "grad_norm": 2.6485493183135986, + "learning_rate": 4.064153150954128e-06, + "loss": 0.5619, + "step": 3686 + }, + { + "epoch": 1.7432624113475177, + "grad_norm": 2.564861297607422, + "learning_rate": 4.063666458046148e-06, + "loss": 0.4878, + "step": 3687 + }, + { + "epoch": 1.7437352245862883, + "grad_norm": 2.6048383712768555, + "learning_rate": 4.063179667775514e-06, + "loss": 0.4836, + "step": 3688 + }, + { + "epoch": 1.744208037825059, + "grad_norm": 2.751638650894165, + "learning_rate": 4.062692780172536e-06, + "loss": 0.5558, + "step": 3689 + }, + { + "epoch": 1.7446808510638299, + "grad_norm": 3.3866634368896484, + "learning_rate": 4.062205795267531e-06, + "loss": 0.4825, + "step": 3690 + }, + { + "epoch": 1.7451536643026004, + "grad_norm": 3.0112249851226807, + "learning_rate": 4.061718713090822e-06, + "loss": 0.5732, + "step": 3691 + }, + { + "epoch": 1.745626477541371, + "grad_norm": 2.5889365673065186, + "learning_rate": 4.061231533672736e-06, + "loss": 0.483, + "step": 3692 + }, + { + "epoch": 1.7460992907801418, + "grad_norm": 2.624598979949951, + "learning_rate": 4.0607442570436085e-06, + "loss": 0.5706, + "step": 3693 + }, + { + "epoch": 1.7465721040189126, + "grad_norm": 2.9219250679016113, + "learning_rate": 4.060256883233779e-06, + "loss": 0.5153, + "step": 3694 + }, + { + "epoch": 1.7470449172576832, + "grad_norm": 3.2219252586364746, + "learning_rate": 4.059769412273595e-06, + "loss": 0.5184, + "step": 3695 + }, + { + "epoch": 1.7475177304964538, + "grad_norm": 2.890697956085205, + "learning_rate": 4.05928184419341e-06, + "loss": 0.5312, + "step": 3696 + }, + { + "epoch": 1.7479905437352246, + "grad_norm": 2.673809289932251, + "learning_rate": 4.0587941790235816e-06, + "loss": 0.4893, + "step": 3697 + }, + { + "epoch": 1.7484633569739954, + "grad_norm": 2.5339348316192627, + "learning_rate": 4.058306416794474e-06, + "loss": 0.5115, + "step": 3698 + }, + { + "epoch": 1.748936170212766, + "grad_norm": 2.6525840759277344, + "learning_rate": 4.05781855753646e-06, + "loss": 0.5256, + "step": 3699 + }, + { + "epoch": 1.7494089834515365, + "grad_norm": 2.7868754863739014, + "learning_rate": 4.057330601279914e-06, + "loss": 0.5227, + "step": 3700 + }, + { + "epoch": 1.7498817966903073, + "grad_norm": 3.1629884243011475, + "learning_rate": 4.056842548055221e-06, + "loss": 0.5617, + "step": 3701 + }, + { + "epoch": 1.750354609929078, + "grad_norm": 2.9350688457489014, + "learning_rate": 4.056354397892769e-06, + "loss": 0.4753, + "step": 3702 + }, + { + "epoch": 1.7508274231678487, + "grad_norm": 2.9688615798950195, + "learning_rate": 4.0558661508229525e-06, + "loss": 0.596, + "step": 3703 + }, + { + "epoch": 1.7513002364066192, + "grad_norm": 2.802205801010132, + "learning_rate": 4.055377806876174e-06, + "loss": 0.5793, + "step": 3704 + }, + { + "epoch": 1.75177304964539, + "grad_norm": 2.4933416843414307, + "learning_rate": 4.054889366082839e-06, + "loss": 0.4824, + "step": 3705 + }, + { + "epoch": 1.7522458628841608, + "grad_norm": 3.7904608249664307, + "learning_rate": 4.054400828473361e-06, + "loss": 0.5124, + "step": 3706 + }, + { + "epoch": 1.7527186761229314, + "grad_norm": 2.694838762283325, + "learning_rate": 4.053912194078159e-06, + 
"loss": 0.5604, + "step": 3707 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 2.3721256256103516, + "learning_rate": 4.053423462927659e-06, + "loss": 0.4978, + "step": 3708 + }, + { + "epoch": 1.7536643026004728, + "grad_norm": 2.718512773513794, + "learning_rate": 4.052934635052292e-06, + "loss": 0.5029, + "step": 3709 + }, + { + "epoch": 1.7541371158392436, + "grad_norm": 3.061558246612549, + "learning_rate": 4.052445710482493e-06, + "loss": 0.4886, + "step": 3710 + }, + { + "epoch": 1.7546099290780142, + "grad_norm": 3.0490729808807373, + "learning_rate": 4.051956689248709e-06, + "loss": 0.5363, + "step": 3711 + }, + { + "epoch": 1.7550827423167847, + "grad_norm": 2.611661672592163, + "learning_rate": 4.051467571381385e-06, + "loss": 0.5397, + "step": 3712 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 2.7829177379608154, + "learning_rate": 4.050978356910979e-06, + "loss": 0.4973, + "step": 3713 + }, + { + "epoch": 1.7560283687943263, + "grad_norm": 2.6228256225585938, + "learning_rate": 4.0504890458679525e-06, + "loss": 0.4551, + "step": 3714 + }, + { + "epoch": 1.756501182033097, + "grad_norm": 2.6801326274871826, + "learning_rate": 4.049999638282771e-06, + "loss": 0.5581, + "step": 3715 + }, + { + "epoch": 1.7569739952718675, + "grad_norm": 2.4476819038391113, + "learning_rate": 4.049510134185908e-06, + "loss": 0.5226, + "step": 3716 + }, + { + "epoch": 1.7574468085106383, + "grad_norm": 2.5661075115203857, + "learning_rate": 4.049020533607844e-06, + "loss": 0.5163, + "step": 3717 + }, + { + "epoch": 1.757919621749409, + "grad_norm": 2.3923349380493164, + "learning_rate": 4.048530836579065e-06, + "loss": 0.5076, + "step": 3718 + }, + { + "epoch": 1.7583924349881797, + "grad_norm": 2.8204405307769775, + "learning_rate": 4.0480410431300585e-06, + "loss": 0.5883, + "step": 3719 + }, + { + "epoch": 1.7588652482269502, + "grad_norm": 2.323107957839966, + "learning_rate": 4.047551153291325e-06, + "loss": 0.5116, + "step": 3720 + }, + { + "epoch": 1.759338061465721, + "grad_norm": 2.8306009769439697, + "learning_rate": 4.047061167093368e-06, + "loss": 0.5094, + "step": 3721 + }, + { + "epoch": 1.7598108747044918, + "grad_norm": 2.568765640258789, + "learning_rate": 4.046571084566695e-06, + "loss": 0.4725, + "step": 3722 + }, + { + "epoch": 1.7602836879432624, + "grad_norm": 2.7212061882019043, + "learning_rate": 4.046080905741822e-06, + "loss": 0.4741, + "step": 3723 + }, + { + "epoch": 1.760756501182033, + "grad_norm": 2.802917003631592, + "learning_rate": 4.04559063064927e-06, + "loss": 0.5691, + "step": 3724 + }, + { + "epoch": 1.7612293144208038, + "grad_norm": 3.1044139862060547, + "learning_rate": 4.0451002593195675e-06, + "loss": 0.5472, + "step": 3725 + }, + { + "epoch": 1.7617021276595746, + "grad_norm": 2.5855562686920166, + "learning_rate": 4.044609791783246e-06, + "loss": 0.4852, + "step": 3726 + }, + { + "epoch": 1.7621749408983451, + "grad_norm": 2.6235129833221436, + "learning_rate": 4.0441192280708465e-06, + "loss": 0.5269, + "step": 3727 + }, + { + "epoch": 1.7626477541371157, + "grad_norm": 3.535630464553833, + "learning_rate": 4.043628568212914e-06, + "loss": 0.5266, + "step": 3728 + }, + { + "epoch": 1.7631205673758865, + "grad_norm": 2.7783355712890625, + "learning_rate": 4.043137812239998e-06, + "loss": 0.5609, + "step": 3729 + }, + { + "epoch": 1.7635933806146573, + "grad_norm": 2.9344944953918457, + "learning_rate": 4.042646960182657e-06, + "loss": 0.5056, + "step": 3730 + }, + { + "epoch": 1.7640661938534279, + "grad_norm": 2.6205739974975586, + 
"learning_rate": 4.042156012071453e-06, + "loss": 0.4914, + "step": 3731 + }, + { + "epoch": 1.7645390070921985, + "grad_norm": 2.8004493713378906, + "learning_rate": 4.041664967936958e-06, + "loss": 0.4901, + "step": 3732 + }, + { + "epoch": 1.7650118203309693, + "grad_norm": 2.944589138031006, + "learning_rate": 4.041173827809745e-06, + "loss": 0.5572, + "step": 3733 + }, + { + "epoch": 1.76548463356974, + "grad_norm": 2.5021605491638184, + "learning_rate": 4.040682591720397e-06, + "loss": 0.4637, + "step": 3734 + }, + { + "epoch": 1.7659574468085106, + "grad_norm": 2.448030948638916, + "learning_rate": 4.040191259699497e-06, + "loss": 0.4785, + "step": 3735 + }, + { + "epoch": 1.7664302600472812, + "grad_norm": 2.7171032428741455, + "learning_rate": 4.039699831777643e-06, + "loss": 0.4919, + "step": 3736 + }, + { + "epoch": 1.766903073286052, + "grad_norm": 2.453118324279785, + "learning_rate": 4.03920830798543e-06, + "loss": 0.4326, + "step": 3737 + }, + { + "epoch": 1.7673758865248228, + "grad_norm": 3.112877368927002, + "learning_rate": 4.038716688353466e-06, + "loss": 0.5375, + "step": 3738 + }, + { + "epoch": 1.7678486997635934, + "grad_norm": 2.742239236831665, + "learning_rate": 4.038224972912361e-06, + "loss": 0.5267, + "step": 3739 + }, + { + "epoch": 1.768321513002364, + "grad_norm": 2.544785737991333, + "learning_rate": 4.037733161692731e-06, + "loss": 0.5032, + "step": 3740 + }, + { + "epoch": 1.7687943262411348, + "grad_norm": 2.4639062881469727, + "learning_rate": 4.037241254725201e-06, + "loss": 0.5532, + "step": 3741 + }, + { + "epoch": 1.7692671394799055, + "grad_norm": 2.866290330886841, + "learning_rate": 4.036749252040398e-06, + "loss": 0.5503, + "step": 3742 + }, + { + "epoch": 1.7697399527186761, + "grad_norm": 2.3466262817382812, + "learning_rate": 4.0362571536689575e-06, + "loss": 0.5286, + "step": 3743 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 2.246464967727661, + "learning_rate": 4.03576495964152e-06, + "loss": 0.4656, + "step": 3744 + }, + { + "epoch": 1.7706855791962175, + "grad_norm": 2.667558431625366, + "learning_rate": 4.035272669988733e-06, + "loss": 0.5205, + "step": 3745 + }, + { + "epoch": 1.7711583924349883, + "grad_norm": 2.974666118621826, + "learning_rate": 4.034780284741249e-06, + "loss": 0.6007, + "step": 3746 + }, + { + "epoch": 1.7716312056737589, + "grad_norm": 2.7164433002471924, + "learning_rate": 4.034287803929726e-06, + "loss": 0.4913, + "step": 3747 + }, + { + "epoch": 1.7721040189125294, + "grad_norm": 2.5923962593078613, + "learning_rate": 4.033795227584829e-06, + "loss": 0.5275, + "step": 3748 + }, + { + "epoch": 1.7725768321513002, + "grad_norm": 2.606027126312256, + "learning_rate": 4.033302555737229e-06, + "loss": 0.4869, + "step": 3749 + }, + { + "epoch": 1.773049645390071, + "grad_norm": 3.0110089778900146, + "learning_rate": 4.032809788417602e-06, + "loss": 0.4956, + "step": 3750 + }, + { + "epoch": 1.7735224586288416, + "grad_norm": 3.004598617553711, + "learning_rate": 4.032316925656632e-06, + "loss": 0.5159, + "step": 3751 + }, + { + "epoch": 1.7739952718676122, + "grad_norm": 2.731539249420166, + "learning_rate": 4.031823967485005e-06, + "loss": 0.5237, + "step": 3752 + }, + { + "epoch": 1.774468085106383, + "grad_norm": 2.7466373443603516, + "learning_rate": 4.0313309139334155e-06, + "loss": 0.4948, + "step": 3753 + }, + { + "epoch": 1.7749408983451538, + "grad_norm": 2.8596460819244385, + "learning_rate": 4.030837765032565e-06, + "loss": 0.5016, + "step": 3754 + }, + { + "epoch": 1.7754137115839244, + 
"grad_norm": 3.2886788845062256, + "learning_rate": 4.03034452081316e-06, + "loss": 0.5377, + "step": 3755 + }, + { + "epoch": 1.775886524822695, + "grad_norm": 2.5629258155822754, + "learning_rate": 4.029851181305912e-06, + "loss": 0.519, + "step": 3756 + }, + { + "epoch": 1.7763593380614657, + "grad_norm": 2.5988714694976807, + "learning_rate": 4.029357746541539e-06, + "loss": 0.5521, + "step": 3757 + }, + { + "epoch": 1.7768321513002365, + "grad_norm": 2.987884759902954, + "learning_rate": 4.028864216550765e-06, + "loss": 0.6225, + "step": 3758 + }, + { + "epoch": 1.777304964539007, + "grad_norm": 2.6875851154327393, + "learning_rate": 4.02837059136432e-06, + "loss": 0.5321, + "step": 3759 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.6414570808410645, + "learning_rate": 4.02787687101294e-06, + "loss": 0.4831, + "step": 3760 + }, + { + "epoch": 1.7782505910165485, + "grad_norm": 2.581475019454956, + "learning_rate": 4.027383055527368e-06, + "loss": 0.5204, + "step": 3761 + }, + { + "epoch": 1.7787234042553193, + "grad_norm": 2.811298131942749, + "learning_rate": 4.026889144938349e-06, + "loss": 0.5486, + "step": 3762 + }, + { + "epoch": 1.7791962174940898, + "grad_norm": 3.1589081287384033, + "learning_rate": 4.026395139276639e-06, + "loss": 0.4979, + "step": 3763 + }, + { + "epoch": 1.7796690307328604, + "grad_norm": 2.3773093223571777, + "learning_rate": 4.025901038572996e-06, + "loss": 0.503, + "step": 3764 + }, + { + "epoch": 1.7801418439716312, + "grad_norm": 2.962541341781616, + "learning_rate": 4.025406842858187e-06, + "loss": 0.4613, + "step": 3765 + }, + { + "epoch": 1.780614657210402, + "grad_norm": 2.603092908859253, + "learning_rate": 4.024912552162982e-06, + "loss": 0.5142, + "step": 3766 + }, + { + "epoch": 1.7810874704491726, + "grad_norm": 2.648927927017212, + "learning_rate": 4.024418166518159e-06, + "loss": 0.4491, + "step": 3767 + }, + { + "epoch": 1.7815602836879432, + "grad_norm": 3.3239917755126953, + "learning_rate": 4.023923685954502e-06, + "loss": 0.6272, + "step": 3768 + }, + { + "epoch": 1.782033096926714, + "grad_norm": 2.672821283340454, + "learning_rate": 4.023429110502798e-06, + "loss": 0.5171, + "step": 3769 + }, + { + "epoch": 1.7825059101654848, + "grad_norm": 2.364332437515259, + "learning_rate": 4.022934440193844e-06, + "loss": 0.4513, + "step": 3770 + }, + { + "epoch": 1.7829787234042553, + "grad_norm": 3.03108549118042, + "learning_rate": 4.022439675058441e-06, + "loss": 0.4324, + "step": 3771 + }, + { + "epoch": 1.783451536643026, + "grad_norm": 2.647557020187378, + "learning_rate": 4.021944815127393e-06, + "loss": 0.5162, + "step": 3772 + }, + { + "epoch": 1.7839243498817967, + "grad_norm": 2.4111907482147217, + "learning_rate": 4.021449860431517e-06, + "loss": 0.4712, + "step": 3773 + }, + { + "epoch": 1.7843971631205675, + "grad_norm": 2.796175718307495, + "learning_rate": 4.020954811001629e-06, + "loss": 0.5131, + "step": 3774 + }, + { + "epoch": 1.784869976359338, + "grad_norm": 2.4594924449920654, + "learning_rate": 4.020459666868553e-06, + "loss": 0.4739, + "step": 3775 + }, + { + "epoch": 1.7853427895981087, + "grad_norm": 2.5735671520233154, + "learning_rate": 4.0199644280631215e-06, + "loss": 0.4716, + "step": 3776 + }, + { + "epoch": 1.7858156028368795, + "grad_norm": 2.419990062713623, + "learning_rate": 4.01946909461617e-06, + "loss": 0.4866, + "step": 3777 + }, + { + "epoch": 1.7862884160756503, + "grad_norm": 2.5597951412200928, + "learning_rate": 4.01897366655854e-06, + "loss": 0.5569, + "step": 3778 + }, + { + "epoch": 
1.7867612293144208, + "grad_norm": 2.462383985519409, + "learning_rate": 4.018478143921081e-06, + "loss": 0.4588, + "step": 3779 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 2.536701202392578, + "learning_rate": 4.017982526734646e-06, + "loss": 0.5278, + "step": 3780 + }, + { + "epoch": 1.7877068557919622, + "grad_norm": 2.691077470779419, + "learning_rate": 4.017486815030095e-06, + "loss": 0.4815, + "step": 3781 + }, + { + "epoch": 1.788179669030733, + "grad_norm": 2.4277288913726807, + "learning_rate": 4.016991008838294e-06, + "loss": 0.4877, + "step": 3782 + }, + { + "epoch": 1.7886524822695036, + "grad_norm": 2.6740009784698486, + "learning_rate": 4.016495108190115e-06, + "loss": 0.572, + "step": 3783 + }, + { + "epoch": 1.7891252955082741, + "grad_norm": 3.179232120513916, + "learning_rate": 4.0159991131164355e-06, + "loss": 0.4821, + "step": 3784 + }, + { + "epoch": 1.789598108747045, + "grad_norm": 3.2747793197631836, + "learning_rate": 4.015503023648138e-06, + "loss": 0.5517, + "step": 3785 + }, + { + "epoch": 1.7900709219858157, + "grad_norm": 2.671367645263672, + "learning_rate": 4.015006839816113e-06, + "loss": 0.5158, + "step": 3786 + }, + { + "epoch": 1.7905437352245863, + "grad_norm": 2.6600193977355957, + "learning_rate": 4.014510561651256e-06, + "loss": 0.535, + "step": 3787 + }, + { + "epoch": 1.791016548463357, + "grad_norm": 2.481509208679199, + "learning_rate": 4.014014189184466e-06, + "loss": 0.5596, + "step": 3788 + }, + { + "epoch": 1.7914893617021277, + "grad_norm": 2.759816884994507, + "learning_rate": 4.013517722446652e-06, + "loss": 0.5201, + "step": 3789 + }, + { + "epoch": 1.7919621749408985, + "grad_norm": 2.6913561820983887, + "learning_rate": 4.013021161468724e-06, + "loss": 0.5758, + "step": 3790 + }, + { + "epoch": 1.792434988179669, + "grad_norm": 2.775087594985962, + "learning_rate": 4.0125245062816044e-06, + "loss": 0.499, + "step": 3791 + }, + { + "epoch": 1.7929078014184396, + "grad_norm": 2.6134777069091797, + "learning_rate": 4.012027756916216e-06, + "loss": 0.5659, + "step": 3792 + }, + { + "epoch": 1.7933806146572104, + "grad_norm": 2.7109756469726562, + "learning_rate": 4.0115309134034895e-06, + "loss": 0.5337, + "step": 3793 + }, + { + "epoch": 1.7938534278959812, + "grad_norm": 2.5389950275421143, + "learning_rate": 4.0110339757743595e-06, + "loss": 0.4501, + "step": 3794 + }, + { + "epoch": 1.7943262411347518, + "grad_norm": 2.634648561477661, + "learning_rate": 4.010536944059771e-06, + "loss": 0.4411, + "step": 3795 + }, + { + "epoch": 1.7947990543735224, + "grad_norm": 2.527070999145508, + "learning_rate": 4.0100398182906695e-06, + "loss": 0.5145, + "step": 3796 + }, + { + "epoch": 1.7952718676122932, + "grad_norm": 2.62988543510437, + "learning_rate": 4.0095425984980105e-06, + "loss": 0.4981, + "step": 3797 + }, + { + "epoch": 1.795744680851064, + "grad_norm": 2.6032519340515137, + "learning_rate": 4.009045284712752e-06, + "loss": 0.453, + "step": 3798 + }, + { + "epoch": 1.7962174940898346, + "grad_norm": 2.735173463821411, + "learning_rate": 4.008547876965863e-06, + "loss": 0.5925, + "step": 3799 + }, + { + "epoch": 1.7966903073286051, + "grad_norm": 2.6296730041503906, + "learning_rate": 4.00805037528831e-06, + "loss": 0.5651, + "step": 3800 + }, + { + "epoch": 1.797163120567376, + "grad_norm": 2.641214370727539, + "learning_rate": 4.0075527797110735e-06, + "loss": 0.4973, + "step": 3801 + }, + { + "epoch": 1.7976359338061467, + "grad_norm": 2.6104819774627686, + "learning_rate": 4.007055090265136e-06, + "loss": 0.4432, + 
"step": 3802 + }, + { + "epoch": 1.7981087470449173, + "grad_norm": 2.8200619220733643, + "learning_rate": 4.0065573069814865e-06, + "loss": 0.4899, + "step": 3803 + }, + { + "epoch": 1.7985815602836879, + "grad_norm": 2.982354164123535, + "learning_rate": 4.006059429891119e-06, + "loss": 0.5488, + "step": 3804 + }, + { + "epoch": 1.7990543735224587, + "grad_norm": 2.7561678886413574, + "learning_rate": 4.005561459025034e-06, + "loss": 0.5637, + "step": 3805 + }, + { + "epoch": 1.7995271867612295, + "grad_norm": 2.702212333679199, + "learning_rate": 4.005063394414241e-06, + "loss": 0.4804, + "step": 3806 + }, + { + "epoch": 1.8, + "grad_norm": 2.8655319213867188, + "learning_rate": 4.004565236089748e-06, + "loss": 0.5759, + "step": 3807 + }, + { + "epoch": 1.8004728132387706, + "grad_norm": 2.703676223754883, + "learning_rate": 4.0040669840825756e-06, + "loss": 0.4728, + "step": 3808 + }, + { + "epoch": 1.8009456264775414, + "grad_norm": 2.802645683288574, + "learning_rate": 4.003568638423747e-06, + "loss": 0.5421, + "step": 3809 + }, + { + "epoch": 1.8014184397163122, + "grad_norm": 2.4723124504089355, + "learning_rate": 4.003070199144292e-06, + "loss": 0.4944, + "step": 3810 + }, + { + "epoch": 1.8018912529550828, + "grad_norm": 2.4889068603515625, + "learning_rate": 4.0025716662752475e-06, + "loss": 0.4774, + "step": 3811 + }, + { + "epoch": 1.8023640661938534, + "grad_norm": 2.5408077239990234, + "learning_rate": 4.002073039847653e-06, + "loss": 0.5233, + "step": 3812 + }, + { + "epoch": 1.8028368794326242, + "grad_norm": 2.734602689743042, + "learning_rate": 4.001574319892557e-06, + "loss": 0.5403, + "step": 3813 + }, + { + "epoch": 1.803309692671395, + "grad_norm": 3.3786163330078125, + "learning_rate": 4.001075506441012e-06, + "loss": 0.6969, + "step": 3814 + }, + { + "epoch": 1.8037825059101655, + "grad_norm": 2.7375378608703613, + "learning_rate": 4.000576599524078e-06, + "loss": 0.4907, + "step": 3815 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 3.041804075241089, + "learning_rate": 4.000077599172818e-06, + "loss": 0.6021, + "step": 3816 + }, + { + "epoch": 1.804728132387707, + "grad_norm": 2.697599411010742, + "learning_rate": 3.999578505418305e-06, + "loss": 0.4743, + "step": 3817 + }, + { + "epoch": 1.8052009456264777, + "grad_norm": 2.276921272277832, + "learning_rate": 3.999079318291612e-06, + "loss": 0.4885, + "step": 3818 + }, + { + "epoch": 1.8056737588652483, + "grad_norm": 2.4896953105926514, + "learning_rate": 3.998580037823825e-06, + "loss": 0.503, + "step": 3819 + }, + { + "epoch": 1.8061465721040189, + "grad_norm": 2.6232175827026367, + "learning_rate": 3.998080664046029e-06, + "loss": 0.5058, + "step": 3820 + }, + { + "epoch": 1.8066193853427897, + "grad_norm": 2.695861339569092, + "learning_rate": 3.997581196989319e-06, + "loss": 0.4949, + "step": 3821 + }, + { + "epoch": 1.8070921985815604, + "grad_norm": 2.912886142730713, + "learning_rate": 3.997081636684795e-06, + "loss": 0.4971, + "step": 3822 + }, + { + "epoch": 1.807565011820331, + "grad_norm": 2.876500368118286, + "learning_rate": 3.996581983163561e-06, + "loss": 0.5584, + "step": 3823 + }, + { + "epoch": 1.8080378250591016, + "grad_norm": 2.857069730758667, + "learning_rate": 3.99608223645673e-06, + "loss": 0.5457, + "step": 3824 + }, + { + "epoch": 1.8085106382978724, + "grad_norm": 2.486743211746216, + "learning_rate": 3.995582396595419e-06, + "loss": 0.5291, + "step": 3825 + }, + { + "epoch": 1.808983451536643, + "grad_norm": 2.509441375732422, + "learning_rate": 3.9950824636107486e-06, + 
"loss": 0.4747, + "step": 3826 + }, + { + "epoch": 1.8094562647754135, + "grad_norm": 2.931394100189209, + "learning_rate": 3.99458243753385e-06, + "loss": 0.5116, + "step": 3827 + }, + { + "epoch": 1.8099290780141843, + "grad_norm": 2.4868650436401367, + "learning_rate": 3.994082318395856e-06, + "loss": 0.4671, + "step": 3828 + }, + { + "epoch": 1.8104018912529551, + "grad_norm": 2.5554752349853516, + "learning_rate": 3.993582106227907e-06, + "loss": 0.4969, + "step": 3829 + }, + { + "epoch": 1.8108747044917257, + "grad_norm": 2.8367133140563965, + "learning_rate": 3.99308180106115e-06, + "loss": 0.5507, + "step": 3830 + }, + { + "epoch": 1.8113475177304963, + "grad_norm": 2.68245792388916, + "learning_rate": 3.992581402926737e-06, + "loss": 0.5115, + "step": 3831 + }, + { + "epoch": 1.811820330969267, + "grad_norm": 2.406674385070801, + "learning_rate": 3.992080911855824e-06, + "loss": 0.545, + "step": 3832 + }, + { + "epoch": 1.8122931442080379, + "grad_norm": 2.5003464221954346, + "learning_rate": 3.991580327879575e-06, + "loss": 0.4331, + "step": 3833 + }, + { + "epoch": 1.8127659574468085, + "grad_norm": 2.49320912361145, + "learning_rate": 3.99107965102916e-06, + "loss": 0.5118, + "step": 3834 + }, + { + "epoch": 1.813238770685579, + "grad_norm": 2.6183295249938965, + "learning_rate": 3.990578881335752e-06, + "loss": 0.5286, + "step": 3835 + }, + { + "epoch": 1.8137115839243498, + "grad_norm": 3.1999518871307373, + "learning_rate": 3.990078018830534e-06, + "loss": 0.5048, + "step": 3836 + }, + { + "epoch": 1.8141843971631206, + "grad_norm": 2.4351117610931396, + "learning_rate": 3.9895770635446915e-06, + "loss": 0.514, + "step": 3837 + }, + { + "epoch": 1.8146572104018912, + "grad_norm": 2.6859259605407715, + "learning_rate": 3.989076015509416e-06, + "loss": 0.5575, + "step": 3838 + }, + { + "epoch": 1.8151300236406618, + "grad_norm": 2.790421962738037, + "learning_rate": 3.988574874755909e-06, + "loss": 0.5467, + "step": 3839 + }, + { + "epoch": 1.8156028368794326, + "grad_norm": 2.5202765464782715, + "learning_rate": 3.988073641315369e-06, + "loss": 0.5229, + "step": 3840 + }, + { + "epoch": 1.8160756501182034, + "grad_norm": 2.623652219772339, + "learning_rate": 3.987572315219009e-06, + "loss": 0.509, + "step": 3841 + }, + { + "epoch": 1.816548463356974, + "grad_norm": 2.6038360595703125, + "learning_rate": 3.987070896498044e-06, + "loss": 0.5304, + "step": 3842 + }, + { + "epoch": 1.8170212765957445, + "grad_norm": 2.9378011226654053, + "learning_rate": 3.9865693851836955e-06, + "loss": 0.5845, + "step": 3843 + }, + { + "epoch": 1.8174940898345153, + "grad_norm": 2.4061124324798584, + "learning_rate": 3.98606778130719e-06, + "loss": 0.4333, + "step": 3844 + }, + { + "epoch": 1.8179669030732861, + "grad_norm": 2.483489751815796, + "learning_rate": 3.985566084899759e-06, + "loss": 0.4827, + "step": 3845 + }, + { + "epoch": 1.8184397163120567, + "grad_norm": 2.7774932384490967, + "learning_rate": 3.985064295992642e-06, + "loss": 0.5016, + "step": 3846 + }, + { + "epoch": 1.8189125295508273, + "grad_norm": 2.5936765670776367, + "learning_rate": 3.984562414617083e-06, + "loss": 0.4448, + "step": 3847 + }, + { + "epoch": 1.819385342789598, + "grad_norm": 2.8608627319335938, + "learning_rate": 3.9840604408043325e-06, + "loss": 0.5735, + "step": 3848 + }, + { + "epoch": 1.8198581560283689, + "grad_norm": 2.6212472915649414, + "learning_rate": 3.983558374585646e-06, + "loss": 0.5091, + "step": 3849 + }, + { + "epoch": 1.8203309692671394, + "grad_norm": 2.832460641860962, + 
"learning_rate": 3.983056215992284e-06, + "loss": 0.5169, + "step": 3850 + }, + { + "epoch": 1.82080378250591, + "grad_norm": 2.5293610095977783, + "learning_rate": 3.982553965055514e-06, + "loss": 0.4708, + "step": 3851 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 2.9362871646881104, + "learning_rate": 3.982051621806611e-06, + "loss": 0.575, + "step": 3852 + }, + { + "epoch": 1.8217494089834516, + "grad_norm": 2.69073486328125, + "learning_rate": 3.98154918627685e-06, + "loss": 0.5278, + "step": 3853 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 2.6711034774780273, + "learning_rate": 3.98104665849752e-06, + "loss": 0.4918, + "step": 3854 + }, + { + "epoch": 1.8226950354609928, + "grad_norm": 2.571110963821411, + "learning_rate": 3.980544038499907e-06, + "loss": 0.5234, + "step": 3855 + }, + { + "epoch": 1.8231678486997636, + "grad_norm": 3.2603371143341064, + "learning_rate": 3.980041326315309e-06, + "loss": 0.5996, + "step": 3856 + }, + { + "epoch": 1.8236406619385344, + "grad_norm": 2.8472323417663574, + "learning_rate": 3.979538521975028e-06, + "loss": 0.4769, + "step": 3857 + }, + { + "epoch": 1.824113475177305, + "grad_norm": 2.6714751720428467, + "learning_rate": 3.979035625510371e-06, + "loss": 0.4826, + "step": 3858 + }, + { + "epoch": 1.8245862884160755, + "grad_norm": 2.6816468238830566, + "learning_rate": 3.97853263695265e-06, + "loss": 0.5127, + "step": 3859 + }, + { + "epoch": 1.8250591016548463, + "grad_norm": 2.6464123725891113, + "learning_rate": 3.978029556333185e-06, + "loss": 0.4925, + "step": 3860 + }, + { + "epoch": 1.825531914893617, + "grad_norm": 2.5317227840423584, + "learning_rate": 3.977526383683301e-06, + "loss": 0.4765, + "step": 3861 + }, + { + "epoch": 1.8260047281323877, + "grad_norm": 2.5052425861358643, + "learning_rate": 3.977023119034328e-06, + "loss": 0.4804, + "step": 3862 + }, + { + "epoch": 1.8264775413711583, + "grad_norm": 2.7022836208343506, + "learning_rate": 3.976519762417602e-06, + "loss": 0.4824, + "step": 3863 + }, + { + "epoch": 1.826950354609929, + "grad_norm": 2.7445900440216064, + "learning_rate": 3.976016313864464e-06, + "loss": 0.5698, + "step": 3864 + }, + { + "epoch": 1.8274231678486998, + "grad_norm": 2.442518711090088, + "learning_rate": 3.975512773406262e-06, + "loss": 0.5133, + "step": 3865 + }, + { + "epoch": 1.8278959810874704, + "grad_norm": 2.4100050926208496, + "learning_rate": 3.975009141074351e-06, + "loss": 0.5044, + "step": 3866 + }, + { + "epoch": 1.828368794326241, + "grad_norm": 2.9507648944854736, + "learning_rate": 3.974505416900088e-06, + "loss": 0.5367, + "step": 3867 + }, + { + "epoch": 1.8288416075650118, + "grad_norm": 2.5662600994110107, + "learning_rate": 3.974001600914837e-06, + "loss": 0.5878, + "step": 3868 + }, + { + "epoch": 1.8293144208037826, + "grad_norm": 2.4306657314300537, + "learning_rate": 3.973497693149971e-06, + "loss": 0.4647, + "step": 3869 + }, + { + "epoch": 1.8297872340425532, + "grad_norm": 2.974686622619629, + "learning_rate": 3.972993693636864e-06, + "loss": 0.4911, + "step": 3870 + }, + { + "epoch": 1.8302600472813237, + "grad_norm": 2.5711987018585205, + "learning_rate": 3.972489602406899e-06, + "loss": 0.5089, + "step": 3871 + }, + { + "epoch": 1.8307328605200945, + "grad_norm": 3.259617328643799, + "learning_rate": 3.971985419491463e-06, + "loss": 0.5966, + "step": 3872 + }, + { + "epoch": 1.8312056737588653, + "grad_norm": 2.7437000274658203, + "learning_rate": 3.971481144921949e-06, + "loss": 0.5097, + "step": 3873 + }, + { + "epoch": 1.831678486997636, + 
"grad_norm": 2.9597461223602295, + "learning_rate": 3.970976778729757e-06, + "loss": 0.5672, + "step": 3874 + }, + { + "epoch": 1.8321513002364065, + "grad_norm": 2.5775723457336426, + "learning_rate": 3.970472320946291e-06, + "loss": 0.4749, + "step": 3875 + }, + { + "epoch": 1.8326241134751773, + "grad_norm": 2.7381200790405273, + "learning_rate": 3.969967771602961e-06, + "loss": 0.5255, + "step": 3876 + }, + { + "epoch": 1.833096926713948, + "grad_norm": 2.651698350906372, + "learning_rate": 3.969463130731183e-06, + "loss": 0.5098, + "step": 3877 + }, + { + "epoch": 1.8335697399527187, + "grad_norm": 2.7277021408081055, + "learning_rate": 3.968958398362381e-06, + "loss": 0.5251, + "step": 3878 + }, + { + "epoch": 1.8340425531914892, + "grad_norm": 2.5184953212738037, + "learning_rate": 3.968453574527978e-06, + "loss": 0.5086, + "step": 3879 + }, + { + "epoch": 1.83451536643026, + "grad_norm": 2.8227882385253906, + "learning_rate": 3.967948659259412e-06, + "loss": 0.5742, + "step": 3880 + }, + { + "epoch": 1.8349881796690308, + "grad_norm": 2.547922134399414, + "learning_rate": 3.967443652588119e-06, + "loss": 0.5411, + "step": 3881 + }, + { + "epoch": 1.8354609929078014, + "grad_norm": 2.6572835445404053, + "learning_rate": 3.966938554545545e-06, + "loss": 0.4854, + "step": 3882 + }, + { + "epoch": 1.835933806146572, + "grad_norm": 2.9416658878326416, + "learning_rate": 3.966433365163139e-06, + "loss": 0.5236, + "step": 3883 + }, + { + "epoch": 1.8364066193853428, + "grad_norm": 2.344325304031372, + "learning_rate": 3.965928084472357e-06, + "loss": 0.4916, + "step": 3884 + }, + { + "epoch": 1.8368794326241136, + "grad_norm": 2.890418291091919, + "learning_rate": 3.965422712504662e-06, + "loss": 0.5287, + "step": 3885 + }, + { + "epoch": 1.8373522458628841, + "grad_norm": 2.6063363552093506, + "learning_rate": 3.96491724929152e-06, + "loss": 0.4842, + "step": 3886 + }, + { + "epoch": 1.8378250591016547, + "grad_norm": 2.5582427978515625, + "learning_rate": 3.964411694864404e-06, + "loss": 0.4768, + "step": 3887 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 2.84356951713562, + "learning_rate": 3.963906049254793e-06, + "loss": 0.5284, + "step": 3888 + }, + { + "epoch": 1.8387706855791963, + "grad_norm": 2.7048516273498535, + "learning_rate": 3.963400312494172e-06, + "loss": 0.5271, + "step": 3889 + }, + { + "epoch": 1.839243498817967, + "grad_norm": 2.5401699542999268, + "learning_rate": 3.962894484614031e-06, + "loss": 0.4734, + "step": 3890 + }, + { + "epoch": 1.8397163120567375, + "grad_norm": 2.208256244659424, + "learning_rate": 3.962388565645864e-06, + "loss": 0.4113, + "step": 3891 + }, + { + "epoch": 1.8401891252955083, + "grad_norm": 2.775139331817627, + "learning_rate": 3.961882555621173e-06, + "loss": 0.5172, + "step": 3892 + }, + { + "epoch": 1.840661938534279, + "grad_norm": 2.7540855407714844, + "learning_rate": 3.961376454571466e-06, + "loss": 0.5252, + "step": 3893 + }, + { + "epoch": 1.8411347517730496, + "grad_norm": 2.6731574535369873, + "learning_rate": 3.960870262528255e-06, + "loss": 0.4495, + "step": 3894 + }, + { + "epoch": 1.8416075650118202, + "grad_norm": 2.791492223739624, + "learning_rate": 3.960363979523058e-06, + "loss": 0.5457, + "step": 3895 + }, + { + "epoch": 1.842080378250591, + "grad_norm": 2.9280290603637695, + "learning_rate": 3.959857605587401e-06, + "loss": 0.5373, + "step": 3896 + }, + { + "epoch": 1.8425531914893618, + "grad_norm": 2.5652217864990234, + "learning_rate": 3.95935114075281e-06, + "loss": 0.5191, + "step": 3897 + }, + { + 
"epoch": 1.8430260047281324, + "grad_norm": 2.7297749519348145, + "learning_rate": 3.958844585050824e-06, + "loss": 0.5366, + "step": 3898 + }, + { + "epoch": 1.843498817966903, + "grad_norm": 2.5302982330322266, + "learning_rate": 3.958337938512983e-06, + "loss": 0.569, + "step": 3899 + }, + { + "epoch": 1.8439716312056738, + "grad_norm": 2.644777297973633, + "learning_rate": 3.957831201170832e-06, + "loss": 0.521, + "step": 3900 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 2.8375515937805176, + "learning_rate": 3.957324373055925e-06, + "loss": 0.573, + "step": 3901 + }, + { + "epoch": 1.8449172576832151, + "grad_norm": 2.512296676635742, + "learning_rate": 3.956817454199819e-06, + "loss": 0.5081, + "step": 3902 + }, + { + "epoch": 1.8453900709219857, + "grad_norm": 2.3662109375, + "learning_rate": 3.956310444634079e-06, + "loss": 0.4989, + "step": 3903 + }, + { + "epoch": 1.8458628841607565, + "grad_norm": 2.6849682331085205, + "learning_rate": 3.955803344390272e-06, + "loss": 0.5459, + "step": 3904 + }, + { + "epoch": 1.8463356973995273, + "grad_norm": 2.8364317417144775, + "learning_rate": 3.9552961534999756e-06, + "loss": 0.5704, + "step": 3905 + }, + { + "epoch": 1.8468085106382979, + "grad_norm": 2.6006948947906494, + "learning_rate": 3.954788871994768e-06, + "loss": 0.5696, + "step": 3906 + }, + { + "epoch": 1.8472813238770684, + "grad_norm": 2.558300018310547, + "learning_rate": 3.9542814999062375e-06, + "loss": 0.5047, + "step": 3907 + }, + { + "epoch": 1.8477541371158392, + "grad_norm": 2.6343321800231934, + "learning_rate": 3.953774037265974e-06, + "loss": 0.525, + "step": 3908 + }, + { + "epoch": 1.84822695035461, + "grad_norm": 2.5050008296966553, + "learning_rate": 3.953266484105576e-06, + "loss": 0.4867, + "step": 3909 + }, + { + "epoch": 1.8486997635933806, + "grad_norm": 2.3775103092193604, + "learning_rate": 3.952758840456647e-06, + "loss": 0.4349, + "step": 3910 + }, + { + "epoch": 1.8491725768321512, + "grad_norm": 2.508376359939575, + "learning_rate": 3.952251106350794e-06, + "loss": 0.539, + "step": 3911 + }, + { + "epoch": 1.849645390070922, + "grad_norm": 2.7403106689453125, + "learning_rate": 3.951743281819633e-06, + "loss": 0.4478, + "step": 3912 + }, + { + "epoch": 1.8501182033096928, + "grad_norm": 2.5332062244415283, + "learning_rate": 3.951235366894784e-06, + "loss": 0.4658, + "step": 3913 + }, + { + "epoch": 1.8505910165484634, + "grad_norm": 3.0137248039245605, + "learning_rate": 3.950727361607872e-06, + "loss": 0.5047, + "step": 3914 + }, + { + "epoch": 1.851063829787234, + "grad_norm": 2.5820653438568115, + "learning_rate": 3.950219265990528e-06, + "loss": 0.542, + "step": 3915 + }, + { + "epoch": 1.8515366430260047, + "grad_norm": 2.555133819580078, + "learning_rate": 3.949711080074389e-06, + "loss": 0.5253, + "step": 3916 + }, + { + "epoch": 1.8520094562647755, + "grad_norm": 2.876882791519165, + "learning_rate": 3.949202803891099e-06, + "loss": 0.5242, + "step": 3917 + }, + { + "epoch": 1.852482269503546, + "grad_norm": 2.5929203033447266, + "learning_rate": 3.948694437472305e-06, + "loss": 0.5358, + "step": 3918 + }, + { + "epoch": 1.8529550827423167, + "grad_norm": 2.468513250350952, + "learning_rate": 3.948185980849659e-06, + "loss": 0.5119, + "step": 3919 + }, + { + "epoch": 1.8534278959810875, + "grad_norm": 2.9259560108184814, + "learning_rate": 3.947677434054824e-06, + "loss": 0.4756, + "step": 3920 + }, + { + "epoch": 1.8539007092198583, + "grad_norm": 2.5247011184692383, + "learning_rate": 3.947168797119462e-06, + "loss": 0.4627, + 
"step": 3921 + }, + { + "epoch": 1.8543735224586289, + "grad_norm": 2.7396671772003174, + "learning_rate": 3.946660070075245e-06, + "loss": 0.5013, + "step": 3922 + }, + { + "epoch": 1.8548463356973994, + "grad_norm": 2.7059738636016846, + "learning_rate": 3.946151252953849e-06, + "loss": 0.5875, + "step": 3923 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 2.5638437271118164, + "learning_rate": 3.945642345786955e-06, + "loss": 0.5063, + "step": 3924 + }, + { + "epoch": 1.855791962174941, + "grad_norm": 2.6647839546203613, + "learning_rate": 3.945133348606251e-06, + "loss": 0.5421, + "step": 3925 + }, + { + "epoch": 1.8562647754137116, + "grad_norm": 3.7235286235809326, + "learning_rate": 3.944624261443431e-06, + "loss": 0.5958, + "step": 3926 + }, + { + "epoch": 1.8567375886524822, + "grad_norm": 2.769984245300293, + "learning_rate": 3.944115084330192e-06, + "loss": 0.5678, + "step": 3927 + }, + { + "epoch": 1.857210401891253, + "grad_norm": 2.567249059677124, + "learning_rate": 3.9436058172982395e-06, + "loss": 0.4767, + "step": 3928 + }, + { + "epoch": 1.8576832151300238, + "grad_norm": 2.6196048259735107, + "learning_rate": 3.943096460379283e-06, + "loss": 0.5345, + "step": 3929 + }, + { + "epoch": 1.8581560283687943, + "grad_norm": 2.5999555587768555, + "learning_rate": 3.942587013605037e-06, + "loss": 0.5482, + "step": 3930 + }, + { + "epoch": 1.858628841607565, + "grad_norm": 2.630387783050537, + "learning_rate": 3.942077477007224e-06, + "loss": 0.6023, + "step": 3931 + }, + { + "epoch": 1.8591016548463357, + "grad_norm": 2.543503761291504, + "learning_rate": 3.941567850617569e-06, + "loss": 0.5157, + "step": 3932 + }, + { + "epoch": 1.8595744680851065, + "grad_norm": 2.5109236240386963, + "learning_rate": 3.941058134467805e-06, + "loss": 0.4774, + "step": 3933 + }, + { + "epoch": 1.860047281323877, + "grad_norm": 2.5110230445861816, + "learning_rate": 3.94054832858967e-06, + "loss": 0.5064, + "step": 3934 + }, + { + "epoch": 1.8605200945626477, + "grad_norm": 2.4780776500701904, + "learning_rate": 3.940038433014908e-06, + "loss": 0.5216, + "step": 3935 + }, + { + "epoch": 1.8609929078014185, + "grad_norm": 2.4398856163024902, + "learning_rate": 3.939528447775266e-06, + "loss": 0.4958, + "step": 3936 + }, + { + "epoch": 1.8614657210401893, + "grad_norm": 2.449498176574707, + "learning_rate": 3.9390183729025e-06, + "loss": 0.5165, + "step": 3937 + }, + { + "epoch": 1.8619385342789598, + "grad_norm": 2.982544422149658, + "learning_rate": 3.938508208428371e-06, + "loss": 0.4803, + "step": 3938 + }, + { + "epoch": 1.8624113475177304, + "grad_norm": 2.6574015617370605, + "learning_rate": 3.937997954384641e-06, + "loss": 0.4797, + "step": 3939 + }, + { + "epoch": 1.8628841607565012, + "grad_norm": 2.7773542404174805, + "learning_rate": 3.937487610803086e-06, + "loss": 0.4843, + "step": 3940 + }, + { + "epoch": 1.863356973995272, + "grad_norm": 2.588937759399414, + "learning_rate": 3.9369771777154805e-06, + "loss": 0.5426, + "step": 3941 + }, + { + "epoch": 1.8638297872340426, + "grad_norm": 2.855442523956299, + "learning_rate": 3.936466655153607e-06, + "loss": 0.5443, + "step": 3942 + }, + { + "epoch": 1.8643026004728132, + "grad_norm": 2.554676055908203, + "learning_rate": 3.935956043149253e-06, + "loss": 0.5334, + "step": 3943 + }, + { + "epoch": 1.864775413711584, + "grad_norm": 2.901599884033203, + "learning_rate": 3.935445341734212e-06, + "loss": 0.5842, + "step": 3944 + }, + { + "epoch": 1.8652482269503547, + "grad_norm": 2.554485321044922, + "learning_rate": 
3.934934550940285e-06, + "loss": 0.4941, + "step": 3945 + }, + { + "epoch": 1.8657210401891253, + "grad_norm": 2.357203245162964, + "learning_rate": 3.934423670799275e-06, + "loss": 0.4402, + "step": 3946 + }, + { + "epoch": 1.866193853427896, + "grad_norm": 2.7036049365997314, + "learning_rate": 3.933912701342993e-06, + "loss": 0.4966, + "step": 3947 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 2.7817211151123047, + "learning_rate": 3.933401642603255e-06, + "loss": 0.4908, + "step": 3948 + }, + { + "epoch": 1.8671394799054375, + "grad_norm": 2.439490795135498, + "learning_rate": 3.932890494611882e-06, + "loss": 0.4322, + "step": 3949 + }, + { + "epoch": 1.867612293144208, + "grad_norm": 3.187152147293091, + "learning_rate": 3.9323792574007e-06, + "loss": 0.501, + "step": 3950 + }, + { + "epoch": 1.8680851063829786, + "grad_norm": 2.405773401260376, + "learning_rate": 3.931867931001543e-06, + "loss": 0.4477, + "step": 3951 + }, + { + "epoch": 1.8685579196217494, + "grad_norm": 2.4922525882720947, + "learning_rate": 3.931356515446248e-06, + "loss": 0.5098, + "step": 3952 + }, + { + "epoch": 1.8690307328605202, + "grad_norm": 2.7781267166137695, + "learning_rate": 3.93084501076666e-06, + "loss": 0.5815, + "step": 3953 + }, + { + "epoch": 1.8695035460992908, + "grad_norm": 2.74621844291687, + "learning_rate": 3.930333416994626e-06, + "loss": 0.5605, + "step": 3954 + }, + { + "epoch": 1.8699763593380614, + "grad_norm": 2.5527689456939697, + "learning_rate": 3.929821734162004e-06, + "loss": 0.5141, + "step": 3955 + }, + { + "epoch": 1.8704491725768322, + "grad_norm": 2.5730628967285156, + "learning_rate": 3.92930996230065e-06, + "loss": 0.5446, + "step": 3956 + }, + { + "epoch": 1.870921985815603, + "grad_norm": 2.7053353786468506, + "learning_rate": 3.9287981014424334e-06, + "loss": 0.4722, + "step": 3957 + }, + { + "epoch": 1.8713947990543736, + "grad_norm": 2.7591893672943115, + "learning_rate": 3.928286151619224e-06, + "loss": 0.509, + "step": 3958 + }, + { + "epoch": 1.8718676122931441, + "grad_norm": 2.6233739852905273, + "learning_rate": 3.927774112862898e-06, + "loss": 0.5266, + "step": 3959 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 2.7715370655059814, + "learning_rate": 3.9272619852053396e-06, + "loss": 0.5612, + "step": 3960 + }, + { + "epoch": 1.8728132387706857, + "grad_norm": 2.4815211296081543, + "learning_rate": 3.926749768678435e-06, + "loss": 0.5498, + "step": 3961 + }, + { + "epoch": 1.8732860520094563, + "grad_norm": 2.6819605827331543, + "learning_rate": 3.926237463314078e-06, + "loss": 0.5499, + "step": 3962 + }, + { + "epoch": 1.8737588652482269, + "grad_norm": 2.638664722442627, + "learning_rate": 3.925725069144168e-06, + "loss": 0.5429, + "step": 3963 + }, + { + "epoch": 1.8742316784869977, + "grad_norm": 2.527294874191284, + "learning_rate": 3.925212586200611e-06, + "loss": 0.5451, + "step": 3964 + }, + { + "epoch": 1.8747044917257685, + "grad_norm": 2.831638813018799, + "learning_rate": 3.924700014515315e-06, + "loss": 0.5276, + "step": 3965 + }, + { + "epoch": 1.875177304964539, + "grad_norm": 2.5906996726989746, + "learning_rate": 3.924187354120196e-06, + "loss": 0.5323, + "step": 3966 + }, + { + "epoch": 1.8756501182033096, + "grad_norm": 2.5482442378997803, + "learning_rate": 3.923674605047175e-06, + "loss": 0.4882, + "step": 3967 + }, + { + "epoch": 1.8761229314420804, + "grad_norm": 2.56402850151062, + "learning_rate": 3.923161767328179e-06, + "loss": 0.5111, + "step": 3968 + }, + { + "epoch": 1.8765957446808512, + "grad_norm": 
3.223782539367676, + "learning_rate": 3.9226488409951405e-06, + "loss": 0.5829, + "step": 3969 + }, + { + "epoch": 1.8770685579196218, + "grad_norm": 2.665964365005493, + "learning_rate": 3.922135826079997e-06, + "loss": 0.4739, + "step": 3970 + }, + { + "epoch": 1.8775413711583924, + "grad_norm": 2.602696418762207, + "learning_rate": 3.921622722614691e-06, + "loss": 0.5199, + "step": 3971 + }, + { + "epoch": 1.8780141843971632, + "grad_norm": 2.5384418964385986, + "learning_rate": 3.921109530631172e-06, + "loss": 0.5086, + "step": 3972 + }, + { + "epoch": 1.878486997635934, + "grad_norm": 2.7961080074310303, + "learning_rate": 3.920596250161394e-06, + "loss": 0.5454, + "step": 3973 + }, + { + "epoch": 1.8789598108747045, + "grad_norm": 3.022007465362549, + "learning_rate": 3.920082881237317e-06, + "loss": 0.5537, + "step": 3974 + }, + { + "epoch": 1.8794326241134751, + "grad_norm": 2.699885129928589, + "learning_rate": 3.9195694238909045e-06, + "loss": 0.5274, + "step": 3975 + }, + { + "epoch": 1.879905437352246, + "grad_norm": 2.3994593620300293, + "learning_rate": 3.919055878154129e-06, + "loss": 0.4134, + "step": 3976 + }, + { + "epoch": 1.8803782505910167, + "grad_norm": 4.093045711517334, + "learning_rate": 3.918542244058967e-06, + "loss": 0.5305, + "step": 3977 + }, + { + "epoch": 1.8808510638297873, + "grad_norm": 3.011643171310425, + "learning_rate": 3.9180285216374e-06, + "loss": 0.5481, + "step": 3978 + }, + { + "epoch": 1.8813238770685579, + "grad_norm": 2.6426854133605957, + "learning_rate": 3.917514710921414e-06, + "loss": 0.5415, + "step": 3979 + }, + { + "epoch": 1.8817966903073287, + "grad_norm": 2.4379019737243652, + "learning_rate": 3.917000811943002e-06, + "loss": 0.4566, + "step": 3980 + }, + { + "epoch": 1.8822695035460995, + "grad_norm": 3.18522047996521, + "learning_rate": 3.9164868247341634e-06, + "loss": 0.6079, + "step": 3981 + }, + { + "epoch": 1.88274231678487, + "grad_norm": 2.6451141834259033, + "learning_rate": 3.915972749326903e-06, + "loss": 0.515, + "step": 3982 + }, + { + "epoch": 1.8832151300236406, + "grad_norm": 2.565598726272583, + "learning_rate": 3.915458585753226e-06, + "loss": 0.4799, + "step": 3983 + }, + { + "epoch": 1.8836879432624114, + "grad_norm": 2.711651563644409, + "learning_rate": 3.91494433404515e-06, + "loss": 0.5595, + "step": 3984 + }, + { + "epoch": 1.8841607565011822, + "grad_norm": 2.749328851699829, + "learning_rate": 3.914429994234695e-06, + "loss": 0.495, + "step": 3985 + }, + { + "epoch": 1.8846335697399526, + "grad_norm": 2.9492287635803223, + "learning_rate": 3.913915566353886e-06, + "loss": 0.5683, + "step": 3986 + }, + { + "epoch": 1.8851063829787233, + "grad_norm": 3.07747745513916, + "learning_rate": 3.913401050434756e-06, + "loss": 0.4953, + "step": 3987 + }, + { + "epoch": 1.8855791962174941, + "grad_norm": 2.8746345043182373, + "learning_rate": 3.912886446509338e-06, + "loss": 0.4752, + "step": 3988 + }, + { + "epoch": 1.8860520094562647, + "grad_norm": 2.772954225540161, + "learning_rate": 3.912371754609677e-06, + "loss": 0.5473, + "step": 3989 + }, + { + "epoch": 1.8865248226950353, + "grad_norm": 2.8906044960021973, + "learning_rate": 3.911856974767821e-06, + "loss": 0.5285, + "step": 3990 + }, + { + "epoch": 1.886997635933806, + "grad_norm": 2.8992726802825928, + "learning_rate": 3.9113421070158206e-06, + "loss": 0.571, + "step": 3991 + }, + { + "epoch": 1.887470449172577, + "grad_norm": 2.624662160873413, + "learning_rate": 3.910827151385737e-06, + "loss": 0.5183, + "step": 3992 + }, + { + "epoch": 
1.8879432624113475, + "grad_norm": 2.4491732120513916, + "learning_rate": 3.910312107909632e-06, + "loss": 0.4205, + "step": 3993 + }, + { + "epoch": 1.888416075650118, + "grad_norm": 2.278259515762329, + "learning_rate": 3.909796976619575e-06, + "loss": 0.4464, + "step": 3994 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 2.6481523513793945, + "learning_rate": 3.909281757547644e-06, + "loss": 0.5023, + "step": 3995 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 2.6687493324279785, + "learning_rate": 3.908766450725917e-06, + "loss": 0.495, + "step": 3996 + }, + { + "epoch": 1.8898345153664302, + "grad_norm": 2.507525682449341, + "learning_rate": 3.908251056186481e-06, + "loss": 0.4155, + "step": 3997 + }, + { + "epoch": 1.8903073286052008, + "grad_norm": 2.7048323154449463, + "learning_rate": 3.907735573961426e-06, + "loss": 0.4601, + "step": 3998 + }, + { + "epoch": 1.8907801418439716, + "grad_norm": 2.6825389862060547, + "learning_rate": 3.907220004082848e-06, + "loss": 0.5067, + "step": 3999 + }, + { + "epoch": 1.8912529550827424, + "grad_norm": 2.775696039199829, + "learning_rate": 3.906704346582852e-06, + "loss": 0.5411, + "step": 4000 + }, + { + "epoch": 1.891725768321513, + "grad_norm": 2.4492077827453613, + "learning_rate": 3.906188601493545e-06, + "loss": 0.4931, + "step": 4001 + }, + { + "epoch": 1.8921985815602835, + "grad_norm": 2.320810556411743, + "learning_rate": 3.905672768847041e-06, + "loss": 0.4908, + "step": 4002 + }, + { + "epoch": 1.8926713947990543, + "grad_norm": 2.455162525177002, + "learning_rate": 3.905156848675455e-06, + "loss": 0.508, + "step": 4003 + }, + { + "epoch": 1.8931442080378251, + "grad_norm": 2.515921115875244, + "learning_rate": 3.904640841010915e-06, + "loss": 0.5318, + "step": 4004 + }, + { + "epoch": 1.8936170212765957, + "grad_norm": 2.7230770587921143, + "learning_rate": 3.904124745885548e-06, + "loss": 0.4793, + "step": 4005 + }, + { + "epoch": 1.8940898345153663, + "grad_norm": 2.519934892654419, + "learning_rate": 3.903608563331491e-06, + "loss": 0.5013, + "step": 4006 + }, + { + "epoch": 1.894562647754137, + "grad_norm": 2.719674587249756, + "learning_rate": 3.903092293380883e-06, + "loss": 0.516, + "step": 4007 + }, + { + "epoch": 1.8950354609929079, + "grad_norm": 3.2107343673706055, + "learning_rate": 3.902575936065869e-06, + "loss": 0.6297, + "step": 4008 + }, + { + "epoch": 1.8955082742316784, + "grad_norm": 2.9773149490356445, + "learning_rate": 3.902059491418603e-06, + "loss": 0.566, + "step": 4009 + }, + { + "epoch": 1.895981087470449, + "grad_norm": 2.6754770278930664, + "learning_rate": 3.90154295947124e-06, + "loss": 0.5187, + "step": 4010 + }, + { + "epoch": 1.8964539007092198, + "grad_norm": 2.457303762435913, + "learning_rate": 3.901026340255943e-06, + "loss": 0.5757, + "step": 4011 + }, + { + "epoch": 1.8969267139479906, + "grad_norm": 2.5944161415100098, + "learning_rate": 3.900509633804878e-06, + "loss": 0.5049, + "step": 4012 + }, + { + "epoch": 1.8973995271867612, + "grad_norm": 2.610445022583008, + "learning_rate": 3.89999284015022e-06, + "loss": 0.521, + "step": 4013 + }, + { + "epoch": 1.8978723404255318, + "grad_norm": 2.6949338912963867, + "learning_rate": 3.899475959324146e-06, + "loss": 0.5619, + "step": 4014 + }, + { + "epoch": 1.8983451536643026, + "grad_norm": 2.7889559268951416, + "learning_rate": 3.898958991358841e-06, + "loss": 0.5223, + "step": 4015 + }, + { + "epoch": 1.8988179669030734, + "grad_norm": 2.569265842437744, + "learning_rate": 3.898441936286493e-06, + "loss": 0.5724, + "step": 
4016 + }, + { + "epoch": 1.899290780141844, + "grad_norm": 2.3567774295806885, + "learning_rate": 3.897924794139299e-06, + "loss": 0.4784, + "step": 4017 + }, + { + "epoch": 1.8997635933806145, + "grad_norm": 2.9176526069641113, + "learning_rate": 3.897407564949457e-06, + "loss": 0.646, + "step": 4018 + }, + { + "epoch": 1.9002364066193853, + "grad_norm": 2.7870090007781982, + "learning_rate": 3.896890248749174e-06, + "loss": 0.4922, + "step": 4019 + }, + { + "epoch": 1.900709219858156, + "grad_norm": 2.8310980796813965, + "learning_rate": 3.89637284557066e-06, + "loss": 0.4746, + "step": 4020 + }, + { + "epoch": 1.9011820330969267, + "grad_norm": 2.434915542602539, + "learning_rate": 3.895855355446131e-06, + "loss": 0.4537, + "step": 4021 + }, + { + "epoch": 1.9016548463356973, + "grad_norm": 3.0547034740448, + "learning_rate": 3.89533777840781e-06, + "loss": 0.6161, + "step": 4022 + }, + { + "epoch": 1.902127659574468, + "grad_norm": 3.416774272918701, + "learning_rate": 3.894820114487925e-06, + "loss": 0.5448, + "step": 4023 + }, + { + "epoch": 1.9026004728132389, + "grad_norm": 2.606951951980591, + "learning_rate": 3.894302363718707e-06, + "loss": 0.5501, + "step": 4024 + }, + { + "epoch": 1.9030732860520094, + "grad_norm": 3.082165002822876, + "learning_rate": 3.8937845261323945e-06, + "loss": 0.6035, + "step": 4025 + }, + { + "epoch": 1.90354609929078, + "grad_norm": 2.616093397140503, + "learning_rate": 3.893266601761231e-06, + "loss": 0.5294, + "step": 4026 + }, + { + "epoch": 1.9040189125295508, + "grad_norm": 2.7141637802124023, + "learning_rate": 3.8927485906374654e-06, + "loss": 0.5481, + "step": 4027 + }, + { + "epoch": 1.9044917257683216, + "grad_norm": 2.5129404067993164, + "learning_rate": 3.892230492793352e-06, + "loss": 0.4958, + "step": 4028 + }, + { + "epoch": 1.9049645390070922, + "grad_norm": 2.703403949737549, + "learning_rate": 3.891712308261151e-06, + "loss": 0.4852, + "step": 4029 + }, + { + "epoch": 1.9054373522458627, + "grad_norm": 2.881058931350708, + "learning_rate": 3.891194037073127e-06, + "loss": 0.4662, + "step": 4030 + }, + { + "epoch": 1.9059101654846335, + "grad_norm": 3.216769218444824, + "learning_rate": 3.8906756792615505e-06, + "loss": 0.5076, + "step": 4031 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 2.442265748977661, + "learning_rate": 3.890157234858697e-06, + "loss": 0.4748, + "step": 4032 + }, + { + "epoch": 1.906855791962175, + "grad_norm": 3.088672399520874, + "learning_rate": 3.889638703896849e-06, + "loss": 0.5729, + "step": 4033 + }, + { + "epoch": 1.9073286052009455, + "grad_norm": 2.9304986000061035, + "learning_rate": 3.889120086408291e-06, + "loss": 0.603, + "step": 4034 + }, + { + "epoch": 1.9078014184397163, + "grad_norm": 2.686093807220459, + "learning_rate": 3.888601382425318e-06, + "loss": 0.4978, + "step": 4035 + }, + { + "epoch": 1.908274231678487, + "grad_norm": 2.5668389797210693, + "learning_rate": 3.888082591980225e-06, + "loss": 0.5086, + "step": 4036 + }, + { + "epoch": 1.9087470449172577, + "grad_norm": 2.530996561050415, + "learning_rate": 3.887563715105315e-06, + "loss": 0.4678, + "step": 4037 + }, + { + "epoch": 1.9092198581560282, + "grad_norm": 3.043342351913452, + "learning_rate": 3.887044751832897e-06, + "loss": 0.5452, + "step": 4038 + }, + { + "epoch": 1.909692671394799, + "grad_norm": 2.799734115600586, + "learning_rate": 3.886525702195284e-06, + "loss": 0.5265, + "step": 4039 + }, + { + "epoch": 1.9101654846335698, + "grad_norm": 2.890022039413452, + "learning_rate": 3.886006566224796e-06, + "loss": 
0.4634, + "step": 4040 + }, + { + "epoch": 1.9106382978723404, + "grad_norm": 2.6804237365722656, + "learning_rate": 3.8854873439537555e-06, + "loss": 0.5031, + "step": 4041 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 2.43038272857666, + "learning_rate": 3.884968035414495e-06, + "loss": 0.5098, + "step": 4042 + }, + { + "epoch": 1.9115839243498818, + "grad_norm": 2.589583396911621, + "learning_rate": 3.884448640639346e-06, + "loss": 0.498, + "step": 4043 + }, + { + "epoch": 1.9120567375886526, + "grad_norm": 2.4565231800079346, + "learning_rate": 3.8839291596606524e-06, + "loss": 0.4318, + "step": 4044 + }, + { + "epoch": 1.9125295508274232, + "grad_norm": 2.66762638092041, + "learning_rate": 3.8834095925107575e-06, + "loss": 0.5441, + "step": 4045 + }, + { + "epoch": 1.9130023640661937, + "grad_norm": 2.7334461212158203, + "learning_rate": 3.882889939222013e-06, + "loss": 0.5209, + "step": 4046 + }, + { + "epoch": 1.9134751773049645, + "grad_norm": 2.6398537158966064, + "learning_rate": 3.8823701998267765e-06, + "loss": 0.4874, + "step": 4047 + }, + { + "epoch": 1.9139479905437353, + "grad_norm": 2.82405161857605, + "learning_rate": 3.881850374357409e-06, + "loss": 0.4519, + "step": 4048 + }, + { + "epoch": 1.914420803782506, + "grad_norm": 2.7552523612976074, + "learning_rate": 3.8813304628462776e-06, + "loss": 0.547, + "step": 4049 + }, + { + "epoch": 1.9148936170212765, + "grad_norm": 2.5287928581237793, + "learning_rate": 3.880810465325755e-06, + "loss": 0.5226, + "step": 4050 + }, + { + "epoch": 1.9153664302600473, + "grad_norm": 2.7597358226776123, + "learning_rate": 3.88029038182822e-06, + "loss": 0.5171, + "step": 4051 + }, + { + "epoch": 1.915839243498818, + "grad_norm": 2.563899278640747, + "learning_rate": 3.879770212386055e-06, + "loss": 0.4911, + "step": 4052 + }, + { + "epoch": 1.9163120567375886, + "grad_norm": 2.499404191970825, + "learning_rate": 3.879249957031649e-06, + "loss": 0.5072, + "step": 4053 + }, + { + "epoch": 1.9167848699763592, + "grad_norm": 2.817713499069214, + "learning_rate": 3.878729615797396e-06, + "loss": 0.5452, + "step": 4054 + }, + { + "epoch": 1.91725768321513, + "grad_norm": 2.7152490615844727, + "learning_rate": 3.878209188715696e-06, + "loss": 0.4917, + "step": 4055 + }, + { + "epoch": 1.9177304964539008, + "grad_norm": 2.384265661239624, + "learning_rate": 3.877688675818953e-06, + "loss": 0.4823, + "step": 4056 + }, + { + "epoch": 1.9182033096926714, + "grad_norm": 2.61059308052063, + "learning_rate": 3.877168077139577e-06, + "loss": 0.478, + "step": 4057 + }, + { + "epoch": 1.918676122931442, + "grad_norm": 2.6107938289642334, + "learning_rate": 3.8766473927099824e-06, + "loss": 0.5202, + "step": 4058 + }, + { + "epoch": 1.9191489361702128, + "grad_norm": 2.2339766025543213, + "learning_rate": 3.876126622562592e-06, + "loss": 0.547, + "step": 4059 + }, + { + "epoch": 1.9196217494089836, + "grad_norm": 2.4324610233306885, + "learning_rate": 3.8756057667298304e-06, + "loss": 0.5333, + "step": 4060 + }, + { + "epoch": 1.9200945626477541, + "grad_norm": 2.5521230697631836, + "learning_rate": 3.875084825244131e-06, + "loss": 0.5503, + "step": 4061 + }, + { + "epoch": 1.9205673758865247, + "grad_norm": 2.6985747814178467, + "learning_rate": 3.874563798137928e-06, + "loss": 0.4944, + "step": 4062 + }, + { + "epoch": 1.9210401891252955, + "grad_norm": 2.422332525253296, + "learning_rate": 3.874042685443664e-06, + "loss": 0.4807, + "step": 4063 + }, + { + "epoch": 1.9215130023640663, + "grad_norm": 2.914553165435791, + "learning_rate": 
3.873521487193788e-06, + "loss": 0.4439, + "step": 4064 + }, + { + "epoch": 1.9219858156028369, + "grad_norm": 2.8098697662353516, + "learning_rate": 3.873000203420752e-06, + "loss": 0.5433, + "step": 4065 + }, + { + "epoch": 1.9224586288416075, + "grad_norm": 2.6124703884124756, + "learning_rate": 3.872478834157013e-06, + "loss": 0.4812, + "step": 4066 + }, + { + "epoch": 1.9229314420803783, + "grad_norm": 2.511059522628784, + "learning_rate": 3.871957379435035e-06, + "loss": 0.4666, + "step": 4067 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 2.950542688369751, + "learning_rate": 3.871435839287287e-06, + "loss": 0.5687, + "step": 4068 + }, + { + "epoch": 1.9238770685579196, + "grad_norm": 2.4969422817230225, + "learning_rate": 3.870914213746243e-06, + "loss": 0.5235, + "step": 4069 + }, + { + "epoch": 1.9243498817966902, + "grad_norm": 2.512152910232544, + "learning_rate": 3.870392502844382e-06, + "loss": 0.4524, + "step": 4070 + }, + { + "epoch": 1.924822695035461, + "grad_norm": 3.0212557315826416, + "learning_rate": 3.86987070661419e-06, + "loss": 0.4868, + "step": 4071 + }, + { + "epoch": 1.9252955082742318, + "grad_norm": 2.8949966430664062, + "learning_rate": 3.869348825088154e-06, + "loss": 0.5556, + "step": 4072 + }, + { + "epoch": 1.9257683215130024, + "grad_norm": 2.402043581008911, + "learning_rate": 3.868826858298772e-06, + "loss": 0.5307, + "step": 4073 + }, + { + "epoch": 1.926241134751773, + "grad_norm": 2.980992078781128, + "learning_rate": 3.868304806278543e-06, + "loss": 0.6313, + "step": 4074 + }, + { + "epoch": 1.9267139479905437, + "grad_norm": 2.7140514850616455, + "learning_rate": 3.867782669059975e-06, + "loss": 0.5359, + "step": 4075 + }, + { + "epoch": 1.9271867612293145, + "grad_norm": 2.499631643295288, + "learning_rate": 3.867260446675577e-06, + "loss": 0.4873, + "step": 4076 + }, + { + "epoch": 1.9276595744680851, + "grad_norm": 2.915583610534668, + "learning_rate": 3.866738139157866e-06, + "loss": 0.5736, + "step": 4077 + }, + { + "epoch": 1.9281323877068557, + "grad_norm": 2.4231131076812744, + "learning_rate": 3.866215746539363e-06, + "loss": 0.5096, + "step": 4078 + }, + { + "epoch": 1.9286052009456265, + "grad_norm": 2.360074996948242, + "learning_rate": 3.865693268852599e-06, + "loss": 0.4907, + "step": 4079 + }, + { + "epoch": 1.9290780141843973, + "grad_norm": 2.5410032272338867, + "learning_rate": 3.865170706130101e-06, + "loss": 0.473, + "step": 4080 + }, + { + "epoch": 1.9295508274231679, + "grad_norm": 2.780090808868408, + "learning_rate": 3.86464805840441e-06, + "loss": 0.5213, + "step": 4081 + }, + { + "epoch": 1.9300236406619384, + "grad_norm": 2.7318382263183594, + "learning_rate": 3.864125325708068e-06, + "loss": 0.5617, + "step": 4082 + }, + { + "epoch": 1.9304964539007092, + "grad_norm": 2.76509165763855, + "learning_rate": 3.863602508073623e-06, + "loss": 0.52, + "step": 4083 + }, + { + "epoch": 1.93096926713948, + "grad_norm": 2.8041110038757324, + "learning_rate": 3.863079605533631e-06, + "loss": 0.5343, + "step": 4084 + }, + { + "epoch": 1.9314420803782506, + "grad_norm": 2.4462404251098633, + "learning_rate": 3.862556618120647e-06, + "loss": 0.4657, + "step": 4085 + }, + { + "epoch": 1.9319148936170212, + "grad_norm": 2.460864305496216, + "learning_rate": 3.862033545867238e-06, + "loss": 0.517, + "step": 4086 + }, + { + "epoch": 1.932387706855792, + "grad_norm": 2.6480276584625244, + "learning_rate": 3.8615103888059715e-06, + "loss": 0.4702, + "step": 4087 + }, + { + "epoch": 1.9328605200945628, + "grad_norm": 
2.7175381183624268, + "learning_rate": 3.860987146969424e-06, + "loss": 0.5073, + "step": 4088 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 2.4963486194610596, + "learning_rate": 3.860463820390175e-06, + "loss": 0.4491, + "step": 4089 + }, + { + "epoch": 1.933806146572104, + "grad_norm": 2.548135757446289, + "learning_rate": 3.8599404091008075e-06, + "loss": 0.5134, + "step": 4090 + }, + { + "epoch": 1.9342789598108747, + "grad_norm": 2.8693668842315674, + "learning_rate": 3.859416913133916e-06, + "loss": 0.5467, + "step": 4091 + }, + { + "epoch": 1.9347517730496455, + "grad_norm": 2.711273670196533, + "learning_rate": 3.858893332522092e-06, + "loss": 0.6287, + "step": 4092 + }, + { + "epoch": 1.935224586288416, + "grad_norm": 2.8604533672332764, + "learning_rate": 3.858369667297941e-06, + "loss": 0.5661, + "step": 4093 + }, + { + "epoch": 1.9356973995271867, + "grad_norm": 2.936988353729248, + "learning_rate": 3.857845917494066e-06, + "loss": 0.5311, + "step": 4094 + }, + { + "epoch": 1.9361702127659575, + "grad_norm": 2.414093494415283, + "learning_rate": 3.857322083143079e-06, + "loss": 0.505, + "step": 4095 + }, + { + "epoch": 1.9366430260047283, + "grad_norm": 2.5528934001922607, + "learning_rate": 3.856798164277599e-06, + "loss": 0.4759, + "step": 4096 + }, + { + "epoch": 1.9371158392434988, + "grad_norm": 2.592893600463867, + "learning_rate": 3.8562741609302456e-06, + "loss": 0.4932, + "step": 4097 + }, + { + "epoch": 1.9375886524822694, + "grad_norm": 2.9619107246398926, + "learning_rate": 3.855750073133648e-06, + "loss": 0.5563, + "step": 4098 + }, + { + "epoch": 1.9380614657210402, + "grad_norm": 2.864889621734619, + "learning_rate": 3.855225900920438e-06, + "loss": 0.5069, + "step": 4099 + }, + { + "epoch": 1.938534278959811, + "grad_norm": 2.3951032161712646, + "learning_rate": 3.854701644323253e-06, + "loss": 0.4883, + "step": 4100 + }, + { + "epoch": 1.9390070921985816, + "grad_norm": 2.6339633464813232, + "learning_rate": 3.854177303374737e-06, + "loss": 0.5207, + "step": 4101 + }, + { + "epoch": 1.9394799054373522, + "grad_norm": 2.6435508728027344, + "learning_rate": 3.853652878107539e-06, + "loss": 0.4679, + "step": 4102 + }, + { + "epoch": 1.939952718676123, + "grad_norm": 2.4635629653930664, + "learning_rate": 3.853128368554311e-06, + "loss": 0.5639, + "step": 4103 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 2.664635419845581, + "learning_rate": 3.852603774747714e-06, + "loss": 0.5697, + "step": 4104 + }, + { + "epoch": 1.9408983451536643, + "grad_norm": 2.7020363807678223, + "learning_rate": 3.8520790967204095e-06, + "loss": 0.5462, + "step": 4105 + }, + { + "epoch": 1.941371158392435, + "grad_norm": 3.529282331466675, + "learning_rate": 3.851554334505069e-06, + "loss": 0.54, + "step": 4106 + }, + { + "epoch": 1.9418439716312057, + "grad_norm": 2.7125768661499023, + "learning_rate": 3.851029488134367e-06, + "loss": 0.5355, + "step": 4107 + }, + { + "epoch": 1.9423167848699765, + "grad_norm": 2.5226643085479736, + "learning_rate": 3.850504557640981e-06, + "loss": 0.5106, + "step": 4108 + }, + { + "epoch": 1.942789598108747, + "grad_norm": 2.834352731704712, + "learning_rate": 3.8499795430575995e-06, + "loss": 0.6069, + "step": 4109 + }, + { + "epoch": 1.9432624113475176, + "grad_norm": 2.8484177589416504, + "learning_rate": 3.849454444416911e-06, + "loss": 0.5542, + "step": 4110 + }, + { + "epoch": 1.9437352245862884, + "grad_norm": 2.402539014816284, + "learning_rate": 3.848929261751612e-06, + "loss": 0.47, + "step": 4111 + }, + { + "epoch": 
1.9442080378250592, + "grad_norm": 2.7010042667388916, + "learning_rate": 3.848403995094402e-06, + "loss": 0.5263, + "step": 4112 + }, + { + "epoch": 1.9446808510638298, + "grad_norm": 2.441689968109131, + "learning_rate": 3.847878644477988e-06, + "loss": 0.5607, + "step": 4113 + }, + { + "epoch": 1.9451536643026004, + "grad_norm": 2.5994722843170166, + "learning_rate": 3.847353209935081e-06, + "loss": 0.5103, + "step": 4114 + }, + { + "epoch": 1.9456264775413712, + "grad_norm": 2.452242136001587, + "learning_rate": 3.8468276914983975e-06, + "loss": 0.4409, + "step": 4115 + }, + { + "epoch": 1.946099290780142, + "grad_norm": 2.421023368835449, + "learning_rate": 3.84630208920066e-06, + "loss": 0.4429, + "step": 4116 + }, + { + "epoch": 1.9465721040189126, + "grad_norm": 2.696399688720703, + "learning_rate": 3.8457764030745945e-06, + "loss": 0.5352, + "step": 4117 + }, + { + "epoch": 1.9470449172576831, + "grad_norm": 2.3963489532470703, + "learning_rate": 3.845250633152933e-06, + "loss": 0.4505, + "step": 4118 + }, + { + "epoch": 1.947517730496454, + "grad_norm": 2.610649585723877, + "learning_rate": 3.8447247794684135e-06, + "loss": 0.501, + "step": 4119 + }, + { + "epoch": 1.9479905437352247, + "grad_norm": 2.740412712097168, + "learning_rate": 3.8441988420537775e-06, + "loss": 0.5362, + "step": 4120 + }, + { + "epoch": 1.9484633569739953, + "grad_norm": 2.2614004611968994, + "learning_rate": 3.8436728209417755e-06, + "loss": 0.4199, + "step": 4121 + }, + { + "epoch": 1.9489361702127659, + "grad_norm": 3.0683481693267822, + "learning_rate": 3.843146716165158e-06, + "loss": 0.5248, + "step": 4122 + }, + { + "epoch": 1.9494089834515367, + "grad_norm": 3.005174398422241, + "learning_rate": 3.842620527756684e-06, + "loss": 0.5246, + "step": 4123 + }, + { + "epoch": 1.9498817966903075, + "grad_norm": 2.672896385192871, + "learning_rate": 3.842094255749117e-06, + "loss": 0.5586, + "step": 4124 + }, + { + "epoch": 1.950354609929078, + "grad_norm": 2.5481197834014893, + "learning_rate": 3.8415679001752255e-06, + "loss": 0.5061, + "step": 4125 + }, + { + "epoch": 1.9508274231678486, + "grad_norm": 2.515789270401001, + "learning_rate": 3.8410414610677835e-06, + "loss": 0.4645, + "step": 4126 + }, + { + "epoch": 1.9513002364066194, + "grad_norm": 2.7236077785491943, + "learning_rate": 3.84051493845957e-06, + "loss": 0.5623, + "step": 4127 + }, + { + "epoch": 1.9517730496453902, + "grad_norm": 2.6252009868621826, + "learning_rate": 3.839988332383369e-06, + "loss": 0.5078, + "step": 4128 + }, + { + "epoch": 1.9522458628841608, + "grad_norm": 2.719196081161499, + "learning_rate": 3.83946164287197e-06, + "loss": 0.5481, + "step": 4129 + }, + { + "epoch": 1.9527186761229314, + "grad_norm": 2.484163284301758, + "learning_rate": 3.838934869958169e-06, + "loss": 0.5332, + "step": 4130 + }, + { + "epoch": 1.9531914893617022, + "grad_norm": 2.615382671356201, + "learning_rate": 3.838408013674764e-06, + "loss": 0.4742, + "step": 4131 + }, + { + "epoch": 1.953664302600473, + "grad_norm": 2.735321044921875, + "learning_rate": 3.83788107405456e-06, + "loss": 0.421, + "step": 4132 + }, + { + "epoch": 1.9541371158392435, + "grad_norm": 2.892652750015259, + "learning_rate": 3.837354051130369e-06, + "loss": 0.5326, + "step": 4133 + }, + { + "epoch": 1.9546099290780141, + "grad_norm": 2.6800546646118164, + "learning_rate": 3.8368269449350055e-06, + "loss": 0.5041, + "step": 4134 + }, + { + "epoch": 1.955082742316785, + "grad_norm": 2.362470865249634, + "learning_rate": 3.836299755501289e-06, + "loss": 0.4697, + 
"step": 4135 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 2.3855135440826416, + "learning_rate": 3.835772482862047e-06, + "loss": 0.5148, + "step": 4136 + }, + { + "epoch": 1.9560283687943263, + "grad_norm": 2.3338418006896973, + "learning_rate": 3.83524512705011e-06, + "loss": 0.4643, + "step": 4137 + }, + { + "epoch": 1.9565011820330969, + "grad_norm": 2.261355400085449, + "learning_rate": 3.834717688098313e-06, + "loss": 0.5573, + "step": 4138 + }, + { + "epoch": 1.9569739952718677, + "grad_norm": 2.8166391849517822, + "learning_rate": 3.834190166039498e-06, + "loss": 0.4868, + "step": 4139 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 2.4155869483947754, + "learning_rate": 3.833662560906512e-06, + "loss": 0.4923, + "step": 4140 + }, + { + "epoch": 1.957919621749409, + "grad_norm": 2.3977696895599365, + "learning_rate": 3.833134872732206e-06, + "loss": 0.5106, + "step": 4141 + }, + { + "epoch": 1.9583924349881796, + "grad_norm": 2.9541378021240234, + "learning_rate": 3.832607101549438e-06, + "loss": 0.4683, + "step": 4142 + }, + { + "epoch": 1.9588652482269504, + "grad_norm": 2.5862700939178467, + "learning_rate": 3.832079247391068e-06, + "loss": 0.4453, + "step": 4143 + }, + { + "epoch": 1.9593380614657212, + "grad_norm": 2.7459371089935303, + "learning_rate": 3.8315513102899644e-06, + "loss": 0.5511, + "step": 4144 + }, + { + "epoch": 1.9598108747044918, + "grad_norm": 2.904869556427002, + "learning_rate": 3.831023290279e-06, + "loss": 0.5348, + "step": 4145 + }, + { + "epoch": 1.9602836879432624, + "grad_norm": 3.092846632003784, + "learning_rate": 3.830495187391051e-06, + "loss": 0.5664, + "step": 4146 + }, + { + "epoch": 1.9607565011820332, + "grad_norm": 3.2838528156280518, + "learning_rate": 3.829967001659001e-06, + "loss": 0.5115, + "step": 4147 + }, + { + "epoch": 1.961229314420804, + "grad_norm": 2.7799549102783203, + "learning_rate": 3.829438733115738e-06, + "loss": 0.5145, + "step": 4148 + }, + { + "epoch": 1.9617021276595743, + "grad_norm": 2.436084270477295, + "learning_rate": 3.828910381794154e-06, + "loss": 0.4718, + "step": 4149 + }, + { + "epoch": 1.962174940898345, + "grad_norm": 2.6662371158599854, + "learning_rate": 3.828381947727148e-06, + "loss": 0.6129, + "step": 4150 + }, + { + "epoch": 1.962647754137116, + "grad_norm": 2.937000036239624, + "learning_rate": 3.827853430947622e-06, + "loss": 0.522, + "step": 4151 + }, + { + "epoch": 1.9631205673758865, + "grad_norm": 2.5737369060516357, + "learning_rate": 3.827324831488486e-06, + "loss": 0.4916, + "step": 4152 + }, + { + "epoch": 1.963593380614657, + "grad_norm": 2.70232892036438, + "learning_rate": 3.826796149382653e-06, + "loss": 0.4726, + "step": 4153 + }, + { + "epoch": 1.9640661938534278, + "grad_norm": 2.6899707317352295, + "learning_rate": 3.826267384663042e-06, + "loss": 0.529, + "step": 4154 + }, + { + "epoch": 1.9645390070921986, + "grad_norm": 2.6142728328704834, + "learning_rate": 3.825738537362575e-06, + "loss": 0.4999, + "step": 4155 + }, + { + "epoch": 1.9650118203309692, + "grad_norm": 2.43949818611145, + "learning_rate": 3.825209607514183e-06, + "loss": 0.5035, + "step": 4156 + }, + { + "epoch": 1.9654846335697398, + "grad_norm": 2.3735458850860596, + "learning_rate": 3.824680595150801e-06, + "loss": 0.4779, + "step": 4157 + }, + { + "epoch": 1.9659574468085106, + "grad_norm": 2.444307565689087, + "learning_rate": 3.824151500305365e-06, + "loss": 0.4825, + "step": 4158 + }, + { + "epoch": 1.9664302600472814, + "grad_norm": 2.8219668865203857, + "learning_rate": 
3.8236223230108224e-06, + "loss": 0.5354, + "step": 4159 + }, + { + "epoch": 1.966903073286052, + "grad_norm": 2.720721483230591, + "learning_rate": 3.823093063300121e-06, + "loss": 0.5064, + "step": 4160 + }, + { + "epoch": 1.9673758865248225, + "grad_norm": 2.324190616607666, + "learning_rate": 3.822563721206217e-06, + "loss": 0.5348, + "step": 4161 + }, + { + "epoch": 1.9678486997635933, + "grad_norm": 2.702155351638794, + "learning_rate": 3.8220342967620695e-06, + "loss": 0.5388, + "step": 4162 + }, + { + "epoch": 1.9683215130023641, + "grad_norm": 2.4956369400024414, + "learning_rate": 3.821504790000642e-06, + "loss": 0.5071, + "step": 4163 + }, + { + "epoch": 1.9687943262411347, + "grad_norm": 2.568039655685425, + "learning_rate": 3.820975200954906e-06, + "loss": 0.5133, + "step": 4164 + }, + { + "epoch": 1.9692671394799053, + "grad_norm": 2.810868978500366, + "learning_rate": 3.820445529657837e-06, + "loss": 0.4856, + "step": 4165 + }, + { + "epoch": 1.969739952718676, + "grad_norm": 2.66365647315979, + "learning_rate": 3.819915776142415e-06, + "loss": 0.5235, + "step": 4166 + }, + { + "epoch": 1.9702127659574469, + "grad_norm": 2.2982139587402344, + "learning_rate": 3.8193859404416265e-06, + "loss": 0.4361, + "step": 4167 + }, + { + "epoch": 1.9706855791962175, + "grad_norm": 2.585672378540039, + "learning_rate": 3.818856022588458e-06, + "loss": 0.4842, + "step": 4168 + }, + { + "epoch": 1.971158392434988, + "grad_norm": 2.57857346534729, + "learning_rate": 3.81832602261591e-06, + "loss": 0.5249, + "step": 4169 + }, + { + "epoch": 1.9716312056737588, + "grad_norm": 2.6947224140167236, + "learning_rate": 3.817795940556981e-06, + "loss": 0.5234, + "step": 4170 + }, + { + "epoch": 1.9721040189125296, + "grad_norm": 2.7453415393829346, + "learning_rate": 3.8172657764446764e-06, + "loss": 0.5219, + "step": 4171 + }, + { + "epoch": 1.9725768321513002, + "grad_norm": 8.424073219299316, + "learning_rate": 3.816735530312009e-06, + "loss": 0.5162, + "step": 4172 + }, + { + "epoch": 1.9730496453900708, + "grad_norm": 2.8229739665985107, + "learning_rate": 3.816205202191993e-06, + "loss": 0.4621, + "step": 4173 + }, + { + "epoch": 1.9735224586288416, + "grad_norm": 2.5969009399414062, + "learning_rate": 3.815674792117651e-06, + "loss": 0.5044, + "step": 4174 + }, + { + "epoch": 1.9739952718676124, + "grad_norm": 2.646024227142334, + "learning_rate": 3.815144300122009e-06, + "loss": 0.5094, + "step": 4175 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 2.4950616359710693, + "learning_rate": 3.814613726238097e-06, + "loss": 0.4827, + "step": 4176 + }, + { + "epoch": 1.9749408983451535, + "grad_norm": 2.5636119842529297, + "learning_rate": 3.8140830704989535e-06, + "loss": 0.5241, + "step": 4177 + }, + { + "epoch": 1.9754137115839243, + "grad_norm": 2.7936553955078125, + "learning_rate": 3.813552332937619e-06, + "loss": 0.5344, + "step": 4178 + }, + { + "epoch": 1.9758865248226951, + "grad_norm": 2.8085341453552246, + "learning_rate": 3.8130215135871405e-06, + "loss": 0.5647, + "step": 4179 + }, + { + "epoch": 1.9763593380614657, + "grad_norm": 2.4776322841644287, + "learning_rate": 3.8124906124805694e-06, + "loss": 0.542, + "step": 4180 + }, + { + "epoch": 1.9768321513002363, + "grad_norm": 2.3227856159210205, + "learning_rate": 3.8119596296509635e-06, + "loss": 0.4618, + "step": 4181 + }, + { + "epoch": 1.977304964539007, + "grad_norm": 2.5157814025878906, + "learning_rate": 3.8114285651313848e-06, + "loss": 0.538, + "step": 4182 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 
2.5630218982696533, + "learning_rate": 3.8108974189548987e-06, + "loss": 0.5254, + "step": 4183 + }, + { + "epoch": 1.9782505910165484, + "grad_norm": 2.703237533569336, + "learning_rate": 3.8103661911545787e-06, + "loss": 0.4859, + "step": 4184 + }, + { + "epoch": 1.978723404255319, + "grad_norm": 2.8808000087738037, + "learning_rate": 3.809834881763502e-06, + "loss": 0.5585, + "step": 4185 + }, + { + "epoch": 1.9791962174940898, + "grad_norm": 2.9047577381134033, + "learning_rate": 3.8093034908147507e-06, + "loss": 0.5022, + "step": 4186 + }, + { + "epoch": 1.9796690307328606, + "grad_norm": 2.7417640686035156, + "learning_rate": 3.8087720183414125e-06, + "loss": 0.5275, + "step": 4187 + }, + { + "epoch": 1.9801418439716312, + "grad_norm": 2.952012062072754, + "learning_rate": 3.8082404643765786e-06, + "loss": 0.543, + "step": 4188 + }, + { + "epoch": 1.9806146572104018, + "grad_norm": 2.538376569747925, + "learning_rate": 3.807708828953348e-06, + "loss": 0.4969, + "step": 4189 + }, + { + "epoch": 1.9810874704491725, + "grad_norm": 2.3476181030273438, + "learning_rate": 3.807177112104823e-06, + "loss": 0.4979, + "step": 4190 + }, + { + "epoch": 1.9815602836879433, + "grad_norm": 2.6480464935302734, + "learning_rate": 3.80664531386411e-06, + "loss": 0.4894, + "step": 4191 + }, + { + "epoch": 1.982033096926714, + "grad_norm": 2.792916774749756, + "learning_rate": 3.8061134342643235e-06, + "loss": 0.5468, + "step": 4192 + }, + { + "epoch": 1.9825059101654845, + "grad_norm": 2.368736743927002, + "learning_rate": 3.805581473338581e-06, + "loss": 0.4672, + "step": 4193 + }, + { + "epoch": 1.9829787234042553, + "grad_norm": 2.379084348678589, + "learning_rate": 3.8050494311200037e-06, + "loss": 0.4577, + "step": 4194 + }, + { + "epoch": 1.983451536643026, + "grad_norm": 2.722471237182617, + "learning_rate": 3.804517307641722e-06, + "loss": 0.4988, + "step": 4195 + }, + { + "epoch": 1.9839243498817967, + "grad_norm": 2.356649875640869, + "learning_rate": 3.8039851029368674e-06, + "loss": 0.4933, + "step": 4196 + }, + { + "epoch": 1.9843971631205672, + "grad_norm": 2.9182281494140625, + "learning_rate": 3.8034528170385776e-06, + "loss": 0.4873, + "step": 4197 + }, + { + "epoch": 1.984869976359338, + "grad_norm": 2.6232199668884277, + "learning_rate": 3.8029204499799976e-06, + "loss": 0.4425, + "step": 4198 + }, + { + "epoch": 1.9853427895981088, + "grad_norm": 2.667541980743408, + "learning_rate": 3.802388001794274e-06, + "loss": 0.5022, + "step": 4199 + }, + { + "epoch": 1.9858156028368794, + "grad_norm": 3.168470621109009, + "learning_rate": 3.8018554725145596e-06, + "loss": 0.5505, + "step": 4200 + }, + { + "epoch": 1.98628841607565, + "grad_norm": 2.716625452041626, + "learning_rate": 3.8013228621740132e-06, + "loss": 0.4937, + "step": 4201 + }, + { + "epoch": 1.9867612293144208, + "grad_norm": 2.3014442920684814, + "learning_rate": 3.800790170805799e-06, + "loss": 0.4734, + "step": 4202 + }, + { + "epoch": 1.9872340425531916, + "grad_norm": 2.9426841735839844, + "learning_rate": 3.8002573984430847e-06, + "loss": 0.4983, + "step": 4203 + }, + { + "epoch": 1.9877068557919622, + "grad_norm": 2.5598278045654297, + "learning_rate": 3.7997245451190435e-06, + "loss": 0.4834, + "step": 4204 + }, + { + "epoch": 1.9881796690307327, + "grad_norm": 2.86458420753479, + "learning_rate": 3.7991916108668538e-06, + "loss": 0.5613, + "step": 4205 + }, + { + "epoch": 1.9886524822695035, + "grad_norm": 2.842914342880249, + "learning_rate": 3.7986585957196997e-06, + "loss": 0.4951, + "step": 4206 + }, + { + 
"epoch": 1.9891252955082743, + "grad_norm": 3.1828150749206543, + "learning_rate": 3.7981254997107686e-06, + "loss": 0.5913, + "step": 4207 + }, + { + "epoch": 1.989598108747045, + "grad_norm": 2.5765931606292725, + "learning_rate": 3.7975923228732547e-06, + "loss": 0.5544, + "step": 4208 + }, + { + "epoch": 1.9900709219858155, + "grad_norm": 2.492234945297241, + "learning_rate": 3.797059065240357e-06, + "loss": 0.5046, + "step": 4209 + }, + { + "epoch": 1.9905437352245863, + "grad_norm": 2.870346784591675, + "learning_rate": 3.7965257268452795e-06, + "loss": 0.5354, + "step": 4210 + }, + { + "epoch": 1.991016548463357, + "grad_norm": 2.4989993572235107, + "learning_rate": 3.795992307721229e-06, + "loss": 0.4677, + "step": 4211 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 2.931114673614502, + "learning_rate": 3.7954588079014206e-06, + "loss": 0.5504, + "step": 4212 + }, + { + "epoch": 1.9919621749408982, + "grad_norm": 2.5247652530670166, + "learning_rate": 3.794925227419073e-06, + "loss": 0.4736, + "step": 4213 + }, + { + "epoch": 1.992434988179669, + "grad_norm": 2.6238436698913574, + "learning_rate": 3.794391566307409e-06, + "loss": 0.4591, + "step": 4214 + }, + { + "epoch": 1.9929078014184398, + "grad_norm": 2.654886245727539, + "learning_rate": 3.7938578245996584e-06, + "loss": 0.5149, + "step": 4215 + }, + { + "epoch": 1.9933806146572104, + "grad_norm": 2.509164810180664, + "learning_rate": 3.793324002329054e-06, + "loss": 0.4951, + "step": 4216 + }, + { + "epoch": 1.993853427895981, + "grad_norm": 2.909632921218872, + "learning_rate": 3.7927900995288345e-06, + "loss": 0.5131, + "step": 4217 + }, + { + "epoch": 1.9943262411347518, + "grad_norm": 2.4354615211486816, + "learning_rate": 3.7922561162322456e-06, + "loss": 0.4716, + "step": 4218 + }, + { + "epoch": 1.9947990543735226, + "grad_norm": 2.6514649391174316, + "learning_rate": 3.791722052472534e-06, + "loss": 0.5714, + "step": 4219 + }, + { + "epoch": 1.9952718676122931, + "grad_norm": 2.77089262008667, + "learning_rate": 3.791187908282954e-06, + "loss": 0.5736, + "step": 4220 + }, + { + "epoch": 1.9957446808510637, + "grad_norm": 2.7651021480560303, + "learning_rate": 3.7906536836967657e-06, + "loss": 0.4948, + "step": 4221 + }, + { + "epoch": 1.9962174940898345, + "grad_norm": 2.7536795139312744, + "learning_rate": 3.7901193787472306e-06, + "loss": 0.512, + "step": 4222 + }, + { + "epoch": 1.9966903073286053, + "grad_norm": 2.684893846511841, + "learning_rate": 3.78958499346762e-06, + "loss": 0.5118, + "step": 4223 + }, + { + "epoch": 1.9971631205673759, + "grad_norm": 2.7616753578186035, + "learning_rate": 3.7890505278912054e-06, + "loss": 0.4516, + "step": 4224 + }, + { + "epoch": 1.9976359338061465, + "grad_norm": 2.4731967449188232, + "learning_rate": 3.7885159820512666e-06, + "loss": 0.4736, + "step": 4225 + }, + { + "epoch": 1.9981087470449173, + "grad_norm": 2.366631031036377, + "learning_rate": 3.7879813559810884e-06, + "loss": 0.4999, + "step": 4226 + }, + { + "epoch": 1.998581560283688, + "grad_norm": 2.994624137878418, + "learning_rate": 3.7874466497139582e-06, + "loss": 0.5273, + "step": 4227 + }, + { + "epoch": 1.9990543735224586, + "grad_norm": 2.4499242305755615, + "learning_rate": 3.7869118632831712e-06, + "loss": 0.5761, + "step": 4228 + }, + { + "epoch": 1.9995271867612292, + "grad_norm": 2.3370113372802734, + "learning_rate": 3.7863769967220243e-06, + "loss": 0.4673, + "step": 4229 + }, + { + "epoch": 2.0, + "grad_norm": 3.1131203174591064, + "learning_rate": 3.7858420500638236e-06, + "loss": 
0.5118, + "step": 4230 + }, + { + "epoch": 2.000472813238771, + "grad_norm": 2.2747561931610107, + "learning_rate": 3.785307023341876e-06, + "loss": 0.4166, + "step": 4231 + }, + { + "epoch": 2.000945626477541, + "grad_norm": 2.4347424507141113, + "learning_rate": 3.7847719165894963e-06, + "loss": 0.4161, + "step": 4232 + }, + { + "epoch": 2.001418439716312, + "grad_norm": 2.398805618286133, + "learning_rate": 3.784236729840003e-06, + "loss": 0.4652, + "step": 4233 + }, + { + "epoch": 2.0018912529550827, + "grad_norm": 2.1904916763305664, + "learning_rate": 3.783701463126719e-06, + "loss": 0.4554, + "step": 4234 + }, + { + "epoch": 2.0023640661938535, + "grad_norm": 2.237330913543701, + "learning_rate": 3.7831661164829735e-06, + "loss": 0.4471, + "step": 4235 + }, + { + "epoch": 2.002836879432624, + "grad_norm": 2.3656628131866455, + "learning_rate": 3.7826306899421016e-06, + "loss": 0.4052, + "step": 4236 + }, + { + "epoch": 2.0033096926713947, + "grad_norm": 2.615489959716797, + "learning_rate": 3.7820951835374405e-06, + "loss": 0.4847, + "step": 4237 + }, + { + "epoch": 2.0037825059101655, + "grad_norm": 2.453036308288574, + "learning_rate": 3.7815595973023347e-06, + "loss": 0.4672, + "step": 4238 + }, + { + "epoch": 2.0042553191489363, + "grad_norm": 2.537468671798706, + "learning_rate": 3.7810239312701306e-06, + "loss": 0.467, + "step": 4239 + }, + { + "epoch": 2.0047281323877066, + "grad_norm": 2.3321666717529297, + "learning_rate": 3.780488185474184e-06, + "loss": 0.3557, + "step": 4240 + }, + { + "epoch": 2.0052009456264774, + "grad_norm": 2.9051828384399414, + "learning_rate": 3.779952359947854e-06, + "loss": 0.5474, + "step": 4241 + }, + { + "epoch": 2.0056737588652482, + "grad_norm": 2.7458817958831787, + "learning_rate": 3.7794164547245015e-06, + "loss": 0.4659, + "step": 4242 + }, + { + "epoch": 2.006146572104019, + "grad_norm": 2.627046585083008, + "learning_rate": 3.778880469837497e-06, + "loss": 0.4179, + "step": 4243 + }, + { + "epoch": 2.0066193853427894, + "grad_norm": 2.4186174869537354, + "learning_rate": 3.7783444053202135e-06, + "loss": 0.3976, + "step": 4244 + }, + { + "epoch": 2.00709219858156, + "grad_norm": 3.109376907348633, + "learning_rate": 3.7778082612060296e-06, + "loss": 0.4095, + "step": 4245 + }, + { + "epoch": 2.007565011820331, + "grad_norm": 2.583376169204712, + "learning_rate": 3.7772720375283282e-06, + "loss": 0.4325, + "step": 4246 + }, + { + "epoch": 2.0080378250591018, + "grad_norm": 2.6199896335601807, + "learning_rate": 3.776735734320497e-06, + "loss": 0.4207, + "step": 4247 + }, + { + "epoch": 2.008510638297872, + "grad_norm": 2.545353651046753, + "learning_rate": 3.77619935161593e-06, + "loss": 0.4483, + "step": 4248 + }, + { + "epoch": 2.008983451536643, + "grad_norm": 2.770266056060791, + "learning_rate": 3.7756628894480263e-06, + "loss": 0.457, + "step": 4249 + }, + { + "epoch": 2.0094562647754137, + "grad_norm": 2.903254985809326, + "learning_rate": 3.7751263478501878e-06, + "loss": 0.4171, + "step": 4250 + }, + { + "epoch": 2.0099290780141845, + "grad_norm": 2.5576963424682617, + "learning_rate": 3.774589726855822e-06, + "loss": 0.3631, + "step": 4251 + }, + { + "epoch": 2.010401891252955, + "grad_norm": 3.7584285736083984, + "learning_rate": 3.7740530264983434e-06, + "loss": 0.4827, + "step": 4252 + }, + { + "epoch": 2.0108747044917257, + "grad_norm": 3.3116581439971924, + "learning_rate": 3.77351624681117e-06, + "loss": 0.5071, + "step": 4253 + }, + { + "epoch": 2.0113475177304965, + "grad_norm": 3.1370885372161865, + "learning_rate": 
3.772979387827723e-06, + "loss": 0.4963, + "step": 4254 + }, + { + "epoch": 2.0118203309692673, + "grad_norm": 2.4832639694213867, + "learning_rate": 3.772442449581432e-06, + "loss": 0.4442, + "step": 4255 + }, + { + "epoch": 2.0122931442080376, + "grad_norm": 2.7645785808563232, + "learning_rate": 3.7719054321057293e-06, + "loss": 0.4572, + "step": 4256 + }, + { + "epoch": 2.0127659574468084, + "grad_norm": 2.7962236404418945, + "learning_rate": 3.7713683354340515e-06, + "loss": 0.4906, + "step": 4257 + }, + { + "epoch": 2.013238770685579, + "grad_norm": 2.647991895675659, + "learning_rate": 3.7708311595998425e-06, + "loss": 0.4027, + "step": 4258 + }, + { + "epoch": 2.01371158392435, + "grad_norm": 2.3780267238616943, + "learning_rate": 3.7702939046365504e-06, + "loss": 0.4285, + "step": 4259 + }, + { + "epoch": 2.0141843971631204, + "grad_norm": 2.5185933113098145, + "learning_rate": 3.7697565705776266e-06, + "loss": 0.4834, + "step": 4260 + }, + { + "epoch": 2.014657210401891, + "grad_norm": 2.432507276535034, + "learning_rate": 3.7692191574565294e-06, + "loss": 0.3695, + "step": 4261 + }, + { + "epoch": 2.015130023640662, + "grad_norm": 2.8010706901550293, + "learning_rate": 3.76868166530672e-06, + "loss": 0.478, + "step": 4262 + }, + { + "epoch": 2.0156028368794328, + "grad_norm": 2.32817006111145, + "learning_rate": 3.768144094161666e-06, + "loss": 0.4154, + "step": 4263 + }, + { + "epoch": 2.016075650118203, + "grad_norm": 3.062812328338623, + "learning_rate": 3.7676064440548405e-06, + "loss": 0.5015, + "step": 4264 + }, + { + "epoch": 2.016548463356974, + "grad_norm": 2.6129536628723145, + "learning_rate": 3.7670687150197194e-06, + "loss": 0.3843, + "step": 4265 + }, + { + "epoch": 2.0170212765957447, + "grad_norm": 2.838259696960449, + "learning_rate": 3.766530907089786e-06, + "loss": 0.4937, + "step": 4266 + }, + { + "epoch": 2.0174940898345155, + "grad_norm": 2.601203680038452, + "learning_rate": 3.7659930202985263e-06, + "loss": 0.4644, + "step": 4267 + }, + { + "epoch": 2.017966903073286, + "grad_norm": 2.5964133739471436, + "learning_rate": 3.7654550546794322e-06, + "loss": 0.4365, + "step": 4268 + }, + { + "epoch": 2.0184397163120567, + "grad_norm": 3.0028915405273438, + "learning_rate": 3.764917010266001e-06, + "loss": 0.434, + "step": 4269 + }, + { + "epoch": 2.0189125295508275, + "grad_norm": 2.719252586364746, + "learning_rate": 3.764378887091734e-06, + "loss": 0.4401, + "step": 4270 + }, + { + "epoch": 2.0193853427895982, + "grad_norm": 2.400254011154175, + "learning_rate": 3.7638406851901377e-06, + "loss": 0.4904, + "step": 4271 + }, + { + "epoch": 2.0198581560283686, + "grad_norm": 2.8015363216400146, + "learning_rate": 3.763302404594724e-06, + "loss": 0.4569, + "step": 4272 + }, + { + "epoch": 2.0203309692671394, + "grad_norm": 2.718416452407837, + "learning_rate": 3.762764045339009e-06, + "loss": 0.5124, + "step": 4273 + }, + { + "epoch": 2.02080378250591, + "grad_norm": 2.484049081802368, + "learning_rate": 3.762225607456514e-06, + "loss": 0.4255, + "step": 4274 + }, + { + "epoch": 2.021276595744681, + "grad_norm": 2.6377930641174316, + "learning_rate": 3.7616870909807645e-06, + "loss": 0.5044, + "step": 4275 + }, + { + "epoch": 2.0217494089834513, + "grad_norm": 2.8845038414001465, + "learning_rate": 3.7611484959452927e-06, + "loss": 0.4924, + "step": 4276 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 2.5939974784851074, + "learning_rate": 3.7606098223836342e-06, + "loss": 0.4873, + "step": 4277 + }, + { + "epoch": 2.022695035460993, + "grad_norm": 
2.499826431274414, + "learning_rate": 3.76007107032933e-06, + "loss": 0.4515, + "step": 4278 + }, + { + "epoch": 2.0231678486997637, + "grad_norm": 3.0318663120269775, + "learning_rate": 3.759532239815924e-06, + "loss": 0.4901, + "step": 4279 + }, + { + "epoch": 2.023640661938534, + "grad_norm": 2.857977867126465, + "learning_rate": 3.758993330876969e-06, + "loss": 0.4659, + "step": 4280 + }, + { + "epoch": 2.024113475177305, + "grad_norm": 2.47918438911438, + "learning_rate": 3.7584543435460196e-06, + "loss": 0.4323, + "step": 4281 + }, + { + "epoch": 2.0245862884160757, + "grad_norm": 2.6033785343170166, + "learning_rate": 3.757915277856637e-06, + "loss": 0.4437, + "step": 4282 + }, + { + "epoch": 2.0250591016548465, + "grad_norm": 2.799781322479248, + "learning_rate": 3.757376133842386e-06, + "loss": 0.4523, + "step": 4283 + }, + { + "epoch": 2.025531914893617, + "grad_norm": 2.6092529296875, + "learning_rate": 3.756836911536836e-06, + "loss": 0.3898, + "step": 4284 + }, + { + "epoch": 2.0260047281323876, + "grad_norm": 2.66229248046875, + "learning_rate": 3.7562976109735627e-06, + "loss": 0.4731, + "step": 4285 + }, + { + "epoch": 2.0264775413711584, + "grad_norm": 2.90142822265625, + "learning_rate": 3.7557582321861463e-06, + "loss": 0.4285, + "step": 4286 + }, + { + "epoch": 2.0269503546099292, + "grad_norm": 2.5138802528381348, + "learning_rate": 3.7552187752081707e-06, + "loss": 0.4467, + "step": 4287 + }, + { + "epoch": 2.0274231678486996, + "grad_norm": 3.0656235218048096, + "learning_rate": 3.754679240073226e-06, + "loss": 0.4718, + "step": 4288 + }, + { + "epoch": 2.0278959810874704, + "grad_norm": 2.9633383750915527, + "learning_rate": 3.754139626814907e-06, + "loss": 0.4741, + "step": 4289 + }, + { + "epoch": 2.028368794326241, + "grad_norm": 2.5925145149230957, + "learning_rate": 3.753599935466812e-06, + "loss": 0.4281, + "step": 4290 + }, + { + "epoch": 2.028841607565012, + "grad_norm": 2.837740659713745, + "learning_rate": 3.7530601660625456e-06, + "loss": 0.4757, + "step": 4291 + }, + { + "epoch": 2.0293144208037823, + "grad_norm": 2.3995790481567383, + "learning_rate": 3.752520318635718e-06, + "loss": 0.4148, + "step": 4292 + }, + { + "epoch": 2.029787234042553, + "grad_norm": 2.572601795196533, + "learning_rate": 3.7519803932199424e-06, + "loss": 0.4051, + "step": 4293 + }, + { + "epoch": 2.030260047281324, + "grad_norm": 2.6780295372009277, + "learning_rate": 3.751440389848837e-06, + "loss": 0.4626, + "step": 4294 + }, + { + "epoch": 2.0307328605200947, + "grad_norm": 2.8666839599609375, + "learning_rate": 3.7509003085560257e-06, + "loss": 0.4255, + "step": 4295 + }, + { + "epoch": 2.031205673758865, + "grad_norm": 2.4398207664489746, + "learning_rate": 3.750360149375138e-06, + "loss": 0.4235, + "step": 4296 + }, + { + "epoch": 2.031678486997636, + "grad_norm": 2.436840534210205, + "learning_rate": 3.7498199123398062e-06, + "loss": 0.3907, + "step": 4297 + }, + { + "epoch": 2.0321513002364067, + "grad_norm": 3.3945820331573486, + "learning_rate": 3.7492795974836683e-06, + "loss": 0.465, + "step": 4298 + }, + { + "epoch": 2.0326241134751775, + "grad_norm": 2.6693103313446045, + "learning_rate": 3.7487392048403678e-06, + "loss": 0.4948, + "step": 4299 + }, + { + "epoch": 2.033096926713948, + "grad_norm": 2.7642734050750732, + "learning_rate": 3.748198734443553e-06, + "loss": 0.4538, + "step": 4300 + }, + { + "epoch": 2.0335697399527186, + "grad_norm": 3.1436543464660645, + "learning_rate": 3.747658186326876e-06, + "loss": 0.5137, + "step": 4301 + }, + { + "epoch": 
2.0340425531914894, + "grad_norm": 3.482678174972534, + "learning_rate": 3.7471175605239947e-06, + "loss": 0.4982, + "step": 4302 + }, + { + "epoch": 2.03451536643026, + "grad_norm": 2.712557077407837, + "learning_rate": 3.746576857068571e-06, + "loss": 0.4459, + "step": 4303 + }, + { + "epoch": 2.0349881796690306, + "grad_norm": 3.147440195083618, + "learning_rate": 3.7460360759942726e-06, + "loss": 0.5063, + "step": 4304 + }, + { + "epoch": 2.0354609929078014, + "grad_norm": 2.840672492980957, + "learning_rate": 3.7454952173347714e-06, + "loss": 0.5041, + "step": 4305 + }, + { + "epoch": 2.035933806146572, + "grad_norm": 2.584122657775879, + "learning_rate": 3.744954281123745e-06, + "loss": 0.4487, + "step": 4306 + }, + { + "epoch": 2.036406619385343, + "grad_norm": 2.9869542121887207, + "learning_rate": 3.7444132673948737e-06, + "loss": 0.479, + "step": 4307 + }, + { + "epoch": 2.0368794326241133, + "grad_norm": 2.478459358215332, + "learning_rate": 3.7438721761818446e-06, + "loss": 0.4636, + "step": 4308 + }, + { + "epoch": 2.037352245862884, + "grad_norm": 2.5524215698242188, + "learning_rate": 3.7433310075183504e-06, + "loss": 0.4601, + "step": 4309 + }, + { + "epoch": 2.037825059101655, + "grad_norm": 2.3709988594055176, + "learning_rate": 3.742789761438086e-06, + "loss": 0.4163, + "step": 4310 + }, + { + "epoch": 2.0382978723404257, + "grad_norm": 3.140355348587036, + "learning_rate": 3.742248437974752e-06, + "loss": 0.4433, + "step": 4311 + }, + { + "epoch": 2.038770685579196, + "grad_norm": 2.940948486328125, + "learning_rate": 3.741707037162055e-06, + "loss": 0.4299, + "step": 4312 + }, + { + "epoch": 2.039243498817967, + "grad_norm": 3.009157419204712, + "learning_rate": 3.7411655590337055e-06, + "loss": 0.463, + "step": 4313 + }, + { + "epoch": 2.0397163120567376, + "grad_norm": 2.672945737838745, + "learning_rate": 3.7406240036234185e-06, + "loss": 0.4696, + "step": 4314 + }, + { + "epoch": 2.0401891252955084, + "grad_norm": 2.745962142944336, + "learning_rate": 3.740082370964916e-06, + "loss": 0.4931, + "step": 4315 + }, + { + "epoch": 2.040661938534279, + "grad_norm": 2.3939316272735596, + "learning_rate": 3.7395406610919217e-06, + "loss": 0.4396, + "step": 4316 + }, + { + "epoch": 2.0411347517730496, + "grad_norm": 2.4364447593688965, + "learning_rate": 3.738998874038165e-06, + "loss": 0.4807, + "step": 4317 + }, + { + "epoch": 2.0416075650118204, + "grad_norm": 2.360489845275879, + "learning_rate": 3.738457009837381e-06, + "loss": 0.4426, + "step": 4318 + }, + { + "epoch": 2.042080378250591, + "grad_norm": 2.5494935512542725, + "learning_rate": 3.7379150685233108e-06, + "loss": 0.4189, + "step": 4319 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 2.635472059249878, + "learning_rate": 3.7373730501296963e-06, + "loss": 0.5014, + "step": 4320 + }, + { + "epoch": 2.0430260047281323, + "grad_norm": 2.4982943534851074, + "learning_rate": 3.7368309546902876e-06, + "loss": 0.4658, + "step": 4321 + }, + { + "epoch": 2.043498817966903, + "grad_norm": 2.692742109298706, + "learning_rate": 3.736288782238839e-06, + "loss": 0.4454, + "step": 4322 + }, + { + "epoch": 2.043971631205674, + "grad_norm": 2.6774091720581055, + "learning_rate": 3.7357465328091086e-06, + "loss": 0.5002, + "step": 4323 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 2.695138692855835, + "learning_rate": 3.735204206434861e-06, + "loss": 0.448, + "step": 4324 + }, + { + "epoch": 2.044917257683215, + "grad_norm": 2.5383570194244385, + "learning_rate": 3.7346618031498635e-06, + "loss": 0.4352, + 
"step": 4325 + }, + { + "epoch": 2.045390070921986, + "grad_norm": 2.267277240753174, + "learning_rate": 3.7341193229878886e-06, + "loss": 0.4162, + "step": 4326 + }, + { + "epoch": 2.0458628841607567, + "grad_norm": 2.6037328243255615, + "learning_rate": 3.733576765982715e-06, + "loss": 0.4471, + "step": 4327 + }, + { + "epoch": 2.046335697399527, + "grad_norm": 3.261385440826416, + "learning_rate": 3.7330341321681253e-06, + "loss": 0.4618, + "step": 4328 + }, + { + "epoch": 2.046808510638298, + "grad_norm": 2.440650463104248, + "learning_rate": 3.7324914215779072e-06, + "loss": 0.4476, + "step": 4329 + }, + { + "epoch": 2.0472813238770686, + "grad_norm": 2.5940682888031006, + "learning_rate": 3.731948634245853e-06, + "loss": 0.4389, + "step": 4330 + }, + { + "epoch": 2.0477541371158394, + "grad_norm": 2.7428150177001953, + "learning_rate": 3.7314057702057582e-06, + "loss": 0.4477, + "step": 4331 + }, + { + "epoch": 2.0482269503546098, + "grad_norm": 2.3546223640441895, + "learning_rate": 3.730862829491427e-06, + "loss": 0.4047, + "step": 4332 + }, + { + "epoch": 2.0486997635933806, + "grad_norm": 2.552422523498535, + "learning_rate": 3.7303198121366637e-06, + "loss": 0.4438, + "step": 4333 + }, + { + "epoch": 2.0491725768321514, + "grad_norm": 2.99226713180542, + "learning_rate": 3.729776718175281e-06, + "loss": 0.491, + "step": 4334 + }, + { + "epoch": 2.049645390070922, + "grad_norm": 3.2003321647644043, + "learning_rate": 3.7292335476410935e-06, + "loss": 0.5458, + "step": 4335 + }, + { + "epoch": 2.0501182033096925, + "grad_norm": 2.739847183227539, + "learning_rate": 3.7286903005679237e-06, + "loss": 0.4499, + "step": 4336 + }, + { + "epoch": 2.0505910165484633, + "grad_norm": 2.5917470455169678, + "learning_rate": 3.7281469769895963e-06, + "loss": 0.4714, + "step": 4337 + }, + { + "epoch": 2.051063829787234, + "grad_norm": 2.8029327392578125, + "learning_rate": 3.7276035769399422e-06, + "loss": 0.42, + "step": 4338 + }, + { + "epoch": 2.051536643026005, + "grad_norm": 2.484879493713379, + "learning_rate": 3.727060100452796e-06, + "loss": 0.4163, + "step": 4339 + }, + { + "epoch": 2.0520094562647753, + "grad_norm": 2.7126030921936035, + "learning_rate": 3.7265165475619973e-06, + "loss": 0.4112, + "step": 4340 + }, + { + "epoch": 2.052482269503546, + "grad_norm": 2.618267774581909, + "learning_rate": 3.7259729183013927e-06, + "loss": 0.4281, + "step": 4341 + }, + { + "epoch": 2.052955082742317, + "grad_norm": 2.703270673751831, + "learning_rate": 3.7254292127048293e-06, + "loss": 0.4437, + "step": 4342 + }, + { + "epoch": 2.0534278959810877, + "grad_norm": 2.429150104522705, + "learning_rate": 3.7248854308061623e-06, + "loss": 0.3971, + "step": 4343 + }, + { + "epoch": 2.053900709219858, + "grad_norm": 2.54354190826416, + "learning_rate": 3.7243415726392508e-06, + "loss": 0.4485, + "step": 4344 + }, + { + "epoch": 2.054373522458629, + "grad_norm": 2.9515016078948975, + "learning_rate": 3.723797638237957e-06, + "loss": 0.4386, + "step": 4345 + }, + { + "epoch": 2.0548463356973996, + "grad_norm": 2.9129958152770996, + "learning_rate": 3.7232536276361514e-06, + "loss": 0.4595, + "step": 4346 + }, + { + "epoch": 2.0553191489361704, + "grad_norm": 2.5397512912750244, + "learning_rate": 3.722709540867706e-06, + "loss": 0.3681, + "step": 4347 + }, + { + "epoch": 2.0557919621749408, + "grad_norm": 2.79884672164917, + "learning_rate": 3.722165377966499e-06, + "loss": 0.4576, + "step": 4348 + }, + { + "epoch": 2.0562647754137116, + "grad_norm": 2.669936180114746, + "learning_rate": 
3.7216211389664137e-06, + "loss": 0.3692, + "step": 4349 + }, + { + "epoch": 2.0567375886524824, + "grad_norm": 2.512326240539551, + "learning_rate": 3.7210768239013355e-06, + "loss": 0.4554, + "step": 4350 + }, + { + "epoch": 2.057210401891253, + "grad_norm": 2.913693904876709, + "learning_rate": 3.7205324328051583e-06, + "loss": 0.5282, + "step": 4351 + }, + { + "epoch": 2.0576832151300235, + "grad_norm": 3.040891170501709, + "learning_rate": 3.719987965711778e-06, + "loss": 0.4778, + "step": 4352 + }, + { + "epoch": 2.0581560283687943, + "grad_norm": 2.7504117488861084, + "learning_rate": 3.7194434226550966e-06, + "loss": 0.4217, + "step": 4353 + }, + { + "epoch": 2.058628841607565, + "grad_norm": 2.5522971153259277, + "learning_rate": 3.718898803669021e-06, + "loss": 0.437, + "step": 4354 + }, + { + "epoch": 2.059101654846336, + "grad_norm": 2.8531908988952637, + "learning_rate": 3.718354108787461e-06, + "loss": 0.4251, + "step": 4355 + }, + { + "epoch": 2.0595744680851062, + "grad_norm": 2.5812065601348877, + "learning_rate": 3.7178093380443337e-06, + "loss": 0.4374, + "step": 4356 + }, + { + "epoch": 2.060047281323877, + "grad_norm": 2.627871513366699, + "learning_rate": 3.7172644914735583e-06, + "loss": 0.436, + "step": 4357 + }, + { + "epoch": 2.060520094562648, + "grad_norm": 2.7146239280700684, + "learning_rate": 3.7167195691090607e-06, + "loss": 0.4204, + "step": 4358 + }, + { + "epoch": 2.0609929078014186, + "grad_norm": 2.486483573913574, + "learning_rate": 3.7161745709847706e-06, + "loss": 0.4015, + "step": 4359 + }, + { + "epoch": 2.061465721040189, + "grad_norm": 2.866049289703369, + "learning_rate": 3.7156294971346226e-06, + "loss": 0.4087, + "step": 4360 + }, + { + "epoch": 2.06193853427896, + "grad_norm": 2.9345552921295166, + "learning_rate": 3.715084347592556e-06, + "loss": 0.5074, + "step": 4361 + }, + { + "epoch": 2.0624113475177306, + "grad_norm": 2.502455711364746, + "learning_rate": 3.7145391223925155e-06, + "loss": 0.469, + "step": 4362 + }, + { + "epoch": 2.0628841607565014, + "grad_norm": 2.6419875621795654, + "learning_rate": 3.713993821568449e-06, + "loss": 0.4493, + "step": 4363 + }, + { + "epoch": 2.0633569739952717, + "grad_norm": 3.812079429626465, + "learning_rate": 3.7134484451543114e-06, + "loss": 0.4764, + "step": 4364 + }, + { + "epoch": 2.0638297872340425, + "grad_norm": 2.581780195236206, + "learning_rate": 3.712902993184059e-06, + "loss": 0.3994, + "step": 4365 + }, + { + "epoch": 2.0643026004728133, + "grad_norm": 2.282508134841919, + "learning_rate": 3.712357465691656e-06, + "loss": 0.4252, + "step": 4366 + }, + { + "epoch": 2.064775413711584, + "grad_norm": 2.4727818965911865, + "learning_rate": 3.71181186271107e-06, + "loss": 0.4558, + "step": 4367 + }, + { + "epoch": 2.0652482269503545, + "grad_norm": 2.7661173343658447, + "learning_rate": 3.711266184276272e-06, + "loss": 0.505, + "step": 4368 + }, + { + "epoch": 2.0657210401891253, + "grad_norm": 2.6264543533325195, + "learning_rate": 3.71072043042124e-06, + "loss": 0.4297, + "step": 4369 + }, + { + "epoch": 2.066193853427896, + "grad_norm": 2.773699998855591, + "learning_rate": 3.7101746011799565e-06, + "loss": 0.4267, + "step": 4370 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 2.686955213546753, + "learning_rate": 3.709628696586407e-06, + "loss": 0.4099, + "step": 4371 + }, + { + "epoch": 2.0671394799054372, + "grad_norm": 2.6066620349884033, + "learning_rate": 3.709082716674582e-06, + "loss": 0.4146, + "step": 4372 + }, + { + "epoch": 2.067612293144208, + "grad_norm": 
2.7769250869750977, + "learning_rate": 3.7085366614784784e-06, + "loss": 0.4047, + "step": 4373 + }, + { + "epoch": 2.068085106382979, + "grad_norm": 2.4986939430236816, + "learning_rate": 3.7079905310320957e-06, + "loss": 0.4021, + "step": 4374 + }, + { + "epoch": 2.0685579196217496, + "grad_norm": 2.5456206798553467, + "learning_rate": 3.7074443253694402e-06, + "loss": 0.3569, + "step": 4375 + }, + { + "epoch": 2.06903073286052, + "grad_norm": 2.4079296588897705, + "learning_rate": 3.70689804452452e-06, + "loss": 0.4308, + "step": 4376 + }, + { + "epoch": 2.0695035460992908, + "grad_norm": 2.86014723777771, + "learning_rate": 3.7063516885313513e-06, + "loss": 0.4577, + "step": 4377 + }, + { + "epoch": 2.0699763593380616, + "grad_norm": 2.8025779724121094, + "learning_rate": 3.7058052574239523e-06, + "loss": 0.4615, + "step": 4378 + }, + { + "epoch": 2.0704491725768324, + "grad_norm": 2.902676820755005, + "learning_rate": 3.7052587512363475e-06, + "loss": 0.4765, + "step": 4379 + }, + { + "epoch": 2.0709219858156027, + "grad_norm": 2.814509391784668, + "learning_rate": 3.704712170002566e-06, + "loss": 0.434, + "step": 4380 + }, + { + "epoch": 2.0713947990543735, + "grad_norm": 2.7923502922058105, + "learning_rate": 3.704165513756639e-06, + "loss": 0.4626, + "step": 4381 + }, + { + "epoch": 2.0718676122931443, + "grad_norm": 2.6802031993865967, + "learning_rate": 3.703618782532606e-06, + "loss": 0.4835, + "step": 4382 + }, + { + "epoch": 2.072340425531915, + "grad_norm": 3.0963687896728516, + "learning_rate": 3.7030719763645085e-06, + "loss": 0.4813, + "step": 4383 + }, + { + "epoch": 2.0728132387706855, + "grad_norm": 2.5658695697784424, + "learning_rate": 3.7025250952863956e-06, + "loss": 0.4428, + "step": 4384 + }, + { + "epoch": 2.0732860520094563, + "grad_norm": 2.7738289833068848, + "learning_rate": 3.7019781393323167e-06, + "loss": 0.4376, + "step": 4385 + }, + { + "epoch": 2.073758865248227, + "grad_norm": 2.6446938514709473, + "learning_rate": 3.7014311085363303e-06, + "loss": 0.4208, + "step": 4386 + }, + { + "epoch": 2.0742316784869974, + "grad_norm": 2.7556118965148926, + "learning_rate": 3.7008840029324967e-06, + "loss": 0.3831, + "step": 4387 + }, + { + "epoch": 2.074704491725768, + "grad_norm": 2.573141574859619, + "learning_rate": 3.700336822554882e-06, + "loss": 0.4396, + "step": 4388 + }, + { + "epoch": 2.075177304964539, + "grad_norm": 2.762319803237915, + "learning_rate": 3.6997895674375566e-06, + "loss": 0.4579, + "step": 4389 + }, + { + "epoch": 2.07565011820331, + "grad_norm": 2.729780435562134, + "learning_rate": 3.699242237614596e-06, + "loss": 0.4262, + "step": 4390 + }, + { + "epoch": 2.0761229314420806, + "grad_norm": 2.657480001449585, + "learning_rate": 3.698694833120079e-06, + "loss": 0.4176, + "step": 4391 + }, + { + "epoch": 2.076595744680851, + "grad_norm": 2.8433303833007812, + "learning_rate": 3.6981473539880914e-06, + "loss": 0.457, + "step": 4392 + }, + { + "epoch": 2.0770685579196217, + "grad_norm": 2.819047212600708, + "learning_rate": 3.6975998002527225e-06, + "loss": 0.4244, + "step": 4393 + }, + { + "epoch": 2.0775413711583925, + "grad_norm": 2.6565003395080566, + "learning_rate": 3.697052171948064e-06, + "loss": 0.4384, + "step": 4394 + }, + { + "epoch": 2.078014184397163, + "grad_norm": 2.5795063972473145, + "learning_rate": 3.696504469108216e-06, + "loss": 0.4958, + "step": 4395 + }, + { + "epoch": 2.0784869976359337, + "grad_norm": 2.455730676651001, + "learning_rate": 3.6959566917672822e-06, + "loss": 0.4191, + "step": 4396 + }, + { + 
"epoch": 2.0789598108747045, + "grad_norm": 2.6706607341766357, + "learning_rate": 3.6954088399593684e-06, + "loss": 0.4709, + "step": 4397 + }, + { + "epoch": 2.0794326241134753, + "grad_norm": 2.3758466243743896, + "learning_rate": 3.694860913718589e-06, + "loss": 0.4231, + "step": 4398 + }, + { + "epoch": 2.079905437352246, + "grad_norm": 2.3488340377807617, + "learning_rate": 3.6943129130790583e-06, + "loss": 0.4321, + "step": 4399 + }, + { + "epoch": 2.0803782505910164, + "grad_norm": 2.6438148021698, + "learning_rate": 3.6937648380748996e-06, + "loss": 0.4907, + "step": 4400 + }, + { + "epoch": 2.0808510638297872, + "grad_norm": 2.9826784133911133, + "learning_rate": 3.6932166887402395e-06, + "loss": 0.4404, + "step": 4401 + }, + { + "epoch": 2.081323877068558, + "grad_norm": 2.5203495025634766, + "learning_rate": 3.6926684651092076e-06, + "loss": 0.4337, + "step": 4402 + }, + { + "epoch": 2.0817966903073284, + "grad_norm": 2.7704148292541504, + "learning_rate": 3.692120167215941e-06, + "loss": 0.4195, + "step": 4403 + }, + { + "epoch": 2.082269503546099, + "grad_norm": 2.879430055618286, + "learning_rate": 3.6915717950945782e-06, + "loss": 0.4498, + "step": 4404 + }, + { + "epoch": 2.08274231678487, + "grad_norm": 2.7659497261047363, + "learning_rate": 3.6910233487792655e-06, + "loss": 0.4017, + "step": 4405 + }, + { + "epoch": 2.083215130023641, + "grad_norm": 3.4017205238342285, + "learning_rate": 3.6904748283041503e-06, + "loss": 0.4733, + "step": 4406 + }, + { + "epoch": 2.083687943262411, + "grad_norm": 2.706223249435425, + "learning_rate": 3.6899262337033887e-06, + "loss": 0.4926, + "step": 4407 + }, + { + "epoch": 2.084160756501182, + "grad_norm": 2.644932508468628, + "learning_rate": 3.6893775650111372e-06, + "loss": 0.3904, + "step": 4408 + }, + { + "epoch": 2.0846335697399527, + "grad_norm": 2.666585683822632, + "learning_rate": 3.6888288222615603e-06, + "loss": 0.4698, + "step": 4409 + }, + { + "epoch": 2.0851063829787235, + "grad_norm": 3.0058486461639404, + "learning_rate": 3.688280005488826e-06, + "loss": 0.5291, + "step": 4410 + }, + { + "epoch": 2.085579196217494, + "grad_norm": 2.533088445663452, + "learning_rate": 3.687731114727105e-06, + "loss": 0.393, + "step": 4411 + }, + { + "epoch": 2.0860520094562647, + "grad_norm": 2.921687364578247, + "learning_rate": 3.6871821500105763e-06, + "loss": 0.4719, + "step": 4412 + }, + { + "epoch": 2.0865248226950355, + "grad_norm": 2.291804313659668, + "learning_rate": 3.686633111373421e-06, + "loss": 0.4105, + "step": 4413 + }, + { + "epoch": 2.0869976359338063, + "grad_norm": 2.496333122253418, + "learning_rate": 3.6860839988498255e-06, + "loss": 0.4704, + "step": 4414 + }, + { + "epoch": 2.0874704491725766, + "grad_norm": 2.8059427738189697, + "learning_rate": 3.6855348124739787e-06, + "loss": 0.4961, + "step": 4415 + }, + { + "epoch": 2.0879432624113474, + "grad_norm": 2.683922290802002, + "learning_rate": 3.6849855522800795e-06, + "loss": 0.4838, + "step": 4416 + }, + { + "epoch": 2.088416075650118, + "grad_norm": 2.694148540496826, + "learning_rate": 3.684436218302324e-06, + "loss": 0.4812, + "step": 4417 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 2.724531888961792, + "learning_rate": 3.683886810574919e-06, + "loss": 0.4495, + "step": 4418 + }, + { + "epoch": 2.0893617021276594, + "grad_norm": 2.6176564693450928, + "learning_rate": 3.6833373291320746e-06, + "loss": 0.4698, + "step": 4419 + }, + { + "epoch": 2.08983451536643, + "grad_norm": 2.534116268157959, + "learning_rate": 3.6827877740080032e-06, + "loss": 
0.3912, + "step": 4420 + }, + { + "epoch": 2.090307328605201, + "grad_norm": 2.5747432708740234, + "learning_rate": 3.682238145236924e-06, + "loss": 0.4072, + "step": 4421 + }, + { + "epoch": 2.0907801418439718, + "grad_norm": 2.5947659015655518, + "learning_rate": 3.6816884428530588e-06, + "loss": 0.4638, + "step": 4422 + }, + { + "epoch": 2.091252955082742, + "grad_norm": 2.811992883682251, + "learning_rate": 3.6811386668906353e-06, + "loss": 0.4345, + "step": 4423 + }, + { + "epoch": 2.091725768321513, + "grad_norm": 2.7482287883758545, + "learning_rate": 3.680588817383886e-06, + "loss": 0.4541, + "step": 4424 + }, + { + "epoch": 2.0921985815602837, + "grad_norm": 2.987131357192993, + "learning_rate": 3.6800388943670484e-06, + "loss": 0.4571, + "step": 4425 + }, + { + "epoch": 2.0926713947990545, + "grad_norm": 3.1918671131134033, + "learning_rate": 3.6794888978743637e-06, + "loss": 0.5722, + "step": 4426 + }, + { + "epoch": 2.093144208037825, + "grad_norm": 2.5654571056365967, + "learning_rate": 3.678938827940076e-06, + "loss": 0.4686, + "step": 4427 + }, + { + "epoch": 2.0936170212765957, + "grad_norm": 2.942084789276123, + "learning_rate": 3.6783886845984383e-06, + "loss": 0.4512, + "step": 4428 + }, + { + "epoch": 2.0940898345153665, + "grad_norm": 2.74847674369812, + "learning_rate": 3.677838467883703e-06, + "loss": 0.4506, + "step": 4429 + }, + { + "epoch": 2.0945626477541373, + "grad_norm": 2.7569334506988525, + "learning_rate": 3.6772881778301322e-06, + "loss": 0.502, + "step": 4430 + }, + { + "epoch": 2.0950354609929076, + "grad_norm": 2.969966173171997, + "learning_rate": 3.6767378144719884e-06, + "loss": 0.4772, + "step": 4431 + }, + { + "epoch": 2.0955082742316784, + "grad_norm": 2.773524522781372, + "learning_rate": 3.67618737784354e-06, + "loss": 0.5183, + "step": 4432 + }, + { + "epoch": 2.095981087470449, + "grad_norm": 2.6760106086730957, + "learning_rate": 3.6756368679790617e-06, + "loss": 0.4787, + "step": 4433 + }, + { + "epoch": 2.09645390070922, + "grad_norm": 2.8758978843688965, + "learning_rate": 3.6750862849128304e-06, + "loss": 0.4275, + "step": 4434 + }, + { + "epoch": 2.0969267139479904, + "grad_norm": 2.670509099960327, + "learning_rate": 3.6745356286791288e-06, + "loss": 0.4401, + "step": 4435 + }, + { + "epoch": 2.097399527186761, + "grad_norm": 2.8453969955444336, + "learning_rate": 3.673984899312244e-06, + "loss": 0.4303, + "step": 4436 + }, + { + "epoch": 2.097872340425532, + "grad_norm": 2.6212339401245117, + "learning_rate": 3.673434096846468e-06, + "loss": 0.4675, + "step": 4437 + }, + { + "epoch": 2.0983451536643027, + "grad_norm": 2.8211941719055176, + "learning_rate": 3.672883221316095e-06, + "loss": 0.4678, + "step": 4438 + }, + { + "epoch": 2.098817966903073, + "grad_norm": 2.4838058948516846, + "learning_rate": 3.672332272755427e-06, + "loss": 0.4128, + "step": 4439 + }, + { + "epoch": 2.099290780141844, + "grad_norm": 2.596660852432251, + "learning_rate": 3.671781251198769e-06, + "loss": 0.423, + "step": 4440 + }, + { + "epoch": 2.0997635933806147, + "grad_norm": 2.9979989528656006, + "learning_rate": 3.67123015668043e-06, + "loss": 0.4493, + "step": 4441 + }, + { + "epoch": 2.1002364066193855, + "grad_norm": 2.6232850551605225, + "learning_rate": 3.670678989234725e-06, + "loss": 0.4237, + "step": 4442 + }, + { + "epoch": 2.100709219858156, + "grad_norm": 2.575039863586426, + "learning_rate": 3.670127748895973e-06, + "loss": 0.4464, + "step": 4443 + }, + { + "epoch": 2.1011820330969266, + "grad_norm": 2.3381190299987793, + "learning_rate": 
3.669576435698497e-06, + "loss": 0.4208, + "step": 4444 + }, + { + "epoch": 2.1016548463356974, + "grad_norm": 2.9645180702209473, + "learning_rate": 3.669025049676625e-06, + "loss": 0.5272, + "step": 4445 + }, + { + "epoch": 2.1021276595744682, + "grad_norm": 2.719320297241211, + "learning_rate": 3.668473590864689e-06, + "loss": 0.4485, + "step": 4446 + }, + { + "epoch": 2.1026004728132386, + "grad_norm": 2.8665547370910645, + "learning_rate": 3.6679220592970254e-06, + "loss": 0.4433, + "step": 4447 + }, + { + "epoch": 2.1030732860520094, + "grad_norm": 2.6922879219055176, + "learning_rate": 3.667370455007977e-06, + "loss": 0.502, + "step": 4448 + }, + { + "epoch": 2.10354609929078, + "grad_norm": 3.018228530883789, + "learning_rate": 3.6668187780318894e-06, + "loss": 0.4939, + "step": 4449 + }, + { + "epoch": 2.104018912529551, + "grad_norm": 3.187901735305786, + "learning_rate": 3.666267028403112e-06, + "loss": 0.4151, + "step": 4450 + }, + { + "epoch": 2.1044917257683213, + "grad_norm": 2.9521446228027344, + "learning_rate": 3.6657152061560012e-06, + "loss": 0.4343, + "step": 4451 + }, + { + "epoch": 2.104964539007092, + "grad_norm": 2.5125739574432373, + "learning_rate": 3.6651633113249164e-06, + "loss": 0.4071, + "step": 4452 + }, + { + "epoch": 2.105437352245863, + "grad_norm": 2.9164133071899414, + "learning_rate": 3.664611343944221e-06, + "loss": 0.4173, + "step": 4453 + }, + { + "epoch": 2.1059101654846337, + "grad_norm": 2.680893898010254, + "learning_rate": 3.6640593040482834e-06, + "loss": 0.4917, + "step": 4454 + }, + { + "epoch": 2.106382978723404, + "grad_norm": 2.6823534965515137, + "learning_rate": 3.6635071916714774e-06, + "loss": 0.4668, + "step": 4455 + }, + { + "epoch": 2.106855791962175, + "grad_norm": 2.6221907138824463, + "learning_rate": 3.6629550068481806e-06, + "loss": 0.4956, + "step": 4456 + }, + { + "epoch": 2.1073286052009457, + "grad_norm": 3.096370220184326, + "learning_rate": 3.6624027496127745e-06, + "loss": 0.3995, + "step": 4457 + }, + { + "epoch": 2.1078014184397165, + "grad_norm": 2.752885341644287, + "learning_rate": 3.661850419999647e-06, + "loss": 0.4838, + "step": 4458 + }, + { + "epoch": 2.108274231678487, + "grad_norm": 2.6806766986846924, + "learning_rate": 3.661298018043188e-06, + "loss": 0.4817, + "step": 4459 + }, + { + "epoch": 2.1087470449172576, + "grad_norm": 2.6317873001098633, + "learning_rate": 3.660745543777794e-06, + "loss": 0.4777, + "step": 4460 + }, + { + "epoch": 2.1092198581560284, + "grad_norm": 2.4939377307891846, + "learning_rate": 3.6601929972378634e-06, + "loss": 0.4525, + "step": 4461 + }, + { + "epoch": 2.109692671394799, + "grad_norm": 2.4902873039245605, + "learning_rate": 3.659640378457803e-06, + "loss": 0.4392, + "step": 4462 + }, + { + "epoch": 2.1101654846335696, + "grad_norm": 2.5082345008850098, + "learning_rate": 3.6590876874720216e-06, + "loss": 0.4224, + "step": 4463 + }, + { + "epoch": 2.1106382978723404, + "grad_norm": 2.658407211303711, + "learning_rate": 3.6585349243149313e-06, + "loss": 0.4316, + "step": 4464 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 2.562883138656616, + "learning_rate": 3.6579820890209515e-06, + "loss": 0.4491, + "step": 4465 + }, + { + "epoch": 2.111583924349882, + "grad_norm": 2.5719261169433594, + "learning_rate": 3.657429181624505e-06, + "loss": 0.4406, + "step": 4466 + }, + { + "epoch": 2.1120567375886523, + "grad_norm": 2.8840596675872803, + "learning_rate": 3.6568762021600184e-06, + "loss": 0.4267, + "step": 4467 + }, + { + "epoch": 2.112529550827423, + "grad_norm": 
2.660304546356201, + "learning_rate": 3.656323150661924e-06, + "loss": 0.4502, + "step": 4468 + }, + { + "epoch": 2.113002364066194, + "grad_norm": 2.610996961593628, + "learning_rate": 3.655770027164657e-06, + "loss": 0.3934, + "step": 4469 + }, + { + "epoch": 2.1134751773049647, + "grad_norm": 2.6000053882598877, + "learning_rate": 3.655216831702658e-06, + "loss": 0.4582, + "step": 4470 + }, + { + "epoch": 2.113947990543735, + "grad_norm": 2.73124098777771, + "learning_rate": 3.654663564310372e-06, + "loss": 0.4748, + "step": 4471 + }, + { + "epoch": 2.114420803782506, + "grad_norm": 2.711091995239258, + "learning_rate": 3.6541102250222495e-06, + "loss": 0.4145, + "step": 4472 + }, + { + "epoch": 2.1148936170212767, + "grad_norm": 2.655996561050415, + "learning_rate": 3.6535568138727438e-06, + "loss": 0.4407, + "step": 4473 + }, + { + "epoch": 2.1153664302600474, + "grad_norm": 2.7630865573883057, + "learning_rate": 3.653003330896313e-06, + "loss": 0.4298, + "step": 4474 + }, + { + "epoch": 2.115839243498818, + "grad_norm": 2.554415464401245, + "learning_rate": 3.6524497761274214e-06, + "loss": 0.44, + "step": 4475 + }, + { + "epoch": 2.1163120567375886, + "grad_norm": 2.790328025817871, + "learning_rate": 3.651896149600535e-06, + "loss": 0.5061, + "step": 4476 + }, + { + "epoch": 2.1167848699763594, + "grad_norm": 2.755267381668091, + "learning_rate": 3.651342451350127e-06, + "loss": 0.4588, + "step": 4477 + }, + { + "epoch": 2.11725768321513, + "grad_norm": 2.8936638832092285, + "learning_rate": 3.6507886814106722e-06, + "loss": 0.468, + "step": 4478 + }, + { + "epoch": 2.1177304964539005, + "grad_norm": 2.7394332885742188, + "learning_rate": 3.6502348398166525e-06, + "loss": 0.383, + "step": 4479 + }, + { + "epoch": 2.1182033096926713, + "grad_norm": 2.3359546661376953, + "learning_rate": 3.649680926602553e-06, + "loss": 0.3903, + "step": 4480 + }, + { + "epoch": 2.118676122931442, + "grad_norm": 3.102202892303467, + "learning_rate": 3.6491269418028637e-06, + "loss": 0.4525, + "step": 4481 + }, + { + "epoch": 2.119148936170213, + "grad_norm": 2.467970848083496, + "learning_rate": 3.648572885452078e-06, + "loss": 0.414, + "step": 4482 + }, + { + "epoch": 2.1196217494089833, + "grad_norm": 2.8984131813049316, + "learning_rate": 3.6480187575846952e-06, + "loss": 0.4571, + "step": 4483 + }, + { + "epoch": 2.120094562647754, + "grad_norm": 2.674834966659546, + "learning_rate": 3.6474645582352187e-06, + "loss": 0.455, + "step": 4484 + }, + { + "epoch": 2.120567375886525, + "grad_norm": 2.8713369369506836, + "learning_rate": 3.6469102874381552e-06, + "loss": 0.4567, + "step": 4485 + }, + { + "epoch": 2.1210401891252957, + "grad_norm": 3.174814462661743, + "learning_rate": 3.646355945228017e-06, + "loss": 0.5295, + "step": 4486 + }, + { + "epoch": 2.121513002364066, + "grad_norm": 2.6409823894500732, + "learning_rate": 3.6458015316393215e-06, + "loss": 0.4308, + "step": 4487 + }, + { + "epoch": 2.121985815602837, + "grad_norm": 2.4228954315185547, + "learning_rate": 3.645247046706588e-06, + "loss": 0.4042, + "step": 4488 + }, + { + "epoch": 2.1224586288416076, + "grad_norm": 2.553551435470581, + "learning_rate": 3.6446924904643427e-06, + "loss": 0.3925, + "step": 4489 + }, + { + "epoch": 2.1229314420803784, + "grad_norm": 2.8019237518310547, + "learning_rate": 3.6441378629471157e-06, + "loss": 0.4079, + "step": 4490 + }, + { + "epoch": 2.123404255319149, + "grad_norm": 2.993251085281372, + "learning_rate": 3.643583164189441e-06, + "loss": 0.4558, + "step": 4491 + }, + { + "epoch": 
2.1238770685579196, + "grad_norm": 2.4531471729278564, + "learning_rate": 3.643028394225857e-06, + "loss": 0.4167, + "step": 4492 + }, + { + "epoch": 2.1243498817966904, + "grad_norm": 2.6827852725982666, + "learning_rate": 3.6424735530909065e-06, + "loss": 0.4311, + "step": 4493 + }, + { + "epoch": 2.124822695035461, + "grad_norm": 3.1232128143310547, + "learning_rate": 3.6419186408191377e-06, + "loss": 0.4537, + "step": 4494 + }, + { + "epoch": 2.1252955082742315, + "grad_norm": 2.816348075866699, + "learning_rate": 3.641363657445103e-06, + "loss": 0.4869, + "step": 4495 + }, + { + "epoch": 2.1257683215130023, + "grad_norm": 2.6269683837890625, + "learning_rate": 3.6408086030033575e-06, + "loss": 0.4066, + "step": 4496 + }, + { + "epoch": 2.126241134751773, + "grad_norm": 4.6375956535339355, + "learning_rate": 3.640253477528462e-06, + "loss": 0.4488, + "step": 4497 + }, + { + "epoch": 2.126713947990544, + "grad_norm": 3.020970582962036, + "learning_rate": 3.639698281054983e-06, + "loss": 0.4197, + "step": 4498 + }, + { + "epoch": 2.1271867612293143, + "grad_norm": 2.87904691696167, + "learning_rate": 3.6391430136174892e-06, + "loss": 0.4743, + "step": 4499 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 2.719892978668213, + "learning_rate": 3.6385876752505554e-06, + "loss": 0.388, + "step": 4500 + }, + { + "epoch": 2.128132387706856, + "grad_norm": 2.7321808338165283, + "learning_rate": 3.638032265988759e-06, + "loss": 0.4857, + "step": 4501 + }, + { + "epoch": 2.1286052009456267, + "grad_norm": 2.700814723968506, + "learning_rate": 3.6374767858666836e-06, + "loss": 0.4819, + "step": 4502 + }, + { + "epoch": 2.129078014184397, + "grad_norm": 2.658423662185669, + "learning_rate": 3.6369212349189164e-06, + "loss": 0.4113, + "step": 4503 + }, + { + "epoch": 2.129550827423168, + "grad_norm": 2.673877716064453, + "learning_rate": 3.63636561318005e-06, + "loss": 0.3745, + "step": 4504 + }, + { + "epoch": 2.1300236406619386, + "grad_norm": 2.607758045196533, + "learning_rate": 3.6358099206846787e-06, + "loss": 0.4409, + "step": 4505 + }, + { + "epoch": 2.1304964539007094, + "grad_norm": 2.8117682933807373, + "learning_rate": 3.6352541574674044e-06, + "loss": 0.426, + "step": 4506 + }, + { + "epoch": 2.1309692671394798, + "grad_norm": 2.6970250606536865, + "learning_rate": 3.634698323562832e-06, + "loss": 0.4295, + "step": 4507 + }, + { + "epoch": 2.1314420803782506, + "grad_norm": 2.7133560180664062, + "learning_rate": 3.6341424190055696e-06, + "loss": 0.4443, + "step": 4508 + }, + { + "epoch": 2.1319148936170214, + "grad_norm": 2.57181715965271, + "learning_rate": 3.6335864438302328e-06, + "loss": 0.3995, + "step": 4509 + }, + { + "epoch": 2.132387706855792, + "grad_norm": 2.8618004322052, + "learning_rate": 3.633030398071438e-06, + "loss": 0.5075, + "step": 4510 + }, + { + "epoch": 2.1328605200945625, + "grad_norm": 2.7586729526519775, + "learning_rate": 3.6324742817638087e-06, + "loss": 0.4322, + "step": 4511 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 2.913256883621216, + "learning_rate": 3.631918094941972e-06, + "loss": 0.4708, + "step": 4512 + }, + { + "epoch": 2.133806146572104, + "grad_norm": 2.7715728282928467, + "learning_rate": 3.6313618376405585e-06, + "loss": 0.5194, + "step": 4513 + }, + { + "epoch": 2.134278959810875, + "grad_norm": 2.7986366748809814, + "learning_rate": 3.6308055098942042e-06, + "loss": 0.4419, + "step": 4514 + }, + { + "epoch": 2.1347517730496453, + "grad_norm": 3.043549060821533, + "learning_rate": 3.6302491117375492e-06, + "loss": 0.4441, + 
"step": 4515 + }, + { + "epoch": 2.135224586288416, + "grad_norm": 2.771761417388916, + "learning_rate": 3.629692643205238e-06, + "loss": 0.4752, + "step": 4516 + }, + { + "epoch": 2.135697399527187, + "grad_norm": 2.804941415786743, + "learning_rate": 3.6291361043319202e-06, + "loss": 0.4089, + "step": 4517 + }, + { + "epoch": 2.1361702127659576, + "grad_norm": 2.9897940158843994, + "learning_rate": 3.628579495152248e-06, + "loss": 0.4829, + "step": 4518 + }, + { + "epoch": 2.136643026004728, + "grad_norm": 2.9273486137390137, + "learning_rate": 3.6280228157008784e-06, + "loss": 0.4469, + "step": 4519 + }, + { + "epoch": 2.137115839243499, + "grad_norm": 2.584373950958252, + "learning_rate": 3.627466066012475e-06, + "loss": 0.4277, + "step": 4520 + }, + { + "epoch": 2.1375886524822696, + "grad_norm": 3.009333848953247, + "learning_rate": 3.626909246121703e-06, + "loss": 0.4025, + "step": 4521 + }, + { + "epoch": 2.1380614657210404, + "grad_norm": 2.634615659713745, + "learning_rate": 3.626352356063234e-06, + "loss": 0.4046, + "step": 4522 + }, + { + "epoch": 2.1385342789598107, + "grad_norm": 2.87310528755188, + "learning_rate": 3.625795395871743e-06, + "loss": 0.4426, + "step": 4523 + }, + { + "epoch": 2.1390070921985815, + "grad_norm": 2.94985032081604, + "learning_rate": 3.625238365581909e-06, + "loss": 0.445, + "step": 4524 + }, + { + "epoch": 2.1394799054373523, + "grad_norm": 2.470189332962036, + "learning_rate": 3.624681265228416e-06, + "loss": 0.4082, + "step": 4525 + }, + { + "epoch": 2.139952718676123, + "grad_norm": 2.5304040908813477, + "learning_rate": 3.624124094845952e-06, + "loss": 0.403, + "step": 4526 + }, + { + "epoch": 2.1404255319148935, + "grad_norm": 2.6148900985717773, + "learning_rate": 3.62356685446921e-06, + "loss": 0.3867, + "step": 4527 + }, + { + "epoch": 2.1408983451536643, + "grad_norm": 2.885549783706665, + "learning_rate": 3.623009544132886e-06, + "loss": 0.4706, + "step": 4528 + }, + { + "epoch": 2.141371158392435, + "grad_norm": 3.00490665435791, + "learning_rate": 3.6224521638716827e-06, + "loss": 0.4733, + "step": 4529 + }, + { + "epoch": 2.141843971631206, + "grad_norm": 2.925879716873169, + "learning_rate": 3.6218947137203043e-06, + "loss": 0.4581, + "step": 4530 + }, + { + "epoch": 2.1423167848699762, + "grad_norm": 3.10861873626709, + "learning_rate": 3.621337193713462e-06, + "loss": 0.4579, + "step": 4531 + }, + { + "epoch": 2.142789598108747, + "grad_norm": 2.7386577129364014, + "learning_rate": 3.6207796038858693e-06, + "loss": 0.4248, + "step": 4532 + }, + { + "epoch": 2.143262411347518, + "grad_norm": 2.601836681365967, + "learning_rate": 3.6202219442722453e-06, + "loss": 0.4928, + "step": 4533 + }, + { + "epoch": 2.1437352245862886, + "grad_norm": 2.598778247833252, + "learning_rate": 3.6196642149073123e-06, + "loss": 0.4415, + "step": 4534 + }, + { + "epoch": 2.144208037825059, + "grad_norm": 2.443995714187622, + "learning_rate": 3.619106415825798e-06, + "loss": 0.3917, + "step": 4535 + }, + { + "epoch": 2.1446808510638298, + "grad_norm": 2.84643816947937, + "learning_rate": 3.6185485470624354e-06, + "loss": 0.4162, + "step": 4536 + }, + { + "epoch": 2.1451536643026006, + "grad_norm": 2.4568188190460205, + "learning_rate": 3.617990608651959e-06, + "loss": 0.4298, + "step": 4537 + }, + { + "epoch": 2.145626477541371, + "grad_norm": 2.968804359436035, + "learning_rate": 3.61743260062911e-06, + "loss": 0.4696, + "step": 4538 + }, + { + "epoch": 2.1460992907801417, + "grad_norm": 2.629075288772583, + "learning_rate": 3.6168745230286327e-06, + 
"loss": 0.4234, + "step": 4539 + }, + { + "epoch": 2.1465721040189125, + "grad_norm": 2.7680578231811523, + "learning_rate": 3.6163163758852754e-06, + "loss": 0.4669, + "step": 4540 + }, + { + "epoch": 2.1470449172576833, + "grad_norm": 2.782825469970703, + "learning_rate": 3.615758159233793e-06, + "loss": 0.4552, + "step": 4541 + }, + { + "epoch": 2.147517730496454, + "grad_norm": 2.653047561645508, + "learning_rate": 3.615199873108942e-06, + "loss": 0.4393, + "step": 4542 + }, + { + "epoch": 2.1479905437352245, + "grad_norm": 2.4175806045532227, + "learning_rate": 3.6146415175454852e-06, + "loss": 0.4114, + "step": 4543 + }, + { + "epoch": 2.1484633569739953, + "grad_norm": 2.627943515777588, + "learning_rate": 3.614083092578189e-06, + "loss": 0.4215, + "step": 4544 + }, + { + "epoch": 2.148936170212766, + "grad_norm": 2.8934123516082764, + "learning_rate": 3.6135245982418227e-06, + "loss": 0.4815, + "step": 4545 + }, + { + "epoch": 2.1494089834515364, + "grad_norm": 2.8535244464874268, + "learning_rate": 3.612966034571164e-06, + "loss": 0.4683, + "step": 4546 + }, + { + "epoch": 2.149881796690307, + "grad_norm": 2.7826647758483887, + "learning_rate": 3.6124074016009893e-06, + "loss": 0.4351, + "step": 4547 + }, + { + "epoch": 2.150354609929078, + "grad_norm": 2.6906018257141113, + "learning_rate": 3.6118486993660834e-06, + "loss": 0.4585, + "step": 4548 + }, + { + "epoch": 2.150827423167849, + "grad_norm": 2.726766586303711, + "learning_rate": 3.6112899279012346e-06, + "loss": 0.4753, + "step": 4549 + }, + { + "epoch": 2.1513002364066196, + "grad_norm": 3.0193991661071777, + "learning_rate": 3.6107310872412348e-06, + "loss": 0.4827, + "step": 4550 + }, + { + "epoch": 2.15177304964539, + "grad_norm": 2.6788697242736816, + "learning_rate": 3.610172177420881e-06, + "loss": 0.4333, + "step": 4551 + }, + { + "epoch": 2.1522458628841608, + "grad_norm": 2.865410327911377, + "learning_rate": 3.609613198474973e-06, + "loss": 0.4569, + "step": 4552 + }, + { + "epoch": 2.1527186761229316, + "grad_norm": 2.9199366569519043, + "learning_rate": 3.609054150438317e-06, + "loss": 0.5097, + "step": 4553 + }, + { + "epoch": 2.153191489361702, + "grad_norm": 2.761035203933716, + "learning_rate": 3.6084950333457215e-06, + "loss": 0.5002, + "step": 4554 + }, + { + "epoch": 2.1536643026004727, + "grad_norm": 2.514223337173462, + "learning_rate": 3.607935847232002e-06, + "loss": 0.4171, + "step": 4555 + }, + { + "epoch": 2.1541371158392435, + "grad_norm": 2.5167524814605713, + "learning_rate": 3.6073765921319747e-06, + "loss": 0.4494, + "step": 4556 + }, + { + "epoch": 2.1546099290780143, + "grad_norm": 2.7540643215179443, + "learning_rate": 3.606817268080463e-06, + "loss": 0.4472, + "step": 4557 + }, + { + "epoch": 2.155082742316785, + "grad_norm": 2.7728664875030518, + "learning_rate": 3.6062578751122936e-06, + "loss": 0.4669, + "step": 4558 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 2.7788400650024414, + "learning_rate": 3.605698413262296e-06, + "loss": 0.4613, + "step": 4559 + }, + { + "epoch": 2.1560283687943262, + "grad_norm": 2.7811810970306396, + "learning_rate": 3.605138882565308e-06, + "loss": 0.4242, + "step": 4560 + }, + { + "epoch": 2.156501182033097, + "grad_norm": 2.7819995880126953, + "learning_rate": 3.6045792830561664e-06, + "loss": 0.443, + "step": 4561 + }, + { + "epoch": 2.1569739952718674, + "grad_norm": 2.671259641647339, + "learning_rate": 3.6040196147697166e-06, + "loss": 0.4336, + "step": 4562 + }, + { + "epoch": 2.157446808510638, + "grad_norm": 2.9296300411224365, + 
"learning_rate": 3.603459877740807e-06, + "loss": 0.479, + "step": 4563 + }, + { + "epoch": 2.157919621749409, + "grad_norm": 2.834937334060669, + "learning_rate": 3.602900072004289e-06, + "loss": 0.4603, + "step": 4564 + }, + { + "epoch": 2.15839243498818, + "grad_norm": 2.8434760570526123, + "learning_rate": 3.602340197595019e-06, + "loss": 0.4288, + "step": 4565 + }, + { + "epoch": 2.1588652482269506, + "grad_norm": 2.7245426177978516, + "learning_rate": 3.6017802545478593e-06, + "loss": 0.4194, + "step": 4566 + }, + { + "epoch": 2.159338061465721, + "grad_norm": 2.7795023918151855, + "learning_rate": 3.6012202428976735e-06, + "loss": 0.4481, + "step": 4567 + }, + { + "epoch": 2.1598108747044917, + "grad_norm": 2.9482083320617676, + "learning_rate": 3.6006601626793325e-06, + "loss": 0.468, + "step": 4568 + }, + { + "epoch": 2.1602836879432625, + "grad_norm": 2.9563326835632324, + "learning_rate": 3.6001000139277094e-06, + "loss": 0.4427, + "step": 4569 + }, + { + "epoch": 2.160756501182033, + "grad_norm": 2.7755916118621826, + "learning_rate": 3.599539796677682e-06, + "loss": 0.4258, + "step": 4570 + }, + { + "epoch": 2.1612293144208037, + "grad_norm": 2.961045265197754, + "learning_rate": 3.5989795109641333e-06, + "loss": 0.4645, + "step": 4571 + }, + { + "epoch": 2.1617021276595745, + "grad_norm": 3.0184407234191895, + "learning_rate": 3.5984191568219482e-06, + "loss": 0.4192, + "step": 4572 + }, + { + "epoch": 2.1621749408983453, + "grad_norm": 2.9811131954193115, + "learning_rate": 3.5978587342860192e-06, + "loss": 0.408, + "step": 4573 + }, + { + "epoch": 2.162647754137116, + "grad_norm": 2.9172329902648926, + "learning_rate": 3.597298243391242e-06, + "loss": 0.4528, + "step": 4574 + }, + { + "epoch": 2.1631205673758864, + "grad_norm": 2.7798452377319336, + "learning_rate": 3.596737684172513e-06, + "loss": 0.391, + "step": 4575 + }, + { + "epoch": 2.1635933806146572, + "grad_norm": 2.526277542114258, + "learning_rate": 3.596177056664738e-06, + "loss": 0.3699, + "step": 4576 + }, + { + "epoch": 2.164066193853428, + "grad_norm": 2.856269121170044, + "learning_rate": 3.5956163609028244e-06, + "loss": 0.4082, + "step": 4577 + }, + { + "epoch": 2.1645390070921984, + "grad_norm": 2.7681572437286377, + "learning_rate": 3.5950555969216845e-06, + "loss": 0.4064, + "step": 4578 + }, + { + "epoch": 2.165011820330969, + "grad_norm": 2.2924954891204834, + "learning_rate": 3.5944947647562333e-06, + "loss": 0.416, + "step": 4579 + }, + { + "epoch": 2.16548463356974, + "grad_norm": 2.439929485321045, + "learning_rate": 3.5939338644413936e-06, + "loss": 0.4476, + "step": 4580 + }, + { + "epoch": 2.1659574468085108, + "grad_norm": 2.786442518234253, + "learning_rate": 3.5933728960120877e-06, + "loss": 0.4525, + "step": 4581 + }, + { + "epoch": 2.166430260047281, + "grad_norm": 2.5910253524780273, + "learning_rate": 3.5928118595032465e-06, + "loss": 0.4441, + "step": 4582 + }, + { + "epoch": 2.166903073286052, + "grad_norm": 2.8144876956939697, + "learning_rate": 3.5922507549498024e-06, + "loss": 0.497, + "step": 4583 + }, + { + "epoch": 2.1673758865248227, + "grad_norm": 2.5714170932769775, + "learning_rate": 3.591689582386694e-06, + "loss": 0.4625, + "step": 4584 + }, + { + "epoch": 2.1678486997635935, + "grad_norm": 2.878187894821167, + "learning_rate": 3.591128341848861e-06, + "loss": 0.4835, + "step": 4585 + }, + { + "epoch": 2.168321513002364, + "grad_norm": 2.4946508407592773, + "learning_rate": 3.5905670333712504e-06, + "loss": 0.4278, + "step": 4586 + }, + { + "epoch": 2.1687943262411347, + 
"grad_norm": 2.9186196327209473, + "learning_rate": 3.590005656988814e-06, + "loss": 0.465, + "step": 4587 + }, + { + "epoch": 2.1692671394799055, + "grad_norm": 3.136807441711426, + "learning_rate": 3.5894442127365046e-06, + "loss": 0.4146, + "step": 4588 + }, + { + "epoch": 2.1697399527186763, + "grad_norm": 2.8106343746185303, + "learning_rate": 3.5888827006492804e-06, + "loss": 0.4737, + "step": 4589 + }, + { + "epoch": 2.1702127659574466, + "grad_norm": 2.874553680419922, + "learning_rate": 3.5883211207621047e-06, + "loss": 0.3962, + "step": 4590 + }, + { + "epoch": 2.1706855791962174, + "grad_norm": 2.7914116382598877, + "learning_rate": 3.587759473109946e-06, + "loss": 0.4705, + "step": 4591 + }, + { + "epoch": 2.171158392434988, + "grad_norm": 2.7273290157318115, + "learning_rate": 3.5871977577277745e-06, + "loss": 0.4827, + "step": 4592 + }, + { + "epoch": 2.171631205673759, + "grad_norm": 2.4167256355285645, + "learning_rate": 3.5866359746505653e-06, + "loss": 0.4181, + "step": 4593 + }, + { + "epoch": 2.1721040189125294, + "grad_norm": 2.8929779529571533, + "learning_rate": 3.586074123913299e-06, + "loss": 0.4006, + "step": 4594 + }, + { + "epoch": 2.1725768321513, + "grad_norm": 2.6996190547943115, + "learning_rate": 3.5855122055509593e-06, + "loss": 0.4792, + "step": 4595 + }, + { + "epoch": 2.173049645390071, + "grad_norm": 2.9341464042663574, + "learning_rate": 3.584950219598534e-06, + "loss": 0.3903, + "step": 4596 + }, + { + "epoch": 2.1735224586288417, + "grad_norm": 2.799330234527588, + "learning_rate": 3.5843881660910166e-06, + "loss": 0.4717, + "step": 4597 + }, + { + "epoch": 2.173995271867612, + "grad_norm": 2.5028693675994873, + "learning_rate": 3.5838260450634028e-06, + "loss": 0.4462, + "step": 4598 + }, + { + "epoch": 2.174468085106383, + "grad_norm": 2.5845541954040527, + "learning_rate": 3.583263856550693e-06, + "loss": 0.4327, + "step": 4599 + }, + { + "epoch": 2.1749408983451537, + "grad_norm": 2.4804906845092773, + "learning_rate": 3.5827016005878933e-06, + "loss": 0.4555, + "step": 4600 + }, + { + "epoch": 2.1754137115839245, + "grad_norm": 2.625746011734009, + "learning_rate": 3.5821392772100125e-06, + "loss": 0.455, + "step": 4601 + }, + { + "epoch": 2.175886524822695, + "grad_norm": 2.6230757236480713, + "learning_rate": 3.581576886452064e-06, + "loss": 0.4422, + "step": 4602 + }, + { + "epoch": 2.1763593380614656, + "grad_norm": 3.3104100227355957, + "learning_rate": 3.5810144283490656e-06, + "loss": 0.4212, + "step": 4603 + }, + { + "epoch": 2.1768321513002364, + "grad_norm": 2.6799755096435547, + "learning_rate": 3.5804519029360384e-06, + "loss": 0.4575, + "step": 4604 + }, + { + "epoch": 2.1773049645390072, + "grad_norm": 2.462216854095459, + "learning_rate": 3.5798893102480085e-06, + "loss": 0.4096, + "step": 4605 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 2.8600878715515137, + "learning_rate": 3.5793266503200074e-06, + "loss": 0.4798, + "step": 4606 + }, + { + "epoch": 2.1782505910165484, + "grad_norm": 2.935746431350708, + "learning_rate": 3.5787639231870673e-06, + "loss": 0.4021, + "step": 4607 + }, + { + "epoch": 2.178723404255319, + "grad_norm": 2.8655526638031006, + "learning_rate": 3.578201128884229e-06, + "loss": 0.4553, + "step": 4608 + }, + { + "epoch": 2.17919621749409, + "grad_norm": 3.219498634338379, + "learning_rate": 3.577638267446533e-06, + "loss": 0.4692, + "step": 4609 + }, + { + "epoch": 2.1796690307328603, + "grad_norm": 3.0449860095977783, + "learning_rate": 3.5770753389090283e-06, + "loss": 0.4675, + "step": 4610 + 
}, + { + "epoch": 2.180141843971631, + "grad_norm": 2.7045507431030273, + "learning_rate": 3.576512343306765e-06, + "loss": 0.4773, + "step": 4611 + }, + { + "epoch": 2.180614657210402, + "grad_norm": 2.601499557495117, + "learning_rate": 3.5759492806747985e-06, + "loss": 0.4112, + "step": 4612 + }, + { + "epoch": 2.1810874704491727, + "grad_norm": 2.987741470336914, + "learning_rate": 3.575386151048188e-06, + "loss": 0.4651, + "step": 4613 + }, + { + "epoch": 2.181560283687943, + "grad_norm": 2.961228847503662, + "learning_rate": 3.5748229544619973e-06, + "loss": 0.5116, + "step": 4614 + }, + { + "epoch": 2.182033096926714, + "grad_norm": 2.8008430004119873, + "learning_rate": 3.574259690951295e-06, + "loss": 0.4152, + "step": 4615 + }, + { + "epoch": 2.1825059101654847, + "grad_norm": 2.5429348945617676, + "learning_rate": 3.573696360551151e-06, + "loss": 0.4188, + "step": 4616 + }, + { + "epoch": 2.1829787234042555, + "grad_norm": 2.9566478729248047, + "learning_rate": 3.5731329632966428e-06, + "loss": 0.5156, + "step": 4617 + }, + { + "epoch": 2.183451536643026, + "grad_norm": 2.5302467346191406, + "learning_rate": 3.572569499222851e-06, + "loss": 0.4361, + "step": 4618 + }, + { + "epoch": 2.1839243498817966, + "grad_norm": 3.206803560256958, + "learning_rate": 3.5720059683648593e-06, + "loss": 0.5149, + "step": 4619 + }, + { + "epoch": 2.1843971631205674, + "grad_norm": 2.9432034492492676, + "learning_rate": 3.5714423707577573e-06, + "loss": 0.4411, + "step": 4620 + }, + { + "epoch": 2.184869976359338, + "grad_norm": 2.9412078857421875, + "learning_rate": 3.5708787064366358e-06, + "loss": 0.4372, + "step": 4621 + }, + { + "epoch": 2.1853427895981086, + "grad_norm": 3.1702330112457275, + "learning_rate": 3.5703149754365935e-06, + "loss": 0.4761, + "step": 4622 + }, + { + "epoch": 2.1858156028368794, + "grad_norm": 3.1240456104278564, + "learning_rate": 3.569751177792731e-06, + "loss": 0.4854, + "step": 4623 + }, + { + "epoch": 2.18628841607565, + "grad_norm": 2.7221994400024414, + "learning_rate": 3.5691873135401534e-06, + "loss": 0.4048, + "step": 4624 + }, + { + "epoch": 2.186761229314421, + "grad_norm": 2.74397873878479, + "learning_rate": 3.5686233827139695e-06, + "loss": 0.4747, + "step": 4625 + }, + { + "epoch": 2.1872340425531913, + "grad_norm": 2.7379889488220215, + "learning_rate": 3.5680593853492932e-06, + "loss": 0.4963, + "step": 4626 + }, + { + "epoch": 2.187706855791962, + "grad_norm": 3.040205478668213, + "learning_rate": 3.5674953214812435e-06, + "loss": 0.4917, + "step": 4627 + }, + { + "epoch": 2.188179669030733, + "grad_norm": 2.95302677154541, + "learning_rate": 3.56693119114494e-06, + "loss": 0.4758, + "step": 4628 + }, + { + "epoch": 2.1886524822695037, + "grad_norm": 2.5488312244415283, + "learning_rate": 3.56636699437551e-06, + "loss": 0.4057, + "step": 4629 + }, + { + "epoch": 2.189125295508274, + "grad_norm": 2.8379666805267334, + "learning_rate": 3.565802731208083e-06, + "loss": 0.4755, + "step": 4630 + }, + { + "epoch": 2.189598108747045, + "grad_norm": 2.8765869140625, + "learning_rate": 3.565238401677793e-06, + "loss": 0.4232, + "step": 4631 + }, + { + "epoch": 2.1900709219858157, + "grad_norm": 2.9091262817382812, + "learning_rate": 3.5646740058197784e-06, + "loss": 0.3874, + "step": 4632 + }, + { + "epoch": 2.1905437352245865, + "grad_norm": 2.7067387104034424, + "learning_rate": 3.5641095436691826e-06, + "loss": 0.4771, + "step": 4633 + }, + { + "epoch": 2.191016548463357, + "grad_norm": 2.403043508529663, + "learning_rate": 3.563545015261151e-06, + 
"loss": 0.4062, + "step": 4634 + }, + { + "epoch": 2.1914893617021276, + "grad_norm": 2.8059732913970947, + "learning_rate": 3.562980420630836e-06, + "loss": 0.4635, + "step": 4635 + }, + { + "epoch": 2.1919621749408984, + "grad_norm": 2.5467724800109863, + "learning_rate": 3.56241575981339e-06, + "loss": 0.4552, + "step": 4636 + }, + { + "epoch": 2.192434988179669, + "grad_norm": 2.651024103164673, + "learning_rate": 3.561851032843973e-06, + "loss": 0.38, + "step": 4637 + }, + { + "epoch": 2.1929078014184396, + "grad_norm": 2.5529849529266357, + "learning_rate": 3.5612862397577496e-06, + "loss": 0.4106, + "step": 4638 + }, + { + "epoch": 2.1933806146572103, + "grad_norm": 3.069258451461792, + "learning_rate": 3.5607213805898844e-06, + "loss": 0.461, + "step": 4639 + }, + { + "epoch": 2.193853427895981, + "grad_norm": 2.5652637481689453, + "learning_rate": 3.56015645537555e-06, + "loss": 0.4497, + "step": 4640 + }, + { + "epoch": 2.194326241134752, + "grad_norm": 2.699101209640503, + "learning_rate": 3.5595914641499224e-06, + "loss": 0.4887, + "step": 4641 + }, + { + "epoch": 2.1947990543735223, + "grad_norm": 2.9292235374450684, + "learning_rate": 3.5590264069481805e-06, + "loss": 0.4462, + "step": 4642 + }, + { + "epoch": 2.195271867612293, + "grad_norm": 2.6151106357574463, + "learning_rate": 3.5584612838055077e-06, + "loss": 0.4334, + "step": 4643 + }, + { + "epoch": 2.195744680851064, + "grad_norm": 2.895798444747925, + "learning_rate": 3.5578960947570923e-06, + "loss": 0.4448, + "step": 4644 + }, + { + "epoch": 2.1962174940898347, + "grad_norm": 2.627631425857544, + "learning_rate": 3.557330839838125e-06, + "loss": 0.436, + "step": 4645 + }, + { + "epoch": 2.196690307328605, + "grad_norm": 2.8803584575653076, + "learning_rate": 3.556765519083803e-06, + "loss": 0.4698, + "step": 4646 + }, + { + "epoch": 2.197163120567376, + "grad_norm": 2.436609983444214, + "learning_rate": 3.5562001325293265e-06, + "loss": 0.4043, + "step": 4647 + }, + { + "epoch": 2.1976359338061466, + "grad_norm": 2.5090718269348145, + "learning_rate": 3.5556346802098985e-06, + "loss": 0.4505, + "step": 4648 + }, + { + "epoch": 2.1981087470449174, + "grad_norm": 2.792783737182617, + "learning_rate": 3.5550691621607277e-06, + "loss": 0.43, + "step": 4649 + }, + { + "epoch": 2.198581560283688, + "grad_norm": 2.74153470993042, + "learning_rate": 3.554503578417026e-06, + "loss": 0.4496, + "step": 4650 + }, + { + "epoch": 2.1990543735224586, + "grad_norm": 3.0262627601623535, + "learning_rate": 3.5539379290140114e-06, + "loss": 0.4503, + "step": 4651 + }, + { + "epoch": 2.1995271867612294, + "grad_norm": 2.783811330795288, + "learning_rate": 3.553372213986903e-06, + "loss": 0.432, + "step": 4652 + }, + { + "epoch": 2.2, + "grad_norm": 3.091191053390503, + "learning_rate": 3.5528064333709255e-06, + "loss": 0.4658, + "step": 4653 + }, + { + "epoch": 2.2004728132387705, + "grad_norm": 2.814634084701538, + "learning_rate": 3.5522405872013076e-06, + "loss": 0.4473, + "step": 4654 + }, + { + "epoch": 2.2009456264775413, + "grad_norm": 2.6918299198150635, + "learning_rate": 3.5516746755132824e-06, + "loss": 0.5323, + "step": 4655 + }, + { + "epoch": 2.201418439716312, + "grad_norm": 2.9902455806732178, + "learning_rate": 3.5511086983420867e-06, + "loss": 0.5166, + "step": 4656 + }, + { + "epoch": 2.201891252955083, + "grad_norm": 2.932699203491211, + "learning_rate": 3.5505426557229616e-06, + "loss": 0.5197, + "step": 4657 + }, + { + "epoch": 2.2023640661938533, + "grad_norm": 2.585712432861328, + "learning_rate": 
3.549976547691152e-06, + "loss": 0.425, + "step": 4658 + }, + { + "epoch": 2.202836879432624, + "grad_norm": 3.1019949913024902, + "learning_rate": 3.5494103742819065e-06, + "loss": 0.485, + "step": 4659 + }, + { + "epoch": 2.203309692671395, + "grad_norm": 2.3169195652008057, + "learning_rate": 3.548844135530478e-06, + "loss": 0.4064, + "step": 4660 + }, + { + "epoch": 2.2037825059101657, + "grad_norm": 2.779240846633911, + "learning_rate": 3.5482778314721257e-06, + "loss": 0.427, + "step": 4661 + }, + { + "epoch": 2.204255319148936, + "grad_norm": 2.765423059463501, + "learning_rate": 3.5477114621421078e-06, + "loss": 0.5125, + "step": 4662 + }, + { + "epoch": 2.204728132387707, + "grad_norm": 2.5590033531188965, + "learning_rate": 3.5471450275756913e-06, + "loss": 0.4009, + "step": 4663 + }, + { + "epoch": 2.2052009456264776, + "grad_norm": 2.706068515777588, + "learning_rate": 3.546578527808146e-06, + "loss": 0.4604, + "step": 4664 + }, + { + "epoch": 2.2056737588652484, + "grad_norm": 2.7995102405548096, + "learning_rate": 3.546011962874745e-06, + "loss": 0.4088, + "step": 4665 + }, + { + "epoch": 2.2061465721040188, + "grad_norm": 2.6369729042053223, + "learning_rate": 3.5454453328107656e-06, + "loss": 0.4634, + "step": 4666 + }, + { + "epoch": 2.2066193853427896, + "grad_norm": 3.1426475048065186, + "learning_rate": 3.54487863765149e-06, + "loss": 0.4761, + "step": 4667 + }, + { + "epoch": 2.2070921985815604, + "grad_norm": 2.7739460468292236, + "learning_rate": 3.5443118774322027e-06, + "loss": 0.467, + "step": 4668 + }, + { + "epoch": 2.207565011820331, + "grad_norm": 2.559105157852173, + "learning_rate": 3.5437450521881934e-06, + "loss": 0.4268, + "step": 4669 + }, + { + "epoch": 2.2080378250591015, + "grad_norm": 2.726593017578125, + "learning_rate": 3.543178161954758e-06, + "loss": 0.462, + "step": 4670 + }, + { + "epoch": 2.2085106382978723, + "grad_norm": 2.796109199523926, + "learning_rate": 3.5426112067671907e-06, + "loss": 0.4571, + "step": 4671 + }, + { + "epoch": 2.208983451536643, + "grad_norm": 2.7989072799682617, + "learning_rate": 3.5420441866607964e-06, + "loss": 0.4648, + "step": 4672 + }, + { + "epoch": 2.209456264775414, + "grad_norm": 2.6750967502593994, + "learning_rate": 3.5414771016708795e-06, + "loss": 0.4717, + "step": 4673 + }, + { + "epoch": 2.2099290780141843, + "grad_norm": 2.705659866333008, + "learning_rate": 3.5409099518327507e-06, + "loss": 0.4738, + "step": 4674 + }, + { + "epoch": 2.210401891252955, + "grad_norm": 2.79276442527771, + "learning_rate": 3.5403427371817234e-06, + "loss": 0.4625, + "step": 4675 + }, + { + "epoch": 2.210874704491726, + "grad_norm": 2.781339406967163, + "learning_rate": 3.539775457753115e-06, + "loss": 0.438, + "step": 4676 + }, + { + "epoch": 2.2113475177304966, + "grad_norm": 3.0088918209075928, + "learning_rate": 3.5392081135822488e-06, + "loss": 0.4776, + "step": 4677 + }, + { + "epoch": 2.211820330969267, + "grad_norm": 3.0291390419006348, + "learning_rate": 3.538640704704449e-06, + "loss": 0.4634, + "step": 4678 + }, + { + "epoch": 2.212293144208038, + "grad_norm": 2.967867374420166, + "learning_rate": 3.5380732311550477e-06, + "loss": 0.4776, + "step": 4679 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 2.6268832683563232, + "learning_rate": 3.5375056929693787e-06, + "loss": 0.4646, + "step": 4680 + }, + { + "epoch": 2.2132387706855794, + "grad_norm": 2.6688554286956787, + "learning_rate": 3.536938090182778e-06, + "loss": 0.3975, + "step": 4681 + }, + { + "epoch": 2.2137115839243497, + "grad_norm": 
3.0079736709594727, + "learning_rate": 3.5363704228305906e-06, + "loss": 0.4724, + "step": 4682 + }, + { + "epoch": 2.2141843971631205, + "grad_norm": 2.4287586212158203, + "learning_rate": 3.535802690948161e-06, + "loss": 0.4371, + "step": 4683 + }, + { + "epoch": 2.2146572104018913, + "grad_norm": 2.960679531097412, + "learning_rate": 3.53523489457084e-06, + "loss": 0.4347, + "step": 4684 + }, + { + "epoch": 2.215130023640662, + "grad_norm": 2.9646008014678955, + "learning_rate": 3.5346670337339807e-06, + "loss": 0.4803, + "step": 4685 + }, + { + "epoch": 2.2156028368794325, + "grad_norm": 3.0518898963928223, + "learning_rate": 3.534099108472942e-06, + "loss": 0.4712, + "step": 4686 + }, + { + "epoch": 2.2160756501182033, + "grad_norm": 2.776681900024414, + "learning_rate": 3.533531118823086e-06, + "loss": 0.4347, + "step": 4687 + }, + { + "epoch": 2.216548463356974, + "grad_norm": 2.18019437789917, + "learning_rate": 3.53296306481978e-06, + "loss": 0.3551, + "step": 4688 + }, + { + "epoch": 2.217021276595745, + "grad_norm": 2.9400811195373535, + "learning_rate": 3.5323949464983937e-06, + "loss": 0.4912, + "step": 4689 + }, + { + "epoch": 2.2174940898345152, + "grad_norm": 2.798386812210083, + "learning_rate": 3.5318267638943e-06, + "loss": 0.3967, + "step": 4690 + }, + { + "epoch": 2.217966903073286, + "grad_norm": 2.5452775955200195, + "learning_rate": 3.531258517042879e-06, + "loss": 0.3773, + "step": 4691 + }, + { + "epoch": 2.218439716312057, + "grad_norm": 2.711137294769287, + "learning_rate": 3.5306902059795113e-06, + "loss": 0.4123, + "step": 4692 + }, + { + "epoch": 2.2189125295508276, + "grad_norm": 3.0022387504577637, + "learning_rate": 3.530121830739584e-06, + "loss": 0.4898, + "step": 4693 + }, + { + "epoch": 2.219385342789598, + "grad_norm": 2.871814250946045, + "learning_rate": 3.5295533913584877e-06, + "loss": 0.4497, + "step": 4694 + }, + { + "epoch": 2.219858156028369, + "grad_norm": 2.9782521724700928, + "learning_rate": 3.528984887871616e-06, + "loss": 0.4797, + "step": 4695 + }, + { + "epoch": 2.2203309692671396, + "grad_norm": 2.6896398067474365, + "learning_rate": 3.5284163203143673e-06, + "loss": 0.439, + "step": 4696 + }, + { + "epoch": 2.2208037825059104, + "grad_norm": 2.7898833751678467, + "learning_rate": 3.5278476887221436e-06, + "loss": 0.4656, + "step": 4697 + }, + { + "epoch": 2.2212765957446807, + "grad_norm": 2.800416946411133, + "learning_rate": 3.527278993130352e-06, + "loss": 0.4452, + "step": 4698 + }, + { + "epoch": 2.2217494089834515, + "grad_norm": 3.653228998184204, + "learning_rate": 3.526710233574401e-06, + "loss": 0.4189, + "step": 4699 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 2.856956958770752, + "learning_rate": 3.5261414100897064e-06, + "loss": 0.4298, + "step": 4700 + }, + { + "epoch": 2.222695035460993, + "grad_norm": 2.8576223850250244, + "learning_rate": 3.5255725227116854e-06, + "loss": 0.4425, + "step": 4701 + }, + { + "epoch": 2.2231678486997635, + "grad_norm": 3.1161351203918457, + "learning_rate": 3.5250035714757603e-06, + "loss": 0.4609, + "step": 4702 + }, + { + "epoch": 2.2236406619385343, + "grad_norm": 2.843379259109497, + "learning_rate": 3.5244345564173578e-06, + "loss": 0.3589, + "step": 4703 + }, + { + "epoch": 2.224113475177305, + "grad_norm": 2.877157211303711, + "learning_rate": 3.5238654775719068e-06, + "loss": 0.4591, + "step": 4704 + }, + { + "epoch": 2.2245862884160754, + "grad_norm": 3.488954782485962, + "learning_rate": 3.5232963349748424e-06, + "loss": 0.4836, + "step": 4705 + }, + { + "epoch": 
2.225059101654846, + "grad_norm": 2.929037570953369, + "learning_rate": 3.5227271286616025e-06, + "loss": 0.5293, + "step": 4706 + }, + { + "epoch": 2.225531914893617, + "grad_norm": 2.6230576038360596, + "learning_rate": 3.5221578586676286e-06, + "loss": 0.4235, + "step": 4707 + }, + { + "epoch": 2.226004728132388, + "grad_norm": 2.529998302459717, + "learning_rate": 3.5215885250283664e-06, + "loss": 0.4369, + "step": 4708 + }, + { + "epoch": 2.2264775413711586, + "grad_norm": 2.817279577255249, + "learning_rate": 3.521019127779267e-06, + "loss": 0.481, + "step": 4709 + }, + { + "epoch": 2.226950354609929, + "grad_norm": 3.1513843536376953, + "learning_rate": 3.5204496669557833e-06, + "loss": 0.463, + "step": 4710 + }, + { + "epoch": 2.2274231678486998, + "grad_norm": 2.9403610229492188, + "learning_rate": 3.5198801425933725e-06, + "loss": 0.455, + "step": 4711 + }, + { + "epoch": 2.2278959810874706, + "grad_norm": 2.648346424102783, + "learning_rate": 3.5193105547274987e-06, + "loss": 0.4441, + "step": 4712 + }, + { + "epoch": 2.228368794326241, + "grad_norm": 2.791898727416992, + "learning_rate": 3.5187409033936252e-06, + "loss": 0.4682, + "step": 4713 + }, + { + "epoch": 2.2288416075650117, + "grad_norm": 2.8157432079315186, + "learning_rate": 3.5181711886272242e-06, + "loss": 0.4572, + "step": 4714 + }, + { + "epoch": 2.2293144208037825, + "grad_norm": 3.250319480895996, + "learning_rate": 3.5176014104637665e-06, + "loss": 0.4599, + "step": 4715 + }, + { + "epoch": 2.2297872340425533, + "grad_norm": 2.6747050285339355, + "learning_rate": 3.5170315689387307e-06, + "loss": 0.4328, + "step": 4716 + }, + { + "epoch": 2.230260047281324, + "grad_norm": 2.584094762802124, + "learning_rate": 3.5164616640875993e-06, + "loss": 0.4268, + "step": 4717 + }, + { + "epoch": 2.2307328605200945, + "grad_norm": 2.480710506439209, + "learning_rate": 3.5158916959458573e-06, + "loss": 0.438, + "step": 4718 + }, + { + "epoch": 2.2312056737588652, + "grad_norm": 2.9338483810424805, + "learning_rate": 3.515321664548993e-06, + "loss": 0.4937, + "step": 4719 + }, + { + "epoch": 2.231678486997636, + "grad_norm": 2.7880783081054688, + "learning_rate": 3.5147515699325013e-06, + "loss": 0.4624, + "step": 4720 + }, + { + "epoch": 2.2321513002364064, + "grad_norm": 2.740841865539551, + "learning_rate": 3.5141814121318797e-06, + "loss": 0.3689, + "step": 4721 + }, + { + "epoch": 2.232624113475177, + "grad_norm": 2.9541244506835938, + "learning_rate": 3.5136111911826277e-06, + "loss": 0.4092, + "step": 4722 + }, + { + "epoch": 2.233096926713948, + "grad_norm": 2.7205398082733154, + "learning_rate": 3.5130409071202515e-06, + "loss": 0.445, + "step": 4723 + }, + { + "epoch": 2.233569739952719, + "grad_norm": 2.563406229019165, + "learning_rate": 3.51247055998026e-06, + "loss": 0.4335, + "step": 4724 + }, + { + "epoch": 2.2340425531914896, + "grad_norm": 2.4249489307403564, + "learning_rate": 3.5119001497981666e-06, + "loss": 0.4671, + "step": 4725 + }, + { + "epoch": 2.23451536643026, + "grad_norm": 2.711630344390869, + "learning_rate": 3.5113296766094875e-06, + "loss": 0.4177, + "step": 4726 + }, + { + "epoch": 2.2349881796690307, + "grad_norm": 3.0257632732391357, + "learning_rate": 3.5107591404497443e-06, + "loss": 0.4976, + "step": 4727 + }, + { + "epoch": 2.2354609929078015, + "grad_norm": 2.717303991317749, + "learning_rate": 3.5101885413544614e-06, + "loss": 0.4621, + "step": 4728 + }, + { + "epoch": 2.235933806146572, + "grad_norm": 3.2846004962921143, + "learning_rate": 3.509617879359167e-06, + "loss": 0.4284, 
+ "step": 4729 + }, + { + "epoch": 2.2364066193853427, + "grad_norm": 2.7217819690704346, + "learning_rate": 3.5090471544993953e-06, + "loss": 0.4247, + "step": 4730 + }, + { + "epoch": 2.2368794326241135, + "grad_norm": 2.5003223419189453, + "learning_rate": 3.5084763668106812e-06, + "loss": 0.4096, + "step": 4731 + }, + { + "epoch": 2.2373522458628843, + "grad_norm": 2.7312731742858887, + "learning_rate": 3.5079055163285658e-06, + "loss": 0.4741, + "step": 4732 + }, + { + "epoch": 2.237825059101655, + "grad_norm": 2.84940767288208, + "learning_rate": 3.5073346030885934e-06, + "loss": 0.4887, + "step": 4733 + }, + { + "epoch": 2.2382978723404254, + "grad_norm": 3.1188511848449707, + "learning_rate": 3.506763627126313e-06, + "loss": 0.5335, + "step": 4734 + }, + { + "epoch": 2.2387706855791962, + "grad_norm": 2.6741397380828857, + "learning_rate": 3.5061925884772753e-06, + "loss": 0.4137, + "step": 4735 + }, + { + "epoch": 2.239243498817967, + "grad_norm": 3.1542465686798096, + "learning_rate": 3.505621487177037e-06, + "loss": 0.5303, + "step": 4736 + }, + { + "epoch": 2.2397163120567374, + "grad_norm": 5.448268890380859, + "learning_rate": 3.505050323261159e-06, + "loss": 0.4995, + "step": 4737 + }, + { + "epoch": 2.240189125295508, + "grad_norm": 2.7317898273468018, + "learning_rate": 3.5044790967652037e-06, + "loss": 0.4595, + "step": 4738 + }, + { + "epoch": 2.240661938534279, + "grad_norm": 2.8135695457458496, + "learning_rate": 3.50390780772474e-06, + "loss": 0.4593, + "step": 4739 + }, + { + "epoch": 2.2411347517730498, + "grad_norm": 3.1391844749450684, + "learning_rate": 3.5033364561753393e-06, + "loss": 0.4902, + "step": 4740 + }, + { + "epoch": 2.24160756501182, + "grad_norm": 2.6383132934570312, + "learning_rate": 3.5027650421525762e-06, + "loss": 0.3832, + "step": 4741 + }, + { + "epoch": 2.242080378250591, + "grad_norm": 2.742546558380127, + "learning_rate": 3.5021935656920314e-06, + "loss": 0.4012, + "step": 4742 + }, + { + "epoch": 2.2425531914893617, + "grad_norm": 3.1243674755096436, + "learning_rate": 3.5016220268292873e-06, + "loss": 0.4271, + "step": 4743 + }, + { + "epoch": 2.2430260047281325, + "grad_norm": 2.794717788696289, + "learning_rate": 3.501050425599932e-06, + "loss": 0.4604, + "step": 4744 + }, + { + "epoch": 2.243498817966903, + "grad_norm": 2.8481621742248535, + "learning_rate": 3.5004787620395565e-06, + "loss": 0.4814, + "step": 4745 + }, + { + "epoch": 2.2439716312056737, + "grad_norm": 2.8842051029205322, + "learning_rate": 3.499907036183755e-06, + "loss": 0.4987, + "step": 4746 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 3.074805974960327, + "learning_rate": 3.4993352480681265e-06, + "loss": 0.4966, + "step": 4747 + }, + { + "epoch": 2.2449172576832153, + "grad_norm": 2.7204246520996094, + "learning_rate": 3.4987633977282742e-06, + "loss": 0.4, + "step": 4748 + }, + { + "epoch": 2.2453900709219856, + "grad_norm": 2.685884952545166, + "learning_rate": 3.4981914851998055e-06, + "loss": 0.4285, + "step": 4749 + }, + { + "epoch": 2.2458628841607564, + "grad_norm": 2.1666336059570312, + "learning_rate": 3.4976195105183287e-06, + "loss": 0.3756, + "step": 4750 + }, + { + "epoch": 2.246335697399527, + "grad_norm": 2.863006353378296, + "learning_rate": 3.49704747371946e-06, + "loss": 0.4535, + "step": 4751 + }, + { + "epoch": 2.246808510638298, + "grad_norm": 2.5558736324310303, + "learning_rate": 3.496475374838817e-06, + "loss": 0.4129, + "step": 4752 + }, + { + "epoch": 2.2472813238770684, + "grad_norm": 2.9780309200286865, + "learning_rate": 
3.495903213912022e-06, + "loss": 0.4871, + "step": 4753 + }, + { + "epoch": 2.247754137115839, + "grad_norm": 2.951779365539551, + "learning_rate": 3.4953309909747e-06, + "loss": 0.5162, + "step": 4754 + }, + { + "epoch": 2.24822695035461, + "grad_norm": 2.7654693126678467, + "learning_rate": 3.4947587060624834e-06, + "loss": 0.4662, + "step": 4755 + }, + { + "epoch": 2.2486997635933808, + "grad_norm": 2.708247184753418, + "learning_rate": 3.494186359211002e-06, + "loss": 0.4279, + "step": 4756 + }, + { + "epoch": 2.249172576832151, + "grad_norm": 3.09916615486145, + "learning_rate": 3.4936139504558963e-06, + "loss": 0.4085, + "step": 4757 + }, + { + "epoch": 2.249645390070922, + "grad_norm": 2.913806200027466, + "learning_rate": 3.493041479832807e-06, + "loss": 0.4653, + "step": 4758 + }, + { + "epoch": 2.2501182033096927, + "grad_norm": 3.2903928756713867, + "learning_rate": 3.4924689473773787e-06, + "loss": 0.5167, + "step": 4759 + }, + { + "epoch": 2.2505910165484635, + "grad_norm": 3.1302902698516846, + "learning_rate": 3.4918963531252607e-06, + "loss": 0.5398, + "step": 4760 + }, + { + "epoch": 2.251063829787234, + "grad_norm": 2.8858273029327393, + "learning_rate": 3.4913236971121063e-06, + "loss": 0.4395, + "step": 4761 + }, + { + "epoch": 2.2515366430260046, + "grad_norm": 3.194521903991699, + "learning_rate": 3.4907509793735727e-06, + "loss": 0.5258, + "step": 4762 + }, + { + "epoch": 2.2520094562647754, + "grad_norm": 2.8640544414520264, + "learning_rate": 3.49017819994532e-06, + "loss": 0.4073, + "step": 4763 + }, + { + "epoch": 2.2524822695035462, + "grad_norm": 3.139995813369751, + "learning_rate": 3.489605358863011e-06, + "loss": 0.4653, + "step": 4764 + }, + { + "epoch": 2.2529550827423166, + "grad_norm": 2.6228537559509277, + "learning_rate": 3.489032456162317e-06, + "loss": 0.4546, + "step": 4765 + }, + { + "epoch": 2.2534278959810874, + "grad_norm": 2.8197672367095947, + "learning_rate": 3.4884594918789083e-06, + "loss": 0.479, + "step": 4766 + }, + { + "epoch": 2.253900709219858, + "grad_norm": 2.7839298248291016, + "learning_rate": 3.4878864660484612e-06, + "loss": 0.5081, + "step": 4767 + }, + { + "epoch": 2.254373522458629, + "grad_norm": 2.8630709648132324, + "learning_rate": 3.487313378706656e-06, + "loss": 0.4345, + "step": 4768 + }, + { + "epoch": 2.2548463356973993, + "grad_norm": 2.5661563873291016, + "learning_rate": 3.4867402298891755e-06, + "loss": 0.4266, + "step": 4769 + }, + { + "epoch": 2.25531914893617, + "grad_norm": 2.6274025440216064, + "learning_rate": 3.4861670196317084e-06, + "loss": 0.4645, + "step": 4770 + }, + { + "epoch": 2.255791962174941, + "grad_norm": 2.578702449798584, + "learning_rate": 3.485593747969944e-06, + "loss": 0.4242, + "step": 4771 + }, + { + "epoch": 2.2562647754137117, + "grad_norm": 2.322476625442505, + "learning_rate": 3.48502041493958e-06, + "loss": 0.3975, + "step": 4772 + }, + { + "epoch": 2.256737588652482, + "grad_norm": 2.8412630558013916, + "learning_rate": 3.484447020576313e-06, + "loss": 0.4276, + "step": 4773 + }, + { + "epoch": 2.257210401891253, + "grad_norm": 2.6090497970581055, + "learning_rate": 3.483873564915847e-06, + "loss": 0.429, + "step": 4774 + }, + { + "epoch": 2.2576832151300237, + "grad_norm": 2.692458152770996, + "learning_rate": 3.4833000479938877e-06, + "loss": 0.4211, + "step": 4775 + }, + { + "epoch": 2.2581560283687945, + "grad_norm": 2.5546815395355225, + "learning_rate": 3.482726469846146e-06, + "loss": 0.4751, + "step": 4776 + }, + { + "epoch": 2.258628841607565, + "grad_norm": 
2.8409626483917236, + "learning_rate": 3.4821528305083376e-06, + "loss": 0.4821, + "step": 4777 + }, + { + "epoch": 2.2591016548463356, + "grad_norm": 2.722966432571411, + "learning_rate": 3.4815791300161785e-06, + "loss": 0.5029, + "step": 4778 + }, + { + "epoch": 2.2595744680851064, + "grad_norm": 2.691603899002075, + "learning_rate": 3.48100536840539e-06, + "loss": 0.4242, + "step": 4779 + }, + { + "epoch": 2.260047281323877, + "grad_norm": 2.64035964012146, + "learning_rate": 3.4804315457116992e-06, + "loss": 0.4033, + "step": 4780 + }, + { + "epoch": 2.2605200945626476, + "grad_norm": 2.758819580078125, + "learning_rate": 3.4798576619708357e-06, + "loss": 0.4321, + "step": 4781 + }, + { + "epoch": 2.2609929078014184, + "grad_norm": 2.8204405307769775, + "learning_rate": 3.4792837172185324e-06, + "loss": 0.4309, + "step": 4782 + }, + { + "epoch": 2.261465721040189, + "grad_norm": 2.529771327972412, + "learning_rate": 3.478709711490525e-06, + "loss": 0.4398, + "step": 4783 + }, + { + "epoch": 2.26193853427896, + "grad_norm": 2.8156251907348633, + "learning_rate": 3.4781356448225557e-06, + "loss": 0.447, + "step": 4784 + }, + { + "epoch": 2.2624113475177303, + "grad_norm": 2.689528703689575, + "learning_rate": 3.477561517250369e-06, + "loss": 0.3907, + "step": 4785 + }, + { + "epoch": 2.262884160756501, + "grad_norm": 2.9148027896881104, + "learning_rate": 3.476987328809713e-06, + "loss": 0.4287, + "step": 4786 + }, + { + "epoch": 2.263356973995272, + "grad_norm": 2.933021306991577, + "learning_rate": 3.4764130795363404e-06, + "loss": 0.4847, + "step": 4787 + }, + { + "epoch": 2.2638297872340427, + "grad_norm": 2.8559257984161377, + "learning_rate": 3.4758387694660064e-06, + "loss": 0.4554, + "step": 4788 + }, + { + "epoch": 2.264302600472813, + "grad_norm": 3.0355522632598877, + "learning_rate": 3.4752643986344707e-06, + "loss": 0.4286, + "step": 4789 + }, + { + "epoch": 2.264775413711584, + "grad_norm": 2.9768362045288086, + "learning_rate": 3.474689967077498e-06, + "loss": 0.4917, + "step": 4790 + }, + { + "epoch": 2.2652482269503547, + "grad_norm": 2.827971935272217, + "learning_rate": 3.474115474830855e-06, + "loss": 0.4542, + "step": 4791 + }, + { + "epoch": 2.2657210401891255, + "grad_norm": 2.559659719467163, + "learning_rate": 3.4735409219303123e-06, + "loss": 0.4168, + "step": 4792 + }, + { + "epoch": 2.266193853427896, + "grad_norm": 2.3172824382781982, + "learning_rate": 3.472966308411645e-06, + "loss": 0.3535, + "step": 4793 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 2.6779656410217285, + "learning_rate": 3.4723916343106327e-06, + "loss": 0.4599, + "step": 4794 + }, + { + "epoch": 2.2671394799054374, + "grad_norm": 2.55780291557312, + "learning_rate": 3.4718168996630573e-06, + "loss": 0.4185, + "step": 4795 + }, + { + "epoch": 2.267612293144208, + "grad_norm": 2.4929800033569336, + "learning_rate": 3.471242104504704e-06, + "loss": 0.4008, + "step": 4796 + }, + { + "epoch": 2.2680851063829786, + "grad_norm": 2.849475145339966, + "learning_rate": 3.4706672488713642e-06, + "loss": 0.396, + "step": 4797 + }, + { + "epoch": 2.2685579196217494, + "grad_norm": 2.4830739498138428, + "learning_rate": 3.4700923327988306e-06, + "loss": 0.4087, + "step": 4798 + }, + { + "epoch": 2.26903073286052, + "grad_norm": 3.2748119831085205, + "learning_rate": 3.469517356322901e-06, + "loss": 0.4496, + "step": 4799 + }, + { + "epoch": 2.269503546099291, + "grad_norm": 3.0440170764923096, + "learning_rate": 3.468942319479378e-06, + "loss": 0.4903, + "step": 4800 + }, + { + "epoch": 
2.2699763593380613, + "grad_norm": 2.8200504779815674, + "learning_rate": 3.4683672223040645e-06, + "loss": 0.4588, + "step": 4801 + }, + { + "epoch": 2.270449172576832, + "grad_norm": 2.675206184387207, + "learning_rate": 3.4677920648327707e-06, + "loss": 0.4257, + "step": 4802 + }, + { + "epoch": 2.270921985815603, + "grad_norm": 2.862675905227661, + "learning_rate": 3.4672168471013084e-06, + "loss": 0.466, + "step": 4803 + }, + { + "epoch": 2.2713947990543737, + "grad_norm": 2.65663743019104, + "learning_rate": 3.4666415691454947e-06, + "loss": 0.4784, + "step": 4804 + }, + { + "epoch": 2.271867612293144, + "grad_norm": 2.5610506534576416, + "learning_rate": 3.4660662310011483e-06, + "loss": 0.4429, + "step": 4805 + }, + { + "epoch": 2.272340425531915, + "grad_norm": 2.6459643840789795, + "learning_rate": 3.465490832704094e-06, + "loss": 0.4345, + "step": 4806 + }, + { + "epoch": 2.2728132387706856, + "grad_norm": 2.426013469696045, + "learning_rate": 3.4649153742901585e-06, + "loss": 0.4533, + "step": 4807 + }, + { + "epoch": 2.2732860520094564, + "grad_norm": 2.6714842319488525, + "learning_rate": 3.4643398557951745e-06, + "loss": 0.4409, + "step": 4808 + }, + { + "epoch": 2.273758865248227, + "grad_norm": 2.703629493713379, + "learning_rate": 3.463764277254976e-06, + "loss": 0.3656, + "step": 4809 + }, + { + "epoch": 2.2742316784869976, + "grad_norm": 2.811753988265991, + "learning_rate": 3.4631886387054025e-06, + "loss": 0.4957, + "step": 4810 + }, + { + "epoch": 2.2747044917257684, + "grad_norm": 2.9469289779663086, + "learning_rate": 3.462612940182295e-06, + "loss": 0.4582, + "step": 4811 + }, + { + "epoch": 2.275177304964539, + "grad_norm": 2.6287801265716553, + "learning_rate": 3.462037181721501e-06, + "loss": 0.4072, + "step": 4812 + }, + { + "epoch": 2.2756501182033095, + "grad_norm": 2.7104952335357666, + "learning_rate": 3.46146136335887e-06, + "loss": 0.4998, + "step": 4813 + }, + { + "epoch": 2.2761229314420803, + "grad_norm": 3.170363187789917, + "learning_rate": 3.460885485130256e-06, + "loss": 0.4722, + "step": 4814 + }, + { + "epoch": 2.276595744680851, + "grad_norm": 2.7315151691436768, + "learning_rate": 3.460309547071516e-06, + "loss": 0.4482, + "step": 4815 + }, + { + "epoch": 2.277068557919622, + "grad_norm": 2.685988187789917, + "learning_rate": 3.4597335492185113e-06, + "loss": 0.4419, + "step": 4816 + }, + { + "epoch": 2.2775413711583923, + "grad_norm": 2.532790184020996, + "learning_rate": 3.459157491607107e-06, + "loss": 0.3961, + "step": 4817 + }, + { + "epoch": 2.278014184397163, + "grad_norm": 2.920729875564575, + "learning_rate": 3.458581374273171e-06, + "loss": 0.4767, + "step": 4818 + }, + { + "epoch": 2.278486997635934, + "grad_norm": 3.2481250762939453, + "learning_rate": 3.458005197252577e-06, + "loss": 0.4985, + "step": 4819 + }, + { + "epoch": 2.2789598108747047, + "grad_norm": 2.373809814453125, + "learning_rate": 3.4574289605811994e-06, + "loss": 0.4259, + "step": 4820 + }, + { + "epoch": 2.279432624113475, + "grad_norm": 2.7851033210754395, + "learning_rate": 3.4568526642949184e-06, + "loss": 0.4829, + "step": 4821 + }, + { + "epoch": 2.279905437352246, + "grad_norm": 2.9777133464813232, + "learning_rate": 3.456276308429618e-06, + "loss": 0.4896, + "step": 4822 + }, + { + "epoch": 2.2803782505910166, + "grad_norm": 2.7922022342681885, + "learning_rate": 3.4556998930211853e-06, + "loss": 0.4908, + "step": 4823 + }, + { + "epoch": 2.2808510638297874, + "grad_norm": 2.699180841445923, + "learning_rate": 3.4551234181055104e-06, + "loss": 0.4518, + 
"step": 4824 + }, + { + "epoch": 2.2813238770685578, + "grad_norm": 3.1200520992279053, + "learning_rate": 3.4545468837184885e-06, + "loss": 0.4877, + "step": 4825 + }, + { + "epoch": 2.2817966903073286, + "grad_norm": 2.56782603263855, + "learning_rate": 3.453970289896018e-06, + "loss": 0.4281, + "step": 4826 + }, + { + "epoch": 2.2822695035460994, + "grad_norm": 3.241356372833252, + "learning_rate": 3.4533936366740007e-06, + "loss": 0.4338, + "step": 4827 + }, + { + "epoch": 2.28274231678487, + "grad_norm": 3.560295343399048, + "learning_rate": 3.452816924088342e-06, + "loss": 0.4121, + "step": 4828 + }, + { + "epoch": 2.2832151300236405, + "grad_norm": 2.8512449264526367, + "learning_rate": 3.452240152174951e-06, + "loss": 0.4357, + "step": 4829 + }, + { + "epoch": 2.2836879432624113, + "grad_norm": 3.0332651138305664, + "learning_rate": 3.4516633209697408e-06, + "loss": 0.4985, + "step": 4830 + }, + { + "epoch": 2.284160756501182, + "grad_norm": 2.520930528640747, + "learning_rate": 3.451086430508629e-06, + "loss": 0.4021, + "step": 4831 + }, + { + "epoch": 2.284633569739953, + "grad_norm": 2.508227825164795, + "learning_rate": 3.4505094808275363e-06, + "loss": 0.3935, + "step": 4832 + }, + { + "epoch": 2.2851063829787233, + "grad_norm": 2.56752610206604, + "learning_rate": 3.449932471962385e-06, + "loss": 0.4689, + "step": 4833 + }, + { + "epoch": 2.285579196217494, + "grad_norm": 2.7757534980773926, + "learning_rate": 3.449355403949105e-06, + "loss": 0.4565, + "step": 4834 + }, + { + "epoch": 2.286052009456265, + "grad_norm": 3.364821195602417, + "learning_rate": 3.448778276823626e-06, + "loss": 0.4729, + "step": 4835 + }, + { + "epoch": 2.2865248226950357, + "grad_norm": 3.0045557022094727, + "learning_rate": 3.448201090621884e-06, + "loss": 0.4834, + "step": 4836 + }, + { + "epoch": 2.286997635933806, + "grad_norm": 2.9451794624328613, + "learning_rate": 3.4476238453798183e-06, + "loss": 0.489, + "step": 4837 + }, + { + "epoch": 2.287470449172577, + "grad_norm": 2.8307435512542725, + "learning_rate": 3.4470465411333708e-06, + "loss": 0.5079, + "step": 4838 + }, + { + "epoch": 2.2879432624113476, + "grad_norm": 2.7118136882781982, + "learning_rate": 3.4464691779184876e-06, + "loss": 0.4794, + "step": 4839 + }, + { + "epoch": 2.2884160756501184, + "grad_norm": 2.6724441051483154, + "learning_rate": 3.445891755771119e-06, + "loss": 0.4619, + "step": 4840 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 2.8161258697509766, + "learning_rate": 3.445314274727218e-06, + "loss": 0.4287, + "step": 4841 + }, + { + "epoch": 2.2893617021276595, + "grad_norm": 2.5681750774383545, + "learning_rate": 3.4447367348227433e-06, + "loss": 0.4167, + "step": 4842 + }, + { + "epoch": 2.2898345153664303, + "grad_norm": 2.8136284351348877, + "learning_rate": 3.444159136093654e-06, + "loss": 0.4195, + "step": 4843 + }, + { + "epoch": 2.290307328605201, + "grad_norm": 3.153651714324951, + "learning_rate": 3.443581478575915e-06, + "loss": 0.4821, + "step": 4844 + }, + { + "epoch": 2.2907801418439715, + "grad_norm": 2.980883836746216, + "learning_rate": 3.4430037623054953e-06, + "loss": 0.4627, + "step": 4845 + }, + { + "epoch": 2.2912529550827423, + "grad_norm": 2.786182403564453, + "learning_rate": 3.4424259873183664e-06, + "loss": 0.4342, + "step": 4846 + }, + { + "epoch": 2.291725768321513, + "grad_norm": 2.8938279151916504, + "learning_rate": 3.4418481536505026e-06, + "loss": 0.3997, + "step": 4847 + }, + { + "epoch": 2.2921985815602834, + "grad_norm": 2.5534510612487793, + "learning_rate": 
3.4412702613378844e-06, + "loss": 0.3982, + "step": 4848 + }, + { + "epoch": 2.2926713947990542, + "grad_norm": 2.7907063961029053, + "learning_rate": 3.4406923104164956e-06, + "loss": 0.4484, + "step": 4849 + }, + { + "epoch": 2.293144208037825, + "grad_norm": 3.162702798843384, + "learning_rate": 3.4401143009223203e-06, + "loss": 0.4528, + "step": 4850 + }, + { + "epoch": 2.293617021276596, + "grad_norm": 2.4647393226623535, + "learning_rate": 3.4395362328913505e-06, + "loss": 0.3759, + "step": 4851 + }, + { + "epoch": 2.2940898345153666, + "grad_norm": 2.8219876289367676, + "learning_rate": 3.438958106359579e-06, + "loss": 0.4903, + "step": 4852 + }, + { + "epoch": 2.294562647754137, + "grad_norm": 2.827073097229004, + "learning_rate": 3.438379921363003e-06, + "loss": 0.4315, + "step": 4853 + }, + { + "epoch": 2.295035460992908, + "grad_norm": 2.472470283508301, + "learning_rate": 3.4378016779376244e-06, + "loss": 0.4478, + "step": 4854 + }, + { + "epoch": 2.2955082742316786, + "grad_norm": 3.3994734287261963, + "learning_rate": 3.4372233761194473e-06, + "loss": 0.5086, + "step": 4855 + }, + { + "epoch": 2.295981087470449, + "grad_norm": 3.030465602874756, + "learning_rate": 3.4366450159444796e-06, + "loss": 0.4159, + "step": 4856 + }, + { + "epoch": 2.2964539007092197, + "grad_norm": 2.5460705757141113, + "learning_rate": 3.4360665974487346e-06, + "loss": 0.4097, + "step": 4857 + }, + { + "epoch": 2.2969267139479905, + "grad_norm": 2.884469509124756, + "learning_rate": 3.4354881206682273e-06, + "loss": 0.4478, + "step": 4858 + }, + { + "epoch": 2.2973995271867613, + "grad_norm": 2.5139710903167725, + "learning_rate": 3.4349095856389765e-06, + "loss": 0.4286, + "step": 4859 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 3.1628260612487793, + "learning_rate": 3.4343309923970053e-06, + "loss": 0.4617, + "step": 4860 + }, + { + "epoch": 2.2983451536643025, + "grad_norm": 2.6141695976257324, + "learning_rate": 3.4337523409783395e-06, + "loss": 0.3841, + "step": 4861 + }, + { + "epoch": 2.2988179669030733, + "grad_norm": 2.766834259033203, + "learning_rate": 3.43317363141901e-06, + "loss": 0.4484, + "step": 4862 + }, + { + "epoch": 2.299290780141844, + "grad_norm": 2.785491943359375, + "learning_rate": 3.4325948637550503e-06, + "loss": 0.4363, + "step": 4863 + }, + { + "epoch": 2.2997635933806144, + "grad_norm": 2.624929189682007, + "learning_rate": 3.4320160380224988e-06, + "loss": 0.4518, + "step": 4864 + }, + { + "epoch": 2.300236406619385, + "grad_norm": 2.895413398742676, + "learning_rate": 3.4314371542573944e-06, + "loss": 0.4745, + "step": 4865 + }, + { + "epoch": 2.300709219858156, + "grad_norm": 2.603816270828247, + "learning_rate": 3.430858212495783e-06, + "loss": 0.4444, + "step": 4866 + }, + { + "epoch": 2.301182033096927, + "grad_norm": 3.387360095977783, + "learning_rate": 3.4302792127737116e-06, + "loss": 0.4169, + "step": 4867 + }, + { + "epoch": 2.3016548463356976, + "grad_norm": 2.894054651260376, + "learning_rate": 3.4297001551272334e-06, + "loss": 0.4493, + "step": 4868 + }, + { + "epoch": 2.302127659574468, + "grad_norm": 3.0432028770446777, + "learning_rate": 3.4291210395924035e-06, + "loss": 0.4854, + "step": 4869 + }, + { + "epoch": 2.3026004728132388, + "grad_norm": 2.5144734382629395, + "learning_rate": 3.42854186620528e-06, + "loss": 0.4556, + "step": 4870 + }, + { + "epoch": 2.3030732860520096, + "grad_norm": 2.964812755584717, + "learning_rate": 3.427962635001926e-06, + "loss": 0.495, + "step": 4871 + }, + { + "epoch": 2.30354609929078, + "grad_norm": 
2.9991118907928467, + "learning_rate": 3.4273833460184077e-06, + "loss": 0.4787, + "step": 4872 + }, + { + "epoch": 2.3040189125295507, + "grad_norm": 2.9424328804016113, + "learning_rate": 3.4268039992907955e-06, + "loss": 0.5006, + "step": 4873 + }, + { + "epoch": 2.3044917257683215, + "grad_norm": 2.792880058288574, + "learning_rate": 3.426224594855162e-06, + "loss": 0.4399, + "step": 4874 + }, + { + "epoch": 2.3049645390070923, + "grad_norm": 2.5308053493499756, + "learning_rate": 3.4256451327475838e-06, + "loss": 0.4843, + "step": 4875 + }, + { + "epoch": 2.305437352245863, + "grad_norm": 2.7937564849853516, + "learning_rate": 3.425065613004142e-06, + "loss": 0.4428, + "step": 4876 + }, + { + "epoch": 2.3059101654846335, + "grad_norm": 2.4231557846069336, + "learning_rate": 3.424486035660921e-06, + "loss": 0.4054, + "step": 4877 + }, + { + "epoch": 2.3063829787234043, + "grad_norm": 3.0622596740722656, + "learning_rate": 3.423906400754009e-06, + "loss": 0.4623, + "step": 4878 + }, + { + "epoch": 2.306855791962175, + "grad_norm": 2.6532933712005615, + "learning_rate": 3.4233267083194955e-06, + "loss": 0.4387, + "step": 4879 + }, + { + "epoch": 2.3073286052009454, + "grad_norm": 2.793325185775757, + "learning_rate": 3.422746958393477e-06, + "loss": 0.4047, + "step": 4880 + }, + { + "epoch": 2.307801418439716, + "grad_norm": 2.9178314208984375, + "learning_rate": 3.422167151012052e-06, + "loss": 0.4397, + "step": 4881 + }, + { + "epoch": 2.308274231678487, + "grad_norm": 3.463913917541504, + "learning_rate": 3.4215872862113214e-06, + "loss": 0.4347, + "step": 4882 + }, + { + "epoch": 2.308747044917258, + "grad_norm": 3.228403091430664, + "learning_rate": 3.421007364027392e-06, + "loss": 0.4405, + "step": 4883 + }, + { + "epoch": 2.3092198581560286, + "grad_norm": 2.896933078765869, + "learning_rate": 3.420427384496372e-06, + "loss": 0.4429, + "step": 4884 + }, + { + "epoch": 2.309692671394799, + "grad_norm": 2.5559937953948975, + "learning_rate": 3.4198473476543755e-06, + "loss": 0.4281, + "step": 4885 + }, + { + "epoch": 2.3101654846335697, + "grad_norm": 3.457918167114258, + "learning_rate": 3.419267253537517e-06, + "loss": 0.4495, + "step": 4886 + }, + { + "epoch": 2.3106382978723405, + "grad_norm": 2.6554839611053467, + "learning_rate": 3.418687102181918e-06, + "loss": 0.4682, + "step": 4887 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 2.8171639442443848, + "learning_rate": 3.4181068936237024e-06, + "loss": 0.4184, + "step": 4888 + }, + { + "epoch": 2.3115839243498817, + "grad_norm": 2.9272499084472656, + "learning_rate": 3.4175266278989955e-06, + "loss": 0.5445, + "step": 4889 + }, + { + "epoch": 2.3120567375886525, + "grad_norm": 2.5928499698638916, + "learning_rate": 3.4169463050439284e-06, + "loss": 0.3808, + "step": 4890 + }, + { + "epoch": 2.3125295508274233, + "grad_norm": 2.6624577045440674, + "learning_rate": 3.4163659250946356e-06, + "loss": 0.4678, + "step": 4891 + }, + { + "epoch": 2.313002364066194, + "grad_norm": 2.666555643081665, + "learning_rate": 3.4157854880872553e-06, + "loss": 0.457, + "step": 4892 + }, + { + "epoch": 2.3134751773049644, + "grad_norm": 3.2987406253814697, + "learning_rate": 3.4152049940579278e-06, + "loss": 0.551, + "step": 4893 + }, + { + "epoch": 2.3139479905437352, + "grad_norm": 2.728119134902954, + "learning_rate": 3.414624443042799e-06, + "loss": 0.3935, + "step": 4894 + }, + { + "epoch": 2.314420803782506, + "grad_norm": 3.133005380630493, + "learning_rate": 3.4140438350780157e-06, + "loss": 0.4981, + "step": 4895 + }, + { + 
"epoch": 2.3148936170212764, + "grad_norm": 2.591252565383911, + "learning_rate": 3.4134631701997312e-06, + "loss": 0.4251, + "step": 4896 + }, + { + "epoch": 2.315366430260047, + "grad_norm": 3.007136344909668, + "learning_rate": 3.412882448444101e-06, + "loss": 0.4492, + "step": 4897 + }, + { + "epoch": 2.315839243498818, + "grad_norm": 2.6391026973724365, + "learning_rate": 3.412301669847284e-06, + "loss": 0.5151, + "step": 4898 + }, + { + "epoch": 2.3163120567375888, + "grad_norm": 7.453699111938477, + "learning_rate": 3.411720834445441e-06, + "loss": 0.4983, + "step": 4899 + }, + { + "epoch": 2.3167848699763596, + "grad_norm": 2.667712688446045, + "learning_rate": 3.41113994227474e-06, + "loss": 0.4581, + "step": 4900 + }, + { + "epoch": 2.31725768321513, + "grad_norm": 2.7727627754211426, + "learning_rate": 3.41055899337135e-06, + "loss": 0.4731, + "step": 4901 + }, + { + "epoch": 2.3177304964539007, + "grad_norm": 3.0096890926361084, + "learning_rate": 3.409977987771444e-06, + "loss": 0.4996, + "step": 4902 + }, + { + "epoch": 2.3182033096926715, + "grad_norm": 2.725830078125, + "learning_rate": 3.4093969255111993e-06, + "loss": 0.4544, + "step": 4903 + }, + { + "epoch": 2.318676122931442, + "grad_norm": 2.7596993446350098, + "learning_rate": 3.4088158066267945e-06, + "loss": 0.4846, + "step": 4904 + }, + { + "epoch": 2.3191489361702127, + "grad_norm": 2.702620029449463, + "learning_rate": 3.4082346311544156e-06, + "loss": 0.4849, + "step": 4905 + }, + { + "epoch": 2.3196217494089835, + "grad_norm": 2.725374460220337, + "learning_rate": 3.407653399130249e-06, + "loss": 0.4116, + "step": 4906 + }, + { + "epoch": 2.3200945626477543, + "grad_norm": 2.6770219802856445, + "learning_rate": 3.4070721105904847e-06, + "loss": 0.4606, + "step": 4907 + }, + { + "epoch": 2.320567375886525, + "grad_norm": 2.9249117374420166, + "learning_rate": 3.406490765571317e-06, + "loss": 0.461, + "step": 4908 + }, + { + "epoch": 2.3210401891252954, + "grad_norm": 2.7568278312683105, + "learning_rate": 3.405909364108944e-06, + "loss": 0.4065, + "step": 4909 + }, + { + "epoch": 2.321513002364066, + "grad_norm": 2.7231340408325195, + "learning_rate": 3.4053279062395676e-06, + "loss": 0.4173, + "step": 4910 + }, + { + "epoch": 2.321985815602837, + "grad_norm": 3.1401100158691406, + "learning_rate": 3.404746391999393e-06, + "loss": 0.4287, + "step": 4911 + }, + { + "epoch": 2.3224586288416074, + "grad_norm": 2.714853525161743, + "learning_rate": 3.404164821424627e-06, + "loss": 0.4552, + "step": 4912 + }, + { + "epoch": 2.322931442080378, + "grad_norm": 3.1509978771209717, + "learning_rate": 3.4035831945514825e-06, + "loss": 0.5296, + "step": 4913 + }, + { + "epoch": 2.323404255319149, + "grad_norm": 2.567194938659668, + "learning_rate": 3.403001511416174e-06, + "loss": 0.4306, + "step": 4914 + }, + { + "epoch": 2.3238770685579198, + "grad_norm": 2.7473888397216797, + "learning_rate": 3.402419772054922e-06, + "loss": 0.4009, + "step": 4915 + }, + { + "epoch": 2.3243498817966906, + "grad_norm": 2.8617780208587646, + "learning_rate": 3.401837976503947e-06, + "loss": 0.4545, + "step": 4916 + }, + { + "epoch": 2.324822695035461, + "grad_norm": 2.3650572299957275, + "learning_rate": 3.401256124799475e-06, + "loss": 0.4046, + "step": 4917 + }, + { + "epoch": 2.3252955082742317, + "grad_norm": 2.418407678604126, + "learning_rate": 3.4006742169777364e-06, + "loss": 0.4222, + "step": 4918 + }, + { + "epoch": 2.3257683215130025, + "grad_norm": 2.7232494354248047, + "learning_rate": 3.400092253074964e-06, + "loss": 0.4373, 
+ "step": 4919 + }, + { + "epoch": 2.326241134751773, + "grad_norm": 2.702965497970581, + "learning_rate": 3.399510233127394e-06, + "loss": 0.437, + "step": 4920 + }, + { + "epoch": 2.3267139479905437, + "grad_norm": 2.8381760120391846, + "learning_rate": 3.3989281571712664e-06, + "loss": 0.4294, + "step": 4921 + }, + { + "epoch": 2.3271867612293144, + "grad_norm": 2.767131805419922, + "learning_rate": 3.398346025242823e-06, + "loss": 0.4673, + "step": 4922 + }, + { + "epoch": 2.3276595744680852, + "grad_norm": 2.5261805057525635, + "learning_rate": 3.3977638373783123e-06, + "loss": 0.4147, + "step": 4923 + }, + { + "epoch": 2.3281323877068556, + "grad_norm": 2.7176897525787354, + "learning_rate": 3.3971815936139836e-06, + "loss": 0.3885, + "step": 4924 + }, + { + "epoch": 2.3286052009456264, + "grad_norm": 2.849043130874634, + "learning_rate": 3.396599293986092e-06, + "loss": 0.4842, + "step": 4925 + }, + { + "epoch": 2.329078014184397, + "grad_norm": 2.550673484802246, + "learning_rate": 3.3960169385308927e-06, + "loss": 0.4049, + "step": 4926 + }, + { + "epoch": 2.329550827423168, + "grad_norm": 3.0821585655212402, + "learning_rate": 3.3954345272846477e-06, + "loss": 0.53, + "step": 4927 + }, + { + "epoch": 2.3300236406619383, + "grad_norm": 2.68658185005188, + "learning_rate": 3.3948520602836223e-06, + "loss": 0.4592, + "step": 4928 + }, + { + "epoch": 2.330496453900709, + "grad_norm": 2.7391903400421143, + "learning_rate": 3.394269537564082e-06, + "loss": 0.4773, + "step": 4929 + }, + { + "epoch": 2.33096926713948, + "grad_norm": 2.665114164352417, + "learning_rate": 3.393686959162299e-06, + "loss": 0.4671, + "step": 4930 + }, + { + "epoch": 2.3314420803782507, + "grad_norm": 2.6827399730682373, + "learning_rate": 3.3931043251145477e-06, + "loss": 0.4669, + "step": 4931 + }, + { + "epoch": 2.331914893617021, + "grad_norm": 3.1760666370391846, + "learning_rate": 3.392521635457106e-06, + "loss": 0.4729, + "step": 4932 + }, + { + "epoch": 2.332387706855792, + "grad_norm": 2.9686226844787598, + "learning_rate": 3.3919388902262555e-06, + "loss": 0.5017, + "step": 4933 + }, + { + "epoch": 2.3328605200945627, + "grad_norm": 2.471325397491455, + "learning_rate": 3.3913560894582818e-06, + "loss": 0.4195, + "step": 4934 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 2.4062955379486084, + "learning_rate": 3.3907732331894732e-06, + "loss": 0.3666, + "step": 4935 + }, + { + "epoch": 2.333806146572104, + "grad_norm": 2.6800320148468018, + "learning_rate": 3.3901903214561206e-06, + "loss": 0.4774, + "step": 4936 + }, + { + "epoch": 2.3342789598108746, + "grad_norm": 2.923741102218628, + "learning_rate": 3.389607354294521e-06, + "loss": 0.4546, + "step": 4937 + }, + { + "epoch": 2.3347517730496454, + "grad_norm": 3.0034096240997314, + "learning_rate": 3.3890243317409716e-06, + "loss": 0.5373, + "step": 4938 + }, + { + "epoch": 2.3352245862884162, + "grad_norm": 3.0757339000701904, + "learning_rate": 3.388441253831775e-06, + "loss": 0.4655, + "step": 4939 + }, + { + "epoch": 2.3356973995271866, + "grad_norm": 2.5352041721343994, + "learning_rate": 3.3878581206032373e-06, + "loss": 0.4391, + "step": 4940 + }, + { + "epoch": 2.3361702127659574, + "grad_norm": 2.9332237243652344, + "learning_rate": 3.3872749320916675e-06, + "loss": 0.4685, + "step": 4941 + }, + { + "epoch": 2.336643026004728, + "grad_norm": 2.4871222972869873, + "learning_rate": 3.386691688333379e-06, + "loss": 0.3952, + "step": 4942 + }, + { + "epoch": 2.337115839243499, + "grad_norm": 2.6384918689727783, + "learning_rate": 
3.386108389364687e-06, + "loss": 0.4044, + "step": 4943 + }, + { + "epoch": 2.3375886524822693, + "grad_norm": 2.3545165061950684, + "learning_rate": 3.3855250352219102e-06, + "loss": 0.426, + "step": 4944 + }, + { + "epoch": 2.33806146572104, + "grad_norm": 2.972242593765259, + "learning_rate": 3.3849416259413735e-06, + "loss": 0.5033, + "step": 4945 + }, + { + "epoch": 2.338534278959811, + "grad_norm": 3.117351770401001, + "learning_rate": 3.384358161559401e-06, + "loss": 0.4695, + "step": 4946 + }, + { + "epoch": 2.3390070921985817, + "grad_norm": 2.888916492462158, + "learning_rate": 3.383774642112324e-06, + "loss": 0.437, + "step": 4947 + }, + { + "epoch": 2.339479905437352, + "grad_norm": 3.0677435398101807, + "learning_rate": 3.3831910676364753e-06, + "loss": 0.4293, + "step": 4948 + }, + { + "epoch": 2.339952718676123, + "grad_norm": 2.8571784496307373, + "learning_rate": 3.3826074381681916e-06, + "loss": 0.4574, + "step": 4949 + }, + { + "epoch": 2.3404255319148937, + "grad_norm": 2.907276153564453, + "learning_rate": 3.3820237537438127e-06, + "loss": 0.4731, + "step": 4950 + }, + { + "epoch": 2.3408983451536645, + "grad_norm": 2.923762559890747, + "learning_rate": 3.3814400143996823e-06, + "loss": 0.4648, + "step": 4951 + }, + { + "epoch": 2.341371158392435, + "grad_norm": 2.6206982135772705, + "learning_rate": 3.3808562201721473e-06, + "loss": 0.436, + "step": 4952 + }, + { + "epoch": 2.3418439716312056, + "grad_norm": 6.279088973999023, + "learning_rate": 3.380272371097558e-06, + "loss": 0.4461, + "step": 4953 + }, + { + "epoch": 2.3423167848699764, + "grad_norm": 2.785297155380249, + "learning_rate": 3.3796884672122684e-06, + "loss": 0.4619, + "step": 4954 + }, + { + "epoch": 2.342789598108747, + "grad_norm": 2.6241793632507324, + "learning_rate": 3.379104508552634e-06, + "loss": 0.4323, + "step": 4955 + }, + { + "epoch": 2.3432624113475176, + "grad_norm": 2.6052167415618896, + "learning_rate": 3.378520495155017e-06, + "loss": 0.3943, + "step": 4956 + }, + { + "epoch": 2.3437352245862884, + "grad_norm": 2.8247411251068115, + "learning_rate": 3.3779364270557818e-06, + "loss": 0.4689, + "step": 4957 + }, + { + "epoch": 2.344208037825059, + "grad_norm": 2.5348927974700928, + "learning_rate": 3.377352304291294e-06, + "loss": 0.4619, + "step": 4958 + }, + { + "epoch": 2.34468085106383, + "grad_norm": 2.906648874282837, + "learning_rate": 3.376768126897926e-06, + "loss": 0.5191, + "step": 4959 + }, + { + "epoch": 2.3451536643026003, + "grad_norm": 2.796870470046997, + "learning_rate": 3.3761838949120514e-06, + "loss": 0.4227, + "step": 4960 + }, + { + "epoch": 2.345626477541371, + "grad_norm": 2.789635419845581, + "learning_rate": 3.3755996083700464e-06, + "loss": 0.3927, + "step": 4961 + }, + { + "epoch": 2.346099290780142, + "grad_norm": 2.86641263961792, + "learning_rate": 3.375015267308295e-06, + "loss": 0.4097, + "step": 4962 + }, + { + "epoch": 2.3465721040189127, + "grad_norm": 2.8374414443969727, + "learning_rate": 3.374430871763178e-06, + "loss": 0.4566, + "step": 4963 + }, + { + "epoch": 2.347044917257683, + "grad_norm": 2.71951961517334, + "learning_rate": 3.3738464217710854e-06, + "loss": 0.4748, + "step": 4964 + }, + { + "epoch": 2.347517730496454, + "grad_norm": 2.6939785480499268, + "learning_rate": 3.373261917368408e-06, + "loss": 0.4499, + "step": 4965 + }, + { + "epoch": 2.3479905437352246, + "grad_norm": 2.862661600112915, + "learning_rate": 3.37267735859154e-06, + "loss": 0.415, + "step": 4966 + }, + { + "epoch": 2.3484633569739954, + "grad_norm": 
2.3657119274139404, + "learning_rate": 3.3720927454768793e-06, + "loss": 0.4112, + "step": 4967 + }, + { + "epoch": 2.348936170212766, + "grad_norm": 3.701571464538574, + "learning_rate": 3.3715080780608277e-06, + "loss": 0.4735, + "step": 4968 + }, + { + "epoch": 2.3494089834515366, + "grad_norm": 2.894350528717041, + "learning_rate": 3.3709233563797895e-06, + "loss": 0.4278, + "step": 4969 + }, + { + "epoch": 2.3498817966903074, + "grad_norm": 3.0072877407073975, + "learning_rate": 3.3703385804701727e-06, + "loss": 0.4718, + "step": 4970 + }, + { + "epoch": 2.350354609929078, + "grad_norm": 2.9920408725738525, + "learning_rate": 3.369753750368389e-06, + "loss": 0.4636, + "step": 4971 + }, + { + "epoch": 2.3508274231678485, + "grad_norm": 2.381770372390747, + "learning_rate": 3.369168866110853e-06, + "loss": 0.3841, + "step": 4972 + }, + { + "epoch": 2.3513002364066193, + "grad_norm": 2.6195342540740967, + "learning_rate": 3.3685839277339825e-06, + "loss": 0.4422, + "step": 4973 + }, + { + "epoch": 2.35177304964539, + "grad_norm": 2.885852575302124, + "learning_rate": 3.3679989352741992e-06, + "loss": 0.4798, + "step": 4974 + }, + { + "epoch": 2.352245862884161, + "grad_norm": 2.820004940032959, + "learning_rate": 3.367413888767929e-06, + "loss": 0.4498, + "step": 4975 + }, + { + "epoch": 2.3527186761229313, + "grad_norm": 2.579680919647217, + "learning_rate": 3.366828788251599e-06, + "loss": 0.4894, + "step": 4976 + }, + { + "epoch": 2.353191489361702, + "grad_norm": 2.7509915828704834, + "learning_rate": 3.366243633761642e-06, + "loss": 0.4354, + "step": 4977 + }, + { + "epoch": 2.353664302600473, + "grad_norm": 3.061767339706421, + "learning_rate": 3.3656584253344917e-06, + "loss": 0.4651, + "step": 4978 + }, + { + "epoch": 2.3541371158392437, + "grad_norm": 2.6109485626220703, + "learning_rate": 3.365073163006587e-06, + "loss": 0.44, + "step": 4979 + }, + { + "epoch": 2.354609929078014, + "grad_norm": 3.4247376918792725, + "learning_rate": 3.36448784681437e-06, + "loss": 0.3993, + "step": 4980 + }, + { + "epoch": 2.355082742316785, + "grad_norm": 2.953695297241211, + "learning_rate": 3.363902476794285e-06, + "loss": 0.4763, + "step": 4981 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 2.836543083190918, + "learning_rate": 3.3633170529827806e-06, + "loss": 0.4755, + "step": 4982 + }, + { + "epoch": 2.3560283687943264, + "grad_norm": 2.944082021713257, + "learning_rate": 3.36273157541631e-06, + "loss": 0.472, + "step": 4983 + }, + { + "epoch": 2.3565011820330968, + "grad_norm": 2.891716957092285, + "learning_rate": 3.3621460441313262e-06, + "loss": 0.5259, + "step": 4984 + }, + { + "epoch": 2.3569739952718676, + "grad_norm": 2.8448829650878906, + "learning_rate": 3.3615604591642896e-06, + "loss": 0.4587, + "step": 4985 + }, + { + "epoch": 2.3574468085106384, + "grad_norm": 3.114393711090088, + "learning_rate": 3.36097482055166e-06, + "loss": 0.4352, + "step": 4986 + }, + { + "epoch": 2.357919621749409, + "grad_norm": 2.964851140975952, + "learning_rate": 3.360389128329904e-06, + "loss": 0.5015, + "step": 4987 + }, + { + "epoch": 2.3583924349881795, + "grad_norm": 2.4819815158843994, + "learning_rate": 3.3598033825354893e-06, + "loss": 0.3459, + "step": 4988 + }, + { + "epoch": 2.3588652482269503, + "grad_norm": 2.635754346847534, + "learning_rate": 3.359217583204889e-06, + "loss": 0.4367, + "step": 4989 + }, + { + "epoch": 2.359338061465721, + "grad_norm": 2.542482376098633, + "learning_rate": 3.358631730374576e-06, + "loss": 0.3978, + "step": 4990 + }, + { + "epoch": 
2.359810874704492, + "grad_norm": 2.614018678665161, + "learning_rate": 3.358045824081031e-06, + "loss": 0.424, + "step": 4991 + }, + { + "epoch": 2.3602836879432623, + "grad_norm": 2.775373697280884, + "learning_rate": 3.3574598643607354e-06, + "loss": 0.4901, + "step": 4992 + }, + { + "epoch": 2.360756501182033, + "grad_norm": 3.091381311416626, + "learning_rate": 3.356873851250173e-06, + "loss": 0.4954, + "step": 4993 + }, + { + "epoch": 2.361229314420804, + "grad_norm": 2.440023422241211, + "learning_rate": 3.3562877847858337e-06, + "loss": 0.4053, + "step": 4994 + }, + { + "epoch": 2.3617021276595747, + "grad_norm": 2.8879518508911133, + "learning_rate": 3.3557016650042084e-06, + "loss": 0.4766, + "step": 4995 + }, + { + "epoch": 2.362174940898345, + "grad_norm": 3.1298391819000244, + "learning_rate": 3.355115491941793e-06, + "loss": 0.4743, + "step": 4996 + }, + { + "epoch": 2.362647754137116, + "grad_norm": 3.3325259685516357, + "learning_rate": 3.3545292656350845e-06, + "loss": 0.4703, + "step": 4997 + }, + { + "epoch": 2.3631205673758866, + "grad_norm": 2.7935359477996826, + "learning_rate": 3.353942986120587e-06, + "loss": 0.432, + "step": 4998 + }, + { + "epoch": 2.3635933806146574, + "grad_norm": 2.623624324798584, + "learning_rate": 3.3533566534348033e-06, + "loss": 0.4302, + "step": 4999 + }, + { + "epoch": 2.3640661938534278, + "grad_norm": 3.1467108726501465, + "learning_rate": 3.3527702676142426e-06, + "loss": 0.4661, + "step": 5000 + }, + { + "epoch": 2.3645390070921986, + "grad_norm": 2.5364840030670166, + "learning_rate": 3.352183828695418e-06, + "loss": 0.4134, + "step": 5001 + }, + { + "epoch": 2.3650118203309693, + "grad_norm": 3.002777338027954, + "learning_rate": 3.3515973367148415e-06, + "loss": 0.3771, + "step": 5002 + }, + { + "epoch": 2.36548463356974, + "grad_norm": 2.660043954849243, + "learning_rate": 3.3510107917090335e-06, + "loss": 0.4254, + "step": 5003 + }, + { + "epoch": 2.3659574468085105, + "grad_norm": 2.7041075229644775, + "learning_rate": 3.3504241937145148e-06, + "loss": 0.4651, + "step": 5004 + }, + { + "epoch": 2.3664302600472813, + "grad_norm": 2.7387280464172363, + "learning_rate": 3.349837542767811e-06, + "loss": 0.3874, + "step": 5005 + }, + { + "epoch": 2.366903073286052, + "grad_norm": 3.012188196182251, + "learning_rate": 3.349250838905449e-06, + "loss": 0.4508, + "step": 5006 + }, + { + "epoch": 2.3673758865248224, + "grad_norm": 2.3108484745025635, + "learning_rate": 3.3486640821639616e-06, + "loss": 0.3783, + "step": 5007 + }, + { + "epoch": 2.3678486997635932, + "grad_norm": 3.2188332080841064, + "learning_rate": 3.3480772725798837e-06, + "loss": 0.4879, + "step": 5008 + }, + { + "epoch": 2.368321513002364, + "grad_norm": 2.566087484359741, + "learning_rate": 3.3474904101897526e-06, + "loss": 0.3847, + "step": 5009 + }, + { + "epoch": 2.368794326241135, + "grad_norm": 2.5581698417663574, + "learning_rate": 3.3469034950301092e-06, + "loss": 0.4201, + "step": 5010 + }, + { + "epoch": 2.3692671394799056, + "grad_norm": 2.900296926498413, + "learning_rate": 3.3463165271374992e-06, + "loss": 0.4568, + "step": 5011 + }, + { + "epoch": 2.369739952718676, + "grad_norm": 2.8239312171936035, + "learning_rate": 3.34572950654847e-06, + "loss": 0.4583, + "step": 5012 + }, + { + "epoch": 2.370212765957447, + "grad_norm": 3.219465970993042, + "learning_rate": 3.3451424332995723e-06, + "loss": 0.5435, + "step": 5013 + }, + { + "epoch": 2.3706855791962176, + "grad_norm": 3.3111915588378906, + "learning_rate": 3.344555307427362e-06, + "loss": 0.435, + 
"step": 5014 + }, + { + "epoch": 2.371158392434988, + "grad_norm": 3.296668529510498, + "learning_rate": 3.3439681289683946e-06, + "loss": 0.4738, + "step": 5015 + }, + { + "epoch": 2.3716312056737587, + "grad_norm": 3.005722761154175, + "learning_rate": 3.343380897959234e-06, + "loss": 0.4267, + "step": 5016 + }, + { + "epoch": 2.3721040189125295, + "grad_norm": 2.7844085693359375, + "learning_rate": 3.3427936144364425e-06, + "loss": 0.4558, + "step": 5017 + }, + { + "epoch": 2.3725768321513003, + "grad_norm": 2.7532076835632324, + "learning_rate": 3.3422062784365884e-06, + "loss": 0.4144, + "step": 5018 + }, + { + "epoch": 2.373049645390071, + "grad_norm": 2.835764169692993, + "learning_rate": 3.3416188899962413e-06, + "loss": 0.4945, + "step": 5019 + }, + { + "epoch": 2.3735224586288415, + "grad_norm": 3.1513726711273193, + "learning_rate": 3.3410314491519767e-06, + "loss": 0.4971, + "step": 5020 + }, + { + "epoch": 2.3739952718676123, + "grad_norm": 3.0162220001220703, + "learning_rate": 3.3404439559403723e-06, + "loss": 0.4477, + "step": 5021 + }, + { + "epoch": 2.374468085106383, + "grad_norm": 2.676391363143921, + "learning_rate": 3.3398564103980073e-06, + "loss": 0.432, + "step": 5022 + }, + { + "epoch": 2.3749408983451534, + "grad_norm": 2.7806248664855957, + "learning_rate": 3.3392688125614663e-06, + "loss": 0.4818, + "step": 5023 + }, + { + "epoch": 2.3754137115839242, + "grad_norm": 2.968806505203247, + "learning_rate": 3.3386811624673373e-06, + "loss": 0.4893, + "step": 5024 + }, + { + "epoch": 2.375886524822695, + "grad_norm": 2.992684841156006, + "learning_rate": 3.3380934601522087e-06, + "loss": 0.4423, + "step": 5025 + }, + { + "epoch": 2.376359338061466, + "grad_norm": 2.578420639038086, + "learning_rate": 3.3375057056526762e-06, + "loss": 0.3682, + "step": 5026 + }, + { + "epoch": 2.3768321513002366, + "grad_norm": 2.7683115005493164, + "learning_rate": 3.336917899005335e-06, + "loss": 0.4038, + "step": 5027 + }, + { + "epoch": 2.377304964539007, + "grad_norm": 2.838812828063965, + "learning_rate": 3.336330040246786e-06, + "loss": 0.442, + "step": 5028 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 2.766136646270752, + "learning_rate": 3.335742129413633e-06, + "loss": 0.4745, + "step": 5029 + }, + { + "epoch": 2.3782505910165486, + "grad_norm": 2.862656593322754, + "learning_rate": 3.3351541665424812e-06, + "loss": 0.4324, + "step": 5030 + }, + { + "epoch": 2.378723404255319, + "grad_norm": 2.71425199508667, + "learning_rate": 3.3345661516699433e-06, + "loss": 0.4013, + "step": 5031 + }, + { + "epoch": 2.3791962174940897, + "grad_norm": 2.8404030799865723, + "learning_rate": 3.333978084832629e-06, + "loss": 0.5038, + "step": 5032 + }, + { + "epoch": 2.3796690307328605, + "grad_norm": 2.965851068496704, + "learning_rate": 3.3333899660671574e-06, + "loss": 0.4668, + "step": 5033 + }, + { + "epoch": 2.3801418439716313, + "grad_norm": 2.686452627182007, + "learning_rate": 3.3328017954101464e-06, + "loss": 0.4167, + "step": 5034 + }, + { + "epoch": 2.380614657210402, + "grad_norm": 2.8676156997680664, + "learning_rate": 3.3322135728982197e-06, + "loss": 0.4531, + "step": 5035 + }, + { + "epoch": 2.3810874704491725, + "grad_norm": 2.4456300735473633, + "learning_rate": 3.3316252985680026e-06, + "loss": 0.4173, + "step": 5036 + }, + { + "epoch": 2.3815602836879433, + "grad_norm": 2.5472559928894043, + "learning_rate": 3.331036972456124e-06, + "loss": 0.3926, + "step": 5037 + }, + { + "epoch": 2.382033096926714, + "grad_norm": 2.81900954246521, + "learning_rate": 
3.330448594599218e-06, + "loss": 0.4785, + "step": 5038 + }, + { + "epoch": 2.3825059101654844, + "grad_norm": 3.0930590629577637, + "learning_rate": 3.329860165033919e-06, + "loss": 0.4587, + "step": 5039 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 3.0553040504455566, + "learning_rate": 3.3292716837968673e-06, + "loss": 0.5285, + "step": 5040 + }, + { + "epoch": 2.383451536643026, + "grad_norm": 2.577580690383911, + "learning_rate": 3.328683150924704e-06, + "loss": 0.4184, + "step": 5041 + }, + { + "epoch": 2.383924349881797, + "grad_norm": 2.6430366039276123, + "learning_rate": 3.3280945664540735e-06, + "loss": 0.4636, + "step": 5042 + }, + { + "epoch": 2.3843971631205676, + "grad_norm": 3.228360891342163, + "learning_rate": 3.3275059304216255e-06, + "loss": 0.455, + "step": 5043 + }, + { + "epoch": 2.384869976359338, + "grad_norm": 2.776142120361328, + "learning_rate": 3.3269172428640125e-06, + "loss": 0.4785, + "step": 5044 + }, + { + "epoch": 2.3853427895981087, + "grad_norm": 2.755671739578247, + "learning_rate": 3.3263285038178882e-06, + "loss": 0.4625, + "step": 5045 + }, + { + "epoch": 2.3858156028368795, + "grad_norm": 3.061004400253296, + "learning_rate": 3.3257397133199114e-06, + "loss": 0.4641, + "step": 5046 + }, + { + "epoch": 2.38628841607565, + "grad_norm": 2.8391458988189697, + "learning_rate": 3.3251508714067432e-06, + "loss": 0.5003, + "step": 5047 + }, + { + "epoch": 2.3867612293144207, + "grad_norm": 2.390810966491699, + "learning_rate": 3.324561978115049e-06, + "loss": 0.4446, + "step": 5048 + }, + { + "epoch": 2.3872340425531915, + "grad_norm": 2.7760825157165527, + "learning_rate": 3.323973033481496e-06, + "loss": 0.4443, + "step": 5049 + }, + { + "epoch": 2.3877068557919623, + "grad_norm": 3.157893419265747, + "learning_rate": 3.3233840375427552e-06, + "loss": 0.4934, + "step": 5050 + }, + { + "epoch": 2.388179669030733, + "grad_norm": 2.7245349884033203, + "learning_rate": 3.3227949903355e-06, + "loss": 0.4254, + "step": 5051 + }, + { + "epoch": 2.3886524822695034, + "grad_norm": 2.6674044132232666, + "learning_rate": 3.322205891896409e-06, + "loss": 0.4116, + "step": 5052 + }, + { + "epoch": 2.3891252955082742, + "grad_norm": 3.1490554809570312, + "learning_rate": 3.3216167422621627e-06, + "loss": 0.4604, + "step": 5053 + }, + { + "epoch": 2.389598108747045, + "grad_norm": 2.725731134414673, + "learning_rate": 3.321027541469444e-06, + "loss": 0.4836, + "step": 5054 + }, + { + "epoch": 2.3900709219858154, + "grad_norm": 2.5378828048706055, + "learning_rate": 3.3204382895549407e-06, + "loss": 0.4228, + "step": 5055 + }, + { + "epoch": 2.390543735224586, + "grad_norm": 2.8191192150115967, + "learning_rate": 3.3198489865553427e-06, + "loss": 0.4371, + "step": 5056 + }, + { + "epoch": 2.391016548463357, + "grad_norm": 2.5676498413085938, + "learning_rate": 3.3192596325073433e-06, + "loss": 0.4463, + "step": 5057 + }, + { + "epoch": 2.391489361702128, + "grad_norm": 3.0846121311187744, + "learning_rate": 3.3186702274476397e-06, + "loss": 0.5049, + "step": 5058 + }, + { + "epoch": 2.3919621749408986, + "grad_norm": 2.6085152626037598, + "learning_rate": 3.3180807714129293e-06, + "loss": 0.4376, + "step": 5059 + }, + { + "epoch": 2.392434988179669, + "grad_norm": 3.0218591690063477, + "learning_rate": 3.3174912644399172e-06, + "loss": 0.4734, + "step": 5060 + }, + { + "epoch": 2.3929078014184397, + "grad_norm": 2.5904781818389893, + "learning_rate": 3.316901706565308e-06, + "loss": 0.4924, + "step": 5061 + }, + { + "epoch": 2.3933806146572105, + "grad_norm": 
2.675478458404541, + "learning_rate": 3.3163120978258123e-06, + "loss": 0.4072, + "step": 5062 + }, + { + "epoch": 2.393853427895981, + "grad_norm": 2.7944445610046387, + "learning_rate": 3.3157224382581415e-06, + "loss": 0.4328, + "step": 5063 + }, + { + "epoch": 2.3943262411347517, + "grad_norm": 2.846224546432495, + "learning_rate": 3.315132727899012e-06, + "loss": 0.4447, + "step": 5064 + }, + { + "epoch": 2.3947990543735225, + "grad_norm": 2.6825828552246094, + "learning_rate": 3.3145429667851402e-06, + "loss": 0.4528, + "step": 5065 + }, + { + "epoch": 2.3952718676122933, + "grad_norm": 3.0305285453796387, + "learning_rate": 3.3139531549532505e-06, + "loss": 0.4538, + "step": 5066 + }, + { + "epoch": 2.395744680851064, + "grad_norm": 2.707540988922119, + "learning_rate": 3.313363292440067e-06, + "loss": 0.4412, + "step": 5067 + }, + { + "epoch": 2.3962174940898344, + "grad_norm": 3.0458385944366455, + "learning_rate": 3.3127733792823173e-06, + "loss": 0.4587, + "step": 5068 + }, + { + "epoch": 2.396690307328605, + "grad_norm": 2.7711992263793945, + "learning_rate": 3.312183415516733e-06, + "loss": 0.4157, + "step": 5069 + }, + { + "epoch": 2.397163120567376, + "grad_norm": 2.6953988075256348, + "learning_rate": 3.3115934011800494e-06, + "loss": 0.3828, + "step": 5070 + }, + { + "epoch": 2.3976359338061464, + "grad_norm": 3.033721923828125, + "learning_rate": 3.311003336309003e-06, + "loss": 0.5204, + "step": 5071 + }, + { + "epoch": 2.398108747044917, + "grad_norm": 2.6134517192840576, + "learning_rate": 3.3104132209403355e-06, + "loss": 0.4181, + "step": 5072 + }, + { + "epoch": 2.398581560283688, + "grad_norm": 2.8800251483917236, + "learning_rate": 3.30982305511079e-06, + "loss": 0.466, + "step": 5073 + }, + { + "epoch": 2.3990543735224588, + "grad_norm": 2.5043210983276367, + "learning_rate": 3.309232838857114e-06, + "loss": 0.4161, + "step": 5074 + }, + { + "epoch": 2.3995271867612296, + "grad_norm": 2.6577322483062744, + "learning_rate": 3.308642572216057e-06, + "loss": 0.465, + "step": 5075 + }, + { + "epoch": 2.4, + "grad_norm": 2.549098253250122, + "learning_rate": 3.3080522552243734e-06, + "loss": 0.4571, + "step": 5076 + }, + { + "epoch": 2.4004728132387707, + "grad_norm": 2.881958246231079, + "learning_rate": 3.3074618879188186e-06, + "loss": 0.4443, + "step": 5077 + }, + { + "epoch": 2.4009456264775415, + "grad_norm": 2.608397960662842, + "learning_rate": 3.3068714703361528e-06, + "loss": 0.3843, + "step": 5078 + }, + { + "epoch": 2.401418439716312, + "grad_norm": 2.8666789531707764, + "learning_rate": 3.306281002513139e-06, + "loss": 0.4857, + "step": 5079 + }, + { + "epoch": 2.4018912529550827, + "grad_norm": 2.9008588790893555, + "learning_rate": 3.3056904844865422e-06, + "loss": 0.4454, + "step": 5080 + }, + { + "epoch": 2.4023640661938535, + "grad_norm": 2.7446060180664062, + "learning_rate": 3.3050999162931315e-06, + "loss": 0.4522, + "step": 5081 + }, + { + "epoch": 2.4028368794326243, + "grad_norm": 2.787116765975952, + "learning_rate": 3.3045092979696804e-06, + "loss": 0.4714, + "step": 5082 + }, + { + "epoch": 2.403309692671395, + "grad_norm": 2.7494192123413086, + "learning_rate": 3.3039186295529613e-06, + "loss": 0.4107, + "step": 5083 + }, + { + "epoch": 2.4037825059101654, + "grad_norm": 2.733794927597046, + "learning_rate": 3.303327911079755e-06, + "loss": 0.4169, + "step": 5084 + }, + { + "epoch": 2.404255319148936, + "grad_norm": 2.7313334941864014, + "learning_rate": 3.3027371425868422e-06, + "loss": 0.4287, + "step": 5085 + }, + { + "epoch": 
2.404728132387707, + "grad_norm": 2.7832977771759033, + "learning_rate": 3.3021463241110075e-06, + "loss": 0.5307, + "step": 5086 + }, + { + "epoch": 2.4052009456264773, + "grad_norm": 2.6615281105041504, + "learning_rate": 3.301555455689038e-06, + "loss": 0.4519, + "step": 5087 + }, + { + "epoch": 2.405673758865248, + "grad_norm": 2.343921422958374, + "learning_rate": 3.3009645373577264e-06, + "loss": 0.46, + "step": 5088 + }, + { + "epoch": 2.406146572104019, + "grad_norm": 2.6115355491638184, + "learning_rate": 3.300373569153864e-06, + "loss": 0.4782, + "step": 5089 + }, + { + "epoch": 2.4066193853427897, + "grad_norm": 2.730625629425049, + "learning_rate": 3.299782551114249e-06, + "loss": 0.4632, + "step": 5090 + }, + { + "epoch": 2.40709219858156, + "grad_norm": 2.4495043754577637, + "learning_rate": 3.2991914832756824e-06, + "loss": 0.4243, + "step": 5091 + }, + { + "epoch": 2.407565011820331, + "grad_norm": 2.8731648921966553, + "learning_rate": 3.2986003656749654e-06, + "loss": 0.4262, + "step": 5092 + }, + { + "epoch": 2.4080378250591017, + "grad_norm": 2.870342969894409, + "learning_rate": 3.2980091983489053e-06, + "loss": 0.4735, + "step": 5093 + }, + { + "epoch": 2.4085106382978725, + "grad_norm": 2.500786542892456, + "learning_rate": 3.297417981334312e-06, + "loss": 0.4007, + "step": 5094 + }, + { + "epoch": 2.408983451536643, + "grad_norm": 2.7787322998046875, + "learning_rate": 3.2968267146679978e-06, + "loss": 0.493, + "step": 5095 + }, + { + "epoch": 2.4094562647754136, + "grad_norm": 2.5229599475860596, + "learning_rate": 3.2962353983867783e-06, + "loss": 0.3676, + "step": 5096 + }, + { + "epoch": 2.4099290780141844, + "grad_norm": 3.1955904960632324, + "learning_rate": 3.2956440325274715e-06, + "loss": 0.4888, + "step": 5097 + }, + { + "epoch": 2.4104018912529552, + "grad_norm": 2.8580288887023926, + "learning_rate": 3.2950526171268995e-06, + "loss": 0.4892, + "step": 5098 + }, + { + "epoch": 2.4108747044917256, + "grad_norm": 2.6321749687194824, + "learning_rate": 3.294461152221887e-06, + "loss": 0.3823, + "step": 5099 + }, + { + "epoch": 2.4113475177304964, + "grad_norm": 2.881127119064331, + "learning_rate": 3.293869637849263e-06, + "loss": 0.4569, + "step": 5100 + }, + { + "epoch": 2.411820330969267, + "grad_norm": 2.7742316722869873, + "learning_rate": 3.293278074045857e-06, + "loss": 0.4445, + "step": 5101 + }, + { + "epoch": 2.412293144208038, + "grad_norm": 2.546701431274414, + "learning_rate": 3.2926864608485037e-06, + "loss": 0.3995, + "step": 5102 + }, + { + "epoch": 2.4127659574468083, + "grad_norm": 2.588226318359375, + "learning_rate": 3.292094798294041e-06, + "loss": 0.4081, + "step": 5103 + }, + { + "epoch": 2.413238770685579, + "grad_norm": 2.968689441680908, + "learning_rate": 3.2915030864193077e-06, + "loss": 0.4475, + "step": 5104 + }, + { + "epoch": 2.41371158392435, + "grad_norm": 2.9249184131622314, + "learning_rate": 3.290911325261148e-06, + "loss": 0.4763, + "step": 5105 + }, + { + "epoch": 2.4141843971631207, + "grad_norm": 2.817596673965454, + "learning_rate": 3.2903195148564083e-06, + "loss": 0.4451, + "step": 5106 + }, + { + "epoch": 2.414657210401891, + "grad_norm": 2.6465954780578613, + "learning_rate": 3.2897276552419377e-06, + "loss": 0.4665, + "step": 5107 + }, + { + "epoch": 2.415130023640662, + "grad_norm": 2.8613853454589844, + "learning_rate": 3.2891357464545885e-06, + "loss": 0.4398, + "step": 5108 + }, + { + "epoch": 2.4156028368794327, + "grad_norm": 2.756321907043457, + "learning_rate": 3.2885437885312175e-06, + "loss": 0.4634, + 
"step": 5109 + }, + { + "epoch": 2.4160756501182035, + "grad_norm": 2.8965282440185547, + "learning_rate": 3.287951781508682e-06, + "loss": 0.4319, + "step": 5110 + }, + { + "epoch": 2.416548463356974, + "grad_norm": 2.896756172180176, + "learning_rate": 3.287359725423844e-06, + "loss": 0.4771, + "step": 5111 + }, + { + "epoch": 2.4170212765957446, + "grad_norm": 2.952911376953125, + "learning_rate": 3.286767620313569e-06, + "loss": 0.5026, + "step": 5112 + }, + { + "epoch": 2.4174940898345154, + "grad_norm": 3.850515604019165, + "learning_rate": 3.2861754662147234e-06, + "loss": 0.4387, + "step": 5113 + }, + { + "epoch": 2.417966903073286, + "grad_norm": 3.0072689056396484, + "learning_rate": 3.2855832631641794e-06, + "loss": 0.4586, + "step": 5114 + }, + { + "epoch": 2.4184397163120566, + "grad_norm": 3.166790246963501, + "learning_rate": 3.2849910111988092e-06, + "loss": 0.4842, + "step": 5115 + }, + { + "epoch": 2.4189125295508274, + "grad_norm": 3.5397679805755615, + "learning_rate": 3.284398710355492e-06, + "loss": 0.5138, + "step": 5116 + }, + { + "epoch": 2.419385342789598, + "grad_norm": 2.779609441757202, + "learning_rate": 3.283806360671106e-06, + "loss": 0.4049, + "step": 5117 + }, + { + "epoch": 2.419858156028369, + "grad_norm": 2.5924575328826904, + "learning_rate": 3.283213962182535e-06, + "loss": 0.433, + "step": 5118 + }, + { + "epoch": 2.4203309692671393, + "grad_norm": 2.7429699897766113, + "learning_rate": 3.282621514926665e-06, + "loss": 0.4674, + "step": 5119 + }, + { + "epoch": 2.42080378250591, + "grad_norm": 2.8113889694213867, + "learning_rate": 3.2820290189403846e-06, + "loss": 0.3898, + "step": 5120 + }, + { + "epoch": 2.421276595744681, + "grad_norm": 2.867105722427368, + "learning_rate": 3.2814364742605863e-06, + "loss": 0.4439, + "step": 5121 + }, + { + "epoch": 2.4217494089834517, + "grad_norm": 2.428597927093506, + "learning_rate": 3.2808438809241654e-06, + "loss": 0.4339, + "step": 5122 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 3.071735143661499, + "learning_rate": 3.2802512389680203e-06, + "loss": 0.4583, + "step": 5123 + }, + { + "epoch": 2.422695035460993, + "grad_norm": 3.046313762664795, + "learning_rate": 3.279658548429051e-06, + "loss": 0.5351, + "step": 5124 + }, + { + "epoch": 2.4231678486997636, + "grad_norm": 2.8412697315216064, + "learning_rate": 3.279065809344163e-06, + "loss": 0.5258, + "step": 5125 + }, + { + "epoch": 2.4236406619385344, + "grad_norm": 2.887169122695923, + "learning_rate": 3.278473021750263e-06, + "loss": 0.4568, + "step": 5126 + }, + { + "epoch": 2.424113475177305, + "grad_norm": 2.8316574096679688, + "learning_rate": 3.2778801856842624e-06, + "loss": 0.46, + "step": 5127 + }, + { + "epoch": 2.4245862884160756, + "grad_norm": 2.7660772800445557, + "learning_rate": 3.277287301183073e-06, + "loss": 0.4323, + "step": 5128 + }, + { + "epoch": 2.4250591016548464, + "grad_norm": 2.737682819366455, + "learning_rate": 3.276694368283611e-06, + "loss": 0.4296, + "step": 5129 + }, + { + "epoch": 2.425531914893617, + "grad_norm": 2.8807425498962402, + "learning_rate": 3.276101387022797e-06, + "loss": 0.4673, + "step": 5130 + }, + { + "epoch": 2.4260047281323875, + "grad_norm": 2.530526876449585, + "learning_rate": 3.275508357437552e-06, + "loss": 0.416, + "step": 5131 + }, + { + "epoch": 2.4264775413711583, + "grad_norm": 3.1189746856689453, + "learning_rate": 3.274915279564803e-06, + "loss": 0.4171, + "step": 5132 + }, + { + "epoch": 2.426950354609929, + "grad_norm": 2.6612462997436523, + "learning_rate": 
3.274322153441477e-06, + "loss": 0.4104, + "step": 5133 + }, + { + "epoch": 2.4274231678487, + "grad_norm": 2.717973470687866, + "learning_rate": 3.2737289791045064e-06, + "loss": 0.479, + "step": 5134 + }, + { + "epoch": 2.4278959810874703, + "grad_norm": 2.764216661453247, + "learning_rate": 3.2731357565908247e-06, + "loss": 0.481, + "step": 5135 + }, + { + "epoch": 2.428368794326241, + "grad_norm": 2.5081393718719482, + "learning_rate": 3.272542485937369e-06, + "loss": 0.4592, + "step": 5136 + }, + { + "epoch": 2.428841607565012, + "grad_norm": 3.1380364894866943, + "learning_rate": 3.271949167181081e-06, + "loss": 0.4179, + "step": 5137 + }, + { + "epoch": 2.4293144208037827, + "grad_norm": 2.9275963306427, + "learning_rate": 3.2713558003589026e-06, + "loss": 0.5196, + "step": 5138 + }, + { + "epoch": 2.429787234042553, + "grad_norm": 2.8215506076812744, + "learning_rate": 3.270762385507781e-06, + "loss": 0.4081, + "step": 5139 + }, + { + "epoch": 2.430260047281324, + "grad_norm": 2.9185614585876465, + "learning_rate": 3.270168922664665e-06, + "loss": 0.4936, + "step": 5140 + }, + { + "epoch": 2.4307328605200946, + "grad_norm": 2.6507248878479004, + "learning_rate": 3.269575411866507e-06, + "loss": 0.4834, + "step": 5141 + }, + { + "epoch": 2.4312056737588654, + "grad_norm": 2.864741563796997, + "learning_rate": 3.2689818531502637e-06, + "loss": 0.4562, + "step": 5142 + }, + { + "epoch": 2.431678486997636, + "grad_norm": 2.806919813156128, + "learning_rate": 3.2683882465528917e-06, + "loss": 0.4645, + "step": 5143 + }, + { + "epoch": 2.4321513002364066, + "grad_norm": 2.733372211456299, + "learning_rate": 3.267794592111353e-06, + "loss": 0.4123, + "step": 5144 + }, + { + "epoch": 2.4326241134751774, + "grad_norm": 2.8005833625793457, + "learning_rate": 3.2672008898626116e-06, + "loss": 0.4343, + "step": 5145 + }, + { + "epoch": 2.433096926713948, + "grad_norm": 3.2339670658111572, + "learning_rate": 3.2666071398436354e-06, + "loss": 0.4017, + "step": 5146 + }, + { + "epoch": 2.4335697399527185, + "grad_norm": 2.510251760482788, + "learning_rate": 3.2660133420913932e-06, + "loss": 0.3882, + "step": 5147 + }, + { + "epoch": 2.4340425531914893, + "grad_norm": 3.5633628368377686, + "learning_rate": 3.26541949664286e-06, + "loss": 0.4766, + "step": 5148 + }, + { + "epoch": 2.43451536643026, + "grad_norm": 2.8246724605560303, + "learning_rate": 3.26482560353501e-06, + "loss": 0.3728, + "step": 5149 + }, + { + "epoch": 2.434988179669031, + "grad_norm": 2.4923641681671143, + "learning_rate": 3.264231662804823e-06, + "loss": 0.4346, + "step": 5150 + }, + { + "epoch": 2.4354609929078013, + "grad_norm": 3.180874824523926, + "learning_rate": 3.2636376744892827e-06, + "loss": 0.4351, + "step": 5151 + }, + { + "epoch": 2.435933806146572, + "grad_norm": 2.6933515071868896, + "learning_rate": 3.263043638625373e-06, + "loss": 0.4293, + "step": 5152 + }, + { + "epoch": 2.436406619385343, + "grad_norm": 2.584132194519043, + "learning_rate": 3.262449555250081e-06, + "loss": 0.4589, + "step": 5153 + }, + { + "epoch": 2.4368794326241137, + "grad_norm": 2.8103036880493164, + "learning_rate": 3.2618554244003985e-06, + "loss": 0.463, + "step": 5154 + }, + { + "epoch": 2.437352245862884, + "grad_norm": 2.809070587158203, + "learning_rate": 3.2612612461133197e-06, + "loss": 0.4629, + "step": 5155 + }, + { + "epoch": 2.437825059101655, + "grad_norm": 2.98148512840271, + "learning_rate": 3.2606670204258405e-06, + "loss": 0.451, + "step": 5156 + }, + { + "epoch": 2.4382978723404256, + "grad_norm": 
2.691047191619873, + "learning_rate": 3.2600727473749614e-06, + "loss": 0.3878, + "step": 5157 + }, + { + "epoch": 2.4387706855791964, + "grad_norm": 2.900360345840454, + "learning_rate": 3.2594784269976856e-06, + "loss": 0.4216, + "step": 5158 + }, + { + "epoch": 2.4392434988179668, + "grad_norm": 2.8449952602386475, + "learning_rate": 3.258884059331019e-06, + "loss": 0.4268, + "step": 5159 + }, + { + "epoch": 2.4397163120567376, + "grad_norm": 2.7226388454437256, + "learning_rate": 3.258289644411969e-06, + "loss": 0.4381, + "step": 5160 + }, + { + "epoch": 2.4401891252955084, + "grad_norm": 2.513946056365967, + "learning_rate": 3.257695182277547e-06, + "loss": 0.4566, + "step": 5161 + }, + { + "epoch": 2.440661938534279, + "grad_norm": 2.9941394329071045, + "learning_rate": 3.2571006729647693e-06, + "loss": 0.4395, + "step": 5162 + }, + { + "epoch": 2.4411347517730495, + "grad_norm": 2.699094533920288, + "learning_rate": 3.2565061165106523e-06, + "loss": 0.4274, + "step": 5163 + }, + { + "epoch": 2.4416075650118203, + "grad_norm": 2.574193000793457, + "learning_rate": 3.255911512952216e-06, + "loss": 0.4187, + "step": 5164 + }, + { + "epoch": 2.442080378250591, + "grad_norm": 2.920766592025757, + "learning_rate": 3.2553168623264854e-06, + "loss": 0.4911, + "step": 5165 + }, + { + "epoch": 2.4425531914893615, + "grad_norm": 2.728421926498413, + "learning_rate": 3.2547221646704853e-06, + "loss": 0.4466, + "step": 5166 + }, + { + "epoch": 2.4430260047281322, + "grad_norm": 2.8171417713165283, + "learning_rate": 3.254127420021246e-06, + "loss": 0.4331, + "step": 5167 + }, + { + "epoch": 2.443498817966903, + "grad_norm": 2.4069135189056396, + "learning_rate": 3.2535326284157975e-06, + "loss": 0.389, + "step": 5168 + }, + { + "epoch": 2.443971631205674, + "grad_norm": 2.912405490875244, + "learning_rate": 3.2529377898911777e-06, + "loss": 0.4681, + "step": 5169 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 2.987558126449585, + "learning_rate": 3.2523429044844228e-06, + "loss": 0.4715, + "step": 5170 + }, + { + "epoch": 2.444917257683215, + "grad_norm": 2.5117199420928955, + "learning_rate": 3.251747972232574e-06, + "loss": 0.4531, + "step": 5171 + }, + { + "epoch": 2.445390070921986, + "grad_norm": 2.5405385494232178, + "learning_rate": 3.2511529931726752e-06, + "loss": 0.4323, + "step": 5172 + }, + { + "epoch": 2.4458628841607566, + "grad_norm": 2.989932060241699, + "learning_rate": 3.250557967341773e-06, + "loss": 0.4039, + "step": 5173 + }, + { + "epoch": 2.446335697399527, + "grad_norm": 2.6331627368927, + "learning_rate": 3.2499628947769186e-06, + "loss": 0.5147, + "step": 5174 + }, + { + "epoch": 2.4468085106382977, + "grad_norm": 2.71699857711792, + "learning_rate": 3.249367775515162e-06, + "loss": 0.3748, + "step": 5175 + }, + { + "epoch": 2.4472813238770685, + "grad_norm": 2.9508471488952637, + "learning_rate": 3.2487726095935606e-06, + "loss": 0.5145, + "step": 5176 + }, + { + "epoch": 2.4477541371158393, + "grad_norm": 2.8276431560516357, + "learning_rate": 3.2481773970491713e-06, + "loss": 0.4295, + "step": 5177 + }, + { + "epoch": 2.44822695035461, + "grad_norm": 2.5500540733337402, + "learning_rate": 3.2475821379190565e-06, + "loss": 0.4246, + "step": 5178 + }, + { + "epoch": 2.4486997635933805, + "grad_norm": 2.845641613006592, + "learning_rate": 3.246986832240281e-06, + "loss": 0.4211, + "step": 5179 + }, + { + "epoch": 2.4491725768321513, + "grad_norm": 3.1215856075286865, + "learning_rate": 3.2463914800499097e-06, + "loss": 0.4378, + "step": 5180 + }, + { + "epoch": 
2.449645390070922, + "grad_norm": 2.4685606956481934, + "learning_rate": 3.2457960813850137e-06, + "loss": 0.4836, + "step": 5181 + }, + { + "epoch": 2.4501182033096924, + "grad_norm": 2.508028268814087, + "learning_rate": 3.245200636282666e-06, + "loss": 0.4377, + "step": 5182 + }, + { + "epoch": 2.4505910165484632, + "grad_norm": 2.899949312210083, + "learning_rate": 3.244605144779943e-06, + "loss": 0.501, + "step": 5183 + }, + { + "epoch": 2.451063829787234, + "grad_norm": 2.6494483947753906, + "learning_rate": 3.244009606913923e-06, + "loss": 0.4255, + "step": 5184 + }, + { + "epoch": 2.451536643026005, + "grad_norm": 2.4363760948181152, + "learning_rate": 3.243414022721686e-06, + "loss": 0.4402, + "step": 5185 + }, + { + "epoch": 2.4520094562647756, + "grad_norm": 2.4725022315979004, + "learning_rate": 3.242818392240317e-06, + "loss": 0.4388, + "step": 5186 + }, + { + "epoch": 2.452482269503546, + "grad_norm": 2.7010514736175537, + "learning_rate": 3.242222715506905e-06, + "loss": 0.4388, + "step": 5187 + }, + { + "epoch": 2.4529550827423168, + "grad_norm": 2.811464548110962, + "learning_rate": 3.241626992558539e-06, + "loss": 0.4634, + "step": 5188 + }, + { + "epoch": 2.4534278959810876, + "grad_norm": 2.6473052501678467, + "learning_rate": 3.2410312234323123e-06, + "loss": 0.4752, + "step": 5189 + }, + { + "epoch": 2.453900709219858, + "grad_norm": 2.5587213039398193, + "learning_rate": 3.24043540816532e-06, + "loss": 0.4458, + "step": 5190 + }, + { + "epoch": 2.4543735224586287, + "grad_norm": 2.6306557655334473, + "learning_rate": 3.239839546794662e-06, + "loss": 0.4081, + "step": 5191 + }, + { + "epoch": 2.4548463356973995, + "grad_norm": 2.4613633155822754, + "learning_rate": 3.23924363935744e-06, + "loss": 0.4165, + "step": 5192 + }, + { + "epoch": 2.4553191489361703, + "grad_norm": 2.7189204692840576, + "learning_rate": 3.238647685890757e-06, + "loss": 0.4822, + "step": 5193 + }, + { + "epoch": 2.455791962174941, + "grad_norm": 3.015977382659912, + "learning_rate": 3.238051686431722e-06, + "loss": 0.4964, + "step": 5194 + }, + { + "epoch": 2.4562647754137115, + "grad_norm": 2.8868937492370605, + "learning_rate": 3.2374556410174445e-06, + "loss": 0.4514, + "step": 5195 + }, + { + "epoch": 2.4567375886524823, + "grad_norm": 2.7959537506103516, + "learning_rate": 3.2368595496850375e-06, + "loss": 0.475, + "step": 5196 + }, + { + "epoch": 2.457210401891253, + "grad_norm": 3.0086777210235596, + "learning_rate": 3.2362634124716187e-06, + "loss": 0.4913, + "step": 5197 + }, + { + "epoch": 2.4576832151300234, + "grad_norm": 2.621335506439209, + "learning_rate": 3.2356672294143044e-06, + "loss": 0.4259, + "step": 5198 + }, + { + "epoch": 2.458156028368794, + "grad_norm": 3.1620380878448486, + "learning_rate": 3.235071000550218e-06, + "loss": 0.451, + "step": 5199 + }, + { + "epoch": 2.458628841607565, + "grad_norm": 2.7663278579711914, + "learning_rate": 3.234474725916484e-06, + "loss": 0.3854, + "step": 5200 + }, + { + "epoch": 2.459101654846336, + "grad_norm": 2.5187132358551025, + "learning_rate": 3.2338784055502288e-06, + "loss": 0.4068, + "step": 5201 + }, + { + "epoch": 2.4595744680851066, + "grad_norm": 2.6022701263427734, + "learning_rate": 3.233282039488583e-06, + "loss": 0.4484, + "step": 5202 + }, + { + "epoch": 2.460047281323877, + "grad_norm": 2.874750852584839, + "learning_rate": 3.2326856277686807e-06, + "loss": 0.45, + "step": 5203 + }, + { + "epoch": 2.4605200945626478, + "grad_norm": 2.671008586883545, + "learning_rate": 3.232089170427656e-06, + "loss": 0.4446, + 
"step": 5204 + }, + { + "epoch": 2.4609929078014185, + "grad_norm": 2.7365503311157227, + "learning_rate": 3.2314926675026498e-06, + "loss": 0.4402, + "step": 5205 + }, + { + "epoch": 2.461465721040189, + "grad_norm": 2.8163657188415527, + "learning_rate": 3.230896119030803e-06, + "loss": 0.3881, + "step": 5206 + }, + { + "epoch": 2.4619385342789597, + "grad_norm": 2.812433958053589, + "learning_rate": 3.2302995250492584e-06, + "loss": 0.4897, + "step": 5207 + }, + { + "epoch": 2.4624113475177305, + "grad_norm": 2.786033868789673, + "learning_rate": 3.2297028855951664e-06, + "loss": 0.4069, + "step": 5208 + }, + { + "epoch": 2.4628841607565013, + "grad_norm": 3.0247974395751953, + "learning_rate": 3.229106200705674e-06, + "loss": 0.4048, + "step": 5209 + }, + { + "epoch": 2.463356973995272, + "grad_norm": 3.3280487060546875, + "learning_rate": 3.2285094704179353e-06, + "loss": 0.5613, + "step": 5210 + }, + { + "epoch": 2.4638297872340424, + "grad_norm": 2.603219985961914, + "learning_rate": 3.2279126947691073e-06, + "loss": 0.432, + "step": 5211 + }, + { + "epoch": 2.4643026004728132, + "grad_norm": 3.1532180309295654, + "learning_rate": 3.2273158737963472e-06, + "loss": 0.4602, + "step": 5212 + }, + { + "epoch": 2.464775413711584, + "grad_norm": 2.7512969970703125, + "learning_rate": 3.2267190075368164e-06, + "loss": 0.5064, + "step": 5213 + }, + { + "epoch": 2.4652482269503544, + "grad_norm": 2.926992177963257, + "learning_rate": 3.22612209602768e-06, + "loss": 0.4753, + "step": 5214 + }, + { + "epoch": 2.465721040189125, + "grad_norm": 4.052840709686279, + "learning_rate": 3.2255251393061047e-06, + "loss": 0.5235, + "step": 5215 + }, + { + "epoch": 2.466193853427896, + "grad_norm": 2.8266959190368652, + "learning_rate": 3.2249281374092606e-06, + "loss": 0.3931, + "step": 5216 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 2.564359426498413, + "learning_rate": 3.2243310903743196e-06, + "loss": 0.4146, + "step": 5217 + }, + { + "epoch": 2.4671394799054376, + "grad_norm": 2.387925148010254, + "learning_rate": 3.2237339982384576e-06, + "loss": 0.4142, + "step": 5218 + }, + { + "epoch": 2.467612293144208, + "grad_norm": 2.7045164108276367, + "learning_rate": 3.223136861038853e-06, + "loss": 0.4345, + "step": 5219 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 2.6963284015655518, + "learning_rate": 3.2225396788126872e-06, + "loss": 0.4243, + "step": 5220 + }, + { + "epoch": 2.4685579196217495, + "grad_norm": 2.8247268199920654, + "learning_rate": 3.221942451597144e-06, + "loss": 0.3919, + "step": 5221 + }, + { + "epoch": 2.46903073286052, + "grad_norm": 3.843836784362793, + "learning_rate": 3.2213451794294093e-06, + "loss": 0.4183, + "step": 5222 + }, + { + "epoch": 2.4695035460992907, + "grad_norm": 2.8579909801483154, + "learning_rate": 3.220747862346674e-06, + "loss": 0.4844, + "step": 5223 + }, + { + "epoch": 2.4699763593380615, + "grad_norm": 3.744027853012085, + "learning_rate": 3.2201505003861294e-06, + "loss": 0.4563, + "step": 5224 + }, + { + "epoch": 2.4704491725768323, + "grad_norm": 2.835108995437622, + "learning_rate": 3.219553093584971e-06, + "loss": 0.4394, + "step": 5225 + }, + { + "epoch": 2.470921985815603, + "grad_norm": 2.5681865215301514, + "learning_rate": 3.218955641980397e-06, + "loss": 0.3907, + "step": 5226 + }, + { + "epoch": 2.4713947990543734, + "grad_norm": 2.963172674179077, + "learning_rate": 3.2183581456096067e-06, + "loss": 0.5163, + "step": 5227 + }, + { + "epoch": 2.4718676122931442, + "grad_norm": 2.7840685844421387, + "learning_rate": 
3.2177606045098047e-06, + "loss": 0.411, + "step": 5228 + }, + { + "epoch": 2.472340425531915, + "grad_norm": 2.7849979400634766, + "learning_rate": 3.2171630187181977e-06, + "loss": 0.4671, + "step": 5229 + }, + { + "epoch": 2.4728132387706854, + "grad_norm": 2.736406087875366, + "learning_rate": 3.216565388271994e-06, + "loss": 0.5225, + "step": 5230 + }, + { + "epoch": 2.473286052009456, + "grad_norm": 2.978271007537842, + "learning_rate": 3.215967713208406e-06, + "loss": 0.4668, + "step": 5231 + }, + { + "epoch": 2.473758865248227, + "grad_norm": 2.687560796737671, + "learning_rate": 3.2153699935646475e-06, + "loss": 0.4683, + "step": 5232 + }, + { + "epoch": 2.4742316784869978, + "grad_norm": 2.7096521854400635, + "learning_rate": 3.214772229377936e-06, + "loss": 0.4999, + "step": 5233 + }, + { + "epoch": 2.4747044917257686, + "grad_norm": 3.1861157417297363, + "learning_rate": 3.214174420685493e-06, + "loss": 0.4365, + "step": 5234 + }, + { + "epoch": 2.475177304964539, + "grad_norm": 2.623061418533325, + "learning_rate": 3.2135765675245394e-06, + "loss": 0.3717, + "step": 5235 + }, + { + "epoch": 2.4756501182033097, + "grad_norm": 2.680921792984009, + "learning_rate": 3.2129786699323016e-06, + "loss": 0.4688, + "step": 5236 + }, + { + "epoch": 2.4761229314420805, + "grad_norm": 2.80426025390625, + "learning_rate": 3.2123807279460096e-06, + "loss": 0.5043, + "step": 5237 + }, + { + "epoch": 2.476595744680851, + "grad_norm": 2.676156997680664, + "learning_rate": 3.211782741602893e-06, + "loss": 0.4486, + "step": 5238 + }, + { + "epoch": 2.4770685579196217, + "grad_norm": 2.700822591781616, + "learning_rate": 3.2111847109401855e-06, + "loss": 0.4097, + "step": 5239 + }, + { + "epoch": 2.4775413711583925, + "grad_norm": 2.735387086868286, + "learning_rate": 3.2105866359951254e-06, + "loss": 0.4357, + "step": 5240 + }, + { + "epoch": 2.4780141843971633, + "grad_norm": 2.961874485015869, + "learning_rate": 3.2099885168049507e-06, + "loss": 0.4942, + "step": 5241 + }, + { + "epoch": 2.478486997635934, + "grad_norm": 2.546588659286499, + "learning_rate": 3.209390353406904e-06, + "loss": 0.3852, + "step": 5242 + }, + { + "epoch": 2.4789598108747044, + "grad_norm": 2.6269772052764893, + "learning_rate": 3.208792145838231e-06, + "loss": 0.3935, + "step": 5243 + }, + { + "epoch": 2.479432624113475, + "grad_norm": 2.9009883403778076, + "learning_rate": 3.208193894136179e-06, + "loss": 0.4003, + "step": 5244 + }, + { + "epoch": 2.479905437352246, + "grad_norm": 2.772834300994873, + "learning_rate": 3.2075955983379982e-06, + "loss": 0.4742, + "step": 5245 + }, + { + "epoch": 2.4803782505910164, + "grad_norm": 2.728703737258911, + "learning_rate": 3.2069972584809423e-06, + "loss": 0.4405, + "step": 5246 + }, + { + "epoch": 2.480851063829787, + "grad_norm": 2.72868275642395, + "learning_rate": 3.206398874602268e-06, + "loss": 0.4714, + "step": 5247 + }, + { + "epoch": 2.481323877068558, + "grad_norm": 2.6804213523864746, + "learning_rate": 3.2058004467392323e-06, + "loss": 0.4106, + "step": 5248 + }, + { + "epoch": 2.4817966903073287, + "grad_norm": 2.6740739345550537, + "learning_rate": 3.205201974929098e-06, + "loss": 0.3855, + "step": 5249 + }, + { + "epoch": 2.482269503546099, + "grad_norm": 2.8131754398345947, + "learning_rate": 3.204603459209129e-06, + "loss": 0.418, + "step": 5250 + }, + { + "epoch": 2.48274231678487, + "grad_norm": 2.5242888927459717, + "learning_rate": 3.204004899616592e-06, + "loss": 0.4914, + "step": 5251 + }, + { + "epoch": 2.4832151300236407, + "grad_norm": 
2.969191551208496, + "learning_rate": 3.2034062961887567e-06, + "loss": 0.4634, + "step": 5252 + }, + { + "epoch": 2.4836879432624115, + "grad_norm": 2.967968463897705, + "learning_rate": 3.2028076489628963e-06, + "loss": 0.456, + "step": 5253 + }, + { + "epoch": 2.484160756501182, + "grad_norm": 2.9006540775299072, + "learning_rate": 3.2022089579762845e-06, + "loss": 0.4203, + "step": 5254 + }, + { + "epoch": 2.4846335697399526, + "grad_norm": 2.6377336978912354, + "learning_rate": 3.2016102232662003e-06, + "loss": 0.4518, + "step": 5255 + }, + { + "epoch": 2.4851063829787234, + "grad_norm": 2.757749319076538, + "learning_rate": 3.201011444869925e-06, + "loss": 0.4314, + "step": 5256 + }, + { + "epoch": 2.4855791962174942, + "grad_norm": 2.571560859680176, + "learning_rate": 3.20041262282474e-06, + "loss": 0.427, + "step": 5257 + }, + { + "epoch": 2.4860520094562646, + "grad_norm": 3.1367194652557373, + "learning_rate": 3.1998137571679316e-06, + "loss": 0.4901, + "step": 5258 + }, + { + "epoch": 2.4865248226950354, + "grad_norm": 3.194042205810547, + "learning_rate": 3.1992148479367896e-06, + "loss": 0.466, + "step": 5259 + }, + { + "epoch": 2.486997635933806, + "grad_norm": 2.5546324253082275, + "learning_rate": 3.1986158951686052e-06, + "loss": 0.4182, + "step": 5260 + }, + { + "epoch": 2.487470449172577, + "grad_norm": 2.919783115386963, + "learning_rate": 3.198016898900672e-06, + "loss": 0.4234, + "step": 5261 + }, + { + "epoch": 2.4879432624113473, + "grad_norm": 2.865248918533325, + "learning_rate": 3.1974178591702877e-06, + "loss": 0.4291, + "step": 5262 + }, + { + "epoch": 2.488416075650118, + "grad_norm": 2.685737133026123, + "learning_rate": 3.196818776014752e-06, + "loss": 0.4548, + "step": 5263 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 2.826974630355835, + "learning_rate": 3.196219649471365e-06, + "loss": 0.4152, + "step": 5264 + }, + { + "epoch": 2.4893617021276597, + "grad_norm": 2.764975070953369, + "learning_rate": 3.1956204795774336e-06, + "loss": 0.5209, + "step": 5265 + }, + { + "epoch": 2.48983451536643, + "grad_norm": 2.4184255599975586, + "learning_rate": 3.1950212663702662e-06, + "loss": 0.3969, + "step": 5266 + }, + { + "epoch": 2.490307328605201, + "grad_norm": 2.9361133575439453, + "learning_rate": 3.1944220098871713e-06, + "loss": 0.4589, + "step": 5267 + }, + { + "epoch": 2.4907801418439717, + "grad_norm": 2.377051830291748, + "learning_rate": 3.193822710165463e-06, + "loss": 0.4328, + "step": 5268 + }, + { + "epoch": 2.4912529550827425, + "grad_norm": 3.1302497386932373, + "learning_rate": 3.1932233672424563e-06, + "loss": 0.3918, + "step": 5269 + }, + { + "epoch": 2.491725768321513, + "grad_norm": 2.89577579498291, + "learning_rate": 3.192623981155471e-06, + "loss": 0.5004, + "step": 5270 + }, + { + "epoch": 2.4921985815602836, + "grad_norm": 2.7735235691070557, + "learning_rate": 3.1920245519418273e-06, + "loss": 0.4206, + "step": 5271 + }, + { + "epoch": 2.4926713947990544, + "grad_norm": 2.5424516201019287, + "learning_rate": 3.1914250796388493e-06, + "loss": 0.4419, + "step": 5272 + }, + { + "epoch": 2.493144208037825, + "grad_norm": 3.1216981410980225, + "learning_rate": 3.1908255642838628e-06, + "loss": 0.4552, + "step": 5273 + }, + { + "epoch": 2.4936170212765956, + "grad_norm": 3.044045925140381, + "learning_rate": 3.1902260059141978e-06, + "loss": 0.4967, + "step": 5274 + }, + { + "epoch": 2.4940898345153664, + "grad_norm": 2.5630741119384766, + "learning_rate": 3.189626404567186e-06, + "loss": 0.3908, + "step": 5275 + }, + { + "epoch": 
2.494562647754137, + "grad_norm": 2.7177648544311523, + "learning_rate": 3.189026760280162e-06, + "loss": 0.4915, + "step": 5276 + }, + { + "epoch": 2.495035460992908, + "grad_norm": 2.653416395187378, + "learning_rate": 3.1884270730904632e-06, + "loss": 0.4633, + "step": 5277 + }, + { + "epoch": 2.4955082742316783, + "grad_norm": 3.7212321758270264, + "learning_rate": 3.1878273430354284e-06, + "loss": 0.4549, + "step": 5278 + }, + { + "epoch": 2.495981087470449, + "grad_norm": 2.4152729511260986, + "learning_rate": 3.187227570152402e-06, + "loss": 0.4674, + "step": 5279 + }, + { + "epoch": 2.49645390070922, + "grad_norm": 2.5354862213134766, + "learning_rate": 3.1866277544787284e-06, + "loss": 0.4135, + "step": 5280 + }, + { + "epoch": 2.4969267139479907, + "grad_norm": 3.1766583919525146, + "learning_rate": 3.186027896051754e-06, + "loss": 0.5656, + "step": 5281 + }, + { + "epoch": 2.497399527186761, + "grad_norm": 2.5636754035949707, + "learning_rate": 3.1854279949088313e-06, + "loss": 0.4138, + "step": 5282 + }, + { + "epoch": 2.497872340425532, + "grad_norm": 2.7615602016448975, + "learning_rate": 3.1848280510873124e-06, + "loss": 0.4936, + "step": 5283 + }, + { + "epoch": 2.4983451536643027, + "grad_norm": 2.964721918106079, + "learning_rate": 3.1842280646245543e-06, + "loss": 0.4865, + "step": 5284 + }, + { + "epoch": 2.4988179669030735, + "grad_norm": 2.6915178298950195, + "learning_rate": 3.1836280355579152e-06, + "loss": 0.4179, + "step": 5285 + }, + { + "epoch": 2.499290780141844, + "grad_norm": 2.820451259613037, + "learning_rate": 3.183027963924755e-06, + "loss": 0.4785, + "step": 5286 + }, + { + "epoch": 2.4997635933806146, + "grad_norm": 2.841719627380371, + "learning_rate": 3.1824278497624393e-06, + "loss": 0.4535, + "step": 5287 + }, + { + "epoch": 2.5002364066193854, + "grad_norm": 2.459167957305908, + "learning_rate": 3.181827693108333e-06, + "loss": 0.4353, + "step": 5288 + }, + { + "epoch": 2.500709219858156, + "grad_norm": 3.2538363933563232, + "learning_rate": 3.1812274939998066e-06, + "loss": 0.4037, + "step": 5289 + }, + { + "epoch": 2.5011820330969265, + "grad_norm": 2.6980504989624023, + "learning_rate": 3.180627252474231e-06, + "loss": 0.4181, + "step": 5290 + }, + { + "epoch": 2.5016548463356973, + "grad_norm": 2.9400012493133545, + "learning_rate": 3.1800269685689804e-06, + "loss": 0.4642, + "step": 5291 + }, + { + "epoch": 2.502127659574468, + "grad_norm": 2.7832958698272705, + "learning_rate": 3.1794266423214328e-06, + "loss": 0.3936, + "step": 5292 + }, + { + "epoch": 2.5026004728132385, + "grad_norm": 2.4017868041992188, + "learning_rate": 3.178826273768967e-06, + "loss": 0.3984, + "step": 5293 + }, + { + "epoch": 2.5030732860520093, + "grad_norm": 2.398120641708374, + "learning_rate": 3.1782258629489665e-06, + "loss": 0.4219, + "step": 5294 + }, + { + "epoch": 2.50354609929078, + "grad_norm": 2.973947763442993, + "learning_rate": 3.177625409898815e-06, + "loss": 0.4192, + "step": 5295 + }, + { + "epoch": 2.504018912529551, + "grad_norm": 3.1169888973236084, + "learning_rate": 3.1770249146559006e-06, + "loss": 0.5098, + "step": 5296 + }, + { + "epoch": 2.5044917257683217, + "grad_norm": 2.816964864730835, + "learning_rate": 3.1764243772576132e-06, + "loss": 0.4228, + "step": 5297 + }, + { + "epoch": 2.504964539007092, + "grad_norm": 2.5624163150787354, + "learning_rate": 3.1758237977413452e-06, + "loss": 0.4389, + "step": 5298 + }, + { + "epoch": 2.505437352245863, + "grad_norm": 2.7477777004241943, + "learning_rate": 3.175223176144494e-06, + "loss": 
0.4564, + "step": 5299 + }, + { + "epoch": 2.5059101654846336, + "grad_norm": 3.1478309631347656, + "learning_rate": 3.174622512504456e-06, + "loss": 0.4859, + "step": 5300 + }, + { + "epoch": 2.506382978723404, + "grad_norm": 2.8400418758392334, + "learning_rate": 3.1740218068586315e-06, + "loss": 0.4476, + "step": 5301 + }, + { + "epoch": 2.506855791962175, + "grad_norm": 2.7097036838531494, + "learning_rate": 3.173421059244426e-06, + "loss": 0.4559, + "step": 5302 + }, + { + "epoch": 2.5073286052009456, + "grad_norm": 2.864760637283325, + "learning_rate": 3.172820269699243e-06, + "loss": 0.5124, + "step": 5303 + }, + { + "epoch": 2.5078014184397164, + "grad_norm": 2.877110004425049, + "learning_rate": 3.1722194382604926e-06, + "loss": 0.5083, + "step": 5304 + }, + { + "epoch": 2.508274231678487, + "grad_norm": 3.2369656562805176, + "learning_rate": 3.1716185649655844e-06, + "loss": 0.4894, + "step": 5305 + }, + { + "epoch": 2.5087470449172575, + "grad_norm": 2.7377753257751465, + "learning_rate": 3.171017649851934e-06, + "loss": 0.4324, + "step": 5306 + }, + { + "epoch": 2.5092198581560283, + "grad_norm": 2.883364200592041, + "learning_rate": 3.1704166929569564e-06, + "loss": 0.3731, + "step": 5307 + }, + { + "epoch": 2.509692671394799, + "grad_norm": 2.5724737644195557, + "learning_rate": 3.1698156943180716e-06, + "loss": 0.4768, + "step": 5308 + }, + { + "epoch": 2.5101654846335695, + "grad_norm": 2.7532460689544678, + "learning_rate": 3.1692146539727e-06, + "loss": 0.4385, + "step": 5309 + }, + { + "epoch": 2.5106382978723403, + "grad_norm": 2.786505699157715, + "learning_rate": 3.168613571958267e-06, + "loss": 0.4241, + "step": 5310 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 3.1674118041992188, + "learning_rate": 3.1680124483121975e-06, + "loss": 0.4445, + "step": 5311 + }, + { + "epoch": 2.511583924349882, + "grad_norm": 2.7861545085906982, + "learning_rate": 3.167411283071923e-06, + "loss": 0.4264, + "step": 5312 + }, + { + "epoch": 2.5120567375886527, + "grad_norm": 2.7412493228912354, + "learning_rate": 3.1668100762748745e-06, + "loss": 0.4725, + "step": 5313 + }, + { + "epoch": 2.512529550827423, + "grad_norm": 2.710019588470459, + "learning_rate": 3.1662088279584858e-06, + "loss": 0.5207, + "step": 5314 + }, + { + "epoch": 2.513002364066194, + "grad_norm": 2.694812297821045, + "learning_rate": 3.165607538160194e-06, + "loss": 0.3666, + "step": 5315 + }, + { + "epoch": 2.5134751773049646, + "grad_norm": 2.4390623569488525, + "learning_rate": 3.1650062069174405e-06, + "loss": 0.4025, + "step": 5316 + }, + { + "epoch": 2.513947990543735, + "grad_norm": 3.055738925933838, + "learning_rate": 3.1644048342676663e-06, + "loss": 0.4288, + "step": 5317 + }, + { + "epoch": 2.5144208037825058, + "grad_norm": 3.065824508666992, + "learning_rate": 3.163803420248316e-06, + "loss": 0.4592, + "step": 5318 + }, + { + "epoch": 2.5148936170212766, + "grad_norm": 2.6011085510253906, + "learning_rate": 3.163201964896838e-06, + "loss": 0.4081, + "step": 5319 + }, + { + "epoch": 2.5153664302600474, + "grad_norm": 2.4833033084869385, + "learning_rate": 3.162600468250681e-06, + "loss": 0.4343, + "step": 5320 + }, + { + "epoch": 2.515839243498818, + "grad_norm": 2.9035534858703613, + "learning_rate": 3.161998930347299e-06, + "loss": 0.4972, + "step": 5321 + }, + { + "epoch": 2.5163120567375885, + "grad_norm": 2.788752317428589, + "learning_rate": 3.161397351224146e-06, + "loss": 0.4597, + "step": 5322 + }, + { + "epoch": 2.5167848699763593, + "grad_norm": 2.4344491958618164, + 
"learning_rate": 3.16079573091868e-06, + "loss": 0.359, + "step": 5323 + }, + { + "epoch": 2.51725768321513, + "grad_norm": 2.750150680541992, + "learning_rate": 3.160194069468361e-06, + "loss": 0.4596, + "step": 5324 + }, + { + "epoch": 2.5177304964539005, + "grad_norm": 2.826902389526367, + "learning_rate": 3.1595923669106526e-06, + "loss": 0.4377, + "step": 5325 + }, + { + "epoch": 2.5182033096926713, + "grad_norm": 2.554439067840576, + "learning_rate": 3.15899062328302e-06, + "loss": 0.4517, + "step": 5326 + }, + { + "epoch": 2.518676122931442, + "grad_norm": 3.0882742404937744, + "learning_rate": 3.158388838622931e-06, + "loss": 0.47, + "step": 5327 + }, + { + "epoch": 2.519148936170213, + "grad_norm": 2.918947696685791, + "learning_rate": 3.157787012967856e-06, + "loss": 0.522, + "step": 5328 + }, + { + "epoch": 2.5196217494089836, + "grad_norm": 2.8057637214660645, + "learning_rate": 3.1571851463552674e-06, + "loss": 0.4837, + "step": 5329 + }, + { + "epoch": 2.520094562647754, + "grad_norm": 2.66241455078125, + "learning_rate": 3.156583238822641e-06, + "loss": 0.3988, + "step": 5330 + }, + { + "epoch": 2.520567375886525, + "grad_norm": 2.9793803691864014, + "learning_rate": 3.155981290407456e-06, + "loss": 0.4737, + "step": 5331 + }, + { + "epoch": 2.5210401891252956, + "grad_norm": 2.847522258758545, + "learning_rate": 3.1553793011471924e-06, + "loss": 0.4394, + "step": 5332 + }, + { + "epoch": 2.521513002364066, + "grad_norm": 2.9561474323272705, + "learning_rate": 3.154777271079333e-06, + "loss": 0.47, + "step": 5333 + }, + { + "epoch": 2.5219858156028367, + "grad_norm": 2.8353018760681152, + "learning_rate": 3.154175200241365e-06, + "loss": 0.4015, + "step": 5334 + }, + { + "epoch": 2.5224586288416075, + "grad_norm": 2.609049081802368, + "learning_rate": 3.153573088670775e-06, + "loss": 0.4723, + "step": 5335 + }, + { + "epoch": 2.5229314420803783, + "grad_norm": 2.8538455963134766, + "learning_rate": 3.1529709364050556e-06, + "loss": 0.4665, + "step": 5336 + }, + { + "epoch": 2.523404255319149, + "grad_norm": 2.768310785293579, + "learning_rate": 3.1523687434816978e-06, + "loss": 0.4933, + "step": 5337 + }, + { + "epoch": 2.5238770685579195, + "grad_norm": 2.9300906658172607, + "learning_rate": 3.1517665099382e-06, + "loss": 0.4651, + "step": 5338 + }, + { + "epoch": 2.5243498817966903, + "grad_norm": 2.6984703540802, + "learning_rate": 3.1511642358120585e-06, + "loss": 0.4442, + "step": 5339 + }, + { + "epoch": 2.524822695035461, + "grad_norm": 2.8148467540740967, + "learning_rate": 3.1505619211407762e-06, + "loss": 0.4611, + "step": 5340 + }, + { + "epoch": 2.5252955082742314, + "grad_norm": 2.816436290740967, + "learning_rate": 3.1499595659618556e-06, + "loss": 0.5291, + "step": 5341 + }, + { + "epoch": 2.5257683215130022, + "grad_norm": 2.902805805206299, + "learning_rate": 3.149357170312802e-06, + "loss": 0.4394, + "step": 5342 + }, + { + "epoch": 2.526241134751773, + "grad_norm": 2.6443474292755127, + "learning_rate": 3.148754734231126e-06, + "loss": 0.4444, + "step": 5343 + }, + { + "epoch": 2.526713947990544, + "grad_norm": 2.6818583011627197, + "learning_rate": 3.148152257754336e-06, + "loss": 0.4256, + "step": 5344 + }, + { + "epoch": 2.5271867612293146, + "grad_norm": 2.5266945362091064, + "learning_rate": 3.1475497409199485e-06, + "loss": 0.4087, + "step": 5345 + }, + { + "epoch": 2.527659574468085, + "grad_norm": 2.6326711177825928, + "learning_rate": 3.146947183765477e-06, + "loss": 0.3842, + "step": 5346 + }, + { + "epoch": 2.5281323877068558, + "grad_norm": 
3.122880697250366, + "learning_rate": 3.1463445863284413e-06, + "loss": 0.482, + "step": 5347 + }, + { + "epoch": 2.5286052009456266, + "grad_norm": 2.819258213043213, + "learning_rate": 3.145741948646362e-06, + "loss": 0.4628, + "step": 5348 + }, + { + "epoch": 2.529078014184397, + "grad_norm": 2.5842230319976807, + "learning_rate": 3.145139270756764e-06, + "loss": 0.4479, + "step": 5349 + }, + { + "epoch": 2.5295508274231677, + "grad_norm": 2.7257237434387207, + "learning_rate": 3.144536552697172e-06, + "loss": 0.473, + "step": 5350 + }, + { + "epoch": 2.5300236406619385, + "grad_norm": 2.6876981258392334, + "learning_rate": 3.143933794505115e-06, + "loss": 0.4615, + "step": 5351 + }, + { + "epoch": 2.5304964539007093, + "grad_norm": 2.7942895889282227, + "learning_rate": 3.143330996218124e-06, + "loss": 0.4982, + "step": 5352 + }, + { + "epoch": 2.53096926713948, + "grad_norm": 2.3150579929351807, + "learning_rate": 3.1427281578737327e-06, + "loss": 0.3905, + "step": 5353 + }, + { + "epoch": 2.5314420803782505, + "grad_norm": 2.7326138019561768, + "learning_rate": 3.142125279509478e-06, + "loss": 0.4076, + "step": 5354 + }, + { + "epoch": 2.5319148936170213, + "grad_norm": 2.46362566947937, + "learning_rate": 3.1415223611628976e-06, + "loss": 0.4043, + "step": 5355 + }, + { + "epoch": 2.532387706855792, + "grad_norm": 2.6670427322387695, + "learning_rate": 3.1409194028715323e-06, + "loss": 0.484, + "step": 5356 + }, + { + "epoch": 2.5328605200945624, + "grad_norm": 2.917771100997925, + "learning_rate": 3.140316404672926e-06, + "loss": 0.4539, + "step": 5357 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 2.7964110374450684, + "learning_rate": 3.1397133666046254e-06, + "loss": 0.4706, + "step": 5358 + }, + { + "epoch": 2.533806146572104, + "grad_norm": 2.6481330394744873, + "learning_rate": 3.139110288704179e-06, + "loss": 0.4101, + "step": 5359 + }, + { + "epoch": 2.534278959810875, + "grad_norm": 2.859452962875366, + "learning_rate": 3.1385071710091365e-06, + "loss": 0.4842, + "step": 5360 + }, + { + "epoch": 2.5347517730496456, + "grad_norm": 2.686077356338501, + "learning_rate": 3.137904013557052e-06, + "loss": 0.4073, + "step": 5361 + }, + { + "epoch": 2.535224586288416, + "grad_norm": 3.7147045135498047, + "learning_rate": 3.137300816385482e-06, + "loss": 0.4536, + "step": 5362 + }, + { + "epoch": 2.5356973995271868, + "grad_norm": 2.51054048538208, + "learning_rate": 3.1366975795319856e-06, + "loss": 0.4171, + "step": 5363 + }, + { + "epoch": 2.5361702127659576, + "grad_norm": 3.043149471282959, + "learning_rate": 3.136094303034121e-06, + "loss": 0.5179, + "step": 5364 + }, + { + "epoch": 2.536643026004728, + "grad_norm": 2.398878812789917, + "learning_rate": 3.1354909869294548e-06, + "loss": 0.4144, + "step": 5365 + }, + { + "epoch": 2.5371158392434987, + "grad_norm": 2.969712257385254, + "learning_rate": 3.134887631255551e-06, + "loss": 0.3983, + "step": 5366 + }, + { + "epoch": 2.5375886524822695, + "grad_norm": 2.7707982063293457, + "learning_rate": 3.134284236049978e-06, + "loss": 0.4405, + "step": 5367 + }, + { + "epoch": 2.5380614657210403, + "grad_norm": 2.579742193222046, + "learning_rate": 3.1336808013503073e-06, + "loss": 0.4402, + "step": 5368 + }, + { + "epoch": 2.538534278959811, + "grad_norm": 2.6041927337646484, + "learning_rate": 3.1330773271941113e-06, + "loss": 0.396, + "step": 5369 + }, + { + "epoch": 2.5390070921985815, + "grad_norm": 2.7383856773376465, + "learning_rate": 3.1324738136189658e-06, + "loss": 0.4424, + "step": 5370 + }, + { + "epoch": 
2.5394799054373522, + "grad_norm": 3.053644895553589, + "learning_rate": 3.13187026066245e-06, + "loss": 0.473, + "step": 5371 + }, + { + "epoch": 2.539952718676123, + "grad_norm": 2.684244155883789, + "learning_rate": 3.1312666683621428e-06, + "loss": 0.3963, + "step": 5372 + }, + { + "epoch": 2.5404255319148934, + "grad_norm": 2.6505017280578613, + "learning_rate": 3.130663036755629e-06, + "loss": 0.4292, + "step": 5373 + }, + { + "epoch": 2.540898345153664, + "grad_norm": 3.025965929031372, + "learning_rate": 3.1300593658804935e-06, + "loss": 0.4539, + "step": 5374 + }, + { + "epoch": 2.541371158392435, + "grad_norm": 2.72106671333313, + "learning_rate": 3.1294556557743237e-06, + "loss": 0.4519, + "step": 5375 + }, + { + "epoch": 2.541843971631206, + "grad_norm": 2.759995222091675, + "learning_rate": 3.12885190647471e-06, + "loss": 0.451, + "step": 5376 + }, + { + "epoch": 2.5423167848699766, + "grad_norm": 2.697950601577759, + "learning_rate": 3.1282481180192457e-06, + "loss": 0.4328, + "step": 5377 + }, + { + "epoch": 2.542789598108747, + "grad_norm": 2.6970415115356445, + "learning_rate": 3.127644290445526e-06, + "loss": 0.4489, + "step": 5378 + }, + { + "epoch": 2.5432624113475177, + "grad_norm": 2.5856997966766357, + "learning_rate": 3.127040423791148e-06, + "loss": 0.3848, + "step": 5379 + }, + { + "epoch": 2.5437352245862885, + "grad_norm": 2.9798166751861572, + "learning_rate": 3.1264365180937127e-06, + "loss": 0.5038, + "step": 5380 + }, + { + "epoch": 2.544208037825059, + "grad_norm": 3.413175106048584, + "learning_rate": 3.1258325733908224e-06, + "loss": 0.5247, + "step": 5381 + }, + { + "epoch": 2.5446808510638297, + "grad_norm": 2.838517904281616, + "learning_rate": 3.1252285897200818e-06, + "loss": 0.4652, + "step": 5382 + }, + { + "epoch": 2.5451536643026005, + "grad_norm": 2.8342528343200684, + "learning_rate": 3.1246245671190983e-06, + "loss": 0.4245, + "step": 5383 + }, + { + "epoch": 2.5456264775413713, + "grad_norm": 3.06026029586792, + "learning_rate": 3.124020505625482e-06, + "loss": 0.469, + "step": 5384 + }, + { + "epoch": 2.546099290780142, + "grad_norm": 2.633894681930542, + "learning_rate": 3.1234164052768452e-06, + "loss": 0.4509, + "step": 5385 + }, + { + "epoch": 2.5465721040189124, + "grad_norm": 2.634819984436035, + "learning_rate": 3.1228122661108023e-06, + "loss": 0.4879, + "step": 5386 + }, + { + "epoch": 2.5470449172576832, + "grad_norm": 3.9843504428863525, + "learning_rate": 3.1222080881649707e-06, + "loss": 0.4472, + "step": 5387 + }, + { + "epoch": 2.547517730496454, + "grad_norm": 2.5480258464813232, + "learning_rate": 3.1216038714769694e-06, + "loss": 0.4396, + "step": 5388 + }, + { + "epoch": 2.5479905437352244, + "grad_norm": 2.7461917400360107, + "learning_rate": 3.12099961608442e-06, + "loss": 0.4735, + "step": 5389 + }, + { + "epoch": 2.548463356973995, + "grad_norm": 3.167769193649292, + "learning_rate": 3.1203953220249493e-06, + "loss": 0.4196, + "step": 5390 + }, + { + "epoch": 2.548936170212766, + "grad_norm": 2.721696615219116, + "learning_rate": 3.1197909893361814e-06, + "loss": 0.4571, + "step": 5391 + }, + { + "epoch": 2.5494089834515368, + "grad_norm": 2.726668119430542, + "learning_rate": 3.1191866180557463e-06, + "loss": 0.4856, + "step": 5392 + }, + { + "epoch": 2.5498817966903076, + "grad_norm": 2.602205276489258, + "learning_rate": 3.1185822082212754e-06, + "loss": 0.4631, + "step": 5393 + }, + { + "epoch": 2.550354609929078, + "grad_norm": 2.7715859413146973, + "learning_rate": 3.1179777598704025e-06, + "loss": 0.4136, + 
"step": 5394 + }, + { + "epoch": 2.5508274231678487, + "grad_norm": 2.8081955909729004, + "learning_rate": 3.1173732730407647e-06, + "loss": 0.4963, + "step": 5395 + }, + { + "epoch": 2.5513002364066195, + "grad_norm": 2.946772336959839, + "learning_rate": 3.1167687477700006e-06, + "loss": 0.4443, + "step": 5396 + }, + { + "epoch": 2.55177304964539, + "grad_norm": 2.89345383644104, + "learning_rate": 3.1161641840957503e-06, + "loss": 0.4377, + "step": 5397 + }, + { + "epoch": 2.5522458628841607, + "grad_norm": 2.908317804336548, + "learning_rate": 3.115559582055659e-06, + "loss": 0.4702, + "step": 5398 + }, + { + "epoch": 2.5527186761229315, + "grad_norm": 2.554417848587036, + "learning_rate": 3.1149549416873704e-06, + "loss": 0.3738, + "step": 5399 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 2.3132457733154297, + "learning_rate": 3.1143502630285356e-06, + "loss": 0.4074, + "step": 5400 + }, + { + "epoch": 2.553664302600473, + "grad_norm": 2.751666784286499, + "learning_rate": 3.1137455461168026e-06, + "loss": 0.4697, + "step": 5401 + }, + { + "epoch": 2.5541371158392434, + "grad_norm": 2.7088871002197266, + "learning_rate": 3.113140790989826e-06, + "loss": 0.4754, + "step": 5402 + }, + { + "epoch": 2.554609929078014, + "grad_norm": 3.0633046627044678, + "learning_rate": 3.1125359976852605e-06, + "loss": 0.4874, + "step": 5403 + }, + { + "epoch": 2.555082742316785, + "grad_norm": 3.399456024169922, + "learning_rate": 3.111931166240764e-06, + "loss": 0.5529, + "step": 5404 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 2.7729690074920654, + "learning_rate": 3.1113262966939985e-06, + "loss": 0.4677, + "step": 5405 + }, + { + "epoch": 2.556028368794326, + "grad_norm": 2.81025767326355, + "learning_rate": 3.1107213890826244e-06, + "loss": 0.4954, + "step": 5406 + }, + { + "epoch": 2.556501182033097, + "grad_norm": 2.4837241172790527, + "learning_rate": 3.110116443444307e-06, + "loss": 0.3681, + "step": 5407 + }, + { + "epoch": 2.5569739952718678, + "grad_norm": 2.6406874656677246, + "learning_rate": 3.109511459816714e-06, + "loss": 0.4569, + "step": 5408 + }, + { + "epoch": 2.5574468085106385, + "grad_norm": 2.6093738079071045, + "learning_rate": 3.1089064382375155e-06, + "loss": 0.413, + "step": 5409 + }, + { + "epoch": 2.557919621749409, + "grad_norm": 2.6629011631011963, + "learning_rate": 3.108301378744383e-06, + "loss": 0.4286, + "step": 5410 + }, + { + "epoch": 2.5583924349881797, + "grad_norm": 2.694796323776245, + "learning_rate": 3.10769628137499e-06, + "loss": 0.4316, + "step": 5411 + }, + { + "epoch": 2.5588652482269505, + "grad_norm": 2.88023042678833, + "learning_rate": 3.107091146167015e-06, + "loss": 0.4378, + "step": 5412 + }, + { + "epoch": 2.559338061465721, + "grad_norm": 2.8804919719696045, + "learning_rate": 3.1064859731581365e-06, + "loss": 0.4971, + "step": 5413 + }, + { + "epoch": 2.5598108747044916, + "grad_norm": 2.850468397140503, + "learning_rate": 3.1058807623860353e-06, + "loss": 0.4686, + "step": 5414 + }, + { + "epoch": 2.5602836879432624, + "grad_norm": 3.0548019409179688, + "learning_rate": 3.1052755138883963e-06, + "loss": 0.4497, + "step": 5415 + }, + { + "epoch": 2.5607565011820332, + "grad_norm": 3.10168719291687, + "learning_rate": 3.1046702277029046e-06, + "loss": 0.569, + "step": 5416 + }, + { + "epoch": 2.561229314420804, + "grad_norm": 2.5887374877929688, + "learning_rate": 3.1040649038672494e-06, + "loss": 0.3812, + "step": 5417 + }, + { + "epoch": 2.5617021276595744, + "grad_norm": 2.9928438663482666, + "learning_rate": 
3.1034595424191212e-06, + "loss": 0.4308, + "step": 5418 + }, + { + "epoch": 2.562174940898345, + "grad_norm": 2.7003073692321777, + "learning_rate": 3.102854143396214e-06, + "loss": 0.4967, + "step": 5419 + }, + { + "epoch": 2.562647754137116, + "grad_norm": 3.172868490219116, + "learning_rate": 3.102248706836222e-06, + "loss": 0.5311, + "step": 5420 + }, + { + "epoch": 2.5631205673758863, + "grad_norm": 3.0146191120147705, + "learning_rate": 3.101643232776844e-06, + "loss": 0.4714, + "step": 5421 + }, + { + "epoch": 2.563593380614657, + "grad_norm": 3.0683791637420654, + "learning_rate": 3.1010377212557806e-06, + "loss": 0.4047, + "step": 5422 + }, + { + "epoch": 2.564066193853428, + "grad_norm": 2.8260676860809326, + "learning_rate": 3.1004321723107334e-06, + "loss": 0.5282, + "step": 5423 + }, + { + "epoch": 2.5645390070921987, + "grad_norm": 3.0792388916015625, + "learning_rate": 3.0998265859794074e-06, + "loss": 0.5323, + "step": 5424 + }, + { + "epoch": 2.5650118203309695, + "grad_norm": 2.7332866191864014, + "learning_rate": 3.09922096229951e-06, + "loss": 0.4401, + "step": 5425 + }, + { + "epoch": 2.56548463356974, + "grad_norm": 2.9366047382354736, + "learning_rate": 3.098615301308751e-06, + "loss": 0.4495, + "step": 5426 + }, + { + "epoch": 2.5659574468085107, + "grad_norm": 2.982088565826416, + "learning_rate": 3.098009603044842e-06, + "loss": 0.495, + "step": 5427 + }, + { + "epoch": 2.5664302600472815, + "grad_norm": 3.1204755306243896, + "learning_rate": 3.0974038675454976e-06, + "loss": 0.4354, + "step": 5428 + }, + { + "epoch": 2.566903073286052, + "grad_norm": 2.835238218307495, + "learning_rate": 3.0967980948484333e-06, + "loss": 0.4161, + "step": 5429 + }, + { + "epoch": 2.5673758865248226, + "grad_norm": 2.8104958534240723, + "learning_rate": 3.096192284991369e-06, + "loss": 0.5045, + "step": 5430 + }, + { + "epoch": 2.5678486997635934, + "grad_norm": 3.1636080741882324, + "learning_rate": 3.0955864380120247e-06, + "loss": 0.4533, + "step": 5431 + }, + { + "epoch": 2.568321513002364, + "grad_norm": 2.980112314224243, + "learning_rate": 3.0949805539481247e-06, + "loss": 0.3998, + "step": 5432 + }, + { + "epoch": 2.568794326241135, + "grad_norm": 2.6379945278167725, + "learning_rate": 3.0943746328373953e-06, + "loss": 0.3785, + "step": 5433 + }, + { + "epoch": 2.5692671394799054, + "grad_norm": 2.780930757522583, + "learning_rate": 3.0937686747175627e-06, + "loss": 0.4801, + "step": 5434 + }, + { + "epoch": 2.569739952718676, + "grad_norm": 2.6608550548553467, + "learning_rate": 3.0931626796263585e-06, + "loss": 0.4047, + "step": 5435 + }, + { + "epoch": 2.570212765957447, + "grad_norm": 3.130584716796875, + "learning_rate": 3.0925566476015156e-06, + "loss": 0.5049, + "step": 5436 + }, + { + "epoch": 2.5706855791962173, + "grad_norm": 2.9699313640594482, + "learning_rate": 3.0919505786807687e-06, + "loss": 0.3847, + "step": 5437 + }, + { + "epoch": 2.571158392434988, + "grad_norm": 2.919260025024414, + "learning_rate": 3.091344472901855e-06, + "loss": 0.4631, + "step": 5438 + }, + { + "epoch": 2.571631205673759, + "grad_norm": 2.956587553024292, + "learning_rate": 3.0907383303025134e-06, + "loss": 0.4974, + "step": 5439 + }, + { + "epoch": 2.5721040189125297, + "grad_norm": 2.758542776107788, + "learning_rate": 3.090132150920486e-06, + "loss": 0.4785, + "step": 5440 + }, + { + "epoch": 2.5725768321513005, + "grad_norm": 2.678469657897949, + "learning_rate": 3.0895259347935175e-06, + "loss": 0.4453, + "step": 5441 + }, + { + "epoch": 2.573049645390071, + "grad_norm": 
2.6508545875549316, + "learning_rate": 3.088919681959355e-06, + "loss": 0.4426, + "step": 5442 + }, + { + "epoch": 2.5735224586288417, + "grad_norm": 2.6156187057495117, + "learning_rate": 3.0883133924557453e-06, + "loss": 0.4445, + "step": 5443 + }, + { + "epoch": 2.5739952718676125, + "grad_norm": 2.484374761581421, + "learning_rate": 3.08770706632044e-06, + "loss": 0.4155, + "step": 5444 + }, + { + "epoch": 2.574468085106383, + "grad_norm": 2.7465295791625977, + "learning_rate": 3.087100703591193e-06, + "loss": 0.4085, + "step": 5445 + }, + { + "epoch": 2.5749408983451536, + "grad_norm": 2.771740198135376, + "learning_rate": 3.08649430430576e-06, + "loss": 0.4313, + "step": 5446 + }, + { + "epoch": 2.5754137115839244, + "grad_norm": 2.7480874061584473, + "learning_rate": 3.0858878685018984e-06, + "loss": 0.3471, + "step": 5447 + }, + { + "epoch": 2.575886524822695, + "grad_norm": 2.894913673400879, + "learning_rate": 3.085281396217368e-06, + "loss": 0.4888, + "step": 5448 + }, + { + "epoch": 2.576359338061466, + "grad_norm": 3.037628173828125, + "learning_rate": 3.0846748874899306e-06, + "loss": 0.3976, + "step": 5449 + }, + { + "epoch": 2.5768321513002364, + "grad_norm": 2.4811434745788574, + "learning_rate": 3.0840683423573526e-06, + "loss": 0.4822, + "step": 5450 + }, + { + "epoch": 2.577304964539007, + "grad_norm": 3.0078725814819336, + "learning_rate": 3.0834617608573998e-06, + "loss": 0.4999, + "step": 5451 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 3.174154043197632, + "learning_rate": 3.0828551430278413e-06, + "loss": 0.4626, + "step": 5452 + }, + { + "epoch": 2.5782505910165483, + "grad_norm": 2.8277535438537598, + "learning_rate": 3.082248488906449e-06, + "loss": 0.4633, + "step": 5453 + }, + { + "epoch": 2.578723404255319, + "grad_norm": 2.731767416000366, + "learning_rate": 3.0816417985309966e-06, + "loss": 0.4148, + "step": 5454 + }, + { + "epoch": 2.57919621749409, + "grad_norm": 2.5480549335479736, + "learning_rate": 3.0810350719392597e-06, + "loss": 0.4773, + "step": 5455 + }, + { + "epoch": 2.5796690307328607, + "grad_norm": 2.9755172729492188, + "learning_rate": 3.080428309169017e-06, + "loss": 0.5107, + "step": 5456 + }, + { + "epoch": 2.580141843971631, + "grad_norm": 2.6499290466308594, + "learning_rate": 3.079821510258048e-06, + "loss": 0.3982, + "step": 5457 + }, + { + "epoch": 2.580614657210402, + "grad_norm": 2.663214921951294, + "learning_rate": 3.079214675244136e-06, + "loss": 0.4419, + "step": 5458 + }, + { + "epoch": 2.5810874704491726, + "grad_norm": 2.595489263534546, + "learning_rate": 3.078607804165066e-06, + "loss": 0.3958, + "step": 5459 + }, + { + "epoch": 2.581560283687943, + "grad_norm": 3.031458854675293, + "learning_rate": 3.0780008970586255e-06, + "loss": 0.518, + "step": 5460 + }, + { + "epoch": 2.582033096926714, + "grad_norm": 2.827071189880371, + "learning_rate": 3.077393953962603e-06, + "loss": 0.4397, + "step": 5461 + }, + { + "epoch": 2.5825059101654846, + "grad_norm": 2.656111240386963, + "learning_rate": 3.0767869749147917e-06, + "loss": 0.4912, + "step": 5462 + }, + { + "epoch": 2.5829787234042554, + "grad_norm": 2.545365333557129, + "learning_rate": 3.076179959952984e-06, + "loss": 0.3991, + "step": 5463 + }, + { + "epoch": 2.583451536643026, + "grad_norm": 2.5794365406036377, + "learning_rate": 3.075572909114977e-06, + "loss": 0.4499, + "step": 5464 + }, + { + "epoch": 2.5839243498817965, + "grad_norm": 2.787140369415283, + "learning_rate": 3.074965822438568e-06, + "loss": 0.386, + "step": 5465 + }, + { + "epoch": 
2.5843971631205673, + "grad_norm": 2.6406853199005127, + "learning_rate": 3.0743586999615594e-06, + "loss": 0.4853, + "step": 5466 + }, + { + "epoch": 2.584869976359338, + "grad_norm": 2.8082082271575928, + "learning_rate": 3.073751541721752e-06, + "loss": 0.4669, + "step": 5467 + }, + { + "epoch": 2.5853427895981085, + "grad_norm": 2.8808975219726562, + "learning_rate": 3.073144347756952e-06, + "loss": 0.4193, + "step": 5468 + }, + { + "epoch": 2.5858156028368793, + "grad_norm": 2.823352813720703, + "learning_rate": 3.072537118104968e-06, + "loss": 0.482, + "step": 5469 + }, + { + "epoch": 2.58628841607565, + "grad_norm": 2.6454555988311768, + "learning_rate": 3.0719298528036073e-06, + "loss": 0.4667, + "step": 5470 + }, + { + "epoch": 2.586761229314421, + "grad_norm": 2.871145486831665, + "learning_rate": 3.0713225518906826e-06, + "loss": 0.5125, + "step": 5471 + }, + { + "epoch": 2.5872340425531917, + "grad_norm": 3.1301417350769043, + "learning_rate": 3.070715215404007e-06, + "loss": 0.4827, + "step": 5472 + }, + { + "epoch": 2.587706855791962, + "grad_norm": 2.31062912940979, + "learning_rate": 3.070107843381398e-06, + "loss": 0.3954, + "step": 5473 + }, + { + "epoch": 2.588179669030733, + "grad_norm": 2.8366353511810303, + "learning_rate": 3.069500435860674e-06, + "loss": 0.4597, + "step": 5474 + }, + { + "epoch": 2.5886524822695036, + "grad_norm": 2.900143623352051, + "learning_rate": 3.068892992879654e-06, + "loss": 0.4294, + "step": 5475 + }, + { + "epoch": 2.589125295508274, + "grad_norm": 2.923313617706299, + "learning_rate": 3.0682855144761626e-06, + "loss": 0.505, + "step": 5476 + }, + { + "epoch": 2.5895981087470448, + "grad_norm": 2.726475954055786, + "learning_rate": 3.0676780006880242e-06, + "loss": 0.4208, + "step": 5477 + }, + { + "epoch": 2.5900709219858156, + "grad_norm": 4.115052223205566, + "learning_rate": 3.0670704515530654e-06, + "loss": 0.466, + "step": 5478 + }, + { + "epoch": 2.5905437352245864, + "grad_norm": 2.6018717288970947, + "learning_rate": 3.0664628671091163e-06, + "loss": 0.4697, + "step": 5479 + }, + { + "epoch": 2.591016548463357, + "grad_norm": 2.7393722534179688, + "learning_rate": 3.0658552473940085e-06, + "loss": 0.4618, + "step": 5480 + }, + { + "epoch": 2.5914893617021275, + "grad_norm": 2.8406929969787598, + "learning_rate": 3.065247592445575e-06, + "loss": 0.4806, + "step": 5481 + }, + { + "epoch": 2.5919621749408983, + "grad_norm": 2.9773001670837402, + "learning_rate": 3.0646399023016525e-06, + "loss": 0.4764, + "step": 5482 + }, + { + "epoch": 2.592434988179669, + "grad_norm": 3.374643325805664, + "learning_rate": 3.0640321770000804e-06, + "loss": 0.4481, + "step": 5483 + }, + { + "epoch": 2.5929078014184395, + "grad_norm": 2.5742013454437256, + "learning_rate": 3.0634244165786965e-06, + "loss": 0.432, + "step": 5484 + }, + { + "epoch": 2.5933806146572103, + "grad_norm": 2.9390289783477783, + "learning_rate": 3.062816621075346e-06, + "loss": 0.3941, + "step": 5485 + }, + { + "epoch": 2.593853427895981, + "grad_norm": 2.683414936065674, + "learning_rate": 3.062208790527871e-06, + "loss": 0.4268, + "step": 5486 + }, + { + "epoch": 2.594326241134752, + "grad_norm": 2.689647674560547, + "learning_rate": 3.06160092497412e-06, + "loss": 0.4569, + "step": 5487 + }, + { + "epoch": 2.5947990543735227, + "grad_norm": 3.1170310974121094, + "learning_rate": 3.060993024451943e-06, + "loss": 0.4387, + "step": 5488 + }, + { + "epoch": 2.595271867612293, + "grad_norm": 2.8732447624206543, + "learning_rate": 3.0603850889991894e-06, + "loss": 0.451, + 
"step": 5489 + }, + { + "epoch": 2.595744680851064, + "grad_norm": 3.0444157123565674, + "learning_rate": 3.0597771186537135e-06, + "loss": 0.4691, + "step": 5490 + }, + { + "epoch": 2.5962174940898346, + "grad_norm": 2.3791720867156982, + "learning_rate": 3.0591691134533714e-06, + "loss": 0.4771, + "step": 5491 + }, + { + "epoch": 2.596690307328605, + "grad_norm": 3.0677225589752197, + "learning_rate": 3.05856107343602e-06, + "loss": 0.459, + "step": 5492 + }, + { + "epoch": 2.5971631205673757, + "grad_norm": 3.1702635288238525, + "learning_rate": 3.05795299863952e-06, + "loss": 0.4816, + "step": 5493 + }, + { + "epoch": 2.5976359338061465, + "grad_norm": 2.964869499206543, + "learning_rate": 3.057344889101734e-06, + "loss": 0.4369, + "step": 5494 + }, + { + "epoch": 2.5981087470449173, + "grad_norm": 3.1333882808685303, + "learning_rate": 3.056736744860525e-06, + "loss": 0.4178, + "step": 5495 + }, + { + "epoch": 2.598581560283688, + "grad_norm": 2.4340405464172363, + "learning_rate": 3.05612856595376e-06, + "loss": 0.4359, + "step": 5496 + }, + { + "epoch": 2.5990543735224585, + "grad_norm": 2.638620615005493, + "learning_rate": 3.0555203524193083e-06, + "loss": 0.3915, + "step": 5497 + }, + { + "epoch": 2.5995271867612293, + "grad_norm": 2.8218815326690674, + "learning_rate": 3.054912104295039e-06, + "loss": 0.4684, + "step": 5498 + }, + { + "epoch": 2.6, + "grad_norm": 2.6696009635925293, + "learning_rate": 3.054303821618827e-06, + "loss": 0.4073, + "step": 5499 + }, + { + "epoch": 2.6004728132387704, + "grad_norm": 2.3880512714385986, + "learning_rate": 3.0536955044285465e-06, + "loss": 0.3576, + "step": 5500 + }, + { + "epoch": 2.6009456264775412, + "grad_norm": 2.762890100479126, + "learning_rate": 3.053087152762075e-06, + "loss": 0.3857, + "step": 5501 + }, + { + "epoch": 2.601418439716312, + "grad_norm": 2.729033946990967, + "learning_rate": 3.052478766657292e-06, + "loss": 0.3935, + "step": 5502 + }, + { + "epoch": 2.601891252955083, + "grad_norm": 2.630490303039551, + "learning_rate": 3.051870346152078e-06, + "loss": 0.3932, + "step": 5503 + }, + { + "epoch": 2.6023640661938536, + "grad_norm": 3.0335981845855713, + "learning_rate": 3.051261891284318e-06, + "loss": 0.4313, + "step": 5504 + }, + { + "epoch": 2.602836879432624, + "grad_norm": 2.969888687133789, + "learning_rate": 3.0506534020918963e-06, + "loss": 0.4698, + "step": 5505 + }, + { + "epoch": 2.603309692671395, + "grad_norm": 3.093996524810791, + "learning_rate": 3.050044878612703e-06, + "loss": 0.5338, + "step": 5506 + }, + { + "epoch": 2.6037825059101656, + "grad_norm": 2.759993314743042, + "learning_rate": 3.049436320884626e-06, + "loss": 0.4429, + "step": 5507 + }, + { + "epoch": 2.604255319148936, + "grad_norm": 2.979422092437744, + "learning_rate": 3.0488277289455587e-06, + "loss": 0.4489, + "step": 5508 + }, + { + "epoch": 2.6047281323877067, + "grad_norm": 2.8266701698303223, + "learning_rate": 3.048219102833396e-06, + "loss": 0.489, + "step": 5509 + }, + { + "epoch": 2.6052009456264775, + "grad_norm": 2.2582461833953857, + "learning_rate": 3.047610442586033e-06, + "loss": 0.3759, + "step": 5510 + }, + { + "epoch": 2.6056737588652483, + "grad_norm": 3.078152894973755, + "learning_rate": 3.0470017482413694e-06, + "loss": 0.5059, + "step": 5511 + }, + { + "epoch": 2.606146572104019, + "grad_norm": 2.7895498275756836, + "learning_rate": 3.0463930198373047e-06, + "loss": 0.4752, + "step": 5512 + }, + { + "epoch": 2.6066193853427895, + "grad_norm": 3.2307958602905273, + "learning_rate": 3.045784257411743e-06, + 
"loss": 0.4847, + "step": 5513 + }, + { + "epoch": 2.6070921985815603, + "grad_norm": 2.793661594390869, + "learning_rate": 3.0451754610025884e-06, + "loss": 0.4492, + "step": 5514 + }, + { + "epoch": 2.607565011820331, + "grad_norm": 2.4443132877349854, + "learning_rate": 3.0445666306477484e-06, + "loss": 0.4174, + "step": 5515 + }, + { + "epoch": 2.6080378250591014, + "grad_norm": 2.628769636154175, + "learning_rate": 3.0439577663851326e-06, + "loss": 0.3889, + "step": 5516 + }, + { + "epoch": 2.608510638297872, + "grad_norm": 2.9367563724517822, + "learning_rate": 3.0433488682526525e-06, + "loss": 0.437, + "step": 5517 + }, + { + "epoch": 2.608983451536643, + "grad_norm": 3.171353340148926, + "learning_rate": 3.04273993628822e-06, + "loss": 0.47, + "step": 5518 + }, + { + "epoch": 2.609456264775414, + "grad_norm": 2.856576442718506, + "learning_rate": 3.0421309705297513e-06, + "loss": 0.4797, + "step": 5519 + }, + { + "epoch": 2.6099290780141846, + "grad_norm": 2.4926068782806396, + "learning_rate": 3.041521971015165e-06, + "loss": 0.4294, + "step": 5520 + }, + { + "epoch": 2.610401891252955, + "grad_norm": 2.7897613048553467, + "learning_rate": 3.040912937782379e-06, + "loss": 0.4388, + "step": 5521 + }, + { + "epoch": 2.6108747044917258, + "grad_norm": 3.588188886642456, + "learning_rate": 3.0403038708693173e-06, + "loss": 0.4027, + "step": 5522 + }, + { + "epoch": 2.6113475177304966, + "grad_norm": 3.5394980907440186, + "learning_rate": 3.0396947703139017e-06, + "loss": 0.4866, + "step": 5523 + }, + { + "epoch": 2.611820330969267, + "grad_norm": 3.086865186691284, + "learning_rate": 3.03908563615406e-06, + "loss": 0.4344, + "step": 5524 + }, + { + "epoch": 2.6122931442080377, + "grad_norm": 2.649564504623413, + "learning_rate": 3.0384764684277194e-06, + "loss": 0.4571, + "step": 5525 + }, + { + "epoch": 2.6127659574468085, + "grad_norm": 2.945234775543213, + "learning_rate": 3.0378672671728105e-06, + "loss": 0.4885, + "step": 5526 + }, + { + "epoch": 2.6132387706855793, + "grad_norm": 2.625424861907959, + "learning_rate": 3.037258032427265e-06, + "loss": 0.4095, + "step": 5527 + }, + { + "epoch": 2.61371158392435, + "grad_norm": 2.7597248554229736, + "learning_rate": 3.0366487642290175e-06, + "loss": 0.4393, + "step": 5528 + }, + { + "epoch": 2.6141843971631205, + "grad_norm": 2.721189260482788, + "learning_rate": 3.0360394626160043e-06, + "loss": 0.3865, + "step": 5529 + }, + { + "epoch": 2.6146572104018913, + "grad_norm": 2.624056339263916, + "learning_rate": 3.0354301276261656e-06, + "loss": 0.4273, + "step": 5530 + }, + { + "epoch": 2.615130023640662, + "grad_norm": 2.7764177322387695, + "learning_rate": 3.034820759297439e-06, + "loss": 0.4756, + "step": 5531 + }, + { + "epoch": 2.6156028368794324, + "grad_norm": 3.0841729640960693, + "learning_rate": 3.0342113576677696e-06, + "loss": 0.4907, + "step": 5532 + }, + { + "epoch": 2.616075650118203, + "grad_norm": 2.678715705871582, + "learning_rate": 3.0336019227751017e-06, + "loss": 0.4478, + "step": 5533 + }, + { + "epoch": 2.616548463356974, + "grad_norm": 2.378679037094116, + "learning_rate": 3.032992454657382e-06, + "loss": 0.3678, + "step": 5534 + }, + { + "epoch": 2.617021276595745, + "grad_norm": 2.792079210281372, + "learning_rate": 3.0323829533525583e-06, + "loss": 0.4115, + "step": 5535 + }, + { + "epoch": 2.6174940898345156, + "grad_norm": 2.738133192062378, + "learning_rate": 3.0317734188985832e-06, + "loss": 0.4152, + "step": 5536 + }, + { + "epoch": 2.617966903073286, + "grad_norm": 2.6963796615600586, + 
"learning_rate": 3.0311638513334084e-06, + "loss": 0.4096, + "step": 5537 + }, + { + "epoch": 2.6184397163120567, + "grad_norm": 2.694145679473877, + "learning_rate": 3.03055425069499e-06, + "loss": 0.3793, + "step": 5538 + }, + { + "epoch": 2.6189125295508275, + "grad_norm": 2.762403964996338, + "learning_rate": 3.0299446170212855e-06, + "loss": 0.459, + "step": 5539 + }, + { + "epoch": 2.619385342789598, + "grad_norm": 2.804382562637329, + "learning_rate": 3.0293349503502522e-06, + "loss": 0.4853, + "step": 5540 + }, + { + "epoch": 2.6198581560283687, + "grad_norm": 2.7768518924713135, + "learning_rate": 3.0287252507198537e-06, + "loss": 0.4496, + "step": 5541 + }, + { + "epoch": 2.6203309692671395, + "grad_norm": 2.9075138568878174, + "learning_rate": 3.028115518168052e-06, + "loss": 0.4498, + "step": 5542 + }, + { + "epoch": 2.6208037825059103, + "grad_norm": 2.8966822624206543, + "learning_rate": 3.0275057527328126e-06, + "loss": 0.4434, + "step": 5543 + }, + { + "epoch": 2.621276595744681, + "grad_norm": 2.8140156269073486, + "learning_rate": 3.0268959544521027e-06, + "loss": 0.3935, + "step": 5544 + }, + { + "epoch": 2.6217494089834514, + "grad_norm": 2.8606276512145996, + "learning_rate": 3.0262861233638924e-06, + "loss": 0.4222, + "step": 5545 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 3.003610134124756, + "learning_rate": 3.0256762595061522e-06, + "loss": 0.428, + "step": 5546 + }, + { + "epoch": 2.622695035460993, + "grad_norm": 2.725907802581787, + "learning_rate": 3.025066362916857e-06, + "loss": 0.3975, + "step": 5547 + }, + { + "epoch": 2.6231678486997634, + "grad_norm": 2.5247902870178223, + "learning_rate": 3.024456433633982e-06, + "loss": 0.4584, + "step": 5548 + }, + { + "epoch": 2.623640661938534, + "grad_norm": 2.932798147201538, + "learning_rate": 3.0238464716955045e-06, + "loss": 0.4991, + "step": 5549 + }, + { + "epoch": 2.624113475177305, + "grad_norm": 2.693547010421753, + "learning_rate": 3.023236477139404e-06, + "loss": 0.4405, + "step": 5550 + }, + { + "epoch": 2.6245862884160758, + "grad_norm": 3.2600035667419434, + "learning_rate": 3.022626450003662e-06, + "loss": 0.4904, + "step": 5551 + }, + { + "epoch": 2.6250591016548466, + "grad_norm": 2.9471960067749023, + "learning_rate": 3.0220163903262627e-06, + "loss": 0.4487, + "step": 5552 + }, + { + "epoch": 2.625531914893617, + "grad_norm": 2.583944082260132, + "learning_rate": 3.0214062981451926e-06, + "loss": 0.3552, + "step": 5553 + }, + { + "epoch": 2.6260047281323877, + "grad_norm": 2.675062656402588, + "learning_rate": 3.0207961734984377e-06, + "loss": 0.4524, + "step": 5554 + }, + { + "epoch": 2.6264775413711585, + "grad_norm": 3.0126802921295166, + "learning_rate": 3.0201860164239887e-06, + "loss": 0.4124, + "step": 5555 + }, + { + "epoch": 2.626950354609929, + "grad_norm": 2.490734577178955, + "learning_rate": 3.019575826959838e-06, + "loss": 0.4095, + "step": 5556 + }, + { + "epoch": 2.6274231678486997, + "grad_norm": 2.72817063331604, + "learning_rate": 3.018965605143978e-06, + "loss": 0.4298, + "step": 5557 + }, + { + "epoch": 2.6278959810874705, + "grad_norm": 3.1298327445983887, + "learning_rate": 3.0183553510144064e-06, + "loss": 0.4961, + "step": 5558 + }, + { + "epoch": 2.6283687943262413, + "grad_norm": 3.2379956245422363, + "learning_rate": 3.0177450646091195e-06, + "loss": 0.4943, + "step": 5559 + }, + { + "epoch": 2.628841607565012, + "grad_norm": 2.5040571689605713, + "learning_rate": 3.017134745966117e-06, + "loss": 0.3701, + "step": 5560 + }, + { + "epoch": 2.6293144208037824, 
+ "grad_norm": 3.047184944152832, + "learning_rate": 3.0165243951234025e-06, + "loss": 0.4587, + "step": 5561 + }, + { + "epoch": 2.629787234042553, + "grad_norm": 2.4926774501800537, + "learning_rate": 3.0159140121189783e-06, + "loss": 0.3723, + "step": 5562 + }, + { + "epoch": 2.630260047281324, + "grad_norm": 2.5434961318969727, + "learning_rate": 3.015303596990851e-06, + "loss": 0.4176, + "step": 5563 + }, + { + "epoch": 2.6307328605200944, + "grad_norm": 2.5117976665496826, + "learning_rate": 3.0146931497770284e-06, + "loss": 0.4218, + "step": 5564 + }, + { + "epoch": 2.631205673758865, + "grad_norm": 2.9408798217773438, + "learning_rate": 3.0140826705155196e-06, + "loss": 0.4473, + "step": 5565 + }, + { + "epoch": 2.631678486997636, + "grad_norm": 2.996422052383423, + "learning_rate": 3.0134721592443385e-06, + "loss": 0.4513, + "step": 5566 + }, + { + "epoch": 2.6321513002364068, + "grad_norm": 2.984356164932251, + "learning_rate": 3.0128616160014955e-06, + "loss": 0.4749, + "step": 5567 + }, + { + "epoch": 2.6326241134751776, + "grad_norm": 2.6075069904327393, + "learning_rate": 3.0122510408250095e-06, + "loss": 0.4707, + "step": 5568 + }, + { + "epoch": 2.633096926713948, + "grad_norm": 2.9463071823120117, + "learning_rate": 3.0116404337528972e-06, + "loss": 0.5125, + "step": 5569 + }, + { + "epoch": 2.6335697399527187, + "grad_norm": 2.98574161529541, + "learning_rate": 3.0110297948231787e-06, + "loss": 0.4487, + "step": 5570 + }, + { + "epoch": 2.6340425531914895, + "grad_norm": 2.6039397716522217, + "learning_rate": 3.010419124073876e-06, + "loss": 0.4516, + "step": 5571 + }, + { + "epoch": 2.63451536643026, + "grad_norm": 2.8480236530303955, + "learning_rate": 3.0098084215430124e-06, + "loss": 0.4962, + "step": 5572 + }, + { + "epoch": 2.6349881796690307, + "grad_norm": 2.527597427368164, + "learning_rate": 3.0091976872686133e-06, + "loss": 0.435, + "step": 5573 + }, + { + "epoch": 2.6354609929078014, + "grad_norm": 2.898303508758545, + "learning_rate": 3.0085869212887076e-06, + "loss": 0.4473, + "step": 5574 + }, + { + "epoch": 2.6359338061465722, + "grad_norm": 2.981414318084717, + "learning_rate": 3.007976123641324e-06, + "loss": 0.4203, + "step": 5575 + }, + { + "epoch": 2.636406619385343, + "grad_norm": 3.219064474105835, + "learning_rate": 3.0073652943644947e-06, + "loss": 0.4596, + "step": 5576 + }, + { + "epoch": 2.6368794326241134, + "grad_norm": 2.7287049293518066, + "learning_rate": 3.0067544334962532e-06, + "loss": 0.433, + "step": 5577 + }, + { + "epoch": 2.637352245862884, + "grad_norm": 2.6232664585113525, + "learning_rate": 3.0061435410746352e-06, + "loss": 0.4254, + "step": 5578 + }, + { + "epoch": 2.637825059101655, + "grad_norm": 2.908311605453491, + "learning_rate": 3.0055326171376788e-06, + "loss": 0.4349, + "step": 5579 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 2.8369064331054688, + "learning_rate": 3.0049216617234224e-06, + "loss": 0.4675, + "step": 5580 + }, + { + "epoch": 2.638770685579196, + "grad_norm": 2.659499406814575, + "learning_rate": 3.0043106748699085e-06, + "loss": 0.4073, + "step": 5581 + }, + { + "epoch": 2.639243498817967, + "grad_norm": 2.579765558242798, + "learning_rate": 3.00369965661518e-06, + "loss": 0.4536, + "step": 5582 + }, + { + "epoch": 2.6397163120567377, + "grad_norm": 3.572861909866333, + "learning_rate": 3.0030886069972827e-06, + "loss": 0.5227, + "step": 5583 + }, + { + "epoch": 2.6401891252955085, + "grad_norm": 2.6523196697235107, + "learning_rate": 3.002477526054263e-06, + "loss": 0.3846, + "step": 5584 + 
}, + { + "epoch": 2.640661938534279, + "grad_norm": 3.072181463241577, + "learning_rate": 3.001866413824173e-06, + "loss": 0.5399, + "step": 5585 + }, + { + "epoch": 2.6411347517730497, + "grad_norm": 2.7304325103759766, + "learning_rate": 3.0012552703450597e-06, + "loss": 0.4048, + "step": 5586 + }, + { + "epoch": 2.6416075650118205, + "grad_norm": 3.039491891860962, + "learning_rate": 3.0006440956549798e-06, + "loss": 0.5035, + "step": 5587 + }, + { + "epoch": 2.642080378250591, + "grad_norm": 2.7623798847198486, + "learning_rate": 3.000032889791988e-06, + "loss": 0.4369, + "step": 5588 + }, + { + "epoch": 2.6425531914893616, + "grad_norm": 3.391052722930908, + "learning_rate": 2.9994216527941394e-06, + "loss": 0.5308, + "step": 5589 + }, + { + "epoch": 2.6430260047281324, + "grad_norm": 3.0263915061950684, + "learning_rate": 2.9988103846994954e-06, + "loss": 0.4319, + "step": 5590 + }, + { + "epoch": 2.6434988179669032, + "grad_norm": 2.786607027053833, + "learning_rate": 2.998199085546115e-06, + "loss": 0.4695, + "step": 5591 + }, + { + "epoch": 2.643971631205674, + "grad_norm": 2.884674310684204, + "learning_rate": 2.9975877553720627e-06, + "loss": 0.4615, + "step": 5592 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 2.6100499629974365, + "learning_rate": 2.996976394215402e-06, + "loss": 0.4784, + "step": 5593 + }, + { + "epoch": 2.644917257683215, + "grad_norm": 2.6978676319122314, + "learning_rate": 2.9963650021142018e-06, + "loss": 0.3911, + "step": 5594 + }, + { + "epoch": 2.645390070921986, + "grad_norm": 2.8080835342407227, + "learning_rate": 2.9957535791065284e-06, + "loss": 0.4997, + "step": 5595 + }, + { + "epoch": 2.6458628841607563, + "grad_norm": 2.6639578342437744, + "learning_rate": 2.9951421252304537e-06, + "loss": 0.4066, + "step": 5596 + }, + { + "epoch": 2.646335697399527, + "grad_norm": 3.102456569671631, + "learning_rate": 2.9945306405240505e-06, + "loss": 0.5554, + "step": 5597 + }, + { + "epoch": 2.646808510638298, + "grad_norm": 2.6524150371551514, + "learning_rate": 2.993919125025392e-06, + "loss": 0.3881, + "step": 5598 + }, + { + "epoch": 2.6472813238770687, + "grad_norm": 2.926316499710083, + "learning_rate": 2.993307578772556e-06, + "loss": 0.4845, + "step": 5599 + }, + { + "epoch": 2.6477541371158395, + "grad_norm": 3.346550703048706, + "learning_rate": 2.9926960018036195e-06, + "loss": 0.4481, + "step": 5600 + }, + { + "epoch": 2.64822695035461, + "grad_norm": 2.6211020946502686, + "learning_rate": 2.9920843941566634e-06, + "loss": 0.4355, + "step": 5601 + }, + { + "epoch": 2.6486997635933807, + "grad_norm": 2.7479333877563477, + "learning_rate": 2.99147275586977e-06, + "loss": 0.4373, + "step": 5602 + }, + { + "epoch": 2.6491725768321515, + "grad_norm": 2.523385524749756, + "learning_rate": 2.9908610869810235e-06, + "loss": 0.4467, + "step": 5603 + }, + { + "epoch": 2.649645390070922, + "grad_norm": 2.93886137008667, + "learning_rate": 2.9902493875285086e-06, + "loss": 0.4956, + "step": 5604 + }, + { + "epoch": 2.6501182033096926, + "grad_norm": 2.7630443572998047, + "learning_rate": 2.989637657550315e-06, + "loss": 0.5012, + "step": 5605 + }, + { + "epoch": 2.6505910165484634, + "grad_norm": 2.6733906269073486, + "learning_rate": 2.989025897084531e-06, + "loss": 0.446, + "step": 5606 + }, + { + "epoch": 2.651063829787234, + "grad_norm": 2.8411107063293457, + "learning_rate": 2.9884141061692484e-06, + "loss": 0.4817, + "step": 5607 + }, + { + "epoch": 2.651536643026005, + "grad_norm": 2.8667192459106445, + "learning_rate": 2.987802284842562e-06, 
+ "loss": 0.3909, + "step": 5608 + }, + { + "epoch": 2.6520094562647754, + "grad_norm": 3.4640755653381348, + "learning_rate": 2.987190433142565e-06, + "loss": 0.4379, + "step": 5609 + }, + { + "epoch": 2.652482269503546, + "grad_norm": 2.675121307373047, + "learning_rate": 2.9865785511073565e-06, + "loss": 0.4833, + "step": 5610 + }, + { + "epoch": 2.652955082742317, + "grad_norm": 2.4375529289245605, + "learning_rate": 2.9859666387750353e-06, + "loss": 0.3949, + "step": 5611 + }, + { + "epoch": 2.6534278959810873, + "grad_norm": 2.7312581539154053, + "learning_rate": 2.9853546961837026e-06, + "loss": 0.4546, + "step": 5612 + }, + { + "epoch": 2.653900709219858, + "grad_norm": 2.7695999145507812, + "learning_rate": 2.9847427233714617e-06, + "loss": 0.4696, + "step": 5613 + }, + { + "epoch": 2.654373522458629, + "grad_norm": 2.6313109397888184, + "learning_rate": 2.984130720376416e-06, + "loss": 0.4733, + "step": 5614 + }, + { + "epoch": 2.6548463356973997, + "grad_norm": 2.656864881515503, + "learning_rate": 2.9835186872366733e-06, + "loss": 0.3806, + "step": 5615 + }, + { + "epoch": 2.65531914893617, + "grad_norm": 2.720075845718384, + "learning_rate": 2.982906623990342e-06, + "loss": 0.4041, + "step": 5616 + }, + { + "epoch": 2.655791962174941, + "grad_norm": 2.6684951782226562, + "learning_rate": 2.9822945306755334e-06, + "loss": 0.4552, + "step": 5617 + }, + { + "epoch": 2.6562647754137116, + "grad_norm": 2.567751884460449, + "learning_rate": 2.9816824073303585e-06, + "loss": 0.465, + "step": 5618 + }, + { + "epoch": 2.656737588652482, + "grad_norm": 2.7490367889404297, + "learning_rate": 2.981070253992933e-06, + "loss": 0.4647, + "step": 5619 + }, + { + "epoch": 2.657210401891253, + "grad_norm": 2.548656463623047, + "learning_rate": 2.9804580707013715e-06, + "loss": 0.4226, + "step": 5620 + }, + { + "epoch": 2.6576832151300236, + "grad_norm": 2.5484731197357178, + "learning_rate": 2.9798458574937927e-06, + "loss": 0.382, + "step": 5621 + }, + { + "epoch": 2.6581560283687944, + "grad_norm": 2.7293949127197266, + "learning_rate": 2.979233614408317e-06, + "loss": 0.4418, + "step": 5622 + }, + { + "epoch": 2.658628841607565, + "grad_norm": 2.645036458969116, + "learning_rate": 2.9786213414830646e-06, + "loss": 0.414, + "step": 5623 + }, + { + "epoch": 2.6591016548463355, + "grad_norm": 2.5287609100341797, + "learning_rate": 2.9780090387561604e-06, + "loss": 0.3914, + "step": 5624 + }, + { + "epoch": 2.6595744680851063, + "grad_norm": 2.5570411682128906, + "learning_rate": 2.9773967062657293e-06, + "loss": 0.4431, + "step": 5625 + }, + { + "epoch": 2.660047281323877, + "grad_norm": 2.681749105453491, + "learning_rate": 2.9767843440498983e-06, + "loss": 0.4245, + "step": 5626 + }, + { + "epoch": 2.6605200945626475, + "grad_norm": 2.8629777431488037, + "learning_rate": 2.976171952146798e-06, + "loss": 0.4643, + "step": 5627 + }, + { + "epoch": 2.6609929078014183, + "grad_norm": 2.577148199081421, + "learning_rate": 2.9755595305945573e-06, + "loss": 0.43, + "step": 5628 + }, + { + "epoch": 2.661465721040189, + "grad_norm": 2.747218370437622, + "learning_rate": 2.97494707943131e-06, + "loss": 0.5194, + "step": 5629 + }, + { + "epoch": 2.66193853427896, + "grad_norm": 2.535604953765869, + "learning_rate": 2.9743345986951904e-06, + "loss": 0.4401, + "step": 5630 + }, + { + "epoch": 2.6624113475177307, + "grad_norm": 3.3341166973114014, + "learning_rate": 2.973722088424336e-06, + "loss": 0.4925, + "step": 5631 + }, + { + "epoch": 2.662884160756501, + "grad_norm": 2.9264349937438965, + 
"learning_rate": 2.973109548656884e-06, + "loss": 0.4787, + "step": 5632 + }, + { + "epoch": 2.663356973995272, + "grad_norm": 2.7132506370544434, + "learning_rate": 2.9724969794309742e-06, + "loss": 0.4138, + "step": 5633 + }, + { + "epoch": 2.6638297872340426, + "grad_norm": 2.7970192432403564, + "learning_rate": 2.9718843807847497e-06, + "loss": 0.4896, + "step": 5634 + }, + { + "epoch": 2.664302600472813, + "grad_norm": 2.610208749771118, + "learning_rate": 2.9712717527563545e-06, + "loss": 0.3997, + "step": 5635 + }, + { + "epoch": 2.6647754137115838, + "grad_norm": 3.5483577251434326, + "learning_rate": 2.9706590953839335e-06, + "loss": 0.5109, + "step": 5636 + }, + { + "epoch": 2.6652482269503546, + "grad_norm": 2.746933698654175, + "learning_rate": 2.9700464087056345e-06, + "loss": 0.4672, + "step": 5637 + }, + { + "epoch": 2.6657210401891254, + "grad_norm": 2.704436779022217, + "learning_rate": 2.969433692759607e-06, + "loss": 0.4402, + "step": 5638 + }, + { + "epoch": 2.666193853427896, + "grad_norm": 2.859520196914673, + "learning_rate": 2.9688209475840005e-06, + "loss": 0.4679, + "step": 5639 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 2.518580436706543, + "learning_rate": 2.968208173216971e-06, + "loss": 0.3772, + "step": 5640 + }, + { + "epoch": 2.6671394799054373, + "grad_norm": 2.7624926567077637, + "learning_rate": 2.967595369696671e-06, + "loss": 0.4753, + "step": 5641 + }, + { + "epoch": 2.667612293144208, + "grad_norm": 2.654003620147705, + "learning_rate": 2.966982537061257e-06, + "loss": 0.4583, + "step": 5642 + }, + { + "epoch": 2.6680851063829785, + "grad_norm": 2.8473968505859375, + "learning_rate": 2.966369675348888e-06, + "loss": 0.4623, + "step": 5643 + }, + { + "epoch": 2.6685579196217493, + "grad_norm": 2.5587947368621826, + "learning_rate": 2.9657567845977253e-06, + "loss": 0.4014, + "step": 5644 + }, + { + "epoch": 2.66903073286052, + "grad_norm": 2.572220802307129, + "learning_rate": 2.96514386484593e-06, + "loss": 0.4249, + "step": 5645 + }, + { + "epoch": 2.669503546099291, + "grad_norm": 2.7995707988739014, + "learning_rate": 2.964530916131665e-06, + "loss": 0.4575, + "step": 5646 + }, + { + "epoch": 2.6699763593380617, + "grad_norm": 2.8712687492370605, + "learning_rate": 2.963917938493097e-06, + "loss": 0.4353, + "step": 5647 + }, + { + "epoch": 2.670449172576832, + "grad_norm": 2.856473207473755, + "learning_rate": 2.963304931968393e-06, + "loss": 0.4345, + "step": 5648 + }, + { + "epoch": 2.670921985815603, + "grad_norm": 2.709198474884033, + "learning_rate": 2.9626918965957224e-06, + "loss": 0.4116, + "step": 5649 + }, + { + "epoch": 2.6713947990543736, + "grad_norm": 2.8144607543945312, + "learning_rate": 2.962078832413257e-06, + "loss": 0.4575, + "step": 5650 + }, + { + "epoch": 2.671867612293144, + "grad_norm": 3.131911039352417, + "learning_rate": 2.961465739459168e-06, + "loss": 0.4743, + "step": 5651 + }, + { + "epoch": 2.6723404255319148, + "grad_norm": 2.8487515449523926, + "learning_rate": 2.9608526177716316e-06, + "loss": 0.4314, + "step": 5652 + }, + { + "epoch": 2.6728132387706856, + "grad_norm": 2.613229751586914, + "learning_rate": 2.960239467388823e-06, + "loss": 0.4807, + "step": 5653 + }, + { + "epoch": 2.6732860520094563, + "grad_norm": 2.5049116611480713, + "learning_rate": 2.9596262883489213e-06, + "loss": 0.4708, + "step": 5654 + }, + { + "epoch": 2.673758865248227, + "grad_norm": 2.6347460746765137, + "learning_rate": 2.9590130806901052e-06, + "loss": 0.3689, + "step": 5655 + }, + { + "epoch": 2.6742316784869975, + 
"grad_norm": 3.3290371894836426, + "learning_rate": 2.9583998444505578e-06, + "loss": 0.4674, + "step": 5656 + }, + { + "epoch": 2.6747044917257683, + "grad_norm": 2.748403549194336, + "learning_rate": 2.957786579668462e-06, + "loss": 0.3852, + "step": 5657 + }, + { + "epoch": 2.675177304964539, + "grad_norm": 2.837573766708374, + "learning_rate": 2.957173286382003e-06, + "loss": 0.4541, + "step": 5658 + }, + { + "epoch": 2.6756501182033094, + "grad_norm": 3.0976510047912598, + "learning_rate": 2.9565599646293686e-06, + "loss": 0.4669, + "step": 5659 + }, + { + "epoch": 2.6761229314420802, + "grad_norm": 2.7059597969055176, + "learning_rate": 2.955946614448747e-06, + "loss": 0.3935, + "step": 5660 + }, + { + "epoch": 2.676595744680851, + "grad_norm": 2.6700541973114014, + "learning_rate": 2.9553332358783294e-06, + "loss": 0.4322, + "step": 5661 + }, + { + "epoch": 2.677068557919622, + "grad_norm": 2.9782698154449463, + "learning_rate": 2.9547198289563068e-06, + "loss": 0.4338, + "step": 5662 + }, + { + "epoch": 2.6775413711583926, + "grad_norm": 2.637876510620117, + "learning_rate": 2.9541063937208755e-06, + "loss": 0.4289, + "step": 5663 + }, + { + "epoch": 2.678014184397163, + "grad_norm": 3.421949863433838, + "learning_rate": 2.953492930210229e-06, + "loss": 0.5458, + "step": 5664 + }, + { + "epoch": 2.678486997635934, + "grad_norm": 2.8273842334747314, + "learning_rate": 2.952879438462567e-06, + "loss": 0.4529, + "step": 5665 + }, + { + "epoch": 2.6789598108747046, + "grad_norm": 2.9090168476104736, + "learning_rate": 2.9522659185160873e-06, + "loss": 0.444, + "step": 5666 + }, + { + "epoch": 2.679432624113475, + "grad_norm": 2.646710157394409, + "learning_rate": 2.9516523704089927e-06, + "loss": 0.4226, + "step": 5667 + }, + { + "epoch": 2.6799054373522457, + "grad_norm": 2.65915584564209, + "learning_rate": 2.951038794179486e-06, + "loss": 0.4307, + "step": 5668 + }, + { + "epoch": 2.6803782505910165, + "grad_norm": 3.004507303237915, + "learning_rate": 2.950425189865771e-06, + "loss": 0.4799, + "step": 5669 + }, + { + "epoch": 2.6808510638297873, + "grad_norm": 2.5210134983062744, + "learning_rate": 2.949811557506054e-06, + "loss": 0.3842, + "step": 5670 + }, + { + "epoch": 2.681323877068558, + "grad_norm": 2.8072893619537354, + "learning_rate": 2.9491978971385436e-06, + "loss": 0.435, + "step": 5671 + }, + { + "epoch": 2.6817966903073285, + "grad_norm": 2.5701990127563477, + "learning_rate": 2.9485842088014498e-06, + "loss": 0.4932, + "step": 5672 + }, + { + "epoch": 2.6822695035460993, + "grad_norm": 2.9368457794189453, + "learning_rate": 2.9479704925329854e-06, + "loss": 0.455, + "step": 5673 + }, + { + "epoch": 2.68274231678487, + "grad_norm": 2.8576247692108154, + "learning_rate": 2.947356748371362e-06, + "loss": 0.4254, + "step": 5674 + }, + { + "epoch": 2.6832151300236404, + "grad_norm": 2.8999195098876953, + "learning_rate": 2.946742976354795e-06, + "loss": 0.4159, + "step": 5675 + }, + { + "epoch": 2.6836879432624112, + "grad_norm": 2.8439736366271973, + "learning_rate": 2.946129176521502e-06, + "loss": 0.4035, + "step": 5676 + }, + { + "epoch": 2.684160756501182, + "grad_norm": 2.8525729179382324, + "learning_rate": 2.945515348909702e-06, + "loss": 0.4137, + "step": 5677 + }, + { + "epoch": 2.684633569739953, + "grad_norm": 2.6573562622070312, + "learning_rate": 2.9449014935576147e-06, + "loss": 0.4203, + "step": 5678 + }, + { + "epoch": 2.6851063829787236, + "grad_norm": 2.765794277191162, + "learning_rate": 2.9442876105034616e-06, + "loss": 0.5184, + "step": 5679 + }, + 
{ + "epoch": 2.685579196217494, + "grad_norm": 2.694617748260498, + "learning_rate": 2.943673699785467e-06, + "loss": 0.417, + "step": 5680 + }, + { + "epoch": 2.6860520094562648, + "grad_norm": 2.740774393081665, + "learning_rate": 2.943059761441857e-06, + "loss": 0.4431, + "step": 5681 + }, + { + "epoch": 2.6865248226950356, + "grad_norm": 2.670642614364624, + "learning_rate": 2.942445795510859e-06, + "loss": 0.4298, + "step": 5682 + }, + { + "epoch": 2.686997635933806, + "grad_norm": 2.838907241821289, + "learning_rate": 2.9418318020307e-06, + "loss": 0.4529, + "step": 5683 + }, + { + "epoch": 2.6874704491725767, + "grad_norm": 2.562317371368408, + "learning_rate": 2.9412177810396135e-06, + "loss": 0.4251, + "step": 5684 + }, + { + "epoch": 2.6879432624113475, + "grad_norm": 2.5805928707122803, + "learning_rate": 2.9406037325758298e-06, + "loss": 0.4405, + "step": 5685 + }, + { + "epoch": 2.6884160756501183, + "grad_norm": 2.5701205730438232, + "learning_rate": 2.939989656677583e-06, + "loss": 0.4184, + "step": 5686 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 2.7990400791168213, + "learning_rate": 2.939375553383111e-06, + "loss": 0.4866, + "step": 5687 + }, + { + "epoch": 2.6893617021276595, + "grad_norm": 3.063319206237793, + "learning_rate": 2.9387614227306487e-06, + "loss": 0.4202, + "step": 5688 + }, + { + "epoch": 2.6898345153664303, + "grad_norm": 3.0891315937042236, + "learning_rate": 2.938147264758437e-06, + "loss": 0.4344, + "step": 5689 + }, + { + "epoch": 2.690307328605201, + "grad_norm": 2.8982670307159424, + "learning_rate": 2.9375330795047165e-06, + "loss": 0.4548, + "step": 5690 + }, + { + "epoch": 2.6907801418439714, + "grad_norm": 2.7947235107421875, + "learning_rate": 2.9369188670077293e-06, + "loss": 0.5028, + "step": 5691 + }, + { + "epoch": 2.691252955082742, + "grad_norm": 3.1615960597991943, + "learning_rate": 2.9363046273057206e-06, + "loss": 0.4855, + "step": 5692 + }, + { + "epoch": 2.691725768321513, + "grad_norm": 2.669516086578369, + "learning_rate": 2.935690360436935e-06, + "loss": 0.3813, + "step": 5693 + }, + { + "epoch": 2.692198581560284, + "grad_norm": 2.8743274211883545, + "learning_rate": 2.935076066439622e-06, + "loss": 0.4302, + "step": 5694 + }, + { + "epoch": 2.6926713947990546, + "grad_norm": 2.6829612255096436, + "learning_rate": 2.9344617453520295e-06, + "loss": 0.4063, + "step": 5695 + }, + { + "epoch": 2.693144208037825, + "grad_norm": 2.776447057723999, + "learning_rate": 2.9338473972124097e-06, + "loss": 0.4921, + "step": 5696 + }, + { + "epoch": 2.6936170212765957, + "grad_norm": 2.7865772247314453, + "learning_rate": 2.9332330220590143e-06, + "loss": 0.4939, + "step": 5697 + }, + { + "epoch": 2.6940898345153665, + "grad_norm": 3.020526170730591, + "learning_rate": 2.932618619930098e-06, + "loss": 0.4839, + "step": 5698 + }, + { + "epoch": 2.694562647754137, + "grad_norm": 2.637057065963745, + "learning_rate": 2.932004190863918e-06, + "loss": 0.4343, + "step": 5699 + }, + { + "epoch": 2.6950354609929077, + "grad_norm": 2.7426512241363525, + "learning_rate": 2.9313897348987314e-06, + "loss": 0.3609, + "step": 5700 + }, + { + "epoch": 2.6955082742316785, + "grad_norm": 2.767186164855957, + "learning_rate": 2.9307752520727974e-06, + "loss": 0.3793, + "step": 5701 + }, + { + "epoch": 2.6959810874704493, + "grad_norm": 2.4791622161865234, + "learning_rate": 2.930160742424377e-06, + "loss": 0.4192, + "step": 5702 + }, + { + "epoch": 2.69645390070922, + "grad_norm": 2.661461591720581, + "learning_rate": 2.9295462059917336e-06, + 
"loss": 0.4758, + "step": 5703 + }, + { + "epoch": 2.6969267139479904, + "grad_norm": 2.896242380142212, + "learning_rate": 2.928931642813131e-06, + "loss": 0.42, + "step": 5704 + }, + { + "epoch": 2.6973995271867612, + "grad_norm": 2.783813238143921, + "learning_rate": 2.9283170529268366e-06, + "loss": 0.4726, + "step": 5705 + }, + { + "epoch": 2.697872340425532, + "grad_norm": 2.4347333908081055, + "learning_rate": 2.927702436371117e-06, + "loss": 0.4199, + "step": 5706 + }, + { + "epoch": 2.6983451536643024, + "grad_norm": 2.4643805027008057, + "learning_rate": 2.927087793184242e-06, + "loss": 0.3578, + "step": 5707 + }, + { + "epoch": 2.698817966903073, + "grad_norm": 2.6396660804748535, + "learning_rate": 2.9264731234044835e-06, + "loss": 0.4509, + "step": 5708 + }, + { + "epoch": 2.699290780141844, + "grad_norm": 2.7341182231903076, + "learning_rate": 2.925858427070113e-06, + "loss": 0.4331, + "step": 5709 + }, + { + "epoch": 2.699763593380615, + "grad_norm": 2.7578938007354736, + "learning_rate": 2.9252437042194058e-06, + "loss": 0.4508, + "step": 5710 + }, + { + "epoch": 2.7002364066193856, + "grad_norm": 2.557788133621216, + "learning_rate": 2.9246289548906375e-06, + "loss": 0.3775, + "step": 5711 + }, + { + "epoch": 2.700709219858156, + "grad_norm": 2.802851676940918, + "learning_rate": 2.924014179122086e-06, + "loss": 0.4518, + "step": 5712 + }, + { + "epoch": 2.7011820330969267, + "grad_norm": 2.4773001670837402, + "learning_rate": 2.9233993769520313e-06, + "loss": 0.4019, + "step": 5713 + }, + { + "epoch": 2.7016548463356975, + "grad_norm": 3.108971357345581, + "learning_rate": 2.922784548418754e-06, + "loss": 0.4715, + "step": 5714 + }, + { + "epoch": 2.702127659574468, + "grad_norm": 2.8596770763397217, + "learning_rate": 2.9221696935605366e-06, + "loss": 0.4361, + "step": 5715 + }, + { + "epoch": 2.7026004728132387, + "grad_norm": 2.570604085922241, + "learning_rate": 2.9215548124156633e-06, + "loss": 0.3982, + "step": 5716 + }, + { + "epoch": 2.7030732860520095, + "grad_norm": 2.3157799243927, + "learning_rate": 2.9209399050224206e-06, + "loss": 0.456, + "step": 5717 + }, + { + "epoch": 2.7035460992907803, + "grad_norm": 2.6865758895874023, + "learning_rate": 2.9203249714190952e-06, + "loss": 0.4441, + "step": 5718 + }, + { + "epoch": 2.704018912529551, + "grad_norm": 2.76723313331604, + "learning_rate": 2.919710011643978e-06, + "loss": 0.464, + "step": 5719 + }, + { + "epoch": 2.7044917257683214, + "grad_norm": 2.648792028427124, + "learning_rate": 2.9190950257353578e-06, + "loss": 0.3426, + "step": 5720 + }, + { + "epoch": 2.704964539007092, + "grad_norm": 2.878739833831787, + "learning_rate": 2.9184800137315276e-06, + "loss": 0.4431, + "step": 5721 + }, + { + "epoch": 2.705437352245863, + "grad_norm": 2.670567274093628, + "learning_rate": 2.917864975670783e-06, + "loss": 0.4347, + "step": 5722 + }, + { + "epoch": 2.7059101654846334, + "grad_norm": 2.7031569480895996, + "learning_rate": 2.9172499115914184e-06, + "loss": 0.4557, + "step": 5723 + }, + { + "epoch": 2.706382978723404, + "grad_norm": 2.5225696563720703, + "learning_rate": 2.9166348215317314e-06, + "loss": 0.4159, + "step": 5724 + }, + { + "epoch": 2.706855791962175, + "grad_norm": 2.8676085472106934, + "learning_rate": 2.916019705530021e-06, + "loss": 0.5018, + "step": 5725 + }, + { + "epoch": 2.7073286052009458, + "grad_norm": 2.576463460922241, + "learning_rate": 2.915404563624587e-06, + "loss": 0.4317, + "step": 5726 + }, + { + "epoch": 2.7078014184397166, + "grad_norm": 3.155565023422241, + 
"learning_rate": 2.9147893958537328e-06, + "loss": 0.5029, + "step": 5727 + }, + { + "epoch": 2.708274231678487, + "grad_norm": 2.604079008102417, + "learning_rate": 2.9141742022557622e-06, + "loss": 0.4324, + "step": 5728 + }, + { + "epoch": 2.7087470449172577, + "grad_norm": 2.6597228050231934, + "learning_rate": 2.913558982868979e-06, + "loss": 0.4335, + "step": 5729 + }, + { + "epoch": 2.7092198581560285, + "grad_norm": 2.811384439468384, + "learning_rate": 2.9129437377316923e-06, + "loss": 0.4031, + "step": 5730 + }, + { + "epoch": 2.709692671394799, + "grad_norm": 3.1041207313537598, + "learning_rate": 2.91232846688221e-06, + "loss": 0.481, + "step": 5731 + }, + { + "epoch": 2.7101654846335697, + "grad_norm": 2.5992188453674316, + "learning_rate": 2.9117131703588414e-06, + "loss": 0.4266, + "step": 5732 + }, + { + "epoch": 2.7106382978723405, + "grad_norm": 2.7726242542266846, + "learning_rate": 2.911097848199899e-06, + "loss": 0.4464, + "step": 5733 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 2.8683483600616455, + "learning_rate": 2.9104825004436966e-06, + "loss": 0.4248, + "step": 5734 + }, + { + "epoch": 2.711583924349882, + "grad_norm": 2.776386022567749, + "learning_rate": 2.9098671271285484e-06, + "loss": 0.4556, + "step": 5735 + }, + { + "epoch": 2.7120567375886524, + "grad_norm": 2.7612528800964355, + "learning_rate": 2.909251728292771e-06, + "loss": 0.455, + "step": 5736 + }, + { + "epoch": 2.712529550827423, + "grad_norm": 2.9223551750183105, + "learning_rate": 2.908636303974684e-06, + "loss": 0.4302, + "step": 5737 + }, + { + "epoch": 2.713002364066194, + "grad_norm": 2.898226022720337, + "learning_rate": 2.908020854212606e-06, + "loss": 0.4827, + "step": 5738 + }, + { + "epoch": 2.7134751773049643, + "grad_norm": 2.706361770629883, + "learning_rate": 2.9074053790448576e-06, + "loss": 0.4444, + "step": 5739 + }, + { + "epoch": 2.713947990543735, + "grad_norm": 2.8227248191833496, + "learning_rate": 2.9067898785097637e-06, + "loss": 0.4661, + "step": 5740 + }, + { + "epoch": 2.714420803782506, + "grad_norm": 2.597837448120117, + "learning_rate": 2.9061743526456474e-06, + "loss": 0.4646, + "step": 5741 + }, + { + "epoch": 2.7148936170212767, + "grad_norm": 2.5525131225585938, + "learning_rate": 2.9055588014908354e-06, + "loss": 0.4172, + "step": 5742 + }, + { + "epoch": 2.7153664302600475, + "grad_norm": 2.713071823120117, + "learning_rate": 2.904943225083655e-06, + "loss": 0.4893, + "step": 5743 + }, + { + "epoch": 2.715839243498818, + "grad_norm": 2.538623571395874, + "learning_rate": 2.9043276234624353e-06, + "loss": 0.3905, + "step": 5744 + }, + { + "epoch": 2.7163120567375887, + "grad_norm": 2.5190389156341553, + "learning_rate": 2.9037119966655076e-06, + "loss": 0.4318, + "step": 5745 + }, + { + "epoch": 2.7167848699763595, + "grad_norm": 2.6587612628936768, + "learning_rate": 2.903096344731204e-06, + "loss": 0.4153, + "step": 5746 + }, + { + "epoch": 2.71725768321513, + "grad_norm": 2.836731433868408, + "learning_rate": 2.902480667697859e-06, + "loss": 0.4779, + "step": 5747 + }, + { + "epoch": 2.7177304964539006, + "grad_norm": 2.8076045513153076, + "learning_rate": 2.9018649656038074e-06, + "loss": 0.5126, + "step": 5748 + }, + { + "epoch": 2.7182033096926714, + "grad_norm": 2.8930516242980957, + "learning_rate": 2.9012492384873865e-06, + "loss": 0.4561, + "step": 5749 + }, + { + "epoch": 2.7186761229314422, + "grad_norm": 2.7000370025634766, + "learning_rate": 2.9006334863869343e-06, + "loss": 0.4659, + "step": 5750 + }, + { + "epoch": 2.719148936170213, 
+ "grad_norm": 2.927011251449585, + "learning_rate": 2.9000177093407926e-06, + "loss": 0.5123, + "step": 5751 + }, + { + "epoch": 2.7196217494089834, + "grad_norm": 3.0102779865264893, + "learning_rate": 2.8994019073873015e-06, + "loss": 0.3972, + "step": 5752 + }, + { + "epoch": 2.720094562647754, + "grad_norm": 2.778838634490967, + "learning_rate": 2.8987860805648054e-06, + "loss": 0.4922, + "step": 5753 + }, + { + "epoch": 2.720567375886525, + "grad_norm": 2.6150314807891846, + "learning_rate": 2.898170228911648e-06, + "loss": 0.4425, + "step": 5754 + }, + { + "epoch": 2.7210401891252953, + "grad_norm": 2.9329984188079834, + "learning_rate": 2.8975543524661777e-06, + "loss": 0.4872, + "step": 5755 + }, + { + "epoch": 2.721513002364066, + "grad_norm": 2.756803512573242, + "learning_rate": 2.8969384512667404e-06, + "loss": 0.4362, + "step": 5756 + }, + { + "epoch": 2.721985815602837, + "grad_norm": 2.600877285003662, + "learning_rate": 2.896322525351686e-06, + "loss": 0.4802, + "step": 5757 + }, + { + "epoch": 2.7224586288416077, + "grad_norm": 2.647069215774536, + "learning_rate": 2.8957065747593655e-06, + "loss": 0.4649, + "step": 5758 + }, + { + "epoch": 2.7229314420803785, + "grad_norm": 2.845388174057007, + "learning_rate": 2.895090599528132e-06, + "loss": 0.4533, + "step": 5759 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 2.973881721496582, + "learning_rate": 2.8944745996963397e-06, + "loss": 0.4959, + "step": 5760 + }, + { + "epoch": 2.7238770685579197, + "grad_norm": 2.8995487689971924, + "learning_rate": 2.8938585753023435e-06, + "loss": 0.4597, + "step": 5761 + }, + { + "epoch": 2.7243498817966905, + "grad_norm": 2.903693437576294, + "learning_rate": 2.8932425263845004e-06, + "loss": 0.4521, + "step": 5762 + }, + { + "epoch": 2.724822695035461, + "grad_norm": 2.7609009742736816, + "learning_rate": 2.8926264529811702e-06, + "loss": 0.4399, + "step": 5763 + }, + { + "epoch": 2.7252955082742316, + "grad_norm": 2.788787603378296, + "learning_rate": 2.892010355130712e-06, + "loss": 0.4614, + "step": 5764 + }, + { + "epoch": 2.7257683215130024, + "grad_norm": 2.786498785018921, + "learning_rate": 2.8913942328714887e-06, + "loss": 0.4798, + "step": 5765 + }, + { + "epoch": 2.726241134751773, + "grad_norm": 2.9809393882751465, + "learning_rate": 2.8907780862418616e-06, + "loss": 0.5108, + "step": 5766 + }, + { + "epoch": 2.726713947990544, + "grad_norm": 2.6621177196502686, + "learning_rate": 2.8901619152801967e-06, + "loss": 0.4031, + "step": 5767 + }, + { + "epoch": 2.7271867612293144, + "grad_norm": 3.3092098236083984, + "learning_rate": 2.8895457200248607e-06, + "loss": 0.4671, + "step": 5768 + }, + { + "epoch": 2.727659574468085, + "grad_norm": 2.866306781768799, + "learning_rate": 2.8889295005142204e-06, + "loss": 0.4434, + "step": 5769 + }, + { + "epoch": 2.728132387706856, + "grad_norm": 2.6861231327056885, + "learning_rate": 2.888313256786646e-06, + "loss": 0.429, + "step": 5770 + }, + { + "epoch": 2.7286052009456263, + "grad_norm": 2.873180389404297, + "learning_rate": 2.8876969888805072e-06, + "loss": 0.4412, + "step": 5771 + }, + { + "epoch": 2.729078014184397, + "grad_norm": 2.511678695678711, + "learning_rate": 2.887080696834178e-06, + "loss": 0.4024, + "step": 5772 + }, + { + "epoch": 2.729550827423168, + "grad_norm": 2.6502726078033447, + "learning_rate": 2.88646438068603e-06, + "loss": 0.4357, + "step": 5773 + }, + { + "epoch": 2.7300236406619387, + "grad_norm": 2.7156145572662354, + "learning_rate": 2.8858480404744403e-06, + "loss": 0.4511, + "step": 5774 + }, 
+ { + "epoch": 2.7304964539007095, + "grad_norm": 2.882582187652588, + "learning_rate": 2.8852316762377842e-06, + "loss": 0.4822, + "step": 5775 + }, + { + "epoch": 2.73096926713948, + "grad_norm": 2.7139666080474854, + "learning_rate": 2.8846152880144413e-06, + "loss": 0.4666, + "step": 5776 + }, + { + "epoch": 2.7314420803782506, + "grad_norm": 2.7453949451446533, + "learning_rate": 2.8839988758427907e-06, + "loss": 0.3927, + "step": 5777 + }, + { + "epoch": 2.731914893617021, + "grad_norm": 2.7859580516815186, + "learning_rate": 2.883382439761214e-06, + "loss": 0.4466, + "step": 5778 + }, + { + "epoch": 2.732387706855792, + "grad_norm": 2.695234537124634, + "learning_rate": 2.882765979808094e-06, + "loss": 0.4227, + "step": 5779 + }, + { + "epoch": 2.7328605200945626, + "grad_norm": 2.8081552982330322, + "learning_rate": 2.8821494960218148e-06, + "loss": 0.447, + "step": 5780 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 2.887643337249756, + "learning_rate": 2.881532988440762e-06, + "loss": 0.5018, + "step": 5781 + }, + { + "epoch": 2.733806146572104, + "grad_norm": 3.108212471008301, + "learning_rate": 2.8809164571033233e-06, + "loss": 0.4132, + "step": 5782 + }, + { + "epoch": 2.7342789598108745, + "grad_norm": 2.874328374862671, + "learning_rate": 2.880299902047886e-06, + "loss": 0.4618, + "step": 5783 + }, + { + "epoch": 2.7347517730496453, + "grad_norm": 3.089132308959961, + "learning_rate": 2.879683323312843e-06, + "loss": 0.4956, + "step": 5784 + }, + { + "epoch": 2.735224586288416, + "grad_norm": 2.5173206329345703, + "learning_rate": 2.879066720936583e-06, + "loss": 0.4087, + "step": 5785 + }, + { + "epoch": 2.7356973995271865, + "grad_norm": 2.6401286125183105, + "learning_rate": 2.8784500949575014e-06, + "loss": 0.3995, + "step": 5786 + }, + { + "epoch": 2.7361702127659573, + "grad_norm": 2.9371910095214844, + "learning_rate": 2.877833445413991e-06, + "loss": 0.5209, + "step": 5787 + }, + { + "epoch": 2.736643026004728, + "grad_norm": 3.218158006668091, + "learning_rate": 2.8772167723444498e-06, + "loss": 0.4275, + "step": 5788 + }, + { + "epoch": 2.737115839243499, + "grad_norm": 2.9072160720825195, + "learning_rate": 2.8766000757872736e-06, + "loss": 0.4244, + "step": 5789 + }, + { + "epoch": 2.7375886524822697, + "grad_norm": 3.0378096103668213, + "learning_rate": 2.8759833557808614e-06, + "loss": 0.507, + "step": 5790 + }, + { + "epoch": 2.73806146572104, + "grad_norm": 2.728353977203369, + "learning_rate": 2.8753666123636148e-06, + "loss": 0.413, + "step": 5791 + }, + { + "epoch": 2.738534278959811, + "grad_norm": 2.6869957447052, + "learning_rate": 2.874749845573935e-06, + "loss": 0.44, + "step": 5792 + }, + { + "epoch": 2.7390070921985816, + "grad_norm": 2.6381702423095703, + "learning_rate": 2.8741330554502263e-06, + "loss": 0.4708, + "step": 5793 + }, + { + "epoch": 2.739479905437352, + "grad_norm": 2.6944689750671387, + "learning_rate": 2.873516242030892e-06, + "loss": 0.4555, + "step": 5794 + }, + { + "epoch": 2.739952718676123, + "grad_norm": 3.168473243713379, + "learning_rate": 2.8728994053543396e-06, + "loss": 0.4538, + "step": 5795 + }, + { + "epoch": 2.7404255319148936, + "grad_norm": 2.7504515647888184, + "learning_rate": 2.872282545458976e-06, + "loss": 0.4628, + "step": 5796 + }, + { + "epoch": 2.7408983451536644, + "grad_norm": 2.896462917327881, + "learning_rate": 2.8716656623832114e-06, + "loss": 0.4946, + "step": 5797 + }, + { + "epoch": 2.741371158392435, + "grad_norm": 2.8053417205810547, + "learning_rate": 2.8710487561654547e-06, + "loss": 
0.4893, + "step": 5798 + }, + { + "epoch": 2.7418439716312055, + "grad_norm": 2.63171124458313, + "learning_rate": 2.870431826844119e-06, + "loss": 0.4257, + "step": 5799 + }, + { + "epoch": 2.7423167848699763, + "grad_norm": 3.0963807106018066, + "learning_rate": 2.869814874457618e-06, + "loss": 0.5404, + "step": 5800 + }, + { + "epoch": 2.742789598108747, + "grad_norm": 2.591132164001465, + "learning_rate": 2.8691978990443664e-06, + "loss": 0.4015, + "step": 5801 + }, + { + "epoch": 2.7432624113475175, + "grad_norm": 3.0319552421569824, + "learning_rate": 2.8685809006427812e-06, + "loss": 0.4411, + "step": 5802 + }, + { + "epoch": 2.7437352245862883, + "grad_norm": 2.7791874408721924, + "learning_rate": 2.8679638792912784e-06, + "loss": 0.43, + "step": 5803 + }, + { + "epoch": 2.744208037825059, + "grad_norm": 3.530632495880127, + "learning_rate": 2.867346835028279e-06, + "loss": 0.4581, + "step": 5804 + }, + { + "epoch": 2.74468085106383, + "grad_norm": 3.2043099403381348, + "learning_rate": 2.8667297678922024e-06, + "loss": 0.4375, + "step": 5805 + }, + { + "epoch": 2.7451536643026007, + "grad_norm": 2.8442344665527344, + "learning_rate": 2.8661126779214716e-06, + "loss": 0.4059, + "step": 5806 + }, + { + "epoch": 2.745626477541371, + "grad_norm": 2.7561380863189697, + "learning_rate": 2.86549556515451e-06, + "loss": 0.4391, + "step": 5807 + }, + { + "epoch": 2.746099290780142, + "grad_norm": 3.229663848876953, + "learning_rate": 2.8648784296297418e-06, + "loss": 0.4579, + "step": 5808 + }, + { + "epoch": 2.7465721040189126, + "grad_norm": 2.8375027179718018, + "learning_rate": 2.864261271385593e-06, + "loss": 0.4566, + "step": 5809 + }, + { + "epoch": 2.747044917257683, + "grad_norm": 2.392998695373535, + "learning_rate": 2.863644090460493e-06, + "loss": 0.4123, + "step": 5810 + }, + { + "epoch": 2.7475177304964538, + "grad_norm": 2.707610607147217, + "learning_rate": 2.86302688689287e-06, + "loss": 0.4299, + "step": 5811 + }, + { + "epoch": 2.7479905437352246, + "grad_norm": 2.824042797088623, + "learning_rate": 2.8624096607211547e-06, + "loss": 0.3799, + "step": 5812 + }, + { + "epoch": 2.7484633569739954, + "grad_norm": 2.933102607727051, + "learning_rate": 2.861792411983779e-06, + "loss": 0.4416, + "step": 5813 + }, + { + "epoch": 2.748936170212766, + "grad_norm": 2.7000277042388916, + "learning_rate": 2.8611751407191757e-06, + "loss": 0.4346, + "step": 5814 + }, + { + "epoch": 2.7494089834515365, + "grad_norm": 2.6757142543792725, + "learning_rate": 2.86055784696578e-06, + "loss": 0.4559, + "step": 5815 + }, + { + "epoch": 2.7498817966903073, + "grad_norm": 2.5791053771972656, + "learning_rate": 2.8599405307620287e-06, + "loss": 0.4619, + "step": 5816 + }, + { + "epoch": 2.750354609929078, + "grad_norm": 3.0327374935150146, + "learning_rate": 2.859323192146359e-06, + "loss": 0.4163, + "step": 5817 + }, + { + "epoch": 2.7508274231678485, + "grad_norm": 2.5580220222473145, + "learning_rate": 2.8587058311572084e-06, + "loss": 0.4005, + "step": 5818 + }, + { + "epoch": 2.7513002364066192, + "grad_norm": 2.592179536819458, + "learning_rate": 2.85808844783302e-06, + "loss": 0.4404, + "step": 5819 + }, + { + "epoch": 2.75177304964539, + "grad_norm": 3.2779927253723145, + "learning_rate": 2.8574710422122342e-06, + "loss": 0.54, + "step": 5820 + }, + { + "epoch": 2.752245862884161, + "grad_norm": 2.4804370403289795, + "learning_rate": 2.8568536143332933e-06, + "loss": 0.4476, + "step": 5821 + }, + { + "epoch": 2.7527186761229316, + "grad_norm": 2.649477481842041, + "learning_rate": 
2.8562361642346427e-06, + "loss": 0.4336, + "step": 5822 + }, + { + "epoch": 2.753191489361702, + "grad_norm": 3.138587474822998, + "learning_rate": 2.855618691954728e-06, + "loss": 0.5042, + "step": 5823 + }, + { + "epoch": 2.753664302600473, + "grad_norm": 2.75093412399292, + "learning_rate": 2.855001197531997e-06, + "loss": 0.4327, + "step": 5824 + }, + { + "epoch": 2.7541371158392436, + "grad_norm": 2.678809642791748, + "learning_rate": 2.854383681004898e-06, + "loss": 0.4409, + "step": 5825 + }, + { + "epoch": 2.754609929078014, + "grad_norm": 2.965386390686035, + "learning_rate": 2.853766142411881e-06, + "loss": 0.4716, + "step": 5826 + }, + { + "epoch": 2.7550827423167847, + "grad_norm": 2.6419436931610107, + "learning_rate": 2.853148581791398e-06, + "loss": 0.4367, + "step": 5827 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 3.205794095993042, + "learning_rate": 2.8525309991819004e-06, + "loss": 0.4869, + "step": 5828 + }, + { + "epoch": 2.7560283687943263, + "grad_norm": 3.041008472442627, + "learning_rate": 2.851913394621844e-06, + "loss": 0.5087, + "step": 5829 + }, + { + "epoch": 2.756501182033097, + "grad_norm": 2.6525566577911377, + "learning_rate": 2.851295768149684e-06, + "loss": 0.3951, + "step": 5830 + }, + { + "epoch": 2.7569739952718675, + "grad_norm": 2.732220411300659, + "learning_rate": 2.850678119803876e-06, + "loss": 0.4797, + "step": 5831 + }, + { + "epoch": 2.7574468085106383, + "grad_norm": 2.8965251445770264, + "learning_rate": 2.8500604496228797e-06, + "loss": 0.4938, + "step": 5832 + }, + { + "epoch": 2.757919621749409, + "grad_norm": 2.48020076751709, + "learning_rate": 2.849442757645154e-06, + "loss": 0.4172, + "step": 5833 + }, + { + "epoch": 2.7583924349881794, + "grad_norm": 2.4764912128448486, + "learning_rate": 2.8488250439091603e-06, + "loss": 0.4123, + "step": 5834 + }, + { + "epoch": 2.7588652482269502, + "grad_norm": 2.4547016620635986, + "learning_rate": 2.84820730845336e-06, + "loss": 0.4116, + "step": 5835 + }, + { + "epoch": 2.759338061465721, + "grad_norm": 2.55476975440979, + "learning_rate": 2.847589551316218e-06, + "loss": 0.4744, + "step": 5836 + }, + { + "epoch": 2.759810874704492, + "grad_norm": 2.3866238594055176, + "learning_rate": 2.846971772536199e-06, + "loss": 0.4406, + "step": 5837 + }, + { + "epoch": 2.7602836879432626, + "grad_norm": 2.855318784713745, + "learning_rate": 2.8463539721517687e-06, + "loss": 0.4517, + "step": 5838 + }, + { + "epoch": 2.760756501182033, + "grad_norm": 2.527198314666748, + "learning_rate": 2.8457361502013954e-06, + "loss": 0.3588, + "step": 5839 + }, + { + "epoch": 2.7612293144208038, + "grad_norm": 2.6761462688446045, + "learning_rate": 2.8451183067235476e-06, + "loss": 0.4192, + "step": 5840 + }, + { + "epoch": 2.7617021276595746, + "grad_norm": 2.5692319869995117, + "learning_rate": 2.8445004417566967e-06, + "loss": 0.4108, + "step": 5841 + }, + { + "epoch": 2.762174940898345, + "grad_norm": 2.5721096992492676, + "learning_rate": 2.8438825553393133e-06, + "loss": 0.3941, + "step": 5842 + }, + { + "epoch": 2.7626477541371157, + "grad_norm": 2.699430227279663, + "learning_rate": 2.843264647509872e-06, + "loss": 0.4418, + "step": 5843 + }, + { + "epoch": 2.7631205673758865, + "grad_norm": 2.6943318843841553, + "learning_rate": 2.842646718306846e-06, + "loss": 0.4505, + "step": 5844 + }, + { + "epoch": 2.7635933806146573, + "grad_norm": 2.661656379699707, + "learning_rate": 2.8420287677687107e-06, + "loss": 0.4413, + "step": 5845 + }, + { + "epoch": 2.764066193853428, + "grad_norm": 
2.830467939376831, + "learning_rate": 2.8414107959339444e-06, + "loss": 0.5095, + "step": 5846 + }, + { + "epoch": 2.7645390070921985, + "grad_norm": 2.598053455352783, + "learning_rate": 2.840792802841024e-06, + "loss": 0.4029, + "step": 5847 + }, + { + "epoch": 2.7650118203309693, + "grad_norm": 2.641700029373169, + "learning_rate": 2.8401747885284316e-06, + "loss": 0.4237, + "step": 5848 + }, + { + "epoch": 2.76548463356974, + "grad_norm": 2.6672768592834473, + "learning_rate": 2.8395567530346454e-06, + "loss": 0.4181, + "step": 5849 + }, + { + "epoch": 2.7659574468085104, + "grad_norm": 2.5851705074310303, + "learning_rate": 2.838938696398149e-06, + "loss": 0.4165, + "step": 5850 + }, + { + "epoch": 2.766430260047281, + "grad_norm": 2.318120002746582, + "learning_rate": 2.8383206186574276e-06, + "loss": 0.3578, + "step": 5851 + }, + { + "epoch": 2.766903073286052, + "grad_norm": 2.6199793815612793, + "learning_rate": 2.8377025198509635e-06, + "loss": 0.4719, + "step": 5852 + }, + { + "epoch": 2.767375886524823, + "grad_norm": 2.7186086177825928, + "learning_rate": 2.837084400017245e-06, + "loss": 0.41, + "step": 5853 + }, + { + "epoch": 2.7678486997635936, + "grad_norm": 2.702514886856079, + "learning_rate": 2.8364662591947583e-06, + "loss": 0.4659, + "step": 5854 + }, + { + "epoch": 2.768321513002364, + "grad_norm": 2.612375259399414, + "learning_rate": 2.835848097421993e-06, + "loss": 0.4252, + "step": 5855 + }, + { + "epoch": 2.7687943262411348, + "grad_norm": 3.0127978324890137, + "learning_rate": 2.8352299147374394e-06, + "loss": 0.4084, + "step": 5856 + }, + { + "epoch": 2.7692671394799055, + "grad_norm": 2.6460049152374268, + "learning_rate": 2.83461171117959e-06, + "loss": 0.4035, + "step": 5857 + }, + { + "epoch": 2.769739952718676, + "grad_norm": 2.9844725131988525, + "learning_rate": 2.8339934867869357e-06, + "loss": 0.4912, + "step": 5858 + }, + { + "epoch": 2.7702127659574467, + "grad_norm": 2.731217861175537, + "learning_rate": 2.833375241597972e-06, + "loss": 0.4112, + "step": 5859 + }, + { + "epoch": 2.7706855791962175, + "grad_norm": 2.731194496154785, + "learning_rate": 2.832756975651193e-06, + "loss": 0.4516, + "step": 5860 + }, + { + "epoch": 2.7711583924349883, + "grad_norm": 3.0532076358795166, + "learning_rate": 2.8321386889850965e-06, + "loss": 0.3959, + "step": 5861 + }, + { + "epoch": 2.771631205673759, + "grad_norm": 3.5437800884246826, + "learning_rate": 2.831520381638181e-06, + "loss": 0.6055, + "step": 5862 + }, + { + "epoch": 2.7721040189125294, + "grad_norm": 2.4297714233398438, + "learning_rate": 2.830902053648944e-06, + "loss": 0.4038, + "step": 5863 + }, + { + "epoch": 2.7725768321513002, + "grad_norm": 2.696768045425415, + "learning_rate": 2.8302837050558876e-06, + "loss": 0.3983, + "step": 5864 + }, + { + "epoch": 2.773049645390071, + "grad_norm": 2.6574649810791016, + "learning_rate": 2.8296653358975122e-06, + "loss": 0.4937, + "step": 5865 + }, + { + "epoch": 2.7735224586288414, + "grad_norm": 2.9393341541290283, + "learning_rate": 2.8290469462123234e-06, + "loss": 0.4603, + "step": 5866 + }, + { + "epoch": 2.773995271867612, + "grad_norm": 2.7630696296691895, + "learning_rate": 2.828428536038824e-06, + "loss": 0.4663, + "step": 5867 + }, + { + "epoch": 2.774468085106383, + "grad_norm": 2.7354233264923096, + "learning_rate": 2.8278101054155183e-06, + "loss": 0.4444, + "step": 5868 + }, + { + "epoch": 2.774940898345154, + "grad_norm": 3.0489425659179688, + "learning_rate": 2.827191654380915e-06, + "loss": 0.4684, + "step": 5869 + }, + { + "epoch": 
2.7754137115839246, + "grad_norm": 2.9602572917938232, + "learning_rate": 2.8265731829735226e-06, + "loss": 0.4571, + "step": 5870 + }, + { + "epoch": 2.775886524822695, + "grad_norm": 2.774132013320923, + "learning_rate": 2.825954691231851e-06, + "loss": 0.4458, + "step": 5871 + }, + { + "epoch": 2.7763593380614657, + "grad_norm": 2.696622133255005, + "learning_rate": 2.825336179194409e-06, + "loss": 0.4933, + "step": 5872 + }, + { + "epoch": 2.7768321513002365, + "grad_norm": 2.742184638977051, + "learning_rate": 2.8247176468997096e-06, + "loss": 0.4464, + "step": 5873 + }, + { + "epoch": 2.777304964539007, + "grad_norm": 2.7033183574676514, + "learning_rate": 2.824099094386266e-06, + "loss": 0.4369, + "step": 5874 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 2.7264044284820557, + "learning_rate": 2.8234805216925935e-06, + "loss": 0.4621, + "step": 5875 + }, + { + "epoch": 2.7782505910165485, + "grad_norm": 2.6417739391326904, + "learning_rate": 2.822861928857208e-06, + "loss": 0.4254, + "step": 5876 + }, + { + "epoch": 2.7787234042553193, + "grad_norm": 3.17209529876709, + "learning_rate": 2.8222433159186245e-06, + "loss": 0.5011, + "step": 5877 + }, + { + "epoch": 2.77919621749409, + "grad_norm": 3.1434381008148193, + "learning_rate": 2.8216246829153633e-06, + "loss": 0.4567, + "step": 5878 + }, + { + "epoch": 2.7796690307328604, + "grad_norm": 2.781608819961548, + "learning_rate": 2.821006029885943e-06, + "loss": 0.4723, + "step": 5879 + }, + { + "epoch": 2.780141843971631, + "grad_norm": 3.00079345703125, + "learning_rate": 2.820387356868885e-06, + "loss": 0.4796, + "step": 5880 + }, + { + "epoch": 2.780614657210402, + "grad_norm": 2.703555107116699, + "learning_rate": 2.819768663902712e-06, + "loss": 0.4577, + "step": 5881 + }, + { + "epoch": 2.7810874704491724, + "grad_norm": 2.5741801261901855, + "learning_rate": 2.8191499510259453e-06, + "loss": 0.4255, + "step": 5882 + }, + { + "epoch": 2.781560283687943, + "grad_norm": 2.9871208667755127, + "learning_rate": 2.8185312182771112e-06, + "loss": 0.4495, + "step": 5883 + }, + { + "epoch": 2.782033096926714, + "grad_norm": 2.525317668914795, + "learning_rate": 2.8179124656947343e-06, + "loss": 0.4428, + "step": 5884 + }, + { + "epoch": 2.7825059101654848, + "grad_norm": 2.525092840194702, + "learning_rate": 2.817293693317343e-06, + "loss": 0.4348, + "step": 5885 + }, + { + "epoch": 2.7829787234042556, + "grad_norm": 2.8485171794891357, + "learning_rate": 2.816674901183464e-06, + "loss": 0.4206, + "step": 5886 + }, + { + "epoch": 2.783451536643026, + "grad_norm": 2.6612746715545654, + "learning_rate": 2.8160560893316272e-06, + "loss": 0.396, + "step": 5887 + }, + { + "epoch": 2.7839243498817967, + "grad_norm": 2.7093865871429443, + "learning_rate": 2.815437257800364e-06, + "loss": 0.4468, + "step": 5888 + }, + { + "epoch": 2.7843971631205675, + "grad_norm": 2.6130900382995605, + "learning_rate": 2.814818406628206e-06, + "loss": 0.443, + "step": 5889 + }, + { + "epoch": 2.784869976359338, + "grad_norm": 2.8147552013397217, + "learning_rate": 2.8141995358536866e-06, + "loss": 0.4454, + "step": 5890 + }, + { + "epoch": 2.7853427895981087, + "grad_norm": 2.5621275901794434, + "learning_rate": 2.8135806455153395e-06, + "loss": 0.439, + "step": 5891 + }, + { + "epoch": 2.7858156028368795, + "grad_norm": 2.880228281021118, + "learning_rate": 2.812961735651701e-06, + "loss": 0.3895, + "step": 5892 + }, + { + "epoch": 2.7862884160756503, + "grad_norm": 2.5861377716064453, + "learning_rate": 2.8123428063013068e-06, + "loss": 0.4402, + 
"step": 5893 + }, + { + "epoch": 2.786761229314421, + "grad_norm": 2.9707765579223633, + "learning_rate": 2.811723857502696e-06, + "loss": 0.4461, + "step": 5894 + }, + { + "epoch": 2.7872340425531914, + "grad_norm": 2.923999309539795, + "learning_rate": 2.811104889294408e-06, + "loss": 0.4395, + "step": 5895 + }, + { + "epoch": 2.787706855791962, + "grad_norm": 2.846933603286743, + "learning_rate": 2.810485901714981e-06, + "loss": 0.5168, + "step": 5896 + }, + { + "epoch": 2.788179669030733, + "grad_norm": 4.1052350997924805, + "learning_rate": 2.8098668948029597e-06, + "loss": 0.5152, + "step": 5897 + }, + { + "epoch": 2.7886524822695034, + "grad_norm": 2.7391018867492676, + "learning_rate": 2.8092478685968856e-06, + "loss": 0.4515, + "step": 5898 + }, + { + "epoch": 2.789125295508274, + "grad_norm": 2.976088285446167, + "learning_rate": 2.8086288231353027e-06, + "loss": 0.5156, + "step": 5899 + }, + { + "epoch": 2.789598108747045, + "grad_norm": 2.6139633655548096, + "learning_rate": 2.8080097584567562e-06, + "loss": 0.4237, + "step": 5900 + }, + { + "epoch": 2.7900709219858157, + "grad_norm": 2.501654624938965, + "learning_rate": 2.807390674599792e-06, + "loss": 0.4349, + "step": 5901 + }, + { + "epoch": 2.7905437352245865, + "grad_norm": 2.8814525604248047, + "learning_rate": 2.8067715716029586e-06, + "loss": 0.4866, + "step": 5902 + }, + { + "epoch": 2.791016548463357, + "grad_norm": 2.7953200340270996, + "learning_rate": 2.8061524495048046e-06, + "loss": 0.3964, + "step": 5903 + }, + { + "epoch": 2.7914893617021277, + "grad_norm": 2.7362849712371826, + "learning_rate": 2.8055333083438808e-06, + "loss": 0.4181, + "step": 5904 + }, + { + "epoch": 2.7919621749408985, + "grad_norm": 2.9740512371063232, + "learning_rate": 2.8049141481587366e-06, + "loss": 0.4784, + "step": 5905 + }, + { + "epoch": 2.792434988179669, + "grad_norm": 2.595813274383545, + "learning_rate": 2.8042949689879262e-06, + "loss": 0.4421, + "step": 5906 + }, + { + "epoch": 2.7929078014184396, + "grad_norm": 2.886899948120117, + "learning_rate": 2.803675770870002e-06, + "loss": 0.4435, + "step": 5907 + }, + { + "epoch": 2.7933806146572104, + "grad_norm": 2.6057486534118652, + "learning_rate": 2.8030565538435196e-06, + "loss": 0.4472, + "step": 5908 + }, + { + "epoch": 2.7938534278959812, + "grad_norm": 2.7422802448272705, + "learning_rate": 2.802437317947034e-06, + "loss": 0.4799, + "step": 5909 + }, + { + "epoch": 2.794326241134752, + "grad_norm": 2.3904244899749756, + "learning_rate": 2.801818063219102e-06, + "loss": 0.4508, + "step": 5910 + }, + { + "epoch": 2.7947990543735224, + "grad_norm": 2.8434207439422607, + "learning_rate": 2.8011987896982835e-06, + "loss": 0.4473, + "step": 5911 + }, + { + "epoch": 2.795271867612293, + "grad_norm": 2.916088819503784, + "learning_rate": 2.8005794974231366e-06, + "loss": 0.464, + "step": 5912 + }, + { + "epoch": 2.795744680851064, + "grad_norm": 2.6483397483825684, + "learning_rate": 2.7999601864322236e-06, + "loss": 0.441, + "step": 5913 + }, + { + "epoch": 2.7962174940898343, + "grad_norm": 2.9287428855895996, + "learning_rate": 2.7993408567641033e-06, + "loss": 0.4551, + "step": 5914 + }, + { + "epoch": 2.796690307328605, + "grad_norm": 2.575024127960205, + "learning_rate": 2.798721508457342e-06, + "loss": 0.4494, + "step": 5915 + }, + { + "epoch": 2.797163120567376, + "grad_norm": 2.7156829833984375, + "learning_rate": 2.7981021415505015e-06, + "loss": 0.419, + "step": 5916 + }, + { + "epoch": 2.7976359338061467, + "grad_norm": 2.850553035736084, + "learning_rate": 
2.7974827560821482e-06, + "loss": 0.4709, + "step": 5917 + }, + { + "epoch": 2.7981087470449175, + "grad_norm": 2.673846483230591, + "learning_rate": 2.796863352090847e-06, + "loss": 0.4224, + "step": 5918 + }, + { + "epoch": 2.798581560283688, + "grad_norm": 2.9093217849731445, + "learning_rate": 2.796243929615168e-06, + "loss": 0.468, + "step": 5919 + }, + { + "epoch": 2.7990543735224587, + "grad_norm": 2.4853813648223877, + "learning_rate": 2.7956244886936775e-06, + "loss": 0.4723, + "step": 5920 + }, + { + "epoch": 2.7995271867612295, + "grad_norm": 3.026428461074829, + "learning_rate": 2.795005029364946e-06, + "loss": 0.4721, + "step": 5921 + }, + { + "epoch": 2.8, + "grad_norm": 2.886295795440674, + "learning_rate": 2.794385551667546e-06, + "loss": 0.456, + "step": 5922 + }, + { + "epoch": 2.8004728132387706, + "grad_norm": 3.2260656356811523, + "learning_rate": 2.7937660556400486e-06, + "loss": 0.4499, + "step": 5923 + }, + { + "epoch": 2.8009456264775414, + "grad_norm": 2.7971982955932617, + "learning_rate": 2.793146541321027e-06, + "loss": 0.3982, + "step": 5924 + }, + { + "epoch": 2.801418439716312, + "grad_norm": 2.85461163520813, + "learning_rate": 2.7925270087490546e-06, + "loss": 0.4841, + "step": 5925 + }, + { + "epoch": 2.801891252955083, + "grad_norm": 3.0642316341400146, + "learning_rate": 2.7919074579627086e-06, + "loss": 0.4538, + "step": 5926 + }, + { + "epoch": 2.8023640661938534, + "grad_norm": 2.9053616523742676, + "learning_rate": 2.7912878890005657e-06, + "loss": 0.434, + "step": 5927 + }, + { + "epoch": 2.802836879432624, + "grad_norm": 2.7649240493774414, + "learning_rate": 2.7906683019012027e-06, + "loss": 0.414, + "step": 5928 + }, + { + "epoch": 2.803309692671395, + "grad_norm": 2.8717660903930664, + "learning_rate": 2.7900486967031987e-06, + "loss": 0.4337, + "step": 5929 + }, + { + "epoch": 2.8037825059101653, + "grad_norm": 2.6860995292663574, + "learning_rate": 2.789429073445135e-06, + "loss": 0.447, + "step": 5930 + }, + { + "epoch": 2.804255319148936, + "grad_norm": 2.67509126663208, + "learning_rate": 2.7888094321655918e-06, + "loss": 0.4955, + "step": 5931 + }, + { + "epoch": 2.804728132387707, + "grad_norm": 2.7426326274871826, + "learning_rate": 2.7881897729031514e-06, + "loss": 0.4564, + "step": 5932 + }, + { + "epoch": 2.8052009456264777, + "grad_norm": 2.7087252140045166, + "learning_rate": 2.7875700956963973e-06, + "loss": 0.4571, + "step": 5933 + }, + { + "epoch": 2.8056737588652485, + "grad_norm": 2.513526439666748, + "learning_rate": 2.7869504005839147e-06, + "loss": 0.4361, + "step": 5934 + }, + { + "epoch": 2.806146572104019, + "grad_norm": 3.2246084213256836, + "learning_rate": 2.7863306876042885e-06, + "loss": 0.4612, + "step": 5935 + }, + { + "epoch": 2.8066193853427897, + "grad_norm": 3.226325511932373, + "learning_rate": 2.7857109567961066e-06, + "loss": 0.4528, + "step": 5936 + }, + { + "epoch": 2.8070921985815604, + "grad_norm": 2.8861422538757324, + "learning_rate": 2.785091208197956e-06, + "loss": 0.5049, + "step": 5937 + }, + { + "epoch": 2.807565011820331, + "grad_norm": 2.76279616355896, + "learning_rate": 2.7844714418484257e-06, + "loss": 0.4714, + "step": 5938 + }, + { + "epoch": 2.8080378250591016, + "grad_norm": 2.9591920375823975, + "learning_rate": 2.7838516577861063e-06, + "loss": 0.4633, + "step": 5939 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 2.536916971206665, + "learning_rate": 2.7832318560495885e-06, + "loss": 0.4108, + "step": 5940 + }, + { + "epoch": 2.808983451536643, + "grad_norm": 
3.2484991550445557, + "learning_rate": 2.7826120366774657e-06, + "loss": 0.4888, + "step": 5941 + }, + { + "epoch": 2.8094562647754135, + "grad_norm": 2.7129359245300293, + "learning_rate": 2.781992199708329e-06, + "loss": 0.4008, + "step": 5942 + }, + { + "epoch": 2.8099290780141843, + "grad_norm": 2.4176113605499268, + "learning_rate": 2.781372345180776e-06, + "loss": 0.3864, + "step": 5943 + }, + { + "epoch": 2.810401891252955, + "grad_norm": 2.6557252407073975, + "learning_rate": 2.7807524731334e-06, + "loss": 0.4295, + "step": 5944 + }, + { + "epoch": 2.8108747044917255, + "grad_norm": 2.9191324710845947, + "learning_rate": 2.7801325836047993e-06, + "loss": 0.4854, + "step": 5945 + }, + { + "epoch": 2.8113475177304963, + "grad_norm": 2.6325371265411377, + "learning_rate": 2.7795126766335705e-06, + "loss": 0.4332, + "step": 5946 + }, + { + "epoch": 2.811820330969267, + "grad_norm": 2.658337116241455, + "learning_rate": 2.778892752258314e-06, + "loss": 0.4276, + "step": 5947 + }, + { + "epoch": 2.812293144208038, + "grad_norm": 2.763782262802124, + "learning_rate": 2.778272810517627e-06, + "loss": 0.4246, + "step": 5948 + }, + { + "epoch": 2.8127659574468087, + "grad_norm": 2.407607078552246, + "learning_rate": 2.777652851450113e-06, + "loss": 0.3788, + "step": 5949 + }, + { + "epoch": 2.813238770685579, + "grad_norm": 3.0339951515197754, + "learning_rate": 2.7770328750943736e-06, + "loss": 0.477, + "step": 5950 + }, + { + "epoch": 2.81371158392435, + "grad_norm": 2.3475773334503174, + "learning_rate": 2.776412881489012e-06, + "loss": 0.4206, + "step": 5951 + }, + { + "epoch": 2.8141843971631206, + "grad_norm": 3.0455260276794434, + "learning_rate": 2.7757928706726318e-06, + "loss": 0.4301, + "step": 5952 + }, + { + "epoch": 2.814657210401891, + "grad_norm": 2.803920030593872, + "learning_rate": 2.7751728426838386e-06, + "loss": 0.3738, + "step": 5953 + }, + { + "epoch": 2.815130023640662, + "grad_norm": 3.1083319187164307, + "learning_rate": 2.77455279756124e-06, + "loss": 0.5365, + "step": 5954 + }, + { + "epoch": 2.8156028368794326, + "grad_norm": 3.180809497833252, + "learning_rate": 2.7739327353434427e-06, + "loss": 0.4789, + "step": 5955 + }, + { + "epoch": 2.8160756501182034, + "grad_norm": 2.975043773651123, + "learning_rate": 2.7733126560690543e-06, + "loss": 0.4798, + "step": 5956 + }, + { + "epoch": 2.816548463356974, + "grad_norm": 2.765475034713745, + "learning_rate": 2.772692559776685e-06, + "loss": 0.4206, + "step": 5957 + }, + { + "epoch": 2.8170212765957445, + "grad_norm": 2.48612380027771, + "learning_rate": 2.7720724465049463e-06, + "loss": 0.4234, + "step": 5958 + }, + { + "epoch": 2.8174940898345153, + "grad_norm": 2.7145729064941406, + "learning_rate": 2.77145231629245e-06, + "loss": 0.4713, + "step": 5959 + }, + { + "epoch": 2.817966903073286, + "grad_norm": 2.5993762016296387, + "learning_rate": 2.7708321691778074e-06, + "loss": 0.4144, + "step": 5960 + }, + { + "epoch": 2.8184397163120565, + "grad_norm": 3.0902538299560547, + "learning_rate": 2.770212005199633e-06, + "loss": 0.4822, + "step": 5961 + }, + { + "epoch": 2.8189125295508273, + "grad_norm": 2.849757671356201, + "learning_rate": 2.7695918243965424e-06, + "loss": 0.4449, + "step": 5962 + }, + { + "epoch": 2.819385342789598, + "grad_norm": 2.77148699760437, + "learning_rate": 2.768971626807151e-06, + "loss": 0.4448, + "step": 5963 + }, + { + "epoch": 2.819858156028369, + "grad_norm": 2.7865898609161377, + "learning_rate": 2.7683514124700757e-06, + "loss": 0.4944, + "step": 5964 + }, + { + "epoch": 
2.8203309692671397, + "grad_norm": 2.9057955741882324, + "learning_rate": 2.767731181423934e-06, + "loss": 0.5074, + "step": 5965 + }, + { + "epoch": 2.82080378250591, + "grad_norm": 2.725837469100952, + "learning_rate": 2.7671109337073465e-06, + "loss": 0.4207, + "step": 5966 + }, + { + "epoch": 2.821276595744681, + "grad_norm": 3.078531265258789, + "learning_rate": 2.7664906693589315e-06, + "loss": 0.4835, + "step": 5967 + }, + { + "epoch": 2.8217494089834516, + "grad_norm": 2.8692002296447754, + "learning_rate": 2.765870388417312e-06, + "loss": 0.4284, + "step": 5968 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 2.8519723415374756, + "learning_rate": 2.765250090921109e-06, + "loss": 0.541, + "step": 5969 + }, + { + "epoch": 2.8226950354609928, + "grad_norm": 3.2037532329559326, + "learning_rate": 2.7646297769089457e-06, + "loss": 0.4276, + "step": 5970 + }, + { + "epoch": 2.8231678486997636, + "grad_norm": 2.8637137413024902, + "learning_rate": 2.7640094464194468e-06, + "loss": 0.4904, + "step": 5971 + }, + { + "epoch": 2.8236406619385344, + "grad_norm": 2.681516408920288, + "learning_rate": 2.7633890994912372e-06, + "loss": 0.4942, + "step": 5972 + }, + { + "epoch": 2.824113475177305, + "grad_norm": 3.0035219192504883, + "learning_rate": 2.7627687361629434e-06, + "loss": 0.4556, + "step": 5973 + }, + { + "epoch": 2.8245862884160755, + "grad_norm": 2.8107759952545166, + "learning_rate": 2.7621483564731923e-06, + "loss": 0.4225, + "step": 5974 + }, + { + "epoch": 2.8250591016548463, + "grad_norm": 2.87276029586792, + "learning_rate": 2.7615279604606126e-06, + "loss": 0.5045, + "step": 5975 + }, + { + "epoch": 2.825531914893617, + "grad_norm": 2.687953233718872, + "learning_rate": 2.760907548163833e-06, + "loss": 0.4018, + "step": 5976 + }, + { + "epoch": 2.8260047281323875, + "grad_norm": 2.587979555130005, + "learning_rate": 2.760287119621486e-06, + "loss": 0.4407, + "step": 5977 + }, + { + "epoch": 2.8264775413711583, + "grad_norm": 2.805602550506592, + "learning_rate": 2.7596666748722e-06, + "loss": 0.4559, + "step": 5978 + }, + { + "epoch": 2.826950354609929, + "grad_norm": 2.320763111114502, + "learning_rate": 2.759046213954609e-06, + "loss": 0.3847, + "step": 5979 + }, + { + "epoch": 2.8274231678487, + "grad_norm": 2.6876401901245117, + "learning_rate": 2.758425736907347e-06, + "loss": 0.4528, + "step": 5980 + }, + { + "epoch": 2.8278959810874706, + "grad_norm": 2.6852915287017822, + "learning_rate": 2.757805243769046e-06, + "loss": 0.395, + "step": 5981 + }, + { + "epoch": 2.828368794326241, + "grad_norm": 2.808326005935669, + "learning_rate": 2.7571847345783447e-06, + "loss": 0.4647, + "step": 5982 + }, + { + "epoch": 2.828841607565012, + "grad_norm": 2.641479015350342, + "learning_rate": 2.7565642093738766e-06, + "loss": 0.3798, + "step": 5983 + }, + { + "epoch": 2.8293144208037826, + "grad_norm": 2.8066110610961914, + "learning_rate": 2.7559436681942803e-06, + "loss": 0.5072, + "step": 5984 + }, + { + "epoch": 2.829787234042553, + "grad_norm": 2.898375988006592, + "learning_rate": 2.7553231110781936e-06, + "loss": 0.5182, + "step": 5985 + }, + { + "epoch": 2.8302600472813237, + "grad_norm": 2.704890489578247, + "learning_rate": 2.7547025380642574e-06, + "loss": 0.3999, + "step": 5986 + }, + { + "epoch": 2.8307328605200945, + "grad_norm": 2.6024270057678223, + "learning_rate": 2.7540819491911106e-06, + "loss": 0.4302, + "step": 5987 + }, + { + "epoch": 2.8312056737588653, + "grad_norm": 2.8006081581115723, + "learning_rate": 2.7534613444973946e-06, + "loss": 0.4492, + 
"step": 5988 + }, + { + "epoch": 2.831678486997636, + "grad_norm": 2.9532058238983154, + "learning_rate": 2.752840724021752e-06, + "loss": 0.4552, + "step": 5989 + }, + { + "epoch": 2.8321513002364065, + "grad_norm": 3.1830217838287354, + "learning_rate": 2.7522200878028265e-06, + "loss": 0.5013, + "step": 5990 + }, + { + "epoch": 2.8326241134751773, + "grad_norm": 2.716176748275757, + "learning_rate": 2.7515994358792624e-06, + "loss": 0.4569, + "step": 5991 + }, + { + "epoch": 2.833096926713948, + "grad_norm": 2.6852715015411377, + "learning_rate": 2.7509787682897044e-06, + "loss": 0.4764, + "step": 5992 + }, + { + "epoch": 2.8335697399527184, + "grad_norm": 2.9383316040039062, + "learning_rate": 2.7503580850727985e-06, + "loss": 0.5205, + "step": 5993 + }, + { + "epoch": 2.8340425531914892, + "grad_norm": 2.703132152557373, + "learning_rate": 2.749737386267193e-06, + "loss": 0.4543, + "step": 5994 + }, + { + "epoch": 2.83451536643026, + "grad_norm": 2.4304885864257812, + "learning_rate": 2.7491166719115354e-06, + "loss": 0.4479, + "step": 5995 + }, + { + "epoch": 2.834988179669031, + "grad_norm": 2.975722551345825, + "learning_rate": 2.748495942044475e-06, + "loss": 0.4074, + "step": 5996 + }, + { + "epoch": 2.8354609929078016, + "grad_norm": 3.440208911895752, + "learning_rate": 2.7478751967046617e-06, + "loss": 0.4497, + "step": 5997 + }, + { + "epoch": 2.835933806146572, + "grad_norm": 2.734673261642456, + "learning_rate": 2.747254435930747e-06, + "loss": 0.437, + "step": 5998 + }, + { + "epoch": 2.8364066193853428, + "grad_norm": 3.1918959617614746, + "learning_rate": 2.7466336597613826e-06, + "loss": 0.4197, + "step": 5999 + }, + { + "epoch": 2.8368794326241136, + "grad_norm": 3.1440329551696777, + "learning_rate": 2.7460128682352216e-06, + "loss": 0.4425, + "step": 6000 + }, + { + "epoch": 2.837352245862884, + "grad_norm": 2.582993507385254, + "learning_rate": 2.7453920613909183e-06, + "loss": 0.4475, + "step": 6001 + }, + { + "epoch": 2.8378250591016547, + "grad_norm": 3.2682149410247803, + "learning_rate": 2.744771239267128e-06, + "loss": 0.4615, + "step": 6002 + }, + { + "epoch": 2.8382978723404255, + "grad_norm": 2.848477840423584, + "learning_rate": 2.7441504019025046e-06, + "loss": 0.4093, + "step": 6003 + }, + { + "epoch": 2.8387706855791963, + "grad_norm": 2.3582282066345215, + "learning_rate": 2.7435295493357067e-06, + "loss": 0.3911, + "step": 6004 + }, + { + "epoch": 2.839243498817967, + "grad_norm": 2.7707207202911377, + "learning_rate": 2.742908681605392e-06, + "loss": 0.4069, + "step": 6005 + }, + { + "epoch": 2.8397163120567375, + "grad_norm": 3.0763752460479736, + "learning_rate": 2.7422877987502183e-06, + "loss": 0.512, + "step": 6006 + }, + { + "epoch": 2.8401891252955083, + "grad_norm": 2.8027124404907227, + "learning_rate": 2.741666900808846e-06, + "loss": 0.4922, + "step": 6007 + }, + { + "epoch": 2.840661938534279, + "grad_norm": 2.487982988357544, + "learning_rate": 2.7410459878199353e-06, + "loss": 0.4368, + "step": 6008 + }, + { + "epoch": 2.8411347517730494, + "grad_norm": 2.8727993965148926, + "learning_rate": 2.7404250598221484e-06, + "loss": 0.4639, + "step": 6009 + }, + { + "epoch": 2.84160756501182, + "grad_norm": 2.5556678771972656, + "learning_rate": 2.739804116854147e-06, + "loss": 0.4217, + "step": 6010 + }, + { + "epoch": 2.842080378250591, + "grad_norm": 2.6306912899017334, + "learning_rate": 2.7391831589545948e-06, + "loss": 0.4816, + "step": 6011 + }, + { + "epoch": 2.842553191489362, + "grad_norm": 2.7340946197509766, + "learning_rate": 
2.7385621861621557e-06, + "loss": 0.4113, + "step": 6012 + }, + { + "epoch": 2.8430260047281326, + "grad_norm": 2.834190607070923, + "learning_rate": 2.737941198515495e-06, + "loss": 0.4691, + "step": 6013 + }, + { + "epoch": 2.843498817966903, + "grad_norm": 2.7139697074890137, + "learning_rate": 2.737320196053281e-06, + "loss": 0.3798, + "step": 6014 + }, + { + "epoch": 2.8439716312056738, + "grad_norm": 2.7934985160827637, + "learning_rate": 2.736699178814177e-06, + "loss": 0.446, + "step": 6015 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 2.6941518783569336, + "learning_rate": 2.7360781468368534e-06, + "loss": 0.4787, + "step": 6016 + }, + { + "epoch": 2.844917257683215, + "grad_norm": 3.1530468463897705, + "learning_rate": 2.7354571001599792e-06, + "loss": 0.474, + "step": 6017 + }, + { + "epoch": 2.8453900709219857, + "grad_norm": 2.613875389099121, + "learning_rate": 2.7348360388222243e-06, + "loss": 0.4297, + "step": 6018 + }, + { + "epoch": 2.8458628841607565, + "grad_norm": 2.5481486320495605, + "learning_rate": 2.7342149628622587e-06, + "loss": 0.3762, + "step": 6019 + }, + { + "epoch": 2.8463356973995273, + "grad_norm": 2.6425609588623047, + "learning_rate": 2.7335938723187544e-06, + "loss": 0.4077, + "step": 6020 + }, + { + "epoch": 2.846808510638298, + "grad_norm": 2.6281731128692627, + "learning_rate": 2.7329727672303836e-06, + "loss": 0.466, + "step": 6021 + }, + { + "epoch": 2.8472813238770684, + "grad_norm": 2.8862180709838867, + "learning_rate": 2.7323516476358197e-06, + "loss": 0.4191, + "step": 6022 + }, + { + "epoch": 2.8477541371158392, + "grad_norm": 2.907731533050537, + "learning_rate": 2.7317305135737383e-06, + "loss": 0.4867, + "step": 6023 + }, + { + "epoch": 2.84822695035461, + "grad_norm": 2.825593948364258, + "learning_rate": 2.731109365082814e-06, + "loss": 0.4888, + "step": 6024 + }, + { + "epoch": 2.8486997635933804, + "grad_norm": 2.478163003921509, + "learning_rate": 2.730488202201722e-06, + "loss": 0.4714, + "step": 6025 + }, + { + "epoch": 2.849172576832151, + "grad_norm": 2.928899049758911, + "learning_rate": 2.7298670249691418e-06, + "loss": 0.4671, + "step": 6026 + }, + { + "epoch": 2.849645390070922, + "grad_norm": 2.778256893157959, + "learning_rate": 2.7292458334237488e-06, + "loss": 0.429, + "step": 6027 + }, + { + "epoch": 2.850118203309693, + "grad_norm": 3.0689055919647217, + "learning_rate": 2.7286246276042234e-06, + "loss": 0.4727, + "step": 6028 + }, + { + "epoch": 2.8505910165484636, + "grad_norm": 2.582066774368286, + "learning_rate": 2.7280034075492447e-06, + "loss": 0.4025, + "step": 6029 + }, + { + "epoch": 2.851063829787234, + "grad_norm": 3.6679015159606934, + "learning_rate": 2.7273821732974936e-06, + "loss": 0.4856, + "step": 6030 + }, + { + "epoch": 2.8515366430260047, + "grad_norm": 2.7222588062286377, + "learning_rate": 2.7267609248876516e-06, + "loss": 0.4255, + "step": 6031 + }, + { + "epoch": 2.8520094562647755, + "grad_norm": 2.455038547515869, + "learning_rate": 2.726139662358401e-06, + "loss": 0.4234, + "step": 6032 + }, + { + "epoch": 2.852482269503546, + "grad_norm": 2.8277318477630615, + "learning_rate": 2.7255183857484253e-06, + "loss": 0.4146, + "step": 6033 + }, + { + "epoch": 2.8529550827423167, + "grad_norm": 2.523615837097168, + "learning_rate": 2.724897095096409e-06, + "loss": 0.4227, + "step": 6034 + }, + { + "epoch": 2.8534278959810875, + "grad_norm": 3.353646755218506, + "learning_rate": 2.724275790441036e-06, + "loss": 0.5041, + "step": 6035 + }, + { + "epoch": 2.8539007092198583, + "grad_norm": 
2.753981828689575, + "learning_rate": 2.7236544718209934e-06, + "loss": 0.4646, + "step": 6036 + }, + { + "epoch": 2.854373522458629, + "grad_norm": 2.954744577407837, + "learning_rate": 2.723033139274967e-06, + "loss": 0.5182, + "step": 6037 + }, + { + "epoch": 2.8548463356973994, + "grad_norm": 2.4814131259918213, + "learning_rate": 2.7224117928416462e-06, + "loss": 0.4626, + "step": 6038 + }, + { + "epoch": 2.8553191489361702, + "grad_norm": 2.7414886951446533, + "learning_rate": 2.721790432559717e-06, + "loss": 0.4111, + "step": 6039 + }, + { + "epoch": 2.855791962174941, + "grad_norm": 2.8743896484375, + "learning_rate": 2.7211690584678706e-06, + "loss": 0.4986, + "step": 6040 + }, + { + "epoch": 2.8562647754137114, + "grad_norm": 3.0691921710968018, + "learning_rate": 2.720547670604797e-06, + "loss": 0.4743, + "step": 6041 + }, + { + "epoch": 2.856737588652482, + "grad_norm": 2.7273411750793457, + "learning_rate": 2.7199262690091872e-06, + "loss": 0.4403, + "step": 6042 + }, + { + "epoch": 2.857210401891253, + "grad_norm": 2.8022944927215576, + "learning_rate": 2.7193048537197325e-06, + "loss": 0.4413, + "step": 6043 + }, + { + "epoch": 2.8576832151300238, + "grad_norm": 2.4883248805999756, + "learning_rate": 2.718683424775126e-06, + "loss": 0.4485, + "step": 6044 + }, + { + "epoch": 2.8581560283687946, + "grad_norm": 2.457249879837036, + "learning_rate": 2.718061982214062e-06, + "loss": 0.4167, + "step": 6045 + }, + { + "epoch": 2.858628841607565, + "grad_norm": 2.7210328578948975, + "learning_rate": 2.717440526075234e-06, + "loss": 0.4419, + "step": 6046 + }, + { + "epoch": 2.8591016548463357, + "grad_norm": 2.684483766555786, + "learning_rate": 2.7168190563973386e-06, + "loss": 0.4449, + "step": 6047 + }, + { + "epoch": 2.8595744680851065, + "grad_norm": 2.5305230617523193, + "learning_rate": 2.7161975732190706e-06, + "loss": 0.3829, + "step": 6048 + }, + { + "epoch": 2.860047281323877, + "grad_norm": 3.0284602642059326, + "learning_rate": 2.7155760765791278e-06, + "loss": 0.5164, + "step": 6049 + }, + { + "epoch": 2.8605200945626477, + "grad_norm": 3.154599189758301, + "learning_rate": 2.7149545665162085e-06, + "loss": 0.527, + "step": 6050 + }, + { + "epoch": 2.8609929078014185, + "grad_norm": 2.6798126697540283, + "learning_rate": 2.7143330430690113e-06, + "loss": 0.4379, + "step": 6051 + }, + { + "epoch": 2.8614657210401893, + "grad_norm": 2.9531302452087402, + "learning_rate": 2.7137115062762344e-06, + "loss": 0.4549, + "step": 6052 + }, + { + "epoch": 2.86193853427896, + "grad_norm": 2.779531240463257, + "learning_rate": 2.7130899561765787e-06, + "loss": 0.4037, + "step": 6053 + }, + { + "epoch": 2.8624113475177304, + "grad_norm": 2.786763906478882, + "learning_rate": 2.7124683928087466e-06, + "loss": 0.3986, + "step": 6054 + }, + { + "epoch": 2.862884160756501, + "grad_norm": 2.430415630340576, + "learning_rate": 2.7118468162114385e-06, + "loss": 0.4402, + "step": 6055 + }, + { + "epoch": 2.863356973995272, + "grad_norm": 3.027268409729004, + "learning_rate": 2.7112252264233596e-06, + "loss": 0.4737, + "step": 6056 + }, + { + "epoch": 2.8638297872340424, + "grad_norm": 3.024935483932495, + "learning_rate": 2.710603623483211e-06, + "loss": 0.3997, + "step": 6057 + }, + { + "epoch": 2.864302600472813, + "grad_norm": 2.8862195014953613, + "learning_rate": 2.7099820074296985e-06, + "loss": 0.4896, + "step": 6058 + }, + { + "epoch": 2.864775413711584, + "grad_norm": 2.595579147338867, + "learning_rate": 2.709360378301527e-06, + "loss": 0.4387, + "step": 6059 + }, + { + "epoch": 
2.8652482269503547, + "grad_norm": 2.8046188354492188, + "learning_rate": 2.708738736137403e-06, + "loss": 0.4726, + "step": 6060 + }, + { + "epoch": 2.8657210401891255, + "grad_norm": 3.040304660797119, + "learning_rate": 2.708117080976033e-06, + "loss": 0.4642, + "step": 6061 + }, + { + "epoch": 2.866193853427896, + "grad_norm": 2.618128538131714, + "learning_rate": 2.7074954128561248e-06, + "loss": 0.3171, + "step": 6062 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 2.7966055870056152, + "learning_rate": 2.706873731816387e-06, + "loss": 0.4893, + "step": 6063 + }, + { + "epoch": 2.8671394799054375, + "grad_norm": 2.9198038578033447, + "learning_rate": 2.706252037895529e-06, + "loss": 0.4428, + "step": 6064 + }, + { + "epoch": 2.867612293144208, + "grad_norm": 2.417705774307251, + "learning_rate": 2.7056303311322617e-06, + "loss": 0.3704, + "step": 6065 + }, + { + "epoch": 2.8680851063829786, + "grad_norm": 3.143918752670288, + "learning_rate": 2.7050086115652953e-06, + "loss": 0.5247, + "step": 6066 + }, + { + "epoch": 2.8685579196217494, + "grad_norm": 2.620781183242798, + "learning_rate": 2.704386879233341e-06, + "loss": 0.4131, + "step": 6067 + }, + { + "epoch": 2.8690307328605202, + "grad_norm": 2.6929845809936523, + "learning_rate": 2.703765134175112e-06, + "loss": 0.4833, + "step": 6068 + }, + { + "epoch": 2.869503546099291, + "grad_norm": 2.695920944213867, + "learning_rate": 2.7031433764293214e-06, + "loss": 0.435, + "step": 6069 + }, + { + "epoch": 2.8699763593380614, + "grad_norm": 2.6184475421905518, + "learning_rate": 2.702521606034684e-06, + "loss": 0.3898, + "step": 6070 + }, + { + "epoch": 2.870449172576832, + "grad_norm": 3.130624532699585, + "learning_rate": 2.7018998230299136e-06, + "loss": 0.4934, + "step": 6071 + }, + { + "epoch": 2.870921985815603, + "grad_norm": 2.947936534881592, + "learning_rate": 2.701278027453727e-06, + "loss": 0.4167, + "step": 6072 + }, + { + "epoch": 2.8713947990543733, + "grad_norm": 2.389263391494751, + "learning_rate": 2.7006562193448406e-06, + "loss": 0.3854, + "step": 6073 + }, + { + "epoch": 2.871867612293144, + "grad_norm": 2.9040684700012207, + "learning_rate": 2.700034398741971e-06, + "loss": 0.4656, + "step": 6074 + }, + { + "epoch": 2.872340425531915, + "grad_norm": 2.8671910762786865, + "learning_rate": 2.6994125656838365e-06, + "loss": 0.4642, + "step": 6075 + }, + { + "epoch": 2.8728132387706857, + "grad_norm": 2.6957180500030518, + "learning_rate": 2.698790720209156e-06, + "loss": 0.4894, + "step": 6076 + }, + { + "epoch": 2.8732860520094565, + "grad_norm": 2.748342514038086, + "learning_rate": 2.698168862356648e-06, + "loss": 0.4552, + "step": 6077 + }, + { + "epoch": 2.873758865248227, + "grad_norm": 2.7459912300109863, + "learning_rate": 2.6975469921650344e-06, + "loss": 0.4244, + "step": 6078 + }, + { + "epoch": 2.8742316784869977, + "grad_norm": 2.515650987625122, + "learning_rate": 2.6969251096730366e-06, + "loss": 0.4178, + "step": 6079 + }, + { + "epoch": 2.8747044917257685, + "grad_norm": 2.747373342514038, + "learning_rate": 2.696303214919375e-06, + "loss": 0.4623, + "step": 6080 + }, + { + "epoch": 2.875177304964539, + "grad_norm": 2.72092604637146, + "learning_rate": 2.695681307942773e-06, + "loss": 0.4227, + "step": 6081 + }, + { + "epoch": 2.8756501182033096, + "grad_norm": 2.6925108432769775, + "learning_rate": 2.695059388781955e-06, + "loss": 0.3807, + "step": 6082 + }, + { + "epoch": 2.8761229314420804, + "grad_norm": 2.673546314239502, + "learning_rate": 2.6944374574756427e-06, + "loss": 0.424, + 
"step": 6083 + }, + { + "epoch": 2.876595744680851, + "grad_norm": 2.7018187046051025, + "learning_rate": 2.6938155140625636e-06, + "loss": 0.4367, + "step": 6084 + }, + { + "epoch": 2.877068557919622, + "grad_norm": 2.9420957565307617, + "learning_rate": 2.6931935585814416e-06, + "loss": 0.4223, + "step": 6085 + }, + { + "epoch": 2.8775413711583924, + "grad_norm": 2.6523385047912598, + "learning_rate": 2.6925715910710036e-06, + "loss": 0.4074, + "step": 6086 + }, + { + "epoch": 2.878014184397163, + "grad_norm": 2.6104063987731934, + "learning_rate": 2.691949611569978e-06, + "loss": 0.423, + "step": 6087 + }, + { + "epoch": 2.878486997635934, + "grad_norm": 2.6463685035705566, + "learning_rate": 2.691327620117091e-06, + "loss": 0.4354, + "step": 6088 + }, + { + "epoch": 2.8789598108747043, + "grad_norm": 2.5863583087921143, + "learning_rate": 2.6907056167510725e-06, + "loss": 0.4177, + "step": 6089 + }, + { + "epoch": 2.879432624113475, + "grad_norm": 2.6946942806243896, + "learning_rate": 2.690083601510651e-06, + "loss": 0.4176, + "step": 6090 + }, + { + "epoch": 2.879905437352246, + "grad_norm": 3.0649454593658447, + "learning_rate": 2.6894615744345575e-06, + "loss": 0.4827, + "step": 6091 + }, + { + "epoch": 2.8803782505910167, + "grad_norm": 2.6454906463623047, + "learning_rate": 2.6888395355615226e-06, + "loss": 0.4757, + "step": 6092 + }, + { + "epoch": 2.8808510638297875, + "grad_norm": 3.251805067062378, + "learning_rate": 2.688217484930278e-06, + "loss": 0.5651, + "step": 6093 + }, + { + "epoch": 2.881323877068558, + "grad_norm": 2.543999433517456, + "learning_rate": 2.687595422579555e-06, + "loss": 0.4196, + "step": 6094 + }, + { + "epoch": 2.8817966903073287, + "grad_norm": 3.1502909660339355, + "learning_rate": 2.686973348548088e-06, + "loss": 0.4376, + "step": 6095 + }, + { + "epoch": 2.8822695035460995, + "grad_norm": 2.7800376415252686, + "learning_rate": 2.686351262874611e-06, + "loss": 0.444, + "step": 6096 + }, + { + "epoch": 2.88274231678487, + "grad_norm": 3.1529603004455566, + "learning_rate": 2.685729165597858e-06, + "loss": 0.5137, + "step": 6097 + }, + { + "epoch": 2.8832151300236406, + "grad_norm": 2.6079602241516113, + "learning_rate": 2.685107056756564e-06, + "loss": 0.4213, + "step": 6098 + }, + { + "epoch": 2.8836879432624114, + "grad_norm": 2.8969249725341797, + "learning_rate": 2.6844849363894648e-06, + "loss": 0.4679, + "step": 6099 + }, + { + "epoch": 2.884160756501182, + "grad_norm": 2.5882437229156494, + "learning_rate": 2.6838628045352977e-06, + "loss": 0.3891, + "step": 6100 + }, + { + "epoch": 2.8846335697399526, + "grad_norm": 2.9458062648773193, + "learning_rate": 2.6832406612328007e-06, + "loss": 0.4802, + "step": 6101 + }, + { + "epoch": 2.8851063829787233, + "grad_norm": 2.8463058471679688, + "learning_rate": 2.6826185065207105e-06, + "loss": 0.4332, + "step": 6102 + }, + { + "epoch": 2.885579196217494, + "grad_norm": 2.8799285888671875, + "learning_rate": 2.6819963404377667e-06, + "loss": 0.4474, + "step": 6103 + }, + { + "epoch": 2.8860520094562645, + "grad_norm": 2.846860408782959, + "learning_rate": 2.681374163022709e-06, + "loss": 0.4317, + "step": 6104 + }, + { + "epoch": 2.8865248226950353, + "grad_norm": 2.7918877601623535, + "learning_rate": 2.6807519743142775e-06, + "loss": 0.4243, + "step": 6105 + }, + { + "epoch": 2.886997635933806, + "grad_norm": 2.9351487159729004, + "learning_rate": 2.6801297743512127e-06, + "loss": 0.5253, + "step": 6106 + }, + { + "epoch": 2.887470449172577, + "grad_norm": 2.9422426223754883, + "learning_rate": 
2.6795075631722576e-06, + "loss": 0.4887, + "step": 6107 + }, + { + "epoch": 2.8879432624113477, + "grad_norm": 2.6837220191955566, + "learning_rate": 2.678885340816153e-06, + "loss": 0.4761, + "step": 6108 + }, + { + "epoch": 2.888416075650118, + "grad_norm": 2.6800777912139893, + "learning_rate": 2.6782631073216425e-06, + "loss": 0.4248, + "step": 6109 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 2.9654436111450195, + "learning_rate": 2.6776408627274702e-06, + "loss": 0.487, + "step": 6110 + }, + { + "epoch": 2.8893617021276596, + "grad_norm": 2.7725181579589844, + "learning_rate": 2.6770186070723804e-06, + "loss": 0.4166, + "step": 6111 + }, + { + "epoch": 2.88983451536643, + "grad_norm": 2.6547815799713135, + "learning_rate": 2.676396340395118e-06, + "loss": 0.4039, + "step": 6112 + }, + { + "epoch": 2.890307328605201, + "grad_norm": 2.690997838973999, + "learning_rate": 2.6757740627344292e-06, + "loss": 0.4639, + "step": 6113 + }, + { + "epoch": 2.8907801418439716, + "grad_norm": 2.4693069458007812, + "learning_rate": 2.67515177412906e-06, + "loss": 0.4052, + "step": 6114 + }, + { + "epoch": 2.8912529550827424, + "grad_norm": 2.7137033939361572, + "learning_rate": 2.6745294746177576e-06, + "loss": 0.4442, + "step": 6115 + }, + { + "epoch": 2.891725768321513, + "grad_norm": 3.7417004108428955, + "learning_rate": 2.6739071642392712e-06, + "loss": 0.4809, + "step": 6116 + }, + { + "epoch": 2.8921985815602835, + "grad_norm": 2.707094669342041, + "learning_rate": 2.673284843032347e-06, + "loss": 0.411, + "step": 6117 + }, + { + "epoch": 2.8926713947990543, + "grad_norm": 2.7864158153533936, + "learning_rate": 2.672662511035736e-06, + "loss": 0.4939, + "step": 6118 + }, + { + "epoch": 2.893144208037825, + "grad_norm": 2.8753504753112793, + "learning_rate": 2.672040168288187e-06, + "loss": 0.4396, + "step": 6119 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 2.7581071853637695, + "learning_rate": 2.6714178148284516e-06, + "loss": 0.427, + "step": 6120 + }, + { + "epoch": 2.8940898345153663, + "grad_norm": 2.9754791259765625, + "learning_rate": 2.6707954506952803e-06, + "loss": 0.4255, + "step": 6121 + }, + { + "epoch": 2.894562647754137, + "grad_norm": 2.876939296722412, + "learning_rate": 2.670173075927426e-06, + "loss": 0.4699, + "step": 6122 + }, + { + "epoch": 2.895035460992908, + "grad_norm": 2.4875400066375732, + "learning_rate": 2.6695506905636397e-06, + "loss": 0.3568, + "step": 6123 + }, + { + "epoch": 2.8955082742316787, + "grad_norm": 2.703606128692627, + "learning_rate": 2.668928294642675e-06, + "loss": 0.3646, + "step": 6124 + }, + { + "epoch": 2.895981087470449, + "grad_norm": 2.8618338108062744, + "learning_rate": 2.6683058882032868e-06, + "loss": 0.378, + "step": 6125 + }, + { + "epoch": 2.89645390070922, + "grad_norm": 2.9756760597229004, + "learning_rate": 2.667683471284229e-06, + "loss": 0.4348, + "step": 6126 + }, + { + "epoch": 2.8969267139479906, + "grad_norm": 2.7861104011535645, + "learning_rate": 2.667061043924256e-06, + "loss": 0.4435, + "step": 6127 + }, + { + "epoch": 2.897399527186761, + "grad_norm": 2.7932238578796387, + "learning_rate": 2.6664386061621243e-06, + "loss": 0.4824, + "step": 6128 + }, + { + "epoch": 2.8978723404255318, + "grad_norm": 2.85483455657959, + "learning_rate": 2.6658161580365917e-06, + "loss": 0.4925, + "step": 6129 + }, + { + "epoch": 2.8983451536643026, + "grad_norm": 2.4242141246795654, + "learning_rate": 2.6651936995864136e-06, + "loss": 0.3466, + "step": 6130 + }, + { + "epoch": 2.8988179669030734, + "grad_norm": 
3.385214328765869, + "learning_rate": 2.6645712308503473e-06, + "loss": 0.4751, + "step": 6131 + }, + { + "epoch": 2.899290780141844, + "grad_norm": 2.7109622955322266, + "learning_rate": 2.6639487518671525e-06, + "loss": 0.4469, + "step": 6132 + }, + { + "epoch": 2.8997635933806145, + "grad_norm": 2.6537814140319824, + "learning_rate": 2.6633262626755877e-06, + "loss": 0.4678, + "step": 6133 + }, + { + "epoch": 2.9002364066193853, + "grad_norm": 2.5992231369018555, + "learning_rate": 2.6627037633144124e-06, + "loss": 0.4206, + "step": 6134 + }, + { + "epoch": 2.900709219858156, + "grad_norm": 2.988940954208374, + "learning_rate": 2.6620812538223885e-06, + "loss": 0.4554, + "step": 6135 + }, + { + "epoch": 2.9011820330969265, + "grad_norm": 3.0678138732910156, + "learning_rate": 2.661458734238274e-06, + "loss": 0.4671, + "step": 6136 + }, + { + "epoch": 2.9016548463356973, + "grad_norm": 2.6902482509613037, + "learning_rate": 2.6608362046008335e-06, + "loss": 0.372, + "step": 6137 + }, + { + "epoch": 2.902127659574468, + "grad_norm": 3.031597375869751, + "learning_rate": 2.660213664948827e-06, + "loss": 0.4424, + "step": 6138 + }, + { + "epoch": 2.902600472813239, + "grad_norm": 2.8376755714416504, + "learning_rate": 2.6595911153210187e-06, + "loss": 0.4599, + "step": 6139 + }, + { + "epoch": 2.9030732860520096, + "grad_norm": 3.3164854049682617, + "learning_rate": 2.6589685557561707e-06, + "loss": 0.3897, + "step": 6140 + }, + { + "epoch": 2.90354609929078, + "grad_norm": 2.9535014629364014, + "learning_rate": 2.658345986293048e-06, + "loss": 0.4957, + "step": 6141 + }, + { + "epoch": 2.904018912529551, + "grad_norm": 2.821276903152466, + "learning_rate": 2.657723406970415e-06, + "loss": 0.4453, + "step": 6142 + }, + { + "epoch": 2.9044917257683216, + "grad_norm": 2.7314651012420654, + "learning_rate": 2.657100817827037e-06, + "loss": 0.4406, + "step": 6143 + }, + { + "epoch": 2.904964539007092, + "grad_norm": 2.9509520530700684, + "learning_rate": 2.6564782189016804e-06, + "loss": 0.4629, + "step": 6144 + }, + { + "epoch": 2.9054373522458627, + "grad_norm": 2.6234960556030273, + "learning_rate": 2.655855610233111e-06, + "loss": 0.4306, + "step": 6145 + }, + { + "epoch": 2.9059101654846335, + "grad_norm": 2.7209644317626953, + "learning_rate": 2.6552329918600962e-06, + "loss": 0.3643, + "step": 6146 + }, + { + "epoch": 2.9063829787234043, + "grad_norm": 2.9797747135162354, + "learning_rate": 2.654610363821404e-06, + "loss": 0.4616, + "step": 6147 + }, + { + "epoch": 2.906855791962175, + "grad_norm": 2.8179666996002197, + "learning_rate": 2.6539877261558016e-06, + "loss": 0.4526, + "step": 6148 + }, + { + "epoch": 2.9073286052009455, + "grad_norm": 2.7492244243621826, + "learning_rate": 2.653365078902059e-06, + "loss": 0.4862, + "step": 6149 + }, + { + "epoch": 2.9078014184397163, + "grad_norm": 3.0262451171875, + "learning_rate": 2.6527424220989457e-06, + "loss": 0.3728, + "step": 6150 + }, + { + "epoch": 2.908274231678487, + "grad_norm": 2.8092808723449707, + "learning_rate": 2.6521197557852315e-06, + "loss": 0.4668, + "step": 6151 + }, + { + "epoch": 2.9087470449172574, + "grad_norm": 2.915719985961914, + "learning_rate": 2.651497079999687e-06, + "loss": 0.5124, + "step": 6152 + }, + { + "epoch": 2.9092198581560282, + "grad_norm": 2.9794204235076904, + "learning_rate": 2.6508743947810834e-06, + "loss": 0.5207, + "step": 6153 + }, + { + "epoch": 2.909692671394799, + "grad_norm": 2.882453680038452, + "learning_rate": 2.650251700168193e-06, + "loss": 0.4382, + "step": 6154 + }, + { + 
"epoch": 2.91016548463357, + "grad_norm": 3.183680534362793, + "learning_rate": 2.6496289961997886e-06, + "loss": 0.5134, + "step": 6155 + }, + { + "epoch": 2.9106382978723406, + "grad_norm": 2.9374759197235107, + "learning_rate": 2.649006282914642e-06, + "loss": 0.4748, + "step": 6156 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 2.8096041679382324, + "learning_rate": 2.648383560351527e-06, + "loss": 0.4672, + "step": 6157 + }, + { + "epoch": 2.911583924349882, + "grad_norm": 2.8799238204956055, + "learning_rate": 2.6477608285492196e-06, + "loss": 0.4679, + "step": 6158 + }, + { + "epoch": 2.9120567375886526, + "grad_norm": 2.689310073852539, + "learning_rate": 2.6471380875464923e-06, + "loss": 0.4069, + "step": 6159 + }, + { + "epoch": 2.912529550827423, + "grad_norm": 2.909323215484619, + "learning_rate": 2.6465153373821216e-06, + "loss": 0.4463, + "step": 6160 + }, + { + "epoch": 2.9130023640661937, + "grad_norm": 2.797724962234497, + "learning_rate": 2.6458925780948845e-06, + "loss": 0.4269, + "step": 6161 + }, + { + "epoch": 2.9134751773049645, + "grad_norm": 2.7533204555511475, + "learning_rate": 2.645269809723556e-06, + "loss": 0.453, + "step": 6162 + }, + { + "epoch": 2.9139479905437353, + "grad_norm": 2.6615989208221436, + "learning_rate": 2.6446470323069122e-06, + "loss": 0.3921, + "step": 6163 + }, + { + "epoch": 2.914420803782506, + "grad_norm": 3.0493314266204834, + "learning_rate": 2.644024245883733e-06, + "loss": 0.4779, + "step": 6164 + }, + { + "epoch": 2.9148936170212765, + "grad_norm": 2.649845600128174, + "learning_rate": 2.643401450492795e-06, + "loss": 0.454, + "step": 6165 + }, + { + "epoch": 2.9153664302600473, + "grad_norm": 2.7931838035583496, + "learning_rate": 2.642778646172877e-06, + "loss": 0.504, + "step": 6166 + }, + { + "epoch": 2.915839243498818, + "grad_norm": 2.9518136978149414, + "learning_rate": 2.64215583296276e-06, + "loss": 0.4767, + "step": 6167 + }, + { + "epoch": 2.9163120567375884, + "grad_norm": 2.6047427654266357, + "learning_rate": 2.6415330109012216e-06, + "loss": 0.4316, + "step": 6168 + }, + { + "epoch": 2.916784869976359, + "grad_norm": 2.7732112407684326, + "learning_rate": 2.640910180027044e-06, + "loss": 0.4213, + "step": 6169 + }, + { + "epoch": 2.91725768321513, + "grad_norm": 3.1157236099243164, + "learning_rate": 2.6402873403790068e-06, + "loss": 0.4559, + "step": 6170 + }, + { + "epoch": 2.917730496453901, + "grad_norm": 2.68424129486084, + "learning_rate": 2.6396644919958917e-06, + "loss": 0.3456, + "step": 6171 + }, + { + "epoch": 2.9182033096926716, + "grad_norm": 3.1093270778656006, + "learning_rate": 2.639041634916482e-06, + "loss": 0.4172, + "step": 6172 + }, + { + "epoch": 2.918676122931442, + "grad_norm": 2.9844655990600586, + "learning_rate": 2.6384187691795594e-06, + "loss": 0.4844, + "step": 6173 + }, + { + "epoch": 2.9191489361702128, + "grad_norm": 2.907151222229004, + "learning_rate": 2.637795894823906e-06, + "loss": 0.5126, + "step": 6174 + }, + { + "epoch": 2.9196217494089836, + "grad_norm": 2.804105520248413, + "learning_rate": 2.637173011888307e-06, + "loss": 0.3919, + "step": 6175 + }, + { + "epoch": 2.920094562647754, + "grad_norm": 2.8809266090393066, + "learning_rate": 2.636550120411547e-06, + "loss": 0.4468, + "step": 6176 + }, + { + "epoch": 2.9205673758865247, + "grad_norm": 2.686290979385376, + "learning_rate": 2.6359272204324087e-06, + "loss": 0.4352, + "step": 6177 + }, + { + "epoch": 2.9210401891252955, + "grad_norm": 2.448101758956909, + "learning_rate": 2.635304311989678e-06, + "loss": 
0.4218, + "step": 6178 + }, + { + "epoch": 2.9215130023640663, + "grad_norm": 2.81024169921875, + "learning_rate": 2.6346813951221416e-06, + "loss": 0.5177, + "step": 6179 + }, + { + "epoch": 2.921985815602837, + "grad_norm": 2.7590086460113525, + "learning_rate": 2.6340584698685856e-06, + "loss": 0.3897, + "step": 6180 + }, + { + "epoch": 2.9224586288416075, + "grad_norm": 3.1226227283477783, + "learning_rate": 2.6334355362677965e-06, + "loss": 0.4595, + "step": 6181 + }, + { + "epoch": 2.9229314420803783, + "grad_norm": 2.673828125, + "learning_rate": 2.6328125943585607e-06, + "loss": 0.4932, + "step": 6182 + }, + { + "epoch": 2.923404255319149, + "grad_norm": 2.8297293186187744, + "learning_rate": 2.632189644179668e-06, + "loss": 0.3819, + "step": 6183 + }, + { + "epoch": 2.9238770685579194, + "grad_norm": 2.9661548137664795, + "learning_rate": 2.6315666857699056e-06, + "loss": 0.4419, + "step": 6184 + }, + { + "epoch": 2.92434988179669, + "grad_norm": 2.9745798110961914, + "learning_rate": 2.6309437191680627e-06, + "loss": 0.4423, + "step": 6185 + }, + { + "epoch": 2.924822695035461, + "grad_norm": 2.8351712226867676, + "learning_rate": 2.6303207444129285e-06, + "loss": 0.5043, + "step": 6186 + }, + { + "epoch": 2.925295508274232, + "grad_norm": 2.6442384719848633, + "learning_rate": 2.6296977615432927e-06, + "loss": 0.4431, + "step": 6187 + }, + { + "epoch": 2.9257683215130026, + "grad_norm": 2.4128029346466064, + "learning_rate": 2.6290747705979457e-06, + "loss": 0.3603, + "step": 6188 + }, + { + "epoch": 2.926241134751773, + "grad_norm": 2.730424642562866, + "learning_rate": 2.6284517716156786e-06, + "loss": 0.439, + "step": 6189 + }, + { + "epoch": 2.9267139479905437, + "grad_norm": 2.6215405464172363, + "learning_rate": 2.627828764635284e-06, + "loss": 0.4117, + "step": 6190 + }, + { + "epoch": 2.9271867612293145, + "grad_norm": 2.56585955619812, + "learning_rate": 2.627205749695552e-06, + "loss": 0.4404, + "step": 6191 + }, + { + "epoch": 2.927659574468085, + "grad_norm": 2.9587886333465576, + "learning_rate": 2.6265827268352763e-06, + "loss": 0.4295, + "step": 6192 + }, + { + "epoch": 2.9281323877068557, + "grad_norm": 2.6611828804016113, + "learning_rate": 2.625959696093249e-06, + "loss": 0.4441, + "step": 6193 + }, + { + "epoch": 2.9286052009456265, + "grad_norm": 2.4391369819641113, + "learning_rate": 2.6253366575082634e-06, + "loss": 0.4447, + "step": 6194 + }, + { + "epoch": 2.9290780141843973, + "grad_norm": 2.710763454437256, + "learning_rate": 2.6247136111191144e-06, + "loss": 0.4662, + "step": 6195 + }, + { + "epoch": 2.929550827423168, + "grad_norm": 2.770697593688965, + "learning_rate": 2.6240905569645952e-06, + "loss": 0.4263, + "step": 6196 + }, + { + "epoch": 2.9300236406619384, + "grad_norm": 2.5885732173919678, + "learning_rate": 2.623467495083501e-06, + "loss": 0.4303, + "step": 6197 + }, + { + "epoch": 2.9304964539007092, + "grad_norm": 2.5716748237609863, + "learning_rate": 2.6228444255146274e-06, + "loss": 0.3714, + "step": 6198 + }, + { + "epoch": 2.93096926713948, + "grad_norm": 3.0437910556793213, + "learning_rate": 2.6222213482967703e-06, + "loss": 0.4077, + "step": 6199 + }, + { + "epoch": 2.9314420803782504, + "grad_norm": 2.7861344814300537, + "learning_rate": 2.6215982634687253e-06, + "loss": 0.4157, + "step": 6200 + }, + { + "epoch": 2.931914893617021, + "grad_norm": 2.5265355110168457, + "learning_rate": 2.6209751710692905e-06, + "loss": 0.4586, + "step": 6201 + }, + { + "epoch": 2.932387706855792, + "grad_norm": 2.940112590789795, + 
"learning_rate": 2.6203520711372615e-06, + "loss": 0.4208, + "step": 6202 + }, + { + "epoch": 2.9328605200945628, + "grad_norm": 2.7124581336975098, + "learning_rate": 2.6197289637114363e-06, + "loss": 0.4173, + "step": 6203 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 2.818523406982422, + "learning_rate": 2.619105848830615e-06, + "loss": 0.4349, + "step": 6204 + }, + { + "epoch": 2.933806146572104, + "grad_norm": 2.7630393505096436, + "learning_rate": 2.6184827265335937e-06, + "loss": 0.5078, + "step": 6205 + }, + { + "epoch": 2.9342789598108747, + "grad_norm": 3.0554699897766113, + "learning_rate": 2.6178595968591726e-06, + "loss": 0.4712, + "step": 6206 + }, + { + "epoch": 2.9347517730496455, + "grad_norm": 2.721992254257202, + "learning_rate": 2.6172364598461507e-06, + "loss": 0.4847, + "step": 6207 + }, + { + "epoch": 2.935224586288416, + "grad_norm": 2.809663772583008, + "learning_rate": 2.6166133155333303e-06, + "loss": 0.4447, + "step": 6208 + }, + { + "epoch": 2.9356973995271867, + "grad_norm": 2.568394660949707, + "learning_rate": 2.6159901639595088e-06, + "loss": 0.4543, + "step": 6209 + }, + { + "epoch": 2.9361702127659575, + "grad_norm": 3.3670637607574463, + "learning_rate": 2.6153670051634884e-06, + "loss": 0.4901, + "step": 6210 + }, + { + "epoch": 2.9366430260047283, + "grad_norm": 3.082508087158203, + "learning_rate": 2.614743839184071e-06, + "loss": 0.4862, + "step": 6211 + }, + { + "epoch": 2.937115839243499, + "grad_norm": 2.692139148712158, + "learning_rate": 2.6141206660600566e-06, + "loss": 0.5199, + "step": 6212 + }, + { + "epoch": 2.9375886524822694, + "grad_norm": 3.231433391571045, + "learning_rate": 2.6134974858302504e-06, + "loss": 0.464, + "step": 6213 + }, + { + "epoch": 2.93806146572104, + "grad_norm": 3.224238157272339, + "learning_rate": 2.612874298533452e-06, + "loss": 0.4507, + "step": 6214 + }, + { + "epoch": 2.938534278959811, + "grad_norm": 2.812755584716797, + "learning_rate": 2.6122511042084663e-06, + "loss": 0.4527, + "step": 6215 + }, + { + "epoch": 2.9390070921985814, + "grad_norm": 2.837811231613159, + "learning_rate": 2.611627902894098e-06, + "loss": 0.4782, + "step": 6216 + }, + { + "epoch": 2.939479905437352, + "grad_norm": 3.093817710876465, + "learning_rate": 2.6110046946291476e-06, + "loss": 0.4933, + "step": 6217 + }, + { + "epoch": 2.939952718676123, + "grad_norm": 2.950119733810425, + "learning_rate": 2.6103814794524235e-06, + "loss": 0.4884, + "step": 6218 + }, + { + "epoch": 2.9404255319148938, + "grad_norm": 2.469681978225708, + "learning_rate": 2.6097582574027274e-06, + "loss": 0.4135, + "step": 6219 + }, + { + "epoch": 2.9408983451536646, + "grad_norm": 2.779238224029541, + "learning_rate": 2.609135028518866e-06, + "loss": 0.5165, + "step": 6220 + }, + { + "epoch": 2.941371158392435, + "grad_norm": 2.807705879211426, + "learning_rate": 2.608511792839645e-06, + "loss": 0.4046, + "step": 6221 + }, + { + "epoch": 2.9418439716312057, + "grad_norm": 2.6067750453948975, + "learning_rate": 2.607888550403871e-06, + "loss": 0.406, + "step": 6222 + }, + { + "epoch": 2.9423167848699765, + "grad_norm": 2.865766763687134, + "learning_rate": 2.607265301250349e-06, + "loss": 0.471, + "step": 6223 + }, + { + "epoch": 2.942789598108747, + "grad_norm": 2.977681875228882, + "learning_rate": 2.6066420454178876e-06, + "loss": 0.4666, + "step": 6224 + }, + { + "epoch": 2.9432624113475176, + "grad_norm": 2.870884418487549, + "learning_rate": 2.606018782945294e-06, + "loss": 0.4768, + "step": 6225 + }, + { + "epoch": 2.9437352245862884, + 
"grad_norm": 2.992851495742798, + "learning_rate": 2.6053955138713756e-06, + "loss": 0.4657, + "step": 6226 + }, + { + "epoch": 2.9442080378250592, + "grad_norm": 2.7279815673828125, + "learning_rate": 2.6047722382349406e-06, + "loss": 0.4087, + "step": 6227 + }, + { + "epoch": 2.94468085106383, + "grad_norm": 2.8587028980255127, + "learning_rate": 2.604148956074797e-06, + "loss": 0.4452, + "step": 6228 + }, + { + "epoch": 2.9451536643026004, + "grad_norm": 3.001694679260254, + "learning_rate": 2.6035256674297555e-06, + "loss": 0.4852, + "step": 6229 + }, + { + "epoch": 2.945626477541371, + "grad_norm": 2.858069896697998, + "learning_rate": 2.6029023723386237e-06, + "loss": 0.4281, + "step": 6230 + }, + { + "epoch": 2.946099290780142, + "grad_norm": 2.675856828689575, + "learning_rate": 2.602279070840213e-06, + "loss": 0.4545, + "step": 6231 + }, + { + "epoch": 2.9465721040189123, + "grad_norm": 2.530245065689087, + "learning_rate": 2.6016557629733334e-06, + "loss": 0.4619, + "step": 6232 + }, + { + "epoch": 2.947044917257683, + "grad_norm": 2.7533743381500244, + "learning_rate": 2.601032448776795e-06, + "loss": 0.4879, + "step": 6233 + }, + { + "epoch": 2.947517730496454, + "grad_norm": 3.130453109741211, + "learning_rate": 2.600409128289409e-06, + "loss": 0.4056, + "step": 6234 + }, + { + "epoch": 2.9479905437352247, + "grad_norm": 3.4736509323120117, + "learning_rate": 2.5997858015499867e-06, + "loss": 0.5063, + "step": 6235 + }, + { + "epoch": 2.9484633569739955, + "grad_norm": 2.871978282928467, + "learning_rate": 2.5991624685973406e-06, + "loss": 0.4562, + "step": 6236 + }, + { + "epoch": 2.948936170212766, + "grad_norm": 2.976503372192383, + "learning_rate": 2.5985391294702817e-06, + "loss": 0.5079, + "step": 6237 + }, + { + "epoch": 2.9494089834515367, + "grad_norm": 2.578122615814209, + "learning_rate": 2.597915784207623e-06, + "loss": 0.4069, + "step": 6238 + }, + { + "epoch": 2.9498817966903075, + "grad_norm": 2.885911226272583, + "learning_rate": 2.597292432848178e-06, + "loss": 0.4382, + "step": 6239 + }, + { + "epoch": 2.950354609929078, + "grad_norm": 2.9301681518554688, + "learning_rate": 2.5966690754307605e-06, + "loss": 0.4888, + "step": 6240 + }, + { + "epoch": 2.9508274231678486, + "grad_norm": 2.9912192821502686, + "learning_rate": 2.5960457119941834e-06, + "loss": 0.4699, + "step": 6241 + }, + { + "epoch": 2.9513002364066194, + "grad_norm": 2.6612601280212402, + "learning_rate": 2.5954223425772607e-06, + "loss": 0.3736, + "step": 6242 + }, + { + "epoch": 2.9517730496453902, + "grad_norm": 2.9325380325317383, + "learning_rate": 2.5947989672188067e-06, + "loss": 0.4771, + "step": 6243 + }, + { + "epoch": 2.952245862884161, + "grad_norm": 2.8143959045410156, + "learning_rate": 2.594175585957637e-06, + "loss": 0.5103, + "step": 6244 + }, + { + "epoch": 2.9527186761229314, + "grad_norm": 2.355078935623169, + "learning_rate": 2.5935521988325674e-06, + "loss": 0.44, + "step": 6245 + }, + { + "epoch": 2.953191489361702, + "grad_norm": 2.733156442642212, + "learning_rate": 2.5929288058824114e-06, + "loss": 0.4306, + "step": 6246 + }, + { + "epoch": 2.953664302600473, + "grad_norm": 3.182563304901123, + "learning_rate": 2.5923054071459865e-06, + "loss": 0.417, + "step": 6247 + }, + { + "epoch": 2.9541371158392433, + "grad_norm": 2.4162323474884033, + "learning_rate": 2.5916820026621094e-06, + "loss": 0.3802, + "step": 6248 + }, + { + "epoch": 2.954609929078014, + "grad_norm": 2.772706985473633, + "learning_rate": 2.591058592469595e-06, + "loss": 0.4654, + "step": 6249 + }, + { 
+ "epoch": 2.955082742316785, + "grad_norm": 2.6011102199554443, + "learning_rate": 2.5904351766072616e-06, + "loss": 0.4619, + "step": 6250 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 2.5700361728668213, + "learning_rate": 2.589811755113926e-06, + "loss": 0.3991, + "step": 6251 + }, + { + "epoch": 2.9560283687943265, + "grad_norm": 2.6444971561431885, + "learning_rate": 2.589188328028407e-06, + "loss": 0.4388, + "step": 6252 + }, + { + "epoch": 2.956501182033097, + "grad_norm": 2.739567279815674, + "learning_rate": 2.588564895389521e-06, + "loss": 0.4193, + "step": 6253 + }, + { + "epoch": 2.9569739952718677, + "grad_norm": 2.7070045471191406, + "learning_rate": 2.5879414572360877e-06, + "loss": 0.4347, + "step": 6254 + }, + { + "epoch": 2.9574468085106385, + "grad_norm": 2.7811532020568848, + "learning_rate": 2.587318013606926e-06, + "loss": 0.43, + "step": 6255 + }, + { + "epoch": 2.957919621749409, + "grad_norm": 3.0036091804504395, + "learning_rate": 2.5866945645408537e-06, + "loss": 0.4855, + "step": 6256 + }, + { + "epoch": 2.9583924349881796, + "grad_norm": 2.948573112487793, + "learning_rate": 2.5860711100766918e-06, + "loss": 0.4594, + "step": 6257 + }, + { + "epoch": 2.9588652482269504, + "grad_norm": 2.6371593475341797, + "learning_rate": 2.5854476502532583e-06, + "loss": 0.446, + "step": 6258 + }, + { + "epoch": 2.959338061465721, + "grad_norm": 2.668677806854248, + "learning_rate": 2.5848241851093754e-06, + "loss": 0.3991, + "step": 6259 + }, + { + "epoch": 2.959810874704492, + "grad_norm": 3.1640663146972656, + "learning_rate": 2.5842007146838614e-06, + "loss": 0.5146, + "step": 6260 + }, + { + "epoch": 2.9602836879432624, + "grad_norm": 2.9412102699279785, + "learning_rate": 2.5835772390155382e-06, + "loss": 0.4798, + "step": 6261 + }, + { + "epoch": 2.960756501182033, + "grad_norm": 2.7674343585968018, + "learning_rate": 2.582953758143227e-06, + "loss": 0.4262, + "step": 6262 + }, + { + "epoch": 2.961229314420804, + "grad_norm": 3.5219457149505615, + "learning_rate": 2.582330272105749e-06, + "loss": 0.4905, + "step": 6263 + }, + { + "epoch": 2.9617021276595743, + "grad_norm": 2.4274468421936035, + "learning_rate": 2.5817067809419267e-06, + "loss": 0.4048, + "step": 6264 + }, + { + "epoch": 2.962174940898345, + "grad_norm": 2.6907944679260254, + "learning_rate": 2.5810832846905814e-06, + "loss": 0.388, + "step": 6265 + }, + { + "epoch": 2.962647754137116, + "grad_norm": 2.603151321411133, + "learning_rate": 2.5804597833905347e-06, + "loss": 0.4377, + "step": 6266 + }, + { + "epoch": 2.9631205673758867, + "grad_norm": 2.685837507247925, + "learning_rate": 2.57983627708061e-06, + "loss": 0.4409, + "step": 6267 + }, + { + "epoch": 2.963593380614657, + "grad_norm": 2.8281500339508057, + "learning_rate": 2.579212765799631e-06, + "loss": 0.4567, + "step": 6268 + }, + { + "epoch": 2.964066193853428, + "grad_norm": 2.6387875080108643, + "learning_rate": 2.57858924958642e-06, + "loss": 0.4061, + "step": 6269 + }, + { + "epoch": 2.9645390070921986, + "grad_norm": 2.64139986038208, + "learning_rate": 2.5779657284798017e-06, + "loss": 0.4539, + "step": 6270 + }, + { + "epoch": 2.965011820330969, + "grad_norm": 2.7384836673736572, + "learning_rate": 2.5773422025185983e-06, + "loss": 0.408, + "step": 6271 + }, + { + "epoch": 2.96548463356974, + "grad_norm": 2.262514352798462, + "learning_rate": 2.576718671741636e-06, + "loss": 0.3726, + "step": 6272 + }, + { + "epoch": 2.9659574468085106, + "grad_norm": 2.53800106048584, + "learning_rate": 2.5760951361877384e-06, + "loss": 
0.4716, + "step": 6273 + }, + { + "epoch": 2.9664302600472814, + "grad_norm": 3.256701707839966, + "learning_rate": 2.57547159589573e-06, + "loss": 0.518, + "step": 6274 + }, + { + "epoch": 2.966903073286052, + "grad_norm": 2.9427342414855957, + "learning_rate": 2.574848050904436e-06, + "loss": 0.4255, + "step": 6275 + }, + { + "epoch": 2.9673758865248225, + "grad_norm": 2.5794098377227783, + "learning_rate": 2.574224501252682e-06, + "loss": 0.4412, + "step": 6276 + }, + { + "epoch": 2.9678486997635933, + "grad_norm": 2.5894877910614014, + "learning_rate": 2.573600946979294e-06, + "loss": 0.4356, + "step": 6277 + }, + { + "epoch": 2.968321513002364, + "grad_norm": 2.9597361087799072, + "learning_rate": 2.572977388123098e-06, + "loss": 0.4376, + "step": 6278 + }, + { + "epoch": 2.9687943262411345, + "grad_norm": 2.779303550720215, + "learning_rate": 2.5723538247229197e-06, + "loss": 0.3985, + "step": 6279 + }, + { + "epoch": 2.9692671394799053, + "grad_norm": 2.9173855781555176, + "learning_rate": 2.5717302568175866e-06, + "loss": 0.4581, + "step": 6280 + }, + { + "epoch": 2.969739952718676, + "grad_norm": 2.703721284866333, + "learning_rate": 2.5711066844459242e-06, + "loss": 0.3705, + "step": 6281 + }, + { + "epoch": 2.970212765957447, + "grad_norm": 2.5415029525756836, + "learning_rate": 2.5704831076467613e-06, + "loss": 0.4089, + "step": 6282 + }, + { + "epoch": 2.9706855791962177, + "grad_norm": 2.791780948638916, + "learning_rate": 2.5698595264589234e-06, + "loss": 0.4357, + "step": 6283 + }, + { + "epoch": 2.971158392434988, + "grad_norm": 2.887662887573242, + "learning_rate": 2.5692359409212392e-06, + "loss": 0.4093, + "step": 6284 + }, + { + "epoch": 2.971631205673759, + "grad_norm": 3.0309557914733887, + "learning_rate": 2.5686123510725364e-06, + "loss": 0.4461, + "step": 6285 + }, + { + "epoch": 2.9721040189125296, + "grad_norm": 2.6861515045166016, + "learning_rate": 2.5679887569516437e-06, + "loss": 0.4199, + "step": 6286 + }, + { + "epoch": 2.9725768321513, + "grad_norm": 2.7014012336730957, + "learning_rate": 2.5673651585973897e-06, + "loss": 0.4373, + "step": 6287 + }, + { + "epoch": 2.9730496453900708, + "grad_norm": 2.951265811920166, + "learning_rate": 2.5667415560486026e-06, + "loss": 0.4426, + "step": 6288 + }, + { + "epoch": 2.9735224586288416, + "grad_norm": 2.7664504051208496, + "learning_rate": 2.5661179493441106e-06, + "loss": 0.474, + "step": 6289 + }, + { + "epoch": 2.9739952718676124, + "grad_norm": 2.6081087589263916, + "learning_rate": 2.5654943385227445e-06, + "loss": 0.4058, + "step": 6290 + }, + { + "epoch": 2.974468085106383, + "grad_norm": 2.9416966438293457, + "learning_rate": 2.564870723623333e-06, + "loss": 0.506, + "step": 6291 + }, + { + "epoch": 2.9749408983451535, + "grad_norm": 2.9441659450531006, + "learning_rate": 2.564247104684706e-06, + "loss": 0.4505, + "step": 6292 + }, + { + "epoch": 2.9754137115839243, + "grad_norm": 2.7110862731933594, + "learning_rate": 2.563623481745693e-06, + "loss": 0.4493, + "step": 6293 + }, + { + "epoch": 2.975886524822695, + "grad_norm": 2.88459849357605, + "learning_rate": 2.562999854845125e-06, + "loss": 0.4462, + "step": 6294 + }, + { + "epoch": 2.9763593380614655, + "grad_norm": 3.0491793155670166, + "learning_rate": 2.5623762240218327e-06, + "loss": 0.4928, + "step": 6295 + }, + { + "epoch": 2.9768321513002363, + "grad_norm": 2.9475483894348145, + "learning_rate": 2.561752589314646e-06, + "loss": 0.4535, + "step": 6296 + }, + { + "epoch": 2.977304964539007, + "grad_norm": 2.879495859146118, + "learning_rate": 
2.561128950762397e-06, + "loss": 0.4393, + "step": 6297 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 2.8478336334228516, + "learning_rate": 2.560505308403916e-06, + "loss": 0.4363, + "step": 6298 + }, + { + "epoch": 2.9782505910165487, + "grad_norm": 2.5475094318389893, + "learning_rate": 2.5598816622780343e-06, + "loss": 0.3825, + "step": 6299 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 2.85430908203125, + "learning_rate": 2.5592580124235838e-06, + "loss": 0.4226, + "step": 6300 + }, + { + "epoch": 2.97919621749409, + "grad_norm": 2.569775104522705, + "learning_rate": 2.5586343588793975e-06, + "loss": 0.4045, + "step": 6301 + }, + { + "epoch": 2.9796690307328606, + "grad_norm": 2.4482202529907227, + "learning_rate": 2.558010701684307e-06, + "loss": 0.4625, + "step": 6302 + }, + { + "epoch": 2.980141843971631, + "grad_norm": 2.9301230907440186, + "learning_rate": 2.5573870408771436e-06, + "loss": 0.4358, + "step": 6303 + }, + { + "epoch": 2.9806146572104018, + "grad_norm": 2.9865870475769043, + "learning_rate": 2.5567633764967416e-06, + "loss": 0.497, + "step": 6304 + }, + { + "epoch": 2.9810874704491725, + "grad_norm": 2.523524522781372, + "learning_rate": 2.556139708581933e-06, + "loss": 0.4141, + "step": 6305 + }, + { + "epoch": 2.9815602836879433, + "grad_norm": 2.8489344120025635, + "learning_rate": 2.5555160371715504e-06, + "loss": 0.4205, + "step": 6306 + }, + { + "epoch": 2.982033096926714, + "grad_norm": 2.417759895324707, + "learning_rate": 2.5548923623044274e-06, + "loss": 0.44, + "step": 6307 + }, + { + "epoch": 2.9825059101654845, + "grad_norm": 2.7626900672912598, + "learning_rate": 2.554268684019398e-06, + "loss": 0.4646, + "step": 6308 + }, + { + "epoch": 2.9829787234042553, + "grad_norm": 3.0916266441345215, + "learning_rate": 2.5536450023552956e-06, + "loss": 0.4443, + "step": 6309 + }, + { + "epoch": 2.983451536643026, + "grad_norm": 2.721992015838623, + "learning_rate": 2.5530213173509542e-06, + "loss": 0.4008, + "step": 6310 + }, + { + "epoch": 2.9839243498817964, + "grad_norm": 2.825334072113037, + "learning_rate": 2.552397629045208e-06, + "loss": 0.4513, + "step": 6311 + }, + { + "epoch": 2.9843971631205672, + "grad_norm": 2.912050485610962, + "learning_rate": 2.5517739374768915e-06, + "loss": 0.4104, + "step": 6312 + }, + { + "epoch": 2.984869976359338, + "grad_norm": 2.760650634765625, + "learning_rate": 2.551150242684838e-06, + "loss": 0.4372, + "step": 6313 + }, + { + "epoch": 2.985342789598109, + "grad_norm": 2.8926033973693848, + "learning_rate": 2.5505265447078838e-06, + "loss": 0.475, + "step": 6314 + }, + { + "epoch": 2.9858156028368796, + "grad_norm": 2.6279892921447754, + "learning_rate": 2.5499028435848633e-06, + "loss": 0.4589, + "step": 6315 + }, + { + "epoch": 2.98628841607565, + "grad_norm": 3.2147316932678223, + "learning_rate": 2.549279139354611e-06, + "loss": 0.4968, + "step": 6316 + }, + { + "epoch": 2.986761229314421, + "grad_norm": 2.4510674476623535, + "learning_rate": 2.5486554320559626e-06, + "loss": 0.4291, + "step": 6317 + }, + { + "epoch": 2.9872340425531916, + "grad_norm": 2.6919643878936768, + "learning_rate": 2.5480317217277544e-06, + "loss": 0.4704, + "step": 6318 + }, + { + "epoch": 2.987706855791962, + "grad_norm": 2.9832234382629395, + "learning_rate": 2.5474080084088215e-06, + "loss": 0.4129, + "step": 6319 + }, + { + "epoch": 2.9881796690307327, + "grad_norm": 2.893209218978882, + "learning_rate": 2.5467842921380004e-06, + "loss": 0.5099, + "step": 6320 + }, + { + "epoch": 2.9886524822695035, + "grad_norm": 
2.6734580993652344, + "learning_rate": 2.5461605729541254e-06, + "loss": 0.4588, + "step": 6321 + }, + { + "epoch": 2.9891252955082743, + "grad_norm": 2.5591681003570557, + "learning_rate": 2.5455368508960343e-06, + "loss": 0.4162, + "step": 6322 + }, + { + "epoch": 2.989598108747045, + "grad_norm": 3.2619881629943848, + "learning_rate": 2.5449131260025626e-06, + "loss": 0.4412, + "step": 6323 + }, + { + "epoch": 2.9900709219858155, + "grad_norm": 2.897914409637451, + "learning_rate": 2.544289398312549e-06, + "loss": 0.5079, + "step": 6324 + }, + { + "epoch": 2.9905437352245863, + "grad_norm": 2.7891685962677, + "learning_rate": 2.5436656678648274e-06, + "loss": 0.42, + "step": 6325 + }, + { + "epoch": 2.991016548463357, + "grad_norm": 3.022341728210449, + "learning_rate": 2.5430419346982367e-06, + "loss": 0.4739, + "step": 6326 + }, + { + "epoch": 2.9914893617021274, + "grad_norm": 3.395775556564331, + "learning_rate": 2.542418198851614e-06, + "loss": 0.4822, + "step": 6327 + }, + { + "epoch": 2.9919621749408982, + "grad_norm": 3.0200490951538086, + "learning_rate": 2.541794460363795e-06, + "loss": 0.4755, + "step": 6328 + }, + { + "epoch": 2.992434988179669, + "grad_norm": 3.302020311355591, + "learning_rate": 2.541170719273619e-06, + "loss": 0.4603, + "step": 6329 + }, + { + "epoch": 2.99290780141844, + "grad_norm": 2.5985910892486572, + "learning_rate": 2.5405469756199226e-06, + "loss": 0.4475, + "step": 6330 + }, + { + "epoch": 2.9933806146572106, + "grad_norm": 2.9413928985595703, + "learning_rate": 2.5399232294415434e-06, + "loss": 0.4695, + "step": 6331 + }, + { + "epoch": 2.993853427895981, + "grad_norm": 2.942777156829834, + "learning_rate": 2.53929948077732e-06, + "loss": 0.4462, + "step": 6332 + }, + { + "epoch": 2.9943262411347518, + "grad_norm": 2.971120595932007, + "learning_rate": 2.53867572966609e-06, + "loss": 0.4546, + "step": 6333 + }, + { + "epoch": 2.9947990543735226, + "grad_norm": 2.8248138427734375, + "learning_rate": 2.5380519761466927e-06, + "loss": 0.453, + "step": 6334 + }, + { + "epoch": 2.995271867612293, + "grad_norm": 3.0819008350372314, + "learning_rate": 2.5374282202579647e-06, + "loss": 0.4774, + "step": 6335 + }, + { + "epoch": 2.9957446808510637, + "grad_norm": 2.742570161819458, + "learning_rate": 2.5368044620387466e-06, + "loss": 0.5059, + "step": 6336 + }, + { + "epoch": 2.9962174940898345, + "grad_norm": 2.9087419509887695, + "learning_rate": 2.5361807015278757e-06, + "loss": 0.3606, + "step": 6337 + }, + { + "epoch": 2.9966903073286053, + "grad_norm": 2.6887354850769043, + "learning_rate": 2.5355569387641908e-06, + "loss": 0.4247, + "step": 6338 + }, + { + "epoch": 2.997163120567376, + "grad_norm": 2.8516008853912354, + "learning_rate": 2.534933173786531e-06, + "loss": 0.4502, + "step": 6339 + }, + { + "epoch": 2.9976359338061465, + "grad_norm": 2.4463164806365967, + "learning_rate": 2.5343094066337366e-06, + "loss": 0.3883, + "step": 6340 + }, + { + "epoch": 2.9981087470449173, + "grad_norm": 2.87025785446167, + "learning_rate": 2.533685637344645e-06, + "loss": 0.4534, + "step": 6341 + }, + { + "epoch": 2.998581560283688, + "grad_norm": 3.0706169605255127, + "learning_rate": 2.5330618659580967e-06, + "loss": 0.5426, + "step": 6342 + }, + { + "epoch": 2.9990543735224584, + "grad_norm": 2.7185773849487305, + "learning_rate": 2.532438092512931e-06, + "loss": 0.497, + "step": 6343 + }, + { + "epoch": 2.999527186761229, + "grad_norm": 2.840207815170288, + "learning_rate": 2.531814317047988e-06, + "loss": 0.4073, + "step": 6344 + }, + { + "epoch": 3.0, 
+ "grad_norm": 3.1592655181884766, + "learning_rate": 2.5311905396021063e-06, + "loss": 0.4728, + "step": 6345 + }, + { + "epoch": 3.000472813238771, + "grad_norm": 2.190042495727539, + "learning_rate": 2.530566760214127e-06, + "loss": 0.3588, + "step": 6346 + }, + { + "epoch": 3.000945626477541, + "grad_norm": 2.749516248703003, + "learning_rate": 2.5299429789228898e-06, + "loss": 0.3495, + "step": 6347 + }, + { + "epoch": 3.001418439716312, + "grad_norm": 2.6181938648223877, + "learning_rate": 2.5293191957672335e-06, + "loss": 0.3611, + "step": 6348 + }, + { + "epoch": 3.0018912529550827, + "grad_norm": 2.7235212326049805, + "learning_rate": 2.528695410786e-06, + "loss": 0.4173, + "step": 6349 + }, + { + "epoch": 3.0023640661938535, + "grad_norm": 2.5408031940460205, + "learning_rate": 2.528071624018029e-06, + "loss": 0.3651, + "step": 6350 + }, + { + "epoch": 3.002836879432624, + "grad_norm": 2.7824409008026123, + "learning_rate": 2.5274478355021615e-06, + "loss": 0.378, + "step": 6351 + }, + { + "epoch": 3.0033096926713947, + "grad_norm": 2.7671427726745605, + "learning_rate": 2.526824045277238e-06, + "loss": 0.446, + "step": 6352 + }, + { + "epoch": 3.0037825059101655, + "grad_norm": 2.6746346950531006, + "learning_rate": 2.526200253382098e-06, + "loss": 0.3831, + "step": 6353 + }, + { + "epoch": 3.0042553191489363, + "grad_norm": 2.437439441680908, + "learning_rate": 2.525576459855583e-06, + "loss": 0.352, + "step": 6354 + }, + { + "epoch": 3.0047281323877066, + "grad_norm": 2.7632546424865723, + "learning_rate": 2.5249526647365343e-06, + "loss": 0.4636, + "step": 6355 + }, + { + "epoch": 3.0052009456264774, + "grad_norm": 2.681955099105835, + "learning_rate": 2.524328868063793e-06, + "loss": 0.3978, + "step": 6356 + }, + { + "epoch": 3.0056737588652482, + "grad_norm": 2.9575345516204834, + "learning_rate": 2.523705069876199e-06, + "loss": 0.3803, + "step": 6357 + }, + { + "epoch": 3.006146572104019, + "grad_norm": 2.7368216514587402, + "learning_rate": 2.523081270212594e-06, + "loss": 0.3968, + "step": 6358 + }, + { + "epoch": 3.0066193853427894, + "grad_norm": 2.637592077255249, + "learning_rate": 2.522457469111821e-06, + "loss": 0.3629, + "step": 6359 + }, + { + "epoch": 3.00709219858156, + "grad_norm": 2.579331398010254, + "learning_rate": 2.5218336666127187e-06, + "loss": 0.4044, + "step": 6360 + }, + { + "epoch": 3.007565011820331, + "grad_norm": 3.014544725418091, + "learning_rate": 2.5212098627541296e-06, + "loss": 0.3518, + "step": 6361 + }, + { + "epoch": 3.0080378250591018, + "grad_norm": 2.5261058807373047, + "learning_rate": 2.520586057574896e-06, + "loss": 0.3763, + "step": 6362 + }, + { + "epoch": 3.008510638297872, + "grad_norm": 3.234910249710083, + "learning_rate": 2.519962251113858e-06, + "loss": 0.3691, + "step": 6363 + }, + { + "epoch": 3.008983451536643, + "grad_norm": 3.2930967807769775, + "learning_rate": 2.519338443409859e-06, + "loss": 0.4363, + "step": 6364 + }, + { + "epoch": 3.0094562647754137, + "grad_norm": 2.807910442352295, + "learning_rate": 2.51871463450174e-06, + "loss": 0.3984, + "step": 6365 + }, + { + "epoch": 3.0099290780141845, + "grad_norm": 3.1555075645446777, + "learning_rate": 2.518090824428342e-06, + "loss": 0.4006, + "step": 6366 + }, + { + "epoch": 3.010401891252955, + "grad_norm": 3.1793272495269775, + "learning_rate": 2.5174670132285084e-06, + "loss": 0.4966, + "step": 6367 + }, + { + "epoch": 3.0108747044917257, + "grad_norm": 2.7007548809051514, + "learning_rate": 2.5168432009410805e-06, + "loss": 0.3755, + "step": 6368 + }, + { + 
"epoch": 3.0113475177304965, + "grad_norm": 2.914792537689209, + "learning_rate": 2.5162193876048995e-06, + "loss": 0.39, + "step": 6369 + }, + { + "epoch": 3.0118203309692673, + "grad_norm": 2.935516119003296, + "learning_rate": 2.5155955732588093e-06, + "loss": 0.4045, + "step": 6370 + }, + { + "epoch": 3.0122931442080376, + "grad_norm": 2.8817989826202393, + "learning_rate": 2.5149717579416503e-06, + "loss": 0.3751, + "step": 6371 + }, + { + "epoch": 3.0127659574468084, + "grad_norm": 2.9181740283966064, + "learning_rate": 2.514347941692266e-06, + "loss": 0.3689, + "step": 6372 + }, + { + "epoch": 3.013238770685579, + "grad_norm": 3.052060604095459, + "learning_rate": 2.5137241245494982e-06, + "loss": 0.3874, + "step": 6373 + }, + { + "epoch": 3.01371158392435, + "grad_norm": 2.6931657791137695, + "learning_rate": 2.513100306552189e-06, + "loss": 0.3673, + "step": 6374 + }, + { + "epoch": 3.0141843971631204, + "grad_norm": 2.3422248363494873, + "learning_rate": 2.5124764877391824e-06, + "loss": 0.3753, + "step": 6375 + }, + { + "epoch": 3.014657210401891, + "grad_norm": 2.5826265811920166, + "learning_rate": 2.5118526681493186e-06, + "loss": 0.3661, + "step": 6376 + }, + { + "epoch": 3.015130023640662, + "grad_norm": 2.7407493591308594, + "learning_rate": 2.5112288478214415e-06, + "loss": 0.3887, + "step": 6377 + }, + { + "epoch": 3.0156028368794328, + "grad_norm": 2.7378315925598145, + "learning_rate": 2.510605026794393e-06, + "loss": 0.3623, + "step": 6378 + }, + { + "epoch": 3.016075650118203, + "grad_norm": 2.59541654586792, + "learning_rate": 2.5099812051070167e-06, + "loss": 0.3804, + "step": 6379 + }, + { + "epoch": 3.016548463356974, + "grad_norm": 3.1022770404815674, + "learning_rate": 2.509357382798154e-06, + "loss": 0.4092, + "step": 6380 + }, + { + "epoch": 3.0170212765957447, + "grad_norm": 2.521545648574829, + "learning_rate": 2.5087335599066476e-06, + "loss": 0.3509, + "step": 6381 + }, + { + "epoch": 3.0174940898345155, + "grad_norm": 2.949395179748535, + "learning_rate": 2.5081097364713407e-06, + "loss": 0.387, + "step": 6382 + }, + { + "epoch": 3.017966903073286, + "grad_norm": 2.4806487560272217, + "learning_rate": 2.507485912531077e-06, + "loss": 0.4004, + "step": 6383 + }, + { + "epoch": 3.0184397163120567, + "grad_norm": 2.6480894088745117, + "learning_rate": 2.506862088124698e-06, + "loss": 0.3366, + "step": 6384 + }, + { + "epoch": 3.0189125295508275, + "grad_norm": 2.62559175491333, + "learning_rate": 2.5062382632910463e-06, + "loss": 0.3676, + "step": 6385 + }, + { + "epoch": 3.0193853427895982, + "grad_norm": 2.694767951965332, + "learning_rate": 2.5056144380689657e-06, + "loss": 0.3438, + "step": 6386 + }, + { + "epoch": 3.0198581560283686, + "grad_norm": 2.808107614517212, + "learning_rate": 2.504990612497299e-06, + "loss": 0.3831, + "step": 6387 + }, + { + "epoch": 3.0203309692671394, + "grad_norm": 3.2392303943634033, + "learning_rate": 2.504366786614888e-06, + "loss": 0.3493, + "step": 6388 + }, + { + "epoch": 3.02080378250591, + "grad_norm": 2.6899030208587646, + "learning_rate": 2.5037429604605774e-06, + "loss": 0.3998, + "step": 6389 + }, + { + "epoch": 3.021276595744681, + "grad_norm": 2.5622799396514893, + "learning_rate": 2.503119134073208e-06, + "loss": 0.3443, + "step": 6390 + }, + { + "epoch": 3.0217494089834513, + "grad_norm": 2.716832399368286, + "learning_rate": 2.502495307491625e-06, + "loss": 0.4465, + "step": 6391 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 2.8117692470550537, + "learning_rate": 2.501871480754669e-06, + "loss": 
0.3513, + "step": 6392 + }, + { + "epoch": 3.022695035460993, + "grad_norm": 3.1260762214660645, + "learning_rate": 2.501247653901185e-06, + "loss": 0.4336, + "step": 6393 + }, + { + "epoch": 3.0231678486997637, + "grad_norm": 2.5076897144317627, + "learning_rate": 2.5006238269700137e-06, + "loss": 0.3437, + "step": 6394 + }, + { + "epoch": 3.023640661938534, + "grad_norm": 2.781937837600708, + "learning_rate": 2.5e-06, + "loss": 0.3583, + "step": 6395 + }, + { + "epoch": 3.024113475177305, + "grad_norm": 3.084050178527832, + "learning_rate": 2.499376173029987e-06, + "loss": 0.3785, + "step": 6396 + }, + { + "epoch": 3.0245862884160757, + "grad_norm": 3.2292473316192627, + "learning_rate": 2.498752346098816e-06, + "loss": 0.3858, + "step": 6397 + }, + { + "epoch": 3.0250591016548465, + "grad_norm": 2.738614797592163, + "learning_rate": 2.498128519245332e-06, + "loss": 0.4166, + "step": 6398 + }, + { + "epoch": 3.025531914893617, + "grad_norm": 2.940103054046631, + "learning_rate": 2.4975046925083764e-06, + "loss": 0.4117, + "step": 6399 + }, + { + "epoch": 3.0260047281323876, + "grad_norm": 2.5177032947540283, + "learning_rate": 2.4968808659267927e-06, + "loss": 0.3704, + "step": 6400 + }, + { + "epoch": 3.0264775413711584, + "grad_norm": 2.6969990730285645, + "learning_rate": 2.4962570395394243e-06, + "loss": 0.3721, + "step": 6401 + }, + { + "epoch": 3.0269503546099292, + "grad_norm": 2.9696028232574463, + "learning_rate": 2.495633213385112e-06, + "loss": 0.3934, + "step": 6402 + }, + { + "epoch": 3.0274231678486996, + "grad_norm": 3.4032552242279053, + "learning_rate": 2.495009387502702e-06, + "loss": 0.3877, + "step": 6403 + }, + { + "epoch": 3.0278959810874704, + "grad_norm": 2.6801865100860596, + "learning_rate": 2.4943855619310343e-06, + "loss": 0.3421, + "step": 6404 + }, + { + "epoch": 3.028368794326241, + "grad_norm": 2.827056884765625, + "learning_rate": 2.493761736708954e-06, + "loss": 0.3791, + "step": 6405 + }, + { + "epoch": 3.028841607565012, + "grad_norm": 2.6393566131591797, + "learning_rate": 2.4931379118753034e-06, + "loss": 0.3729, + "step": 6406 + }, + { + "epoch": 3.0293144208037823, + "grad_norm": 2.833519458770752, + "learning_rate": 2.4925140874689236e-06, + "loss": 0.3836, + "step": 6407 + }, + { + "epoch": 3.029787234042553, + "grad_norm": 2.8852169513702393, + "learning_rate": 2.4918902635286597e-06, + "loss": 0.4307, + "step": 6408 + }, + { + "epoch": 3.030260047281324, + "grad_norm": 2.7166404724121094, + "learning_rate": 2.491266440093354e-06, + "loss": 0.3825, + "step": 6409 + }, + { + "epoch": 3.0307328605200947, + "grad_norm": 2.5828018188476562, + "learning_rate": 2.4906426172018474e-06, + "loss": 0.3579, + "step": 6410 + }, + { + "epoch": 3.031205673758865, + "grad_norm": 2.915632724761963, + "learning_rate": 2.490018794892985e-06, + "loss": 0.4099, + "step": 6411 + }, + { + "epoch": 3.031678486997636, + "grad_norm": 2.7117249965667725, + "learning_rate": 2.489394973205607e-06, + "loss": 0.4063, + "step": 6412 + }, + { + "epoch": 3.0321513002364067, + "grad_norm": 2.3989102840423584, + "learning_rate": 2.488771152178559e-06, + "loss": 0.3377, + "step": 6413 + }, + { + "epoch": 3.0326241134751775, + "grad_norm": 2.6560115814208984, + "learning_rate": 2.488147331850682e-06, + "loss": 0.4072, + "step": 6414 + }, + { + "epoch": 3.033096926713948, + "grad_norm": 2.9466328620910645, + "learning_rate": 2.4875235122608184e-06, + "loss": 0.3559, + "step": 6415 + }, + { + "epoch": 3.0335697399527186, + "grad_norm": 2.765348196029663, + "learning_rate": 
2.4868996934478114e-06, + "loss": 0.336, + "step": 6416 + }, + { + "epoch": 3.0340425531914894, + "grad_norm": 2.6021807193756104, + "learning_rate": 2.4862758754505017e-06, + "loss": 0.3861, + "step": 6417 + }, + { + "epoch": 3.03451536643026, + "grad_norm": 2.7293684482574463, + "learning_rate": 2.4856520583077344e-06, + "loss": 0.3926, + "step": 6418 + }, + { + "epoch": 3.0349881796690306, + "grad_norm": 2.9704763889312744, + "learning_rate": 2.485028242058351e-06, + "loss": 0.4303, + "step": 6419 + }, + { + "epoch": 3.0354609929078014, + "grad_norm": 3.385713815689087, + "learning_rate": 2.484404426741191e-06, + "loss": 0.44, + "step": 6420 + }, + { + "epoch": 3.035933806146572, + "grad_norm": 3.177983045578003, + "learning_rate": 2.4837806123951013e-06, + "loss": 0.4256, + "step": 6421 + }, + { + "epoch": 3.036406619385343, + "grad_norm": 2.6287200450897217, + "learning_rate": 2.4831567990589203e-06, + "loss": 0.3764, + "step": 6422 + }, + { + "epoch": 3.0368794326241133, + "grad_norm": 2.81823992729187, + "learning_rate": 2.4825329867714924e-06, + "loss": 0.3645, + "step": 6423 + }, + { + "epoch": 3.037352245862884, + "grad_norm": 3.1826934814453125, + "learning_rate": 2.4819091755716586e-06, + "loss": 0.3666, + "step": 6424 + }, + { + "epoch": 3.037825059101655, + "grad_norm": 3.0880346298217773, + "learning_rate": 2.481285365498261e-06, + "loss": 0.4339, + "step": 6425 + }, + { + "epoch": 3.0382978723404257, + "grad_norm": 3.1764965057373047, + "learning_rate": 2.480661556590142e-06, + "loss": 0.4804, + "step": 6426 + }, + { + "epoch": 3.038770685579196, + "grad_norm": 2.89469313621521, + "learning_rate": 2.480037748886142e-06, + "loss": 0.3875, + "step": 6427 + }, + { + "epoch": 3.039243498817967, + "grad_norm": 2.6043636798858643, + "learning_rate": 2.479413942425105e-06, + "loss": 0.3859, + "step": 6428 + }, + { + "epoch": 3.0397163120567376, + "grad_norm": 2.6570727825164795, + "learning_rate": 2.4787901372458712e-06, + "loss": 0.3508, + "step": 6429 + }, + { + "epoch": 3.0401891252955084, + "grad_norm": 2.914050579071045, + "learning_rate": 2.4781663333872825e-06, + "loss": 0.3904, + "step": 6430 + }, + { + "epoch": 3.040661938534279, + "grad_norm": 2.595606803894043, + "learning_rate": 2.47754253088818e-06, + "loss": 0.3753, + "step": 6431 + }, + { + "epoch": 3.0411347517730496, + "grad_norm": 2.68186616897583, + "learning_rate": 2.4769187297874065e-06, + "loss": 0.3545, + "step": 6432 + }, + { + "epoch": 3.0416075650118204, + "grad_norm": 2.956507921218872, + "learning_rate": 2.476294930123802e-06, + "loss": 0.3778, + "step": 6433 + }, + { + "epoch": 3.042080378250591, + "grad_norm": 2.8327226638793945, + "learning_rate": 2.475671131936209e-06, + "loss": 0.3205, + "step": 6434 + }, + { + "epoch": 3.0425531914893615, + "grad_norm": 2.594348430633545, + "learning_rate": 2.475047335263466e-06, + "loss": 0.3859, + "step": 6435 + }, + { + "epoch": 3.0430260047281323, + "grad_norm": 3.5030717849731445, + "learning_rate": 2.4744235401444177e-06, + "loss": 0.3611, + "step": 6436 + }, + { + "epoch": 3.043498817966903, + "grad_norm": 2.8478317260742188, + "learning_rate": 2.4737997466179034e-06, + "loss": 0.3927, + "step": 6437 + }, + { + "epoch": 3.043971631205674, + "grad_norm": 2.677827835083008, + "learning_rate": 2.4731759547227627e-06, + "loss": 0.3784, + "step": 6438 + }, + { + "epoch": 3.0444444444444443, + "grad_norm": 3.0059866905212402, + "learning_rate": 2.4725521644978393e-06, + "loss": 0.4279, + "step": 6439 + }, + { + "epoch": 3.044917257683215, + "grad_norm": 
3.012500047683716, + "learning_rate": 2.4719283759819713e-06, + "loss": 0.4007, + "step": 6440 + }, + { + "epoch": 3.045390070921986, + "grad_norm": 2.758204936981201, + "learning_rate": 2.4713045892140007e-06, + "loss": 0.3668, + "step": 6441 + }, + { + "epoch": 3.0458628841607567, + "grad_norm": 2.9551615715026855, + "learning_rate": 2.4706808042327678e-06, + "loss": 0.3524, + "step": 6442 + }, + { + "epoch": 3.046335697399527, + "grad_norm": 2.8639965057373047, + "learning_rate": 2.4700570210771115e-06, + "loss": 0.3886, + "step": 6443 + }, + { + "epoch": 3.046808510638298, + "grad_norm": 2.718219757080078, + "learning_rate": 2.4694332397858738e-06, + "loss": 0.3693, + "step": 6444 + }, + { + "epoch": 3.0472813238770686, + "grad_norm": 3.050135612487793, + "learning_rate": 2.4688094603978933e-06, + "loss": 0.3979, + "step": 6445 + }, + { + "epoch": 3.0477541371158394, + "grad_norm": 2.786186456680298, + "learning_rate": 2.468185682952013e-06, + "loss": 0.3809, + "step": 6446 + }, + { + "epoch": 3.0482269503546098, + "grad_norm": 2.6462252140045166, + "learning_rate": 2.4675619074870697e-06, + "loss": 0.3746, + "step": 6447 + }, + { + "epoch": 3.0486997635933806, + "grad_norm": 2.984783887863159, + "learning_rate": 2.4669381340419037e-06, + "loss": 0.4092, + "step": 6448 + }, + { + "epoch": 3.0491725768321514, + "grad_norm": 2.936380624771118, + "learning_rate": 2.466314362655356e-06, + "loss": 0.4335, + "step": 6449 + }, + { + "epoch": 3.049645390070922, + "grad_norm": 2.730738639831543, + "learning_rate": 2.465690593366264e-06, + "loss": 0.364, + "step": 6450 + }, + { + "epoch": 3.0501182033096925, + "grad_norm": 2.7273590564727783, + "learning_rate": 2.4650668262134693e-06, + "loss": 0.3905, + "step": 6451 + }, + { + "epoch": 3.0505910165484633, + "grad_norm": 2.9588208198547363, + "learning_rate": 2.4644430612358105e-06, + "loss": 0.3936, + "step": 6452 + }, + { + "epoch": 3.051063829787234, + "grad_norm": 2.8721611499786377, + "learning_rate": 2.4638192984721247e-06, + "loss": 0.4279, + "step": 6453 + }, + { + "epoch": 3.051536643026005, + "grad_norm": 3.7179651260375977, + "learning_rate": 2.463195537961254e-06, + "loss": 0.427, + "step": 6454 + }, + { + "epoch": 3.0520094562647753, + "grad_norm": 2.651731491088867, + "learning_rate": 2.4625717797420353e-06, + "loss": 0.3471, + "step": 6455 + }, + { + "epoch": 3.052482269503546, + "grad_norm": 3.898737668991089, + "learning_rate": 2.4619480238533085e-06, + "loss": 0.4574, + "step": 6456 + }, + { + "epoch": 3.052955082742317, + "grad_norm": 2.916252374649048, + "learning_rate": 2.4613242703339108e-06, + "loss": 0.3622, + "step": 6457 + }, + { + "epoch": 3.0534278959810877, + "grad_norm": 3.122565507888794, + "learning_rate": 2.4607005192226806e-06, + "loss": 0.3954, + "step": 6458 + }, + { + "epoch": 3.053900709219858, + "grad_norm": 3.2377424240112305, + "learning_rate": 2.4600767705584575e-06, + "loss": 0.4082, + "step": 6459 + }, + { + "epoch": 3.054373522458629, + "grad_norm": 2.941102981567383, + "learning_rate": 2.459453024380079e-06, + "loss": 0.4324, + "step": 6460 + }, + { + "epoch": 3.0548463356973996, + "grad_norm": 2.964313507080078, + "learning_rate": 2.4588292807263816e-06, + "loss": 0.3037, + "step": 6461 + }, + { + "epoch": 3.0553191489361704, + "grad_norm": 2.824669599533081, + "learning_rate": 2.4582055396362055e-06, + "loss": 0.4076, + "step": 6462 + }, + { + "epoch": 3.0557919621749408, + "grad_norm": 2.7739884853363037, + "learning_rate": 2.457581801148387e-06, + "loss": 0.3615, + "step": 6463 + }, + { + 
"epoch": 3.0562647754137116, + "grad_norm": 3.2974464893341064, + "learning_rate": 2.456958065301764e-06, + "loss": 0.426, + "step": 6464 + }, + { + "epoch": 3.0567375886524824, + "grad_norm": 3.0801217555999756, + "learning_rate": 2.456334332135174e-06, + "loss": 0.3737, + "step": 6465 + }, + { + "epoch": 3.057210401891253, + "grad_norm": 2.788851022720337, + "learning_rate": 2.455710601687452e-06, + "loss": 0.4367, + "step": 6466 + }, + { + "epoch": 3.0576832151300235, + "grad_norm": 2.8078136444091797, + "learning_rate": 2.4550868739974378e-06, + "loss": 0.3796, + "step": 6467 + }, + { + "epoch": 3.0581560283687943, + "grad_norm": 2.9871349334716797, + "learning_rate": 2.4544631491039657e-06, + "loss": 0.3869, + "step": 6468 + }, + { + "epoch": 3.058628841607565, + "grad_norm": 2.9170174598693848, + "learning_rate": 2.453839427045875e-06, + "loss": 0.4591, + "step": 6469 + }, + { + "epoch": 3.059101654846336, + "grad_norm": 2.7316131591796875, + "learning_rate": 2.4532157078620013e-06, + "loss": 0.3723, + "step": 6470 + }, + { + "epoch": 3.0595744680851062, + "grad_norm": 3.047921657562256, + "learning_rate": 2.4525919915911793e-06, + "loss": 0.3804, + "step": 6471 + }, + { + "epoch": 3.060047281323877, + "grad_norm": 3.047934055328369, + "learning_rate": 2.4519682782722465e-06, + "loss": 0.3949, + "step": 6472 + }, + { + "epoch": 3.060520094562648, + "grad_norm": 2.4911186695098877, + "learning_rate": 2.4513445679440374e-06, + "loss": 0.3629, + "step": 6473 + }, + { + "epoch": 3.0609929078014186, + "grad_norm": 2.5353519916534424, + "learning_rate": 2.4507208606453895e-06, + "loss": 0.3417, + "step": 6474 + }, + { + "epoch": 3.061465721040189, + "grad_norm": 2.474622964859009, + "learning_rate": 2.4500971564151384e-06, + "loss": 0.3468, + "step": 6475 + }, + { + "epoch": 3.06193853427896, + "grad_norm": 2.7016963958740234, + "learning_rate": 2.4494734552921166e-06, + "loss": 0.3872, + "step": 6476 + }, + { + "epoch": 3.0624113475177306, + "grad_norm": 2.912144184112549, + "learning_rate": 2.4488497573151625e-06, + "loss": 0.3727, + "step": 6477 + }, + { + "epoch": 3.0628841607565014, + "grad_norm": 2.8234877586364746, + "learning_rate": 2.4482260625231093e-06, + "loss": 0.3472, + "step": 6478 + }, + { + "epoch": 3.0633569739952717, + "grad_norm": 2.6554179191589355, + "learning_rate": 2.447602370954793e-06, + "loss": 0.343, + "step": 6479 + }, + { + "epoch": 3.0638297872340425, + "grad_norm": 2.666419744491577, + "learning_rate": 2.446978682649047e-06, + "loss": 0.3932, + "step": 6480 + }, + { + "epoch": 3.0643026004728133, + "grad_norm": 2.968574285507202, + "learning_rate": 2.446354997644705e-06, + "loss": 0.4418, + "step": 6481 + }, + { + "epoch": 3.064775413711584, + "grad_norm": 2.692253589630127, + "learning_rate": 2.4457313159806028e-06, + "loss": 0.3141, + "step": 6482 + }, + { + "epoch": 3.0652482269503545, + "grad_norm": 2.5857295989990234, + "learning_rate": 2.445107637695574e-06, + "loss": 0.3392, + "step": 6483 + }, + { + "epoch": 3.0657210401891253, + "grad_norm": 3.2332825660705566, + "learning_rate": 2.4444839628284504e-06, + "loss": 0.4694, + "step": 6484 + }, + { + "epoch": 3.066193853427896, + "grad_norm": 2.7391014099121094, + "learning_rate": 2.4438602914180684e-06, + "loss": 0.3966, + "step": 6485 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 2.7882139682769775, + "learning_rate": 2.4432366235032593e-06, + "loss": 0.3552, + "step": 6486 + }, + { + "epoch": 3.0671394799054372, + "grad_norm": 2.8907811641693115, + "learning_rate": 2.4426129591228573e-06, + 
"loss": 0.4478, + "step": 6487 + }, + { + "epoch": 3.067612293144208, + "grad_norm": 2.878929853439331, + "learning_rate": 2.4419892983156947e-06, + "loss": 0.3457, + "step": 6488 + }, + { + "epoch": 3.068085106382979, + "grad_norm": 2.7087442874908447, + "learning_rate": 2.441365641120603e-06, + "loss": 0.3491, + "step": 6489 + }, + { + "epoch": 3.0685579196217496, + "grad_norm": 3.2330431938171387, + "learning_rate": 2.4407419875764167e-06, + "loss": 0.3901, + "step": 6490 + }, + { + "epoch": 3.06903073286052, + "grad_norm": 3.0529370307922363, + "learning_rate": 2.440118337721966e-06, + "loss": 0.4059, + "step": 6491 + }, + { + "epoch": 3.0695035460992908, + "grad_norm": 2.4786794185638428, + "learning_rate": 2.439494691596085e-06, + "loss": 0.3153, + "step": 6492 + }, + { + "epoch": 3.0699763593380616, + "grad_norm": 2.956310510635376, + "learning_rate": 2.438871049237604e-06, + "loss": 0.3973, + "step": 6493 + }, + { + "epoch": 3.0704491725768324, + "grad_norm": 3.0816991329193115, + "learning_rate": 2.4382474106853543e-06, + "loss": 0.388, + "step": 6494 + }, + { + "epoch": 3.0709219858156027, + "grad_norm": 2.6103477478027344, + "learning_rate": 2.4376237759781686e-06, + "loss": 0.3656, + "step": 6495 + }, + { + "epoch": 3.0713947990543735, + "grad_norm": 2.974076271057129, + "learning_rate": 2.437000145154875e-06, + "loss": 0.3246, + "step": 6496 + }, + { + "epoch": 3.0718676122931443, + "grad_norm": 2.633605718612671, + "learning_rate": 2.4363765182543075e-06, + "loss": 0.3556, + "step": 6497 + }, + { + "epoch": 3.072340425531915, + "grad_norm": 2.49161434173584, + "learning_rate": 2.4357528953152953e-06, + "loss": 0.3506, + "step": 6498 + }, + { + "epoch": 3.0728132387706855, + "grad_norm": 2.6435935497283936, + "learning_rate": 2.4351292763766676e-06, + "loss": 0.3652, + "step": 6499 + }, + { + "epoch": 3.0732860520094563, + "grad_norm": 2.9710617065429688, + "learning_rate": 2.4345056614772563e-06, + "loss": 0.3713, + "step": 6500 + }, + { + "epoch": 3.073758865248227, + "grad_norm": 2.6947052478790283, + "learning_rate": 2.43388205065589e-06, + "loss": 0.378, + "step": 6501 + }, + { + "epoch": 3.0742316784869974, + "grad_norm": 2.9686238765716553, + "learning_rate": 2.433258443951398e-06, + "loss": 0.3936, + "step": 6502 + }, + { + "epoch": 3.074704491725768, + "grad_norm": 2.6008691787719727, + "learning_rate": 2.432634841402611e-06, + "loss": 0.3709, + "step": 6503 + }, + { + "epoch": 3.075177304964539, + "grad_norm": 2.595116376876831, + "learning_rate": 2.4320112430483563e-06, + "loss": 0.3884, + "step": 6504 + }, + { + "epoch": 3.07565011820331, + "grad_norm": 2.685241460800171, + "learning_rate": 2.431387648927464e-06, + "loss": 0.3751, + "step": 6505 + }, + { + "epoch": 3.0761229314420806, + "grad_norm": 2.8863797187805176, + "learning_rate": 2.430764059078762e-06, + "loss": 0.3765, + "step": 6506 + }, + { + "epoch": 3.076595744680851, + "grad_norm": 3.020766019821167, + "learning_rate": 2.430140473541077e-06, + "loss": 0.362, + "step": 6507 + }, + { + "epoch": 3.0770685579196217, + "grad_norm": 2.9521167278289795, + "learning_rate": 2.42951689235324e-06, + "loss": 0.41, + "step": 6508 + }, + { + "epoch": 3.0775413711583925, + "grad_norm": 2.5844924449920654, + "learning_rate": 2.4288933155540757e-06, + "loss": 0.3258, + "step": 6509 + }, + { + "epoch": 3.078014184397163, + "grad_norm": 3.052661657333374, + "learning_rate": 2.4282697431824138e-06, + "loss": 0.363, + "step": 6510 + }, + { + "epoch": 3.0784869976359337, + "grad_norm": 3.109342575073242, + "learning_rate": 
2.427646175277081e-06, + "loss": 0.4105, + "step": 6511 + }, + { + "epoch": 3.0789598108747045, + "grad_norm": 3.3141326904296875, + "learning_rate": 2.427022611876903e-06, + "loss": 0.405, + "step": 6512 + }, + { + "epoch": 3.0794326241134753, + "grad_norm": 3.054673194885254, + "learning_rate": 2.426399053020707e-06, + "loss": 0.3532, + "step": 6513 + }, + { + "epoch": 3.079905437352246, + "grad_norm": 2.823489189147949, + "learning_rate": 2.425775498747318e-06, + "loss": 0.3762, + "step": 6514 + }, + { + "epoch": 3.0803782505910164, + "grad_norm": 2.6739792823791504, + "learning_rate": 2.425151949095565e-06, + "loss": 0.4044, + "step": 6515 + }, + { + "epoch": 3.0808510638297872, + "grad_norm": 2.7313177585601807, + "learning_rate": 2.4245284041042714e-06, + "loss": 0.3136, + "step": 6516 + }, + { + "epoch": 3.081323877068558, + "grad_norm": 3.1661181449890137, + "learning_rate": 2.4239048638122624e-06, + "loss": 0.44, + "step": 6517 + }, + { + "epoch": 3.0817966903073284, + "grad_norm": 3.326542377471924, + "learning_rate": 2.4232813282583647e-06, + "loss": 0.3798, + "step": 6518 + }, + { + "epoch": 3.082269503546099, + "grad_norm": 3.0194952487945557, + "learning_rate": 2.422657797481402e-06, + "loss": 0.423, + "step": 6519 + }, + { + "epoch": 3.08274231678487, + "grad_norm": 2.6704318523406982, + "learning_rate": 2.4220342715201995e-06, + "loss": 0.41, + "step": 6520 + }, + { + "epoch": 3.083215130023641, + "grad_norm": 3.057990312576294, + "learning_rate": 2.421410750413581e-06, + "loss": 0.4096, + "step": 6521 + }, + { + "epoch": 3.083687943262411, + "grad_norm": 2.6242079734802246, + "learning_rate": 2.4207872342003693e-06, + "loss": 0.3673, + "step": 6522 + }, + { + "epoch": 3.084160756501182, + "grad_norm": 2.933910846710205, + "learning_rate": 2.4201637229193904e-06, + "loss": 0.4018, + "step": 6523 + }, + { + "epoch": 3.0846335697399527, + "grad_norm": 2.6973681449890137, + "learning_rate": 2.4195402166094657e-06, + "loss": 0.3533, + "step": 6524 + }, + { + "epoch": 3.0851063829787235, + "grad_norm": 3.096013307571411, + "learning_rate": 2.4189167153094194e-06, + "loss": 0.3872, + "step": 6525 + }, + { + "epoch": 3.085579196217494, + "grad_norm": 3.0707414150238037, + "learning_rate": 2.4182932190580737e-06, + "loss": 0.3775, + "step": 6526 + }, + { + "epoch": 3.0860520094562647, + "grad_norm": 2.873190402984619, + "learning_rate": 2.417669727894251e-06, + "loss": 0.3144, + "step": 6527 + }, + { + "epoch": 3.0865248226950355, + "grad_norm": 2.316431999206543, + "learning_rate": 2.4170462418567732e-06, + "loss": 0.3238, + "step": 6528 + }, + { + "epoch": 3.0869976359338063, + "grad_norm": 2.3672494888305664, + "learning_rate": 2.4164227609844626e-06, + "loss": 0.3585, + "step": 6529 + }, + { + "epoch": 3.0874704491725766, + "grad_norm": 2.904538154602051, + "learning_rate": 2.415799285316139e-06, + "loss": 0.366, + "step": 6530 + }, + { + "epoch": 3.0879432624113474, + "grad_norm": 2.914602279663086, + "learning_rate": 2.415175814890626e-06, + "loss": 0.3793, + "step": 6531 + }, + { + "epoch": 3.088416075650118, + "grad_norm": 2.652005672454834, + "learning_rate": 2.4145523497467417e-06, + "loss": 0.362, + "step": 6532 + }, + { + "epoch": 3.088888888888889, + "grad_norm": 2.5137813091278076, + "learning_rate": 2.413928889923309e-06, + "loss": 0.2974, + "step": 6533 + }, + { + "epoch": 3.0893617021276594, + "grad_norm": 3.2166645526885986, + "learning_rate": 2.413305435459147e-06, + "loss": 0.4151, + "step": 6534 + }, + { + "epoch": 3.08983451536643, + "grad_norm": 
3.0506820678710938, + "learning_rate": 2.412681986393075e-06, + "loss": 0.4223, + "step": 6535 + }, + { + "epoch": 3.090307328605201, + "grad_norm": 3.035275936126709, + "learning_rate": 2.412058542763913e-06, + "loss": 0.4841, + "step": 6536 + }, + { + "epoch": 3.0907801418439718, + "grad_norm": 3.3195009231567383, + "learning_rate": 2.4114351046104793e-06, + "loss": 0.4205, + "step": 6537 + }, + { + "epoch": 3.091252955082742, + "grad_norm": 2.8700361251831055, + "learning_rate": 2.410811671971594e-06, + "loss": 0.3704, + "step": 6538 + }, + { + "epoch": 3.091725768321513, + "grad_norm": 2.900595188140869, + "learning_rate": 2.410188244886075e-06, + "loss": 0.4184, + "step": 6539 + }, + { + "epoch": 3.0921985815602837, + "grad_norm": 2.88179349899292, + "learning_rate": 2.409564823392739e-06, + "loss": 0.4156, + "step": 6540 + }, + { + "epoch": 3.0926713947990545, + "grad_norm": 2.677568197250366, + "learning_rate": 2.408941407530406e-06, + "loss": 0.4084, + "step": 6541 + }, + { + "epoch": 3.093144208037825, + "grad_norm": 3.0236027240753174, + "learning_rate": 2.408317997337892e-06, + "loss": 0.4384, + "step": 6542 + }, + { + "epoch": 3.0936170212765957, + "grad_norm": 3.1708545684814453, + "learning_rate": 2.4076945928540143e-06, + "loss": 0.3876, + "step": 6543 + }, + { + "epoch": 3.0940898345153665, + "grad_norm": 3.248821973800659, + "learning_rate": 2.40707119411759e-06, + "loss": 0.3865, + "step": 6544 + }, + { + "epoch": 3.0945626477541373, + "grad_norm": 3.0961649417877197, + "learning_rate": 2.4064478011674334e-06, + "loss": 0.3982, + "step": 6545 + }, + { + "epoch": 3.0950354609929076, + "grad_norm": 3.1989805698394775, + "learning_rate": 2.4058244140423637e-06, + "loss": 0.4777, + "step": 6546 + }, + { + "epoch": 3.0955082742316784, + "grad_norm": 2.805640459060669, + "learning_rate": 2.4052010327811933e-06, + "loss": 0.3764, + "step": 6547 + }, + { + "epoch": 3.095981087470449, + "grad_norm": 2.7225050926208496, + "learning_rate": 2.40457765742274e-06, + "loss": 0.3286, + "step": 6548 + }, + { + "epoch": 3.09645390070922, + "grad_norm": 3.119915008544922, + "learning_rate": 2.4039542880058174e-06, + "loss": 0.4463, + "step": 6549 + }, + { + "epoch": 3.0969267139479904, + "grad_norm": 2.8503530025482178, + "learning_rate": 2.4033309245692403e-06, + "loss": 0.395, + "step": 6550 + }, + { + "epoch": 3.097399527186761, + "grad_norm": 2.947504758834839, + "learning_rate": 2.4027075671518225e-06, + "loss": 0.4024, + "step": 6551 + }, + { + "epoch": 3.097872340425532, + "grad_norm": 3.170905113220215, + "learning_rate": 2.402084215792377e-06, + "loss": 0.4302, + "step": 6552 + }, + { + "epoch": 3.0983451536643027, + "grad_norm": 2.910475492477417, + "learning_rate": 2.4014608705297195e-06, + "loss": 0.4037, + "step": 6553 + }, + { + "epoch": 3.098817966903073, + "grad_norm": 2.627511978149414, + "learning_rate": 2.400837531402661e-06, + "loss": 0.3972, + "step": 6554 + }, + { + "epoch": 3.099290780141844, + "grad_norm": 2.6485681533813477, + "learning_rate": 2.4002141984500133e-06, + "loss": 0.4044, + "step": 6555 + }, + { + "epoch": 3.0997635933806147, + "grad_norm": 2.930954694747925, + "learning_rate": 2.399590871710592e-06, + "loss": 0.4214, + "step": 6556 + }, + { + "epoch": 3.1002364066193855, + "grad_norm": 2.6014554500579834, + "learning_rate": 2.3989675512232063e-06, + "loss": 0.3493, + "step": 6557 + }, + { + "epoch": 3.100709219858156, + "grad_norm": 2.899001121520996, + "learning_rate": 2.398344237026667e-06, + "loss": 0.382, + "step": 6558 + }, + { + "epoch": 
3.1011820330969266, + "grad_norm": 2.4698870182037354, + "learning_rate": 2.3977209291597876e-06, + "loss": 0.3558, + "step": 6559 + }, + { + "epoch": 3.1016548463356974, + "grad_norm": 3.2926251888275146, + "learning_rate": 2.3970976276613763e-06, + "loss": 0.4078, + "step": 6560 + }, + { + "epoch": 3.1021276595744682, + "grad_norm": 2.5306150913238525, + "learning_rate": 2.3964743325702454e-06, + "loss": 0.3657, + "step": 6561 + }, + { + "epoch": 3.1026004728132386, + "grad_norm": 2.727583408355713, + "learning_rate": 2.395851043925204e-06, + "loss": 0.3791, + "step": 6562 + }, + { + "epoch": 3.1030732860520094, + "grad_norm": 3.1403541564941406, + "learning_rate": 2.3952277617650602e-06, + "loss": 0.3934, + "step": 6563 + }, + { + "epoch": 3.10354609929078, + "grad_norm": 2.5816383361816406, + "learning_rate": 2.3946044861286256e-06, + "loss": 0.3703, + "step": 6564 + }, + { + "epoch": 3.104018912529551, + "grad_norm": 2.5742220878601074, + "learning_rate": 2.3939812170547067e-06, + "loss": 0.3628, + "step": 6565 + }, + { + "epoch": 3.1044917257683213, + "grad_norm": 2.7276530265808105, + "learning_rate": 2.393357954582113e-06, + "loss": 0.3789, + "step": 6566 + }, + { + "epoch": 3.104964539007092, + "grad_norm": 3.05595064163208, + "learning_rate": 2.3927346987496515e-06, + "loss": 0.3766, + "step": 6567 + }, + { + "epoch": 3.105437352245863, + "grad_norm": 2.786970615386963, + "learning_rate": 2.39211144959613e-06, + "loss": 0.3329, + "step": 6568 + }, + { + "epoch": 3.1059101654846337, + "grad_norm": 3.499018430709839, + "learning_rate": 2.391488207160356e-06, + "loss": 0.4175, + "step": 6569 + }, + { + "epoch": 3.106382978723404, + "grad_norm": 2.969735860824585, + "learning_rate": 2.3908649714811346e-06, + "loss": 0.3893, + "step": 6570 + }, + { + "epoch": 3.106855791962175, + "grad_norm": 3.1494929790496826, + "learning_rate": 2.3902417425972734e-06, + "loss": 0.4048, + "step": 6571 + }, + { + "epoch": 3.1073286052009457, + "grad_norm": 2.6393489837646484, + "learning_rate": 2.3896185205475782e-06, + "loss": 0.3216, + "step": 6572 + }, + { + "epoch": 3.1078014184397165, + "grad_norm": 3.6984152793884277, + "learning_rate": 2.3889953053708528e-06, + "loss": 0.3646, + "step": 6573 + }, + { + "epoch": 3.108274231678487, + "grad_norm": 3.518547534942627, + "learning_rate": 2.388372097105903e-06, + "loss": 0.3627, + "step": 6574 + }, + { + "epoch": 3.1087470449172576, + "grad_norm": 3.422043800354004, + "learning_rate": 2.3877488957915333e-06, + "loss": 0.4116, + "step": 6575 + }, + { + "epoch": 3.1092198581560284, + "grad_norm": 2.8088064193725586, + "learning_rate": 2.3871257014665486e-06, + "loss": 0.3477, + "step": 6576 + }, + { + "epoch": 3.109692671394799, + "grad_norm": 2.7877607345581055, + "learning_rate": 2.3865025141697513e-06, + "loss": 0.351, + "step": 6577 + }, + { + "epoch": 3.1101654846335696, + "grad_norm": 2.9446799755096436, + "learning_rate": 2.3858793339399433e-06, + "loss": 0.4025, + "step": 6578 + }, + { + "epoch": 3.1106382978723404, + "grad_norm": 2.886584758758545, + "learning_rate": 2.3852561608159304e-06, + "loss": 0.3765, + "step": 6579 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 3.45711088180542, + "learning_rate": 2.384632994836513e-06, + "loss": 0.3744, + "step": 6580 + }, + { + "epoch": 3.111583924349882, + "grad_norm": 2.737441301345825, + "learning_rate": 2.3840098360404916e-06, + "loss": 0.4048, + "step": 6581 + }, + { + "epoch": 3.1120567375886523, + "grad_norm": 2.742567300796509, + "learning_rate": 2.383386684466671e-06, + "loss": 
0.3717, + "step": 6582 + }, + { + "epoch": 3.112529550827423, + "grad_norm": 3.017970561981201, + "learning_rate": 2.382763540153849e-06, + "loss": 0.3922, + "step": 6583 + }, + { + "epoch": 3.113002364066194, + "grad_norm": 3.132004499435425, + "learning_rate": 2.3821404031408283e-06, + "loss": 0.3969, + "step": 6584 + }, + { + "epoch": 3.1134751773049647, + "grad_norm": 2.910820245742798, + "learning_rate": 2.3815172734664075e-06, + "loss": 0.4241, + "step": 6585 + }, + { + "epoch": 3.113947990543735, + "grad_norm": 3.0029842853546143, + "learning_rate": 2.380894151169386e-06, + "loss": 0.4007, + "step": 6586 + }, + { + "epoch": 3.114420803782506, + "grad_norm": 3.0309178829193115, + "learning_rate": 2.380271036288564e-06, + "loss": 0.3876, + "step": 6587 + }, + { + "epoch": 3.1148936170212767, + "grad_norm": 2.963204860687256, + "learning_rate": 2.379647928862739e-06, + "loss": 0.4017, + "step": 6588 + }, + { + "epoch": 3.1153664302600474, + "grad_norm": 3.0127944946289062, + "learning_rate": 2.3790248289307103e-06, + "loss": 0.3651, + "step": 6589 + }, + { + "epoch": 3.115839243498818, + "grad_norm": 2.557485580444336, + "learning_rate": 2.3784017365312755e-06, + "loss": 0.3419, + "step": 6590 + }, + { + "epoch": 3.1163120567375886, + "grad_norm": 2.8577969074249268, + "learning_rate": 2.3777786517032306e-06, + "loss": 0.372, + "step": 6591 + }, + { + "epoch": 3.1167848699763594, + "grad_norm": 2.450324058532715, + "learning_rate": 2.3771555744853735e-06, + "loss": 0.3442, + "step": 6592 + }, + { + "epoch": 3.11725768321513, + "grad_norm": 2.7939295768737793, + "learning_rate": 2.3765325049164996e-06, + "loss": 0.401, + "step": 6593 + }, + { + "epoch": 3.1177304964539005, + "grad_norm": 2.9690325260162354, + "learning_rate": 2.3759094430354056e-06, + "loss": 0.3962, + "step": 6594 + }, + { + "epoch": 3.1182033096926713, + "grad_norm": 2.7630631923675537, + "learning_rate": 2.375286388880887e-06, + "loss": 0.4126, + "step": 6595 + }, + { + "epoch": 3.118676122931442, + "grad_norm": 2.6259944438934326, + "learning_rate": 2.3746633424917366e-06, + "loss": 0.3285, + "step": 6596 + }, + { + "epoch": 3.119148936170213, + "grad_norm": 2.7107701301574707, + "learning_rate": 2.3740403039067516e-06, + "loss": 0.3636, + "step": 6597 + }, + { + "epoch": 3.1196217494089833, + "grad_norm": 2.985301971435547, + "learning_rate": 2.373417273164724e-06, + "loss": 0.3928, + "step": 6598 + }, + { + "epoch": 3.120094562647754, + "grad_norm": 3.2578976154327393, + "learning_rate": 2.3727942503044483e-06, + "loss": 0.3379, + "step": 6599 + }, + { + "epoch": 3.120567375886525, + "grad_norm": 3.1681406497955322, + "learning_rate": 2.372171235364717e-06, + "loss": 0.4023, + "step": 6600 + }, + { + "epoch": 3.1210401891252957, + "grad_norm": 3.120147705078125, + "learning_rate": 2.371548228384321e-06, + "loss": 0.4228, + "step": 6601 + }, + { + "epoch": 3.121513002364066, + "grad_norm": 2.7786099910736084, + "learning_rate": 2.3709252294020547e-06, + "loss": 0.4386, + "step": 6602 + }, + { + "epoch": 3.121985815602837, + "grad_norm": 2.698849678039551, + "learning_rate": 2.3703022384567086e-06, + "loss": 0.3861, + "step": 6603 + }, + { + "epoch": 3.1224586288416076, + "grad_norm": 2.7917959690093994, + "learning_rate": 2.3696792555870724e-06, + "loss": 0.3535, + "step": 6604 + }, + { + "epoch": 3.1229314420803784, + "grad_norm": 2.8249263763427734, + "learning_rate": 2.3690562808319385e-06, + "loss": 0.3415, + "step": 6605 + }, + { + "epoch": 3.123404255319149, + "grad_norm": 2.567458391189575, + 
"learning_rate": 2.368433314230095e-06, + "loss": 0.3827, + "step": 6606 + }, + { + "epoch": 3.1238770685579196, + "grad_norm": 2.9670443534851074, + "learning_rate": 2.3678103558203328e-06, + "loss": 0.4238, + "step": 6607 + }, + { + "epoch": 3.1243498817966904, + "grad_norm": 2.6893439292907715, + "learning_rate": 2.36718740564144e-06, + "loss": 0.3461, + "step": 6608 + }, + { + "epoch": 3.124822695035461, + "grad_norm": 3.2669708728790283, + "learning_rate": 2.3665644637322044e-06, + "loss": 0.3992, + "step": 6609 + }, + { + "epoch": 3.1252955082742315, + "grad_norm": 2.889340400695801, + "learning_rate": 2.3659415301314152e-06, + "loss": 0.3829, + "step": 6610 + }, + { + "epoch": 3.1257683215130023, + "grad_norm": 2.625603199005127, + "learning_rate": 2.3653186048778584e-06, + "loss": 0.3559, + "step": 6611 + }, + { + "epoch": 3.126241134751773, + "grad_norm": 2.8128650188446045, + "learning_rate": 2.3646956880103224e-06, + "loss": 0.4035, + "step": 6612 + }, + { + "epoch": 3.126713947990544, + "grad_norm": 3.1887412071228027, + "learning_rate": 2.3640727795675925e-06, + "loss": 0.3938, + "step": 6613 + }, + { + "epoch": 3.1271867612293143, + "grad_norm": 2.886514186859131, + "learning_rate": 2.363449879588454e-06, + "loss": 0.3504, + "step": 6614 + }, + { + "epoch": 3.127659574468085, + "grad_norm": 3.2149860858917236, + "learning_rate": 2.3628269881116937e-06, + "loss": 0.4137, + "step": 6615 + }, + { + "epoch": 3.128132387706856, + "grad_norm": 3.3155312538146973, + "learning_rate": 2.362204105176094e-06, + "loss": 0.3811, + "step": 6616 + }, + { + "epoch": 3.1286052009456267, + "grad_norm": 2.6228792667388916, + "learning_rate": 2.3615812308204415e-06, + "loss": 0.3511, + "step": 6617 + }, + { + "epoch": 3.129078014184397, + "grad_norm": 2.7686524391174316, + "learning_rate": 2.3609583650835187e-06, + "loss": 0.3722, + "step": 6618 + }, + { + "epoch": 3.129550827423168, + "grad_norm": 3.396368980407715, + "learning_rate": 2.3603355080041083e-06, + "loss": 0.4678, + "step": 6619 + }, + { + "epoch": 3.1300236406619386, + "grad_norm": 2.7329437732696533, + "learning_rate": 2.359712659620994e-06, + "loss": 0.3775, + "step": 6620 + }, + { + "epoch": 3.1304964539007094, + "grad_norm": 2.7633914947509766, + "learning_rate": 2.3590898199729567e-06, + "loss": 0.3306, + "step": 6621 + }, + { + "epoch": 3.1309692671394798, + "grad_norm": 3.020887613296509, + "learning_rate": 2.3584669890987792e-06, + "loss": 0.4121, + "step": 6622 + }, + { + "epoch": 3.1314420803782506, + "grad_norm": 2.8912103176116943, + "learning_rate": 2.3578441670372414e-06, + "loss": 0.4297, + "step": 6623 + }, + { + "epoch": 3.1319148936170214, + "grad_norm": 3.0654027462005615, + "learning_rate": 2.3572213538271234e-06, + "loss": 0.3856, + "step": 6624 + }, + { + "epoch": 3.132387706855792, + "grad_norm": 3.1126575469970703, + "learning_rate": 2.356598549507206e-06, + "loss": 0.3886, + "step": 6625 + }, + { + "epoch": 3.1328605200945625, + "grad_norm": 2.7066447734832764, + "learning_rate": 2.3559757541162687e-06, + "loss": 0.4212, + "step": 6626 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 2.876338243484497, + "learning_rate": 2.355352967693088e-06, + "loss": 0.3607, + "step": 6627 + }, + { + "epoch": 3.133806146572104, + "grad_norm": 2.9011716842651367, + "learning_rate": 2.3547301902764454e-06, + "loss": 0.428, + "step": 6628 + }, + { + "epoch": 3.134278959810875, + "grad_norm": 2.805656909942627, + "learning_rate": 2.3541074219051163e-06, + "loss": 0.4038, + "step": 6629 + }, + { + "epoch": 
3.1347517730496453, + "grad_norm": 2.89546275138855, + "learning_rate": 2.353484662617879e-06, + "loss": 0.3798, + "step": 6630 + }, + { + "epoch": 3.135224586288416, + "grad_norm": 3.0290539264678955, + "learning_rate": 2.352861912453508e-06, + "loss": 0.3916, + "step": 6631 + }, + { + "epoch": 3.135697399527187, + "grad_norm": 2.848393440246582, + "learning_rate": 2.352239171450781e-06, + "loss": 0.3423, + "step": 6632 + }, + { + "epoch": 3.1361702127659576, + "grad_norm": 2.871372938156128, + "learning_rate": 2.3516164396484737e-06, + "loss": 0.3872, + "step": 6633 + }, + { + "epoch": 3.136643026004728, + "grad_norm": 3.120682716369629, + "learning_rate": 2.3509937170853585e-06, + "loss": 0.3952, + "step": 6634 + }, + { + "epoch": 3.137115839243499, + "grad_norm": 2.6936683654785156, + "learning_rate": 2.3503710038002127e-06, + "loss": 0.3643, + "step": 6635 + }, + { + "epoch": 3.1375886524822696, + "grad_norm": 3.749519109725952, + "learning_rate": 2.349748299831808e-06, + "loss": 0.4519, + "step": 6636 + }, + { + "epoch": 3.1380614657210404, + "grad_norm": 2.8034276962280273, + "learning_rate": 2.3491256052189175e-06, + "loss": 0.401, + "step": 6637 + }, + { + "epoch": 3.1385342789598107, + "grad_norm": 2.6201975345611572, + "learning_rate": 2.348502920000314e-06, + "loss": 0.3491, + "step": 6638 + }, + { + "epoch": 3.1390070921985815, + "grad_norm": 2.890552043914795, + "learning_rate": 2.347880244214769e-06, + "loss": 0.3439, + "step": 6639 + }, + { + "epoch": 3.1394799054373523, + "grad_norm": 2.899594306945801, + "learning_rate": 2.347257577901055e-06, + "loss": 0.3707, + "step": 6640 + }, + { + "epoch": 3.139952718676123, + "grad_norm": 2.8660130500793457, + "learning_rate": 2.346634921097942e-06, + "loss": 0.3582, + "step": 6641 + }, + { + "epoch": 3.1404255319148935, + "grad_norm": 2.9805452823638916, + "learning_rate": 2.346012273844199e-06, + "loss": 0.3466, + "step": 6642 + }, + { + "epoch": 3.1408983451536643, + "grad_norm": 3.162977457046509, + "learning_rate": 2.345389636178597e-06, + "loss": 0.3657, + "step": 6643 + }, + { + "epoch": 3.141371158392435, + "grad_norm": 2.838988780975342, + "learning_rate": 2.344767008139904e-06, + "loss": 0.3826, + "step": 6644 + }, + { + "epoch": 3.141843971631206, + "grad_norm": 3.8427252769470215, + "learning_rate": 2.3441443897668893e-06, + "loss": 0.3697, + "step": 6645 + }, + { + "epoch": 3.1423167848699762, + "grad_norm": 2.9233880043029785, + "learning_rate": 2.34352178109832e-06, + "loss": 0.3481, + "step": 6646 + }, + { + "epoch": 3.142789598108747, + "grad_norm": 2.5840606689453125, + "learning_rate": 2.342899182172963e-06, + "loss": 0.3746, + "step": 6647 + }, + { + "epoch": 3.143262411347518, + "grad_norm": 2.806793451309204, + "learning_rate": 2.3422765930295857e-06, + "loss": 0.419, + "step": 6648 + }, + { + "epoch": 3.1437352245862886, + "grad_norm": 2.803952693939209, + "learning_rate": 2.3416540137069522e-06, + "loss": 0.3965, + "step": 6649 + }, + { + "epoch": 3.144208037825059, + "grad_norm": 2.8416364192962646, + "learning_rate": 2.3410314442438297e-06, + "loss": 0.4317, + "step": 6650 + }, + { + "epoch": 3.1446808510638298, + "grad_norm": 2.9956440925598145, + "learning_rate": 2.3404088846789826e-06, + "loss": 0.4268, + "step": 6651 + }, + { + "epoch": 3.1451536643026006, + "grad_norm": 3.1649162769317627, + "learning_rate": 2.339786335051173e-06, + "loss": 0.4149, + "step": 6652 + }, + { + "epoch": 3.145626477541371, + "grad_norm": 2.909107208251953, + "learning_rate": 2.3391637953991673e-06, + "loss": 0.4085, + 
"step": 6653 + }, + { + "epoch": 3.1460992907801417, + "grad_norm": 2.416755199432373, + "learning_rate": 2.3385412657617264e-06, + "loss": 0.3585, + "step": 6654 + }, + { + "epoch": 3.1465721040189125, + "grad_norm": 3.1122629642486572, + "learning_rate": 2.3379187461776123e-06, + "loss": 0.3876, + "step": 6655 + }, + { + "epoch": 3.1470449172576833, + "grad_norm": 2.6854658126831055, + "learning_rate": 2.337296236685588e-06, + "loss": 0.3125, + "step": 6656 + }, + { + "epoch": 3.147517730496454, + "grad_norm": 2.779876708984375, + "learning_rate": 2.3366737373244127e-06, + "loss": 0.3688, + "step": 6657 + }, + { + "epoch": 3.1479905437352245, + "grad_norm": 3.1444761753082275, + "learning_rate": 2.3360512481328484e-06, + "loss": 0.4089, + "step": 6658 + }, + { + "epoch": 3.1484633569739953, + "grad_norm": 2.71445894241333, + "learning_rate": 2.335428769149654e-06, + "loss": 0.3532, + "step": 6659 + }, + { + "epoch": 3.148936170212766, + "grad_norm": 2.9788241386413574, + "learning_rate": 2.334806300413587e-06, + "loss": 0.4238, + "step": 6660 + }, + { + "epoch": 3.1494089834515364, + "grad_norm": 3.0118865966796875, + "learning_rate": 2.334183841963409e-06, + "loss": 0.4437, + "step": 6661 + }, + { + "epoch": 3.149881796690307, + "grad_norm": 3.2229537963867188, + "learning_rate": 2.3335613938378753e-06, + "loss": 0.3582, + "step": 6662 + }, + { + "epoch": 3.150354609929078, + "grad_norm": 2.734997034072876, + "learning_rate": 2.3329389560757447e-06, + "loss": 0.3737, + "step": 6663 + }, + { + "epoch": 3.150827423167849, + "grad_norm": 3.4746382236480713, + "learning_rate": 2.3323165287157724e-06, + "loss": 0.3516, + "step": 6664 + }, + { + "epoch": 3.1513002364066196, + "grad_norm": 2.9428153038024902, + "learning_rate": 2.3316941117967137e-06, + "loss": 0.3985, + "step": 6665 + }, + { + "epoch": 3.15177304964539, + "grad_norm": 2.6840944290161133, + "learning_rate": 2.3310717053573257e-06, + "loss": 0.3274, + "step": 6666 + }, + { + "epoch": 3.1522458628841608, + "grad_norm": 3.048335552215576, + "learning_rate": 2.3304493094363607e-06, + "loss": 0.4262, + "step": 6667 + }, + { + "epoch": 3.1527186761229316, + "grad_norm": 2.87381911277771, + "learning_rate": 2.329826924072575e-06, + "loss": 0.3867, + "step": 6668 + }, + { + "epoch": 3.153191489361702, + "grad_norm": 2.6236355304718018, + "learning_rate": 2.32920454930472e-06, + "loss": 0.3649, + "step": 6669 + }, + { + "epoch": 3.1536643026004727, + "grad_norm": 3.1326401233673096, + "learning_rate": 2.328582185171549e-06, + "loss": 0.3451, + "step": 6670 + }, + { + "epoch": 3.1541371158392435, + "grad_norm": 3.011826992034912, + "learning_rate": 2.327959831711814e-06, + "loss": 0.4118, + "step": 6671 + }, + { + "epoch": 3.1546099290780143, + "grad_norm": 2.834933280944824, + "learning_rate": 2.3273374889642646e-06, + "loss": 0.4378, + "step": 6672 + }, + { + "epoch": 3.155082742316785, + "grad_norm": 3.085756778717041, + "learning_rate": 2.326715156967654e-06, + "loss": 0.4389, + "step": 6673 + }, + { + "epoch": 3.1555555555555554, + "grad_norm": 2.7912232875823975, + "learning_rate": 2.3260928357607305e-06, + "loss": 0.3352, + "step": 6674 + }, + { + "epoch": 3.1560283687943262, + "grad_norm": 2.7643113136291504, + "learning_rate": 2.3254705253822424e-06, + "loss": 0.3449, + "step": 6675 + }, + { + "epoch": 3.156501182033097, + "grad_norm": 2.8984663486480713, + "learning_rate": 2.3248482258709405e-06, + "loss": 0.4231, + "step": 6676 + }, + { + "epoch": 3.1569739952718674, + "grad_norm": 3.214996814727783, + "learning_rate": 
2.324225937265572e-06, + "loss": 0.4616, + "step": 6677 + }, + { + "epoch": 3.157446808510638, + "grad_norm": 2.58534836769104, + "learning_rate": 2.3236036596048827e-06, + "loss": 0.3264, + "step": 6678 + }, + { + "epoch": 3.157919621749409, + "grad_norm": 2.790714740753174, + "learning_rate": 2.322981392927621e-06, + "loss": 0.4086, + "step": 6679 + }, + { + "epoch": 3.15839243498818, + "grad_norm": 2.726029872894287, + "learning_rate": 2.32235913727253e-06, + "loss": 0.3344, + "step": 6680 + }, + { + "epoch": 3.1588652482269506, + "grad_norm": 2.8392906188964844, + "learning_rate": 2.3217368926783583e-06, + "loss": 0.3468, + "step": 6681 + }, + { + "epoch": 3.159338061465721, + "grad_norm": 2.9796900749206543, + "learning_rate": 2.321114659183848e-06, + "loss": 0.4051, + "step": 6682 + }, + { + "epoch": 3.1598108747044917, + "grad_norm": 3.0399303436279297, + "learning_rate": 2.320492436827743e-06, + "loss": 0.402, + "step": 6683 + }, + { + "epoch": 3.1602836879432625, + "grad_norm": 2.9295334815979004, + "learning_rate": 2.3198702256487877e-06, + "loss": 0.3975, + "step": 6684 + }, + { + "epoch": 3.160756501182033, + "grad_norm": 2.881552219390869, + "learning_rate": 2.319248025685723e-06, + "loss": 0.4342, + "step": 6685 + }, + { + "epoch": 3.1612293144208037, + "grad_norm": 3.0711705684661865, + "learning_rate": 2.3186258369772916e-06, + "loss": 0.3829, + "step": 6686 + }, + { + "epoch": 3.1617021276595745, + "grad_norm": 2.6614468097686768, + "learning_rate": 2.3180036595622345e-06, + "loss": 0.3473, + "step": 6687 + }, + { + "epoch": 3.1621749408983453, + "grad_norm": 3.0084400177001953, + "learning_rate": 2.3173814934792903e-06, + "loss": 0.4363, + "step": 6688 + }, + { + "epoch": 3.162647754137116, + "grad_norm": 2.9340786933898926, + "learning_rate": 2.3167593387672006e-06, + "loss": 0.4235, + "step": 6689 + }, + { + "epoch": 3.1631205673758864, + "grad_norm": 3.0765340328216553, + "learning_rate": 2.3161371954647023e-06, + "loss": 0.4601, + "step": 6690 + }, + { + "epoch": 3.1635933806146572, + "grad_norm": 2.816096067428589, + "learning_rate": 2.3155150636105356e-06, + "loss": 0.3764, + "step": 6691 + }, + { + "epoch": 3.164066193853428, + "grad_norm": 3.0476551055908203, + "learning_rate": 2.3148929432434372e-06, + "loss": 0.3956, + "step": 6692 + }, + { + "epoch": 3.1645390070921984, + "grad_norm": 2.628934860229492, + "learning_rate": 2.314270834402143e-06, + "loss": 0.3551, + "step": 6693 + }, + { + "epoch": 3.165011820330969, + "grad_norm": 3.3933539390563965, + "learning_rate": 2.31364873712539e-06, + "loss": 0.4523, + "step": 6694 + }, + { + "epoch": 3.16548463356974, + "grad_norm": 3.256176233291626, + "learning_rate": 2.313026651451912e-06, + "loss": 0.417, + "step": 6695 + }, + { + "epoch": 3.1659574468085108, + "grad_norm": 2.92926025390625, + "learning_rate": 2.312404577420445e-06, + "loss": 0.4365, + "step": 6696 + }, + { + "epoch": 3.166430260047281, + "grad_norm": 2.9514732360839844, + "learning_rate": 2.3117825150697233e-06, + "loss": 0.4632, + "step": 6697 + }, + { + "epoch": 3.166903073286052, + "grad_norm": 2.8635852336883545, + "learning_rate": 2.3111604644384778e-06, + "loss": 0.4018, + "step": 6698 + }, + { + "epoch": 3.1673758865248227, + "grad_norm": 2.5937020778656006, + "learning_rate": 2.3105384255654433e-06, + "loss": 0.3682, + "step": 6699 + }, + { + "epoch": 3.1678486997635935, + "grad_norm": 2.857851266860962, + "learning_rate": 2.3099163984893497e-06, + "loss": 0.3293, + "step": 6700 + }, + { + "epoch": 3.168321513002364, + "grad_norm": 
2.5903947353363037, + "learning_rate": 2.3092943832489283e-06, + "loss": 0.3543, + "step": 6701 + }, + { + "epoch": 3.1687943262411347, + "grad_norm": 2.9783661365509033, + "learning_rate": 2.30867237988291e-06, + "loss": 0.3707, + "step": 6702 + }, + { + "epoch": 3.1692671394799055, + "grad_norm": 3.0133306980133057, + "learning_rate": 2.3080503884300225e-06, + "loss": 0.439, + "step": 6703 + }, + { + "epoch": 3.1697399527186763, + "grad_norm": 2.7119483947753906, + "learning_rate": 2.3074284089289968e-06, + "loss": 0.3956, + "step": 6704 + }, + { + "epoch": 3.1702127659574466, + "grad_norm": 3.0499672889709473, + "learning_rate": 2.3068064414185597e-06, + "loss": 0.434, + "step": 6705 + }, + { + "epoch": 3.1706855791962174, + "grad_norm": 2.862807512283325, + "learning_rate": 2.306184485937437e-06, + "loss": 0.3644, + "step": 6706 + }, + { + "epoch": 3.171158392434988, + "grad_norm": 2.9445149898529053, + "learning_rate": 2.305562542524358e-06, + "loss": 0.3894, + "step": 6707 + }, + { + "epoch": 3.171631205673759, + "grad_norm": 3.0442428588867188, + "learning_rate": 2.304940611218046e-06, + "loss": 0.3816, + "step": 6708 + }, + { + "epoch": 3.1721040189125294, + "grad_norm": 2.7101798057556152, + "learning_rate": 2.304318692057228e-06, + "loss": 0.3708, + "step": 6709 + }, + { + "epoch": 3.1725768321513, + "grad_norm": 2.7874515056610107, + "learning_rate": 2.303696785080626e-06, + "loss": 0.404, + "step": 6710 + }, + { + "epoch": 3.173049645390071, + "grad_norm": 3.0438833236694336, + "learning_rate": 2.303074890326964e-06, + "loss": 0.4342, + "step": 6711 + }, + { + "epoch": 3.1735224586288417, + "grad_norm": 2.6079208850860596, + "learning_rate": 2.302453007834966e-06, + "loss": 0.3725, + "step": 6712 + }, + { + "epoch": 3.173995271867612, + "grad_norm": 3.3353021144866943, + "learning_rate": 2.3018311376433523e-06, + "loss": 0.4372, + "step": 6713 + }, + { + "epoch": 3.174468085106383, + "grad_norm": 2.840771436691284, + "learning_rate": 2.3012092797908454e-06, + "loss": 0.3979, + "step": 6714 + }, + { + "epoch": 3.1749408983451537, + "grad_norm": 3.0474867820739746, + "learning_rate": 2.3005874343161648e-06, + "loss": 0.4077, + "step": 6715 + }, + { + "epoch": 3.1754137115839245, + "grad_norm": 2.849835157394409, + "learning_rate": 2.2999656012580296e-06, + "loss": 0.393, + "step": 6716 + }, + { + "epoch": 3.175886524822695, + "grad_norm": 2.6361217498779297, + "learning_rate": 2.29934378065516e-06, + "loss": 0.3894, + "step": 6717 + }, + { + "epoch": 3.1763593380614656, + "grad_norm": 3.139700174331665, + "learning_rate": 2.298721972546273e-06, + "loss": 0.36, + "step": 6718 + }, + { + "epoch": 3.1768321513002364, + "grad_norm": 2.987861156463623, + "learning_rate": 2.298100176970087e-06, + "loss": 0.4306, + "step": 6719 + }, + { + "epoch": 3.1773049645390072, + "grad_norm": 2.6403157711029053, + "learning_rate": 2.297478393965317e-06, + "loss": 0.3978, + "step": 6720 + }, + { + "epoch": 3.1777777777777776, + "grad_norm": 2.819519281387329, + "learning_rate": 2.296856623570679e-06, + "loss": 0.3467, + "step": 6721 + }, + { + "epoch": 3.1782505910165484, + "grad_norm": 2.7195916175842285, + "learning_rate": 2.296234865824889e-06, + "loss": 0.3685, + "step": 6722 + }, + { + "epoch": 3.178723404255319, + "grad_norm": 3.015488624572754, + "learning_rate": 2.2956131207666604e-06, + "loss": 0.3751, + "step": 6723 + }, + { + "epoch": 3.17919621749409, + "grad_norm": 2.9283792972564697, + "learning_rate": 2.2949913884347055e-06, + "loss": 0.3261, + "step": 6724 + }, + { + "epoch": 
3.1796690307328603, + "grad_norm": 3.358991861343384, + "learning_rate": 2.294369668867739e-06, + "loss": 0.4505, + "step": 6725 + }, + { + "epoch": 3.180141843971631, + "grad_norm": 2.9143471717834473, + "learning_rate": 2.2937479621044712e-06, + "loss": 0.3612, + "step": 6726 + }, + { + "epoch": 3.180614657210402, + "grad_norm": 3.020519495010376, + "learning_rate": 2.2931262681836136e-06, + "loss": 0.4241, + "step": 6727 + }, + { + "epoch": 3.1810874704491727, + "grad_norm": 2.693737745285034, + "learning_rate": 2.2925045871438765e-06, + "loss": 0.366, + "step": 6728 + }, + { + "epoch": 3.181560283687943, + "grad_norm": 2.9427194595336914, + "learning_rate": 2.2918829190239677e-06, + "loss": 0.3741, + "step": 6729 + }, + { + "epoch": 3.182033096926714, + "grad_norm": 2.529383659362793, + "learning_rate": 2.291261263862598e-06, + "loss": 0.4469, + "step": 6730 + }, + { + "epoch": 3.1825059101654847, + "grad_norm": 3.0097804069519043, + "learning_rate": 2.290639621698473e-06, + "loss": 0.4167, + "step": 6731 + }, + { + "epoch": 3.1829787234042555, + "grad_norm": 2.7047014236450195, + "learning_rate": 2.290017992570302e-06, + "loss": 0.3615, + "step": 6732 + }, + { + "epoch": 3.183451536643026, + "grad_norm": 2.676964282989502, + "learning_rate": 2.2893963765167897e-06, + "loss": 0.3722, + "step": 6733 + }, + { + "epoch": 3.1839243498817966, + "grad_norm": 3.0529778003692627, + "learning_rate": 2.2887747735766413e-06, + "loss": 0.395, + "step": 6734 + }, + { + "epoch": 3.1843971631205674, + "grad_norm": 2.826725721359253, + "learning_rate": 2.288153183788562e-06, + "loss": 0.3713, + "step": 6735 + }, + { + "epoch": 3.184869976359338, + "grad_norm": 2.8689587116241455, + "learning_rate": 2.287531607191254e-06, + "loss": 0.4383, + "step": 6736 + }, + { + "epoch": 3.1853427895981086, + "grad_norm": 3.1835694313049316, + "learning_rate": 2.2869100438234217e-06, + "loss": 0.3908, + "step": 6737 + }, + { + "epoch": 3.1858156028368794, + "grad_norm": 3.227262020111084, + "learning_rate": 2.286288493723767e-06, + "loss": 0.3549, + "step": 6738 + }, + { + "epoch": 3.18628841607565, + "grad_norm": 2.7543468475341797, + "learning_rate": 2.2856669569309896e-06, + "loss": 0.351, + "step": 6739 + }, + { + "epoch": 3.186761229314421, + "grad_norm": 2.5381555557250977, + "learning_rate": 2.2850454334837923e-06, + "loss": 0.3473, + "step": 6740 + }, + { + "epoch": 3.1872340425531913, + "grad_norm": 2.785923957824707, + "learning_rate": 2.284423923420872e-06, + "loss": 0.4144, + "step": 6741 + }, + { + "epoch": 3.187706855791962, + "grad_norm": 2.583853006362915, + "learning_rate": 2.28380242678093e-06, + "loss": 0.3088, + "step": 6742 + }, + { + "epoch": 3.188179669030733, + "grad_norm": 2.604647159576416, + "learning_rate": 2.2831809436026627e-06, + "loss": 0.3474, + "step": 6743 + }, + { + "epoch": 3.1886524822695037, + "grad_norm": 6.13611364364624, + "learning_rate": 2.2825594739247662e-06, + "loss": 0.4089, + "step": 6744 + }, + { + "epoch": 3.189125295508274, + "grad_norm": 3.034011125564575, + "learning_rate": 2.281938017785939e-06, + "loss": 0.4569, + "step": 6745 + }, + { + "epoch": 3.189598108747045, + "grad_norm": 2.9352638721466064, + "learning_rate": 2.281316575224874e-06, + "loss": 0.4293, + "step": 6746 + }, + { + "epoch": 3.1900709219858157, + "grad_norm": 3.860957384109497, + "learning_rate": 2.280695146280268e-06, + "loss": 0.4082, + "step": 6747 + }, + { + "epoch": 3.1905437352245865, + "grad_norm": 2.8131468296051025, + "learning_rate": 2.280073730990814e-06, + "loss": 0.3194, + "step": 
6748 + }, + { + "epoch": 3.191016548463357, + "grad_norm": 3.1310737133026123, + "learning_rate": 2.2794523293952033e-06, + "loss": 0.4454, + "step": 6749 + }, + { + "epoch": 3.1914893617021276, + "grad_norm": 3.065091133117676, + "learning_rate": 2.27883094153213e-06, + "loss": 0.3789, + "step": 6750 + }, + { + "epoch": 3.1919621749408984, + "grad_norm": 3.315216541290283, + "learning_rate": 2.278209567440284e-06, + "loss": 0.4037, + "step": 6751 + }, + { + "epoch": 3.192434988179669, + "grad_norm": 3.0228476524353027, + "learning_rate": 2.2775882071583546e-06, + "loss": 0.3652, + "step": 6752 + }, + { + "epoch": 3.1929078014184396, + "grad_norm": 3.703540802001953, + "learning_rate": 2.2769668607250336e-06, + "loss": 0.3477, + "step": 6753 + }, + { + "epoch": 3.1933806146572103, + "grad_norm": 2.952481508255005, + "learning_rate": 2.2763455281790065e-06, + "loss": 0.4026, + "step": 6754 + }, + { + "epoch": 3.193853427895981, + "grad_norm": 2.5798189640045166, + "learning_rate": 2.275724209558965e-06, + "loss": 0.3475, + "step": 6755 + }, + { + "epoch": 3.194326241134752, + "grad_norm": 2.599669933319092, + "learning_rate": 2.2751029049035923e-06, + "loss": 0.3499, + "step": 6756 + }, + { + "epoch": 3.1947990543735223, + "grad_norm": 3.0463781356811523, + "learning_rate": 2.2744816142515756e-06, + "loss": 0.3927, + "step": 6757 + }, + { + "epoch": 3.195271867612293, + "grad_norm": 3.134199380874634, + "learning_rate": 2.2738603376416003e-06, + "loss": 0.3957, + "step": 6758 + }, + { + "epoch": 3.195744680851064, + "grad_norm": 3.1326372623443604, + "learning_rate": 2.273239075112349e-06, + "loss": 0.4305, + "step": 6759 + }, + { + "epoch": 3.1962174940898347, + "grad_norm": 2.847128391265869, + "learning_rate": 2.2726178267025072e-06, + "loss": 0.3825, + "step": 6760 + }, + { + "epoch": 3.196690307328605, + "grad_norm": 2.697584629058838, + "learning_rate": 2.2719965924507566e-06, + "loss": 0.3517, + "step": 6761 + }, + { + "epoch": 3.197163120567376, + "grad_norm": 2.881446599960327, + "learning_rate": 2.271375372395777e-06, + "loss": 0.3791, + "step": 6762 + }, + { + "epoch": 3.1976359338061466, + "grad_norm": 3.085054874420166, + "learning_rate": 2.270754166576252e-06, + "loss": 0.4324, + "step": 6763 + }, + { + "epoch": 3.1981087470449174, + "grad_norm": 3.3494462966918945, + "learning_rate": 2.270132975030859e-06, + "loss": 0.4242, + "step": 6764 + }, + { + "epoch": 3.198581560283688, + "grad_norm": 2.8617660999298096, + "learning_rate": 2.2695117977982785e-06, + "loss": 0.3563, + "step": 6765 + }, + { + "epoch": 3.1990543735224586, + "grad_norm": 2.7437968254089355, + "learning_rate": 2.2688906349171873e-06, + "loss": 0.4042, + "step": 6766 + }, + { + "epoch": 3.1995271867612294, + "grad_norm": 3.1129143238067627, + "learning_rate": 2.268269486426262e-06, + "loss": 0.3761, + "step": 6767 + }, + { + "epoch": 3.2, + "grad_norm": 3.32441782951355, + "learning_rate": 2.2676483523641807e-06, + "loss": 0.4439, + "step": 6768 + }, + { + "epoch": 3.2004728132387705, + "grad_norm": 2.8744730949401855, + "learning_rate": 2.267027232769617e-06, + "loss": 0.4015, + "step": 6769 + }, + { + "epoch": 3.2009456264775413, + "grad_norm": 3.6283397674560547, + "learning_rate": 2.2664061276812465e-06, + "loss": 0.3634, + "step": 6770 + }, + { + "epoch": 3.201418439716312, + "grad_norm": 2.7826597690582275, + "learning_rate": 2.2657850371377426e-06, + "loss": 0.3178, + "step": 6771 + }, + { + "epoch": 3.201891252955083, + "grad_norm": 2.668173313140869, + "learning_rate": 2.265163961177776e-06, + 
"loss": 0.3662, + "step": 6772 + }, + { + "epoch": 3.2023640661938533, + "grad_norm": 2.868441104888916, + "learning_rate": 2.264542899840021e-06, + "loss": 0.4235, + "step": 6773 + }, + { + "epoch": 3.202836879432624, + "grad_norm": 3.2715935707092285, + "learning_rate": 2.263921853163147e-06, + "loss": 0.4741, + "step": 6774 + }, + { + "epoch": 3.203309692671395, + "grad_norm": 2.8647544384002686, + "learning_rate": 2.2633008211858233e-06, + "loss": 0.3885, + "step": 6775 + }, + { + "epoch": 3.2037825059101657, + "grad_norm": 3.070164680480957, + "learning_rate": 2.2626798039467207e-06, + "loss": 0.4191, + "step": 6776 + }, + { + "epoch": 3.204255319148936, + "grad_norm": 2.846686840057373, + "learning_rate": 2.262058801484505e-06, + "loss": 0.3619, + "step": 6777 + }, + { + "epoch": 3.204728132387707, + "grad_norm": 2.767031192779541, + "learning_rate": 2.261437813837845e-06, + "loss": 0.3248, + "step": 6778 + }, + { + "epoch": 3.2052009456264776, + "grad_norm": 2.6819260120391846, + "learning_rate": 2.2608168410454065e-06, + "loss": 0.3871, + "step": 6779 + }, + { + "epoch": 3.2056737588652484, + "grad_norm": 3.1176788806915283, + "learning_rate": 2.260195883145854e-06, + "loss": 0.3929, + "step": 6780 + }, + { + "epoch": 3.2061465721040188, + "grad_norm": 3.143209457397461, + "learning_rate": 2.2595749401778524e-06, + "loss": 0.4188, + "step": 6781 + }, + { + "epoch": 3.2066193853427896, + "grad_norm": 2.9685657024383545, + "learning_rate": 2.2589540121800647e-06, + "loss": 0.4049, + "step": 6782 + }, + { + "epoch": 3.2070921985815604, + "grad_norm": 2.6853368282318115, + "learning_rate": 2.258333099191155e-06, + "loss": 0.349, + "step": 6783 + }, + { + "epoch": 3.207565011820331, + "grad_norm": 2.8418309688568115, + "learning_rate": 2.257712201249783e-06, + "loss": 0.4121, + "step": 6784 + }, + { + "epoch": 3.2080378250591015, + "grad_norm": 2.9441449642181396, + "learning_rate": 2.2570913183946085e-06, + "loss": 0.3846, + "step": 6785 + }, + { + "epoch": 3.2085106382978723, + "grad_norm": 2.9956493377685547, + "learning_rate": 2.256470450664294e-06, + "loss": 0.3941, + "step": 6786 + }, + { + "epoch": 3.208983451536643, + "grad_norm": 3.1774401664733887, + "learning_rate": 2.255849598097496e-06, + "loss": 0.4252, + "step": 6787 + }, + { + "epoch": 3.209456264775414, + "grad_norm": 2.8948934078216553, + "learning_rate": 2.255228760732873e-06, + "loss": 0.3963, + "step": 6788 + }, + { + "epoch": 3.2099290780141843, + "grad_norm": 3.440021276473999, + "learning_rate": 2.2546079386090825e-06, + "loss": 0.3777, + "step": 6789 + }, + { + "epoch": 3.210401891252955, + "grad_norm": 3.1573195457458496, + "learning_rate": 2.253987131764779e-06, + "loss": 0.3896, + "step": 6790 + }, + { + "epoch": 3.210874704491726, + "grad_norm": 3.4218719005584717, + "learning_rate": 2.2533663402386183e-06, + "loss": 0.3979, + "step": 6791 + }, + { + "epoch": 3.2113475177304966, + "grad_norm": 3.3442487716674805, + "learning_rate": 2.252745564069253e-06, + "loss": 0.406, + "step": 6792 + }, + { + "epoch": 3.211820330969267, + "grad_norm": 2.6089327335357666, + "learning_rate": 2.2521248032953387e-06, + "loss": 0.3539, + "step": 6793 + }, + { + "epoch": 3.212293144208038, + "grad_norm": 3.8015971183776855, + "learning_rate": 2.251504057955526e-06, + "loss": 0.4184, + "step": 6794 + }, + { + "epoch": 3.2127659574468086, + "grad_norm": 3.797565460205078, + "learning_rate": 2.250883328088465e-06, + "loss": 0.3392, + "step": 6795 + }, + { + "epoch": 3.2132387706855794, + "grad_norm": 3.290762186050415, + 
"learning_rate": 2.2502626137328077e-06, + "loss": 0.3726, + "step": 6796 + }, + { + "epoch": 3.2137115839243497, + "grad_norm": 3.149158000946045, + "learning_rate": 2.2496419149272023e-06, + "loss": 0.3869, + "step": 6797 + }, + { + "epoch": 3.2141843971631205, + "grad_norm": 2.652902364730835, + "learning_rate": 2.2490212317102964e-06, + "loss": 0.3256, + "step": 6798 + }, + { + "epoch": 3.2146572104018913, + "grad_norm": 3.3039770126342773, + "learning_rate": 2.248400564120739e-06, + "loss": 0.4231, + "step": 6799 + }, + { + "epoch": 3.215130023640662, + "grad_norm": 3.0190038681030273, + "learning_rate": 2.247779912197174e-06, + "loss": 0.4319, + "step": 6800 + }, + { + "epoch": 3.2156028368794325, + "grad_norm": 2.861393690109253, + "learning_rate": 2.2471592759782485e-06, + "loss": 0.465, + "step": 6801 + }, + { + "epoch": 3.2160756501182033, + "grad_norm": 2.7796146869659424, + "learning_rate": 2.246538655502606e-06, + "loss": 0.3896, + "step": 6802 + }, + { + "epoch": 3.216548463356974, + "grad_norm": 3.1849005222320557, + "learning_rate": 2.24591805080889e-06, + "loss": 0.3782, + "step": 6803 + }, + { + "epoch": 3.217021276595745, + "grad_norm": 3.076164960861206, + "learning_rate": 2.2452974619357435e-06, + "loss": 0.4023, + "step": 6804 + }, + { + "epoch": 3.2174940898345152, + "grad_norm": 2.7006006240844727, + "learning_rate": 2.2446768889218064e-06, + "loss": 0.3902, + "step": 6805 + }, + { + "epoch": 3.217966903073286, + "grad_norm": 2.9310474395751953, + "learning_rate": 2.2440563318057205e-06, + "loss": 0.366, + "step": 6806 + }, + { + "epoch": 3.218439716312057, + "grad_norm": 3.057248592376709, + "learning_rate": 2.2434357906261246e-06, + "loss": 0.4042, + "step": 6807 + }, + { + "epoch": 3.2189125295508276, + "grad_norm": 3.3720197677612305, + "learning_rate": 2.242815265421656e-06, + "loss": 0.3816, + "step": 6808 + }, + { + "epoch": 3.219385342789598, + "grad_norm": 2.9626352787017822, + "learning_rate": 2.2421947562309545e-06, + "loss": 0.363, + "step": 6809 + }, + { + "epoch": 3.219858156028369, + "grad_norm": 2.7848782539367676, + "learning_rate": 2.2415742630926533e-06, + "loss": 0.3597, + "step": 6810 + }, + { + "epoch": 3.2203309692671396, + "grad_norm": 2.757319450378418, + "learning_rate": 2.2409537860453913e-06, + "loss": 0.3304, + "step": 6811 + }, + { + "epoch": 3.2208037825059104, + "grad_norm": 2.7765560150146484, + "learning_rate": 2.240333325127801e-06, + "loss": 0.3896, + "step": 6812 + }, + { + "epoch": 3.2212765957446807, + "grad_norm": 2.9882447719573975, + "learning_rate": 2.239712880378515e-06, + "loss": 0.4004, + "step": 6813 + }, + { + "epoch": 3.2217494089834515, + "grad_norm": 2.8551244735717773, + "learning_rate": 2.2390924518361673e-06, + "loss": 0.4167, + "step": 6814 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 2.8051679134368896, + "learning_rate": 2.2384720395393878e-06, + "loss": 0.3319, + "step": 6815 + }, + { + "epoch": 3.222695035460993, + "grad_norm": 3.1172873973846436, + "learning_rate": 2.2378516435268086e-06, + "loss": 0.379, + "step": 6816 + }, + { + "epoch": 3.2231678486997635, + "grad_norm": 3.0282177925109863, + "learning_rate": 2.237231263837058e-06, + "loss": 0.3855, + "step": 6817 + }, + { + "epoch": 3.2236406619385343, + "grad_norm": 2.7156803607940674, + "learning_rate": 2.236610900508763e-06, + "loss": 0.4062, + "step": 6818 + }, + { + "epoch": 3.224113475177305, + "grad_norm": 2.721327781677246, + "learning_rate": 2.235990553580554e-06, + "loss": 0.3726, + "step": 6819 + }, + { + "epoch": 
3.2245862884160754, + "grad_norm": 2.881181240081787, + "learning_rate": 2.235370223091055e-06, + "loss": 0.421, + "step": 6820 + }, + { + "epoch": 3.225059101654846, + "grad_norm": 2.8074657917022705, + "learning_rate": 2.234749909078892e-06, + "loss": 0.3628, + "step": 6821 + }, + { + "epoch": 3.225531914893617, + "grad_norm": 2.8781638145446777, + "learning_rate": 2.234129611582689e-06, + "loss": 0.3857, + "step": 6822 + }, + { + "epoch": 3.226004728132388, + "grad_norm": 2.9473299980163574, + "learning_rate": 2.233509330641068e-06, + "loss": 0.4358, + "step": 6823 + }, + { + "epoch": 3.2264775413711586, + "grad_norm": 3.261209011077881, + "learning_rate": 2.2328890662926543e-06, + "loss": 0.4115, + "step": 6824 + }, + { + "epoch": 3.226950354609929, + "grad_norm": 3.2796943187713623, + "learning_rate": 2.232268818576067e-06, + "loss": 0.3846, + "step": 6825 + }, + { + "epoch": 3.2274231678486998, + "grad_norm": 3.1083059310913086, + "learning_rate": 2.2316485875299247e-06, + "loss": 0.3452, + "step": 6826 + }, + { + "epoch": 3.2278959810874706, + "grad_norm": 2.7947003841400146, + "learning_rate": 2.23102837319285e-06, + "loss": 0.3733, + "step": 6827 + }, + { + "epoch": 3.228368794326241, + "grad_norm": 2.792348861694336, + "learning_rate": 2.230408175603458e-06, + "loss": 0.411, + "step": 6828 + }, + { + "epoch": 3.2288416075650117, + "grad_norm": 2.8563876152038574, + "learning_rate": 2.229787994800368e-06, + "loss": 0.4303, + "step": 6829 + }, + { + "epoch": 3.2293144208037825, + "grad_norm": 2.9573659896850586, + "learning_rate": 2.2291678308221943e-06, + "loss": 0.4124, + "step": 6830 + }, + { + "epoch": 3.2297872340425533, + "grad_norm": 2.8554422855377197, + "learning_rate": 2.228547683707551e-06, + "loss": 0.3715, + "step": 6831 + }, + { + "epoch": 3.230260047281324, + "grad_norm": 2.9457242488861084, + "learning_rate": 2.227927553495054e-06, + "loss": 0.4339, + "step": 6832 + }, + { + "epoch": 3.2307328605200945, + "grad_norm": 2.799135684967041, + "learning_rate": 2.227307440223315e-06, + "loss": 0.3335, + "step": 6833 + }, + { + "epoch": 3.2312056737588652, + "grad_norm": 2.768529176712036, + "learning_rate": 2.2266873439309465e-06, + "loss": 0.3929, + "step": 6834 + }, + { + "epoch": 3.231678486997636, + "grad_norm": 3.124069929122925, + "learning_rate": 2.2260672646565585e-06, + "loss": 0.4205, + "step": 6835 + }, + { + "epoch": 3.2321513002364064, + "grad_norm": 2.8153982162475586, + "learning_rate": 2.2254472024387603e-06, + "loss": 0.3565, + "step": 6836 + }, + { + "epoch": 3.232624113475177, + "grad_norm": 3.1802141666412354, + "learning_rate": 2.224827157316162e-06, + "loss": 0.4614, + "step": 6837 + }, + { + "epoch": 3.233096926713948, + "grad_norm": 2.669651746749878, + "learning_rate": 2.2242071293273682e-06, + "loss": 0.3581, + "step": 6838 + }, + { + "epoch": 3.233569739952719, + "grad_norm": 3.073127269744873, + "learning_rate": 2.223587118510989e-06, + "loss": 0.3581, + "step": 6839 + }, + { + "epoch": 3.2340425531914896, + "grad_norm": 2.875955820083618, + "learning_rate": 2.222967124905627e-06, + "loss": 0.3905, + "step": 6840 + }, + { + "epoch": 3.23451536643026, + "grad_norm": 2.887744903564453, + "learning_rate": 2.2223471485498872e-06, + "loss": 0.4131, + "step": 6841 + }, + { + "epoch": 3.2349881796690307, + "grad_norm": 2.6957902908325195, + "learning_rate": 2.2217271894823735e-06, + "loss": 0.3631, + "step": 6842 + }, + { + "epoch": 3.2354609929078015, + "grad_norm": 2.7098400592803955, + "learning_rate": 2.221107247741688e-06, + "loss": 0.3959, + 
"step": 6843 + }, + { + "epoch": 3.235933806146572, + "grad_norm": 2.986271858215332, + "learning_rate": 2.22048732336643e-06, + "loss": 0.3515, + "step": 6844 + }, + { + "epoch": 3.2364066193853427, + "grad_norm": 3.0537121295928955, + "learning_rate": 2.2198674163952015e-06, + "loss": 0.438, + "step": 6845 + }, + { + "epoch": 3.2368794326241135, + "grad_norm": 2.8351151943206787, + "learning_rate": 2.2192475268666e-06, + "loss": 0.4069, + "step": 6846 + }, + { + "epoch": 3.2373522458628843, + "grad_norm": 2.6455280780792236, + "learning_rate": 2.218627654819225e-06, + "loss": 0.3626, + "step": 6847 + }, + { + "epoch": 3.237825059101655, + "grad_norm": 3.060352325439453, + "learning_rate": 2.2180078002916717e-06, + "loss": 0.3306, + "step": 6848 + }, + { + "epoch": 3.2382978723404254, + "grad_norm": 3.0178887844085693, + "learning_rate": 2.2173879633225355e-06, + "loss": 0.4111, + "step": 6849 + }, + { + "epoch": 3.2387706855791962, + "grad_norm": 2.895822763442993, + "learning_rate": 2.2167681439504123e-06, + "loss": 0.4053, + "step": 6850 + }, + { + "epoch": 3.239243498817967, + "grad_norm": 2.7295608520507812, + "learning_rate": 2.2161483422138945e-06, + "loss": 0.4021, + "step": 6851 + }, + { + "epoch": 3.2397163120567374, + "grad_norm": 3.1004912853240967, + "learning_rate": 2.2155285581515747e-06, + "loss": 0.3882, + "step": 6852 + }, + { + "epoch": 3.240189125295508, + "grad_norm": 2.927987813949585, + "learning_rate": 2.214908791802045e-06, + "loss": 0.4036, + "step": 6853 + }, + { + "epoch": 3.240661938534279, + "grad_norm": 3.1679599285125732, + "learning_rate": 2.2142890432038943e-06, + "loss": 0.3897, + "step": 6854 + }, + { + "epoch": 3.2411347517730498, + "grad_norm": 3.2094008922576904, + "learning_rate": 2.213669312395712e-06, + "loss": 0.4429, + "step": 6855 + }, + { + "epoch": 3.24160756501182, + "grad_norm": 4.637594223022461, + "learning_rate": 2.2130495994160857e-06, + "loss": 0.3708, + "step": 6856 + }, + { + "epoch": 3.242080378250591, + "grad_norm": 3.0063490867614746, + "learning_rate": 2.212429904303603e-06, + "loss": 0.3949, + "step": 6857 + }, + { + "epoch": 3.2425531914893617, + "grad_norm": 3.285444736480713, + "learning_rate": 2.21181022709685e-06, + "loss": 0.4236, + "step": 6858 + }, + { + "epoch": 3.2430260047281325, + "grad_norm": 3.02506422996521, + "learning_rate": 2.2111905678344086e-06, + "loss": 0.368, + "step": 6859 + }, + { + "epoch": 3.243498817966903, + "grad_norm": 2.9845006465911865, + "learning_rate": 2.2105709265548657e-06, + "loss": 0.4154, + "step": 6860 + }, + { + "epoch": 3.2439716312056737, + "grad_norm": 3.2537527084350586, + "learning_rate": 2.2099513032968013e-06, + "loss": 0.4385, + "step": 6861 + }, + { + "epoch": 3.2444444444444445, + "grad_norm": 2.8521063327789307, + "learning_rate": 2.2093316980987985e-06, + "loss": 0.384, + "step": 6862 + }, + { + "epoch": 3.2449172576832153, + "grad_norm": 3.186844825744629, + "learning_rate": 2.208712110999436e-06, + "loss": 0.4131, + "step": 6863 + }, + { + "epoch": 3.2453900709219856, + "grad_norm": 2.932058095932007, + "learning_rate": 2.208092542037292e-06, + "loss": 0.3341, + "step": 6864 + }, + { + "epoch": 3.2458628841607564, + "grad_norm": 3.0818707942962646, + "learning_rate": 2.2074729912509462e-06, + "loss": 0.4149, + "step": 6865 + }, + { + "epoch": 3.246335697399527, + "grad_norm": 2.9788503646850586, + "learning_rate": 2.2068534586789735e-06, + "loss": 0.3572, + "step": 6866 + }, + { + "epoch": 3.246808510638298, + "grad_norm": 2.84075665473938, + "learning_rate": 
2.206233944359952e-06, + "loss": 0.3561, + "step": 6867 + }, + { + "epoch": 3.2472813238770684, + "grad_norm": 2.966459035873413, + "learning_rate": 2.2056144483324545e-06, + "loss": 0.3909, + "step": 6868 + }, + { + "epoch": 3.247754137115839, + "grad_norm": 2.892038106918335, + "learning_rate": 2.204994970635054e-06, + "loss": 0.3557, + "step": 6869 + }, + { + "epoch": 3.24822695035461, + "grad_norm": 2.7458810806274414, + "learning_rate": 2.2043755113063233e-06, + "loss": 0.3551, + "step": 6870 + }, + { + "epoch": 3.2486997635933808, + "grad_norm": 2.766803741455078, + "learning_rate": 2.2037560703848334e-06, + "loss": 0.3343, + "step": 6871 + }, + { + "epoch": 3.249172576832151, + "grad_norm": 2.9780561923980713, + "learning_rate": 2.2031366479091533e-06, + "loss": 0.4004, + "step": 6872 + }, + { + "epoch": 3.249645390070922, + "grad_norm": 2.8848516941070557, + "learning_rate": 2.202517243917853e-06, + "loss": 0.3467, + "step": 6873 + }, + { + "epoch": 3.2501182033096927, + "grad_norm": 2.9962213039398193, + "learning_rate": 2.201897858449499e-06, + "loss": 0.3796, + "step": 6874 + }, + { + "epoch": 3.2505910165484635, + "grad_norm": 2.838131904602051, + "learning_rate": 2.201278491542659e-06, + "loss": 0.3683, + "step": 6875 + }, + { + "epoch": 3.251063829787234, + "grad_norm": 3.0232505798339844, + "learning_rate": 2.200659143235897e-06, + "loss": 0.3793, + "step": 6876 + }, + { + "epoch": 3.2515366430260046, + "grad_norm": 3.0690126419067383, + "learning_rate": 2.2000398135677776e-06, + "loss": 0.417, + "step": 6877 + }, + { + "epoch": 3.2520094562647754, + "grad_norm": 3.1838719844818115, + "learning_rate": 2.1994205025768643e-06, + "loss": 0.4608, + "step": 6878 + }, + { + "epoch": 3.2524822695035462, + "grad_norm": 3.1187257766723633, + "learning_rate": 2.198801210301717e-06, + "loss": 0.3396, + "step": 6879 + }, + { + "epoch": 3.2529550827423166, + "grad_norm": 2.7608656883239746, + "learning_rate": 2.1981819367808984e-06, + "loss": 0.386, + "step": 6880 + }, + { + "epoch": 3.2534278959810874, + "grad_norm": 3.027456283569336, + "learning_rate": 2.197562682052968e-06, + "loss": 0.3941, + "step": 6881 + }, + { + "epoch": 3.253900709219858, + "grad_norm": 2.925515651702881, + "learning_rate": 2.1969434461564816e-06, + "loss": 0.3608, + "step": 6882 + }, + { + "epoch": 3.254373522458629, + "grad_norm": 2.946770668029785, + "learning_rate": 2.196324229129999e-06, + "loss": 0.4116, + "step": 6883 + }, + { + "epoch": 3.2548463356973993, + "grad_norm": 2.6497952938079834, + "learning_rate": 2.1957050310120746e-06, + "loss": 0.338, + "step": 6884 + }, + { + "epoch": 3.25531914893617, + "grad_norm": 2.6915128231048584, + "learning_rate": 2.195085851841264e-06, + "loss": 0.3372, + "step": 6885 + }, + { + "epoch": 3.255791962174941, + "grad_norm": 3.4022350311279297, + "learning_rate": 2.1944666916561205e-06, + "loss": 0.3844, + "step": 6886 + }, + { + "epoch": 3.2562647754137117, + "grad_norm": 2.7463366985321045, + "learning_rate": 2.1938475504951958e-06, + "loss": 0.3268, + "step": 6887 + }, + { + "epoch": 3.256737588652482, + "grad_norm": 2.828810691833496, + "learning_rate": 2.193228428397042e-06, + "loss": 0.3275, + "step": 6888 + }, + { + "epoch": 3.257210401891253, + "grad_norm": 3.4016268253326416, + "learning_rate": 2.192609325400208e-06, + "loss": 0.3916, + "step": 6889 + }, + { + "epoch": 3.2576832151300237, + "grad_norm": 2.4980733394622803, + "learning_rate": 2.191990241543245e-06, + "loss": 0.3636, + "step": 6890 + }, + { + "epoch": 3.2581560283687945, + "grad_norm": 
3.0384702682495117, + "learning_rate": 2.191371176864698e-06, + "loss": 0.398, + "step": 6891 + }, + { + "epoch": 3.258628841607565, + "grad_norm": 2.8949527740478516, + "learning_rate": 2.190752131403115e-06, + "loss": 0.3919, + "step": 6892 + }, + { + "epoch": 3.2591016548463356, + "grad_norm": 2.765617609024048, + "learning_rate": 2.190133105197041e-06, + "loss": 0.3799, + "step": 6893 + }, + { + "epoch": 3.2595744680851064, + "grad_norm": 2.6149277687072754, + "learning_rate": 2.18951409828502e-06, + "loss": 0.3895, + "step": 6894 + }, + { + "epoch": 3.260047281323877, + "grad_norm": 2.9738945960998535, + "learning_rate": 2.1888951107055934e-06, + "loss": 0.3879, + "step": 6895 + }, + { + "epoch": 3.2605200945626476, + "grad_norm": 2.9438633918762207, + "learning_rate": 2.1882761424973053e-06, + "loss": 0.438, + "step": 6896 + }, + { + "epoch": 3.2609929078014184, + "grad_norm": 3.114243984222412, + "learning_rate": 2.1876571936986936e-06, + "loss": 0.4737, + "step": 6897 + }, + { + "epoch": 3.261465721040189, + "grad_norm": 3.017526388168335, + "learning_rate": 2.1870382643483e-06, + "loss": 0.4039, + "step": 6898 + }, + { + "epoch": 3.26193853427896, + "grad_norm": 3.1475703716278076, + "learning_rate": 2.1864193544846613e-06, + "loss": 0.3825, + "step": 6899 + }, + { + "epoch": 3.2624113475177303, + "grad_norm": 2.75502872467041, + "learning_rate": 2.1858004641463142e-06, + "loss": 0.3507, + "step": 6900 + }, + { + "epoch": 3.262884160756501, + "grad_norm": 3.0467209815979004, + "learning_rate": 2.1851815933717944e-06, + "loss": 0.3938, + "step": 6901 + }, + { + "epoch": 3.263356973995272, + "grad_norm": 2.993014097213745, + "learning_rate": 2.184562742199636e-06, + "loss": 0.3711, + "step": 6902 + }, + { + "epoch": 3.2638297872340427, + "grad_norm": 2.607309341430664, + "learning_rate": 2.183943910668373e-06, + "loss": 0.3689, + "step": 6903 + }, + { + "epoch": 3.264302600472813, + "grad_norm": 2.961653470993042, + "learning_rate": 2.1833250988165373e-06, + "loss": 0.3806, + "step": 6904 + }, + { + "epoch": 3.264775413711584, + "grad_norm": 2.8202552795410156, + "learning_rate": 2.1827063066826574e-06, + "loss": 0.391, + "step": 6905 + }, + { + "epoch": 3.2652482269503547, + "grad_norm": 3.032648801803589, + "learning_rate": 2.1820875343052666e-06, + "loss": 0.4011, + "step": 6906 + }, + { + "epoch": 3.2657210401891255, + "grad_norm": 2.8265180587768555, + "learning_rate": 2.1814687817228896e-06, + "loss": 0.3923, + "step": 6907 + }, + { + "epoch": 3.266193853427896, + "grad_norm": 3.1425564289093018, + "learning_rate": 2.1808500489740555e-06, + "loss": 0.4913, + "step": 6908 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 2.977809429168701, + "learning_rate": 2.18023133609729e-06, + "loss": 0.379, + "step": 6909 + }, + { + "epoch": 3.2671394799054374, + "grad_norm": 3.509551525115967, + "learning_rate": 2.1796126431311153e-06, + "loss": 0.4025, + "step": 6910 + }, + { + "epoch": 3.267612293144208, + "grad_norm": 2.9133846759796143, + "learning_rate": 2.178993970114058e-06, + "loss": 0.4209, + "step": 6911 + }, + { + "epoch": 3.2680851063829786, + "grad_norm": 2.945513963699341, + "learning_rate": 2.178375317084637e-06, + "loss": 0.3882, + "step": 6912 + }, + { + "epoch": 3.2685579196217494, + "grad_norm": 2.7868733406066895, + "learning_rate": 2.1777566840813763e-06, + "loss": 0.3456, + "step": 6913 + }, + { + "epoch": 3.26903073286052, + "grad_norm": 2.803220748901367, + "learning_rate": 2.1771380711427937e-06, + "loss": 0.3394, + "step": 6914 + }, + { + "epoch": 
3.269503546099291, + "grad_norm": 3.1293554306030273, + "learning_rate": 2.176519478307407e-06, + "loss": 0.402, + "step": 6915 + }, + { + "epoch": 3.2699763593380613, + "grad_norm": 2.843971014022827, + "learning_rate": 2.1759009056137347e-06, + "loss": 0.3449, + "step": 6916 + }, + { + "epoch": 3.270449172576832, + "grad_norm": 2.9983274936676025, + "learning_rate": 2.1752823531002917e-06, + "loss": 0.4091, + "step": 6917 + }, + { + "epoch": 3.270921985815603, + "grad_norm": 2.686722993850708, + "learning_rate": 2.174663820805592e-06, + "loss": 0.4303, + "step": 6918 + }, + { + "epoch": 3.2713947990543737, + "grad_norm": 2.669349431991577, + "learning_rate": 2.1740453087681508e-06, + "loss": 0.3796, + "step": 6919 + }, + { + "epoch": 3.271867612293144, + "grad_norm": 2.992138624191284, + "learning_rate": 2.173426817026477e-06, + "loss": 0.4125, + "step": 6920 + }, + { + "epoch": 3.272340425531915, + "grad_norm": 3.332834243774414, + "learning_rate": 2.1728083456190852e-06, + "loss": 0.3885, + "step": 6921 + }, + { + "epoch": 3.2728132387706856, + "grad_norm": 2.869673013687134, + "learning_rate": 2.1721898945844825e-06, + "loss": 0.3941, + "step": 6922 + }, + { + "epoch": 3.2732860520094564, + "grad_norm": 2.804440975189209, + "learning_rate": 2.1715714639611774e-06, + "loss": 0.4007, + "step": 6923 + }, + { + "epoch": 3.273758865248227, + "grad_norm": 3.1751439571380615, + "learning_rate": 2.1709530537876774e-06, + "loss": 0.3981, + "step": 6924 + }, + { + "epoch": 3.2742316784869976, + "grad_norm": 2.6367175579071045, + "learning_rate": 2.1703346641024878e-06, + "loss": 0.3582, + "step": 6925 + }, + { + "epoch": 3.2747044917257684, + "grad_norm": 2.99164080619812, + "learning_rate": 2.1697162949441137e-06, + "loss": 0.3846, + "step": 6926 + }, + { + "epoch": 3.275177304964539, + "grad_norm": 3.3206982612609863, + "learning_rate": 2.169097946351057e-06, + "loss": 0.3689, + "step": 6927 + }, + { + "epoch": 3.2756501182033095, + "grad_norm": 2.927907943725586, + "learning_rate": 2.16847961836182e-06, + "loss": 0.3536, + "step": 6928 + }, + { + "epoch": 3.2761229314420803, + "grad_norm": 3.1950864791870117, + "learning_rate": 2.167861311014904e-06, + "loss": 0.4154, + "step": 6929 + }, + { + "epoch": 3.276595744680851, + "grad_norm": 2.888383388519287, + "learning_rate": 2.1672430243488073e-06, + "loss": 0.3702, + "step": 6930 + }, + { + "epoch": 3.277068557919622, + "grad_norm": 2.842287063598633, + "learning_rate": 2.166624758402029e-06, + "loss": 0.3623, + "step": 6931 + }, + { + "epoch": 3.2775413711583923, + "grad_norm": 2.84350323677063, + "learning_rate": 2.166006513213065e-06, + "loss": 0.3757, + "step": 6932 + }, + { + "epoch": 3.278014184397163, + "grad_norm": 3.105626344680786, + "learning_rate": 2.165388288820411e-06, + "loss": 0.3955, + "step": 6933 + }, + { + "epoch": 3.278486997635934, + "grad_norm": 3.273508071899414, + "learning_rate": 2.164770085262561e-06, + "loss": 0.4046, + "step": 6934 + }, + { + "epoch": 3.2789598108747047, + "grad_norm": 3.2530124187469482, + "learning_rate": 2.1641519025780066e-06, + "loss": 0.3141, + "step": 6935 + }, + { + "epoch": 3.279432624113475, + "grad_norm": 2.822849750518799, + "learning_rate": 2.163533740805242e-06, + "loss": 0.3973, + "step": 6936 + }, + { + "epoch": 3.279905437352246, + "grad_norm": 2.772097587585449, + "learning_rate": 2.162915599982756e-06, + "loss": 0.3606, + "step": 6937 + }, + { + "epoch": 3.2803782505910166, + "grad_norm": 3.150696039199829, + "learning_rate": 2.1622974801490365e-06, + "loss": 0.4709, + "step": 
6938 + }, + { + "epoch": 3.2808510638297874, + "grad_norm": 3.2072134017944336, + "learning_rate": 2.1616793813425736e-06, + "loss": 0.3946, + "step": 6939 + }, + { + "epoch": 3.2813238770685578, + "grad_norm": 2.9922473430633545, + "learning_rate": 2.1610613036018515e-06, + "loss": 0.3263, + "step": 6940 + }, + { + "epoch": 3.2817966903073286, + "grad_norm": 2.7818009853363037, + "learning_rate": 2.1604432469653555e-06, + "loss": 0.3887, + "step": 6941 + }, + { + "epoch": 3.2822695035460994, + "grad_norm": 3.12998628616333, + "learning_rate": 2.15982521147157e-06, + "loss": 0.3522, + "step": 6942 + }, + { + "epoch": 3.28274231678487, + "grad_norm": 2.876678228378296, + "learning_rate": 2.159207197158976e-06, + "loss": 0.3643, + "step": 6943 + }, + { + "epoch": 3.2832151300236405, + "grad_norm": 2.825488805770874, + "learning_rate": 2.1585892040660565e-06, + "loss": 0.3223, + "step": 6944 + }, + { + "epoch": 3.2836879432624113, + "grad_norm": 2.8724498748779297, + "learning_rate": 2.1579712322312906e-06, + "loss": 0.3855, + "step": 6945 + }, + { + "epoch": 3.284160756501182, + "grad_norm": 2.841064691543579, + "learning_rate": 2.1573532816931547e-06, + "loss": 0.4106, + "step": 6946 + }, + { + "epoch": 3.284633569739953, + "grad_norm": 3.053391218185425, + "learning_rate": 2.1567353524901288e-06, + "loss": 0.4875, + "step": 6947 + }, + { + "epoch": 3.2851063829787233, + "grad_norm": 2.7294771671295166, + "learning_rate": 2.156117444660687e-06, + "loss": 0.3856, + "step": 6948 + }, + { + "epoch": 3.285579196217494, + "grad_norm": 3.0965659618377686, + "learning_rate": 2.155499558243304e-06, + "loss": 0.4104, + "step": 6949 + }, + { + "epoch": 3.286052009456265, + "grad_norm": 2.778923511505127, + "learning_rate": 2.1548816932764536e-06, + "loss": 0.3636, + "step": 6950 + }, + { + "epoch": 3.2865248226950357, + "grad_norm": 2.890679121017456, + "learning_rate": 2.1542638497986054e-06, + "loss": 0.4026, + "step": 6951 + }, + { + "epoch": 3.286997635933806, + "grad_norm": 3.0466806888580322, + "learning_rate": 2.1536460278482326e-06, + "loss": 0.3856, + "step": 6952 + }, + { + "epoch": 3.287470449172577, + "grad_norm": 3.1367077827453613, + "learning_rate": 2.1530282274638013e-06, + "loss": 0.3767, + "step": 6953 + }, + { + "epoch": 3.2879432624113476, + "grad_norm": 2.984694719314575, + "learning_rate": 2.1524104486837823e-06, + "loss": 0.4142, + "step": 6954 + }, + { + "epoch": 3.2884160756501184, + "grad_norm": 3.1542797088623047, + "learning_rate": 2.151792691546641e-06, + "loss": 0.4361, + "step": 6955 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 2.7306816577911377, + "learning_rate": 2.1511749560908405e-06, + "loss": 0.3692, + "step": 6956 + }, + { + "epoch": 3.2893617021276595, + "grad_norm": 3.6679904460906982, + "learning_rate": 2.150557242354847e-06, + "loss": 0.4496, + "step": 6957 + }, + { + "epoch": 3.2898345153664303, + "grad_norm": 3.2040863037109375, + "learning_rate": 2.1499395503771207e-06, + "loss": 0.3526, + "step": 6958 + }, + { + "epoch": 3.290307328605201, + "grad_norm": 3.2416043281555176, + "learning_rate": 2.1493218801961246e-06, + "loss": 0.3955, + "step": 6959 + }, + { + "epoch": 3.2907801418439715, + "grad_norm": 2.8164525032043457, + "learning_rate": 2.1487042318503174e-06, + "loss": 0.3727, + "step": 6960 + }, + { + "epoch": 3.2912529550827423, + "grad_norm": 2.5954513549804688, + "learning_rate": 2.148086605378156e-06, + "loss": 0.3315, + "step": 6961 + }, + { + "epoch": 3.291725768321513, + "grad_norm": 2.8068149089813232, + "learning_rate": 
2.1474690008181e-06, + "loss": 0.3702, + "step": 6962 + }, + { + "epoch": 3.2921985815602834, + "grad_norm": 2.9063730239868164, + "learning_rate": 2.1468514182086025e-06, + "loss": 0.3357, + "step": 6963 + }, + { + "epoch": 3.2926713947990542, + "grad_norm": 2.7623207569122314, + "learning_rate": 2.1462338575881197e-06, + "loss": 0.381, + "step": 6964 + }, + { + "epoch": 3.293144208037825, + "grad_norm": 2.6818830966949463, + "learning_rate": 2.145616318995103e-06, + "loss": 0.3733, + "step": 6965 + }, + { + "epoch": 3.293617021276596, + "grad_norm": 2.7966864109039307, + "learning_rate": 2.1449988024680034e-06, + "loss": 0.3993, + "step": 6966 + }, + { + "epoch": 3.2940898345153666, + "grad_norm": 3.0644514560699463, + "learning_rate": 2.1443813080452728e-06, + "loss": 0.3541, + "step": 6967 + }, + { + "epoch": 3.294562647754137, + "grad_norm": 3.03204607963562, + "learning_rate": 2.1437638357653586e-06, + "loss": 0.3864, + "step": 6968 + }, + { + "epoch": 3.295035460992908, + "grad_norm": 2.980565071105957, + "learning_rate": 2.143146385666707e-06, + "loss": 0.36, + "step": 6969 + }, + { + "epoch": 3.2955082742316786, + "grad_norm": 3.1261661052703857, + "learning_rate": 2.1425289577877675e-06, + "loss": 0.4053, + "step": 6970 + }, + { + "epoch": 3.295981087470449, + "grad_norm": 3.0194897651672363, + "learning_rate": 2.1419115521669804e-06, + "loss": 0.4553, + "step": 6971 + }, + { + "epoch": 3.2964539007092197, + "grad_norm": 2.7620482444763184, + "learning_rate": 2.141294168842792e-06, + "loss": 0.3846, + "step": 6972 + }, + { + "epoch": 3.2969267139479905, + "grad_norm": 2.9575016498565674, + "learning_rate": 2.1406768078536427e-06, + "loss": 0.4415, + "step": 6973 + }, + { + "epoch": 3.2973995271867613, + "grad_norm": 3.17909574508667, + "learning_rate": 2.1400594692379717e-06, + "loss": 0.4514, + "step": 6974 + }, + { + "epoch": 3.297872340425532, + "grad_norm": 3.128613233566284, + "learning_rate": 2.1394421530342207e-06, + "loss": 0.3757, + "step": 6975 + }, + { + "epoch": 3.2983451536643025, + "grad_norm": 3.0247111320495605, + "learning_rate": 2.1388248592808243e-06, + "loss": 0.3881, + "step": 6976 + }, + { + "epoch": 3.2988179669030733, + "grad_norm": 2.8091228008270264, + "learning_rate": 2.1382075880162217e-06, + "loss": 0.3782, + "step": 6977 + }, + { + "epoch": 3.299290780141844, + "grad_norm": 2.985105514526367, + "learning_rate": 2.137590339278846e-06, + "loss": 0.3783, + "step": 6978 + }, + { + "epoch": 3.2997635933806144, + "grad_norm": 3.1862502098083496, + "learning_rate": 2.1369731131071304e-06, + "loss": 0.4776, + "step": 6979 + }, + { + "epoch": 3.300236406619385, + "grad_norm": 3.3138091564178467, + "learning_rate": 2.1363559095395075e-06, + "loss": 0.4056, + "step": 6980 + }, + { + "epoch": 3.300709219858156, + "grad_norm": 3.023695707321167, + "learning_rate": 2.135738728614407e-06, + "loss": 0.3716, + "step": 6981 + }, + { + "epoch": 3.301182033096927, + "grad_norm": 6.149252414703369, + "learning_rate": 2.135121570370259e-06, + "loss": 0.3713, + "step": 6982 + }, + { + "epoch": 3.3016548463356976, + "grad_norm": 2.689671754837036, + "learning_rate": 2.134504434845491e-06, + "loss": 0.3541, + "step": 6983 + }, + { + "epoch": 3.302127659574468, + "grad_norm": 3.241212844848633, + "learning_rate": 2.1338873220785284e-06, + "loss": 0.4328, + "step": 6984 + }, + { + "epoch": 3.3026004728132388, + "grad_norm": 3.6037068367004395, + "learning_rate": 2.133270232107798e-06, + "loss": 0.4091, + "step": 6985 + }, + { + "epoch": 3.3030732860520096, + "grad_norm": 
3.300031900405884, + "learning_rate": 2.1326531649717216e-06, + "loss": 0.3742, + "step": 6986 + }, + { + "epoch": 3.30354609929078, + "grad_norm": 2.82257342338562, + "learning_rate": 2.1320361207087225e-06, + "loss": 0.3622, + "step": 6987 + }, + { + "epoch": 3.3040189125295507, + "grad_norm": 3.297513246536255, + "learning_rate": 2.1314190993572196e-06, + "loss": 0.4606, + "step": 6988 + }, + { + "epoch": 3.3044917257683215, + "grad_norm": 2.676440954208374, + "learning_rate": 2.130802100955634e-06, + "loss": 0.382, + "step": 6989 + }, + { + "epoch": 3.3049645390070923, + "grad_norm": 2.9548017978668213, + "learning_rate": 2.130185125542383e-06, + "loss": 0.3751, + "step": 6990 + }, + { + "epoch": 3.305437352245863, + "grad_norm": 2.800647020339966, + "learning_rate": 2.129568173155882e-06, + "loss": 0.3868, + "step": 6991 + }, + { + "epoch": 3.3059101654846335, + "grad_norm": 3.3789260387420654, + "learning_rate": 2.128951243834546e-06, + "loss": 0.4373, + "step": 6992 + }, + { + "epoch": 3.3063829787234043, + "grad_norm": 2.944807767868042, + "learning_rate": 2.12833433761679e-06, + "loss": 0.4205, + "step": 6993 + }, + { + "epoch": 3.306855791962175, + "grad_norm": 2.577975273132324, + "learning_rate": 2.127717454541025e-06, + "loss": 0.4197, + "step": 6994 + }, + { + "epoch": 3.3073286052009454, + "grad_norm": 3.0542666912078857, + "learning_rate": 2.127100594645661e-06, + "loss": 0.3811, + "step": 6995 + }, + { + "epoch": 3.307801418439716, + "grad_norm": 3.163015842437744, + "learning_rate": 2.1264837579691088e-06, + "loss": 0.415, + "step": 6996 + }, + { + "epoch": 3.308274231678487, + "grad_norm": 2.9161269664764404, + "learning_rate": 2.1258669445497746e-06, + "loss": 0.3714, + "step": 6997 + }, + { + "epoch": 3.308747044917258, + "grad_norm": 2.934483289718628, + "learning_rate": 2.1252501544260657e-06, + "loss": 0.4085, + "step": 6998 + }, + { + "epoch": 3.3092198581560286, + "grad_norm": 3.155613660812378, + "learning_rate": 2.1246333876363852e-06, + "loss": 0.4698, + "step": 6999 + }, + { + "epoch": 3.309692671394799, + "grad_norm": 2.648171901702881, + "learning_rate": 2.124016644219139e-06, + "loss": 0.3091, + "step": 7000 + }, + { + "epoch": 3.3101654846335697, + "grad_norm": 2.908219814300537, + "learning_rate": 2.123399924212728e-06, + "loss": 0.4063, + "step": 7001 + }, + { + "epoch": 3.3106382978723405, + "grad_norm": 3.138749361038208, + "learning_rate": 2.122783227655551e-06, + "loss": 0.4296, + "step": 7002 + }, + { + "epoch": 3.311111111111111, + "grad_norm": 3.044466018676758, + "learning_rate": 2.1221665545860094e-06, + "loss": 0.4424, + "step": 7003 + }, + { + "epoch": 3.3115839243498817, + "grad_norm": 2.6758792400360107, + "learning_rate": 2.121549905042499e-06, + "loss": 0.4073, + "step": 7004 + }, + { + "epoch": 3.3120567375886525, + "grad_norm": 2.8901989459991455, + "learning_rate": 2.1209332790634174e-06, + "loss": 0.3842, + "step": 7005 + }, + { + "epoch": 3.3125295508274233, + "grad_norm": 2.8179712295532227, + "learning_rate": 2.1203166766871582e-06, + "loss": 0.366, + "step": 7006 + }, + { + "epoch": 3.313002364066194, + "grad_norm": 2.6536550521850586, + "learning_rate": 2.1197000979521138e-06, + "loss": 0.3851, + "step": 7007 + }, + { + "epoch": 3.3134751773049644, + "grad_norm": 3.1277682781219482, + "learning_rate": 2.1190835428966775e-06, + "loss": 0.4249, + "step": 7008 + }, + { + "epoch": 3.3139479905437352, + "grad_norm": 2.924666166305542, + "learning_rate": 2.1184670115592383e-06, + "loss": 0.3873, + "step": 7009 + }, + { + "epoch": 
3.314420803782506, + "grad_norm": 2.7921009063720703, + "learning_rate": 2.1178505039781856e-06, + "loss": 0.3754, + "step": 7010 + }, + { + "epoch": 3.3148936170212764, + "grad_norm": 2.5349879264831543, + "learning_rate": 2.1172340201919067e-06, + "loss": 0.3701, + "step": 7011 + }, + { + "epoch": 3.315366430260047, + "grad_norm": 2.849376678466797, + "learning_rate": 2.1166175602387866e-06, + "loss": 0.3963, + "step": 7012 + }, + { + "epoch": 3.315839243498818, + "grad_norm": 3.141280174255371, + "learning_rate": 2.11600112415721e-06, + "loss": 0.4158, + "step": 7013 + }, + { + "epoch": 3.3163120567375888, + "grad_norm": 2.922807455062866, + "learning_rate": 2.11538471198556e-06, + "loss": 0.3667, + "step": 7014 + }, + { + "epoch": 3.3167848699763596, + "grad_norm": 2.770400047302246, + "learning_rate": 2.114768323762216e-06, + "loss": 0.3674, + "step": 7015 + }, + { + "epoch": 3.31725768321513, + "grad_norm": 2.7706570625305176, + "learning_rate": 2.114151959525561e-06, + "loss": 0.3761, + "step": 7016 + }, + { + "epoch": 3.3177304964539007, + "grad_norm": 3.041755437850952, + "learning_rate": 2.1135356193139704e-06, + "loss": 0.4483, + "step": 7017 + }, + { + "epoch": 3.3182033096926715, + "grad_norm": 3.5757904052734375, + "learning_rate": 2.1129193031658227e-06, + "loss": 0.4094, + "step": 7018 + }, + { + "epoch": 3.318676122931442, + "grad_norm": 2.9292917251586914, + "learning_rate": 2.1123030111194936e-06, + "loss": 0.3514, + "step": 7019 + }, + { + "epoch": 3.3191489361702127, + "grad_norm": 3.1443874835968018, + "learning_rate": 2.111686743213355e-06, + "loss": 0.4098, + "step": 7020 + }, + { + "epoch": 3.3196217494089835, + "grad_norm": 2.9738030433654785, + "learning_rate": 2.1110704994857804e-06, + "loss": 0.3584, + "step": 7021 + }, + { + "epoch": 3.3200945626477543, + "grad_norm": 2.8961563110351562, + "learning_rate": 2.1104542799751397e-06, + "loss": 0.3736, + "step": 7022 + }, + { + "epoch": 3.320567375886525, + "grad_norm": 3.9264683723449707, + "learning_rate": 2.1098380847198037e-06, + "loss": 0.457, + "step": 7023 + }, + { + "epoch": 3.3210401891252954, + "grad_norm": 2.8742756843566895, + "learning_rate": 2.109221913758139e-06, + "loss": 0.4252, + "step": 7024 + }, + { + "epoch": 3.321513002364066, + "grad_norm": 3.7229559421539307, + "learning_rate": 2.108605767128512e-06, + "loss": 0.4451, + "step": 7025 + }, + { + "epoch": 3.321985815602837, + "grad_norm": 2.6417593955993652, + "learning_rate": 2.1079896448692884e-06, + "loss": 0.3658, + "step": 7026 + }, + { + "epoch": 3.3224586288416074, + "grad_norm": 2.8780412673950195, + "learning_rate": 2.10737354701883e-06, + "loss": 0.4225, + "step": 7027 + }, + { + "epoch": 3.322931442080378, + "grad_norm": 2.557816505432129, + "learning_rate": 2.1067574736155e-06, + "loss": 0.3812, + "step": 7028 + }, + { + "epoch": 3.323404255319149, + "grad_norm": 2.859062910079956, + "learning_rate": 2.106141424697658e-06, + "loss": 0.3629, + "step": 7029 + }, + { + "epoch": 3.3238770685579198, + "grad_norm": 2.4776878356933594, + "learning_rate": 2.1055254003036607e-06, + "loss": 0.3591, + "step": 7030 + }, + { + "epoch": 3.3243498817966906, + "grad_norm": 3.085066795349121, + "learning_rate": 2.1049094004718687e-06, + "loss": 0.4237, + "step": 7031 + }, + { + "epoch": 3.324822695035461, + "grad_norm": 2.862592935562134, + "learning_rate": 2.1042934252406345e-06, + "loss": 0.3185, + "step": 7032 + }, + { + "epoch": 3.3252955082742317, + "grad_norm": 2.965743064880371, + "learning_rate": 2.1036774746483145e-06, + "loss": 0.4058, + 
"step": 7033 + }, + { + "epoch": 3.3257683215130025, + "grad_norm": 2.7420589923858643, + "learning_rate": 2.103061548733261e-06, + "loss": 0.3566, + "step": 7034 + }, + { + "epoch": 3.326241134751773, + "grad_norm": 2.7824347019195557, + "learning_rate": 2.1024456475338235e-06, + "loss": 0.3553, + "step": 7035 + }, + { + "epoch": 3.3267139479905437, + "grad_norm": 3.0410704612731934, + "learning_rate": 2.1018297710883528e-06, + "loss": 0.3772, + "step": 7036 + }, + { + "epoch": 3.3271867612293144, + "grad_norm": 3.0811562538146973, + "learning_rate": 2.101213919435196e-06, + "loss": 0.3738, + "step": 7037 + }, + { + "epoch": 3.3276595744680852, + "grad_norm": 2.939445734024048, + "learning_rate": 2.100598092612699e-06, + "loss": 0.4107, + "step": 7038 + }, + { + "epoch": 3.3281323877068556, + "grad_norm": 3.05804705619812, + "learning_rate": 2.0999822906592086e-06, + "loss": 0.3972, + "step": 7039 + }, + { + "epoch": 3.3286052009456264, + "grad_norm": 2.803558111190796, + "learning_rate": 2.0993665136130657e-06, + "loss": 0.3487, + "step": 7040 + }, + { + "epoch": 3.329078014184397, + "grad_norm": 2.937675714492798, + "learning_rate": 2.0987507615126147e-06, + "loss": 0.4095, + "step": 7041 + }, + { + "epoch": 3.329550827423168, + "grad_norm": 2.853905439376831, + "learning_rate": 2.098135034396194e-06, + "loss": 0.3775, + "step": 7042 + }, + { + "epoch": 3.3300236406619383, + "grad_norm": 3.3520495891571045, + "learning_rate": 2.097519332302142e-06, + "loss": 0.4065, + "step": 7043 + }, + { + "epoch": 3.330496453900709, + "grad_norm": 2.8787078857421875, + "learning_rate": 2.096903655268797e-06, + "loss": 0.3452, + "step": 7044 + }, + { + "epoch": 3.33096926713948, + "grad_norm": 2.993896007537842, + "learning_rate": 2.096288003334493e-06, + "loss": 0.3814, + "step": 7045 + }, + { + "epoch": 3.3314420803782507, + "grad_norm": 3.5248336791992188, + "learning_rate": 2.0956723765375655e-06, + "loss": 0.3852, + "step": 7046 + }, + { + "epoch": 3.331914893617021, + "grad_norm": 3.2227890491485596, + "learning_rate": 2.0950567749163463e-06, + "loss": 0.3913, + "step": 7047 + }, + { + "epoch": 3.332387706855792, + "grad_norm": 3.390401601791382, + "learning_rate": 2.094441198509165e-06, + "loss": 0.3944, + "step": 7048 + }, + { + "epoch": 3.3328605200945627, + "grad_norm": 3.2057554721832275, + "learning_rate": 2.0938256473543534e-06, + "loss": 0.404, + "step": 7049 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 2.866708755493164, + "learning_rate": 2.0932101214902367e-06, + "loss": 0.4345, + "step": 7050 + }, + { + "epoch": 3.333806146572104, + "grad_norm": 3.4304039478302, + "learning_rate": 2.0925946209551428e-06, + "loss": 0.4209, + "step": 7051 + }, + { + "epoch": 3.3342789598108746, + "grad_norm": 3.996561288833618, + "learning_rate": 2.091979145787395e-06, + "loss": 0.4394, + "step": 7052 + }, + { + "epoch": 3.3347517730496454, + "grad_norm": 3.1932613849639893, + "learning_rate": 2.0913636960253166e-06, + "loss": 0.3837, + "step": 7053 + }, + { + "epoch": 3.3352245862884162, + "grad_norm": 2.908832311630249, + "learning_rate": 2.0907482717072293e-06, + "loss": 0.3526, + "step": 7054 + }, + { + "epoch": 3.3356973995271866, + "grad_norm": 2.7319607734680176, + "learning_rate": 2.090132872871452e-06, + "loss": 0.3686, + "step": 7055 + }, + { + "epoch": 3.3361702127659574, + "grad_norm": 2.9213504791259766, + "learning_rate": 2.0895174995563043e-06, + "loss": 0.4034, + "step": 7056 + }, + { + "epoch": 3.336643026004728, + "grad_norm": 2.8093936443328857, + "learning_rate": 
2.0889021518001017e-06, + "loss": 0.4151, + "step": 7057 + }, + { + "epoch": 3.337115839243499, + "grad_norm": 3.1840829849243164, + "learning_rate": 2.0882868296411594e-06, + "loss": 0.3501, + "step": 7058 + }, + { + "epoch": 3.3375886524822693, + "grad_norm": 2.793567657470703, + "learning_rate": 2.087671533117791e-06, + "loss": 0.3911, + "step": 7059 + }, + { + "epoch": 3.33806146572104, + "grad_norm": 3.0820090770721436, + "learning_rate": 2.0870562622683077e-06, + "loss": 0.432, + "step": 7060 + }, + { + "epoch": 3.338534278959811, + "grad_norm": 2.774630546569824, + "learning_rate": 2.0864410171310213e-06, + "loss": 0.3434, + "step": 7061 + }, + { + "epoch": 3.3390070921985817, + "grad_norm": 2.70447039604187, + "learning_rate": 2.085825797744239e-06, + "loss": 0.3787, + "step": 7062 + }, + { + "epoch": 3.339479905437352, + "grad_norm": 3.1014437675476074, + "learning_rate": 2.0852106041462672e-06, + "loss": 0.4568, + "step": 7063 + }, + { + "epoch": 3.339952718676123, + "grad_norm": 3.312680244445801, + "learning_rate": 2.0845954363754133e-06, + "loss": 0.4285, + "step": 7064 + }, + { + "epoch": 3.3404255319148937, + "grad_norm": 2.7070534229278564, + "learning_rate": 2.0839802944699806e-06, + "loss": 0.4096, + "step": 7065 + }, + { + "epoch": 3.3408983451536645, + "grad_norm": 2.8172531127929688, + "learning_rate": 2.083365178468269e-06, + "loss": 0.3652, + "step": 7066 + }, + { + "epoch": 3.341371158392435, + "grad_norm": 2.896378517150879, + "learning_rate": 2.082750088408582e-06, + "loss": 0.3778, + "step": 7067 + }, + { + "epoch": 3.3418439716312056, + "grad_norm": 2.769805669784546, + "learning_rate": 2.0821350243292175e-06, + "loss": 0.3593, + "step": 7068 + }, + { + "epoch": 3.3423167848699764, + "grad_norm": 2.672520875930786, + "learning_rate": 2.0815199862684728e-06, + "loss": 0.3873, + "step": 7069 + }, + { + "epoch": 3.342789598108747, + "grad_norm": 2.841327428817749, + "learning_rate": 2.0809049742646435e-06, + "loss": 0.41, + "step": 7070 + }, + { + "epoch": 3.3432624113475176, + "grad_norm": 3.0540482997894287, + "learning_rate": 2.080289988356023e-06, + "loss": 0.32, + "step": 7071 + }, + { + "epoch": 3.3437352245862884, + "grad_norm": 3.471684217453003, + "learning_rate": 2.079675028580905e-06, + "loss": 0.3779, + "step": 7072 + }, + { + "epoch": 3.344208037825059, + "grad_norm": 2.8545875549316406, + "learning_rate": 2.07906009497758e-06, + "loss": 0.4645, + "step": 7073 + }, + { + "epoch": 3.34468085106383, + "grad_norm": 2.7771127223968506, + "learning_rate": 2.078445187584337e-06, + "loss": 0.3889, + "step": 7074 + }, + { + "epoch": 3.3451536643026003, + "grad_norm": 2.769188165664673, + "learning_rate": 2.0778303064394647e-06, + "loss": 0.3745, + "step": 7075 + }, + { + "epoch": 3.345626477541371, + "grad_norm": 2.739577531814575, + "learning_rate": 2.0772154515812467e-06, + "loss": 0.4402, + "step": 7076 + }, + { + "epoch": 3.346099290780142, + "grad_norm": 2.6124343872070312, + "learning_rate": 2.0766006230479696e-06, + "loss": 0.3595, + "step": 7077 + }, + { + "epoch": 3.3465721040189127, + "grad_norm": 2.7100563049316406, + "learning_rate": 2.0759858208779136e-06, + "loss": 0.3641, + "step": 7078 + }, + { + "epoch": 3.347044917257683, + "grad_norm": 2.8594000339508057, + "learning_rate": 2.075371045109363e-06, + "loss": 0.402, + "step": 7079 + }, + { + "epoch": 3.347517730496454, + "grad_norm": 3.2045278549194336, + "learning_rate": 2.0747562957805955e-06, + "loss": 0.4719, + "step": 7080 + }, + { + "epoch": 3.3479905437352246, + "grad_norm": 
2.825594663619995, + "learning_rate": 2.0741415729298874e-06, + "loss": 0.4127, + "step": 7081 + }, + { + "epoch": 3.3484633569739954, + "grad_norm": 2.992403984069824, + "learning_rate": 2.0735268765955173e-06, + "loss": 0.3943, + "step": 7082 + }, + { + "epoch": 3.348936170212766, + "grad_norm": 3.0629165172576904, + "learning_rate": 2.072912206815758e-06, + "loss": 0.4132, + "step": 7083 + }, + { + "epoch": 3.3494089834515366, + "grad_norm": 2.7553658485412598, + "learning_rate": 2.0722975636288836e-06, + "loss": 0.3667, + "step": 7084 + }, + { + "epoch": 3.3498817966903074, + "grad_norm": 3.1556780338287354, + "learning_rate": 2.0716829470731647e-06, + "loss": 0.4383, + "step": 7085 + }, + { + "epoch": 3.350354609929078, + "grad_norm": 2.6693310737609863, + "learning_rate": 2.071068357186869e-06, + "loss": 0.363, + "step": 7086 + }, + { + "epoch": 3.3508274231678485, + "grad_norm": 2.738314628601074, + "learning_rate": 2.0704537940082673e-06, + "loss": 0.3493, + "step": 7087 + }, + { + "epoch": 3.3513002364066193, + "grad_norm": 3.2205989360809326, + "learning_rate": 2.069839257575624e-06, + "loss": 0.3802, + "step": 7088 + }, + { + "epoch": 3.35177304964539, + "grad_norm": 2.8969876766204834, + "learning_rate": 2.069224747927203e-06, + "loss": 0.3846, + "step": 7089 + }, + { + "epoch": 3.352245862884161, + "grad_norm": 2.833179473876953, + "learning_rate": 2.0686102651012694e-06, + "loss": 0.3892, + "step": 7090 + }, + { + "epoch": 3.3527186761229313, + "grad_norm": 3.303830623626709, + "learning_rate": 2.067995809136082e-06, + "loss": 0.4009, + "step": 7091 + }, + { + "epoch": 3.353191489361702, + "grad_norm": 3.3684141635894775, + "learning_rate": 2.0673813800699024e-06, + "loss": 0.4434, + "step": 7092 + }, + { + "epoch": 3.353664302600473, + "grad_norm": 2.6549112796783447, + "learning_rate": 2.066766977940987e-06, + "loss": 0.3941, + "step": 7093 + }, + { + "epoch": 3.3541371158392437, + "grad_norm": 2.852935314178467, + "learning_rate": 2.066152602787591e-06, + "loss": 0.4143, + "step": 7094 + }, + { + "epoch": 3.354609929078014, + "grad_norm": 2.9621706008911133, + "learning_rate": 2.0655382546479713e-06, + "loss": 0.4502, + "step": 7095 + }, + { + "epoch": 3.355082742316785, + "grad_norm": 3.2836413383483887, + "learning_rate": 2.064923933560378e-06, + "loss": 0.3993, + "step": 7096 + }, + { + "epoch": 3.3555555555555556, + "grad_norm": 2.8187968730926514, + "learning_rate": 2.0643096395630654e-06, + "loss": 0.3766, + "step": 7097 + }, + { + "epoch": 3.3560283687943264, + "grad_norm": 2.7965118885040283, + "learning_rate": 2.0636953726942803e-06, + "loss": 0.4258, + "step": 7098 + }, + { + "epoch": 3.3565011820330968, + "grad_norm": 3.002030611038208, + "learning_rate": 2.063081132992271e-06, + "loss": 0.3548, + "step": 7099 + }, + { + "epoch": 3.3569739952718676, + "grad_norm": 2.927603006362915, + "learning_rate": 2.0624669204952847e-06, + "loss": 0.3759, + "step": 7100 + }, + { + "epoch": 3.3574468085106384, + "grad_norm": 2.911393165588379, + "learning_rate": 2.061852735241563e-06, + "loss": 0.3599, + "step": 7101 + }, + { + "epoch": 3.357919621749409, + "grad_norm": 3.0596864223480225, + "learning_rate": 2.0612385772693517e-06, + "loss": 0.3557, + "step": 7102 + }, + { + "epoch": 3.3583924349881795, + "grad_norm": 2.7869808673858643, + "learning_rate": 2.0606244466168905e-06, + "loss": 0.3696, + "step": 7103 + }, + { + "epoch": 3.3588652482269503, + "grad_norm": 2.927715539932251, + "learning_rate": 2.060010343322417e-06, + "loss": 0.3309, + "step": 7104 + }, + { + 
"epoch": 3.359338061465721, + "grad_norm": 3.44653058052063, + "learning_rate": 2.059396267424171e-06, + "loss": 0.4453, + "step": 7105 + }, + { + "epoch": 3.359810874704492, + "grad_norm": 3.047652244567871, + "learning_rate": 2.0587822189603873e-06, + "loss": 0.3615, + "step": 7106 + }, + { + "epoch": 3.3602836879432623, + "grad_norm": 2.6640517711639404, + "learning_rate": 2.0581681979693002e-06, + "loss": 0.3716, + "step": 7107 + }, + { + "epoch": 3.360756501182033, + "grad_norm": 2.8253493309020996, + "learning_rate": 2.0575542044891424e-06, + "loss": 0.3485, + "step": 7108 + }, + { + "epoch": 3.361229314420804, + "grad_norm": 3.0512938499450684, + "learning_rate": 2.0569402385581433e-06, + "loss": 0.4582, + "step": 7109 + }, + { + "epoch": 3.3617021276595747, + "grad_norm": 2.935060739517212, + "learning_rate": 2.0563263002145333e-06, + "loss": 0.425, + "step": 7110 + }, + { + "epoch": 3.362174940898345, + "grad_norm": 3.2708780765533447, + "learning_rate": 2.0557123894965396e-06, + "loss": 0.4193, + "step": 7111 + }, + { + "epoch": 3.362647754137116, + "grad_norm": 2.758329391479492, + "learning_rate": 2.055098506442386e-06, + "loss": 0.3754, + "step": 7112 + }, + { + "epoch": 3.3631205673758866, + "grad_norm": 3.0359015464782715, + "learning_rate": 2.0544846510902987e-06, + "loss": 0.4207, + "step": 7113 + }, + { + "epoch": 3.3635933806146574, + "grad_norm": 3.096968412399292, + "learning_rate": 2.0538708234784983e-06, + "loss": 0.4303, + "step": 7114 + }, + { + "epoch": 3.3640661938534278, + "grad_norm": 3.0777673721313477, + "learning_rate": 2.053257023645206e-06, + "loss": 0.3904, + "step": 7115 + }, + { + "epoch": 3.3645390070921986, + "grad_norm": 2.9483232498168945, + "learning_rate": 2.0526432516286394e-06, + "loss": 0.3949, + "step": 7116 + }, + { + "epoch": 3.3650118203309693, + "grad_norm": 2.839067220687866, + "learning_rate": 2.0520295074670154e-06, + "loss": 0.3705, + "step": 7117 + }, + { + "epoch": 3.36548463356974, + "grad_norm": 3.0450778007507324, + "learning_rate": 2.0514157911985506e-06, + "loss": 0.3987, + "step": 7118 + }, + { + "epoch": 3.3659574468085105, + "grad_norm": 3.425318717956543, + "learning_rate": 2.0508021028614564e-06, + "loss": 0.3941, + "step": 7119 + }, + { + "epoch": 3.3664302600472813, + "grad_norm": 2.9509286880493164, + "learning_rate": 2.0501884424939465e-06, + "loss": 0.354, + "step": 7120 + }, + { + "epoch": 3.366903073286052, + "grad_norm": 2.799504518508911, + "learning_rate": 2.0495748101342303e-06, + "loss": 0.3891, + "step": 7121 + }, + { + "epoch": 3.3673758865248224, + "grad_norm": 2.9140994548797607, + "learning_rate": 2.048961205820515e-06, + "loss": 0.3638, + "step": 7122 + }, + { + "epoch": 3.3678486997635932, + "grad_norm": 2.8074216842651367, + "learning_rate": 2.0483476295910077e-06, + "loss": 0.3501, + "step": 7123 + }, + { + "epoch": 3.368321513002364, + "grad_norm": 2.770829677581787, + "learning_rate": 2.0477340814839126e-06, + "loss": 0.3774, + "step": 7124 + }, + { + "epoch": 3.368794326241135, + "grad_norm": 2.581655502319336, + "learning_rate": 2.047120561537434e-06, + "loss": 0.3523, + "step": 7125 + }, + { + "epoch": 3.3692671394799056, + "grad_norm": 3.4234209060668945, + "learning_rate": 2.046507069789772e-06, + "loss": 0.4191, + "step": 7126 + }, + { + "epoch": 3.369739952718676, + "grad_norm": 2.669860601425171, + "learning_rate": 2.045893606279126e-06, + "loss": 0.3542, + "step": 7127 + }, + { + "epoch": 3.370212765957447, + "grad_norm": 3.2426629066467285, + "learning_rate": 2.045280171043694e-06, + "loss": 
0.4416, + "step": 7128 + }, + { + "epoch": 3.3706855791962176, + "grad_norm": 3.1318910121917725, + "learning_rate": 2.044666764121672e-06, + "loss": 0.3999, + "step": 7129 + }, + { + "epoch": 3.371158392434988, + "grad_norm": 2.7044012546539307, + "learning_rate": 2.044053385551254e-06, + "loss": 0.3907, + "step": 7130 + }, + { + "epoch": 3.3716312056737587, + "grad_norm": 2.9429895877838135, + "learning_rate": 2.0434400353706322e-06, + "loss": 0.3827, + "step": 7131 + }, + { + "epoch": 3.3721040189125295, + "grad_norm": 2.7258787155151367, + "learning_rate": 2.0428267136179973e-06, + "loss": 0.3688, + "step": 7132 + }, + { + "epoch": 3.3725768321513003, + "grad_norm": 2.765108108520508, + "learning_rate": 2.042213420331539e-06, + "loss": 0.4078, + "step": 7133 + }, + { + "epoch": 3.373049645390071, + "grad_norm": 3.2951347827911377, + "learning_rate": 2.0416001555494435e-06, + "loss": 0.4259, + "step": 7134 + }, + { + "epoch": 3.3735224586288415, + "grad_norm": 3.3917062282562256, + "learning_rate": 2.040986919309895e-06, + "loss": 0.5094, + "step": 7135 + }, + { + "epoch": 3.3739952718676123, + "grad_norm": 2.746434450149536, + "learning_rate": 2.04037371165108e-06, + "loss": 0.3513, + "step": 7136 + }, + { + "epoch": 3.374468085106383, + "grad_norm": 3.268731117248535, + "learning_rate": 2.0397605326111774e-06, + "loss": 0.3909, + "step": 7137 + }, + { + "epoch": 3.3749408983451534, + "grad_norm": 2.8498165607452393, + "learning_rate": 2.0391473822283692e-06, + "loss": 0.3657, + "step": 7138 + }, + { + "epoch": 3.3754137115839242, + "grad_norm": 2.855966567993164, + "learning_rate": 2.0385342605408325e-06, + "loss": 0.3927, + "step": 7139 + }, + { + "epoch": 3.375886524822695, + "grad_norm": 3.1839048862457275, + "learning_rate": 2.0379211675867438e-06, + "loss": 0.4476, + "step": 7140 + }, + { + "epoch": 3.376359338061466, + "grad_norm": 2.9379947185516357, + "learning_rate": 2.037308103404278e-06, + "loss": 0.3657, + "step": 7141 + }, + { + "epoch": 3.3768321513002366, + "grad_norm": 2.9251210689544678, + "learning_rate": 2.0366950680316073e-06, + "loss": 0.3975, + "step": 7142 + }, + { + "epoch": 3.377304964539007, + "grad_norm": 2.811885118484497, + "learning_rate": 2.036082061506904e-06, + "loss": 0.3064, + "step": 7143 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 2.755229949951172, + "learning_rate": 2.0354690838683363e-06, + "loss": 0.3328, + "step": 7144 + }, + { + "epoch": 3.3782505910165486, + "grad_norm": 3.006819725036621, + "learning_rate": 2.0348561351540706e-06, + "loss": 0.4168, + "step": 7145 + }, + { + "epoch": 3.378723404255319, + "grad_norm": 2.8788509368896484, + "learning_rate": 2.034243215402275e-06, + "loss": 0.4123, + "step": 7146 + }, + { + "epoch": 3.3791962174940897, + "grad_norm": 2.9732980728149414, + "learning_rate": 2.033630324651112e-06, + "loss": 0.3371, + "step": 7147 + }, + { + "epoch": 3.3796690307328605, + "grad_norm": 2.7731754779815674, + "learning_rate": 2.033017462938744e-06, + "loss": 0.382, + "step": 7148 + }, + { + "epoch": 3.3801418439716313, + "grad_norm": 2.766395092010498, + "learning_rate": 2.032404630303331e-06, + "loss": 0.3295, + "step": 7149 + }, + { + "epoch": 3.380614657210402, + "grad_norm": 3.197960138320923, + "learning_rate": 2.03179182678303e-06, + "loss": 0.354, + "step": 7150 + }, + { + "epoch": 3.3810874704491725, + "grad_norm": 3.048553228378296, + "learning_rate": 2.031179052416e-06, + "loss": 0.4027, + "step": 7151 + }, + { + "epoch": 3.3815602836879433, + "grad_norm": 3.1527998447418213, + "learning_rate": 
2.0305663072403934e-06, + "loss": 0.4229, + "step": 7152 + }, + { + "epoch": 3.382033096926714, + "grad_norm": 3.0407028198242188, + "learning_rate": 2.029953591294366e-06, + "loss": 0.4254, + "step": 7153 + }, + { + "epoch": 3.3825059101654844, + "grad_norm": 2.7170357704162598, + "learning_rate": 2.0293409046160673e-06, + "loss": 0.3307, + "step": 7154 + }, + { + "epoch": 3.382978723404255, + "grad_norm": 3.0128726959228516, + "learning_rate": 2.028728247243646e-06, + "loss": 0.3873, + "step": 7155 + }, + { + "epoch": 3.383451536643026, + "grad_norm": 4.861877918243408, + "learning_rate": 2.0281156192152507e-06, + "loss": 0.4371, + "step": 7156 + }, + { + "epoch": 3.383924349881797, + "grad_norm": 2.890249252319336, + "learning_rate": 2.0275030205690257e-06, + "loss": 0.3899, + "step": 7157 + }, + { + "epoch": 3.3843971631205676, + "grad_norm": 3.0774779319763184, + "learning_rate": 2.026890451343117e-06, + "loss": 0.4151, + "step": 7158 + }, + { + "epoch": 3.384869976359338, + "grad_norm": 2.8705947399139404, + "learning_rate": 2.026277911575665e-06, + "loss": 0.4004, + "step": 7159 + }, + { + "epoch": 3.3853427895981087, + "grad_norm": 3.170760154724121, + "learning_rate": 2.0256654013048096e-06, + "loss": 0.4442, + "step": 7160 + }, + { + "epoch": 3.3858156028368795, + "grad_norm": 4.211156368255615, + "learning_rate": 2.0250529205686905e-06, + "loss": 0.4605, + "step": 7161 + }, + { + "epoch": 3.38628841607565, + "grad_norm": 2.513519287109375, + "learning_rate": 2.0244404694054435e-06, + "loss": 0.3506, + "step": 7162 + }, + { + "epoch": 3.3867612293144207, + "grad_norm": 3.1558821201324463, + "learning_rate": 2.023828047853203e-06, + "loss": 0.43, + "step": 7163 + }, + { + "epoch": 3.3872340425531915, + "grad_norm": 3.6770291328430176, + "learning_rate": 2.023215655950102e-06, + "loss": 0.3911, + "step": 7164 + }, + { + "epoch": 3.3877068557919623, + "grad_norm": 2.6544485092163086, + "learning_rate": 2.022603293734271e-06, + "loss": 0.3306, + "step": 7165 + }, + { + "epoch": 3.388179669030733, + "grad_norm": 3.34232759475708, + "learning_rate": 2.0219909612438405e-06, + "loss": 0.4233, + "step": 7166 + }, + { + "epoch": 3.3886524822695034, + "grad_norm": 3.388561725616455, + "learning_rate": 2.0213786585169363e-06, + "loss": 0.4171, + "step": 7167 + }, + { + "epoch": 3.3891252955082742, + "grad_norm": 2.8606953620910645, + "learning_rate": 2.020766385591684e-06, + "loss": 0.3864, + "step": 7168 + }, + { + "epoch": 3.389598108747045, + "grad_norm": 3.0135979652404785, + "learning_rate": 2.020154142506208e-06, + "loss": 0.3933, + "step": 7169 + }, + { + "epoch": 3.3900709219858154, + "grad_norm": 2.5003163814544678, + "learning_rate": 2.0195419292986294e-06, + "loss": 0.3852, + "step": 7170 + }, + { + "epoch": 3.390543735224586, + "grad_norm": 2.8591368198394775, + "learning_rate": 2.0189297460070685e-06, + "loss": 0.3962, + "step": 7171 + }, + { + "epoch": 3.391016548463357, + "grad_norm": 2.8830223083496094, + "learning_rate": 2.0183175926696427e-06, + "loss": 0.3632, + "step": 7172 + }, + { + "epoch": 3.391489361702128, + "grad_norm": 3.3904542922973633, + "learning_rate": 2.0177054693244674e-06, + "loss": 0.4284, + "step": 7173 + }, + { + "epoch": 3.3919621749408986, + "grad_norm": 3.0325920581817627, + "learning_rate": 2.0170933760096585e-06, + "loss": 0.4331, + "step": 7174 + }, + { + "epoch": 3.392434988179669, + "grad_norm": 2.60345196723938, + "learning_rate": 2.016481312763327e-06, + "loss": 0.4077, + "step": 7175 + }, + { + "epoch": 3.3929078014184397, + "grad_norm": 
2.8146891593933105, + "learning_rate": 2.0158692796235845e-06, + "loss": 0.4224, + "step": 7176 + }, + { + "epoch": 3.3933806146572105, + "grad_norm": 2.8158490657806396, + "learning_rate": 2.0152572766285396e-06, + "loss": 0.3454, + "step": 7177 + }, + { + "epoch": 3.393853427895981, + "grad_norm": 3.2753400802612305, + "learning_rate": 2.0146453038162978e-06, + "loss": 0.3615, + "step": 7178 + }, + { + "epoch": 3.3943262411347517, + "grad_norm": 3.0527124404907227, + "learning_rate": 2.0140333612249655e-06, + "loss": 0.415, + "step": 7179 + }, + { + "epoch": 3.3947990543735225, + "grad_norm": 2.6813764572143555, + "learning_rate": 2.0134214488926435e-06, + "loss": 0.3391, + "step": 7180 + }, + { + "epoch": 3.3952718676122933, + "grad_norm": 2.809319496154785, + "learning_rate": 2.0128095668574356e-06, + "loss": 0.3123, + "step": 7181 + }, + { + "epoch": 3.395744680851064, + "grad_norm": 2.6619064807891846, + "learning_rate": 2.0121977151574396e-06, + "loss": 0.4222, + "step": 7182 + }, + { + "epoch": 3.3962174940898344, + "grad_norm": 2.9201200008392334, + "learning_rate": 2.0115858938307516e-06, + "loss": 0.3712, + "step": 7183 + }, + { + "epoch": 3.396690307328605, + "grad_norm": 3.2058637142181396, + "learning_rate": 2.0109741029154696e-06, + "loss": 0.4004, + "step": 7184 + }, + { + "epoch": 3.397163120567376, + "grad_norm": 2.821855306625366, + "learning_rate": 2.0103623424496862e-06, + "loss": 0.4053, + "step": 7185 + }, + { + "epoch": 3.3976359338061464, + "grad_norm": 3.0371549129486084, + "learning_rate": 2.009750612471492e-06, + "loss": 0.4246, + "step": 7186 + }, + { + "epoch": 3.398108747044917, + "grad_norm": 2.8827290534973145, + "learning_rate": 2.009138913018978e-06, + "loss": 0.3256, + "step": 7187 + }, + { + "epoch": 3.398581560283688, + "grad_norm": 3.168039560317993, + "learning_rate": 2.0085272441302305e-06, + "loss": 0.4233, + "step": 7188 + }, + { + "epoch": 3.3990543735224588, + "grad_norm": 3.259723663330078, + "learning_rate": 2.0079156058433374e-06, + "loss": 0.4168, + "step": 7189 + }, + { + "epoch": 3.3995271867612296, + "grad_norm": 2.456231117248535, + "learning_rate": 2.007303998196382e-06, + "loss": 0.3383, + "step": 7190 + }, + { + "epoch": 3.4, + "grad_norm": 2.735180377960205, + "learning_rate": 2.006692421227445e-06, + "loss": 0.3475, + "step": 7191 + }, + { + "epoch": 3.4004728132387707, + "grad_norm": 2.76263427734375, + "learning_rate": 2.006080874974609e-06, + "loss": 0.3651, + "step": 7192 + }, + { + "epoch": 3.4009456264775415, + "grad_norm": 3.36867094039917, + "learning_rate": 2.0054693594759504e-06, + "loss": 0.4479, + "step": 7193 + }, + { + "epoch": 3.401418439716312, + "grad_norm": 2.532167673110962, + "learning_rate": 2.004857874769547e-06, + "loss": 0.3818, + "step": 7194 + }, + { + "epoch": 3.4018912529550827, + "grad_norm": 2.8723537921905518, + "learning_rate": 2.0042464208934724e-06, + "loss": 0.3332, + "step": 7195 + }, + { + "epoch": 3.4023640661938535, + "grad_norm": 2.676460027694702, + "learning_rate": 2.0036349978857987e-06, + "loss": 0.3488, + "step": 7196 + }, + { + "epoch": 3.4028368794326243, + "grad_norm": 2.805851459503174, + "learning_rate": 2.0030236057845983e-06, + "loss": 0.3796, + "step": 7197 + }, + { + "epoch": 3.403309692671395, + "grad_norm": 2.688988447189331, + "learning_rate": 2.0024122446279377e-06, + "loss": 0.3707, + "step": 7198 + }, + { + "epoch": 3.4037825059101654, + "grad_norm": 3.118720293045044, + "learning_rate": 2.0018009144538853e-06, + "loss": 0.4064, + "step": 7199 + }, + { + "epoch": 
3.404255319148936, + "grad_norm": 2.876507520675659, + "learning_rate": 2.001189615300506e-06, + "loss": 0.3543, + "step": 7200 + }, + { + "epoch": 3.404728132387707, + "grad_norm": 3.0043466091156006, + "learning_rate": 2.000578347205861e-06, + "loss": 0.3833, + "step": 7201 + }, + { + "epoch": 3.4052009456264773, + "grad_norm": 3.1057114601135254, + "learning_rate": 1.9999671102080133e-06, + "loss": 0.4154, + "step": 7202 + }, + { + "epoch": 3.405673758865248, + "grad_norm": 2.9791855812072754, + "learning_rate": 1.9993559043450202e-06, + "loss": 0.3865, + "step": 7203 + }, + { + "epoch": 3.406146572104019, + "grad_norm": 3.4403460025787354, + "learning_rate": 1.9987447296549407e-06, + "loss": 0.3883, + "step": 7204 + }, + { + "epoch": 3.4066193853427897, + "grad_norm": 2.9962027072906494, + "learning_rate": 1.998133586175829e-06, + "loss": 0.3796, + "step": 7205 + }, + { + "epoch": 3.40709219858156, + "grad_norm": 3.0613129138946533, + "learning_rate": 1.997522473945737e-06, + "loss": 0.3917, + "step": 7206 + }, + { + "epoch": 3.407565011820331, + "grad_norm": 3.065985679626465, + "learning_rate": 1.996911393002718e-06, + "loss": 0.3521, + "step": 7207 + }, + { + "epoch": 3.4080378250591017, + "grad_norm": 2.976177930831909, + "learning_rate": 1.996300343384821e-06, + "loss": 0.3852, + "step": 7208 + }, + { + "epoch": 3.4085106382978725, + "grad_norm": 3.3587961196899414, + "learning_rate": 1.995689325130092e-06, + "loss": 0.3947, + "step": 7209 + }, + { + "epoch": 3.408983451536643, + "grad_norm": 2.626983165740967, + "learning_rate": 1.995078338276578e-06, + "loss": 0.316, + "step": 7210 + }, + { + "epoch": 3.4094562647754136, + "grad_norm": 3.14713978767395, + "learning_rate": 1.9944673828623217e-06, + "loss": 0.4008, + "step": 7211 + }, + { + "epoch": 3.4099290780141844, + "grad_norm": 2.968918800354004, + "learning_rate": 1.993856458925365e-06, + "loss": 0.439, + "step": 7212 + }, + { + "epoch": 3.4104018912529552, + "grad_norm": 2.7724127769470215, + "learning_rate": 1.9932455665037476e-06, + "loss": 0.3941, + "step": 7213 + }, + { + "epoch": 3.4108747044917256, + "grad_norm": 2.963146448135376, + "learning_rate": 1.9926347056355057e-06, + "loss": 0.3893, + "step": 7214 + }, + { + "epoch": 3.4113475177304964, + "grad_norm": 2.791637420654297, + "learning_rate": 1.9920238763586765e-06, + "loss": 0.4068, + "step": 7215 + }, + { + "epoch": 3.411820330969267, + "grad_norm": 3.030275583267212, + "learning_rate": 1.9914130787112924e-06, + "loss": 0.3828, + "step": 7216 + }, + { + "epoch": 3.412293144208038, + "grad_norm": 3.113128900527954, + "learning_rate": 1.990802312731387e-06, + "loss": 0.3903, + "step": 7217 + }, + { + "epoch": 3.4127659574468083, + "grad_norm": 3.104170322418213, + "learning_rate": 1.9901915784569884e-06, + "loss": 0.4171, + "step": 7218 + }, + { + "epoch": 3.413238770685579, + "grad_norm": 3.1247572898864746, + "learning_rate": 1.989580875926125e-06, + "loss": 0.4022, + "step": 7219 + }, + { + "epoch": 3.41371158392435, + "grad_norm": 2.9487457275390625, + "learning_rate": 1.988970205176822e-06, + "loss": 0.3948, + "step": 7220 + }, + { + "epoch": 3.4141843971631207, + "grad_norm": 2.8763654232025146, + "learning_rate": 1.9883595662471028e-06, + "loss": 0.3588, + "step": 7221 + }, + { + "epoch": 3.414657210401891, + "grad_norm": 2.563152551651001, + "learning_rate": 1.987748959174991e-06, + "loss": 0.3509, + "step": 7222 + }, + { + "epoch": 3.415130023640662, + "grad_norm": 3.148759365081787, + "learning_rate": 1.9871383839985053e-06, + "loss": 0.4364, + "step": 
7223 + }, + { + "epoch": 3.4156028368794327, + "grad_norm": 2.8187363147735596, + "learning_rate": 1.986527840755663e-06, + "loss": 0.3803, + "step": 7224 + }, + { + "epoch": 3.4160756501182035, + "grad_norm": 3.009376287460327, + "learning_rate": 1.985917329484481e-06, + "loss": 0.3841, + "step": 7225 + }, + { + "epoch": 3.416548463356974, + "grad_norm": 2.869291067123413, + "learning_rate": 1.985306850222972e-06, + "loss": 0.3877, + "step": 7226 + }, + { + "epoch": 3.4170212765957446, + "grad_norm": 3.108461856842041, + "learning_rate": 1.9846964030091497e-06, + "loss": 0.3767, + "step": 7227 + }, + { + "epoch": 3.4174940898345154, + "grad_norm": 3.096320629119873, + "learning_rate": 1.9840859878810226e-06, + "loss": 0.4603, + "step": 7228 + }, + { + "epoch": 3.417966903073286, + "grad_norm": 2.8519909381866455, + "learning_rate": 1.983475604876598e-06, + "loss": 0.3263, + "step": 7229 + }, + { + "epoch": 3.4184397163120566, + "grad_norm": 3.192051410675049, + "learning_rate": 1.9828652540338835e-06, + "loss": 0.4132, + "step": 7230 + }, + { + "epoch": 3.4189125295508274, + "grad_norm": 3.0398056507110596, + "learning_rate": 1.9822549353908817e-06, + "loss": 0.4038, + "step": 7231 + }, + { + "epoch": 3.419385342789598, + "grad_norm": 3.12247896194458, + "learning_rate": 1.9816446489855944e-06, + "loss": 0.409, + "step": 7232 + }, + { + "epoch": 3.419858156028369, + "grad_norm": 3.20316481590271, + "learning_rate": 1.9810343948560223e-06, + "loss": 0.4058, + "step": 7233 + }, + { + "epoch": 3.4203309692671393, + "grad_norm": 3.3397457599639893, + "learning_rate": 1.9804241730401625e-06, + "loss": 0.3657, + "step": 7234 + }, + { + "epoch": 3.42080378250591, + "grad_norm": 3.928691864013672, + "learning_rate": 1.979813983576012e-06, + "loss": 0.361, + "step": 7235 + }, + { + "epoch": 3.421276595744681, + "grad_norm": 3.5814051628112793, + "learning_rate": 1.9792038265015635e-06, + "loss": 0.3975, + "step": 7236 + }, + { + "epoch": 3.4217494089834517, + "grad_norm": 2.8578879833221436, + "learning_rate": 1.9785937018548086e-06, + "loss": 0.3915, + "step": 7237 + }, + { + "epoch": 3.422222222222222, + "grad_norm": 3.0343220233917236, + "learning_rate": 1.977983609673738e-06, + "loss": 0.3686, + "step": 7238 + }, + { + "epoch": 3.422695035460993, + "grad_norm": 3.2719056606292725, + "learning_rate": 1.977373549996338e-06, + "loss": 0.3905, + "step": 7239 + }, + { + "epoch": 3.4231678486997636, + "grad_norm": 2.6638169288635254, + "learning_rate": 1.976763522860597e-06, + "loss": 0.3631, + "step": 7240 + }, + { + "epoch": 3.4236406619385344, + "grad_norm": 2.7679927349090576, + "learning_rate": 1.9761535283044967e-06, + "loss": 0.377, + "step": 7241 + }, + { + "epoch": 3.424113475177305, + "grad_norm": 2.774540424346924, + "learning_rate": 1.975543566366019e-06, + "loss": 0.3509, + "step": 7242 + }, + { + "epoch": 3.4245862884160756, + "grad_norm": 2.811659336090088, + "learning_rate": 1.9749336370831438e-06, + "loss": 0.3835, + "step": 7243 + }, + { + "epoch": 3.4250591016548464, + "grad_norm": 2.8533360958099365, + "learning_rate": 1.9743237404938478e-06, + "loss": 0.3765, + "step": 7244 + }, + { + "epoch": 3.425531914893617, + "grad_norm": 2.712301015853882, + "learning_rate": 1.9737138766361084e-06, + "loss": 0.3797, + "step": 7245 + }, + { + "epoch": 3.4260047281323875, + "grad_norm": 2.9763426780700684, + "learning_rate": 1.9731040455478986e-06, + "loss": 0.4223, + "step": 7246 + }, + { + "epoch": 3.4264775413711583, + "grad_norm": 2.8802297115325928, + "learning_rate": 
1.9724942472671882e-06, + "loss": 0.3666, + "step": 7247 + }, + { + "epoch": 3.426950354609929, + "grad_norm": 2.934107542037964, + "learning_rate": 1.9718844818319486e-06, + "loss": 0.3612, + "step": 7248 + }, + { + "epoch": 3.4274231678487, + "grad_norm": 3.0172696113586426, + "learning_rate": 1.9712747492801467e-06, + "loss": 0.3643, + "step": 7249 + }, + { + "epoch": 3.4278959810874703, + "grad_norm": 3.368419647216797, + "learning_rate": 1.970665049649748e-06, + "loss": 0.4511, + "step": 7250 + }, + { + "epoch": 3.428368794326241, + "grad_norm": 3.077819585800171, + "learning_rate": 1.9700553829787162e-06, + "loss": 0.4013, + "step": 7251 + }, + { + "epoch": 3.428841607565012, + "grad_norm": 2.690673828125, + "learning_rate": 1.96944574930501e-06, + "loss": 0.3776, + "step": 7252 + }, + { + "epoch": 3.4293144208037827, + "grad_norm": 3.1122169494628906, + "learning_rate": 1.9688361486665924e-06, + "loss": 0.3802, + "step": 7253 + }, + { + "epoch": 3.429787234042553, + "grad_norm": 2.9874207973480225, + "learning_rate": 1.968226581101417e-06, + "loss": 0.4492, + "step": 7254 + }, + { + "epoch": 3.430260047281324, + "grad_norm": 2.885493278503418, + "learning_rate": 1.967617046647442e-06, + "loss": 0.3958, + "step": 7255 + }, + { + "epoch": 3.4307328605200946, + "grad_norm": 2.953897476196289, + "learning_rate": 1.9670075453426195e-06, + "loss": 0.3973, + "step": 7256 + }, + { + "epoch": 3.4312056737588654, + "grad_norm": 2.685088634490967, + "learning_rate": 1.966398077224899e-06, + "loss": 0.393, + "step": 7257 + }, + { + "epoch": 3.431678486997636, + "grad_norm": 4.035208702087402, + "learning_rate": 1.9657886423322313e-06, + "loss": 0.4263, + "step": 7258 + }, + { + "epoch": 3.4321513002364066, + "grad_norm": 2.942042827606201, + "learning_rate": 1.965179240702562e-06, + "loss": 0.4319, + "step": 7259 + }, + { + "epoch": 3.4326241134751774, + "grad_norm": 3.0794999599456787, + "learning_rate": 1.9645698723738356e-06, + "loss": 0.4199, + "step": 7260 + }, + { + "epoch": 3.433096926713948, + "grad_norm": 3.0653584003448486, + "learning_rate": 1.963960537383996e-06, + "loss": 0.3723, + "step": 7261 + }, + { + "epoch": 3.4335697399527185, + "grad_norm": 3.1571545600891113, + "learning_rate": 1.963351235770983e-06, + "loss": 0.4211, + "step": 7262 + }, + { + "epoch": 3.4340425531914893, + "grad_norm": 2.6681735515594482, + "learning_rate": 1.962741967572736e-06, + "loss": 0.3333, + "step": 7263 + }, + { + "epoch": 3.43451536643026, + "grad_norm": 2.9747934341430664, + "learning_rate": 1.9621327328271907e-06, + "loss": 0.3896, + "step": 7264 + }, + { + "epoch": 3.434988179669031, + "grad_norm": 2.7994508743286133, + "learning_rate": 1.9615235315722814e-06, + "loss": 0.3642, + "step": 7265 + }, + { + "epoch": 3.4354609929078013, + "grad_norm": 2.933928966522217, + "learning_rate": 1.9609143638459405e-06, + "loss": 0.3955, + "step": 7266 + }, + { + "epoch": 3.435933806146572, + "grad_norm": 2.9577367305755615, + "learning_rate": 1.9603052296860983e-06, + "loss": 0.3437, + "step": 7267 + }, + { + "epoch": 3.436406619385343, + "grad_norm": 3.017282009124756, + "learning_rate": 1.959696129130684e-06, + "loss": 0.3784, + "step": 7268 + }, + { + "epoch": 3.4368794326241137, + "grad_norm": 3.2072815895080566, + "learning_rate": 1.959087062217622e-06, + "loss": 0.3901, + "step": 7269 + }, + { + "epoch": 3.437352245862884, + "grad_norm": 2.91153621673584, + "learning_rate": 1.9584780289848358e-06, + "loss": 0.4402, + "step": 7270 + }, + { + "epoch": 3.437825059101655, + "grad_norm": 
2.846842050552368, + "learning_rate": 1.9578690294702495e-06, + "loss": 0.3804, + "step": 7271 + }, + { + "epoch": 3.4382978723404256, + "grad_norm": 3.0958521366119385, + "learning_rate": 1.957260063711781e-06, + "loss": 0.4103, + "step": 7272 + }, + { + "epoch": 3.4387706855791964, + "grad_norm": 2.9808530807495117, + "learning_rate": 1.9566511317473483e-06, + "loss": 0.4127, + "step": 7273 + }, + { + "epoch": 3.4392434988179668, + "grad_norm": 2.725851058959961, + "learning_rate": 1.9560422336148678e-06, + "loss": 0.3493, + "step": 7274 + }, + { + "epoch": 3.4397163120567376, + "grad_norm": 2.7861814498901367, + "learning_rate": 1.9554333693522515e-06, + "loss": 0.3703, + "step": 7275 + }, + { + "epoch": 3.4401891252955084, + "grad_norm": 3.128708839416504, + "learning_rate": 1.954824538997412e-06, + "loss": 0.3917, + "step": 7276 + }, + { + "epoch": 3.440661938534279, + "grad_norm": 3.117403268814087, + "learning_rate": 1.954215742588257e-06, + "loss": 0.3581, + "step": 7277 + }, + { + "epoch": 3.4411347517730495, + "grad_norm": 2.710076093673706, + "learning_rate": 1.9536069801626957e-06, + "loss": 0.3255, + "step": 7278 + }, + { + "epoch": 3.4416075650118203, + "grad_norm": 2.7732627391815186, + "learning_rate": 1.952998251758632e-06, + "loss": 0.375, + "step": 7279 + }, + { + "epoch": 3.442080378250591, + "grad_norm": 2.896050453186035, + "learning_rate": 1.9523895574139673e-06, + "loss": 0.4087, + "step": 7280 + }, + { + "epoch": 3.4425531914893615, + "grad_norm": 2.9051663875579834, + "learning_rate": 1.9517808971666048e-06, + "loss": 0.3423, + "step": 7281 + }, + { + "epoch": 3.4430260047281322, + "grad_norm": 3.0232038497924805, + "learning_rate": 1.9511722710544417e-06, + "loss": 0.364, + "step": 7282 + }, + { + "epoch": 3.443498817966903, + "grad_norm": 2.753870725631714, + "learning_rate": 1.9505636791153744e-06, + "loss": 0.3484, + "step": 7283 + }, + { + "epoch": 3.443971631205674, + "grad_norm": 2.944079637527466, + "learning_rate": 1.9499551213872983e-06, + "loss": 0.3354, + "step": 7284 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 3.1531970500946045, + "learning_rate": 1.949346597908104e-06, + "loss": 0.3394, + "step": 7285 + }, + { + "epoch": 3.444917257683215, + "grad_norm": 3.0357189178466797, + "learning_rate": 1.948738108715683e-06, + "loss": 0.4302, + "step": 7286 + }, + { + "epoch": 3.445390070921986, + "grad_norm": 3.3698086738586426, + "learning_rate": 1.948129653847923e-06, + "loss": 0.419, + "step": 7287 + }, + { + "epoch": 3.4458628841607566, + "grad_norm": 3.343132495880127, + "learning_rate": 1.947521233342709e-06, + "loss": 0.3895, + "step": 7288 + }, + { + "epoch": 3.446335697399527, + "grad_norm": 3.1905252933502197, + "learning_rate": 1.9469128472379257e-06, + "loss": 0.429, + "step": 7289 + }, + { + "epoch": 3.4468085106382977, + "grad_norm": 2.8517212867736816, + "learning_rate": 1.946304495571454e-06, + "loss": 0.3513, + "step": 7290 + }, + { + "epoch": 3.4472813238770685, + "grad_norm": 2.7713496685028076, + "learning_rate": 1.9456961783811735e-06, + "loss": 0.4331, + "step": 7291 + }, + { + "epoch": 3.4477541371158393, + "grad_norm": 2.8258652687072754, + "learning_rate": 1.945087895704962e-06, + "loss": 0.3539, + "step": 7292 + }, + { + "epoch": 3.44822695035461, + "grad_norm": 2.757322072982788, + "learning_rate": 1.9444796475806925e-06, + "loss": 0.3865, + "step": 7293 + }, + { + "epoch": 3.4486997635933805, + "grad_norm": 2.8410696983337402, + "learning_rate": 1.943871434046241e-06, + "loss": 0.3612, + "step": 7294 + }, + { + "epoch": 
3.4491725768321513, + "grad_norm": 3.2297637462615967, + "learning_rate": 1.9432632551394753e-06, + "loss": 0.3956, + "step": 7295 + }, + { + "epoch": 3.449645390070922, + "grad_norm": 2.991351842880249, + "learning_rate": 1.9426551108982666e-06, + "loss": 0.3864, + "step": 7296 + }, + { + "epoch": 3.4501182033096924, + "grad_norm": 2.7942168712615967, + "learning_rate": 1.94204700136048e-06, + "loss": 0.4314, + "step": 7297 + }, + { + "epoch": 3.4505910165484632, + "grad_norm": 2.8188698291778564, + "learning_rate": 1.9414389265639805e-06, + "loss": 0.3585, + "step": 7298 + }, + { + "epoch": 3.451063829787234, + "grad_norm": 3.2826895713806152, + "learning_rate": 1.9408308865466295e-06, + "loss": 0.4614, + "step": 7299 + }, + { + "epoch": 3.451536643026005, + "grad_norm": 3.273867130279541, + "learning_rate": 1.9402228813462865e-06, + "loss": 0.3533, + "step": 7300 + }, + { + "epoch": 3.4520094562647756, + "grad_norm": 3.5334157943725586, + "learning_rate": 1.939614911000811e-06, + "loss": 0.4088, + "step": 7301 + }, + { + "epoch": 3.452482269503546, + "grad_norm": 2.983908176422119, + "learning_rate": 1.9390069755480583e-06, + "loss": 0.3725, + "step": 7302 + }, + { + "epoch": 3.4529550827423168, + "grad_norm": 2.893660306930542, + "learning_rate": 1.93839907502588e-06, + "loss": 0.3746, + "step": 7303 + }, + { + "epoch": 3.4534278959810876, + "grad_norm": 3.1762871742248535, + "learning_rate": 1.9377912094721295e-06, + "loss": 0.446, + "step": 7304 + }, + { + "epoch": 3.453900709219858, + "grad_norm": 3.3231537342071533, + "learning_rate": 1.9371833789246554e-06, + "loss": 0.4837, + "step": 7305 + }, + { + "epoch": 3.4543735224586287, + "grad_norm": 3.548333168029785, + "learning_rate": 1.936575583421304e-06, + "loss": 0.3911, + "step": 7306 + }, + { + "epoch": 3.4548463356973995, + "grad_norm": 3.0627071857452393, + "learning_rate": 1.9359678229999213e-06, + "loss": 0.3751, + "step": 7307 + }, + { + "epoch": 3.4553191489361703, + "grad_norm": 2.797663927078247, + "learning_rate": 1.9353600976983475e-06, + "loss": 0.41, + "step": 7308 + }, + { + "epoch": 3.455791962174941, + "grad_norm": 2.803269624710083, + "learning_rate": 1.9347524075544258e-06, + "loss": 0.3775, + "step": 7309 + }, + { + "epoch": 3.4562647754137115, + "grad_norm": 2.828010320663452, + "learning_rate": 1.934144752605993e-06, + "loss": 0.375, + "step": 7310 + }, + { + "epoch": 3.4567375886524823, + "grad_norm": 3.456477165222168, + "learning_rate": 1.933537132890884e-06, + "loss": 0.4764, + "step": 7311 + }, + { + "epoch": 3.457210401891253, + "grad_norm": 2.723670244216919, + "learning_rate": 1.9329295484469354e-06, + "loss": 0.3581, + "step": 7312 + }, + { + "epoch": 3.4576832151300234, + "grad_norm": 3.9723474979400635, + "learning_rate": 1.9323219993119766e-06, + "loss": 0.3951, + "step": 7313 + }, + { + "epoch": 3.458156028368794, + "grad_norm": 2.951300859451294, + "learning_rate": 1.931714485523838e-06, + "loss": 0.3865, + "step": 7314 + }, + { + "epoch": 3.458628841607565, + "grad_norm": 2.9265835285186768, + "learning_rate": 1.931107007120347e-06, + "loss": 0.3731, + "step": 7315 + }, + { + "epoch": 3.459101654846336, + "grad_norm": 3.271883249282837, + "learning_rate": 1.930499564139327e-06, + "loss": 0.3971, + "step": 7316 + }, + { + "epoch": 3.4595744680851066, + "grad_norm": 2.8716280460357666, + "learning_rate": 1.929892156618603e-06, + "loss": 0.3332, + "step": 7317 + }, + { + "epoch": 3.460047281323877, + "grad_norm": 2.9820191860198975, + "learning_rate": 1.929284784595993e-06, + "loss": 0.3907, + 
"step": 7318 + }, + { + "epoch": 3.4605200945626478, + "grad_norm": 3.313225269317627, + "learning_rate": 1.9286774481093183e-06, + "loss": 0.3678, + "step": 7319 + }, + { + "epoch": 3.4609929078014185, + "grad_norm": 3.365387439727783, + "learning_rate": 1.928070147196394e-06, + "loss": 0.4894, + "step": 7320 + }, + { + "epoch": 3.461465721040189, + "grad_norm": 3.1723599433898926, + "learning_rate": 1.927462881895033e-06, + "loss": 0.4607, + "step": 7321 + }, + { + "epoch": 3.4619385342789597, + "grad_norm": 2.7644999027252197, + "learning_rate": 1.9268556522430483e-06, + "loss": 0.3627, + "step": 7322 + }, + { + "epoch": 3.4624113475177305, + "grad_norm": 2.65572190284729, + "learning_rate": 1.9262484582782483e-06, + "loss": 0.3893, + "step": 7323 + }, + { + "epoch": 3.4628841607565013, + "grad_norm": 2.992037773132324, + "learning_rate": 1.9256413000384415e-06, + "loss": 0.4175, + "step": 7324 + }, + { + "epoch": 3.463356973995272, + "grad_norm": 3.020496368408203, + "learning_rate": 1.925034177561433e-06, + "loss": 0.42, + "step": 7325 + }, + { + "epoch": 3.4638297872340424, + "grad_norm": 2.780334234237671, + "learning_rate": 1.9244270908850236e-06, + "loss": 0.4195, + "step": 7326 + }, + { + "epoch": 3.4643026004728132, + "grad_norm": 2.863028049468994, + "learning_rate": 1.9238200400470166e-06, + "loss": 0.3706, + "step": 7327 + }, + { + "epoch": 3.464775413711584, + "grad_norm": 3.2766900062561035, + "learning_rate": 1.923213025085209e-06, + "loss": 0.4506, + "step": 7328 + }, + { + "epoch": 3.4652482269503544, + "grad_norm": 2.7300634384155273, + "learning_rate": 1.9226060460373975e-06, + "loss": 0.3463, + "step": 7329 + }, + { + "epoch": 3.465721040189125, + "grad_norm": 3.136104106903076, + "learning_rate": 1.921999102941376e-06, + "loss": 0.3839, + "step": 7330 + }, + { + "epoch": 3.466193853427896, + "grad_norm": 2.944932699203491, + "learning_rate": 1.921392195834934e-06, + "loss": 0.432, + "step": 7331 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 3.428375005722046, + "learning_rate": 1.9207853247558647e-06, + "loss": 0.3407, + "step": 7332 + }, + { + "epoch": 3.4671394799054376, + "grad_norm": 3.3732450008392334, + "learning_rate": 1.9201784897419535e-06, + "loss": 0.361, + "step": 7333 + }, + { + "epoch": 3.467612293144208, + "grad_norm": 2.8291900157928467, + "learning_rate": 1.9195716908309836e-06, + "loss": 0.3805, + "step": 7334 + }, + { + "epoch": 3.4680851063829787, + "grad_norm": 3.3229610919952393, + "learning_rate": 1.9189649280607407e-06, + "loss": 0.3756, + "step": 7335 + }, + { + "epoch": 3.4685579196217495, + "grad_norm": 2.949416160583496, + "learning_rate": 1.918358201469004e-06, + "loss": 0.4316, + "step": 7336 + }, + { + "epoch": 3.46903073286052, + "grad_norm": 3.525501251220703, + "learning_rate": 1.9177515110935515e-06, + "loss": 0.4018, + "step": 7337 + }, + { + "epoch": 3.4695035460992907, + "grad_norm": 3.1439104080200195, + "learning_rate": 1.917144856972159e-06, + "loss": 0.4176, + "step": 7338 + }, + { + "epoch": 3.4699763593380615, + "grad_norm": 3.0022377967834473, + "learning_rate": 1.9165382391426006e-06, + "loss": 0.3962, + "step": 7339 + }, + { + "epoch": 3.4704491725768323, + "grad_norm": 3.2174794673919678, + "learning_rate": 1.9159316576426482e-06, + "loss": 0.441, + "step": 7340 + }, + { + "epoch": 3.470921985815603, + "grad_norm": 2.965123414993286, + "learning_rate": 1.9153251125100694e-06, + "loss": 0.4105, + "step": 7341 + }, + { + "epoch": 3.4713947990543734, + "grad_norm": 2.722904920578003, + "learning_rate": 
1.9147186037826333e-06, + "loss": 0.4102, + "step": 7342 + }, + { + "epoch": 3.4718676122931442, + "grad_norm": 3.4894051551818848, + "learning_rate": 1.9141121314981033e-06, + "loss": 0.4225, + "step": 7343 + }, + { + "epoch": 3.472340425531915, + "grad_norm": 2.828497886657715, + "learning_rate": 1.913505695694241e-06, + "loss": 0.374, + "step": 7344 + }, + { + "epoch": 3.4728132387706854, + "grad_norm": 3.3046014308929443, + "learning_rate": 1.9128992964088077e-06, + "loss": 0.3568, + "step": 7345 + }, + { + "epoch": 3.473286052009456, + "grad_norm": 2.927281618118286, + "learning_rate": 1.9122929336795605e-06, + "loss": 0.4308, + "step": 7346 + }, + { + "epoch": 3.473758865248227, + "grad_norm": 2.9569990634918213, + "learning_rate": 1.911686607544256e-06, + "loss": 0.3226, + "step": 7347 + }, + { + "epoch": 3.4742316784869978, + "grad_norm": 3.1061038970947266, + "learning_rate": 1.9110803180406468e-06, + "loss": 0.4426, + "step": 7348 + }, + { + "epoch": 3.4747044917257686, + "grad_norm": 2.9609580039978027, + "learning_rate": 1.9104740652064825e-06, + "loss": 0.3835, + "step": 7349 + }, + { + "epoch": 3.475177304964539, + "grad_norm": 3.1547608375549316, + "learning_rate": 1.9098678490795147e-06, + "loss": 0.3814, + "step": 7350 + }, + { + "epoch": 3.4756501182033097, + "grad_norm": 2.869022846221924, + "learning_rate": 1.909261669697487e-06, + "loss": 0.4048, + "step": 7351 + }, + { + "epoch": 3.4761229314420805, + "grad_norm": 3.0565078258514404, + "learning_rate": 1.908655527098146e-06, + "loss": 0.3736, + "step": 7352 + }, + { + "epoch": 3.476595744680851, + "grad_norm": 2.893603563308716, + "learning_rate": 1.9080494213192317e-06, + "loss": 0.3906, + "step": 7353 + }, + { + "epoch": 3.4770685579196217, + "grad_norm": 2.818938732147217, + "learning_rate": 1.9074433523984844e-06, + "loss": 0.3958, + "step": 7354 + }, + { + "epoch": 3.4775413711583925, + "grad_norm": 2.675461769104004, + "learning_rate": 1.9068373203736419e-06, + "loss": 0.3371, + "step": 7355 + }, + { + "epoch": 3.4780141843971633, + "grad_norm": 2.5831551551818848, + "learning_rate": 1.9062313252824384e-06, + "loss": 0.3365, + "step": 7356 + }, + { + "epoch": 3.478486997635934, + "grad_norm": 3.299736738204956, + "learning_rate": 1.9056253671626054e-06, + "loss": 0.3923, + "step": 7357 + }, + { + "epoch": 3.4789598108747044, + "grad_norm": 2.508787155151367, + "learning_rate": 1.905019446051876e-06, + "loss": 0.3367, + "step": 7358 + }, + { + "epoch": 3.479432624113475, + "grad_norm": 2.980327606201172, + "learning_rate": 1.9044135619879753e-06, + "loss": 0.3842, + "step": 7359 + }, + { + "epoch": 3.479905437352246, + "grad_norm": 3.2114269733428955, + "learning_rate": 1.9038077150086317e-06, + "loss": 0.4625, + "step": 7360 + }, + { + "epoch": 3.4803782505910164, + "grad_norm": 3.2119715213775635, + "learning_rate": 1.9032019051515677e-06, + "loss": 0.4197, + "step": 7361 + }, + { + "epoch": 3.480851063829787, + "grad_norm": 3.2967300415039062, + "learning_rate": 1.9025961324545034e-06, + "loss": 0.4462, + "step": 7362 + }, + { + "epoch": 3.481323877068558, + "grad_norm": 3.132643461227417, + "learning_rate": 1.9019903969551589e-06, + "loss": 0.4355, + "step": 7363 + }, + { + "epoch": 3.4817966903073287, + "grad_norm": 2.9940602779388428, + "learning_rate": 1.9013846986912493e-06, + "loss": 0.3584, + "step": 7364 + }, + { + "epoch": 3.482269503546099, + "grad_norm": 2.901935577392578, + "learning_rate": 1.9007790377004907e-06, + "loss": 0.3987, + "step": 7365 + }, + { + "epoch": 3.48274231678487, + "grad_norm": 
3.143404006958008, + "learning_rate": 1.9001734140205939e-06, + "loss": 0.3642, + "step": 7366 + }, + { + "epoch": 3.4832151300236407, + "grad_norm": 3.6701388359069824, + "learning_rate": 1.899567827689267e-06, + "loss": 0.4111, + "step": 7367 + }, + { + "epoch": 3.4836879432624115, + "grad_norm": 2.6821651458740234, + "learning_rate": 1.8989622787442202e-06, + "loss": 0.3827, + "step": 7368 + }, + { + "epoch": 3.484160756501182, + "grad_norm": 3.3916640281677246, + "learning_rate": 1.8983567672231562e-06, + "loss": 0.4214, + "step": 7369 + }, + { + "epoch": 3.4846335697399526, + "grad_norm": 3.3192927837371826, + "learning_rate": 1.8977512931637788e-06, + "loss": 0.4375, + "step": 7370 + }, + { + "epoch": 3.4851063829787234, + "grad_norm": 3.177884578704834, + "learning_rate": 1.8971458566037877e-06, + "loss": 0.3944, + "step": 7371 + }, + { + "epoch": 3.4855791962174942, + "grad_norm": 3.1554532051086426, + "learning_rate": 1.896540457580879e-06, + "loss": 0.3712, + "step": 7372 + }, + { + "epoch": 3.4860520094562646, + "grad_norm": 2.7773821353912354, + "learning_rate": 1.8959350961327516e-06, + "loss": 0.3534, + "step": 7373 + }, + { + "epoch": 3.4865248226950354, + "grad_norm": 2.4792168140411377, + "learning_rate": 1.8953297722970956e-06, + "loss": 0.3571, + "step": 7374 + }, + { + "epoch": 3.486997635933806, + "grad_norm": 2.676361083984375, + "learning_rate": 1.8947244861116044e-06, + "loss": 0.3365, + "step": 7375 + }, + { + "epoch": 3.487470449172577, + "grad_norm": 2.9340765476226807, + "learning_rate": 1.8941192376139655e-06, + "loss": 0.4656, + "step": 7376 + }, + { + "epoch": 3.4879432624113473, + "grad_norm": 3.7924742698669434, + "learning_rate": 1.8935140268418646e-06, + "loss": 0.3639, + "step": 7377 + }, + { + "epoch": 3.488416075650118, + "grad_norm": 2.798912286758423, + "learning_rate": 1.892908853832986e-06, + "loss": 0.3741, + "step": 7378 + }, + { + "epoch": 3.488888888888889, + "grad_norm": 3.1731197834014893, + "learning_rate": 1.8923037186250112e-06, + "loss": 0.4041, + "step": 7379 + }, + { + "epoch": 3.4893617021276597, + "grad_norm": 2.893725633621216, + "learning_rate": 1.8916986212556182e-06, + "loss": 0.3103, + "step": 7380 + }, + { + "epoch": 3.48983451536643, + "grad_norm": 3.2489001750946045, + "learning_rate": 1.891093561762486e-06, + "loss": 0.328, + "step": 7381 + }, + { + "epoch": 3.490307328605201, + "grad_norm": 2.8076415061950684, + "learning_rate": 1.8904885401832862e-06, + "loss": 0.426, + "step": 7382 + }, + { + "epoch": 3.4907801418439717, + "grad_norm": 3.076544761657715, + "learning_rate": 1.8898835565556938e-06, + "loss": 0.3664, + "step": 7383 + }, + { + "epoch": 3.4912529550827425, + "grad_norm": 2.7615935802459717, + "learning_rate": 1.8892786109173769e-06, + "loss": 0.3718, + "step": 7384 + }, + { + "epoch": 3.491725768321513, + "grad_norm": 2.9050116539001465, + "learning_rate": 1.8886737033060023e-06, + "loss": 0.3456, + "step": 7385 + }, + { + "epoch": 3.4921985815602836, + "grad_norm": 2.4928293228149414, + "learning_rate": 1.8880688337592366e-06, + "loss": 0.3487, + "step": 7386 + }, + { + "epoch": 3.4926713947990544, + "grad_norm": 2.773418426513672, + "learning_rate": 1.88746400231474e-06, + "loss": 0.3771, + "step": 7387 + }, + { + "epoch": 3.493144208037825, + "grad_norm": 2.7137296199798584, + "learning_rate": 1.886859209010175e-06, + "loss": 0.376, + "step": 7388 + }, + { + "epoch": 3.4936170212765956, + "grad_norm": 3.327976942062378, + "learning_rate": 1.886254453883199e-06, + "loss": 0.3481, + "step": 7389 + }, + { + 
"epoch": 3.4940898345153664, + "grad_norm": 3.8637235164642334, + "learning_rate": 1.8856497369714655e-06, + "loss": 0.3726, + "step": 7390 + }, + { + "epoch": 3.494562647754137, + "grad_norm": 3.1517951488494873, + "learning_rate": 1.88504505831263e-06, + "loss": 0.4459, + "step": 7391 + }, + { + "epoch": 3.495035460992908, + "grad_norm": 3.160130262374878, + "learning_rate": 1.884440417944342e-06, + "loss": 0.3918, + "step": 7392 + }, + { + "epoch": 3.4955082742316783, + "grad_norm": 2.6518726348876953, + "learning_rate": 1.8838358159042503e-06, + "loss": 0.3493, + "step": 7393 + }, + { + "epoch": 3.495981087470449, + "grad_norm": 2.7487380504608154, + "learning_rate": 1.8832312522300009e-06, + "loss": 0.3846, + "step": 7394 + }, + { + "epoch": 3.49645390070922, + "grad_norm": 3.062293291091919, + "learning_rate": 1.8826267269592355e-06, + "loss": 0.3792, + "step": 7395 + }, + { + "epoch": 3.4969267139479907, + "grad_norm": 3.3636794090270996, + "learning_rate": 1.8820222401295979e-06, + "loss": 0.4504, + "step": 7396 + }, + { + "epoch": 3.497399527186761, + "grad_norm": 3.230196237564087, + "learning_rate": 1.8814177917787246e-06, + "loss": 0.3953, + "step": 7397 + }, + { + "epoch": 3.497872340425532, + "grad_norm": 2.891002893447876, + "learning_rate": 1.8808133819442541e-06, + "loss": 0.3923, + "step": 7398 + }, + { + "epoch": 3.4983451536643027, + "grad_norm": 2.7478551864624023, + "learning_rate": 1.8802090106638196e-06, + "loss": 0.4115, + "step": 7399 + }, + { + "epoch": 3.4988179669030735, + "grad_norm": 3.0452797412872314, + "learning_rate": 1.8796046779750515e-06, + "loss": 0.4154, + "step": 7400 + }, + { + "epoch": 3.499290780141844, + "grad_norm": 3.0759124755859375, + "learning_rate": 1.87900038391558e-06, + "loss": 0.4277, + "step": 7401 + }, + { + "epoch": 3.4997635933806146, + "grad_norm": 2.7563929557800293, + "learning_rate": 1.8783961285230314e-06, + "loss": 0.3896, + "step": 7402 + }, + { + "epoch": 3.5002364066193854, + "grad_norm": 2.661916494369507, + "learning_rate": 1.87779191183503e-06, + "loss": 0.3625, + "step": 7403 + }, + { + "epoch": 3.500709219858156, + "grad_norm": 2.881241798400879, + "learning_rate": 1.877187733889199e-06, + "loss": 0.3724, + "step": 7404 + }, + { + "epoch": 3.5011820330969265, + "grad_norm": 3.2405693531036377, + "learning_rate": 1.8765835947231554e-06, + "loss": 0.3974, + "step": 7405 + }, + { + "epoch": 3.5016548463356973, + "grad_norm": 2.924288034439087, + "learning_rate": 1.8759794943745184e-06, + "loss": 0.3467, + "step": 7406 + }, + { + "epoch": 3.502127659574468, + "grad_norm": 3.031663656234741, + "learning_rate": 1.8753754328809027e-06, + "loss": 0.3995, + "step": 7407 + }, + { + "epoch": 3.5026004728132385, + "grad_norm": 3.028277635574341, + "learning_rate": 1.874771410279919e-06, + "loss": 0.3741, + "step": 7408 + }, + { + "epoch": 3.5030732860520093, + "grad_norm": 3.0211644172668457, + "learning_rate": 1.8741674266091782e-06, + "loss": 0.4018, + "step": 7409 + }, + { + "epoch": 3.50354609929078, + "grad_norm": 2.732234239578247, + "learning_rate": 1.8735634819062875e-06, + "loss": 0.313, + "step": 7410 + }, + { + "epoch": 3.504018912529551, + "grad_norm": 3.139596939086914, + "learning_rate": 1.8729595762088525e-06, + "loss": 0.4112, + "step": 7411 + }, + { + "epoch": 3.5044917257683217, + "grad_norm": 2.894230365753174, + "learning_rate": 1.8723557095544754e-06, + "loss": 0.3891, + "step": 7412 + }, + { + "epoch": 3.504964539007092, + "grad_norm": 2.850205659866333, + "learning_rate": 1.8717518819807547e-06, + "loss": 
0.424, + "step": 7413 + }, + { + "epoch": 3.505437352245863, + "grad_norm": 3.047736644744873, + "learning_rate": 1.8711480935252907e-06, + "loss": 0.3757, + "step": 7414 + }, + { + "epoch": 3.5059101654846336, + "grad_norm": 3.0174455642700195, + "learning_rate": 1.8705443442256772e-06, + "loss": 0.3625, + "step": 7415 + }, + { + "epoch": 3.506382978723404, + "grad_norm": 2.840681552886963, + "learning_rate": 1.869940634119507e-06, + "loss": 0.3595, + "step": 7416 + }, + { + "epoch": 3.506855791962175, + "grad_norm": 3.067473888397217, + "learning_rate": 1.8693369632443713e-06, + "loss": 0.432, + "step": 7417 + }, + { + "epoch": 3.5073286052009456, + "grad_norm": 2.94655179977417, + "learning_rate": 1.8687333316378572e-06, + "loss": 0.4222, + "step": 7418 + }, + { + "epoch": 3.5078014184397164, + "grad_norm": 2.968548536300659, + "learning_rate": 1.868129739337551e-06, + "loss": 0.4098, + "step": 7419 + }, + { + "epoch": 3.508274231678487, + "grad_norm": 2.70094895362854, + "learning_rate": 1.867526186381034e-06, + "loss": 0.386, + "step": 7420 + }, + { + "epoch": 3.5087470449172575, + "grad_norm": 3.25897216796875, + "learning_rate": 1.8669226728058895e-06, + "loss": 0.4411, + "step": 7421 + }, + { + "epoch": 3.5092198581560283, + "grad_norm": 4.281215667724609, + "learning_rate": 1.866319198649694e-06, + "loss": 0.4011, + "step": 7422 + }, + { + "epoch": 3.509692671394799, + "grad_norm": 2.8394858837127686, + "learning_rate": 1.8657157639500223e-06, + "loss": 0.4162, + "step": 7423 + }, + { + "epoch": 3.5101654846335695, + "grad_norm": 2.732691764831543, + "learning_rate": 1.86511236874445e-06, + "loss": 0.3603, + "step": 7424 + }, + { + "epoch": 3.5106382978723403, + "grad_norm": 3.0152828693389893, + "learning_rate": 1.8645090130705463e-06, + "loss": 0.3811, + "step": 7425 + }, + { + "epoch": 3.511111111111111, + "grad_norm": 3.1762008666992188, + "learning_rate": 1.8639056969658793e-06, + "loss": 0.3985, + "step": 7426 + }, + { + "epoch": 3.511583924349882, + "grad_norm": 3.151123523712158, + "learning_rate": 1.863302420468016e-06, + "loss": 0.3582, + "step": 7427 + }, + { + "epoch": 3.5120567375886527, + "grad_norm": 2.738206386566162, + "learning_rate": 1.862699183614518e-06, + "loss": 0.3768, + "step": 7428 + }, + { + "epoch": 3.512529550827423, + "grad_norm": 3.235212564468384, + "learning_rate": 1.8620959864429487e-06, + "loss": 0.3964, + "step": 7429 + }, + { + "epoch": 3.513002364066194, + "grad_norm": 3.1113579273223877, + "learning_rate": 1.8614928289908648e-06, + "loss": 0.3979, + "step": 7430 + }, + { + "epoch": 3.5134751773049646, + "grad_norm": 2.6802520751953125, + "learning_rate": 1.860889711295822e-06, + "loss": 0.327, + "step": 7431 + }, + { + "epoch": 3.513947990543735, + "grad_norm": 2.9212403297424316, + "learning_rate": 1.860286633395375e-06, + "loss": 0.4104, + "step": 7432 + }, + { + "epoch": 3.5144208037825058, + "grad_norm": 2.868861198425293, + "learning_rate": 1.8596835953270742e-06, + "loss": 0.383, + "step": 7433 + }, + { + "epoch": 3.5148936170212766, + "grad_norm": 2.831655740737915, + "learning_rate": 1.8590805971284686e-06, + "loss": 0.3615, + "step": 7434 + }, + { + "epoch": 3.5153664302600474, + "grad_norm": 3.1540114879608154, + "learning_rate": 1.8584776388371039e-06, + "loss": 0.3914, + "step": 7435 + }, + { + "epoch": 3.515839243498818, + "grad_norm": 3.22031307220459, + "learning_rate": 1.8578747204905223e-06, + "loss": 0.4358, + "step": 7436 + }, + { + "epoch": 3.5163120567375885, + "grad_norm": 3.2922887802124023, + "learning_rate": 
1.8572718421262677e-06, + "loss": 0.3894, + "step": 7437 + }, + { + "epoch": 3.5167848699763593, + "grad_norm": 2.936475992202759, + "learning_rate": 1.856669003781876e-06, + "loss": 0.3748, + "step": 7438 + }, + { + "epoch": 3.51725768321513, + "grad_norm": 3.4542860984802246, + "learning_rate": 1.8560662054948856e-06, + "loss": 0.3362, + "step": 7439 + }, + { + "epoch": 3.5177304964539005, + "grad_norm": 3.1532278060913086, + "learning_rate": 1.8554634473028288e-06, + "loss": 0.411, + "step": 7440 + }, + { + "epoch": 3.5182033096926713, + "grad_norm": 3.1678943634033203, + "learning_rate": 1.854860729243237e-06, + "loss": 0.4357, + "step": 7441 + }, + { + "epoch": 3.518676122931442, + "grad_norm": 2.608930826187134, + "learning_rate": 1.8542580513536385e-06, + "loss": 0.3851, + "step": 7442 + }, + { + "epoch": 3.519148936170213, + "grad_norm": 3.127915143966675, + "learning_rate": 1.853655413671559e-06, + "loss": 0.4227, + "step": 7443 + }, + { + "epoch": 3.5196217494089836, + "grad_norm": 3.0593245029449463, + "learning_rate": 1.8530528162345238e-06, + "loss": 0.4315, + "step": 7444 + }, + { + "epoch": 3.520094562647754, + "grad_norm": 2.7818729877471924, + "learning_rate": 1.852450259080053e-06, + "loss": 0.4018, + "step": 7445 + }, + { + "epoch": 3.520567375886525, + "grad_norm": 3.2635445594787598, + "learning_rate": 1.8518477422456639e-06, + "loss": 0.415, + "step": 7446 + }, + { + "epoch": 3.5210401891252956, + "grad_norm": 2.5713813304901123, + "learning_rate": 1.851245265768875e-06, + "loss": 0.3309, + "step": 7447 + }, + { + "epoch": 3.521513002364066, + "grad_norm": 2.6778969764709473, + "learning_rate": 1.8506428296871982e-06, + "loss": 0.3106, + "step": 7448 + }, + { + "epoch": 3.5219858156028367, + "grad_norm": 2.901095390319824, + "learning_rate": 1.8500404340381455e-06, + "loss": 0.3729, + "step": 7449 + }, + { + "epoch": 3.5224586288416075, + "grad_norm": 3.1000046730041504, + "learning_rate": 1.849438078859225e-06, + "loss": 0.438, + "step": 7450 + }, + { + "epoch": 3.5229314420803783, + "grad_norm": 2.901890993118286, + "learning_rate": 1.8488357641879417e-06, + "loss": 0.3934, + "step": 7451 + }, + { + "epoch": 3.523404255319149, + "grad_norm": 3.2212157249450684, + "learning_rate": 1.8482334900618009e-06, + "loss": 0.4359, + "step": 7452 + }, + { + "epoch": 3.5238770685579195, + "grad_norm": 3.3780901432037354, + "learning_rate": 1.847631256518303e-06, + "loss": 0.4022, + "step": 7453 + }, + { + "epoch": 3.5243498817966903, + "grad_norm": 2.9996445178985596, + "learning_rate": 1.847029063594945e-06, + "loss": 0.3989, + "step": 7454 + }, + { + "epoch": 3.524822695035461, + "grad_norm": 2.8581080436706543, + "learning_rate": 1.8464269113292255e-06, + "loss": 0.3401, + "step": 7455 + }, + { + "epoch": 3.5252955082742314, + "grad_norm": 2.9551661014556885, + "learning_rate": 1.8458247997586354e-06, + "loss": 0.4556, + "step": 7456 + }, + { + "epoch": 3.5257683215130022, + "grad_norm": 2.9672555923461914, + "learning_rate": 1.8452227289206672e-06, + "loss": 0.3575, + "step": 7457 + }, + { + "epoch": 3.526241134751773, + "grad_norm": 3.226273536682129, + "learning_rate": 1.8446206988528087e-06, + "loss": 0.3769, + "step": 7458 + }, + { + "epoch": 3.526713947990544, + "grad_norm": 2.994356155395508, + "learning_rate": 1.8440187095925443e-06, + "loss": 0.3653, + "step": 7459 + }, + { + "epoch": 3.5271867612293146, + "grad_norm": 2.489049196243286, + "learning_rate": 1.8434167611773595e-06, + "loss": 0.3454, + "step": 7460 + }, + { + "epoch": 3.527659574468085, + "grad_norm": 
2.7897472381591797, + "learning_rate": 1.8428148536447333e-06, + "loss": 0.3526, + "step": 7461 + }, + { + "epoch": 3.5281323877068558, + "grad_norm": 2.947746992111206, + "learning_rate": 1.842212987032145e-06, + "loss": 0.3542, + "step": 7462 + }, + { + "epoch": 3.5286052009456266, + "grad_norm": 2.9303736686706543, + "learning_rate": 1.84161116137707e-06, + "loss": 0.3618, + "step": 7463 + }, + { + "epoch": 3.529078014184397, + "grad_norm": 2.81052827835083, + "learning_rate": 1.8410093767169807e-06, + "loss": 0.3833, + "step": 7464 + }, + { + "epoch": 3.5295508274231677, + "grad_norm": 3.4084126949310303, + "learning_rate": 1.840407633089348e-06, + "loss": 0.3868, + "step": 7465 + }, + { + "epoch": 3.5300236406619385, + "grad_norm": 2.8372802734375, + "learning_rate": 1.839805930531639e-06, + "loss": 0.3407, + "step": 7466 + }, + { + "epoch": 3.5304964539007093, + "grad_norm": 2.9218525886535645, + "learning_rate": 1.8392042690813205e-06, + "loss": 0.3772, + "step": 7467 + }, + { + "epoch": 3.53096926713948, + "grad_norm": 3.425274610519409, + "learning_rate": 1.8386026487758552e-06, + "loss": 0.3996, + "step": 7468 + }, + { + "epoch": 3.5314420803782505, + "grad_norm": 3.027423858642578, + "learning_rate": 1.8380010696527015e-06, + "loss": 0.3752, + "step": 7469 + }, + { + "epoch": 3.5319148936170213, + "grad_norm": 2.974896192550659, + "learning_rate": 1.8373995317493193e-06, + "loss": 0.3657, + "step": 7470 + }, + { + "epoch": 3.532387706855792, + "grad_norm": 2.837458610534668, + "learning_rate": 1.8367980351031628e-06, + "loss": 0.3949, + "step": 7471 + }, + { + "epoch": 3.5328605200945624, + "grad_norm": 2.8257288932800293, + "learning_rate": 1.8361965797516844e-06, + "loss": 0.3253, + "step": 7472 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 2.8278095722198486, + "learning_rate": 1.8355951657323351e-06, + "loss": 0.3588, + "step": 7473 + }, + { + "epoch": 3.533806146572104, + "grad_norm": 2.641160249710083, + "learning_rate": 1.8349937930825601e-06, + "loss": 0.3423, + "step": 7474 + }, + { + "epoch": 3.534278959810875, + "grad_norm": 2.6909263134002686, + "learning_rate": 1.8343924618398065e-06, + "loss": 0.3973, + "step": 7475 + }, + { + "epoch": 3.5347517730496456, + "grad_norm": 3.0727429389953613, + "learning_rate": 1.8337911720415157e-06, + "loss": 0.4207, + "step": 7476 + }, + { + "epoch": 3.535224586288416, + "grad_norm": 3.218925714492798, + "learning_rate": 1.8331899237251265e-06, + "loss": 0.3955, + "step": 7477 + }, + { + "epoch": 3.5356973995271868, + "grad_norm": 3.163914918899536, + "learning_rate": 1.832588716928078e-06, + "loss": 0.4655, + "step": 7478 + }, + { + "epoch": 3.5361702127659576, + "grad_norm": 2.8622686862945557, + "learning_rate": 1.831987551687803e-06, + "loss": 0.4084, + "step": 7479 + }, + { + "epoch": 3.536643026004728, + "grad_norm": 2.8534188270568848, + "learning_rate": 1.831386428041734e-06, + "loss": 0.4144, + "step": 7480 + }, + { + "epoch": 3.5371158392434987, + "grad_norm": 2.8138554096221924, + "learning_rate": 1.8307853460273008e-06, + "loss": 0.3835, + "step": 7481 + }, + { + "epoch": 3.5375886524822695, + "grad_norm": 3.061960458755493, + "learning_rate": 1.830184305681929e-06, + "loss": 0.4128, + "step": 7482 + }, + { + "epoch": 3.5380614657210403, + "grad_norm": 2.8524835109710693, + "learning_rate": 1.8295833070430444e-06, + "loss": 0.3372, + "step": 7483 + }, + { + "epoch": 3.538534278959811, + "grad_norm": 3.2567028999328613, + "learning_rate": 1.8289823501480663e-06, + "loss": 0.4533, + "step": 7484 + }, + { + 
"epoch": 3.5390070921985815, + "grad_norm": 2.945634603500366, + "learning_rate": 1.8283814350344158e-06, + "loss": 0.3565, + "step": 7485 + }, + { + "epoch": 3.5394799054373522, + "grad_norm": 2.903287649154663, + "learning_rate": 1.8277805617395089e-06, + "loss": 0.349, + "step": 7486 + }, + { + "epoch": 3.539952718676123, + "grad_norm": 3.249272584915161, + "learning_rate": 1.827179730300757e-06, + "loss": 0.4076, + "step": 7487 + }, + { + "epoch": 3.5404255319148934, + "grad_norm": 2.9591739177703857, + "learning_rate": 1.8265789407555748e-06, + "loss": 0.3439, + "step": 7488 + }, + { + "epoch": 3.540898345153664, + "grad_norm": 3.8527538776397705, + "learning_rate": 1.8259781931413683e-06, + "loss": 0.4684, + "step": 7489 + }, + { + "epoch": 3.541371158392435, + "grad_norm": 2.7392261028289795, + "learning_rate": 1.8253774874955449e-06, + "loss": 0.3494, + "step": 7490 + }, + { + "epoch": 3.541843971631206, + "grad_norm": 2.880993127822876, + "learning_rate": 1.8247768238555069e-06, + "loss": 0.3546, + "step": 7491 + }, + { + "epoch": 3.5423167848699766, + "grad_norm": 2.9944894313812256, + "learning_rate": 1.8241762022586545e-06, + "loss": 0.3594, + "step": 7492 + }, + { + "epoch": 3.542789598108747, + "grad_norm": 3.0084292888641357, + "learning_rate": 1.8235756227423878e-06, + "loss": 0.408, + "step": 7493 + }, + { + "epoch": 3.5432624113475177, + "grad_norm": 2.75227689743042, + "learning_rate": 1.8229750853440998e-06, + "loss": 0.3515, + "step": 7494 + }, + { + "epoch": 3.5437352245862885, + "grad_norm": 3.041893243789673, + "learning_rate": 1.8223745901011856e-06, + "loss": 0.401, + "step": 7495 + }, + { + "epoch": 3.544208037825059, + "grad_norm": 2.8728370666503906, + "learning_rate": 1.8217741370510345e-06, + "loss": 0.3832, + "step": 7496 + }, + { + "epoch": 3.5446808510638297, + "grad_norm": 3.095460891723633, + "learning_rate": 1.8211737262310331e-06, + "loss": 0.3086, + "step": 7497 + }, + { + "epoch": 3.5451536643026005, + "grad_norm": 3.1869826316833496, + "learning_rate": 1.8205733576785678e-06, + "loss": 0.3666, + "step": 7498 + }, + { + "epoch": 3.5456264775413713, + "grad_norm": 3.307560443878174, + "learning_rate": 1.8199730314310204e-06, + "loss": 0.4489, + "step": 7499 + }, + { + "epoch": 3.546099290780142, + "grad_norm": 2.9531142711639404, + "learning_rate": 1.8193727475257697e-06, + "loss": 0.4017, + "step": 7500 + }, + { + "epoch": 3.5465721040189124, + "grad_norm": 3.2969162464141846, + "learning_rate": 1.8187725060001942e-06, + "loss": 0.4179, + "step": 7501 + }, + { + "epoch": 3.5470449172576832, + "grad_norm": 2.9434688091278076, + "learning_rate": 1.818172306891667e-06, + "loss": 0.3562, + "step": 7502 + }, + { + "epoch": 3.547517730496454, + "grad_norm": 3.070732355117798, + "learning_rate": 1.8175721502375616e-06, + "loss": 0.393, + "step": 7503 + }, + { + "epoch": 3.5479905437352244, + "grad_norm": 2.970898389816284, + "learning_rate": 1.8169720360752457e-06, + "loss": 0.4448, + "step": 7504 + }, + { + "epoch": 3.548463356973995, + "grad_norm": 2.7050931453704834, + "learning_rate": 1.8163719644420858e-06, + "loss": 0.3562, + "step": 7505 + }, + { + "epoch": 3.548936170212766, + "grad_norm": 3.0073063373565674, + "learning_rate": 1.8157719353754467e-06, + "loss": 0.4429, + "step": 7506 + }, + { + "epoch": 3.5494089834515368, + "grad_norm": 3.0240445137023926, + "learning_rate": 1.8151719489126874e-06, + "loss": 0.4073, + "step": 7507 + }, + { + "epoch": 3.5498817966903076, + "grad_norm": 3.558763265609741, + "learning_rate": 1.8145720050911695e-06, + 
"loss": 0.4025, + "step": 7508 + }, + { + "epoch": 3.550354609929078, + "grad_norm": 3.637258768081665, + "learning_rate": 1.8139721039482473e-06, + "loss": 0.5074, + "step": 7509 + }, + { + "epoch": 3.5508274231678487, + "grad_norm": 2.804719924926758, + "learning_rate": 1.8133722455212726e-06, + "loss": 0.3727, + "step": 7510 + }, + { + "epoch": 3.5513002364066195, + "grad_norm": 2.9034759998321533, + "learning_rate": 1.8127724298475984e-06, + "loss": 0.3642, + "step": 7511 + }, + { + "epoch": 3.55177304964539, + "grad_norm": 3.1304872035980225, + "learning_rate": 1.8121726569645714e-06, + "loss": 0.434, + "step": 7512 + }, + { + "epoch": 3.5522458628841607, + "grad_norm": 3.019956111907959, + "learning_rate": 1.8115729269095378e-06, + "loss": 0.4235, + "step": 7513 + }, + { + "epoch": 3.5527186761229315, + "grad_norm": 2.7984633445739746, + "learning_rate": 1.810973239719839e-06, + "loss": 0.3344, + "step": 7514 + }, + { + "epoch": 3.5531914893617023, + "grad_norm": 2.839709997177124, + "learning_rate": 1.8103735954328145e-06, + "loss": 0.3708, + "step": 7515 + }, + { + "epoch": 3.553664302600473, + "grad_norm": 2.766819477081299, + "learning_rate": 1.809773994085803e-06, + "loss": 0.3402, + "step": 7516 + }, + { + "epoch": 3.5541371158392434, + "grad_norm": 2.707942247390747, + "learning_rate": 1.8091744357161372e-06, + "loss": 0.4327, + "step": 7517 + }, + { + "epoch": 3.554609929078014, + "grad_norm": 3.512702465057373, + "learning_rate": 1.8085749203611516e-06, + "loss": 0.3965, + "step": 7518 + }, + { + "epoch": 3.555082742316785, + "grad_norm": 2.717024803161621, + "learning_rate": 1.8079754480581738e-06, + "loss": 0.3237, + "step": 7519 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 2.659001350402832, + "learning_rate": 1.8073760188445296e-06, + "loss": 0.3546, + "step": 7520 + }, + { + "epoch": 3.556028368794326, + "grad_norm": 2.615028142929077, + "learning_rate": 1.8067766327575445e-06, + "loss": 0.3232, + "step": 7521 + }, + { + "epoch": 3.556501182033097, + "grad_norm": 2.659428119659424, + "learning_rate": 1.8061772898345386e-06, + "loss": 0.3769, + "step": 7522 + }, + { + "epoch": 3.5569739952718678, + "grad_norm": 3.142369270324707, + "learning_rate": 1.8055779901128296e-06, + "loss": 0.4292, + "step": 7523 + }, + { + "epoch": 3.5574468085106385, + "grad_norm": 3.0832736492156982, + "learning_rate": 1.8049787336297352e-06, + "loss": 0.3871, + "step": 7524 + }, + { + "epoch": 3.557919621749409, + "grad_norm": 2.778411865234375, + "learning_rate": 1.8043795204225664e-06, + "loss": 0.3938, + "step": 7525 + }, + { + "epoch": 3.5583924349881797, + "grad_norm": 3.1651480197906494, + "learning_rate": 1.8037803505286355e-06, + "loss": 0.3315, + "step": 7526 + }, + { + "epoch": 3.5588652482269505, + "grad_norm": 3.266508102416992, + "learning_rate": 1.8031812239852498e-06, + "loss": 0.4156, + "step": 7527 + }, + { + "epoch": 3.559338061465721, + "grad_norm": 3.1345436573028564, + "learning_rate": 1.8025821408297127e-06, + "loss": 0.3813, + "step": 7528 + }, + { + "epoch": 3.5598108747044916, + "grad_norm": 3.1535425186157227, + "learning_rate": 1.8019831010993289e-06, + "loss": 0.3897, + "step": 7529 + }, + { + "epoch": 3.5602836879432624, + "grad_norm": 3.0934345722198486, + "learning_rate": 1.8013841048313952e-06, + "loss": 0.4074, + "step": 7530 + }, + { + "epoch": 3.5607565011820332, + "grad_norm": 3.224876642227173, + "learning_rate": 1.8007851520632108e-06, + "loss": 0.3969, + "step": 7531 + }, + { + "epoch": 3.561229314420804, + "grad_norm": 3.082303285598755, + 
"learning_rate": 1.8001862428320693e-06, + "loss": 0.3559, + "step": 7532 + }, + { + "epoch": 3.5617021276595744, + "grad_norm": 3.5289969444274902, + "learning_rate": 1.7995873771752608e-06, + "loss": 0.3961, + "step": 7533 + }, + { + "epoch": 3.562174940898345, + "grad_norm": 3.1893370151519775, + "learning_rate": 1.7989885551300762e-06, + "loss": 0.3721, + "step": 7534 + }, + { + "epoch": 3.562647754137116, + "grad_norm": 2.6911089420318604, + "learning_rate": 1.7983897767337999e-06, + "loss": 0.3801, + "step": 7535 + }, + { + "epoch": 3.5631205673758863, + "grad_norm": 3.0837483406066895, + "learning_rate": 1.797791042023716e-06, + "loss": 0.3886, + "step": 7536 + }, + { + "epoch": 3.563593380614657, + "grad_norm": 2.973459005355835, + "learning_rate": 1.7971923510371054e-06, + "loss": 0.438, + "step": 7537 + }, + { + "epoch": 3.564066193853428, + "grad_norm": 3.1537392139434814, + "learning_rate": 1.7965937038112435e-06, + "loss": 0.4022, + "step": 7538 + }, + { + "epoch": 3.5645390070921987, + "grad_norm": 3.2339680194854736, + "learning_rate": 1.795995100383409e-06, + "loss": 0.3883, + "step": 7539 + }, + { + "epoch": 3.5650118203309695, + "grad_norm": 2.5029079914093018, + "learning_rate": 1.7953965407908714e-06, + "loss": 0.3522, + "step": 7540 + }, + { + "epoch": 3.56548463356974, + "grad_norm": 3.1560211181640625, + "learning_rate": 1.7947980250709027e-06, + "loss": 0.4024, + "step": 7541 + }, + { + "epoch": 3.5659574468085107, + "grad_norm": 2.950477361679077, + "learning_rate": 1.7941995532607687e-06, + "loss": 0.3598, + "step": 7542 + }, + { + "epoch": 3.5664302600472815, + "grad_norm": 3.1263279914855957, + "learning_rate": 1.793601125397733e-06, + "loss": 0.3535, + "step": 7543 + }, + { + "epoch": 3.566903073286052, + "grad_norm": 2.986631393432617, + "learning_rate": 1.7930027415190587e-06, + "loss": 0.4251, + "step": 7544 + }, + { + "epoch": 3.5673758865248226, + "grad_norm": 2.6882247924804688, + "learning_rate": 1.7924044016620022e-06, + "loss": 0.3584, + "step": 7545 + }, + { + "epoch": 3.5678486997635934, + "grad_norm": 2.9358696937561035, + "learning_rate": 1.791806105863822e-06, + "loss": 0.3671, + "step": 7546 + }, + { + "epoch": 3.568321513002364, + "grad_norm": 2.774198055267334, + "learning_rate": 1.7912078541617704e-06, + "loss": 0.3505, + "step": 7547 + }, + { + "epoch": 3.568794326241135, + "grad_norm": 2.7384231090545654, + "learning_rate": 1.7906096465930964e-06, + "loss": 0.3992, + "step": 7548 + }, + { + "epoch": 3.5692671394799054, + "grad_norm": 2.8625354766845703, + "learning_rate": 1.7900114831950506e-06, + "loss": 0.3858, + "step": 7549 + }, + { + "epoch": 3.569739952718676, + "grad_norm": 2.737884044647217, + "learning_rate": 1.7894133640048761e-06, + "loss": 0.3973, + "step": 7550 + }, + { + "epoch": 3.570212765957447, + "grad_norm": 2.9817614555358887, + "learning_rate": 1.7888152890598154e-06, + "loss": 0.3613, + "step": 7551 + }, + { + "epoch": 3.5706855791962173, + "grad_norm": 2.760956287384033, + "learning_rate": 1.7882172583971081e-06, + "loss": 0.3645, + "step": 7552 + }, + { + "epoch": 3.571158392434988, + "grad_norm": 2.6867735385894775, + "learning_rate": 1.7876192720539908e-06, + "loss": 0.3771, + "step": 7553 + }, + { + "epoch": 3.571631205673759, + "grad_norm": 3.3362443447113037, + "learning_rate": 1.7870213300676986e-06, + "loss": 0.3989, + "step": 7554 + }, + { + "epoch": 3.5721040189125297, + "grad_norm": 2.8359227180480957, + "learning_rate": 1.7864234324754617e-06, + "loss": 0.3645, + "step": 7555 + }, + { + "epoch": 
3.5725768321513005, + "grad_norm": 3.3070647716522217, + "learning_rate": 1.7858255793145076e-06, + "loss": 0.4128, + "step": 7556 + }, + { + "epoch": 3.573049645390071, + "grad_norm": 2.544879913330078, + "learning_rate": 1.7852277706220644e-06, + "loss": 0.3779, + "step": 7557 + }, + { + "epoch": 3.5735224586288417, + "grad_norm": 2.890796661376953, + "learning_rate": 1.7846300064353525e-06, + "loss": 0.373, + "step": 7558 + }, + { + "epoch": 3.5739952718676125, + "grad_norm": 2.9703400135040283, + "learning_rate": 1.7840322867915944e-06, + "loss": 0.3619, + "step": 7559 + }, + { + "epoch": 3.574468085106383, + "grad_norm": 3.0122430324554443, + "learning_rate": 1.7834346117280066e-06, + "loss": 0.4079, + "step": 7560 + }, + { + "epoch": 3.5749408983451536, + "grad_norm": 2.904963493347168, + "learning_rate": 1.7828369812818025e-06, + "loss": 0.38, + "step": 7561 + }, + { + "epoch": 3.5754137115839244, + "grad_norm": 3.0917439460754395, + "learning_rate": 1.7822393954901957e-06, + "loss": 0.383, + "step": 7562 + }, + { + "epoch": 3.575886524822695, + "grad_norm": 2.633920907974243, + "learning_rate": 1.7816418543903935e-06, + "loss": 0.3823, + "step": 7563 + }, + { + "epoch": 3.576359338061466, + "grad_norm": 2.9266390800476074, + "learning_rate": 1.781044358019604e-06, + "loss": 0.4642, + "step": 7564 + }, + { + "epoch": 3.5768321513002364, + "grad_norm": 2.878138780593872, + "learning_rate": 1.7804469064150299e-06, + "loss": 0.4056, + "step": 7565 + }, + { + "epoch": 3.577304964539007, + "grad_norm": 2.949370861053467, + "learning_rate": 1.7798494996138708e-06, + "loss": 0.3863, + "step": 7566 + }, + { + "epoch": 3.5777777777777775, + "grad_norm": 3.1444685459136963, + "learning_rate": 1.7792521376533264e-06, + "loss": 0.3611, + "step": 7567 + }, + { + "epoch": 3.5782505910165483, + "grad_norm": 3.0719716548919678, + "learning_rate": 1.7786548205705906e-06, + "loss": 0.3866, + "step": 7568 + }, + { + "epoch": 3.578723404255319, + "grad_norm": 3.155343770980835, + "learning_rate": 1.7780575484028566e-06, + "loss": 0.3896, + "step": 7569 + }, + { + "epoch": 3.57919621749409, + "grad_norm": 3.0447211265563965, + "learning_rate": 1.7774603211873138e-06, + "loss": 0.3713, + "step": 7570 + }, + { + "epoch": 3.5796690307328607, + "grad_norm": 2.8683619499206543, + "learning_rate": 1.7768631389611471e-06, + "loss": 0.3808, + "step": 7571 + }, + { + "epoch": 3.580141843971631, + "grad_norm": 3.1548070907592773, + "learning_rate": 1.776266001761543e-06, + "loss": 0.3714, + "step": 7572 + }, + { + "epoch": 3.580614657210402, + "grad_norm": 2.8699257373809814, + "learning_rate": 1.7756689096256816e-06, + "loss": 0.3694, + "step": 7573 + }, + { + "epoch": 3.5810874704491726, + "grad_norm": 2.834714412689209, + "learning_rate": 1.7750718625907398e-06, + "loss": 0.3935, + "step": 7574 + }, + { + "epoch": 3.581560283687943, + "grad_norm": 3.3828539848327637, + "learning_rate": 1.7744748606938957e-06, + "loss": 0.4783, + "step": 7575 + }, + { + "epoch": 3.582033096926714, + "grad_norm": 3.3892476558685303, + "learning_rate": 1.7738779039723202e-06, + "loss": 0.41, + "step": 7576 + }, + { + "epoch": 3.5825059101654846, + "grad_norm": 3.014289379119873, + "learning_rate": 1.7732809924631842e-06, + "loss": 0.3516, + "step": 7577 + }, + { + "epoch": 3.5829787234042554, + "grad_norm": 3.477212429046631, + "learning_rate": 1.772684126203654e-06, + "loss": 0.4144, + "step": 7578 + }, + { + "epoch": 3.583451536643026, + "grad_norm": 2.9156792163848877, + "learning_rate": 1.772087305230893e-06, + "loss": 
0.3772, + "step": 7579 + }, + { + "epoch": 3.5839243498817965, + "grad_norm": 2.639169931411743, + "learning_rate": 1.7714905295820651e-06, + "loss": 0.3487, + "step": 7580 + }, + { + "epoch": 3.5843971631205673, + "grad_norm": 3.196894407272339, + "learning_rate": 1.7708937992943263e-06, + "loss": 0.4852, + "step": 7581 + }, + { + "epoch": 3.584869976359338, + "grad_norm": 2.9140779972076416, + "learning_rate": 1.7702971144048347e-06, + "loss": 0.3703, + "step": 7582 + }, + { + "epoch": 3.5853427895981085, + "grad_norm": 3.3844895362854004, + "learning_rate": 1.7697004749507418e-06, + "loss": 0.4227, + "step": 7583 + }, + { + "epoch": 3.5858156028368793, + "grad_norm": 3.080061912536621, + "learning_rate": 1.769103880969198e-06, + "loss": 0.4237, + "step": 7584 + }, + { + "epoch": 3.58628841607565, + "grad_norm": 3.037505865097046, + "learning_rate": 1.7685073324973506e-06, + "loss": 0.3902, + "step": 7585 + }, + { + "epoch": 3.586761229314421, + "grad_norm": 3.6563873291015625, + "learning_rate": 1.7679108295723436e-06, + "loss": 0.3956, + "step": 7586 + }, + { + "epoch": 3.5872340425531917, + "grad_norm": 3.158935546875, + "learning_rate": 1.76731437223132e-06, + "loss": 0.3898, + "step": 7587 + }, + { + "epoch": 3.587706855791962, + "grad_norm": 3.059199571609497, + "learning_rate": 1.7667179605114176e-06, + "loss": 0.4183, + "step": 7588 + }, + { + "epoch": 3.588179669030733, + "grad_norm": 2.8123233318328857, + "learning_rate": 1.7661215944497716e-06, + "loss": 0.3731, + "step": 7589 + }, + { + "epoch": 3.5886524822695036, + "grad_norm": 3.094287633895874, + "learning_rate": 1.7655252740835169e-06, + "loss": 0.4562, + "step": 7590 + }, + { + "epoch": 3.589125295508274, + "grad_norm": 2.886833667755127, + "learning_rate": 1.7649289994497822e-06, + "loss": 0.4178, + "step": 7591 + }, + { + "epoch": 3.5895981087470448, + "grad_norm": 3.3040647506713867, + "learning_rate": 1.764332770585696e-06, + "loss": 0.4311, + "step": 7592 + }, + { + "epoch": 3.5900709219858156, + "grad_norm": 2.7948951721191406, + "learning_rate": 1.7637365875283827e-06, + "loss": 0.3704, + "step": 7593 + }, + { + "epoch": 3.5905437352245864, + "grad_norm": 3.092221975326538, + "learning_rate": 1.7631404503149623e-06, + "loss": 0.4166, + "step": 7594 + }, + { + "epoch": 3.591016548463357, + "grad_norm": 3.6018600463867188, + "learning_rate": 1.7625443589825564e-06, + "loss": 0.4251, + "step": 7595 + }, + { + "epoch": 3.5914893617021275, + "grad_norm": 2.708017110824585, + "learning_rate": 1.7619483135682791e-06, + "loss": 0.3775, + "step": 7596 + }, + { + "epoch": 3.5919621749408983, + "grad_norm": 2.8069381713867188, + "learning_rate": 1.7613523141092438e-06, + "loss": 0.3929, + "step": 7597 + }, + { + "epoch": 3.592434988179669, + "grad_norm": 3.097787380218506, + "learning_rate": 1.7607563606425616e-06, + "loss": 0.3992, + "step": 7598 + }, + { + "epoch": 3.5929078014184395, + "grad_norm": 2.9691715240478516, + "learning_rate": 1.7601604532053385e-06, + "loss": 0.4001, + "step": 7599 + }, + { + "epoch": 3.5933806146572103, + "grad_norm": 2.5511624813079834, + "learning_rate": 1.7595645918346807e-06, + "loss": 0.3136, + "step": 7600 + }, + { + "epoch": 3.593853427895981, + "grad_norm": 2.4688427448272705, + "learning_rate": 1.7589687765676891e-06, + "loss": 0.3922, + "step": 7601 + }, + { + "epoch": 3.594326241134752, + "grad_norm": 3.004023790359497, + "learning_rate": 1.7583730074414613e-06, + "loss": 0.4203, + "step": 7602 + }, + { + "epoch": 3.5947990543735227, + "grad_norm": 2.902641773223877, + 
"learning_rate": 1.7577772844930957e-06, + "loss": 0.3855, + "step": 7603 + }, + { + "epoch": 3.595271867612293, + "grad_norm": 3.851375102996826, + "learning_rate": 1.7571816077596826e-06, + "loss": 0.3769, + "step": 7604 + }, + { + "epoch": 3.595744680851064, + "grad_norm": 3.03249192237854, + "learning_rate": 1.756585977278315e-06, + "loss": 0.3448, + "step": 7605 + }, + { + "epoch": 3.5962174940898346, + "grad_norm": 2.992363214492798, + "learning_rate": 1.7559903930860789e-06, + "loss": 0.3893, + "step": 7606 + }, + { + "epoch": 3.596690307328605, + "grad_norm": 2.9322855472564697, + "learning_rate": 1.7553948552200577e-06, + "loss": 0.4337, + "step": 7607 + }, + { + "epoch": 3.5971631205673757, + "grad_norm": 3.2564096450805664, + "learning_rate": 1.7547993637173347e-06, + "loss": 0.3943, + "step": 7608 + }, + { + "epoch": 3.5976359338061465, + "grad_norm": 2.9988484382629395, + "learning_rate": 1.7542039186149867e-06, + "loss": 0.3421, + "step": 7609 + }, + { + "epoch": 3.5981087470449173, + "grad_norm": 2.8188817501068115, + "learning_rate": 1.7536085199500914e-06, + "loss": 0.3657, + "step": 7610 + }, + { + "epoch": 3.598581560283688, + "grad_norm": 3.0583255290985107, + "learning_rate": 1.7530131677597206e-06, + "loss": 0.4036, + "step": 7611 + }, + { + "epoch": 3.5990543735224585, + "grad_norm": 2.8700921535491943, + "learning_rate": 1.7524178620809435e-06, + "loss": 0.3928, + "step": 7612 + }, + { + "epoch": 3.5995271867612293, + "grad_norm": 3.4497945308685303, + "learning_rate": 1.751822602950829e-06, + "loss": 0.3517, + "step": 7613 + }, + { + "epoch": 3.6, + "grad_norm": 3.334191083908081, + "learning_rate": 1.75122739040644e-06, + "loss": 0.3414, + "step": 7614 + }, + { + "epoch": 3.6004728132387704, + "grad_norm": 3.1435158252716064, + "learning_rate": 1.7506322244848387e-06, + "loss": 0.4075, + "step": 7615 + }, + { + "epoch": 3.6009456264775412, + "grad_norm": 3.178990125656128, + "learning_rate": 1.7500371052230824e-06, + "loss": 0.4688, + "step": 7616 + }, + { + "epoch": 3.601418439716312, + "grad_norm": 2.9292044639587402, + "learning_rate": 1.7494420326582267e-06, + "loss": 0.3882, + "step": 7617 + }, + { + "epoch": 3.601891252955083, + "grad_norm": 2.6899197101593018, + "learning_rate": 1.7488470068273256e-06, + "loss": 0.3916, + "step": 7618 + }, + { + "epoch": 3.6023640661938536, + "grad_norm": 2.8319191932678223, + "learning_rate": 1.7482520277674273e-06, + "loss": 0.3924, + "step": 7619 + }, + { + "epoch": 3.602836879432624, + "grad_norm": 2.74589204788208, + "learning_rate": 1.747657095515578e-06, + "loss": 0.2911, + "step": 7620 + }, + { + "epoch": 3.603309692671395, + "grad_norm": 2.857028007507324, + "learning_rate": 1.7470622101088233e-06, + "loss": 0.3618, + "step": 7621 + }, + { + "epoch": 3.6037825059101656, + "grad_norm": 3.3715617656707764, + "learning_rate": 1.746467371584203e-06, + "loss": 0.4186, + "step": 7622 + }, + { + "epoch": 3.604255319148936, + "grad_norm": 2.839526414871216, + "learning_rate": 1.745872579978755e-06, + "loss": 0.4088, + "step": 7623 + }, + { + "epoch": 3.6047281323877067, + "grad_norm": 3.7689156532287598, + "learning_rate": 1.7452778353295155e-06, + "loss": 0.4748, + "step": 7624 + }, + { + "epoch": 3.6052009456264775, + "grad_norm": 2.9345123767852783, + "learning_rate": 1.7446831376735152e-06, + "loss": 0.4117, + "step": 7625 + }, + { + "epoch": 3.6056737588652483, + "grad_norm": 2.7898924350738525, + "learning_rate": 1.7440884870477845e-06, + "loss": 0.3515, + "step": 7626 + }, + { + "epoch": 3.606146572104019, + 
"grad_norm": 3.4268569946289062, + "learning_rate": 1.7434938834893481e-06, + "loss": 0.4051, + "step": 7627 + }, + { + "epoch": 3.6066193853427895, + "grad_norm": 3.019066095352173, + "learning_rate": 1.7428993270352311e-06, + "loss": 0.4128, + "step": 7628 + }, + { + "epoch": 3.6070921985815603, + "grad_norm": 3.1277568340301514, + "learning_rate": 1.742304817722454e-06, + "loss": 0.37, + "step": 7629 + }, + { + "epoch": 3.607565011820331, + "grad_norm": 2.924818277359009, + "learning_rate": 1.7417103555880318e-06, + "loss": 0.3792, + "step": 7630 + }, + { + "epoch": 3.6080378250591014, + "grad_norm": 2.664699077606201, + "learning_rate": 1.7411159406689821e-06, + "loss": 0.3584, + "step": 7631 + }, + { + "epoch": 3.608510638297872, + "grad_norm": 3.223729133605957, + "learning_rate": 1.7405215730023144e-06, + "loss": 0.3956, + "step": 7632 + }, + { + "epoch": 3.608983451536643, + "grad_norm": 2.934225559234619, + "learning_rate": 1.7399272526250388e-06, + "loss": 0.4179, + "step": 7633 + }, + { + "epoch": 3.609456264775414, + "grad_norm": 2.833798885345459, + "learning_rate": 1.7393329795741603e-06, + "loss": 0.3283, + "step": 7634 + }, + { + "epoch": 3.6099290780141846, + "grad_norm": 3.008798837661743, + "learning_rate": 1.738738753886681e-06, + "loss": 0.3704, + "step": 7635 + }, + { + "epoch": 3.610401891252955, + "grad_norm": 2.8714520931243896, + "learning_rate": 1.7381445755996023e-06, + "loss": 0.3646, + "step": 7636 + }, + { + "epoch": 3.6108747044917258, + "grad_norm": 3.083554267883301, + "learning_rate": 1.7375504447499193e-06, + "loss": 0.3785, + "step": 7637 + }, + { + "epoch": 3.6113475177304966, + "grad_norm": 3.270347833633423, + "learning_rate": 1.7369563613746277e-06, + "loss": 0.4426, + "step": 7638 + }, + { + "epoch": 3.611820330969267, + "grad_norm": 2.7754862308502197, + "learning_rate": 1.7363623255107175e-06, + "loss": 0.3448, + "step": 7639 + }, + { + "epoch": 3.6122931442080377, + "grad_norm": 2.98140025138855, + "learning_rate": 1.7357683371951767e-06, + "loss": 0.4027, + "step": 7640 + }, + { + "epoch": 3.6127659574468085, + "grad_norm": 3.1640074253082275, + "learning_rate": 1.7351743964649908e-06, + "loss": 0.3913, + "step": 7641 + }, + { + "epoch": 3.6132387706855793, + "grad_norm": 2.758202075958252, + "learning_rate": 1.7345805033571417e-06, + "loss": 0.4148, + "step": 7642 + }, + { + "epoch": 3.61371158392435, + "grad_norm": 3.1030571460723877, + "learning_rate": 1.7339866579086074e-06, + "loss": 0.4002, + "step": 7643 + }, + { + "epoch": 3.6141843971631205, + "grad_norm": 3.2414135932922363, + "learning_rate": 1.733392860156366e-06, + "loss": 0.4732, + "step": 7644 + }, + { + "epoch": 3.6146572104018913, + "grad_norm": 2.8720390796661377, + "learning_rate": 1.7327991101373886e-06, + "loss": 0.4112, + "step": 7645 + }, + { + "epoch": 3.615130023640662, + "grad_norm": 3.0104875564575195, + "learning_rate": 1.7322054078886474e-06, + "loss": 0.3934, + "step": 7646 + }, + { + "epoch": 3.6156028368794324, + "grad_norm": 2.8615126609802246, + "learning_rate": 1.7316117534471091e-06, + "loss": 0.3437, + "step": 7647 + }, + { + "epoch": 3.616075650118203, + "grad_norm": 2.8283586502075195, + "learning_rate": 1.7310181468497369e-06, + "loss": 0.374, + "step": 7648 + }, + { + "epoch": 3.616548463356974, + "grad_norm": 3.2289321422576904, + "learning_rate": 1.7304245881334935e-06, + "loss": 0.3899, + "step": 7649 + }, + { + "epoch": 3.617021276595745, + "grad_norm": 3.126882791519165, + "learning_rate": 1.7298310773353356e-06, + "loss": 0.388, + "step": 7650 + }, 
+ { + "epoch": 3.6174940898345156, + "grad_norm": 3.013657569885254, + "learning_rate": 1.7292376144922201e-06, + "loss": 0.379, + "step": 7651 + }, + { + "epoch": 3.617966903073286, + "grad_norm": 3.070192337036133, + "learning_rate": 1.7286441996410989e-06, + "loss": 0.3801, + "step": 7652 + }, + { + "epoch": 3.6184397163120567, + "grad_norm": 2.805380344390869, + "learning_rate": 1.7280508328189199e-06, + "loss": 0.3577, + "step": 7653 + }, + { + "epoch": 3.6189125295508275, + "grad_norm": 3.2853379249572754, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.4168, + "step": 7654 + }, + { + "epoch": 3.619385342789598, + "grad_norm": 3.16316819190979, + "learning_rate": 1.7268642434091761e-06, + "loss": 0.425, + "step": 7655 + }, + { + "epoch": 3.6198581560283687, + "grad_norm": 3.2971179485321045, + "learning_rate": 1.7262710208954947e-06, + "loss": 0.3884, + "step": 7656 + }, + { + "epoch": 3.6203309692671395, + "grad_norm": 3.1823747158050537, + "learning_rate": 1.725677846558524e-06, + "loss": 0.3419, + "step": 7657 + }, + { + "epoch": 3.6208037825059103, + "grad_norm": 3.114654779434204, + "learning_rate": 1.7250847204351973e-06, + "loss": 0.3951, + "step": 7658 + }, + { + "epoch": 3.621276595744681, + "grad_norm": 3.0272440910339355, + "learning_rate": 1.7244916425624482e-06, + "loss": 0.4102, + "step": 7659 + }, + { + "epoch": 3.6217494089834514, + "grad_norm": 2.973611354827881, + "learning_rate": 1.7238986129772035e-06, + "loss": 0.3827, + "step": 7660 + }, + { + "epoch": 3.6222222222222222, + "grad_norm": 3.063713312149048, + "learning_rate": 1.7233056317163894e-06, + "loss": 0.3909, + "step": 7661 + }, + { + "epoch": 3.622695035460993, + "grad_norm": 3.203725576400757, + "learning_rate": 1.7227126988169283e-06, + "loss": 0.3933, + "step": 7662 + }, + { + "epoch": 3.6231678486997634, + "grad_norm": 2.945887327194214, + "learning_rate": 1.7221198143157386e-06, + "loss": 0.3722, + "step": 7663 + }, + { + "epoch": 3.623640661938534, + "grad_norm": 3.042691469192505, + "learning_rate": 1.7215269782497373e-06, + "loss": 0.4108, + "step": 7664 + }, + { + "epoch": 3.624113475177305, + "grad_norm": 2.8496763706207275, + "learning_rate": 1.720934190655837e-06, + "loss": 0.3867, + "step": 7665 + }, + { + "epoch": 3.6245862884160758, + "grad_norm": 2.7017154693603516, + "learning_rate": 1.7203414515709493e-06, + "loss": 0.3246, + "step": 7666 + }, + { + "epoch": 3.6250591016548466, + "grad_norm": 2.66630482673645, + "learning_rate": 1.7197487610319808e-06, + "loss": 0.365, + "step": 7667 + }, + { + "epoch": 3.625531914893617, + "grad_norm": 2.8724591732025146, + "learning_rate": 1.7191561190758348e-06, + "loss": 0.3361, + "step": 7668 + }, + { + "epoch": 3.6260047281323877, + "grad_norm": 3.1413803100585938, + "learning_rate": 1.7185635257394143e-06, + "loss": 0.3949, + "step": 7669 + }, + { + "epoch": 3.6264775413711585, + "grad_norm": 2.9866268634796143, + "learning_rate": 1.7179709810596163e-06, + "loss": 0.3728, + "step": 7670 + }, + { + "epoch": 3.626950354609929, + "grad_norm": 3.003497838973999, + "learning_rate": 1.717378485073336e-06, + "loss": 0.384, + "step": 7671 + }, + { + "epoch": 3.6274231678486997, + "grad_norm": 3.0043468475341797, + "learning_rate": 1.716786037817466e-06, + "loss": 0.3432, + "step": 7672 + }, + { + "epoch": 3.6278959810874705, + "grad_norm": 3.216550827026367, + "learning_rate": 1.7161936393288945e-06, + "loss": 0.3963, + "step": 7673 + }, + { + "epoch": 3.6283687943262413, + "grad_norm": 3.1091387271881104, + "learning_rate": 1.715601289644509e-06, 
+ "loss": 0.4347, + "step": 7674 + }, + { + "epoch": 3.628841607565012, + "grad_norm": 3.2288286685943604, + "learning_rate": 1.7150089888011916e-06, + "loss": 0.4291, + "step": 7675 + }, + { + "epoch": 3.6293144208037824, + "grad_norm": 2.943941831588745, + "learning_rate": 1.7144167368358216e-06, + "loss": 0.3643, + "step": 7676 + }, + { + "epoch": 3.629787234042553, + "grad_norm": 2.819683313369751, + "learning_rate": 1.7138245337852774e-06, + "loss": 0.4051, + "step": 7677 + }, + { + "epoch": 3.630260047281324, + "grad_norm": 2.9988269805908203, + "learning_rate": 1.713232379686432e-06, + "loss": 0.4102, + "step": 7678 + }, + { + "epoch": 3.6307328605200944, + "grad_norm": 3.0041310787200928, + "learning_rate": 1.7126402745761566e-06, + "loss": 0.3854, + "step": 7679 + }, + { + "epoch": 3.631205673758865, + "grad_norm": 2.8700194358825684, + "learning_rate": 1.7120482184913192e-06, + "loss": 0.3441, + "step": 7680 + }, + { + "epoch": 3.631678486997636, + "grad_norm": 3.5275180339813232, + "learning_rate": 1.7114562114687833e-06, + "loss": 0.3808, + "step": 7681 + }, + { + "epoch": 3.6321513002364068, + "grad_norm": 3.182326078414917, + "learning_rate": 1.710864253545412e-06, + "loss": 0.4178, + "step": 7682 + }, + { + "epoch": 3.6326241134751776, + "grad_norm": 3.0514512062072754, + "learning_rate": 1.7102723447580627e-06, + "loss": 0.3527, + "step": 7683 + }, + { + "epoch": 3.633096926713948, + "grad_norm": 2.8293066024780273, + "learning_rate": 1.7096804851435922e-06, + "loss": 0.3723, + "step": 7684 + }, + { + "epoch": 3.6335697399527187, + "grad_norm": 2.9601097106933594, + "learning_rate": 1.709088674738853e-06, + "loss": 0.3704, + "step": 7685 + }, + { + "epoch": 3.6340425531914895, + "grad_norm": 2.8070995807647705, + "learning_rate": 1.7084969135806933e-06, + "loss": 0.346, + "step": 7686 + }, + { + "epoch": 3.63451536643026, + "grad_norm": 3.0162715911865234, + "learning_rate": 1.70790520170596e-06, + "loss": 0.39, + "step": 7687 + }, + { + "epoch": 3.6349881796690307, + "grad_norm": 3.018763780593872, + "learning_rate": 1.7073135391514967e-06, + "loss": 0.4621, + "step": 7688 + }, + { + "epoch": 3.6354609929078014, + "grad_norm": 2.963604688644409, + "learning_rate": 1.706721925954144e-06, + "loss": 0.339, + "step": 7689 + }, + { + "epoch": 3.6359338061465722, + "grad_norm": 2.8532896041870117, + "learning_rate": 1.7061303621507383e-06, + "loss": 0.3915, + "step": 7690 + }, + { + "epoch": 3.636406619385343, + "grad_norm": 3.248006820678711, + "learning_rate": 1.7055388477781133e-06, + "loss": 0.3712, + "step": 7691 + }, + { + "epoch": 3.6368794326241134, + "grad_norm": 3.2195777893066406, + "learning_rate": 1.7049473828731011e-06, + "loss": 0.4358, + "step": 7692 + }, + { + "epoch": 3.637352245862884, + "grad_norm": 2.7190768718719482, + "learning_rate": 1.7043559674725296e-06, + "loss": 0.341, + "step": 7693 + }, + { + "epoch": 3.637825059101655, + "grad_norm": 2.6047232151031494, + "learning_rate": 1.7037646016132223e-06, + "loss": 0.3513, + "step": 7694 + }, + { + "epoch": 3.6382978723404253, + "grad_norm": 3.0824201107025146, + "learning_rate": 1.7031732853320026e-06, + "loss": 0.4097, + "step": 7695 + }, + { + "epoch": 3.638770685579196, + "grad_norm": 2.845461130142212, + "learning_rate": 1.7025820186656883e-06, + "loss": 0.3395, + "step": 7696 + }, + { + "epoch": 3.639243498817967, + "grad_norm": 2.937863826751709, + "learning_rate": 1.7019908016510953e-06, + "loss": 0.395, + "step": 7697 + }, + { + "epoch": 3.6397163120567377, + "grad_norm": 3.349780559539795, + 
"learning_rate": 1.701399634325036e-06, + "loss": 0.3889, + "step": 7698 + }, + { + "epoch": 3.6401891252955085, + "grad_norm": 2.8527066707611084, + "learning_rate": 1.7008085167243187e-06, + "loss": 0.3753, + "step": 7699 + }, + { + "epoch": 3.640661938534279, + "grad_norm": 2.8112385272979736, + "learning_rate": 1.7002174488857517e-06, + "loss": 0.3912, + "step": 7700 + }, + { + "epoch": 3.6411347517730497, + "grad_norm": 2.731933832168579, + "learning_rate": 1.6996264308461363e-06, + "loss": 0.4142, + "step": 7701 + }, + { + "epoch": 3.6416075650118205, + "grad_norm": 3.70465350151062, + "learning_rate": 1.6990354626422744e-06, + "loss": 0.4089, + "step": 7702 + }, + { + "epoch": 3.642080378250591, + "grad_norm": 2.8656258583068848, + "learning_rate": 1.698444544310962e-06, + "loss": 0.3771, + "step": 7703 + }, + { + "epoch": 3.6425531914893616, + "grad_norm": 2.878830671310425, + "learning_rate": 1.697853675888993e-06, + "loss": 0.3754, + "step": 7704 + }, + { + "epoch": 3.6430260047281324, + "grad_norm": 3.440528154373169, + "learning_rate": 1.6972628574131586e-06, + "loss": 0.4543, + "step": 7705 + }, + { + "epoch": 3.6434988179669032, + "grad_norm": 2.70736026763916, + "learning_rate": 1.6966720889202451e-06, + "loss": 0.4049, + "step": 7706 + }, + { + "epoch": 3.643971631205674, + "grad_norm": 2.787992238998413, + "learning_rate": 1.6960813704470391e-06, + "loss": 0.3854, + "step": 7707 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 2.631490707397461, + "learning_rate": 1.6954907020303213e-06, + "loss": 0.3775, + "step": 7708 + }, + { + "epoch": 3.644917257683215, + "grad_norm": 3.052255392074585, + "learning_rate": 1.6949000837068685e-06, + "loss": 0.3873, + "step": 7709 + }, + { + "epoch": 3.645390070921986, + "grad_norm": 2.7443203926086426, + "learning_rate": 1.6943095155134586e-06, + "loss": 0.3362, + "step": 7710 + }, + { + "epoch": 3.6458628841607563, + "grad_norm": 2.931688070297241, + "learning_rate": 1.6937189974868618e-06, + "loss": 0.3839, + "step": 7711 + }, + { + "epoch": 3.646335697399527, + "grad_norm": 2.950242757797241, + "learning_rate": 1.6931285296638479e-06, + "loss": 0.3552, + "step": 7712 + }, + { + "epoch": 3.646808510638298, + "grad_norm": 2.940735340118408, + "learning_rate": 1.6925381120811823e-06, + "loss": 0.3881, + "step": 7713 + }, + { + "epoch": 3.6472813238770687, + "grad_norm": 2.771355390548706, + "learning_rate": 1.6919477447756273e-06, + "loss": 0.3578, + "step": 7714 + }, + { + "epoch": 3.6477541371158395, + "grad_norm": 2.919004201889038, + "learning_rate": 1.6913574277839435e-06, + "loss": 0.3971, + "step": 7715 + }, + { + "epoch": 3.64822695035461, + "grad_norm": 3.293705463409424, + "learning_rate": 1.6907671611428872e-06, + "loss": 0.422, + "step": 7716 + }, + { + "epoch": 3.6486997635933807, + "grad_norm": 2.744239091873169, + "learning_rate": 1.6901769448892103e-06, + "loss": 0.398, + "step": 7717 + }, + { + "epoch": 3.6491725768321515, + "grad_norm": 3.1726129055023193, + "learning_rate": 1.689586779059665e-06, + "loss": 0.39, + "step": 7718 + }, + { + "epoch": 3.649645390070922, + "grad_norm": 3.146743059158325, + "learning_rate": 1.688996663690997e-06, + "loss": 0.4059, + "step": 7719 + }, + { + "epoch": 3.6501182033096926, + "grad_norm": 2.941025495529175, + "learning_rate": 1.688406598819951e-06, + "loss": 0.3479, + "step": 7720 + }, + { + "epoch": 3.6505910165484634, + "grad_norm": 3.3480939865112305, + "learning_rate": 1.6878165844832679e-06, + "loss": 0.4141, + "step": 7721 + }, + { + "epoch": 3.651063829787234, + 
"grad_norm": 2.9145030975341797, + "learning_rate": 1.6872266207176833e-06, + "loss": 0.3497, + "step": 7722 + }, + { + "epoch": 3.651536643026005, + "grad_norm": 3.119502067565918, + "learning_rate": 1.686636707559934e-06, + "loss": 0.424, + "step": 7723 + }, + { + "epoch": 3.6520094562647754, + "grad_norm": 3.0867667198181152, + "learning_rate": 1.6860468450467497e-06, + "loss": 0.3998, + "step": 7724 + }, + { + "epoch": 3.652482269503546, + "grad_norm": 2.9128987789154053, + "learning_rate": 1.6854570332148602e-06, + "loss": 0.4043, + "step": 7725 + }, + { + "epoch": 3.652955082742317, + "grad_norm": 2.9973206520080566, + "learning_rate": 1.6848672721009896e-06, + "loss": 0.3395, + "step": 7726 + }, + { + "epoch": 3.6534278959810873, + "grad_norm": 2.824916124343872, + "learning_rate": 1.6842775617418591e-06, + "loss": 0.4102, + "step": 7727 + }, + { + "epoch": 3.653900709219858, + "grad_norm": 2.7984440326690674, + "learning_rate": 1.6836879021741887e-06, + "loss": 0.3823, + "step": 7728 + }, + { + "epoch": 3.654373522458629, + "grad_norm": 2.8412179946899414, + "learning_rate": 1.6830982934346917e-06, + "loss": 0.3755, + "step": 7729 + }, + { + "epoch": 3.6548463356973997, + "grad_norm": 3.1677138805389404, + "learning_rate": 1.6825087355600836e-06, + "loss": 0.4224, + "step": 7730 + }, + { + "epoch": 3.65531914893617, + "grad_norm": 3.097085475921631, + "learning_rate": 1.6819192285870718e-06, + "loss": 0.4103, + "step": 7731 + }, + { + "epoch": 3.655791962174941, + "grad_norm": 2.9802496433258057, + "learning_rate": 1.6813297725523613e-06, + "loss": 0.4297, + "step": 7732 + }, + { + "epoch": 3.6562647754137116, + "grad_norm": 3.0135059356689453, + "learning_rate": 1.680740367492657e-06, + "loss": 0.4526, + "step": 7733 + }, + { + "epoch": 3.656737588652482, + "grad_norm": 2.7776739597320557, + "learning_rate": 1.6801510134446575e-06, + "loss": 0.3924, + "step": 7734 + }, + { + "epoch": 3.657210401891253, + "grad_norm": 2.7500126361846924, + "learning_rate": 1.6795617104450595e-06, + "loss": 0.3785, + "step": 7735 + }, + { + "epoch": 3.6576832151300236, + "grad_norm": 3.494142770767212, + "learning_rate": 1.6789724585305566e-06, + "loss": 0.3483, + "step": 7736 + }, + { + "epoch": 3.6581560283687944, + "grad_norm": 3.055081605911255, + "learning_rate": 1.6783832577378377e-06, + "loss": 0.4481, + "step": 7737 + }, + { + "epoch": 3.658628841607565, + "grad_norm": 2.781412124633789, + "learning_rate": 1.6777941081035914e-06, + "loss": 0.3969, + "step": 7738 + }, + { + "epoch": 3.6591016548463355, + "grad_norm": 3.1672184467315674, + "learning_rate": 1.677205009664501e-06, + "loss": 0.3959, + "step": 7739 + }, + { + "epoch": 3.6595744680851063, + "grad_norm": 3.0597715377807617, + "learning_rate": 1.6766159624572458e-06, + "loss": 0.418, + "step": 7740 + }, + { + "epoch": 3.660047281323877, + "grad_norm": 3.2906267642974854, + "learning_rate": 1.676026966518505e-06, + "loss": 0.4335, + "step": 7741 + }, + { + "epoch": 3.6605200945626475, + "grad_norm": 3.2519290447235107, + "learning_rate": 1.6754380218849515e-06, + "loss": 0.3786, + "step": 7742 + }, + { + "epoch": 3.6609929078014183, + "grad_norm": 3.24716854095459, + "learning_rate": 1.6748491285932572e-06, + "loss": 0.3599, + "step": 7743 + }, + { + "epoch": 3.661465721040189, + "grad_norm": 3.2940993309020996, + "learning_rate": 1.6742602866800897e-06, + "loss": 0.3934, + "step": 7744 + }, + { + "epoch": 3.66193853427896, + "grad_norm": 2.917409896850586, + "learning_rate": 1.6736714961821124e-06, + "loss": 0.4197, + "step": 7745 + 
}, + { + "epoch": 3.6624113475177307, + "grad_norm": 3.005068063735962, + "learning_rate": 1.6730827571359887e-06, + "loss": 0.4239, + "step": 7746 + }, + { + "epoch": 3.662884160756501, + "grad_norm": 2.751880168914795, + "learning_rate": 1.6724940695783745e-06, + "loss": 0.4257, + "step": 7747 + }, + { + "epoch": 3.663356973995272, + "grad_norm": 3.090670585632324, + "learning_rate": 1.6719054335459273e-06, + "loss": 0.3686, + "step": 7748 + }, + { + "epoch": 3.6638297872340426, + "grad_norm": 3.250251293182373, + "learning_rate": 1.6713168490752974e-06, + "loss": 0.4249, + "step": 7749 + }, + { + "epoch": 3.664302600472813, + "grad_norm": 2.8662827014923096, + "learning_rate": 1.6707283162031335e-06, + "loss": 0.3692, + "step": 7750 + }, + { + "epoch": 3.6647754137115838, + "grad_norm": 2.8709118366241455, + "learning_rate": 1.6701398349660813e-06, + "loss": 0.3929, + "step": 7751 + }, + { + "epoch": 3.6652482269503546, + "grad_norm": 2.992035388946533, + "learning_rate": 1.6695514054007822e-06, + "loss": 0.4131, + "step": 7752 + }, + { + "epoch": 3.6657210401891254, + "grad_norm": 3.0427589416503906, + "learning_rate": 1.668963027543876e-06, + "loss": 0.387, + "step": 7753 + }, + { + "epoch": 3.666193853427896, + "grad_norm": 3.0147807598114014, + "learning_rate": 1.6683747014319987e-06, + "loss": 0.3648, + "step": 7754 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 2.5483829975128174, + "learning_rate": 1.6677864271017811e-06, + "loss": 0.3643, + "step": 7755 + }, + { + "epoch": 3.6671394799054373, + "grad_norm": 2.7661986351013184, + "learning_rate": 1.6671982045898544e-06, + "loss": 0.3731, + "step": 7756 + }, + { + "epoch": 3.667612293144208, + "grad_norm": 2.778036117553711, + "learning_rate": 1.666610033932843e-06, + "loss": 0.3744, + "step": 7757 + }, + { + "epoch": 3.6680851063829785, + "grad_norm": 2.9028329849243164, + "learning_rate": 1.6660219151673712e-06, + "loss": 0.4286, + "step": 7758 + }, + { + "epoch": 3.6685579196217493, + "grad_norm": 2.826687812805176, + "learning_rate": 1.6654338483300575e-06, + "loss": 0.318, + "step": 7759 + }, + { + "epoch": 3.66903073286052, + "grad_norm": 2.7063660621643066, + "learning_rate": 1.6648458334575186e-06, + "loss": 0.3351, + "step": 7760 + }, + { + "epoch": 3.669503546099291, + "grad_norm": 2.708361864089966, + "learning_rate": 1.664257870586368e-06, + "loss": 0.376, + "step": 7761 + }, + { + "epoch": 3.6699763593380617, + "grad_norm": 3.1139161586761475, + "learning_rate": 1.6636699597532141e-06, + "loss": 0.3572, + "step": 7762 + }, + { + "epoch": 3.670449172576832, + "grad_norm": 3.0858285427093506, + "learning_rate": 1.6630821009946658e-06, + "loss": 0.4204, + "step": 7763 + }, + { + "epoch": 3.670921985815603, + "grad_norm": 3.5593984127044678, + "learning_rate": 1.6624942943473252e-06, + "loss": 0.463, + "step": 7764 + }, + { + "epoch": 3.6713947990543736, + "grad_norm": 2.863851308822632, + "learning_rate": 1.6619065398477921e-06, + "loss": 0.4272, + "step": 7765 + }, + { + "epoch": 3.671867612293144, + "grad_norm": 2.833399772644043, + "learning_rate": 1.6613188375326638e-06, + "loss": 0.3509, + "step": 7766 + }, + { + "epoch": 3.6723404255319148, + "grad_norm": 2.988948345184326, + "learning_rate": 1.6607311874385346e-06, + "loss": 0.3572, + "step": 7767 + }, + { + "epoch": 3.6728132387706856, + "grad_norm": 2.7349398136138916, + "learning_rate": 1.6601435896019936e-06, + "loss": 0.3875, + "step": 7768 + }, + { + "epoch": 3.6732860520094563, + "grad_norm": 2.8544445037841797, + "learning_rate": 
1.659556044059629e-06, + "loss": 0.4057, + "step": 7769 + }, + { + "epoch": 3.673758865248227, + "grad_norm": 3.0341904163360596, + "learning_rate": 1.6589685508480235e-06, + "loss": 0.3935, + "step": 7770 + }, + { + "epoch": 3.6742316784869975, + "grad_norm": 2.7495710849761963, + "learning_rate": 1.6583811100037595e-06, + "loss": 0.352, + "step": 7771 + }, + { + "epoch": 3.6747044917257683, + "grad_norm": 3.258525848388672, + "learning_rate": 1.6577937215634133e-06, + "loss": 0.4414, + "step": 7772 + }, + { + "epoch": 3.675177304964539, + "grad_norm": 3.0686328411102295, + "learning_rate": 1.657206385563558e-06, + "loss": 0.353, + "step": 7773 + }, + { + "epoch": 3.6756501182033094, + "grad_norm": 3.1168248653411865, + "learning_rate": 1.6566191020407668e-06, + "loss": 0.4064, + "step": 7774 + }, + { + "epoch": 3.6761229314420802, + "grad_norm": 2.7837352752685547, + "learning_rate": 1.6560318710316053e-06, + "loss": 0.3956, + "step": 7775 + }, + { + "epoch": 3.676595744680851, + "grad_norm": 3.1514039039611816, + "learning_rate": 1.6554446925726391e-06, + "loss": 0.4044, + "step": 7776 + }, + { + "epoch": 3.677068557919622, + "grad_norm": 3.010352611541748, + "learning_rate": 1.6548575667004285e-06, + "loss": 0.4162, + "step": 7777 + }, + { + "epoch": 3.6775413711583926, + "grad_norm": 3.1727633476257324, + "learning_rate": 1.6542704934515308e-06, + "loss": 0.411, + "step": 7778 + }, + { + "epoch": 3.678014184397163, + "grad_norm": 3.6771271228790283, + "learning_rate": 1.6536834728625018e-06, + "loss": 0.4562, + "step": 7779 + }, + { + "epoch": 3.678486997635934, + "grad_norm": 2.9793131351470947, + "learning_rate": 1.6530965049698908e-06, + "loss": 0.4039, + "step": 7780 + }, + { + "epoch": 3.6789598108747046, + "grad_norm": 3.193751096725464, + "learning_rate": 1.6525095898102478e-06, + "loss": 0.4064, + "step": 7781 + }, + { + "epoch": 3.679432624113475, + "grad_norm": 2.6643173694610596, + "learning_rate": 1.6519227274201169e-06, + "loss": 0.3731, + "step": 7782 + }, + { + "epoch": 3.6799054373522457, + "grad_norm": 3.4855685234069824, + "learning_rate": 1.6513359178360384e-06, + "loss": 0.3815, + "step": 7783 + }, + { + "epoch": 3.6803782505910165, + "grad_norm": 3.320537567138672, + "learning_rate": 1.6507491610945514e-06, + "loss": 0.4065, + "step": 7784 + }, + { + "epoch": 3.6808510638297873, + "grad_norm": 3.2793102264404297, + "learning_rate": 1.6501624572321895e-06, + "loss": 0.429, + "step": 7785 + }, + { + "epoch": 3.681323877068558, + "grad_norm": 2.8609631061553955, + "learning_rate": 1.6495758062854854e-06, + "loss": 0.3881, + "step": 7786 + }, + { + "epoch": 3.6817966903073285, + "grad_norm": 2.956533193588257, + "learning_rate": 1.6489892082909675e-06, + "loss": 0.3662, + "step": 7787 + }, + { + "epoch": 3.6822695035460993, + "grad_norm": 2.949092149734497, + "learning_rate": 1.6484026632851591e-06, + "loss": 0.4021, + "step": 7788 + }, + { + "epoch": 3.68274231678487, + "grad_norm": 2.802572250366211, + "learning_rate": 1.6478161713045831e-06, + "loss": 0.3862, + "step": 7789 + }, + { + "epoch": 3.6832151300236404, + "grad_norm": 3.0064797401428223, + "learning_rate": 1.6472297323857578e-06, + "loss": 0.3495, + "step": 7790 + }, + { + "epoch": 3.6836879432624112, + "grad_norm": 2.74037766456604, + "learning_rate": 1.646643346565197e-06, + "loss": 0.362, + "step": 7791 + }, + { + "epoch": 3.684160756501182, + "grad_norm": 3.423326253890991, + "learning_rate": 1.646057013879414e-06, + "loss": 0.4383, + "step": 7792 + }, + { + "epoch": 3.684633569739953, + "grad_norm": 
2.7426443099975586, + "learning_rate": 1.645470734364915e-06, + "loss": 0.3741, + "step": 7793 + }, + { + "epoch": 3.6851063829787236, + "grad_norm": 2.879296064376831, + "learning_rate": 1.6448845080582077e-06, + "loss": 0.4003, + "step": 7794 + }, + { + "epoch": 3.685579196217494, + "grad_norm": 2.6821188926696777, + "learning_rate": 1.6442983349957924e-06, + "loss": 0.3545, + "step": 7795 + }, + { + "epoch": 3.6860520094562648, + "grad_norm": 2.714059829711914, + "learning_rate": 1.6437122152141665e-06, + "loss": 0.3702, + "step": 7796 + }, + { + "epoch": 3.6865248226950356, + "grad_norm": 2.593811511993408, + "learning_rate": 1.6431261487498274e-06, + "loss": 0.3512, + "step": 7797 + }, + { + "epoch": 3.686997635933806, + "grad_norm": 2.687533378601074, + "learning_rate": 1.6425401356392652e-06, + "loss": 0.36, + "step": 7798 + }, + { + "epoch": 3.6874704491725767, + "grad_norm": 3.1675431728363037, + "learning_rate": 1.6419541759189694e-06, + "loss": 0.3349, + "step": 7799 + }, + { + "epoch": 3.6879432624113475, + "grad_norm": 2.777310371398926, + "learning_rate": 1.6413682696254246e-06, + "loss": 0.317, + "step": 7800 + }, + { + "epoch": 3.6884160756501183, + "grad_norm": 3.0121655464172363, + "learning_rate": 1.640782416795112e-06, + "loss": 0.3612, + "step": 7801 + }, + { + "epoch": 3.688888888888889, + "grad_norm": 3.0532145500183105, + "learning_rate": 1.6401966174645113e-06, + "loss": 0.4065, + "step": 7802 + }, + { + "epoch": 3.6893617021276595, + "grad_norm": 2.8221664428710938, + "learning_rate": 1.6396108716700961e-06, + "loss": 0.3669, + "step": 7803 + }, + { + "epoch": 3.6898345153664303, + "grad_norm": 2.966357707977295, + "learning_rate": 1.6390251794483405e-06, + "loss": 0.391, + "step": 7804 + }, + { + "epoch": 3.690307328605201, + "grad_norm": 3.460252046585083, + "learning_rate": 1.6384395408357118e-06, + "loss": 0.429, + "step": 7805 + }, + { + "epoch": 3.6907801418439714, + "grad_norm": 2.8907718658447266, + "learning_rate": 1.637853955868674e-06, + "loss": 0.3761, + "step": 7806 + }, + { + "epoch": 3.691252955082742, + "grad_norm": 3.114612102508545, + "learning_rate": 1.6372684245836912e-06, + "loss": 0.4376, + "step": 7807 + }, + { + "epoch": 3.691725768321513, + "grad_norm": 2.9361326694488525, + "learning_rate": 1.6366829470172191e-06, + "loss": 0.3672, + "step": 7808 + }, + { + "epoch": 3.692198581560284, + "grad_norm": 3.2719476222991943, + "learning_rate": 1.6360975232057156e-06, + "loss": 0.4266, + "step": 7809 + }, + { + "epoch": 3.6926713947990546, + "grad_norm": 2.873952865600586, + "learning_rate": 1.635512153185631e-06, + "loss": 0.4056, + "step": 7810 + }, + { + "epoch": 3.693144208037825, + "grad_norm": 3.0273401737213135, + "learning_rate": 1.634926836993413e-06, + "loss": 0.3947, + "step": 7811 + }, + { + "epoch": 3.6936170212765957, + "grad_norm": 2.868738889694214, + "learning_rate": 1.634341574665509e-06, + "loss": 0.3935, + "step": 7812 + }, + { + "epoch": 3.6940898345153665, + "grad_norm": 3.3080437183380127, + "learning_rate": 1.6337563662383591e-06, + "loss": 0.3606, + "step": 7813 + }, + { + "epoch": 3.694562647754137, + "grad_norm": 2.8339016437530518, + "learning_rate": 1.6331712117484014e-06, + "loss": 0.4019, + "step": 7814 + }, + { + "epoch": 3.6950354609929077, + "grad_norm": 2.666815996170044, + "learning_rate": 1.6325861112320717e-06, + "loss": 0.3502, + "step": 7815 + }, + { + "epoch": 3.6955082742316785, + "grad_norm": 2.7624311447143555, + "learning_rate": 1.6320010647258008e-06, + "loss": 0.3481, + "step": 7816 + }, + { + 
"epoch": 3.6959810874704493, + "grad_norm": 2.7796332836151123, + "learning_rate": 1.6314160722660183e-06, + "loss": 0.3735, + "step": 7817 + }, + { + "epoch": 3.69645390070922, + "grad_norm": 2.954318046569824, + "learning_rate": 1.6308311338891484e-06, + "loss": 0.3933, + "step": 7818 + }, + { + "epoch": 3.6969267139479904, + "grad_norm": 2.821072816848755, + "learning_rate": 1.6302462496316115e-06, + "loss": 0.3437, + "step": 7819 + }, + { + "epoch": 3.6973995271867612, + "grad_norm": 3.436192750930786, + "learning_rate": 1.629661419529828e-06, + "loss": 0.4469, + "step": 7820 + }, + { + "epoch": 3.697872340425532, + "grad_norm": 3.1361067295074463, + "learning_rate": 1.629076643620211e-06, + "loss": 0.3887, + "step": 7821 + }, + { + "epoch": 3.6983451536643024, + "grad_norm": 3.355024576187134, + "learning_rate": 1.6284919219391732e-06, + "loss": 0.424, + "step": 7822 + }, + { + "epoch": 3.698817966903073, + "grad_norm": 2.7671639919281006, + "learning_rate": 1.6279072545231212e-06, + "loss": 0.3765, + "step": 7823 + }, + { + "epoch": 3.699290780141844, + "grad_norm": 2.9509360790252686, + "learning_rate": 1.6273226414084606e-06, + "loss": 0.4057, + "step": 7824 + }, + { + "epoch": 3.699763593380615, + "grad_norm": 2.9852921962738037, + "learning_rate": 1.6267380826315932e-06, + "loss": 0.4238, + "step": 7825 + }, + { + "epoch": 3.7002364066193856, + "grad_norm": 2.826594114303589, + "learning_rate": 1.626153578228915e-06, + "loss": 0.3958, + "step": 7826 + }, + { + "epoch": 3.700709219858156, + "grad_norm": 2.9103410243988037, + "learning_rate": 1.6255691282368228e-06, + "loss": 0.394, + "step": 7827 + }, + { + "epoch": 3.7011820330969267, + "grad_norm": 3.362992525100708, + "learning_rate": 1.6249847326917068e-06, + "loss": 0.4233, + "step": 7828 + }, + { + "epoch": 3.7016548463356975, + "grad_norm": 2.711280107498169, + "learning_rate": 1.624400391629954e-06, + "loss": 0.2977, + "step": 7829 + }, + { + "epoch": 3.702127659574468, + "grad_norm": 2.8354649543762207, + "learning_rate": 1.6238161050879497e-06, + "loss": 0.3549, + "step": 7830 + }, + { + "epoch": 3.7026004728132387, + "grad_norm": 3.096376895904541, + "learning_rate": 1.6232318731020743e-06, + "loss": 0.3486, + "step": 7831 + }, + { + "epoch": 3.7030732860520095, + "grad_norm": 2.918267250061035, + "learning_rate": 1.6226476957087064e-06, + "loss": 0.3659, + "step": 7832 + }, + { + "epoch": 3.7035460992907803, + "grad_norm": 2.705399513244629, + "learning_rate": 1.6220635729442195e-06, + "loss": 0.4301, + "step": 7833 + }, + { + "epoch": 3.704018912529551, + "grad_norm": 2.9192235469818115, + "learning_rate": 1.621479504844983e-06, + "loss": 0.3384, + "step": 7834 + }, + { + "epoch": 3.7044917257683214, + "grad_norm": 2.78623104095459, + "learning_rate": 1.6208954914473669e-06, + "loss": 0.3528, + "step": 7835 + }, + { + "epoch": 3.704964539007092, + "grad_norm": 3.0218069553375244, + "learning_rate": 1.6203115327877333e-06, + "loss": 0.3698, + "step": 7836 + }, + { + "epoch": 3.705437352245863, + "grad_norm": 3.019101619720459, + "learning_rate": 1.6197276289024422e-06, + "loss": 0.4398, + "step": 7837 + }, + { + "epoch": 3.7059101654846334, + "grad_norm": 2.9220848083496094, + "learning_rate": 1.6191437798278531e-06, + "loss": 0.3803, + "step": 7838 + }, + { + "epoch": 3.706382978723404, + "grad_norm": 3.2731969356536865, + "learning_rate": 1.6185599856003181e-06, + "loss": 0.4529, + "step": 7839 + }, + { + "epoch": 3.706855791962175, + "grad_norm": 2.85239577293396, + "learning_rate": 1.617976246256188e-06, + "loss": 
0.3801, + "step": 7840 + }, + { + "epoch": 3.7073286052009458, + "grad_norm": 2.8250765800476074, + "learning_rate": 1.6173925618318092e-06, + "loss": 0.3267, + "step": 7841 + }, + { + "epoch": 3.7078014184397166, + "grad_norm": 2.9152321815490723, + "learning_rate": 1.616808932363525e-06, + "loss": 0.428, + "step": 7842 + }, + { + "epoch": 3.708274231678487, + "grad_norm": 2.912656545639038, + "learning_rate": 1.6162253578876766e-06, + "loss": 0.3802, + "step": 7843 + }, + { + "epoch": 3.7087470449172577, + "grad_norm": 3.0700762271881104, + "learning_rate": 1.6156418384405992e-06, + "loss": 0.377, + "step": 7844 + }, + { + "epoch": 3.7092198581560285, + "grad_norm": 2.873141050338745, + "learning_rate": 1.6150583740586274e-06, + "loss": 0.399, + "step": 7845 + }, + { + "epoch": 3.709692671394799, + "grad_norm": 2.899555206298828, + "learning_rate": 1.6144749647780906e-06, + "loss": 0.402, + "step": 7846 + }, + { + "epoch": 3.7101654846335697, + "grad_norm": 3.257697343826294, + "learning_rate": 1.6138916106353139e-06, + "loss": 0.4193, + "step": 7847 + }, + { + "epoch": 3.7106382978723405, + "grad_norm": 2.6879804134368896, + "learning_rate": 1.613308311666622e-06, + "loss": 0.3474, + "step": 7848 + }, + { + "epoch": 3.7111111111111112, + "grad_norm": 2.712491273880005, + "learning_rate": 1.6127250679083323e-06, + "loss": 0.3315, + "step": 7849 + }, + { + "epoch": 3.711583924349882, + "grad_norm": 2.9762673377990723, + "learning_rate": 1.6121418793967631e-06, + "loss": 0.3953, + "step": 7850 + }, + { + "epoch": 3.7120567375886524, + "grad_norm": 2.743668556213379, + "learning_rate": 1.6115587461682258e-06, + "loss": 0.381, + "step": 7851 + }, + { + "epoch": 3.712529550827423, + "grad_norm": 3.0545318126678467, + "learning_rate": 1.6109756682590288e-06, + "loss": 0.412, + "step": 7852 + }, + { + "epoch": 3.713002364066194, + "grad_norm": 3.0125906467437744, + "learning_rate": 1.61039264570548e-06, + "loss": 0.3931, + "step": 7853 + }, + { + "epoch": 3.7134751773049643, + "grad_norm": 2.809302806854248, + "learning_rate": 1.6098096785438794e-06, + "loss": 0.3943, + "step": 7854 + }, + { + "epoch": 3.713947990543735, + "grad_norm": 3.092452049255371, + "learning_rate": 1.6092267668105276e-06, + "loss": 0.3932, + "step": 7855 + }, + { + "epoch": 3.714420803782506, + "grad_norm": 2.9878969192504883, + "learning_rate": 1.608643910541719e-06, + "loss": 0.4289, + "step": 7856 + }, + { + "epoch": 3.7148936170212767, + "grad_norm": 2.693387508392334, + "learning_rate": 1.6080611097737444e-06, + "loss": 0.373, + "step": 7857 + }, + { + "epoch": 3.7153664302600475, + "grad_norm": 3.4097673892974854, + "learning_rate": 1.6074783645428945e-06, + "loss": 0.4487, + "step": 7858 + }, + { + "epoch": 3.715839243498818, + "grad_norm": 3.1466784477233887, + "learning_rate": 1.6068956748854525e-06, + "loss": 0.3648, + "step": 7859 + }, + { + "epoch": 3.7163120567375887, + "grad_norm": 3.062107563018799, + "learning_rate": 1.6063130408377015e-06, + "loss": 0.3899, + "step": 7860 + }, + { + "epoch": 3.7167848699763595, + "grad_norm": 3.2298364639282227, + "learning_rate": 1.6057304624359188e-06, + "loss": 0.4243, + "step": 7861 + }, + { + "epoch": 3.71725768321513, + "grad_norm": 3.0285773277282715, + "learning_rate": 1.6051479397163784e-06, + "loss": 0.3469, + "step": 7862 + }, + { + "epoch": 3.7177304964539006, + "grad_norm": 2.8438515663146973, + "learning_rate": 1.6045654727153525e-06, + "loss": 0.3363, + "step": 7863 + }, + { + "epoch": 3.7182033096926714, + "grad_norm": 3.1558034420013428, + 
"learning_rate": 1.6039830614691081e-06, + "loss": 0.4326, + "step": 7864 + }, + { + "epoch": 3.7186761229314422, + "grad_norm": 2.438640594482422, + "learning_rate": 1.603400706013909e-06, + "loss": 0.3647, + "step": 7865 + }, + { + "epoch": 3.719148936170213, + "grad_norm": 3.0443127155303955, + "learning_rate": 1.6028184063860168e-06, + "loss": 0.4295, + "step": 7866 + }, + { + "epoch": 3.7196217494089834, + "grad_norm": 2.8009512424468994, + "learning_rate": 1.602236162621688e-06, + "loss": 0.4055, + "step": 7867 + }, + { + "epoch": 3.720094562647754, + "grad_norm": 3.227698802947998, + "learning_rate": 1.6016539747571775e-06, + "loss": 0.4681, + "step": 7868 + }, + { + "epoch": 3.720567375886525, + "grad_norm": 2.8242595195770264, + "learning_rate": 1.601071842828735e-06, + "loss": 0.3707, + "step": 7869 + }, + { + "epoch": 3.7210401891252953, + "grad_norm": 3.602937698364258, + "learning_rate": 1.6004897668726067e-06, + "loss": 0.5201, + "step": 7870 + }, + { + "epoch": 3.721513002364066, + "grad_norm": 3.578422784805298, + "learning_rate": 1.599907746925037e-06, + "loss": 0.4514, + "step": 7871 + }, + { + "epoch": 3.721985815602837, + "grad_norm": 2.7365758419036865, + "learning_rate": 1.5993257830222635e-06, + "loss": 0.356, + "step": 7872 + }, + { + "epoch": 3.7224586288416077, + "grad_norm": 3.125636577606201, + "learning_rate": 1.5987438752005258e-06, + "loss": 0.4277, + "step": 7873 + }, + { + "epoch": 3.7229314420803785, + "grad_norm": 2.7157294750213623, + "learning_rate": 1.5981620234960549e-06, + "loss": 0.363, + "step": 7874 + }, + { + "epoch": 3.723404255319149, + "grad_norm": 2.90950083732605, + "learning_rate": 1.5975802279450793e-06, + "loss": 0.4027, + "step": 7875 + }, + { + "epoch": 3.7238770685579197, + "grad_norm": 2.659787178039551, + "learning_rate": 1.596998488583827e-06, + "loss": 0.3632, + "step": 7876 + }, + { + "epoch": 3.7243498817966905, + "grad_norm": 3.221623182296753, + "learning_rate": 1.5964168054485185e-06, + "loss": 0.4295, + "step": 7877 + }, + { + "epoch": 3.724822695035461, + "grad_norm": 2.6838672161102295, + "learning_rate": 1.595835178575374e-06, + "loss": 0.3413, + "step": 7878 + }, + { + "epoch": 3.7252955082742316, + "grad_norm": 2.804706335067749, + "learning_rate": 1.5952536080006084e-06, + "loss": 0.3801, + "step": 7879 + }, + { + "epoch": 3.7257683215130024, + "grad_norm": 2.7647509574890137, + "learning_rate": 1.5946720937604326e-06, + "loss": 0.3941, + "step": 7880 + }, + { + "epoch": 3.726241134751773, + "grad_norm": 2.8363754749298096, + "learning_rate": 1.5940906358910566e-06, + "loss": 0.3772, + "step": 7881 + }, + { + "epoch": 3.726713947990544, + "grad_norm": 3.4147698879241943, + "learning_rate": 1.5935092344286835e-06, + "loss": 0.399, + "step": 7882 + }, + { + "epoch": 3.7271867612293144, + "grad_norm": 2.984090805053711, + "learning_rate": 1.5929278894095162e-06, + "loss": 0.3373, + "step": 7883 + }, + { + "epoch": 3.727659574468085, + "grad_norm": 3.250173330307007, + "learning_rate": 1.5923466008697521e-06, + "loss": 0.3932, + "step": 7884 + }, + { + "epoch": 3.728132387706856, + "grad_norm": 3.2699649333953857, + "learning_rate": 1.5917653688455848e-06, + "loss": 0.4529, + "step": 7885 + }, + { + "epoch": 3.7286052009456263, + "grad_norm": 3.175934076309204, + "learning_rate": 1.591184193373206e-06, + "loss": 0.3726, + "step": 7886 + }, + { + "epoch": 3.729078014184397, + "grad_norm": 2.8128812313079834, + "learning_rate": 1.5906030744888024e-06, + "loss": 0.3648, + "step": 7887 + }, + { + "epoch": 3.729550827423168, + 
"grad_norm": 3.025012493133545, + "learning_rate": 1.5900220122285564e-06, + "loss": 0.3945, + "step": 7888 + }, + { + "epoch": 3.7300236406619387, + "grad_norm": 3.237680435180664, + "learning_rate": 1.5894410066286512e-06, + "loss": 0.3815, + "step": 7889 + }, + { + "epoch": 3.7304964539007095, + "grad_norm": 3.458033323287964, + "learning_rate": 1.5888600577252605e-06, + "loss": 0.4104, + "step": 7890 + }, + { + "epoch": 3.73096926713948, + "grad_norm": 2.718867540359497, + "learning_rate": 1.58827916555456e-06, + "loss": 0.3243, + "step": 7891 + }, + { + "epoch": 3.7314420803782506, + "grad_norm": 3.047157049179077, + "learning_rate": 1.5876983301527176e-06, + "loss": 0.3689, + "step": 7892 + }, + { + "epoch": 3.731914893617021, + "grad_norm": 3.2904715538024902, + "learning_rate": 1.5871175515558995e-06, + "loss": 0.4045, + "step": 7893 + }, + { + "epoch": 3.732387706855792, + "grad_norm": 2.956467866897583, + "learning_rate": 1.5865368298002692e-06, + "loss": 0.3806, + "step": 7894 + }, + { + "epoch": 3.7328605200945626, + "grad_norm": 3.3309173583984375, + "learning_rate": 1.5859561649219843e-06, + "loss": 0.4011, + "step": 7895 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 2.7853524684906006, + "learning_rate": 1.5853755569572018e-06, + "loss": 0.3239, + "step": 7896 + }, + { + "epoch": 3.733806146572104, + "grad_norm": 2.9832780361175537, + "learning_rate": 1.584795005942073e-06, + "loss": 0.4582, + "step": 7897 + }, + { + "epoch": 3.7342789598108745, + "grad_norm": 3.2866461277008057, + "learning_rate": 1.584214511912745e-06, + "loss": 0.3876, + "step": 7898 + }, + { + "epoch": 3.7347517730496453, + "grad_norm": 3.018526792526245, + "learning_rate": 1.5836340749053646e-06, + "loss": 0.3221, + "step": 7899 + }, + { + "epoch": 3.735224586288416, + "grad_norm": 2.9109885692596436, + "learning_rate": 1.583053694956072e-06, + "loss": 0.4225, + "step": 7900 + }, + { + "epoch": 3.7356973995271865, + "grad_norm": 3.104146718978882, + "learning_rate": 1.5824733721010051e-06, + "loss": 0.3843, + "step": 7901 + }, + { + "epoch": 3.7361702127659573, + "grad_norm": 3.0982813835144043, + "learning_rate": 1.5818931063762989e-06, + "loss": 0.4223, + "step": 7902 + }, + { + "epoch": 3.736643026004728, + "grad_norm": 2.7797579765319824, + "learning_rate": 1.5813128978180819e-06, + "loss": 0.3536, + "step": 7903 + }, + { + "epoch": 3.737115839243499, + "grad_norm": 2.870884656906128, + "learning_rate": 1.5807327464624835e-06, + "loss": 0.3053, + "step": 7904 + }, + { + "epoch": 3.7375886524822697, + "grad_norm": 2.896674633026123, + "learning_rate": 1.5801526523456251e-06, + "loss": 0.3806, + "step": 7905 + }, + { + "epoch": 3.73806146572104, + "grad_norm": 3.009662389755249, + "learning_rate": 1.5795726155036284e-06, + "loss": 0.3568, + "step": 7906 + }, + { + "epoch": 3.738534278959811, + "grad_norm": 2.6860599517822266, + "learning_rate": 1.578992635972609e-06, + "loss": 0.4392, + "step": 7907 + }, + { + "epoch": 3.7390070921985816, + "grad_norm": 2.9046099185943604, + "learning_rate": 1.578412713788679e-06, + "loss": 0.3756, + "step": 7908 + }, + { + "epoch": 3.739479905437352, + "grad_norm": 2.8035101890563965, + "learning_rate": 1.5778328489879488e-06, + "loss": 0.3576, + "step": 7909 + }, + { + "epoch": 3.739952718676123, + "grad_norm": 2.767514228820801, + "learning_rate": 1.5772530416065238e-06, + "loss": 0.4037, + "step": 7910 + }, + { + "epoch": 3.7404255319148936, + "grad_norm": 3.0867795944213867, + "learning_rate": 1.576673291680505e-06, + "loss": 0.4394, + "step": 7911 + }, 
+ { + "epoch": 3.7408983451536644, + "grad_norm": 3.295976161956787, + "learning_rate": 1.5760935992459926e-06, + "loss": 0.3938, + "step": 7912 + }, + { + "epoch": 3.741371158392435, + "grad_norm": 2.725949287414551, + "learning_rate": 1.5755139643390794e-06, + "loss": 0.3633, + "step": 7913 + }, + { + "epoch": 3.7418439716312055, + "grad_norm": 3.0864083766937256, + "learning_rate": 1.5749343869958585e-06, + "loss": 0.3034, + "step": 7914 + }, + { + "epoch": 3.7423167848699763, + "grad_norm": 3.707273244857788, + "learning_rate": 1.5743548672524175e-06, + "loss": 0.4206, + "step": 7915 + }, + { + "epoch": 3.742789598108747, + "grad_norm": 2.9829516410827637, + "learning_rate": 1.573775405144839e-06, + "loss": 0.333, + "step": 7916 + }, + { + "epoch": 3.7432624113475175, + "grad_norm": 3.3303117752075195, + "learning_rate": 1.5731960007092056e-06, + "loss": 0.4558, + "step": 7917 + }, + { + "epoch": 3.7437352245862883, + "grad_norm": 2.63291335105896, + "learning_rate": 1.5726166539815925e-06, + "loss": 0.39, + "step": 7918 + }, + { + "epoch": 3.744208037825059, + "grad_norm": 3.0533673763275146, + "learning_rate": 1.572037364998075e-06, + "loss": 0.3586, + "step": 7919 + }, + { + "epoch": 3.74468085106383, + "grad_norm": 2.9185104370117188, + "learning_rate": 1.5714581337947216e-06, + "loss": 0.3809, + "step": 7920 + }, + { + "epoch": 3.7451536643026007, + "grad_norm": 3.1863298416137695, + "learning_rate": 1.5708789604075975e-06, + "loss": 0.4132, + "step": 7921 + }, + { + "epoch": 3.745626477541371, + "grad_norm": 3.2700514793395996, + "learning_rate": 1.5702998448727674e-06, + "loss": 0.4601, + "step": 7922 + }, + { + "epoch": 3.746099290780142, + "grad_norm": 3.4729206562042236, + "learning_rate": 1.5697207872262886e-06, + "loss": 0.4585, + "step": 7923 + }, + { + "epoch": 3.7465721040189126, + "grad_norm": 3.1432926654815674, + "learning_rate": 1.5691417875042182e-06, + "loss": 0.3128, + "step": 7924 + }, + { + "epoch": 3.747044917257683, + "grad_norm": 3.096121072769165, + "learning_rate": 1.5685628457426066e-06, + "loss": 0.3903, + "step": 7925 + }, + { + "epoch": 3.7475177304964538, + "grad_norm": 2.6897027492523193, + "learning_rate": 1.5679839619775023e-06, + "loss": 0.3707, + "step": 7926 + }, + { + "epoch": 3.7479905437352246, + "grad_norm": 2.8020687103271484, + "learning_rate": 1.5674051362449503e-06, + "loss": 0.3986, + "step": 7927 + }, + { + "epoch": 3.7484633569739954, + "grad_norm": 3.1278326511383057, + "learning_rate": 1.56682636858099e-06, + "loss": 0.3899, + "step": 7928 + }, + { + "epoch": 3.748936170212766, + "grad_norm": 2.907982587814331, + "learning_rate": 1.5662476590216613e-06, + "loss": 0.3422, + "step": 7929 + }, + { + "epoch": 3.7494089834515365, + "grad_norm": 3.1246347427368164, + "learning_rate": 1.5656690076029962e-06, + "loss": 0.452, + "step": 7930 + }, + { + "epoch": 3.7498817966903073, + "grad_norm": 2.9161367416381836, + "learning_rate": 1.565090414361024e-06, + "loss": 0.3284, + "step": 7931 + }, + { + "epoch": 3.750354609929078, + "grad_norm": 2.943183422088623, + "learning_rate": 1.564511879331773e-06, + "loss": 0.3478, + "step": 7932 + }, + { + "epoch": 3.7508274231678485, + "grad_norm": 3.2308566570281982, + "learning_rate": 1.563933402551266e-06, + "loss": 0.4143, + "step": 7933 + }, + { + "epoch": 3.7513002364066192, + "grad_norm": 2.6846251487731934, + "learning_rate": 1.5633549840555206e-06, + "loss": 0.3681, + "step": 7934 + }, + { + "epoch": 3.75177304964539, + "grad_norm": 3.0995283126831055, + "learning_rate": 1.562776623880554e-06, 
+ "loss": 0.4642, + "step": 7935 + }, + { + "epoch": 3.752245862884161, + "grad_norm": 2.7406163215637207, + "learning_rate": 1.562198322062376e-06, + "loss": 0.3823, + "step": 7936 + }, + { + "epoch": 3.7527186761229316, + "grad_norm": 2.85732364654541, + "learning_rate": 1.5616200786369978e-06, + "loss": 0.3053, + "step": 7937 + }, + { + "epoch": 3.753191489361702, + "grad_norm": 2.812526226043701, + "learning_rate": 1.5610418936404223e-06, + "loss": 0.3944, + "step": 7938 + }, + { + "epoch": 3.753664302600473, + "grad_norm": 2.8886849880218506, + "learning_rate": 1.5604637671086499e-06, + "loss": 0.3936, + "step": 7939 + }, + { + "epoch": 3.7541371158392436, + "grad_norm": 2.831774950027466, + "learning_rate": 1.5598856990776801e-06, + "loss": 0.3146, + "step": 7940 + }, + { + "epoch": 3.754609929078014, + "grad_norm": 2.8853790760040283, + "learning_rate": 1.5593076895835052e-06, + "loss": 0.3286, + "step": 7941 + }, + { + "epoch": 3.7550827423167847, + "grad_norm": 3.2724483013153076, + "learning_rate": 1.5587297386621158e-06, + "loss": 0.3396, + "step": 7942 + }, + { + "epoch": 3.7555555555555555, + "grad_norm": 3.5077168941497803, + "learning_rate": 1.5581518463494983e-06, + "loss": 0.4528, + "step": 7943 + }, + { + "epoch": 3.7560283687943263, + "grad_norm": 3.031503915786743, + "learning_rate": 1.5575740126816346e-06, + "loss": 0.3803, + "step": 7944 + }, + { + "epoch": 3.756501182033097, + "grad_norm": 3.0939114093780518, + "learning_rate": 1.556996237694506e-06, + "loss": 0.3931, + "step": 7945 + }, + { + "epoch": 3.7569739952718675, + "grad_norm": 2.9404146671295166, + "learning_rate": 1.556418521424085e-06, + "loss": 0.3608, + "step": 7946 + }, + { + "epoch": 3.7574468085106383, + "grad_norm": 3.4363012313842773, + "learning_rate": 1.5558408639063465e-06, + "loss": 0.4335, + "step": 7947 + }, + { + "epoch": 3.757919621749409, + "grad_norm": 3.2819864749908447, + "learning_rate": 1.5552632651772575e-06, + "loss": 0.4147, + "step": 7948 + }, + { + "epoch": 3.7583924349881794, + "grad_norm": 2.917788505554199, + "learning_rate": 1.554685725272782e-06, + "loss": 0.3516, + "step": 7949 + }, + { + "epoch": 3.7588652482269502, + "grad_norm": 2.8425943851470947, + "learning_rate": 1.5541082442288818e-06, + "loss": 0.3596, + "step": 7950 + }, + { + "epoch": 3.759338061465721, + "grad_norm": 3.087005376815796, + "learning_rate": 1.5535308220815126e-06, + "loss": 0.3968, + "step": 7951 + }, + { + "epoch": 3.759810874704492, + "grad_norm": 2.743110179901123, + "learning_rate": 1.5529534588666298e-06, + "loss": 0.3802, + "step": 7952 + }, + { + "epoch": 3.7602836879432626, + "grad_norm": 2.914424180984497, + "learning_rate": 1.5523761546201825e-06, + "loss": 0.4055, + "step": 7953 + }, + { + "epoch": 3.760756501182033, + "grad_norm": 2.9691991806030273, + "learning_rate": 1.551798909378116e-06, + "loss": 0.3384, + "step": 7954 + }, + { + "epoch": 3.7612293144208038, + "grad_norm": 2.433657646179199, + "learning_rate": 1.5512217231763747e-06, + "loss": 0.3019, + "step": 7955 + }, + { + "epoch": 3.7617021276595746, + "grad_norm": 2.7904880046844482, + "learning_rate": 1.5506445960508957e-06, + "loss": 0.389, + "step": 7956 + }, + { + "epoch": 3.762174940898345, + "grad_norm": 2.9241607189178467, + "learning_rate": 1.5500675280376154e-06, + "loss": 0.4291, + "step": 7957 + }, + { + "epoch": 3.7626477541371157, + "grad_norm": 3.216491222381592, + "learning_rate": 1.549490519172465e-06, + "loss": 0.4065, + "step": 7958 + }, + { + "epoch": 3.7631205673758865, + "grad_norm": 2.8859689235687256, + 
"learning_rate": 1.548913569491371e-06, + "loss": 0.353, + "step": 7959 + }, + { + "epoch": 3.7635933806146573, + "grad_norm": 2.958773136138916, + "learning_rate": 1.5483366790302594e-06, + "loss": 0.3829, + "step": 7960 + }, + { + "epoch": 3.764066193853428, + "grad_norm": 2.868649482727051, + "learning_rate": 1.5477598478250505e-06, + "loss": 0.3591, + "step": 7961 + }, + { + "epoch": 3.7645390070921985, + "grad_norm": 2.6912996768951416, + "learning_rate": 1.5471830759116591e-06, + "loss": 0.3695, + "step": 7962 + }, + { + "epoch": 3.7650118203309693, + "grad_norm": 3.3318257331848145, + "learning_rate": 1.5466063633260004e-06, + "loss": 0.4126, + "step": 7963 + }, + { + "epoch": 3.76548463356974, + "grad_norm": 2.865525007247925, + "learning_rate": 1.5460297101039825e-06, + "loss": 0.4235, + "step": 7964 + }, + { + "epoch": 3.7659574468085104, + "grad_norm": 2.8639180660247803, + "learning_rate": 1.5454531162815123e-06, + "loss": 0.4392, + "step": 7965 + }, + { + "epoch": 3.766430260047281, + "grad_norm": 2.5752499103546143, + "learning_rate": 1.5448765818944902e-06, + "loss": 0.4113, + "step": 7966 + }, + { + "epoch": 3.766903073286052, + "grad_norm": 2.7622742652893066, + "learning_rate": 1.5443001069788155e-06, + "loss": 0.3785, + "step": 7967 + }, + { + "epoch": 3.767375886524823, + "grad_norm": 2.965579032897949, + "learning_rate": 1.5437236915703829e-06, + "loss": 0.335, + "step": 7968 + }, + { + "epoch": 3.7678486997635936, + "grad_norm": 3.0587408542633057, + "learning_rate": 1.5431473357050816e-06, + "loss": 0.4047, + "step": 7969 + }, + { + "epoch": 3.768321513002364, + "grad_norm": 3.2929413318634033, + "learning_rate": 1.5425710394188014e-06, + "loss": 0.4061, + "step": 7970 + }, + { + "epoch": 3.7687943262411348, + "grad_norm": 2.663043975830078, + "learning_rate": 1.541994802747424e-06, + "loss": 0.3478, + "step": 7971 + }, + { + "epoch": 3.7692671394799055, + "grad_norm": 3.0657591819763184, + "learning_rate": 1.5414186257268293e-06, + "loss": 0.3735, + "step": 7972 + }, + { + "epoch": 3.769739952718676, + "grad_norm": 2.963189125061035, + "learning_rate": 1.5408425083928939e-06, + "loss": 0.4743, + "step": 7973 + }, + { + "epoch": 3.7702127659574467, + "grad_norm": 3.1509387493133545, + "learning_rate": 1.540266450781489e-06, + "loss": 0.4164, + "step": 7974 + }, + { + "epoch": 3.7706855791962175, + "grad_norm": 3.4436306953430176, + "learning_rate": 1.539690452928485e-06, + "loss": 0.4583, + "step": 7975 + }, + { + "epoch": 3.7711583924349883, + "grad_norm": 3.1746156215667725, + "learning_rate": 1.5391145148697454e-06, + "loss": 0.4042, + "step": 7976 + }, + { + "epoch": 3.771631205673759, + "grad_norm": 3.531028985977173, + "learning_rate": 1.5385386366411304e-06, + "loss": 0.4304, + "step": 7977 + }, + { + "epoch": 3.7721040189125294, + "grad_norm": 2.867871046066284, + "learning_rate": 1.5379628182785e-06, + "loss": 0.4023, + "step": 7978 + }, + { + "epoch": 3.7725768321513002, + "grad_norm": 3.0504629611968994, + "learning_rate": 1.5373870598177051e-06, + "loss": 0.3785, + "step": 7979 + }, + { + "epoch": 3.773049645390071, + "grad_norm": 2.8188650608062744, + "learning_rate": 1.5368113612945983e-06, + "loss": 0.3808, + "step": 7980 + }, + { + "epoch": 3.7735224586288414, + "grad_norm": 3.0809133052825928, + "learning_rate": 1.5362357227450248e-06, + "loss": 0.3912, + "step": 7981 + }, + { + "epoch": 3.773995271867612, + "grad_norm": 3.223273277282715, + "learning_rate": 1.5356601442048257e-06, + "loss": 0.3802, + "step": 7982 + }, + { + "epoch": 
3.774468085106383, + "grad_norm": 2.7513339519500732, + "learning_rate": 1.535084625709842e-06, + "loss": 0.3822, + "step": 7983 + }, + { + "epoch": 3.774940898345154, + "grad_norm": 3.085592031478882, + "learning_rate": 1.5345091672959074e-06, + "loss": 0.4348, + "step": 7984 + }, + { + "epoch": 3.7754137115839246, + "grad_norm": 3.315108299255371, + "learning_rate": 1.5339337689988525e-06, + "loss": 0.4196, + "step": 7985 + }, + { + "epoch": 3.775886524822695, + "grad_norm": 3.713372230529785, + "learning_rate": 1.533358430854507e-06, + "loss": 0.4292, + "step": 7986 + }, + { + "epoch": 3.7763593380614657, + "grad_norm": 2.7899155616760254, + "learning_rate": 1.532783152898692e-06, + "loss": 0.3874, + "step": 7987 + }, + { + "epoch": 3.7768321513002365, + "grad_norm": 2.918851852416992, + "learning_rate": 1.5322079351672297e-06, + "loss": 0.4073, + "step": 7988 + }, + { + "epoch": 3.777304964539007, + "grad_norm": 3.13395619392395, + "learning_rate": 1.5316327776959361e-06, + "loss": 0.3441, + "step": 7989 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 3.2320916652679443, + "learning_rate": 1.531057680520623e-06, + "loss": 0.372, + "step": 7990 + }, + { + "epoch": 3.7782505910165485, + "grad_norm": 3.1130621433258057, + "learning_rate": 1.5304826436770991e-06, + "loss": 0.3514, + "step": 7991 + }, + { + "epoch": 3.7787234042553193, + "grad_norm": 3.223207712173462, + "learning_rate": 1.5299076672011696e-06, + "loss": 0.44, + "step": 7992 + }, + { + "epoch": 3.77919621749409, + "grad_norm": 3.0757877826690674, + "learning_rate": 1.5293327511286366e-06, + "loss": 0.4051, + "step": 7993 + }, + { + "epoch": 3.7796690307328604, + "grad_norm": 2.936678409576416, + "learning_rate": 1.528757895495297e-06, + "loss": 0.3965, + "step": 7994 + }, + { + "epoch": 3.780141843971631, + "grad_norm": 2.993445873260498, + "learning_rate": 1.5281831003369435e-06, + "loss": 0.4222, + "step": 7995 + }, + { + "epoch": 3.780614657210402, + "grad_norm": 2.9140853881835938, + "learning_rate": 1.5276083656893679e-06, + "loss": 0.3662, + "step": 7996 + }, + { + "epoch": 3.7810874704491724, + "grad_norm": 3.2649893760681152, + "learning_rate": 1.5270336915883549e-06, + "loss": 0.4272, + "step": 7997 + }, + { + "epoch": 3.781560283687943, + "grad_norm": 3.0631372928619385, + "learning_rate": 1.5264590780696887e-06, + "loss": 0.4111, + "step": 7998 + }, + { + "epoch": 3.782033096926714, + "grad_norm": 2.791299343109131, + "learning_rate": 1.5258845251691463e-06, + "loss": 0.416, + "step": 7999 + }, + { + "epoch": 3.7825059101654848, + "grad_norm": 3.262294054031372, + "learning_rate": 1.5253100329225023e-06, + "loss": 0.4236, + "step": 8000 + }, + { + "epoch": 3.7829787234042556, + "grad_norm": 2.574486017227173, + "learning_rate": 1.5247356013655295e-06, + "loss": 0.4089, + "step": 8001 + }, + { + "epoch": 3.783451536643026, + "grad_norm": 3.1566531658172607, + "learning_rate": 1.5241612305339936e-06, + "loss": 0.3955, + "step": 8002 + }, + { + "epoch": 3.7839243498817967, + "grad_norm": 2.5845813751220703, + "learning_rate": 1.5235869204636602e-06, + "loss": 0.3672, + "step": 8003 + }, + { + "epoch": 3.7843971631205675, + "grad_norm": 2.877570629119873, + "learning_rate": 1.5230126711902876e-06, + "loss": 0.3919, + "step": 8004 + }, + { + "epoch": 3.784869976359338, + "grad_norm": 3.183061122894287, + "learning_rate": 1.5224384827496314e-06, + "loss": 0.3291, + "step": 8005 + }, + { + "epoch": 3.7853427895981087, + "grad_norm": 3.0778391361236572, + "learning_rate": 1.5218643551774451e-06, + "loss": 0.3571, 
+ "step": 8006 + }, + { + "epoch": 3.7858156028368795, + "grad_norm": 3.2364399433135986, + "learning_rate": 1.5212902885094762e-06, + "loss": 0.4045, + "step": 8007 + }, + { + "epoch": 3.7862884160756503, + "grad_norm": 3.0571746826171875, + "learning_rate": 1.5207162827814687e-06, + "loss": 0.4181, + "step": 8008 + }, + { + "epoch": 3.786761229314421, + "grad_norm": 2.7215163707733154, + "learning_rate": 1.5201423380291652e-06, + "loss": 0.3328, + "step": 8009 + }, + { + "epoch": 3.7872340425531914, + "grad_norm": 3.0521233081817627, + "learning_rate": 1.5195684542883007e-06, + "loss": 0.4072, + "step": 8010 + }, + { + "epoch": 3.787706855791962, + "grad_norm": 2.541666269302368, + "learning_rate": 1.5189946315946104e-06, + "loss": 0.3293, + "step": 8011 + }, + { + "epoch": 3.788179669030733, + "grad_norm": 3.0041720867156982, + "learning_rate": 1.5184208699838232e-06, + "loss": 0.3998, + "step": 8012 + }, + { + "epoch": 3.7886524822695034, + "grad_norm": 3.0763001441955566, + "learning_rate": 1.5178471694916635e-06, + "loss": 0.38, + "step": 8013 + }, + { + "epoch": 3.789125295508274, + "grad_norm": 3.0788497924804688, + "learning_rate": 1.5172735301538544e-06, + "loss": 0.3986, + "step": 8014 + }, + { + "epoch": 3.789598108747045, + "grad_norm": 2.830225944519043, + "learning_rate": 1.5166999520061127e-06, + "loss": 0.3977, + "step": 8015 + }, + { + "epoch": 3.7900709219858157, + "grad_norm": 3.196078062057495, + "learning_rate": 1.5161264350841543e-06, + "loss": 0.4058, + "step": 8016 + }, + { + "epoch": 3.7905437352245865, + "grad_norm": 9.898200988769531, + "learning_rate": 1.5155529794236884e-06, + "loss": 0.3451, + "step": 8017 + }, + { + "epoch": 3.791016548463357, + "grad_norm": 3.0028066635131836, + "learning_rate": 1.514979585060421e-06, + "loss": 0.4029, + "step": 8018 + }, + { + "epoch": 3.7914893617021277, + "grad_norm": 2.984926223754883, + "learning_rate": 1.5144062520300562e-06, + "loss": 0.3995, + "step": 8019 + }, + { + "epoch": 3.7919621749408985, + "grad_norm": 2.938596487045288, + "learning_rate": 1.5138329803682925e-06, + "loss": 0.386, + "step": 8020 + }, + { + "epoch": 3.792434988179669, + "grad_norm": 2.992565393447876, + "learning_rate": 1.513259770110825e-06, + "loss": 0.3919, + "step": 8021 + }, + { + "epoch": 3.7929078014184396, + "grad_norm": 3.0182361602783203, + "learning_rate": 1.5126866212933453e-06, + "loss": 0.3506, + "step": 8022 + }, + { + "epoch": 3.7933806146572104, + "grad_norm": 3.2039108276367188, + "learning_rate": 1.5121135339515392e-06, + "loss": 0.3807, + "step": 8023 + }, + { + "epoch": 3.7938534278959812, + "grad_norm": 2.9290878772735596, + "learning_rate": 1.5115405081210927e-06, + "loss": 0.3596, + "step": 8024 + }, + { + "epoch": 3.794326241134752, + "grad_norm": 3.106152057647705, + "learning_rate": 1.510967543837683e-06, + "loss": 0.3703, + "step": 8025 + }, + { + "epoch": 3.7947990543735224, + "grad_norm": 2.9752190113067627, + "learning_rate": 1.510394641136989e-06, + "loss": 0.4049, + "step": 8026 + }, + { + "epoch": 3.795271867612293, + "grad_norm": 2.996206283569336, + "learning_rate": 1.5098218000546815e-06, + "loss": 0.4286, + "step": 8027 + }, + { + "epoch": 3.795744680851064, + "grad_norm": 2.9403493404388428, + "learning_rate": 1.5092490206264281e-06, + "loss": 0.3628, + "step": 8028 + }, + { + "epoch": 3.7962174940898343, + "grad_norm": 2.8101110458374023, + "learning_rate": 1.5086763028878943e-06, + "loss": 0.4016, + "step": 8029 + }, + { + "epoch": 3.796690307328605, + "grad_norm": 3.162264108657837, + "learning_rate": 
1.5081036468747401e-06, + "loss": 0.4133, + "step": 8030 + }, + { + "epoch": 3.797163120567376, + "grad_norm": 2.6871988773345947, + "learning_rate": 1.5075310526226223e-06, + "loss": 0.3748, + "step": 8031 + }, + { + "epoch": 3.7976359338061467, + "grad_norm": 2.997924327850342, + "learning_rate": 1.5069585201671944e-06, + "loss": 0.4083, + "step": 8032 + }, + { + "epoch": 3.7981087470449175, + "grad_norm": 2.8266279697418213, + "learning_rate": 1.506386049544104e-06, + "loss": 0.4488, + "step": 8033 + }, + { + "epoch": 3.798581560283688, + "grad_norm": 2.7106378078460693, + "learning_rate": 1.5058136407889985e-06, + "loss": 0.363, + "step": 8034 + }, + { + "epoch": 3.7990543735224587, + "grad_norm": 2.8983304500579834, + "learning_rate": 1.5052412939375183e-06, + "loss": 0.4156, + "step": 8035 + }, + { + "epoch": 3.7995271867612295, + "grad_norm": 3.0333914756774902, + "learning_rate": 1.5046690090253001e-06, + "loss": 0.3694, + "step": 8036 + }, + { + "epoch": 3.8, + "grad_norm": 2.872662305831909, + "learning_rate": 1.5040967860879785e-06, + "loss": 0.3492, + "step": 8037 + }, + { + "epoch": 3.8004728132387706, + "grad_norm": 2.7279646396636963, + "learning_rate": 1.5035246251611835e-06, + "loss": 0.327, + "step": 8038 + }, + { + "epoch": 3.8009456264775414, + "grad_norm": 2.969326972961426, + "learning_rate": 1.5029525262805405e-06, + "loss": 0.3977, + "step": 8039 + }, + { + "epoch": 3.801418439716312, + "grad_norm": 3.073899745941162, + "learning_rate": 1.5023804894816723e-06, + "loss": 0.388, + "step": 8040 + }, + { + "epoch": 3.801891252955083, + "grad_norm": 3.026284694671631, + "learning_rate": 1.5018085148001953e-06, + "loss": 0.3761, + "step": 8041 + }, + { + "epoch": 3.8023640661938534, + "grad_norm": 3.0478618144989014, + "learning_rate": 1.5012366022717262e-06, + "loss": 0.4415, + "step": 8042 + }, + { + "epoch": 3.802836879432624, + "grad_norm": 2.801584005355835, + "learning_rate": 1.500664751931874e-06, + "loss": 0.4079, + "step": 8043 + }, + { + "epoch": 3.803309692671395, + "grad_norm": 3.4839112758636475, + "learning_rate": 1.5000929638162459e-06, + "loss": 0.4391, + "step": 8044 + }, + { + "epoch": 3.8037825059101653, + "grad_norm": 2.6945605278015137, + "learning_rate": 1.4995212379604446e-06, + "loss": 0.3564, + "step": 8045 + }, + { + "epoch": 3.804255319148936, + "grad_norm": 3.0870234966278076, + "learning_rate": 1.4989495744000687e-06, + "loss": 0.3801, + "step": 8046 + }, + { + "epoch": 3.804728132387707, + "grad_norm": 2.975332021713257, + "learning_rate": 1.4983779731707135e-06, + "loss": 0.3408, + "step": 8047 + }, + { + "epoch": 3.8052009456264777, + "grad_norm": 2.9920027256011963, + "learning_rate": 1.497806434307969e-06, + "loss": 0.3875, + "step": 8048 + }, + { + "epoch": 3.8056737588652485, + "grad_norm": 3.1974916458129883, + "learning_rate": 1.4972349578474244e-06, + "loss": 0.4492, + "step": 8049 + }, + { + "epoch": 3.806146572104019, + "grad_norm": 2.839503526687622, + "learning_rate": 1.4966635438246622e-06, + "loss": 0.3785, + "step": 8050 + }, + { + "epoch": 3.8066193853427897, + "grad_norm": 3.274502992630005, + "learning_rate": 1.4960921922752603e-06, + "loss": 0.4404, + "step": 8051 + }, + { + "epoch": 3.8070921985815604, + "grad_norm": 3.0852737426757812, + "learning_rate": 1.4955209032347967e-06, + "loss": 0.4047, + "step": 8052 + }, + { + "epoch": 3.807565011820331, + "grad_norm": 2.9251608848571777, + "learning_rate": 1.4949496767388417e-06, + "loss": 0.3654, + "step": 8053 + }, + { + "epoch": 3.8080378250591016, + "grad_norm": 
2.518220901489258, + "learning_rate": 1.4943785128229635e-06, + "loss": 0.3157, + "step": 8054 + }, + { + "epoch": 3.8085106382978724, + "grad_norm": 3.3993279933929443, + "learning_rate": 1.4938074115227257e-06, + "loss": 0.4204, + "step": 8055 + }, + { + "epoch": 3.808983451536643, + "grad_norm": 3.2847096920013428, + "learning_rate": 1.4932363728736876e-06, + "loss": 0.339, + "step": 8056 + }, + { + "epoch": 3.8094562647754135, + "grad_norm": 2.7779417037963867, + "learning_rate": 1.492665396911407e-06, + "loss": 0.3538, + "step": 8057 + }, + { + "epoch": 3.8099290780141843, + "grad_norm": 2.958131790161133, + "learning_rate": 1.4920944836714353e-06, + "loss": 0.363, + "step": 8058 + }, + { + "epoch": 3.810401891252955, + "grad_norm": 3.1873440742492676, + "learning_rate": 1.491523633189319e-06, + "loss": 0.3785, + "step": 8059 + }, + { + "epoch": 3.8108747044917255, + "grad_norm": 3.132652759552002, + "learning_rate": 1.4909528455006055e-06, + "loss": 0.375, + "step": 8060 + }, + { + "epoch": 3.8113475177304963, + "grad_norm": 2.8598761558532715, + "learning_rate": 1.490382120640833e-06, + "loss": 0.4152, + "step": 8061 + }, + { + "epoch": 3.811820330969267, + "grad_norm": 3.115870952606201, + "learning_rate": 1.4898114586455399e-06, + "loss": 0.4609, + "step": 8062 + }, + { + "epoch": 3.812293144208038, + "grad_norm": 3.347944974899292, + "learning_rate": 1.4892408595502571e-06, + "loss": 0.3836, + "step": 8063 + }, + { + "epoch": 3.8127659574468087, + "grad_norm": 3.1747031211853027, + "learning_rate": 1.4886703233905132e-06, + "loss": 0.374, + "step": 8064 + }, + { + "epoch": 3.813238770685579, + "grad_norm": 2.945139169692993, + "learning_rate": 1.4880998502018345e-06, + "loss": 0.3652, + "step": 8065 + }, + { + "epoch": 3.81371158392435, + "grad_norm": 2.8911492824554443, + "learning_rate": 1.4875294400197403e-06, + "loss": 0.3683, + "step": 8066 + }, + { + "epoch": 3.8141843971631206, + "grad_norm": 3.080268383026123, + "learning_rate": 1.4869590928797491e-06, + "loss": 0.3919, + "step": 8067 + }, + { + "epoch": 3.814657210401891, + "grad_norm": 3.0834288597106934, + "learning_rate": 1.4863888088173734e-06, + "loss": 0.3988, + "step": 8068 + }, + { + "epoch": 3.815130023640662, + "grad_norm": 2.765702724456787, + "learning_rate": 1.4858185878681213e-06, + "loss": 0.3659, + "step": 8069 + }, + { + "epoch": 3.8156028368794326, + "grad_norm": 3.074059247970581, + "learning_rate": 1.4852484300674993e-06, + "loss": 0.3888, + "step": 8070 + }, + { + "epoch": 3.8160756501182034, + "grad_norm": 3.0009944438934326, + "learning_rate": 1.484678335451007e-06, + "loss": 0.417, + "step": 8071 + }, + { + "epoch": 3.816548463356974, + "grad_norm": 2.6661112308502197, + "learning_rate": 1.4841083040541438e-06, + "loss": 0.3544, + "step": 8072 + }, + { + "epoch": 3.8170212765957445, + "grad_norm": 2.7849514484405518, + "learning_rate": 1.4835383359124018e-06, + "loss": 0.3691, + "step": 8073 + }, + { + "epoch": 3.8174940898345153, + "grad_norm": 3.008070707321167, + "learning_rate": 1.4829684310612697e-06, + "loss": 0.4228, + "step": 8074 + }, + { + "epoch": 3.817966903073286, + "grad_norm": 2.649296998977661, + "learning_rate": 1.4823985895362348e-06, + "loss": 0.3642, + "step": 8075 + }, + { + "epoch": 3.8184397163120565, + "grad_norm": 2.6017661094665527, + "learning_rate": 1.4818288113727768e-06, + "loss": 0.3537, + "step": 8076 + }, + { + "epoch": 3.8189125295508273, + "grad_norm": 2.9071972370147705, + "learning_rate": 1.481259096606375e-06, + "loss": 0.3096, + "step": 8077 + }, + { + 
"epoch": 3.819385342789598, + "grad_norm": 3.0866518020629883, + "learning_rate": 1.4806894452725024e-06, + "loss": 0.4148, + "step": 8078 + }, + { + "epoch": 3.819858156028369, + "grad_norm": 3.2099499702453613, + "learning_rate": 1.4801198574066272e-06, + "loss": 0.4058, + "step": 8079 + }, + { + "epoch": 3.8203309692671397, + "grad_norm": 3.0204920768737793, + "learning_rate": 1.4795503330442176e-06, + "loss": 0.3427, + "step": 8080 + }, + { + "epoch": 3.82080378250591, + "grad_norm": 2.88667368888855, + "learning_rate": 1.478980872220734e-06, + "loss": 0.4075, + "step": 8081 + }, + { + "epoch": 3.821276595744681, + "grad_norm": 2.926673173904419, + "learning_rate": 1.4784114749716338e-06, + "loss": 0.3449, + "step": 8082 + }, + { + "epoch": 3.8217494089834516, + "grad_norm": 2.818936347961426, + "learning_rate": 1.4778421413323723e-06, + "loss": 0.3628, + "step": 8083 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 2.960322380065918, + "learning_rate": 1.4772728713383983e-06, + "loss": 0.3669, + "step": 8084 + }, + { + "epoch": 3.8226950354609928, + "grad_norm": 2.940131902694702, + "learning_rate": 1.4767036650251584e-06, + "loss": 0.4357, + "step": 8085 + }, + { + "epoch": 3.8231678486997636, + "grad_norm": 2.9251785278320312, + "learning_rate": 1.4761345224280943e-06, + "loss": 0.4046, + "step": 8086 + }, + { + "epoch": 3.8236406619385344, + "grad_norm": 3.115590810775757, + "learning_rate": 1.475565443582643e-06, + "loss": 0.3712, + "step": 8087 + }, + { + "epoch": 3.824113475177305, + "grad_norm": 2.5968618392944336, + "learning_rate": 1.4749964285242408e-06, + "loss": 0.3432, + "step": 8088 + }, + { + "epoch": 3.8245862884160755, + "grad_norm": 3.195409059524536, + "learning_rate": 1.4744274772883148e-06, + "loss": 0.3717, + "step": 8089 + }, + { + "epoch": 3.8250591016548463, + "grad_norm": 2.8658018112182617, + "learning_rate": 1.4738585899102942e-06, + "loss": 0.3807, + "step": 8090 + }, + { + "epoch": 3.825531914893617, + "grad_norm": 2.9005510807037354, + "learning_rate": 1.4732897664255998e-06, + "loss": 0.3988, + "step": 8091 + }, + { + "epoch": 3.8260047281323875, + "grad_norm": 3.9155731201171875, + "learning_rate": 1.472721006869649e-06, + "loss": 0.3981, + "step": 8092 + }, + { + "epoch": 3.8264775413711583, + "grad_norm": 2.89312744140625, + "learning_rate": 1.4721523112778575e-06, + "loss": 0.3286, + "step": 8093 + }, + { + "epoch": 3.826950354609929, + "grad_norm": 3.006071090698242, + "learning_rate": 1.4715836796856332e-06, + "loss": 0.3901, + "step": 8094 + }, + { + "epoch": 3.8274231678487, + "grad_norm": 3.083411693572998, + "learning_rate": 1.4710151121283845e-06, + "loss": 0.3741, + "step": 8095 + }, + { + "epoch": 3.8278959810874706, + "grad_norm": 2.864989995956421, + "learning_rate": 1.4704466086415131e-06, + "loss": 0.3887, + "step": 8096 + }, + { + "epoch": 3.828368794326241, + "grad_norm": 2.4846417903900146, + "learning_rate": 1.4698781692604158e-06, + "loss": 0.33, + "step": 8097 + }, + { + "epoch": 3.828841607565012, + "grad_norm": 3.2497007846832275, + "learning_rate": 1.4693097940204893e-06, + "loss": 0.4011, + "step": 8098 + }, + { + "epoch": 3.8293144208037826, + "grad_norm": 3.0079777240753174, + "learning_rate": 1.4687414829571218e-06, + "loss": 0.4263, + "step": 8099 + }, + { + "epoch": 3.829787234042553, + "grad_norm": 2.8538410663604736, + "learning_rate": 1.4681732361057005e-06, + "loss": 0.3651, + "step": 8100 + }, + { + "epoch": 3.8302600472813237, + "grad_norm": 3.238163948059082, + "learning_rate": 1.4676050535016076e-06, + 
"loss": 0.392, + "step": 8101 + }, + { + "epoch": 3.8307328605200945, + "grad_norm": 2.9991304874420166, + "learning_rate": 1.46703693518022e-06, + "loss": 0.3643, + "step": 8102 + }, + { + "epoch": 3.8312056737588653, + "grad_norm": 2.9816839694976807, + "learning_rate": 1.466468881176914e-06, + "loss": 0.3803, + "step": 8103 + }, + { + "epoch": 3.831678486997636, + "grad_norm": 3.2009265422821045, + "learning_rate": 1.465900891527059e-06, + "loss": 0.3828, + "step": 8104 + }, + { + "epoch": 3.8321513002364065, + "grad_norm": 2.9479124546051025, + "learning_rate": 1.4653329662660201e-06, + "loss": 0.3683, + "step": 8105 + }, + { + "epoch": 3.8326241134751773, + "grad_norm": 2.938507080078125, + "learning_rate": 1.4647651054291614e-06, + "loss": 0.3703, + "step": 8106 + }, + { + "epoch": 3.833096926713948, + "grad_norm": 2.7777645587921143, + "learning_rate": 1.4641973090518397e-06, + "loss": 0.3982, + "step": 8107 + }, + { + "epoch": 3.8335697399527184, + "grad_norm": 3.2470149993896484, + "learning_rate": 1.4636295771694099e-06, + "loss": 0.3748, + "step": 8108 + }, + { + "epoch": 3.8340425531914892, + "grad_norm": 2.869310140609741, + "learning_rate": 1.4630619098172223e-06, + "loss": 0.3577, + "step": 8109 + }, + { + "epoch": 3.83451536643026, + "grad_norm": 3.1245369911193848, + "learning_rate": 1.4624943070306225e-06, + "loss": 0.4518, + "step": 8110 + }, + { + "epoch": 3.834988179669031, + "grad_norm": 3.0390701293945312, + "learning_rate": 1.4619267688449529e-06, + "loss": 0.5051, + "step": 8111 + }, + { + "epoch": 3.8354609929078016, + "grad_norm": 2.929943799972534, + "learning_rate": 1.4613592952955507e-06, + "loss": 0.4207, + "step": 8112 + }, + { + "epoch": 3.835933806146572, + "grad_norm": 3.17008376121521, + "learning_rate": 1.4607918864177523e-06, + "loss": 0.3836, + "step": 8113 + }, + { + "epoch": 3.8364066193853428, + "grad_norm": 3.0689237117767334, + "learning_rate": 1.460224542246886e-06, + "loss": 0.3413, + "step": 8114 + }, + { + "epoch": 3.8368794326241136, + "grad_norm": 2.9966423511505127, + "learning_rate": 1.4596572628182774e-06, + "loss": 0.4367, + "step": 8115 + }, + { + "epoch": 3.837352245862884, + "grad_norm": 3.0572052001953125, + "learning_rate": 1.45909004816725e-06, + "loss": 0.4089, + "step": 8116 + }, + { + "epoch": 3.8378250591016547, + "grad_norm": 2.911263942718506, + "learning_rate": 1.4585228983291203e-06, + "loss": 0.3848, + "step": 8117 + }, + { + "epoch": 3.8382978723404255, + "grad_norm": 2.9233853816986084, + "learning_rate": 1.4579558133392038e-06, + "loss": 0.4012, + "step": 8118 + }, + { + "epoch": 3.8387706855791963, + "grad_norm": 2.7813868522644043, + "learning_rate": 1.4573887932328097e-06, + "loss": 0.3898, + "step": 8119 + }, + { + "epoch": 3.839243498817967, + "grad_norm": 2.8727006912231445, + "learning_rate": 1.4568218380452436e-06, + "loss": 0.3965, + "step": 8120 + }, + { + "epoch": 3.8397163120567375, + "grad_norm": 3.0381174087524414, + "learning_rate": 1.4562549478118077e-06, + "loss": 0.4304, + "step": 8121 + }, + { + "epoch": 3.8401891252955083, + "grad_norm": 2.7406346797943115, + "learning_rate": 1.4556881225677982e-06, + "loss": 0.3636, + "step": 8122 + }, + { + "epoch": 3.840661938534279, + "grad_norm": 3.3900108337402344, + "learning_rate": 1.4551213623485111e-06, + "loss": 0.3863, + "step": 8123 + }, + { + "epoch": 3.8411347517730494, + "grad_norm": 2.885150909423828, + "learning_rate": 1.4545546671892354e-06, + "loss": 0.3679, + "step": 8124 + }, + { + "epoch": 3.84160756501182, + "grad_norm": 3.3361690044403076, + 
"learning_rate": 1.4539880371252555e-06, + "loss": 0.4333, + "step": 8125 + }, + { + "epoch": 3.842080378250591, + "grad_norm": 3.1547763347625732, + "learning_rate": 1.4534214721918545e-06, + "loss": 0.4477, + "step": 8126 + }, + { + "epoch": 3.842553191489362, + "grad_norm": 3.0337510108947754, + "learning_rate": 1.4528549724243095e-06, + "loss": 0.3647, + "step": 8127 + }, + { + "epoch": 3.8430260047281326, + "grad_norm": 2.8390069007873535, + "learning_rate": 1.452288537857893e-06, + "loss": 0.3698, + "step": 8128 + }, + { + "epoch": 3.843498817966903, + "grad_norm": 2.857513427734375, + "learning_rate": 1.451722168527876e-06, + "loss": 0.3842, + "step": 8129 + }, + { + "epoch": 3.8439716312056738, + "grad_norm": 3.015320062637329, + "learning_rate": 1.451155864469522e-06, + "loss": 0.4058, + "step": 8130 + }, + { + "epoch": 3.8444444444444446, + "grad_norm": 2.923957347869873, + "learning_rate": 1.450589625718094e-06, + "loss": 0.3976, + "step": 8131 + }, + { + "epoch": 3.844917257683215, + "grad_norm": 3.332338571548462, + "learning_rate": 1.4500234523088492e-06, + "loss": 0.4118, + "step": 8132 + }, + { + "epoch": 3.8453900709219857, + "grad_norm": 3.0403711795806885, + "learning_rate": 1.4494573442770381e-06, + "loss": 0.3715, + "step": 8133 + }, + { + "epoch": 3.8458628841607565, + "grad_norm": 3.2310287952423096, + "learning_rate": 1.4488913016579135e-06, + "loss": 0.4587, + "step": 8134 + }, + { + "epoch": 3.8463356973995273, + "grad_norm": 3.091282844543457, + "learning_rate": 1.448325324486718e-06, + "loss": 0.4234, + "step": 8135 + }, + { + "epoch": 3.846808510638298, + "grad_norm": 3.11161208152771, + "learning_rate": 1.4477594127986933e-06, + "loss": 0.4176, + "step": 8136 + }, + { + "epoch": 3.8472813238770684, + "grad_norm": 3.21042537689209, + "learning_rate": 1.4471935666290751e-06, + "loss": 0.4326, + "step": 8137 + }, + { + "epoch": 3.8477541371158392, + "grad_norm": 3.411543846130371, + "learning_rate": 1.4466277860130981e-06, + "loss": 0.4525, + "step": 8138 + }, + { + "epoch": 3.84822695035461, + "grad_norm": 3.0475308895111084, + "learning_rate": 1.4460620709859898e-06, + "loss": 0.3906, + "step": 8139 + }, + { + "epoch": 3.8486997635933804, + "grad_norm": 2.989367723464966, + "learning_rate": 1.4454964215829742e-06, + "loss": 0.3732, + "step": 8140 + }, + { + "epoch": 3.849172576832151, + "grad_norm": 2.8130393028259277, + "learning_rate": 1.4449308378392734e-06, + "loss": 0.3733, + "step": 8141 + }, + { + "epoch": 3.849645390070922, + "grad_norm": 12.2243013381958, + "learning_rate": 1.444365319790103e-06, + "loss": 0.3506, + "step": 8142 + }, + { + "epoch": 3.850118203309693, + "grad_norm": 3.075556516647339, + "learning_rate": 1.4437998674706743e-06, + "loss": 0.376, + "step": 8143 + }, + { + "epoch": 3.8505910165484636, + "grad_norm": 2.765650510787964, + "learning_rate": 1.4432344809161974e-06, + "loss": 0.3865, + "step": 8144 + }, + { + "epoch": 3.851063829787234, + "grad_norm": 3.171588897705078, + "learning_rate": 1.4426691601618747e-06, + "loss": 0.4391, + "step": 8145 + }, + { + "epoch": 3.8515366430260047, + "grad_norm": 2.8378992080688477, + "learning_rate": 1.4421039052429083e-06, + "loss": 0.3984, + "step": 8146 + }, + { + "epoch": 3.8520094562647755, + "grad_norm": 2.6588387489318848, + "learning_rate": 1.4415387161944929e-06, + "loss": 0.3961, + "step": 8147 + }, + { + "epoch": 3.852482269503546, + "grad_norm": 2.919325351715088, + "learning_rate": 1.4409735930518197e-06, + "loss": 0.4058, + "step": 8148 + }, + { + "epoch": 3.8529550827423167, + 
"grad_norm": 3.2239115238189697, + "learning_rate": 1.4404085358500778e-06, + "loss": 0.4018, + "step": 8149 + }, + { + "epoch": 3.8534278959810875, + "grad_norm": 3.2509875297546387, + "learning_rate": 1.4398435446244502e-06, + "loss": 0.4078, + "step": 8150 + }, + { + "epoch": 3.8539007092198583, + "grad_norm": 3.124782085418701, + "learning_rate": 1.4392786194101155e-06, + "loss": 0.4459, + "step": 8151 + }, + { + "epoch": 3.854373522458629, + "grad_norm": 2.924095392227173, + "learning_rate": 1.4387137602422512e-06, + "loss": 0.3686, + "step": 8152 + }, + { + "epoch": 3.8548463356973994, + "grad_norm": 2.9307191371917725, + "learning_rate": 1.4381489671560272e-06, + "loss": 0.4345, + "step": 8153 + }, + { + "epoch": 3.8553191489361702, + "grad_norm": 2.868488073348999, + "learning_rate": 1.4375842401866113e-06, + "loss": 0.366, + "step": 8154 + }, + { + "epoch": 3.855791962174941, + "grad_norm": 2.9893085956573486, + "learning_rate": 1.4370195793691661e-06, + "loss": 0.3401, + "step": 8155 + }, + { + "epoch": 3.8562647754137114, + "grad_norm": 3.0113472938537598, + "learning_rate": 1.4364549847388492e-06, + "loss": 0.4051, + "step": 8156 + }, + { + "epoch": 3.856737588652482, + "grad_norm": 3.4693121910095215, + "learning_rate": 1.4358904563308184e-06, + "loss": 0.4505, + "step": 8157 + }, + { + "epoch": 3.857210401891253, + "grad_norm": 2.9048118591308594, + "learning_rate": 1.4353259941802216e-06, + "loss": 0.3973, + "step": 8158 + }, + { + "epoch": 3.8576832151300238, + "grad_norm": 3.264910936355591, + "learning_rate": 1.434761598322208e-06, + "loss": 0.4317, + "step": 8159 + }, + { + "epoch": 3.8581560283687946, + "grad_norm": 2.973742723464966, + "learning_rate": 1.4341972687919186e-06, + "loss": 0.3896, + "step": 8160 + }, + { + "epoch": 3.858628841607565, + "grad_norm": 2.7802605628967285, + "learning_rate": 1.4336330056244906e-06, + "loss": 0.4063, + "step": 8161 + }, + { + "epoch": 3.8591016548463357, + "grad_norm": 3.1401731967926025, + "learning_rate": 1.433068808855061e-06, + "loss": 0.4068, + "step": 8162 + }, + { + "epoch": 3.8595744680851065, + "grad_norm": 3.132723331451416, + "learning_rate": 1.432504678518757e-06, + "loss": 0.4724, + "step": 8163 + }, + { + "epoch": 3.860047281323877, + "grad_norm": 2.94944167137146, + "learning_rate": 1.4319406146507068e-06, + "loss": 0.3666, + "step": 8164 + }, + { + "epoch": 3.8605200945626477, + "grad_norm": 2.972322463989258, + "learning_rate": 1.4313766172860311e-06, + "loss": 0.4226, + "step": 8165 + }, + { + "epoch": 3.8609929078014185, + "grad_norm": 2.9808123111724854, + "learning_rate": 1.430812686459847e-06, + "loss": 0.4079, + "step": 8166 + }, + { + "epoch": 3.8614657210401893, + "grad_norm": 2.9656291007995605, + "learning_rate": 1.4302488222072698e-06, + "loss": 0.3423, + "step": 8167 + }, + { + "epoch": 3.86193853427896, + "grad_norm": 2.886765241622925, + "learning_rate": 1.4296850245634073e-06, + "loss": 0.3577, + "step": 8168 + }, + { + "epoch": 3.8624113475177304, + "grad_norm": 3.0613043308258057, + "learning_rate": 1.4291212935633653e-06, + "loss": 0.4121, + "step": 8169 + }, + { + "epoch": 3.862884160756501, + "grad_norm": 2.842050313949585, + "learning_rate": 1.4285576292422445e-06, + "loss": 0.373, + "step": 8170 + }, + { + "epoch": 3.863356973995272, + "grad_norm": 3.0604517459869385, + "learning_rate": 1.4279940316351413e-06, + "loss": 0.3938, + "step": 8171 + }, + { + "epoch": 3.8638297872340424, + "grad_norm": 3.9742302894592285, + "learning_rate": 1.42743050077715e-06, + "loss": 0.4463, + "step": 8172 + 
}, + { + "epoch": 3.864302600472813, + "grad_norm": 2.8330607414245605, + "learning_rate": 1.4268670367033572e-06, + "loss": 0.4423, + "step": 8173 + }, + { + "epoch": 3.864775413711584, + "grad_norm": 2.953256607055664, + "learning_rate": 1.4263036394488497e-06, + "loss": 0.3553, + "step": 8174 + }, + { + "epoch": 3.8652482269503547, + "grad_norm": 2.865849018096924, + "learning_rate": 1.4257403090487065e-06, + "loss": 0.3348, + "step": 8175 + }, + { + "epoch": 3.8657210401891255, + "grad_norm": 2.712502956390381, + "learning_rate": 1.4251770455380027e-06, + "loss": 0.3896, + "step": 8176 + }, + { + "epoch": 3.866193853427896, + "grad_norm": 2.798898220062256, + "learning_rate": 1.4246138489518123e-06, + "loss": 0.4275, + "step": 8177 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 2.830899953842163, + "learning_rate": 1.4240507193252023e-06, + "loss": 0.3952, + "step": 8178 + }, + { + "epoch": 3.8671394799054375, + "grad_norm": 2.5789451599121094, + "learning_rate": 1.4234876566932348e-06, + "loss": 0.3483, + "step": 8179 + }, + { + "epoch": 3.867612293144208, + "grad_norm": 2.8513095378875732, + "learning_rate": 1.422924661090972e-06, + "loss": 0.3403, + "step": 8180 + }, + { + "epoch": 3.8680851063829786, + "grad_norm": 3.5031449794769287, + "learning_rate": 1.4223617325534664e-06, + "loss": 0.3964, + "step": 8181 + }, + { + "epoch": 3.8685579196217494, + "grad_norm": 2.7495479583740234, + "learning_rate": 1.4217988711157715e-06, + "loss": 0.3376, + "step": 8182 + }, + { + "epoch": 3.8690307328605202, + "grad_norm": 2.8609421253204346, + "learning_rate": 1.421236076812933e-06, + "loss": 0.3967, + "step": 8183 + }, + { + "epoch": 3.869503546099291, + "grad_norm": 3.0624637603759766, + "learning_rate": 1.420673349679994e-06, + "loss": 0.3764, + "step": 8184 + }, + { + "epoch": 3.8699763593380614, + "grad_norm": 3.3084404468536377, + "learning_rate": 1.4201106897519926e-06, + "loss": 0.4567, + "step": 8185 + }, + { + "epoch": 3.870449172576832, + "grad_norm": 3.164116382598877, + "learning_rate": 1.4195480970639624e-06, + "loss": 0.4217, + "step": 8186 + }, + { + "epoch": 3.870921985815603, + "grad_norm": 2.971390724182129, + "learning_rate": 1.4189855716509355e-06, + "loss": 0.3981, + "step": 8187 + }, + { + "epoch": 3.8713947990543733, + "grad_norm": 3.0537233352661133, + "learning_rate": 1.418423113547937e-06, + "loss": 0.4093, + "step": 8188 + }, + { + "epoch": 3.871867612293144, + "grad_norm": 3.698120594024658, + "learning_rate": 1.4178607227899877e-06, + "loss": 0.3158, + "step": 8189 + }, + { + "epoch": 3.872340425531915, + "grad_norm": 3.0320451259613037, + "learning_rate": 1.417298399412107e-06, + "loss": 0.3903, + "step": 8190 + }, + { + "epoch": 3.8728132387706857, + "grad_norm": 2.913296699523926, + "learning_rate": 1.4167361434493068e-06, + "loss": 0.3396, + "step": 8191 + }, + { + "epoch": 3.8732860520094565, + "grad_norm": 3.011906147003174, + "learning_rate": 1.4161739549365976e-06, + "loss": 0.3915, + "step": 8192 + }, + { + "epoch": 3.873758865248227, + "grad_norm": 3.2707724571228027, + "learning_rate": 1.4156118339089842e-06, + "loss": 0.4466, + "step": 8193 + }, + { + "epoch": 3.8742316784869977, + "grad_norm": 3.036747694015503, + "learning_rate": 1.4150497804014656e-06, + "loss": 0.4095, + "step": 8194 + }, + { + "epoch": 3.8747044917257685, + "grad_norm": 2.8851394653320312, + "learning_rate": 1.4144877944490411e-06, + "loss": 0.4235, + "step": 8195 + }, + { + "epoch": 3.875177304964539, + "grad_norm": 3.099785566329956, + "learning_rate": 
1.4139258760867008e-06, + "loss": 0.4102, + "step": 8196 + }, + { + "epoch": 3.8756501182033096, + "grad_norm": 3.0752081871032715, + "learning_rate": 1.4133640253494347e-06, + "loss": 0.4165, + "step": 8197 + }, + { + "epoch": 3.8761229314420804, + "grad_norm": 2.842257261276245, + "learning_rate": 1.412802242272226e-06, + "loss": 0.3573, + "step": 8198 + }, + { + "epoch": 3.876595744680851, + "grad_norm": 2.93868350982666, + "learning_rate": 1.4122405268900547e-06, + "loss": 0.36, + "step": 8199 + }, + { + "epoch": 3.877068557919622, + "grad_norm": 2.674356460571289, + "learning_rate": 1.411678879237896e-06, + "loss": 0.3763, + "step": 8200 + }, + { + "epoch": 3.8775413711583924, + "grad_norm": 2.710617780685425, + "learning_rate": 1.411117299350721e-06, + "loss": 0.358, + "step": 8201 + }, + { + "epoch": 3.878014184397163, + "grad_norm": 3.0299410820007324, + "learning_rate": 1.4105557872634968e-06, + "loss": 0.3723, + "step": 8202 + }, + { + "epoch": 3.878486997635934, + "grad_norm": 3.1951241493225098, + "learning_rate": 1.4099943430111874e-06, + "loss": 0.4163, + "step": 8203 + }, + { + "epoch": 3.8789598108747043, + "grad_norm": 2.752410411834717, + "learning_rate": 1.4094329666287495e-06, + "loss": 0.3753, + "step": 8204 + }, + { + "epoch": 3.879432624113475, + "grad_norm": 3.1242496967315674, + "learning_rate": 1.40887165815114e-06, + "loss": 0.3694, + "step": 8205 + }, + { + "epoch": 3.879905437352246, + "grad_norm": 5.16750431060791, + "learning_rate": 1.4083104176133079e-06, + "loss": 0.3869, + "step": 8206 + }, + { + "epoch": 3.8803782505910167, + "grad_norm": 3.2995245456695557, + "learning_rate": 1.4077492450501978e-06, + "loss": 0.4194, + "step": 8207 + }, + { + "epoch": 3.8808510638297875, + "grad_norm": 3.506807804107666, + "learning_rate": 1.4071881404967541e-06, + "loss": 0.3873, + "step": 8208 + }, + { + "epoch": 3.881323877068558, + "grad_norm": 3.1201252937316895, + "learning_rate": 1.4066271039879123e-06, + "loss": 0.3625, + "step": 8209 + }, + { + "epoch": 3.8817966903073287, + "grad_norm": 2.870683193206787, + "learning_rate": 1.4060661355586073e-06, + "loss": 0.4039, + "step": 8210 + }, + { + "epoch": 3.8822695035460995, + "grad_norm": 3.177701234817505, + "learning_rate": 1.405505235243767e-06, + "loss": 0.3715, + "step": 8211 + }, + { + "epoch": 3.88274231678487, + "grad_norm": 3.0319771766662598, + "learning_rate": 1.4049444030783157e-06, + "loss": 0.3588, + "step": 8212 + }, + { + "epoch": 3.8832151300236406, + "grad_norm": 2.4598889350891113, + "learning_rate": 1.404383639097176e-06, + "loss": 0.2788, + "step": 8213 + }, + { + "epoch": 3.8836879432624114, + "grad_norm": 2.916987419128418, + "learning_rate": 1.4038229433352623e-06, + "loss": 0.4167, + "step": 8214 + }, + { + "epoch": 3.884160756501182, + "grad_norm": 3.005075216293335, + "learning_rate": 1.4032623158274872e-06, + "loss": 0.4251, + "step": 8215 + }, + { + "epoch": 3.8846335697399526, + "grad_norm": 3.1718621253967285, + "learning_rate": 1.4027017566087591e-06, + "loss": 0.383, + "step": 8216 + }, + { + "epoch": 3.8851063829787233, + "grad_norm": 2.954662322998047, + "learning_rate": 1.402141265713981e-06, + "loss": 0.373, + "step": 8217 + }, + { + "epoch": 3.885579196217494, + "grad_norm": 3.408008337020874, + "learning_rate": 1.4015808431780526e-06, + "loss": 0.4216, + "step": 8218 + }, + { + "epoch": 3.8860520094562645, + "grad_norm": 3.1599369049072266, + "learning_rate": 1.4010204890358675e-06, + "loss": 0.4544, + "step": 8219 + }, + { + "epoch": 3.8865248226950353, + "grad_norm": 
2.8919107913970947, + "learning_rate": 1.4004602033223186e-06, + "loss": 0.3785, + "step": 8220 + }, + { + "epoch": 3.886997635933806, + "grad_norm": 3.522581100463867, + "learning_rate": 1.3998999860722918e-06, + "loss": 0.4276, + "step": 8221 + }, + { + "epoch": 3.887470449172577, + "grad_norm": 2.9278945922851562, + "learning_rate": 1.399339837320668e-06, + "loss": 0.409, + "step": 8222 + }, + { + "epoch": 3.8879432624113477, + "grad_norm": 3.032557725906372, + "learning_rate": 1.398779757102327e-06, + "loss": 0.3973, + "step": 8223 + }, + { + "epoch": 3.888416075650118, + "grad_norm": 2.843118667602539, + "learning_rate": 1.3982197454521423e-06, + "loss": 0.3418, + "step": 8224 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 2.8620638847351074, + "learning_rate": 1.3976598024049815e-06, + "loss": 0.3751, + "step": 8225 + }, + { + "epoch": 3.8893617021276596, + "grad_norm": 2.532327175140381, + "learning_rate": 1.3970999279957124e-06, + "loss": 0.3541, + "step": 8226 + }, + { + "epoch": 3.88983451536643, + "grad_norm": 3.1074535846710205, + "learning_rate": 1.3965401222591935e-06, + "loss": 0.4706, + "step": 8227 + }, + { + "epoch": 3.890307328605201, + "grad_norm": 3.1558735370635986, + "learning_rate": 1.3959803852302839e-06, + "loss": 0.448, + "step": 8228 + }, + { + "epoch": 3.8907801418439716, + "grad_norm": 3.0862064361572266, + "learning_rate": 1.3954207169438344e-06, + "loss": 0.3308, + "step": 8229 + }, + { + "epoch": 3.8912529550827424, + "grad_norm": 2.9246280193328857, + "learning_rate": 1.3948611174346927e-06, + "loss": 0.3771, + "step": 8230 + }, + { + "epoch": 3.891725768321513, + "grad_norm": 2.7959492206573486, + "learning_rate": 1.394301586737704e-06, + "loss": 0.4248, + "step": 8231 + }, + { + "epoch": 3.8921985815602835, + "grad_norm": 2.787670373916626, + "learning_rate": 1.3937421248877075e-06, + "loss": 0.3416, + "step": 8232 + }, + { + "epoch": 3.8926713947990543, + "grad_norm": 3.0775792598724365, + "learning_rate": 1.393182731919538e-06, + "loss": 0.4345, + "step": 8233 + }, + { + "epoch": 3.893144208037825, + "grad_norm": 2.6338887214660645, + "learning_rate": 1.3926234078680268e-06, + "loss": 0.3995, + "step": 8234 + }, + { + "epoch": 3.8936170212765955, + "grad_norm": 2.9975900650024414, + "learning_rate": 1.392064152767999e-06, + "loss": 0.3997, + "step": 8235 + }, + { + "epoch": 3.8940898345153663, + "grad_norm": 2.8615779876708984, + "learning_rate": 1.3915049666542791e-06, + "loss": 0.3687, + "step": 8236 + }, + { + "epoch": 3.894562647754137, + "grad_norm": 3.0132436752319336, + "learning_rate": 1.3909458495616835e-06, + "loss": 0.4085, + "step": 8237 + }, + { + "epoch": 3.895035460992908, + "grad_norm": 3.141291379928589, + "learning_rate": 1.3903868015250278e-06, + "loss": 0.3903, + "step": 8238 + }, + { + "epoch": 3.8955082742316787, + "grad_norm": 2.6998603343963623, + "learning_rate": 1.3898278225791204e-06, + "loss": 0.3576, + "step": 8239 + }, + { + "epoch": 3.895981087470449, + "grad_norm": 3.212578535079956, + "learning_rate": 1.3892689127587656e-06, + "loss": 0.4321, + "step": 8240 + }, + { + "epoch": 3.89645390070922, + "grad_norm": 3.15732741355896, + "learning_rate": 1.3887100720987662e-06, + "loss": 0.4247, + "step": 8241 + }, + { + "epoch": 3.8969267139479906, + "grad_norm": 2.6001040935516357, + "learning_rate": 1.3881513006339168e-06, + "loss": 0.3376, + "step": 8242 + }, + { + "epoch": 3.897399527186761, + "grad_norm": 2.766188859939575, + "learning_rate": 1.3875925983990113e-06, + "loss": 0.3771, + "step": 8243 + }, + { + 
"epoch": 3.8978723404255318, + "grad_norm": 2.7471580505371094, + "learning_rate": 1.3870339654288372e-06, + "loss": 0.3311, + "step": 8244 + }, + { + "epoch": 3.8983451536643026, + "grad_norm": 3.577664375305176, + "learning_rate": 1.3864754017581769e-06, + "loss": 0.3725, + "step": 8245 + }, + { + "epoch": 3.8988179669030734, + "grad_norm": 2.8747243881225586, + "learning_rate": 1.3859169074218116e-06, + "loss": 0.3706, + "step": 8246 + }, + { + "epoch": 3.899290780141844, + "grad_norm": 2.5249671936035156, + "learning_rate": 1.3853584824545152e-06, + "loss": 0.3621, + "step": 8247 + }, + { + "epoch": 3.8997635933806145, + "grad_norm": 2.7290890216827393, + "learning_rate": 1.3848001268910589e-06, + "loss": 0.3209, + "step": 8248 + }, + { + "epoch": 3.9002364066193853, + "grad_norm": 3.0917534828186035, + "learning_rate": 1.3842418407662084e-06, + "loss": 0.3904, + "step": 8249 + }, + { + "epoch": 3.900709219858156, + "grad_norm": 3.099494695663452, + "learning_rate": 1.383683624114725e-06, + "loss": 0.3714, + "step": 8250 + }, + { + "epoch": 3.9011820330969265, + "grad_norm": 3.077505588531494, + "learning_rate": 1.3831254769713687e-06, + "loss": 0.4166, + "step": 8251 + }, + { + "epoch": 3.9016548463356973, + "grad_norm": 2.9983766078948975, + "learning_rate": 1.3825673993708915e-06, + "loss": 0.3909, + "step": 8252 + }, + { + "epoch": 3.902127659574468, + "grad_norm": 2.7958667278289795, + "learning_rate": 1.3820093913480415e-06, + "loss": 0.3966, + "step": 8253 + }, + { + "epoch": 3.902600472813239, + "grad_norm": 3.0938336849212646, + "learning_rate": 1.3814514529375656e-06, + "loss": 0.4118, + "step": 8254 + }, + { + "epoch": 3.9030732860520096, + "grad_norm": 3.2711637020111084, + "learning_rate": 1.3808935841742016e-06, + "loss": 0.4021, + "step": 8255 + }, + { + "epoch": 3.90354609929078, + "grad_norm": 3.23563814163208, + "learning_rate": 1.3803357850926885e-06, + "loss": 0.3679, + "step": 8256 + }, + { + "epoch": 3.904018912529551, + "grad_norm": 2.77942156791687, + "learning_rate": 1.3797780557277563e-06, + "loss": 0.3938, + "step": 8257 + }, + { + "epoch": 3.9044917257683216, + "grad_norm": 3.1273257732391357, + "learning_rate": 1.3792203961141313e-06, + "loss": 0.3579, + "step": 8258 + }, + { + "epoch": 3.904964539007092, + "grad_norm": 3.69164776802063, + "learning_rate": 1.378662806286539e-06, + "loss": 0.3712, + "step": 8259 + }, + { + "epoch": 3.9054373522458627, + "grad_norm": 2.8818306922912598, + "learning_rate": 1.3781052862796957e-06, + "loss": 0.3972, + "step": 8260 + }, + { + "epoch": 3.9059101654846335, + "grad_norm": 2.776651382446289, + "learning_rate": 1.377547836128318e-06, + "loss": 0.3605, + "step": 8261 + }, + { + "epoch": 3.9063829787234043, + "grad_norm": 3.1498706340789795, + "learning_rate": 1.376990455867115e-06, + "loss": 0.3995, + "step": 8262 + }, + { + "epoch": 3.906855791962175, + "grad_norm": 2.777390956878662, + "learning_rate": 1.3764331455307916e-06, + "loss": 0.3463, + "step": 8263 + }, + { + "epoch": 3.9073286052009455, + "grad_norm": 2.9953835010528564, + "learning_rate": 1.3758759051540496e-06, + "loss": 0.3881, + "step": 8264 + }, + { + "epoch": 3.9078014184397163, + "grad_norm": 3.737194538116455, + "learning_rate": 1.375318734771585e-06, + "loss": 0.4456, + "step": 8265 + }, + { + "epoch": 3.908274231678487, + "grad_norm": 3.1575849056243896, + "learning_rate": 1.374761634418092e-06, + "loss": 0.3613, + "step": 8266 + }, + { + "epoch": 3.9087470449172574, + "grad_norm": 3.140662908554077, + "learning_rate": 1.374204604128258e-06, + 
"loss": 0.4462, + "step": 8267 + }, + { + "epoch": 3.9092198581560282, + "grad_norm": 3.2106714248657227, + "learning_rate": 1.3736476439367663e-06, + "loss": 0.3801, + "step": 8268 + }, + { + "epoch": 3.909692671394799, + "grad_norm": 2.888345956802368, + "learning_rate": 1.3730907538782976e-06, + "loss": 0.4209, + "step": 8269 + }, + { + "epoch": 3.91016548463357, + "grad_norm": 2.8903355598449707, + "learning_rate": 1.3725339339875252e-06, + "loss": 0.3612, + "step": 8270 + }, + { + "epoch": 3.9106382978723406, + "grad_norm": 3.2661736011505127, + "learning_rate": 1.371977184299122e-06, + "loss": 0.4151, + "step": 8271 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 3.1532459259033203, + "learning_rate": 1.3714205048477535e-06, + "loss": 0.3706, + "step": 8272 + }, + { + "epoch": 3.911583924349882, + "grad_norm": 2.907306432723999, + "learning_rate": 1.3708638956680804e-06, + "loss": 0.4113, + "step": 8273 + }, + { + "epoch": 3.9120567375886526, + "grad_norm": 2.7301599979400635, + "learning_rate": 1.3703073567947622e-06, + "loss": 0.355, + "step": 8274 + }, + { + "epoch": 3.912529550827423, + "grad_norm": 2.595625877380371, + "learning_rate": 1.3697508882624516e-06, + "loss": 0.3733, + "step": 8275 + }, + { + "epoch": 3.9130023640661937, + "grad_norm": 2.784294366836548, + "learning_rate": 1.369194490105796e-06, + "loss": 0.3366, + "step": 8276 + }, + { + "epoch": 3.9134751773049645, + "grad_norm": 3.0179800987243652, + "learning_rate": 1.3686381623594419e-06, + "loss": 0.3922, + "step": 8277 + }, + { + "epoch": 3.9139479905437353, + "grad_norm": 2.6641111373901367, + "learning_rate": 1.3680819050580291e-06, + "loss": 0.3324, + "step": 8278 + }, + { + "epoch": 3.914420803782506, + "grad_norm": 2.917741060256958, + "learning_rate": 1.3675257182361923e-06, + "loss": 0.3784, + "step": 8279 + }, + { + "epoch": 3.9148936170212765, + "grad_norm": 2.959599018096924, + "learning_rate": 1.3669696019285626e-06, + "loss": 0.3846, + "step": 8280 + }, + { + "epoch": 3.9153664302600473, + "grad_norm": 3.078824043273926, + "learning_rate": 1.3664135561697683e-06, + "loss": 0.4357, + "step": 8281 + }, + { + "epoch": 3.915839243498818, + "grad_norm": 3.0174930095672607, + "learning_rate": 1.3658575809944313e-06, + "loss": 0.3643, + "step": 8282 + }, + { + "epoch": 3.9163120567375884, + "grad_norm": 2.6805408000946045, + "learning_rate": 1.365301676437169e-06, + "loss": 0.3193, + "step": 8283 + }, + { + "epoch": 3.916784869976359, + "grad_norm": 2.6996054649353027, + "learning_rate": 1.3647458425325966e-06, + "loss": 0.3378, + "step": 8284 + }, + { + "epoch": 3.91725768321513, + "grad_norm": 2.7950546741485596, + "learning_rate": 1.3641900793153223e-06, + "loss": 0.3864, + "step": 8285 + }, + { + "epoch": 3.917730496453901, + "grad_norm": 2.9658634662628174, + "learning_rate": 1.363634386819951e-06, + "loss": 0.3452, + "step": 8286 + }, + { + "epoch": 3.9182033096926716, + "grad_norm": 3.0684404373168945, + "learning_rate": 1.363078765081084e-06, + "loss": 0.3278, + "step": 8287 + }, + { + "epoch": 3.918676122931442, + "grad_norm": 3.0293614864349365, + "learning_rate": 1.3625232141333164e-06, + "loss": 0.3827, + "step": 8288 + }, + { + "epoch": 3.9191489361702128, + "grad_norm": 2.9969890117645264, + "learning_rate": 1.3619677340112413e-06, + "loss": 0.3412, + "step": 8289 + }, + { + "epoch": 3.9196217494089836, + "grad_norm": 2.991654396057129, + "learning_rate": 1.3614123247494457e-06, + "loss": 0.3683, + "step": 8290 + }, + { + "epoch": 3.920094562647754, + "grad_norm": 3.032158374786377, + 
"learning_rate": 1.360856986382511e-06, + "loss": 0.421, + "step": 8291 + }, + { + "epoch": 3.9205673758865247, + "grad_norm": 3.1413731575012207, + "learning_rate": 1.3603017189450173e-06, + "loss": 0.3818, + "step": 8292 + }, + { + "epoch": 3.9210401891252955, + "grad_norm": 3.295527219772339, + "learning_rate": 1.3597465224715387e-06, + "loss": 0.4828, + "step": 8293 + }, + { + "epoch": 3.9215130023640663, + "grad_norm": 3.116053581237793, + "learning_rate": 1.359191396996643e-06, + "loss": 0.4108, + "step": 8294 + }, + { + "epoch": 3.921985815602837, + "grad_norm": 2.957446336746216, + "learning_rate": 1.3586363425548975e-06, + "loss": 0.3482, + "step": 8295 + }, + { + "epoch": 3.9224586288416075, + "grad_norm": 2.745471715927124, + "learning_rate": 1.3580813591808627e-06, + "loss": 0.4184, + "step": 8296 + }, + { + "epoch": 3.9229314420803783, + "grad_norm": 3.0920722484588623, + "learning_rate": 1.3575264469090943e-06, + "loss": 0.3826, + "step": 8297 + }, + { + "epoch": 3.923404255319149, + "grad_norm": 2.8719749450683594, + "learning_rate": 1.3569716057741444e-06, + "loss": 0.3953, + "step": 8298 + }, + { + "epoch": 3.9238770685579194, + "grad_norm": 3.1278762817382812, + "learning_rate": 1.3564168358105597e-06, + "loss": 0.3658, + "step": 8299 + }, + { + "epoch": 3.92434988179669, + "grad_norm": 2.7752785682678223, + "learning_rate": 1.3558621370528851e-06, + "loss": 0.3447, + "step": 8300 + }, + { + "epoch": 3.924822695035461, + "grad_norm": 2.948575735092163, + "learning_rate": 1.3553075095356575e-06, + "loss": 0.3803, + "step": 8301 + }, + { + "epoch": 3.925295508274232, + "grad_norm": 2.8164193630218506, + "learning_rate": 1.354752953293413e-06, + "loss": 0.3724, + "step": 8302 + }, + { + "epoch": 3.9257683215130026, + "grad_norm": 3.2431271076202393, + "learning_rate": 1.3541984683606798e-06, + "loss": 0.382, + "step": 8303 + }, + { + "epoch": 3.926241134751773, + "grad_norm": 2.8485286235809326, + "learning_rate": 1.353644054771983e-06, + "loss": 0.3632, + "step": 8304 + }, + { + "epoch": 3.9267139479905437, + "grad_norm": 3.334914445877075, + "learning_rate": 1.3530897125618456e-06, + "loss": 0.5286, + "step": 8305 + }, + { + "epoch": 3.9271867612293145, + "grad_norm": 3.3895132541656494, + "learning_rate": 1.3525354417647815e-06, + "loss": 0.3838, + "step": 8306 + }, + { + "epoch": 3.927659574468085, + "grad_norm": 3.141935110092163, + "learning_rate": 1.351981242415305e-06, + "loss": 0.3928, + "step": 8307 + }, + { + "epoch": 3.9281323877068557, + "grad_norm": 3.3013596534729004, + "learning_rate": 1.3514271145479225e-06, + "loss": 0.4046, + "step": 8308 + }, + { + "epoch": 3.9286052009456265, + "grad_norm": 2.8704745769500732, + "learning_rate": 1.3508730581971363e-06, + "loss": 0.3542, + "step": 8309 + }, + { + "epoch": 3.9290780141843973, + "grad_norm": 3.179405689239502, + "learning_rate": 1.3503190733974472e-06, + "loss": 0.3911, + "step": 8310 + }, + { + "epoch": 3.929550827423168, + "grad_norm": 3.1091885566711426, + "learning_rate": 1.3497651601833481e-06, + "loss": 0.3552, + "step": 8311 + }, + { + "epoch": 3.9300236406619384, + "grad_norm": 2.687678813934326, + "learning_rate": 1.3492113185893288e-06, + "loss": 0.3462, + "step": 8312 + }, + { + "epoch": 3.9304964539007092, + "grad_norm": 3.4954965114593506, + "learning_rate": 1.3486575486498749e-06, + "loss": 0.4358, + "step": 8313 + }, + { + "epoch": 3.93096926713948, + "grad_norm": 2.8652899265289307, + "learning_rate": 1.3481038503994652e-06, + "loss": 0.3434, + "step": 8314 + }, + { + "epoch": 
3.9314420803782504, + "grad_norm": 3.927623748779297, + "learning_rate": 1.3475502238725797e-06, + "loss": 0.4662, + "step": 8315 + }, + { + "epoch": 3.931914893617021, + "grad_norm": 3.1166276931762695, + "learning_rate": 1.346996669103687e-06, + "loss": 0.3953, + "step": 8316 + }, + { + "epoch": 3.932387706855792, + "grad_norm": 3.140003204345703, + "learning_rate": 1.346443186127257e-06, + "loss": 0.3616, + "step": 8317 + }, + { + "epoch": 3.9328605200945628, + "grad_norm": 3.335466146469116, + "learning_rate": 1.3458897749777516e-06, + "loss": 0.3854, + "step": 8318 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 2.8305466175079346, + "learning_rate": 1.3453364356896282e-06, + "loss": 0.374, + "step": 8319 + }, + { + "epoch": 3.933806146572104, + "grad_norm": 2.9511806964874268, + "learning_rate": 1.344783168297343e-06, + "loss": 0.4235, + "step": 8320 + }, + { + "epoch": 3.9342789598108747, + "grad_norm": 3.1868233680725098, + "learning_rate": 1.3442299728353448e-06, + "loss": 0.4384, + "step": 8321 + }, + { + "epoch": 3.9347517730496455, + "grad_norm": 3.1358237266540527, + "learning_rate": 1.3436768493380766e-06, + "loss": 0.4011, + "step": 8322 + }, + { + "epoch": 3.935224586288416, + "grad_norm": 3.126192808151245, + "learning_rate": 1.343123797839982e-06, + "loss": 0.4061, + "step": 8323 + }, + { + "epoch": 3.9356973995271867, + "grad_norm": 2.9724647998809814, + "learning_rate": 1.3425708183754949e-06, + "loss": 0.3859, + "step": 8324 + }, + { + "epoch": 3.9361702127659575, + "grad_norm": 3.1526355743408203, + "learning_rate": 1.3420179109790485e-06, + "loss": 0.3543, + "step": 8325 + }, + { + "epoch": 3.9366430260047283, + "grad_norm": 3.1289172172546387, + "learning_rate": 1.3414650756850695e-06, + "loss": 0.3836, + "step": 8326 + }, + { + "epoch": 3.937115839243499, + "grad_norm": 2.851264715194702, + "learning_rate": 1.34091231252798e-06, + "loss": 0.3294, + "step": 8327 + }, + { + "epoch": 3.9375886524822694, + "grad_norm": 2.921872138977051, + "learning_rate": 1.3403596215421981e-06, + "loss": 0.3698, + "step": 8328 + }, + { + "epoch": 3.93806146572104, + "grad_norm": 2.947258234024048, + "learning_rate": 1.339807002762137e-06, + "loss": 0.3616, + "step": 8329 + }, + { + "epoch": 3.938534278959811, + "grad_norm": 3.011021375656128, + "learning_rate": 1.3392544562222077e-06, + "loss": 0.3387, + "step": 8330 + }, + { + "epoch": 3.9390070921985814, + "grad_norm": 3.5230746269226074, + "learning_rate": 1.3387019819568134e-06, + "loss": 0.4054, + "step": 8331 + }, + { + "epoch": 3.939479905437352, + "grad_norm": 3.120321035385132, + "learning_rate": 1.3381495800003536e-06, + "loss": 0.4389, + "step": 8332 + }, + { + "epoch": 3.939952718676123, + "grad_norm": 3.0090999603271484, + "learning_rate": 1.3375972503872259e-06, + "loss": 0.4158, + "step": 8333 + }, + { + "epoch": 3.9404255319148938, + "grad_norm": 3.4807989597320557, + "learning_rate": 1.3370449931518198e-06, + "loss": 0.4144, + "step": 8334 + }, + { + "epoch": 3.9408983451536646, + "grad_norm": 2.8535733222961426, + "learning_rate": 1.336492808328523e-06, + "loss": 0.4281, + "step": 8335 + }, + { + "epoch": 3.941371158392435, + "grad_norm": 2.9032745361328125, + "learning_rate": 1.3359406959517174e-06, + "loss": 0.3389, + "step": 8336 + }, + { + "epoch": 3.9418439716312057, + "grad_norm": 2.725823163986206, + "learning_rate": 1.3353886560557793e-06, + "loss": 0.369, + "step": 8337 + }, + { + "epoch": 3.9423167848699765, + "grad_norm": 3.1965179443359375, + "learning_rate": 1.3348366886750844e-06, + "loss": 
0.4031, + "step": 8338 + }, + { + "epoch": 3.942789598108747, + "grad_norm": 2.6991076469421387, + "learning_rate": 1.3342847938439985e-06, + "loss": 0.3434, + "step": 8339 + }, + { + "epoch": 3.9432624113475176, + "grad_norm": 4.491400718688965, + "learning_rate": 1.3337329715968877e-06, + "loss": 0.4175, + "step": 8340 + }, + { + "epoch": 3.9437352245862884, + "grad_norm": 4.005452632904053, + "learning_rate": 1.3331812219681112e-06, + "loss": 0.4191, + "step": 8341 + }, + { + "epoch": 3.9442080378250592, + "grad_norm": 3.1575794219970703, + "learning_rate": 1.3326295449920238e-06, + "loss": 0.4135, + "step": 8342 + }, + { + "epoch": 3.94468085106383, + "grad_norm": 3.2383973598480225, + "learning_rate": 1.3320779407029755e-06, + "loss": 0.38, + "step": 8343 + }, + { + "epoch": 3.9451536643026004, + "grad_norm": 2.873703718185425, + "learning_rate": 1.3315264091353119e-06, + "loss": 0.4128, + "step": 8344 + }, + { + "epoch": 3.945626477541371, + "grad_norm": 2.947274923324585, + "learning_rate": 1.330974950323376e-06, + "loss": 0.3342, + "step": 8345 + }, + { + "epoch": 3.946099290780142, + "grad_norm": 3.2874088287353516, + "learning_rate": 1.330423564301504e-06, + "loss": 0.3849, + "step": 8346 + }, + { + "epoch": 3.9465721040189123, + "grad_norm": 2.885772466659546, + "learning_rate": 1.3298722511040275e-06, + "loss": 0.3562, + "step": 8347 + }, + { + "epoch": 3.947044917257683, + "grad_norm": 3.0031309127807617, + "learning_rate": 1.3293210107652753e-06, + "loss": 0.3593, + "step": 8348 + }, + { + "epoch": 3.947517730496454, + "grad_norm": 2.815854549407959, + "learning_rate": 1.3287698433195712e-06, + "loss": 0.3633, + "step": 8349 + }, + { + "epoch": 3.9479905437352247, + "grad_norm": 3.228415012359619, + "learning_rate": 1.328218748801232e-06, + "loss": 0.3869, + "step": 8350 + }, + { + "epoch": 3.9484633569739955, + "grad_norm": 2.8497684001922607, + "learning_rate": 1.3276677272445743e-06, + "loss": 0.3833, + "step": 8351 + }, + { + "epoch": 3.948936170212766, + "grad_norm": 3.2330706119537354, + "learning_rate": 1.3271167786839057e-06, + "loss": 0.4414, + "step": 8352 + }, + { + "epoch": 3.9494089834515367, + "grad_norm": 3.2179152965545654, + "learning_rate": 1.3265659031535332e-06, + "loss": 0.3821, + "step": 8353 + }, + { + "epoch": 3.9498817966903075, + "grad_norm": 3.0861377716064453, + "learning_rate": 1.3260151006877567e-06, + "loss": 0.4014, + "step": 8354 + }, + { + "epoch": 3.950354609929078, + "grad_norm": 3.118872880935669, + "learning_rate": 1.325464371320871e-06, + "loss": 0.3836, + "step": 8355 + }, + { + "epoch": 3.9508274231678486, + "grad_norm": 2.9787702560424805, + "learning_rate": 1.32491371508717e-06, + "loss": 0.3794, + "step": 8356 + }, + { + "epoch": 3.9513002364066194, + "grad_norm": 3.132089853286743, + "learning_rate": 1.3243631320209387e-06, + "loss": 0.3698, + "step": 8357 + }, + { + "epoch": 3.9517730496453902, + "grad_norm": 2.5304882526397705, + "learning_rate": 1.32381262215646e-06, + "loss": 0.3687, + "step": 8358 + }, + { + "epoch": 3.952245862884161, + "grad_norm": 2.9121861457824707, + "learning_rate": 1.3232621855280126e-06, + "loss": 0.3704, + "step": 8359 + }, + { + "epoch": 3.9527186761229314, + "grad_norm": 3.0885608196258545, + "learning_rate": 1.3227118221698688e-06, + "loss": 0.4303, + "step": 8360 + }, + { + "epoch": 3.953191489361702, + "grad_norm": 2.7274837493896484, + "learning_rate": 1.3221615321162979e-06, + "loss": 0.3556, + "step": 8361 + }, + { + "epoch": 3.953664302600473, + "grad_norm": 3.1329922676086426, + 
"learning_rate": 1.3216113154015625e-06, + "loss": 0.4042, + "step": 8362 + }, + { + "epoch": 3.9541371158392433, + "grad_norm": 2.937380313873291, + "learning_rate": 1.3210611720599243e-06, + "loss": 0.3358, + "step": 8363 + }, + { + "epoch": 3.954609929078014, + "grad_norm": 2.939194440841675, + "learning_rate": 1.3205111021256378e-06, + "loss": 0.3885, + "step": 8364 + }, + { + "epoch": 3.955082742316785, + "grad_norm": 2.9151997566223145, + "learning_rate": 1.3199611056329516e-06, + "loss": 0.4094, + "step": 8365 + }, + { + "epoch": 3.9555555555555557, + "grad_norm": 3.029733419418335, + "learning_rate": 1.3194111826161143e-06, + "loss": 0.3999, + "step": 8366 + }, + { + "epoch": 3.9560283687943265, + "grad_norm": 2.7899951934814453, + "learning_rate": 1.3188613331093653e-06, + "loss": 0.321, + "step": 8367 + }, + { + "epoch": 3.956501182033097, + "grad_norm": 3.1109507083892822, + "learning_rate": 1.3183115571469425e-06, + "loss": 0.4266, + "step": 8368 + }, + { + "epoch": 3.9569739952718677, + "grad_norm": 3.085594415664673, + "learning_rate": 1.3177618547630774e-06, + "loss": 0.4412, + "step": 8369 + }, + { + "epoch": 3.9574468085106385, + "grad_norm": 3.0980300903320312, + "learning_rate": 1.3172122259919968e-06, + "loss": 0.3385, + "step": 8370 + }, + { + "epoch": 3.957919621749409, + "grad_norm": 3.103438138961792, + "learning_rate": 1.3166626708679256e-06, + "loss": 0.3887, + "step": 8371 + }, + { + "epoch": 3.9583924349881796, + "grad_norm": 2.8235526084899902, + "learning_rate": 1.3161131894250812e-06, + "loss": 0.3759, + "step": 8372 + }, + { + "epoch": 3.9588652482269504, + "grad_norm": 2.8316404819488525, + "learning_rate": 1.3155637816976762e-06, + "loss": 0.3666, + "step": 8373 + }, + { + "epoch": 3.959338061465721, + "grad_norm": 2.7873756885528564, + "learning_rate": 1.3150144477199218e-06, + "loss": 0.3284, + "step": 8374 + }, + { + "epoch": 3.959810874704492, + "grad_norm": 3.355039119720459, + "learning_rate": 1.3144651875260218e-06, + "loss": 0.4197, + "step": 8375 + }, + { + "epoch": 3.9602836879432624, + "grad_norm": 3.477721929550171, + "learning_rate": 1.3139160011501761e-06, + "loss": 0.3298, + "step": 8376 + }, + { + "epoch": 3.960756501182033, + "grad_norm": 3.557152032852173, + "learning_rate": 1.3133668886265805e-06, + "loss": 0.3788, + "step": 8377 + }, + { + "epoch": 3.961229314420804, + "grad_norm": 3.06707763671875, + "learning_rate": 1.312817849989424e-06, + "loss": 0.3613, + "step": 8378 + }, + { + "epoch": 3.9617021276595743, + "grad_norm": 2.7702202796936035, + "learning_rate": 1.3122688852728956e-06, + "loss": 0.402, + "step": 8379 + }, + { + "epoch": 3.962174940898345, + "grad_norm": 2.8121016025543213, + "learning_rate": 1.3117199945111746e-06, + "loss": 0.3576, + "step": 8380 + }, + { + "epoch": 3.962647754137116, + "grad_norm": 2.809282064437866, + "learning_rate": 1.3111711777384403e-06, + "loss": 0.3741, + "step": 8381 + }, + { + "epoch": 3.9631205673758867, + "grad_norm": 3.1175687313079834, + "learning_rate": 1.3106224349888638e-06, + "loss": 0.3388, + "step": 8382 + }, + { + "epoch": 3.963593380614657, + "grad_norm": 2.930525064468384, + "learning_rate": 1.310073766296612e-06, + "loss": 0.3593, + "step": 8383 + }, + { + "epoch": 3.964066193853428, + "grad_norm": 3.0673177242279053, + "learning_rate": 1.3095251716958501e-06, + "loss": 0.402, + "step": 8384 + }, + { + "epoch": 3.9645390070921986, + "grad_norm": 2.9725706577301025, + "learning_rate": 1.3089766512207347e-06, + "loss": 0.3707, + "step": 8385 + }, + { + "epoch": 
3.965011820330969, + "grad_norm": 2.9790916442871094, + "learning_rate": 1.3084282049054218e-06, + "loss": 0.3292, + "step": 8386 + }, + { + "epoch": 3.96548463356974, + "grad_norm": 3.257035493850708, + "learning_rate": 1.3078798327840598e-06, + "loss": 0.3753, + "step": 8387 + }, + { + "epoch": 3.9659574468085106, + "grad_norm": 3.0534379482269287, + "learning_rate": 1.307331534890792e-06, + "loss": 0.4134, + "step": 8388 + }, + { + "epoch": 3.9664302600472814, + "grad_norm": 2.919243812561035, + "learning_rate": 1.306783311259761e-06, + "loss": 0.4283, + "step": 8389 + }, + { + "epoch": 3.966903073286052, + "grad_norm": 2.7643322944641113, + "learning_rate": 1.306235161925101e-06, + "loss": 0.3454, + "step": 8390 + }, + { + "epoch": 3.9673758865248225, + "grad_norm": 3.0208916664123535, + "learning_rate": 1.3056870869209431e-06, + "loss": 0.385, + "step": 8391 + }, + { + "epoch": 3.9678486997635933, + "grad_norm": 2.8657243251800537, + "learning_rate": 1.3051390862814135e-06, + "loss": 0.3614, + "step": 8392 + }, + { + "epoch": 3.968321513002364, + "grad_norm": 3.2093591690063477, + "learning_rate": 1.3045911600406325e-06, + "loss": 0.3774, + "step": 8393 + }, + { + "epoch": 3.9687943262411345, + "grad_norm": 3.091618537902832, + "learning_rate": 1.3040433082327192e-06, + "loss": 0.4157, + "step": 8394 + }, + { + "epoch": 3.9692671394799053, + "grad_norm": 2.99763560295105, + "learning_rate": 1.3034955308917849e-06, + "loss": 0.4017, + "step": 8395 + }, + { + "epoch": 3.969739952718676, + "grad_norm": 3.063109874725342, + "learning_rate": 1.3029478280519364e-06, + "loss": 0.4568, + "step": 8396 + }, + { + "epoch": 3.970212765957447, + "grad_norm": 3.2660679817199707, + "learning_rate": 1.3024001997472791e-06, + "loss": 0.3999, + "step": 8397 + }, + { + "epoch": 3.9706855791962177, + "grad_norm": 2.860121250152588, + "learning_rate": 1.3018526460119088e-06, + "loss": 0.433, + "step": 8398 + }, + { + "epoch": 3.971158392434988, + "grad_norm": 3.1037673950195312, + "learning_rate": 1.3013051668799216e-06, + "loss": 0.4526, + "step": 8399 + }, + { + "epoch": 3.971631205673759, + "grad_norm": 2.9408578872680664, + "learning_rate": 1.3007577623854053e-06, + "loss": 0.3722, + "step": 8400 + }, + { + "epoch": 3.9721040189125296, + "grad_norm": 3.0684635639190674, + "learning_rate": 1.3002104325624436e-06, + "loss": 0.3789, + "step": 8401 + }, + { + "epoch": 3.9725768321513, + "grad_norm": 2.6469366550445557, + "learning_rate": 1.2996631774451187e-06, + "loss": 0.3409, + "step": 8402 + }, + { + "epoch": 3.9730496453900708, + "grad_norm": 3.3741610050201416, + "learning_rate": 1.2991159970675033e-06, + "loss": 0.3544, + "step": 8403 + }, + { + "epoch": 3.9735224586288416, + "grad_norm": 3.3716588020324707, + "learning_rate": 1.2985688914636701e-06, + "loss": 0.3747, + "step": 8404 + }, + { + "epoch": 3.9739952718676124, + "grad_norm": 3.000469923019409, + "learning_rate": 1.2980218606676837e-06, + "loss": 0.4506, + "step": 8405 + }, + { + "epoch": 3.974468085106383, + "grad_norm": 3.0139408111572266, + "learning_rate": 1.2974749047136057e-06, + "loss": 0.4156, + "step": 8406 + }, + { + "epoch": 3.9749408983451535, + "grad_norm": 2.9494218826293945, + "learning_rate": 1.2969280236354925e-06, + "loss": 0.3378, + "step": 8407 + }, + { + "epoch": 3.9754137115839243, + "grad_norm": 2.6061158180236816, + "learning_rate": 1.2963812174673948e-06, + "loss": 0.3887, + "step": 8408 + }, + { + "epoch": 3.975886524822695, + "grad_norm": 2.873987913131714, + "learning_rate": 1.295834486243362e-06, + "loss": 
0.3202, + "step": 8409 + }, + { + "epoch": 3.9763593380614655, + "grad_norm": 3.0106539726257324, + "learning_rate": 1.2952878299974358e-06, + "loss": 0.4142, + "step": 8410 + }, + { + "epoch": 3.9768321513002363, + "grad_norm": 3.0011982917785645, + "learning_rate": 1.2947412487636527e-06, + "loss": 0.4121, + "step": 8411 + }, + { + "epoch": 3.977304964539007, + "grad_norm": 3.1321003437042236, + "learning_rate": 1.294194742576048e-06, + "loss": 0.4033, + "step": 8412 + }, + { + "epoch": 3.977777777777778, + "grad_norm": 2.812255382537842, + "learning_rate": 1.2936483114686487e-06, + "loss": 0.3414, + "step": 8413 + }, + { + "epoch": 3.9782505910165487, + "grad_norm": 2.9594221115112305, + "learning_rate": 1.2931019554754804e-06, + "loss": 0.3666, + "step": 8414 + }, + { + "epoch": 3.978723404255319, + "grad_norm": 3.119440793991089, + "learning_rate": 1.2925556746305612e-06, + "loss": 0.3902, + "step": 8415 + }, + { + "epoch": 3.97919621749409, + "grad_norm": 3.042102098464966, + "learning_rate": 1.2920094689679047e-06, + "loss": 0.344, + "step": 8416 + }, + { + "epoch": 3.9796690307328606, + "grad_norm": 2.8443872928619385, + "learning_rate": 1.2914633385215225e-06, + "loss": 0.372, + "step": 8417 + }, + { + "epoch": 3.980141843971631, + "grad_norm": 3.483201265335083, + "learning_rate": 1.2909172833254187e-06, + "loss": 0.4028, + "step": 8418 + }, + { + "epoch": 3.9806146572104018, + "grad_norm": 2.966996431350708, + "learning_rate": 1.2903713034135934e-06, + "loss": 0.3527, + "step": 8419 + }, + { + "epoch": 3.9810874704491725, + "grad_norm": 2.7813172340393066, + "learning_rate": 1.2898253988200437e-06, + "loss": 0.3873, + "step": 8420 + }, + { + "epoch": 3.9815602836879433, + "grad_norm": 3.24611234664917, + "learning_rate": 1.2892795695787602e-06, + "loss": 0.4783, + "step": 8421 + }, + { + "epoch": 3.982033096926714, + "grad_norm": 3.345573663711548, + "learning_rate": 1.2887338157237289e-06, + "loss": 0.4179, + "step": 8422 + }, + { + "epoch": 3.9825059101654845, + "grad_norm": 3.1726880073547363, + "learning_rate": 1.288188137288931e-06, + "loss": 0.3725, + "step": 8423 + }, + { + "epoch": 3.9829787234042553, + "grad_norm": 3.398966073989868, + "learning_rate": 1.2876425343083449e-06, + "loss": 0.4117, + "step": 8424 + }, + { + "epoch": 3.983451536643026, + "grad_norm": 2.615680456161499, + "learning_rate": 1.2870970068159423e-06, + "loss": 0.324, + "step": 8425 + }, + { + "epoch": 3.9839243498817964, + "grad_norm": 3.0505547523498535, + "learning_rate": 1.2865515548456893e-06, + "loss": 0.3698, + "step": 8426 + }, + { + "epoch": 3.9843971631205672, + "grad_norm": 3.077404260635376, + "learning_rate": 1.2860061784315514e-06, + "loss": 0.3592, + "step": 8427 + }, + { + "epoch": 3.984869976359338, + "grad_norm": 2.654080390930176, + "learning_rate": 1.2854608776074855e-06, + "loss": 0.3451, + "step": 8428 + }, + { + "epoch": 3.985342789598109, + "grad_norm": 3.1023523807525635, + "learning_rate": 1.284915652407444e-06, + "loss": 0.3809, + "step": 8429 + }, + { + "epoch": 3.9858156028368796, + "grad_norm": 3.0526652336120605, + "learning_rate": 1.2843705028653783e-06, + "loss": 0.3633, + "step": 8430 + }, + { + "epoch": 3.98628841607565, + "grad_norm": 2.7829604148864746, + "learning_rate": 1.2838254290152296e-06, + "loss": 0.3213, + "step": 8431 + }, + { + "epoch": 3.986761229314421, + "grad_norm": 3.2218687534332275, + "learning_rate": 1.28328043089094e-06, + "loss": 0.465, + "step": 8432 + }, + { + "epoch": 3.9872340425531916, + "grad_norm": 2.952998161315918, + "learning_rate": 
1.2827355085264425e-06, + "loss": 0.4405, + "step": 8433 + }, + { + "epoch": 3.987706855791962, + "grad_norm": 2.81211519241333, + "learning_rate": 1.2821906619556667e-06, + "loss": 0.3444, + "step": 8434 + }, + { + "epoch": 3.9881796690307327, + "grad_norm": 3.1707375049591064, + "learning_rate": 1.281645891212539e-06, + "loss": 0.4019, + "step": 8435 + }, + { + "epoch": 3.9886524822695035, + "grad_norm": 2.791504383087158, + "learning_rate": 1.2811011963309788e-06, + "loss": 0.3606, + "step": 8436 + }, + { + "epoch": 3.9891252955082743, + "grad_norm": 2.954782247543335, + "learning_rate": 1.280556577344903e-06, + "loss": 0.3141, + "step": 8437 + }, + { + "epoch": 3.989598108747045, + "grad_norm": 2.718273878097534, + "learning_rate": 1.2800120342882223e-06, + "loss": 0.3715, + "step": 8438 + }, + { + "epoch": 3.9900709219858155, + "grad_norm": 3.2916250228881836, + "learning_rate": 1.2794675671948425e-06, + "loss": 0.4048, + "step": 8439 + }, + { + "epoch": 3.9905437352245863, + "grad_norm": 3.060060977935791, + "learning_rate": 1.2789231760986655e-06, + "loss": 0.4032, + "step": 8440 + }, + { + "epoch": 3.991016548463357, + "grad_norm": 2.8467273712158203, + "learning_rate": 1.2783788610335882e-06, + "loss": 0.4041, + "step": 8441 + }, + { + "epoch": 3.9914893617021274, + "grad_norm": 3.161790132522583, + "learning_rate": 1.2778346220335013e-06, + "loss": 0.4049, + "step": 8442 + }, + { + "epoch": 3.9919621749408982, + "grad_norm": 2.6512296199798584, + "learning_rate": 1.277290459132295e-06, + "loss": 0.3598, + "step": 8443 + }, + { + "epoch": 3.992434988179669, + "grad_norm": 2.792736291885376, + "learning_rate": 1.276746372363849e-06, + "loss": 0.3874, + "step": 8444 + }, + { + "epoch": 3.99290780141844, + "grad_norm": 2.887047052383423, + "learning_rate": 1.2762023617620433e-06, + "loss": 0.4255, + "step": 8445 + }, + { + "epoch": 3.9933806146572106, + "grad_norm": 3.0420780181884766, + "learning_rate": 1.275658427360751e-06, + "loss": 0.4489, + "step": 8446 + }, + { + "epoch": 3.993853427895981, + "grad_norm": 3.107618570327759, + "learning_rate": 1.2751145691938383e-06, + "loss": 0.4354, + "step": 8447 + }, + { + "epoch": 3.9943262411347518, + "grad_norm": 2.656224250793457, + "learning_rate": 1.2745707872951718e-06, + "loss": 0.4188, + "step": 8448 + }, + { + "epoch": 3.9947990543735226, + "grad_norm": 2.9895219802856445, + "learning_rate": 1.2740270816986079e-06, + "loss": 0.391, + "step": 8449 + }, + { + "epoch": 3.995271867612293, + "grad_norm": 2.919255018234253, + "learning_rate": 1.2734834524380025e-06, + "loss": 0.4058, + "step": 8450 + }, + { + "epoch": 3.9957446808510637, + "grad_norm": 3.4418535232543945, + "learning_rate": 1.2729398995472048e-06, + "loss": 0.3977, + "step": 8451 + }, + { + "epoch": 3.9962174940898345, + "grad_norm": 2.980224132537842, + "learning_rate": 1.272396423060058e-06, + "loss": 0.4417, + "step": 8452 + }, + { + "epoch": 3.9966903073286053, + "grad_norm": 3.6488101482391357, + "learning_rate": 1.2718530230104043e-06, + "loss": 0.4472, + "step": 8453 + }, + { + "epoch": 3.997163120567376, + "grad_norm": 2.725437641143799, + "learning_rate": 1.2713096994320774e-06, + "loss": 0.3125, + "step": 8454 + }, + { + "epoch": 3.9976359338061465, + "grad_norm": 3.453794002532959, + "learning_rate": 1.2707664523589076e-06, + "loss": 0.3792, + "step": 8455 + }, + { + "epoch": 3.9981087470449173, + "grad_norm": 2.8443076610565186, + "learning_rate": 1.270223281824721e-06, + "loss": 0.3627, + "step": 8456 + }, + { + "epoch": 3.998581560283688, + "grad_norm": 
3.1851959228515625, + "learning_rate": 1.2696801878633372e-06, + "loss": 0.3745, + "step": 8457 + }, + { + "epoch": 3.9990543735224584, + "grad_norm": 2.897239923477173, + "learning_rate": 1.2691371705085743e-06, + "loss": 0.3817, + "step": 8458 + }, + { + "epoch": 3.999527186761229, + "grad_norm": 2.92111873626709, + "learning_rate": 1.2685942297942416e-06, + "loss": 0.3824, + "step": 8459 + }, + { + "epoch": 4.0, + "grad_norm": 2.9540340900421143, + "learning_rate": 1.268051365754148e-06, + "loss": 0.3656, + "step": 8460 + }, + { + "epoch": 4.000472813238771, + "grad_norm": 2.6815075874328613, + "learning_rate": 1.2675085784220936e-06, + "loss": 0.3296, + "step": 8461 + }, + { + "epoch": 4.000945626477542, + "grad_norm": 3.0823302268981934, + "learning_rate": 1.2669658678318747e-06, + "loss": 0.3918, + "step": 8462 + }, + { + "epoch": 4.001418439716312, + "grad_norm": 2.5451176166534424, + "learning_rate": 1.2664232340172855e-06, + "loss": 0.3528, + "step": 8463 + }, + { + "epoch": 4.001891252955082, + "grad_norm": 2.539541721343994, + "learning_rate": 1.2658806770121119e-06, + "loss": 0.3034, + "step": 8464 + }, + { + "epoch": 4.002364066193853, + "grad_norm": 2.5537798404693604, + "learning_rate": 1.2653381968501374e-06, + "loss": 0.2981, + "step": 8465 + }, + { + "epoch": 4.002836879432624, + "grad_norm": 2.6316089630126953, + "learning_rate": 1.26479579356514e-06, + "loss": 0.3328, + "step": 8466 + }, + { + "epoch": 4.003309692671395, + "grad_norm": 3.080700635910034, + "learning_rate": 1.2642534671908914e-06, + "loss": 0.3471, + "step": 8467 + }, + { + "epoch": 4.0037825059101655, + "grad_norm": 3.0111753940582275, + "learning_rate": 1.2637112177611614e-06, + "loss": 0.3147, + "step": 8468 + }, + { + "epoch": 4.004255319148936, + "grad_norm": 2.759606122970581, + "learning_rate": 1.2631690453097128e-06, + "loss": 0.2634, + "step": 8469 + }, + { + "epoch": 4.004728132387707, + "grad_norm": 2.862098217010498, + "learning_rate": 1.2626269498703048e-06, + "loss": 0.3333, + "step": 8470 + }, + { + "epoch": 4.005200945626478, + "grad_norm": 3.122239589691162, + "learning_rate": 1.262084931476691e-06, + "loss": 0.3311, + "step": 8471 + }, + { + "epoch": 4.005673758865248, + "grad_norm": 2.6428070068359375, + "learning_rate": 1.261542990162619e-06, + "loss": 0.3534, + "step": 8472 + }, + { + "epoch": 4.006146572104019, + "grad_norm": 3.2870724201202393, + "learning_rate": 1.261001125961836e-06, + "loss": 0.3373, + "step": 8473 + }, + { + "epoch": 4.006619385342789, + "grad_norm": 2.7675375938415527, + "learning_rate": 1.26045933890808e-06, + "loss": 0.3117, + "step": 8474 + }, + { + "epoch": 4.00709219858156, + "grad_norm": 2.811736583709717, + "learning_rate": 1.2599176290350844e-06, + "loss": 0.3087, + "step": 8475 + }, + { + "epoch": 4.007565011820331, + "grad_norm": 2.9146902561187744, + "learning_rate": 1.2593759963765817e-06, + "loss": 0.336, + "step": 8476 + }, + { + "epoch": 4.008037825059102, + "grad_norm": 3.074338674545288, + "learning_rate": 1.2588344409662945e-06, + "loss": 0.384, + "step": 8477 + }, + { + "epoch": 4.008510638297873, + "grad_norm": 3.5597734451293945, + "learning_rate": 1.2582929628379455e-06, + "loss": 0.4061, + "step": 8478 + }, + { + "epoch": 4.008983451536643, + "grad_norm": 3.0091497898101807, + "learning_rate": 1.2577515620252489e-06, + "loss": 0.3783, + "step": 8479 + }, + { + "epoch": 4.009456264775413, + "grad_norm": 2.9654228687286377, + "learning_rate": 1.2572102385619145e-06, + "loss": 0.3541, + "step": 8480 + }, + { + "epoch": 4.009929078014184, + 
"grad_norm": 3.140733242034912, + "learning_rate": 1.2566689924816502e-06, + "loss": 0.3706, + "step": 8481 + }, + { + "epoch": 4.010401891252955, + "grad_norm": 3.2180161476135254, + "learning_rate": 1.2561278238181548e-06, + "loss": 0.3573, + "step": 8482 + }, + { + "epoch": 4.010874704491726, + "grad_norm": 2.91209077835083, + "learning_rate": 1.2555867326051265e-06, + "loss": 0.3619, + "step": 8483 + }, + { + "epoch": 4.0113475177304965, + "grad_norm": 3.016916036605835, + "learning_rate": 1.255045718876256e-06, + "loss": 0.3866, + "step": 8484 + }, + { + "epoch": 4.011820330969267, + "grad_norm": 3.1008472442626953, + "learning_rate": 1.2545047826652294e-06, + "loss": 0.3352, + "step": 8485 + }, + { + "epoch": 4.012293144208038, + "grad_norm": 3.136124610900879, + "learning_rate": 1.2539639240057287e-06, + "loss": 0.3478, + "step": 8486 + }, + { + "epoch": 4.012765957446809, + "grad_norm": 3.0021555423736572, + "learning_rate": 1.2534231429314299e-06, + "loss": 0.3522, + "step": 8487 + }, + { + "epoch": 4.013238770685579, + "grad_norm": 3.2261948585510254, + "learning_rate": 1.2528824394760065e-06, + "loss": 0.3632, + "step": 8488 + }, + { + "epoch": 4.01371158392435, + "grad_norm": 3.0598134994506836, + "learning_rate": 1.2523418136731252e-06, + "loss": 0.3422, + "step": 8489 + }, + { + "epoch": 4.01418439716312, + "grad_norm": 2.9821391105651855, + "learning_rate": 1.2518012655564476e-06, + "loss": 0.3324, + "step": 8490 + }, + { + "epoch": 4.014657210401891, + "grad_norm": 2.583130359649658, + "learning_rate": 1.251260795159633e-06, + "loss": 0.3509, + "step": 8491 + }, + { + "epoch": 4.015130023640662, + "grad_norm": 3.3090853691101074, + "learning_rate": 1.2507204025163333e-06, + "loss": 0.3494, + "step": 8492 + }, + { + "epoch": 4.015602836879433, + "grad_norm": 2.6412856578826904, + "learning_rate": 1.250180087660195e-06, + "loss": 0.3419, + "step": 8493 + }, + { + "epoch": 4.0160756501182036, + "grad_norm": 2.729210615158081, + "learning_rate": 1.2496398506248634e-06, + "loss": 0.3591, + "step": 8494 + }, + { + "epoch": 4.016548463356974, + "grad_norm": 2.892150402069092, + "learning_rate": 1.2490996914439745e-06, + "loss": 0.3866, + "step": 8495 + }, + { + "epoch": 4.017021276595744, + "grad_norm": 3.1967804431915283, + "learning_rate": 1.2485596101511638e-06, + "loss": 0.358, + "step": 8496 + }, + { + "epoch": 4.017494089834515, + "grad_norm": 3.0190439224243164, + "learning_rate": 1.2480196067800588e-06, + "loss": 0.3723, + "step": 8497 + }, + { + "epoch": 4.017966903073286, + "grad_norm": 2.856370210647583, + "learning_rate": 1.2474796813642822e-06, + "loss": 0.3519, + "step": 8498 + }, + { + "epoch": 4.018439716312057, + "grad_norm": 2.979842185974121, + "learning_rate": 1.2469398339374546e-06, + "loss": 0.3483, + "step": 8499 + }, + { + "epoch": 4.0189125295508275, + "grad_norm": 3.0953211784362793, + "learning_rate": 1.246400064533189e-06, + "loss": 0.355, + "step": 8500 + }, + { + "epoch": 4.019385342789598, + "grad_norm": 3.342609167098999, + "learning_rate": 1.2458603731850938e-06, + "loss": 0.4258, + "step": 8501 + }, + { + "epoch": 4.019858156028369, + "grad_norm": 3.2789435386657715, + "learning_rate": 1.2453207599267747e-06, + "loss": 0.3653, + "step": 8502 + }, + { + "epoch": 4.02033096926714, + "grad_norm": 2.8867030143737793, + "learning_rate": 1.2447812247918303e-06, + "loss": 0.3128, + "step": 8503 + }, + { + "epoch": 4.02080378250591, + "grad_norm": 2.9467437267303467, + "learning_rate": 1.2442417678138552e-06, + "loss": 0.3149, + "step": 8504 + }, + { + 
"epoch": 4.0212765957446805, + "grad_norm": 2.6293485164642334, + "learning_rate": 1.2437023890264377e-06, + "loss": 0.2751, + "step": 8505 + }, + { + "epoch": 4.021749408983451, + "grad_norm": 2.9672160148620605, + "learning_rate": 1.2431630884631648e-06, + "loss": 0.3858, + "step": 8506 + }, + { + "epoch": 4.022222222222222, + "grad_norm": 3.0518734455108643, + "learning_rate": 1.2426238661576154e-06, + "loss": 0.3404, + "step": 8507 + }, + { + "epoch": 4.022695035460993, + "grad_norm": 2.829012632369995, + "learning_rate": 1.2420847221433633e-06, + "loss": 0.3211, + "step": 8508 + }, + { + "epoch": 4.023167848699764, + "grad_norm": 2.855806589126587, + "learning_rate": 1.2415456564539808e-06, + "loss": 0.3462, + "step": 8509 + }, + { + "epoch": 4.0236406619385345, + "grad_norm": 3.491786003112793, + "learning_rate": 1.2410066691230311e-06, + "loss": 0.3793, + "step": 8510 + }, + { + "epoch": 4.024113475177305, + "grad_norm": 2.9612972736358643, + "learning_rate": 1.2404677601840765e-06, + "loss": 0.3899, + "step": 8511 + }, + { + "epoch": 4.024586288416075, + "grad_norm": 2.949498176574707, + "learning_rate": 1.2399289296706718e-06, + "loss": 0.3655, + "step": 8512 + }, + { + "epoch": 4.025059101654846, + "grad_norm": 2.736524820327759, + "learning_rate": 1.2393901776163664e-06, + "loss": 0.318, + "step": 8513 + }, + { + "epoch": 4.025531914893617, + "grad_norm": 3.005297899246216, + "learning_rate": 1.2388515040547077e-06, + "loss": 0.3484, + "step": 8514 + }, + { + "epoch": 4.026004728132388, + "grad_norm": 2.9835290908813477, + "learning_rate": 1.2383129090192361e-06, + "loss": 0.3205, + "step": 8515 + }, + { + "epoch": 4.026477541371158, + "grad_norm": 3.1437056064605713, + "learning_rate": 1.2377743925434865e-06, + "loss": 0.3524, + "step": 8516 + }, + { + "epoch": 4.026950354609929, + "grad_norm": 3.0250096321105957, + "learning_rate": 1.2372359546609917e-06, + "loss": 0.3398, + "step": 8517 + }, + { + "epoch": 4.0274231678487, + "grad_norm": 3.109083890914917, + "learning_rate": 1.2366975954052767e-06, + "loss": 0.3317, + "step": 8518 + }, + { + "epoch": 4.027895981087471, + "grad_norm": 2.7713027000427246, + "learning_rate": 1.2361593148098634e-06, + "loss": 0.335, + "step": 8519 + }, + { + "epoch": 4.028368794326241, + "grad_norm": 2.9302117824554443, + "learning_rate": 1.2356211129082673e-06, + "loss": 0.3054, + "step": 8520 + }, + { + "epoch": 4.0288416075650115, + "grad_norm": 3.1805200576782227, + "learning_rate": 1.2350829897339996e-06, + "loss": 0.3219, + "step": 8521 + }, + { + "epoch": 4.029314420803782, + "grad_norm": 3.2687618732452393, + "learning_rate": 1.2345449453205688e-06, + "loss": 0.3966, + "step": 8522 + }, + { + "epoch": 4.029787234042553, + "grad_norm": 3.2010693550109863, + "learning_rate": 1.2340069797014741e-06, + "loss": 0.3547, + "step": 8523 + }, + { + "epoch": 4.030260047281324, + "grad_norm": 2.7061285972595215, + "learning_rate": 1.233469092910215e-06, + "loss": 0.2829, + "step": 8524 + }, + { + "epoch": 4.030732860520095, + "grad_norm": 3.1565401554107666, + "learning_rate": 1.2329312849802817e-06, + "loss": 0.3376, + "step": 8525 + }, + { + "epoch": 4.0312056737588655, + "grad_norm": 2.8864760398864746, + "learning_rate": 1.2323935559451603e-06, + "loss": 0.3946, + "step": 8526 + }, + { + "epoch": 4.031678486997636, + "grad_norm": 3.4621710777282715, + "learning_rate": 1.2318559058383348e-06, + "loss": 0.3859, + "step": 8527 + }, + { + "epoch": 4.032151300236406, + "grad_norm": 3.074201822280884, + "learning_rate": 1.2313183346932806e-06, + 
"loss": 0.3583, + "step": 8528 + }, + { + "epoch": 4.032624113475177, + "grad_norm": 3.1746935844421387, + "learning_rate": 1.2307808425434715e-06, + "loss": 0.3766, + "step": 8529 + }, + { + "epoch": 4.033096926713948, + "grad_norm": 3.327202081680298, + "learning_rate": 1.2302434294223738e-06, + "loss": 0.3556, + "step": 8530 + }, + { + "epoch": 4.033569739952719, + "grad_norm": 3.375643730163574, + "learning_rate": 1.2297060953634496e-06, + "loss": 0.3574, + "step": 8531 + }, + { + "epoch": 4.034042553191489, + "grad_norm": 2.8553316593170166, + "learning_rate": 1.2291688404001573e-06, + "loss": 0.2807, + "step": 8532 + }, + { + "epoch": 4.03451536643026, + "grad_norm": 3.439772367477417, + "learning_rate": 1.2286316645659492e-06, + "loss": 0.3519, + "step": 8533 + }, + { + "epoch": 4.034988179669031, + "grad_norm": 2.794694662094116, + "learning_rate": 1.2280945678942724e-06, + "loss": 0.3117, + "step": 8534 + }, + { + "epoch": 4.035460992907802, + "grad_norm": 2.9869043827056885, + "learning_rate": 1.2275575504185697e-06, + "loss": 0.3663, + "step": 8535 + }, + { + "epoch": 4.035933806146572, + "grad_norm": 2.711435317993164, + "learning_rate": 1.2270206121722777e-06, + "loss": 0.3547, + "step": 8536 + }, + { + "epoch": 4.0364066193853425, + "grad_norm": 2.843391180038452, + "learning_rate": 1.2264837531888317e-06, + "loss": 0.3124, + "step": 8537 + }, + { + "epoch": 4.036879432624113, + "grad_norm": 3.2082388401031494, + "learning_rate": 1.225946973501658e-06, + "loss": 0.3573, + "step": 8538 + }, + { + "epoch": 4.037352245862884, + "grad_norm": 2.799604654312134, + "learning_rate": 1.2254102731441786e-06, + "loss": 0.3234, + "step": 8539 + }, + { + "epoch": 4.037825059101655, + "grad_norm": 2.682777166366577, + "learning_rate": 1.2248736521498137e-06, + "loss": 0.3087, + "step": 8540 + }, + { + "epoch": 4.038297872340426, + "grad_norm": 2.8138248920440674, + "learning_rate": 1.2243371105519741e-06, + "loss": 0.3668, + "step": 8541 + }, + { + "epoch": 4.0387706855791965, + "grad_norm": 3.3388478755950928, + "learning_rate": 1.2238006483840702e-06, + "loss": 0.3294, + "step": 8542 + }, + { + "epoch": 4.039243498817967, + "grad_norm": 3.06247615814209, + "learning_rate": 1.2232642656795039e-06, + "loss": 0.3348, + "step": 8543 + }, + { + "epoch": 4.039716312056737, + "grad_norm": 2.742628335952759, + "learning_rate": 1.2227279624716724e-06, + "loss": 0.3427, + "step": 8544 + }, + { + "epoch": 4.040189125295508, + "grad_norm": 3.0785365104675293, + "learning_rate": 1.222191738793971e-06, + "loss": 0.3762, + "step": 8545 + }, + { + "epoch": 4.040661938534279, + "grad_norm": 3.0352790355682373, + "learning_rate": 1.2216555946797862e-06, + "loss": 0.3311, + "step": 8546 + }, + { + "epoch": 4.04113475177305, + "grad_norm": 3.1949729919433594, + "learning_rate": 1.2211195301625028e-06, + "loss": 0.3429, + "step": 8547 + }, + { + "epoch": 4.04160756501182, + "grad_norm": 3.214021921157837, + "learning_rate": 1.2205835452754989e-06, + "loss": 0.3528, + "step": 8548 + }, + { + "epoch": 4.042080378250591, + "grad_norm": 3.206296443939209, + "learning_rate": 1.2200476400521474e-06, + "loss": 0.3499, + "step": 8549 + }, + { + "epoch": 4.042553191489362, + "grad_norm": 3.0067825317382812, + "learning_rate": 1.2195118145258167e-06, + "loss": 0.3597, + "step": 8550 + }, + { + "epoch": 4.043026004728133, + "grad_norm": 2.7811057567596436, + "learning_rate": 1.21897606872987e-06, + "loss": 0.3268, + "step": 8551 + }, + { + "epoch": 4.043498817966903, + "grad_norm": 3.1679844856262207, + 
"learning_rate": 1.218440402697667e-06, + "loss": 0.4025, + "step": 8552 + }, + { + "epoch": 4.0439716312056735, + "grad_norm": 3.2010326385498047, + "learning_rate": 1.217904816462561e-06, + "loss": 0.3426, + "step": 8553 + }, + { + "epoch": 4.044444444444444, + "grad_norm": 3.381863832473755, + "learning_rate": 1.217369310057899e-06, + "loss": 0.3693, + "step": 8554 + }, + { + "epoch": 4.044917257683215, + "grad_norm": 3.471402168273926, + "learning_rate": 1.2168338835170267e-06, + "loss": 0.3977, + "step": 8555 + }, + { + "epoch": 4.045390070921986, + "grad_norm": 3.0549192428588867, + "learning_rate": 1.2162985368732813e-06, + "loss": 0.3262, + "step": 8556 + }, + { + "epoch": 4.045862884160757, + "grad_norm": 3.02451229095459, + "learning_rate": 1.215763270159998e-06, + "loss": 0.3408, + "step": 8557 + }, + { + "epoch": 4.0463356973995275, + "grad_norm": 3.1335513591766357, + "learning_rate": 1.215228083410505e-06, + "loss": 0.3275, + "step": 8558 + }, + { + "epoch": 4.046808510638298, + "grad_norm": 3.379655599594116, + "learning_rate": 1.2146929766581242e-06, + "loss": 0.3511, + "step": 8559 + }, + { + "epoch": 4.047281323877068, + "grad_norm": 3.210146903991699, + "learning_rate": 1.2141579499361772e-06, + "loss": 0.3607, + "step": 8560 + }, + { + "epoch": 4.047754137115839, + "grad_norm": 3.3693792819976807, + "learning_rate": 1.2136230032779753e-06, + "loss": 0.3642, + "step": 8561 + }, + { + "epoch": 4.04822695035461, + "grad_norm": 3.0397274494171143, + "learning_rate": 1.2130881367168292e-06, + "loss": 0.3376, + "step": 8562 + }, + { + "epoch": 4.048699763593381, + "grad_norm": 3.119372606277466, + "learning_rate": 1.212553350286042e-06, + "loss": 0.3581, + "step": 8563 + }, + { + "epoch": 4.049172576832151, + "grad_norm": 2.9431848526000977, + "learning_rate": 1.2120186440189124e-06, + "loss": 0.3453, + "step": 8564 + }, + { + "epoch": 4.049645390070922, + "grad_norm": 3.256748914718628, + "learning_rate": 1.2114840179487333e-06, + "loss": 0.3766, + "step": 8565 + }, + { + "epoch": 4.050118203309693, + "grad_norm": 2.792759656906128, + "learning_rate": 1.2109494721087953e-06, + "loss": 0.3396, + "step": 8566 + }, + { + "epoch": 4.050591016548464, + "grad_norm": 2.9790122509002686, + "learning_rate": 1.2104150065323813e-06, + "loss": 0.3631, + "step": 8567 + }, + { + "epoch": 4.051063829787234, + "grad_norm": 2.7998805046081543, + "learning_rate": 1.2098806212527705e-06, + "loss": 0.3442, + "step": 8568 + }, + { + "epoch": 4.0515366430260045, + "grad_norm": 3.1292848587036133, + "learning_rate": 1.2093463163032351e-06, + "loss": 0.3798, + "step": 8569 + }, + { + "epoch": 4.052009456264775, + "grad_norm": 3.156205892562866, + "learning_rate": 1.2088120917170465e-06, + "loss": 0.309, + "step": 8570 + }, + { + "epoch": 4.052482269503546, + "grad_norm": 2.8891193866729736, + "learning_rate": 1.208277947527467e-06, + "loss": 0.2989, + "step": 8571 + }, + { + "epoch": 4.052955082742317, + "grad_norm": 3.087719678878784, + "learning_rate": 1.2077438837677548e-06, + "loss": 0.3348, + "step": 8572 + }, + { + "epoch": 4.053427895981088, + "grad_norm": 3.345583915710449, + "learning_rate": 1.2072099004711657e-06, + "loss": 0.3395, + "step": 8573 + }, + { + "epoch": 4.0539007092198585, + "grad_norm": 2.9834377765655518, + "learning_rate": 1.2066759976709463e-06, + "loss": 0.3252, + "step": 8574 + }, + { + "epoch": 4.054373522458629, + "grad_norm": 3.0764353275299072, + "learning_rate": 1.2061421754003425e-06, + "loss": 0.3467, + "step": 8575 + }, + { + "epoch": 4.054846335697399, + 
"grad_norm": 3.332232713699341, + "learning_rate": 1.2056084336925919e-06, + "loss": 0.3448, + "step": 8576 + }, + { + "epoch": 4.05531914893617, + "grad_norm": 3.1885993480682373, + "learning_rate": 1.2050747725809275e-06, + "loss": 0.325, + "step": 8577 + }, + { + "epoch": 4.055791962174941, + "grad_norm": 3.2727091312408447, + "learning_rate": 1.2045411920985798e-06, + "loss": 0.3755, + "step": 8578 + }, + { + "epoch": 4.0562647754137116, + "grad_norm": 3.0687687397003174, + "learning_rate": 1.2040076922787708e-06, + "loss": 0.2791, + "step": 8579 + }, + { + "epoch": 4.056737588652482, + "grad_norm": 3.2538771629333496, + "learning_rate": 1.2034742731547211e-06, + "loss": 0.3409, + "step": 8580 + }, + { + "epoch": 4.057210401891253, + "grad_norm": 3.237423896789551, + "learning_rate": 1.2029409347596429e-06, + "loss": 0.3803, + "step": 8581 + }, + { + "epoch": 4.057683215130024, + "grad_norm": 3.3347854614257812, + "learning_rate": 1.2024076771267457e-06, + "loss": 0.3123, + "step": 8582 + }, + { + "epoch": 4.058156028368795, + "grad_norm": 3.1294021606445312, + "learning_rate": 1.2018745002892327e-06, + "loss": 0.33, + "step": 8583 + }, + { + "epoch": 4.058628841607565, + "grad_norm": 2.9440014362335205, + "learning_rate": 1.2013414042803013e-06, + "loss": 0.3698, + "step": 8584 + }, + { + "epoch": 4.0591016548463354, + "grad_norm": 3.602764129638672, + "learning_rate": 1.200808389133147e-06, + "loss": 0.3733, + "step": 8585 + }, + { + "epoch": 4.059574468085106, + "grad_norm": 3.2689952850341797, + "learning_rate": 1.2002754548809578e-06, + "loss": 0.3188, + "step": 8586 + }, + { + "epoch": 4.060047281323877, + "grad_norm": 3.15454363822937, + "learning_rate": 1.199742601556916e-06, + "loss": 0.3493, + "step": 8587 + }, + { + "epoch": 4.060520094562648, + "grad_norm": 2.843860387802124, + "learning_rate": 1.1992098291942016e-06, + "loss": 0.3277, + "step": 8588 + }, + { + "epoch": 4.060992907801419, + "grad_norm": 3.0749056339263916, + "learning_rate": 1.1986771378259876e-06, + "loss": 0.3465, + "step": 8589 + }, + { + "epoch": 4.061465721040189, + "grad_norm": 3.3339948654174805, + "learning_rate": 1.1981445274854412e-06, + "loss": 0.3507, + "step": 8590 + }, + { + "epoch": 4.06193853427896, + "grad_norm": 2.7992780208587646, + "learning_rate": 1.1976119982057275e-06, + "loss": 0.302, + "step": 8591 + }, + { + "epoch": 4.06241134751773, + "grad_norm": 3.0862269401550293, + "learning_rate": 1.1970795500200028e-06, + "loss": 0.3365, + "step": 8592 + }, + { + "epoch": 4.062884160756501, + "grad_norm": 3.263456106185913, + "learning_rate": 1.1965471829614222e-06, + "loss": 0.3764, + "step": 8593 + }, + { + "epoch": 4.063356973995272, + "grad_norm": 3.0682623386383057, + "learning_rate": 1.1960148970631332e-06, + "loss": 0.3488, + "step": 8594 + }, + { + "epoch": 4.0638297872340425, + "grad_norm": 2.8910646438598633, + "learning_rate": 1.195482692358278e-06, + "loss": 0.3224, + "step": 8595 + }, + { + "epoch": 4.064302600472813, + "grad_norm": 3.170072555541992, + "learning_rate": 1.1949505688799961e-06, + "loss": 0.3058, + "step": 8596 + }, + { + "epoch": 4.064775413711584, + "grad_norm": 3.018674373626709, + "learning_rate": 1.19441852666142e-06, + "loss": 0.3824, + "step": 8597 + }, + { + "epoch": 4.065248226950355, + "grad_norm": 3.0038044452667236, + "learning_rate": 1.1938865657356773e-06, + "loss": 0.3657, + "step": 8598 + }, + { + "epoch": 4.065721040189126, + "grad_norm": 3.248204469680786, + "learning_rate": 1.193354686135891e-06, + "loss": 0.3305, + "step": 8599 + }, + { + 
"epoch": 4.066193853427896, + "grad_norm": 3.144714832305908, + "learning_rate": 1.192822887895178e-06, + "loss": 0.3395, + "step": 8600 + }, + { + "epoch": 4.066666666666666, + "grad_norm": 2.9457240104675293, + "learning_rate": 1.1922911710466531e-06, + "loss": 0.3288, + "step": 8601 + }, + { + "epoch": 4.067139479905437, + "grad_norm": 3.1602869033813477, + "learning_rate": 1.1917595356234218e-06, + "loss": 0.3713, + "step": 8602 + }, + { + "epoch": 4.067612293144208, + "grad_norm": 3.0820837020874023, + "learning_rate": 1.1912279816585888e-06, + "loss": 0.2987, + "step": 8603 + }, + { + "epoch": 4.068085106382979, + "grad_norm": 3.0366809368133545, + "learning_rate": 1.1906965091852502e-06, + "loss": 0.4151, + "step": 8604 + }, + { + "epoch": 4.06855791962175, + "grad_norm": 3.229402780532837, + "learning_rate": 1.190165118236498e-06, + "loss": 0.321, + "step": 8605 + }, + { + "epoch": 4.06903073286052, + "grad_norm": 2.832232713699341, + "learning_rate": 1.1896338088454217e-06, + "loss": 0.3551, + "step": 8606 + }, + { + "epoch": 4.069503546099291, + "grad_norm": 3.5618600845336914, + "learning_rate": 1.1891025810451012e-06, + "loss": 0.3704, + "step": 8607 + }, + { + "epoch": 4.069976359338061, + "grad_norm": 3.287827491760254, + "learning_rate": 1.1885714348686158e-06, + "loss": 0.3469, + "step": 8608 + }, + { + "epoch": 4.070449172576832, + "grad_norm": 3.468825101852417, + "learning_rate": 1.188040370349037e-06, + "loss": 0.3687, + "step": 8609 + }, + { + "epoch": 4.070921985815603, + "grad_norm": 3.2931180000305176, + "learning_rate": 1.1875093875194302e-06, + "loss": 0.3832, + "step": 8610 + }, + { + "epoch": 4.0713947990543735, + "grad_norm": 2.9613003730773926, + "learning_rate": 1.18697848641286e-06, + "loss": 0.3314, + "step": 8611 + }, + { + "epoch": 4.071867612293144, + "grad_norm": 3.1507649421691895, + "learning_rate": 1.1864476670623816e-06, + "loss": 0.3153, + "step": 8612 + }, + { + "epoch": 4.072340425531915, + "grad_norm": 2.844064950942993, + "learning_rate": 1.1859169295010478e-06, + "loss": 0.3566, + "step": 8613 + }, + { + "epoch": 4.072813238770686, + "grad_norm": 3.227264881134033, + "learning_rate": 1.1853862737619042e-06, + "loss": 0.3717, + "step": 8614 + }, + { + "epoch": 4.073286052009456, + "grad_norm": 2.9416239261627197, + "learning_rate": 1.1848556998779922e-06, + "loss": 0.3438, + "step": 8615 + }, + { + "epoch": 4.073758865248227, + "grad_norm": 4.1662492752075195, + "learning_rate": 1.18432520788235e-06, + "loss": 0.362, + "step": 8616 + }, + { + "epoch": 4.074231678486997, + "grad_norm": 3.47951602935791, + "learning_rate": 1.183794797808008e-06, + "loss": 0.3672, + "step": 8617 + }, + { + "epoch": 4.074704491725768, + "grad_norm": 2.998969793319702, + "learning_rate": 1.1832644696879919e-06, + "loss": 0.3281, + "step": 8618 + }, + { + "epoch": 4.075177304964539, + "grad_norm": 2.956167221069336, + "learning_rate": 1.182734223555324e-06, + "loss": 0.3059, + "step": 8619 + }, + { + "epoch": 4.07565011820331, + "grad_norm": 3.447821855545044, + "learning_rate": 1.1822040594430195e-06, + "loss": 0.333, + "step": 8620 + }, + { + "epoch": 4.076122931442081, + "grad_norm": 3.072972059249878, + "learning_rate": 1.1816739773840905e-06, + "loss": 0.3737, + "step": 8621 + }, + { + "epoch": 4.076595744680851, + "grad_norm": 3.142913341522217, + "learning_rate": 1.1811439774115424e-06, + "loss": 0.3697, + "step": 8622 + }, + { + "epoch": 4.077068557919622, + "grad_norm": 3.4997763633728027, + "learning_rate": 1.1806140595583745e-06, + "loss": 0.4177, + 
"step": 8623 + }, + { + "epoch": 4.077541371158392, + "grad_norm": 3.032951831817627, + "learning_rate": 1.1800842238575853e-06, + "loss": 0.351, + "step": 8624 + }, + { + "epoch": 4.078014184397163, + "grad_norm": 2.8878438472747803, + "learning_rate": 1.1795544703421625e-06, + "loss": 0.3409, + "step": 8625 + }, + { + "epoch": 4.078486997635934, + "grad_norm": 2.931614637374878, + "learning_rate": 1.1790247990450936e-06, + "loss": 0.3416, + "step": 8626 + }, + { + "epoch": 4.0789598108747045, + "grad_norm": 3.1719822883605957, + "learning_rate": 1.1784952099993586e-06, + "loss": 0.3574, + "step": 8627 + }, + { + "epoch": 4.079432624113475, + "grad_norm": 2.960068464279175, + "learning_rate": 1.1779657032379322e-06, + "loss": 0.3557, + "step": 8628 + }, + { + "epoch": 4.079905437352246, + "grad_norm": 3.1410937309265137, + "learning_rate": 1.1774362787937843e-06, + "loss": 0.3839, + "step": 8629 + }, + { + "epoch": 4.080378250591017, + "grad_norm": 3.596153736114502, + "learning_rate": 1.1769069366998793e-06, + "loss": 0.3135, + "step": 8630 + }, + { + "epoch": 4.080851063829787, + "grad_norm": 3.385826587677002, + "learning_rate": 1.1763776769891786e-06, + "loss": 0.3624, + "step": 8631 + }, + { + "epoch": 4.081323877068558, + "grad_norm": 3.2531018257141113, + "learning_rate": 1.175848499694636e-06, + "loss": 0.3593, + "step": 8632 + }, + { + "epoch": 4.081796690307328, + "grad_norm": 3.3864004611968994, + "learning_rate": 1.1753194048492004e-06, + "loss": 0.3929, + "step": 8633 + }, + { + "epoch": 4.082269503546099, + "grad_norm": 2.8734285831451416, + "learning_rate": 1.1747903924858175e-06, + "loss": 0.3145, + "step": 8634 + }, + { + "epoch": 4.08274231678487, + "grad_norm": 3.3261659145355225, + "learning_rate": 1.174261462637426e-06, + "loss": 0.3351, + "step": 8635 + }, + { + "epoch": 4.083215130023641, + "grad_norm": 3.413990020751953, + "learning_rate": 1.1737326153369594e-06, + "loss": 0.3984, + "step": 8636 + }, + { + "epoch": 4.083687943262412, + "grad_norm": 3.311741590499878, + "learning_rate": 1.1732038506173481e-06, + "loss": 0.3716, + "step": 8637 + }, + { + "epoch": 4.084160756501182, + "grad_norm": 3.691573143005371, + "learning_rate": 1.1726751685115142e-06, + "loss": 0.3542, + "step": 8638 + }, + { + "epoch": 4.084633569739952, + "grad_norm": 3.1951167583465576, + "learning_rate": 1.1721465690523784e-06, + "loss": 0.3683, + "step": 8639 + }, + { + "epoch": 4.085106382978723, + "grad_norm": 3.1731514930725098, + "learning_rate": 1.1716180522728534e-06, + "loss": 0.3552, + "step": 8640 + }, + { + "epoch": 4.085579196217494, + "grad_norm": 3.1588845252990723, + "learning_rate": 1.1710896182058465e-06, + "loss": 0.3908, + "step": 8641 + }, + { + "epoch": 4.086052009456265, + "grad_norm": 3.6902294158935547, + "learning_rate": 1.1705612668842628e-06, + "loss": 0.4099, + "step": 8642 + }, + { + "epoch": 4.0865248226950355, + "grad_norm": 4.56397819519043, + "learning_rate": 1.1700329983409988e-06, + "loss": 0.3456, + "step": 8643 + }, + { + "epoch": 4.086997635933806, + "grad_norm": 2.924715995788574, + "learning_rate": 1.1695048126089492e-06, + "loss": 0.3885, + "step": 8644 + }, + { + "epoch": 4.087470449172577, + "grad_norm": 3.537550687789917, + "learning_rate": 1.1689767097210009e-06, + "loss": 0.3551, + "step": 8645 + }, + { + "epoch": 4.087943262411348, + "grad_norm": 3.0198440551757812, + "learning_rate": 1.1684486897100364e-06, + "loss": 0.3448, + "step": 8646 + }, + { + "epoch": 4.088416075650118, + "grad_norm": 3.448965072631836, + "learning_rate": 
1.1679207526089334e-06, + "loss": 0.3252, + "step": 8647 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 3.057326078414917, + "learning_rate": 1.167392898450563e-06, + "loss": 0.3231, + "step": 8648 + }, + { + "epoch": 4.089361702127659, + "grad_norm": 3.0788655281066895, + "learning_rate": 1.1668651272677948e-06, + "loss": 0.3273, + "step": 8649 + }, + { + "epoch": 4.08983451536643, + "grad_norm": 2.9126291275024414, + "learning_rate": 1.1663374390934893e-06, + "loss": 0.3162, + "step": 8650 + }, + { + "epoch": 4.090307328605201, + "grad_norm": 3.278874635696411, + "learning_rate": 1.1658098339605027e-06, + "loss": 0.3123, + "step": 8651 + }, + { + "epoch": 4.090780141843972, + "grad_norm": 2.8490889072418213, + "learning_rate": 1.1652823119016882e-06, + "loss": 0.3408, + "step": 8652 + }, + { + "epoch": 4.091252955082743, + "grad_norm": 3.0473995208740234, + "learning_rate": 1.164754872949891e-06, + "loss": 0.3349, + "step": 8653 + }, + { + "epoch": 4.091725768321513, + "grad_norm": 2.9052987098693848, + "learning_rate": 1.1642275171379535e-06, + "loss": 0.3113, + "step": 8654 + }, + { + "epoch": 4.092198581560283, + "grad_norm": 3.1060919761657715, + "learning_rate": 1.1637002444987116e-06, + "loss": 0.3139, + "step": 8655 + }, + { + "epoch": 4.092671394799054, + "grad_norm": 3.172394275665283, + "learning_rate": 1.163173055064995e-06, + "loss": 0.3555, + "step": 8656 + }, + { + "epoch": 4.093144208037825, + "grad_norm": 3.36523699760437, + "learning_rate": 1.1626459488696313e-06, + "loss": 0.3835, + "step": 8657 + }, + { + "epoch": 4.093617021276596, + "grad_norm": 2.9513938426971436, + "learning_rate": 1.1621189259454393e-06, + "loss": 0.3432, + "step": 8658 + }, + { + "epoch": 4.0940898345153665, + "grad_norm": 2.8415515422821045, + "learning_rate": 1.1615919863252365e-06, + "loss": 0.3494, + "step": 8659 + }, + { + "epoch": 4.094562647754137, + "grad_norm": 3.3759984970092773, + "learning_rate": 1.1610651300418315e-06, + "loss": 0.3519, + "step": 8660 + }, + { + "epoch": 4.095035460992908, + "grad_norm": 3.1927380561828613, + "learning_rate": 1.1605383571280304e-06, + "loss": 0.3675, + "step": 8661 + }, + { + "epoch": 4.095508274231679, + "grad_norm": 2.800658941268921, + "learning_rate": 1.1600116676166321e-06, + "loss": 0.3291, + "step": 8662 + }, + { + "epoch": 4.095981087470449, + "grad_norm": 2.9948630332946777, + "learning_rate": 1.1594850615404316e-06, + "loss": 0.332, + "step": 8663 + }, + { + "epoch": 4.0964539007092196, + "grad_norm": 3.032003879547119, + "learning_rate": 1.1589585389322176e-06, + "loss": 0.3583, + "step": 8664 + }, + { + "epoch": 4.09692671394799, + "grad_norm": 2.9765310287475586, + "learning_rate": 1.1584320998247757e-06, + "loss": 0.3296, + "step": 8665 + }, + { + "epoch": 4.097399527186761, + "grad_norm": 3.049954414367676, + "learning_rate": 1.1579057442508838e-06, + "loss": 0.4007, + "step": 8666 + }, + { + "epoch": 4.097872340425532, + "grad_norm": 3.3874928951263428, + "learning_rate": 1.1573794722433168e-06, + "loss": 0.3856, + "step": 8667 + }, + { + "epoch": 4.098345153664303, + "grad_norm": 2.56701397895813, + "learning_rate": 1.1568532838348432e-06, + "loss": 0.3291, + "step": 8668 + }, + { + "epoch": 4.0988179669030735, + "grad_norm": 2.956408739089966, + "learning_rate": 1.1563271790582247e-06, + "loss": 0.3538, + "step": 8669 + }, + { + "epoch": 4.099290780141844, + "grad_norm": 3.827467679977417, + "learning_rate": 1.1558011579462225e-06, + "loss": 0.3764, + "step": 8670 + }, + { + "epoch": 4.099763593380614, + "grad_norm": 
3.3271424770355225, + "learning_rate": 1.1552752205315867e-06, + "loss": 0.339, + "step": 8671 + }, + { + "epoch": 4.100236406619385, + "grad_norm": 3.0050785541534424, + "learning_rate": 1.1547493668470675e-06, + "loss": 0.3764, + "step": 8672 + }, + { + "epoch": 4.100709219858156, + "grad_norm": 2.6030385494232178, + "learning_rate": 1.1542235969254065e-06, + "loss": 0.3507, + "step": 8673 + }, + { + "epoch": 4.101182033096927, + "grad_norm": 3.081695556640625, + "learning_rate": 1.1536979107993402e-06, + "loss": 0.3386, + "step": 8674 + }, + { + "epoch": 4.101654846335697, + "grad_norm": 2.7685163021087646, + "learning_rate": 1.1531723085016025e-06, + "loss": 0.3456, + "step": 8675 + }, + { + "epoch": 4.102127659574468, + "grad_norm": 3.037252902984619, + "learning_rate": 1.1526467900649195e-06, + "loss": 0.3489, + "step": 8676 + }, + { + "epoch": 4.102600472813239, + "grad_norm": 2.9675045013427734, + "learning_rate": 1.1521213555220129e-06, + "loss": 0.3638, + "step": 8677 + }, + { + "epoch": 4.10307328605201, + "grad_norm": 3.3377575874328613, + "learning_rate": 1.1515960049055994e-06, + "loss": 0.3557, + "step": 8678 + }, + { + "epoch": 4.10354609929078, + "grad_norm": 2.7452030181884766, + "learning_rate": 1.1510707382483888e-06, + "loss": 0.285, + "step": 8679 + }, + { + "epoch": 4.1040189125295505, + "grad_norm": 2.9602560997009277, + "learning_rate": 1.1505455555830897e-06, + "loss": 0.321, + "step": 8680 + }, + { + "epoch": 4.104491725768321, + "grad_norm": 3.122945547103882, + "learning_rate": 1.1500204569424007e-06, + "loss": 0.375, + "step": 8681 + }, + { + "epoch": 4.104964539007092, + "grad_norm": 3.334885835647583, + "learning_rate": 1.149495442359019e-06, + "loss": 0.3399, + "step": 8682 + }, + { + "epoch": 4.105437352245863, + "grad_norm": 3.3663594722747803, + "learning_rate": 1.1489705118656346e-06, + "loss": 0.3482, + "step": 8683 + }, + { + "epoch": 4.105910165484634, + "grad_norm": 2.9761641025543213, + "learning_rate": 1.1484456654949313e-06, + "loss": 0.3317, + "step": 8684 + }, + { + "epoch": 4.1063829787234045, + "grad_norm": 3.486905574798584, + "learning_rate": 1.147920903279591e-06, + "loss": 0.3737, + "step": 8685 + }, + { + "epoch": 4.106855791962175, + "grad_norm": 3.4820523262023926, + "learning_rate": 1.1473962252522875e-06, + "loss": 0.4135, + "step": 8686 + }, + { + "epoch": 4.107328605200945, + "grad_norm": 3.314117431640625, + "learning_rate": 1.146871631445689e-06, + "loss": 0.369, + "step": 8687 + }, + { + "epoch": 4.107801418439716, + "grad_norm": 2.9497411251068115, + "learning_rate": 1.1463471218924615e-06, + "loss": 0.3233, + "step": 8688 + }, + { + "epoch": 4.108274231678487, + "grad_norm": 3.1337075233459473, + "learning_rate": 1.1458226966252624e-06, + "loss": 0.401, + "step": 8689 + }, + { + "epoch": 4.108747044917258, + "grad_norm": 3.0163166522979736, + "learning_rate": 1.1452983556767473e-06, + "loss": 0.3812, + "step": 8690 + }, + { + "epoch": 4.109219858156028, + "grad_norm": 2.976491928100586, + "learning_rate": 1.1447740990795629e-06, + "loss": 0.3508, + "step": 8691 + }, + { + "epoch": 4.109692671394799, + "grad_norm": 3.2449910640716553, + "learning_rate": 1.144249926866353e-06, + "loss": 0.3056, + "step": 8692 + }, + { + "epoch": 4.11016548463357, + "grad_norm": 2.562558650970459, + "learning_rate": 1.1437258390697553e-06, + "loss": 0.2878, + "step": 8693 + }, + { + "epoch": 4.110638297872341, + "grad_norm": 3.1823108196258545, + "learning_rate": 1.1432018357224017e-06, + "loss": 0.2849, + "step": 8694 + }, + { + "epoch": 
4.111111111111111, + "grad_norm": 2.9045653343200684, + "learning_rate": 1.1426779168569217e-06, + "loss": 0.3264, + "step": 8695 + }, + { + "epoch": 4.1115839243498815, + "grad_norm": 2.7991254329681396, + "learning_rate": 1.1421540825059355e-06, + "loss": 0.3427, + "step": 8696 + }, + { + "epoch": 4.112056737588652, + "grad_norm": 2.9184927940368652, + "learning_rate": 1.14163033270206e-06, + "loss": 0.3073, + "step": 8697 + }, + { + "epoch": 4.112529550827423, + "grad_norm": 3.189335584640503, + "learning_rate": 1.1411066674779084e-06, + "loss": 0.3836, + "step": 8698 + }, + { + "epoch": 4.113002364066194, + "grad_norm": 2.899711847305298, + "learning_rate": 1.140583086866085e-06, + "loss": 0.3378, + "step": 8699 + }, + { + "epoch": 4.113475177304965, + "grad_norm": 3.167665481567383, + "learning_rate": 1.1400595908991927e-06, + "loss": 0.3273, + "step": 8700 + }, + { + "epoch": 4.1139479905437355, + "grad_norm": 3.2930212020874023, + "learning_rate": 1.1395361796098268e-06, + "loss": 0.3221, + "step": 8701 + }, + { + "epoch": 4.114420803782506, + "grad_norm": 3.0603861808776855, + "learning_rate": 1.1390128530305764e-06, + "loss": 0.3371, + "step": 8702 + }, + { + "epoch": 4.114893617021276, + "grad_norm": 3.6339457035064697, + "learning_rate": 1.1384896111940289e-06, + "loss": 0.3986, + "step": 8703 + }, + { + "epoch": 4.115366430260047, + "grad_norm": 2.975799322128296, + "learning_rate": 1.1379664541327623e-06, + "loss": 0.3021, + "step": 8704 + }, + { + "epoch": 4.115839243498818, + "grad_norm": 2.9100987911224365, + "learning_rate": 1.1374433818793534e-06, + "loss": 0.3473, + "step": 8705 + }, + { + "epoch": 4.116312056737589, + "grad_norm": 2.9515233039855957, + "learning_rate": 1.1369203944663704e-06, + "loss": 0.3004, + "step": 8706 + }, + { + "epoch": 4.116784869976359, + "grad_norm": 3.283583879470825, + "learning_rate": 1.1363974919263774e-06, + "loss": 0.401, + "step": 8707 + }, + { + "epoch": 4.11725768321513, + "grad_norm": 3.307530641555786, + "learning_rate": 1.1358746742919325e-06, + "loss": 0.322, + "step": 8708 + }, + { + "epoch": 4.117730496453901, + "grad_norm": 3.6834614276885986, + "learning_rate": 1.135351941595591e-06, + "loss": 0.3703, + "step": 8709 + }, + { + "epoch": 4.118203309692672, + "grad_norm": 3.0829904079437256, + "learning_rate": 1.1348292938699e-06, + "loss": 0.3283, + "step": 8710 + }, + { + "epoch": 4.118676122931442, + "grad_norm": 2.914794921875, + "learning_rate": 1.1343067311474033e-06, + "loss": 0.337, + "step": 8711 + }, + { + "epoch": 4.1191489361702125, + "grad_norm": 3.550536870956421, + "learning_rate": 1.1337842534606368e-06, + "loss": 0.3752, + "step": 8712 + }, + { + "epoch": 4.119621749408983, + "grad_norm": 3.337012767791748, + "learning_rate": 1.1332618608421353e-06, + "loss": 0.3604, + "step": 8713 + }, + { + "epoch": 4.120094562647754, + "grad_norm": 2.7749485969543457, + "learning_rate": 1.1327395533244248e-06, + "loss": 0.3712, + "step": 8714 + }, + { + "epoch": 4.120567375886525, + "grad_norm": 3.571261405944824, + "learning_rate": 1.1322173309400258e-06, + "loss": 0.4148, + "step": 8715 + }, + { + "epoch": 4.121040189125296, + "grad_norm": 3.264871597290039, + "learning_rate": 1.1316951937214573e-06, + "loss": 0.3229, + "step": 8716 + }, + { + "epoch": 4.1215130023640665, + "grad_norm": 2.974625825881958, + "learning_rate": 1.131173141701228e-06, + "loss": 0.3372, + "step": 8717 + }, + { + "epoch": 4.121985815602837, + "grad_norm": 3.18060302734375, + "learning_rate": 1.1306511749118466e-06, + "loss": 0.4041, + "step": 8718 
+ }, + { + "epoch": 4.122458628841607, + "grad_norm": 2.7793190479278564, + "learning_rate": 1.1301292933858115e-06, + "loss": 0.3329, + "step": 8719 + }, + { + "epoch": 4.122931442080378, + "grad_norm": 3.0883100032806396, + "learning_rate": 1.1296074971556179e-06, + "loss": 0.3999, + "step": 8720 + }, + { + "epoch": 4.123404255319149, + "grad_norm": 2.984799385070801, + "learning_rate": 1.1290857862537573e-06, + "loss": 0.3432, + "step": 8721 + }, + { + "epoch": 4.12387706855792, + "grad_norm": 3.0691094398498535, + "learning_rate": 1.1285641607127127e-06, + "loss": 0.3043, + "step": 8722 + }, + { + "epoch": 4.12434988179669, + "grad_norm": 3.2218985557556152, + "learning_rate": 1.128042620564965e-06, + "loss": 0.363, + "step": 8723 + }, + { + "epoch": 4.124822695035461, + "grad_norm": 2.951098918914795, + "learning_rate": 1.1275211658429877e-06, + "loss": 0.3459, + "step": 8724 + }, + { + "epoch": 4.125295508274232, + "grad_norm": 3.038513660430908, + "learning_rate": 1.1269997965792493e-06, + "loss": 0.3073, + "step": 8725 + }, + { + "epoch": 4.125768321513003, + "grad_norm": 2.7548015117645264, + "learning_rate": 1.1264785128062129e-06, + "loss": 0.3587, + "step": 8726 + }, + { + "epoch": 4.126241134751773, + "grad_norm": 3.618379592895508, + "learning_rate": 1.125957314556336e-06, + "loss": 0.4009, + "step": 8727 + }, + { + "epoch": 4.1267139479905435, + "grad_norm": 3.264702320098877, + "learning_rate": 1.1254362018620728e-06, + "loss": 0.3684, + "step": 8728 + }, + { + "epoch": 4.127186761229314, + "grad_norm": 3.209995746612549, + "learning_rate": 1.1249151747558704e-06, + "loss": 0.3796, + "step": 8729 + }, + { + "epoch": 4.127659574468085, + "grad_norm": 3.164973735809326, + "learning_rate": 1.1243942332701693e-06, + "loss": 0.3147, + "step": 8730 + }, + { + "epoch": 4.128132387706856, + "grad_norm": 3.309659957885742, + "learning_rate": 1.1238733774374087e-06, + "loss": 0.308, + "step": 8731 + }, + { + "epoch": 4.128605200945627, + "grad_norm": 3.138901710510254, + "learning_rate": 1.1233526072900184e-06, + "loss": 0.3721, + "step": 8732 + }, + { + "epoch": 4.1290780141843975, + "grad_norm": 3.5710649490356445, + "learning_rate": 1.122831922860424e-06, + "loss": 0.3872, + "step": 8733 + }, + { + "epoch": 4.129550827423168, + "grad_norm": 3.192469835281372, + "learning_rate": 1.1223113241810482e-06, + "loss": 0.349, + "step": 8734 + }, + { + "epoch": 4.130023640661938, + "grad_norm": 2.9302608966827393, + "learning_rate": 1.121790811284304e-06, + "loss": 0.3207, + "step": 8735 + }, + { + "epoch": 4.130496453900709, + "grad_norm": 3.022963047027588, + "learning_rate": 1.121270384202604e-06, + "loss": 0.3487, + "step": 8736 + }, + { + "epoch": 4.13096926713948, + "grad_norm": 3.0473732948303223, + "learning_rate": 1.1207500429683513e-06, + "loss": 0.3083, + "step": 8737 + }, + { + "epoch": 4.131442080378251, + "grad_norm": 2.9411537647247314, + "learning_rate": 1.1202297876139448e-06, + "loss": 0.3077, + "step": 8738 + }, + { + "epoch": 4.131914893617021, + "grad_norm": 2.9274520874023438, + "learning_rate": 1.1197096181717804e-06, + "loss": 0.3071, + "step": 8739 + }, + { + "epoch": 4.132387706855792, + "grad_norm": 2.79213285446167, + "learning_rate": 1.1191895346742454e-06, + "loss": 0.3346, + "step": 8740 + }, + { + "epoch": 4.132860520094563, + "grad_norm": 3.2763726711273193, + "learning_rate": 1.1186695371537235e-06, + "loss": 0.3753, + "step": 8741 + }, + { + "epoch": 4.133333333333334, + "grad_norm": 3.245525598526001, + "learning_rate": 1.1181496256425927e-06, + "loss": 
0.3586, + "step": 8742 + }, + { + "epoch": 4.133806146572104, + "grad_norm": 3.557176351547241, + "learning_rate": 1.1176298001732244e-06, + "loss": 0.3547, + "step": 8743 + }, + { + "epoch": 4.1342789598108745, + "grad_norm": 3.674633741378784, + "learning_rate": 1.117110060777988e-06, + "loss": 0.3994, + "step": 8744 + }, + { + "epoch": 4.134751773049645, + "grad_norm": 3.168025016784668, + "learning_rate": 1.1165904074892433e-06, + "loss": 0.3568, + "step": 8745 + }, + { + "epoch": 4.135224586288416, + "grad_norm": 2.9492177963256836, + "learning_rate": 1.1160708403393488e-06, + "loss": 0.3257, + "step": 8746 + }, + { + "epoch": 4.135697399527187, + "grad_norm": 3.139941930770874, + "learning_rate": 1.1155513593606548e-06, + "loss": 0.3464, + "step": 8747 + }, + { + "epoch": 4.136170212765958, + "grad_norm": 3.1875250339508057, + "learning_rate": 1.115031964585506e-06, + "loss": 0.3154, + "step": 8748 + }, + { + "epoch": 4.136643026004728, + "grad_norm": 3.0219457149505615, + "learning_rate": 1.1145126560462447e-06, + "loss": 0.3433, + "step": 8749 + }, + { + "epoch": 4.137115839243499, + "grad_norm": 2.992807149887085, + "learning_rate": 1.1139934337752046e-06, + "loss": 0.3127, + "step": 8750 + }, + { + "epoch": 4.137588652482269, + "grad_norm": 3.354733943939209, + "learning_rate": 1.1134742978047163e-06, + "loss": 0.3166, + "step": 8751 + }, + { + "epoch": 4.13806146572104, + "grad_norm": 3.1885886192321777, + "learning_rate": 1.1129552481671042e-06, + "loss": 0.3872, + "step": 8752 + }, + { + "epoch": 4.138534278959811, + "grad_norm": 2.8869078159332275, + "learning_rate": 1.1124362848946858e-06, + "loss": 0.3218, + "step": 8753 + }, + { + "epoch": 4.1390070921985815, + "grad_norm": 3.818469285964966, + "learning_rate": 1.1119174080197762e-06, + "loss": 0.3442, + "step": 8754 + }, + { + "epoch": 4.139479905437352, + "grad_norm": 3.2445592880249023, + "learning_rate": 1.1113986175746833e-06, + "loss": 0.3858, + "step": 8755 + }, + { + "epoch": 4.139952718676123, + "grad_norm": 2.654083490371704, + "learning_rate": 1.1108799135917098e-06, + "loss": 0.3023, + "step": 8756 + }, + { + "epoch": 4.140425531914894, + "grad_norm": 3.129635810852051, + "learning_rate": 1.1103612961031527e-06, + "loss": 0.3179, + "step": 8757 + }, + { + "epoch": 4.140898345153665, + "grad_norm": 2.8118138313293457, + "learning_rate": 1.1098427651413035e-06, + "loss": 0.3374, + "step": 8758 + }, + { + "epoch": 4.141371158392435, + "grad_norm": 3.104051113128662, + "learning_rate": 1.1093243207384506e-06, + "loss": 0.3202, + "step": 8759 + }, + { + "epoch": 4.141843971631205, + "grad_norm": 3.12392520904541, + "learning_rate": 1.1088059629268744e-06, + "loss": 0.3567, + "step": 8760 + }, + { + "epoch": 4.142316784869976, + "grad_norm": 3.467481851577759, + "learning_rate": 1.1082876917388497e-06, + "loss": 0.4148, + "step": 8761 + }, + { + "epoch": 4.142789598108747, + "grad_norm": 3.1120564937591553, + "learning_rate": 1.1077695072066488e-06, + "loss": 0.3838, + "step": 8762 + }, + { + "epoch": 4.143262411347518, + "grad_norm": 3.028073310852051, + "learning_rate": 1.107251409362535e-06, + "loss": 0.3703, + "step": 8763 + }, + { + "epoch": 4.143735224586289, + "grad_norm": 3.091510057449341, + "learning_rate": 1.1067333982387699e-06, + "loss": 0.3695, + "step": 8764 + }, + { + "epoch": 4.144208037825059, + "grad_norm": 3.9426586627960205, + "learning_rate": 1.1062154738676067e-06, + "loss": 0.3934, + "step": 8765 + }, + { + "epoch": 4.14468085106383, + "grad_norm": 2.923741102218628, + "learning_rate": 
1.1056976362812939e-06, + "loss": 0.3679, + "step": 8766 + }, + { + "epoch": 4.1451536643026, + "grad_norm": 3.1010327339172363, + "learning_rate": 1.1051798855120757e-06, + "loss": 0.3314, + "step": 8767 + }, + { + "epoch": 4.145626477541371, + "grad_norm": 2.9165778160095215, + "learning_rate": 1.1046622215921896e-06, + "loss": 0.3473, + "step": 8768 + }, + { + "epoch": 4.146099290780142, + "grad_norm": 2.8494462966918945, + "learning_rate": 1.1041446445538692e-06, + "loss": 0.3226, + "step": 8769 + }, + { + "epoch": 4.1465721040189125, + "grad_norm": 2.53379225730896, + "learning_rate": 1.1036271544293412e-06, + "loss": 0.3123, + "step": 8770 + }, + { + "epoch": 4.147044917257683, + "grad_norm": 3.0433695316314697, + "learning_rate": 1.1031097512508274e-06, + "loss": 0.37, + "step": 8771 + }, + { + "epoch": 4.147517730496454, + "grad_norm": 3.418458938598633, + "learning_rate": 1.1025924350505431e-06, + "loss": 0.3266, + "step": 8772 + }, + { + "epoch": 4.147990543735225, + "grad_norm": 2.843733787536621, + "learning_rate": 1.1020752058607017e-06, + "loss": 0.3548, + "step": 8773 + }, + { + "epoch": 4.148463356973995, + "grad_norm": 3.122965097427368, + "learning_rate": 1.1015580637135073e-06, + "loss": 0.3214, + "step": 8774 + }, + { + "epoch": 4.148936170212766, + "grad_norm": 3.0042455196380615, + "learning_rate": 1.1010410086411601e-06, + "loss": 0.3395, + "step": 8775 + }, + { + "epoch": 4.149408983451536, + "grad_norm": 2.841426372528076, + "learning_rate": 1.1005240406758546e-06, + "loss": 0.3381, + "step": 8776 + }, + { + "epoch": 4.149881796690307, + "grad_norm": 2.8241262435913086, + "learning_rate": 1.100007159849781e-06, + "loss": 0.3504, + "step": 8777 + }, + { + "epoch": 4.150354609929078, + "grad_norm": 2.5685677528381348, + "learning_rate": 1.0994903661951223e-06, + "loss": 0.309, + "step": 8778 + }, + { + "epoch": 4.150827423167849, + "grad_norm": 3.197665214538574, + "learning_rate": 1.0989736597440581e-06, + "loss": 0.3722, + "step": 8779 + }, + { + "epoch": 4.15130023640662, + "grad_norm": 3.1483469009399414, + "learning_rate": 1.098457040528761e-06, + "loss": 0.3301, + "step": 8780 + }, + { + "epoch": 4.15177304964539, + "grad_norm": 2.8838415145874023, + "learning_rate": 1.0979405085813972e-06, + "loss": 0.3212, + "step": 8781 + }, + { + "epoch": 4.152245862884161, + "grad_norm": 3.1998705863952637, + "learning_rate": 1.0974240639341312e-06, + "loss": 0.3557, + "step": 8782 + }, + { + "epoch": 4.152718676122931, + "grad_norm": 2.9004411697387695, + "learning_rate": 1.0969077066191187e-06, + "loss": 0.351, + "step": 8783 + }, + { + "epoch": 4.153191489361702, + "grad_norm": 3.036574125289917, + "learning_rate": 1.0963914366685096e-06, + "loss": 0.3762, + "step": 8784 + }, + { + "epoch": 4.153664302600473, + "grad_norm": 3.6683623790740967, + "learning_rate": 1.0958752541144523e-06, + "loss": 0.3938, + "step": 8785 + }, + { + "epoch": 4.1541371158392435, + "grad_norm": 2.922271490097046, + "learning_rate": 1.0953591589890852e-06, + "loss": 0.3375, + "step": 8786 + }, + { + "epoch": 4.154609929078014, + "grad_norm": 3.1750547885894775, + "learning_rate": 1.094843151324545e-06, + "loss": 0.3455, + "step": 8787 + }, + { + "epoch": 4.155082742316785, + "grad_norm": 2.7836148738861084, + "learning_rate": 1.0943272311529602e-06, + "loss": 0.3359, + "step": 8788 + }, + { + "epoch": 4.155555555555556, + "grad_norm": 3.4582557678222656, + "learning_rate": 1.0938113985064553e-06, + "loss": 0.3358, + "step": 8789 + }, + { + "epoch": 4.156028368794326, + "grad_norm": 
3.0436923503875732, + "learning_rate": 1.0932956534171483e-06, + "loss": 0.3531, + "step": 8790 + }, + { + "epoch": 4.156501182033097, + "grad_norm": 3.1420092582702637, + "learning_rate": 1.092779995917152e-06, + "loss": 0.3917, + "step": 8791 + }, + { + "epoch": 4.156973995271867, + "grad_norm": 2.9556260108947754, + "learning_rate": 1.0922644260385756e-06, + "loss": 0.3259, + "step": 8792 + }, + { + "epoch": 4.157446808510638, + "grad_norm": 2.8876030445098877, + "learning_rate": 1.091748943813521e-06, + "loss": 0.3447, + "step": 8793 + }, + { + "epoch": 4.157919621749409, + "grad_norm": 3.039207696914673, + "learning_rate": 1.0912335492740836e-06, + "loss": 0.3216, + "step": 8794 + }, + { + "epoch": 4.15839243498818, + "grad_norm": 2.852355480194092, + "learning_rate": 1.0907182424523568e-06, + "loss": 0.2906, + "step": 8795 + }, + { + "epoch": 4.158865248226951, + "grad_norm": 2.931675434112549, + "learning_rate": 1.0902030233804245e-06, + "loss": 0.3124, + "step": 8796 + }, + { + "epoch": 4.159338061465721, + "grad_norm": 3.010590076446533, + "learning_rate": 1.0896878920903691e-06, + "loss": 0.283, + "step": 8797 + }, + { + "epoch": 4.159810874704492, + "grad_norm": 3.093153953552246, + "learning_rate": 1.0891728486142648e-06, + "loss": 0.3269, + "step": 8798 + }, + { + "epoch": 4.160283687943262, + "grad_norm": 2.93019437789917, + "learning_rate": 1.0886578929841798e-06, + "loss": 0.3261, + "step": 8799 + }, + { + "epoch": 4.160756501182033, + "grad_norm": 3.176790475845337, + "learning_rate": 1.0881430252321803e-06, + "loss": 0.2805, + "step": 8800 + }, + { + "epoch": 4.161229314420804, + "grad_norm": 3.215359687805176, + "learning_rate": 1.0876282453903228e-06, + "loss": 0.4022, + "step": 8801 + }, + { + "epoch": 4.1617021276595745, + "grad_norm": 3.3343284130096436, + "learning_rate": 1.0871135534906623e-06, + "loss": 0.3469, + "step": 8802 + }, + { + "epoch": 4.162174940898345, + "grad_norm": 3.030043363571167, + "learning_rate": 1.0865989495652456e-06, + "loss": 0.3548, + "step": 8803 + }, + { + "epoch": 4.162647754137116, + "grad_norm": 2.9456260204315186, + "learning_rate": 1.0860844336461146e-06, + "loss": 0.3356, + "step": 8804 + }, + { + "epoch": 4.163120567375887, + "grad_norm": 2.9399044513702393, + "learning_rate": 1.0855700057653063e-06, + "loss": 0.362, + "step": 8805 + }, + { + "epoch": 4.163593380614657, + "grad_norm": 3.3188061714172363, + "learning_rate": 1.0850556659548513e-06, + "loss": 0.3866, + "step": 8806 + }, + { + "epoch": 4.164066193853428, + "grad_norm": 3.1601030826568604, + "learning_rate": 1.084541414246775e-06, + "loss": 0.3662, + "step": 8807 + }, + { + "epoch": 4.164539007092198, + "grad_norm": 3.0458695888519287, + "learning_rate": 1.0840272506730993e-06, + "loss": 0.3318, + "step": 8808 + }, + { + "epoch": 4.165011820330969, + "grad_norm": 3.056387186050415, + "learning_rate": 1.0835131752658365e-06, + "loss": 0.3538, + "step": 8809 + }, + { + "epoch": 4.16548463356974, + "grad_norm": 2.9833531379699707, + "learning_rate": 1.0829991880569984e-06, + "loss": 0.3088, + "step": 8810 + }, + { + "epoch": 4.165957446808511, + "grad_norm": 3.325438976287842, + "learning_rate": 1.0824852890785876e-06, + "loss": 0.3524, + "step": 8811 + }, + { + "epoch": 4.166430260047282, + "grad_norm": 2.781290054321289, + "learning_rate": 1.0819714783626009e-06, + "loss": 0.3925, + "step": 8812 + }, + { + "epoch": 4.166903073286052, + "grad_norm": 16.3265323638916, + "learning_rate": 1.0814577559410336e-06, + "loss": 0.4248, + "step": 8813 + }, + { + "epoch": 
4.167375886524822, + "grad_norm": 2.906619071960449, + "learning_rate": 1.0809441218458708e-06, + "loss": 0.3904, + "step": 8814 + }, + { + "epoch": 4.167848699763593, + "grad_norm": 2.7133800983428955, + "learning_rate": 1.0804305761090957e-06, + "loss": 0.2855, + "step": 8815 + }, + { + "epoch": 4.168321513002364, + "grad_norm": 3.252946376800537, + "learning_rate": 1.0799171187626844e-06, + "loss": 0.3285, + "step": 8816 + }, + { + "epoch": 4.168794326241135, + "grad_norm": 3.0832788944244385, + "learning_rate": 1.0794037498386062e-06, + "loss": 0.3175, + "step": 8817 + }, + { + "epoch": 4.1692671394799055, + "grad_norm": 3.046424150466919, + "learning_rate": 1.0788904693688284e-06, + "loss": 0.3545, + "step": 8818 + }, + { + "epoch": 4.169739952718676, + "grad_norm": 3.643488645553589, + "learning_rate": 1.0783772773853095e-06, + "loss": 0.3889, + "step": 8819 + }, + { + "epoch": 4.170212765957447, + "grad_norm": 3.433997392654419, + "learning_rate": 1.077864173920004e-06, + "loss": 0.311, + "step": 8820 + }, + { + "epoch": 4.170685579196218, + "grad_norm": 3.287684679031372, + "learning_rate": 1.0773511590048605e-06, + "loss": 0.3708, + "step": 8821 + }, + { + "epoch": 4.171158392434988, + "grad_norm": 3.5546534061431885, + "learning_rate": 1.0768382326718212e-06, + "loss": 0.3845, + "step": 8822 + }, + { + "epoch": 4.171631205673759, + "grad_norm": 3.2245540618896484, + "learning_rate": 1.076325394952826e-06, + "loss": 0.4412, + "step": 8823 + }, + { + "epoch": 4.172104018912529, + "grad_norm": 3.199784994125366, + "learning_rate": 1.0758126458798046e-06, + "loss": 0.3635, + "step": 8824 + }, + { + "epoch": 4.1725768321513, + "grad_norm": 2.961003303527832, + "learning_rate": 1.075299985484686e-06, + "loss": 0.3167, + "step": 8825 + }, + { + "epoch": 4.173049645390071, + "grad_norm": 2.8316452503204346, + "learning_rate": 1.07478741379939e-06, + "loss": 0.3342, + "step": 8826 + }, + { + "epoch": 4.173522458628842, + "grad_norm": 3.0721595287323, + "learning_rate": 1.0742749308558316e-06, + "loss": 0.3642, + "step": 8827 + }, + { + "epoch": 4.1739952718676125, + "grad_norm": 3.001324415206909, + "learning_rate": 1.0737625366859225e-06, + "loss": 0.3479, + "step": 8828 + }, + { + "epoch": 4.174468085106383, + "grad_norm": 3.199108839035034, + "learning_rate": 1.0732502313215665e-06, + "loss": 0.3434, + "step": 8829 + }, + { + "epoch": 4.174940898345153, + "grad_norm": 3.602139472961426, + "learning_rate": 1.072738014794661e-06, + "loss": 0.401, + "step": 8830 + }, + { + "epoch": 4.175413711583924, + "grad_norm": 3.2303357124328613, + "learning_rate": 1.0722258871371025e-06, + "loss": 0.3603, + "step": 8831 + }, + { + "epoch": 4.175886524822695, + "grad_norm": 3.138611316680908, + "learning_rate": 1.0717138483807766e-06, + "loss": 0.3481, + "step": 8832 + }, + { + "epoch": 4.176359338061466, + "grad_norm": 3.059134006500244, + "learning_rate": 1.071201898557567e-06, + "loss": 0.357, + "step": 8833 + }, + { + "epoch": 4.176832151300236, + "grad_norm": 3.237121820449829, + "learning_rate": 1.0706900376993501e-06, + "loss": 0.3424, + "step": 8834 + }, + { + "epoch": 4.177304964539007, + "grad_norm": 3.1065425872802734, + "learning_rate": 1.0701782658379974e-06, + "loss": 0.3506, + "step": 8835 + }, + { + "epoch": 4.177777777777778, + "grad_norm": 2.9971365928649902, + "learning_rate": 1.0696665830053743e-06, + "loss": 0.3205, + "step": 8836 + }, + { + "epoch": 4.178250591016549, + "grad_norm": 3.2898313999176025, + "learning_rate": 1.0691549892333406e-06, + "loss": 0.3297, + "step": 8837 + 
}, + { + "epoch": 4.178723404255319, + "grad_norm": 3.166144609451294, + "learning_rate": 1.0686434845537525e-06, + "loss": 0.3097, + "step": 8838 + }, + { + "epoch": 4.1791962174940895, + "grad_norm": 2.9629571437835693, + "learning_rate": 1.0681320689984581e-06, + "loss": 0.3709, + "step": 8839 + }, + { + "epoch": 4.17966903073286, + "grad_norm": 3.2954351902008057, + "learning_rate": 1.0676207425993004e-06, + "loss": 0.3448, + "step": 8840 + }, + { + "epoch": 4.180141843971631, + "grad_norm": 2.8537824153900146, + "learning_rate": 1.0671095053881194e-06, + "loss": 0.3069, + "step": 8841 + }, + { + "epoch": 4.180614657210402, + "grad_norm": 3.382916212081909, + "learning_rate": 1.0665983573967453e-06, + "loss": 0.3909, + "step": 8842 + }, + { + "epoch": 4.181087470449173, + "grad_norm": 3.4717860221862793, + "learning_rate": 1.0660872986570072e-06, + "loss": 0.3641, + "step": 8843 + }, + { + "epoch": 4.1815602836879435, + "grad_norm": 3.088916778564453, + "learning_rate": 1.0655763292007256e-06, + "loss": 0.3184, + "step": 8844 + }, + { + "epoch": 4.182033096926714, + "grad_norm": 2.8693177700042725, + "learning_rate": 1.065065449059715e-06, + "loss": 0.3486, + "step": 8845 + }, + { + "epoch": 4.182505910165484, + "grad_norm": 3.162811517715454, + "learning_rate": 1.0645546582657881e-06, + "loss": 0.3559, + "step": 8846 + }, + { + "epoch": 4.182978723404255, + "grad_norm": 3.8519816398620605, + "learning_rate": 1.0640439568507475e-06, + "loss": 0.4159, + "step": 8847 + }, + { + "epoch": 4.183451536643026, + "grad_norm": 2.9316959381103516, + "learning_rate": 1.063533344846394e-06, + "loss": 0.34, + "step": 8848 + }, + { + "epoch": 4.183924349881797, + "grad_norm": 3.018986463546753, + "learning_rate": 1.0630228222845205e-06, + "loss": 0.3378, + "step": 8849 + }, + { + "epoch": 4.184397163120567, + "grad_norm": 2.949428081512451, + "learning_rate": 1.062512389196914e-06, + "loss": 0.3634, + "step": 8850 + }, + { + "epoch": 4.184869976359338, + "grad_norm": 3.3298749923706055, + "learning_rate": 1.0620020456153585e-06, + "loss": 0.3067, + "step": 8851 + }, + { + "epoch": 4.185342789598109, + "grad_norm": 3.0566864013671875, + "learning_rate": 1.0614917915716302e-06, + "loss": 0.3534, + "step": 8852 + }, + { + "epoch": 4.18581560283688, + "grad_norm": 3.156620979309082, + "learning_rate": 1.0609816270975007e-06, + "loss": 0.3684, + "step": 8853 + }, + { + "epoch": 4.18628841607565, + "grad_norm": 3.0776474475860596, + "learning_rate": 1.0604715522247352e-06, + "loss": 0.3616, + "step": 8854 + }, + { + "epoch": 4.1867612293144205, + "grad_norm": 3.1254587173461914, + "learning_rate": 1.059961566985093e-06, + "loss": 0.3455, + "step": 8855 + }, + { + "epoch": 4.187234042553191, + "grad_norm": 2.8769783973693848, + "learning_rate": 1.0594516714103306e-06, + "loss": 0.2754, + "step": 8856 + }, + { + "epoch": 4.187706855791962, + "grad_norm": 3.461308240890503, + "learning_rate": 1.0589418655321962e-06, + "loss": 0.3744, + "step": 8857 + }, + { + "epoch": 4.188179669030733, + "grad_norm": 3.3546712398529053, + "learning_rate": 1.0584321493824317e-06, + "loss": 0.4116, + "step": 8858 + }, + { + "epoch": 4.188652482269504, + "grad_norm": 3.233792543411255, + "learning_rate": 1.0579225229927775e-06, + "loss": 0.3591, + "step": 8859 + }, + { + "epoch": 4.1891252955082745, + "grad_norm": 3.295444965362549, + "learning_rate": 1.0574129863949633e-06, + "loss": 0.3179, + "step": 8860 + }, + { + "epoch": 4.189598108747045, + "grad_norm": 3.403062105178833, + "learning_rate": 1.0569035396207178e-06, + 
"loss": 0.3948, + "step": 8861 + }, + { + "epoch": 4.190070921985815, + "grad_norm": 2.901970148086548, + "learning_rate": 1.0563941827017613e-06, + "loss": 0.3537, + "step": 8862 + }, + { + "epoch": 4.190543735224586, + "grad_norm": 3.1239142417907715, + "learning_rate": 1.0558849156698078e-06, + "loss": 0.3764, + "step": 8863 + }, + { + "epoch": 4.191016548463357, + "grad_norm": 2.8480169773101807, + "learning_rate": 1.0553757385565694e-06, + "loss": 0.3085, + "step": 8864 + }, + { + "epoch": 4.191489361702128, + "grad_norm": 3.0914061069488525, + "learning_rate": 1.0548666513937487e-06, + "loss": 0.3003, + "step": 8865 + }, + { + "epoch": 4.191962174940898, + "grad_norm": 2.9875683784484863, + "learning_rate": 1.0543576542130452e-06, + "loss": 0.3178, + "step": 8866 + }, + { + "epoch": 4.192434988179669, + "grad_norm": 2.952052354812622, + "learning_rate": 1.053848747046152e-06, + "loss": 0.3221, + "step": 8867 + }, + { + "epoch": 4.19290780141844, + "grad_norm": 3.2211997509002686, + "learning_rate": 1.0533399299247559e-06, + "loss": 0.3698, + "step": 8868 + }, + { + "epoch": 4.193380614657211, + "grad_norm": 3.2954046726226807, + "learning_rate": 1.0528312028805392e-06, + "loss": 0.3697, + "step": 8869 + }, + { + "epoch": 4.193853427895981, + "grad_norm": 2.978306293487549, + "learning_rate": 1.0523225659451768e-06, + "loss": 0.3358, + "step": 8870 + }, + { + "epoch": 4.1943262411347515, + "grad_norm": 3.3803653717041016, + "learning_rate": 1.0518140191503415e-06, + "loss": 0.3851, + "step": 8871 + }, + { + "epoch": 4.194799054373522, + "grad_norm": 3.282294273376465, + "learning_rate": 1.051305562527697e-06, + "loss": 0.4518, + "step": 8872 + }, + { + "epoch": 4.195271867612293, + "grad_norm": 2.950310468673706, + "learning_rate": 1.0507971961089017e-06, + "loss": 0.3045, + "step": 8873 + }, + { + "epoch": 4.195744680851064, + "grad_norm": 3.4069037437438965, + "learning_rate": 1.0502889199256114e-06, + "loss": 0.3832, + "step": 8874 + }, + { + "epoch": 4.196217494089835, + "grad_norm": 3.1440858840942383, + "learning_rate": 1.0497807340094722e-06, + "loss": 0.2958, + "step": 8875 + }, + { + "epoch": 4.1966903073286055, + "grad_norm": 3.050755262374878, + "learning_rate": 1.049272638392129e-06, + "loss": 0.3494, + "step": 8876 + }, + { + "epoch": 4.197163120567376, + "grad_norm": 2.908078670501709, + "learning_rate": 1.0487646331052171e-06, + "loss": 0.349, + "step": 8877 + }, + { + "epoch": 4.197635933806146, + "grad_norm": 3.2089946269989014, + "learning_rate": 1.048256718180367e-06, + "loss": 0.3507, + "step": 8878 + }, + { + "epoch": 4.198108747044917, + "grad_norm": 2.984745740890503, + "learning_rate": 1.0477488936492067e-06, + "loss": 0.3252, + "step": 8879 + }, + { + "epoch": 4.198581560283688, + "grad_norm": 2.9207515716552734, + "learning_rate": 1.0472411595433545e-06, + "loss": 0.3192, + "step": 8880 + }, + { + "epoch": 4.199054373522459, + "grad_norm": 3.0090811252593994, + "learning_rate": 1.0467335158944242e-06, + "loss": 0.3827, + "step": 8881 + }, + { + "epoch": 4.199527186761229, + "grad_norm": 3.2763171195983887, + "learning_rate": 1.0462259627340265e-06, + "loss": 0.3481, + "step": 8882 + }, + { + "epoch": 4.2, + "grad_norm": 3.068268299102783, + "learning_rate": 1.0457185000937636e-06, + "loss": 0.3926, + "step": 8883 + }, + { + "epoch": 4.200472813238771, + "grad_norm": 2.6999998092651367, + "learning_rate": 1.0452111280052326e-06, + "loss": 0.2884, + "step": 8884 + }, + { + "epoch": 4.200945626477542, + "grad_norm": 3.1187727451324463, + "learning_rate": 
1.044703846500026e-06, + "loss": 0.3797, + "step": 8885 + }, + { + "epoch": 4.201418439716312, + "grad_norm": 2.7876172065734863, + "learning_rate": 1.0441966556097283e-06, + "loss": 0.3284, + "step": 8886 + }, + { + "epoch": 4.2018912529550825, + "grad_norm": 2.973261833190918, + "learning_rate": 1.0436895553659224e-06, + "loss": 0.2845, + "step": 8887 + }, + { + "epoch": 4.202364066193853, + "grad_norm": 3.496096611022949, + "learning_rate": 1.0431825458001811e-06, + "loss": 0.3341, + "step": 8888 + }, + { + "epoch": 4.202836879432624, + "grad_norm": 3.370410680770874, + "learning_rate": 1.0426756269440761e-06, + "loss": 0.3459, + "step": 8889 + }, + { + "epoch": 4.203309692671395, + "grad_norm": 2.864126682281494, + "learning_rate": 1.0421687988291693e-06, + "loss": 0.3195, + "step": 8890 + }, + { + "epoch": 4.203782505910166, + "grad_norm": 3.3575501441955566, + "learning_rate": 1.0416620614870181e-06, + "loss": 0.3424, + "step": 8891 + }, + { + "epoch": 4.2042553191489365, + "grad_norm": 3.4441967010498047, + "learning_rate": 1.0411554149491766e-06, + "loss": 0.3677, + "step": 8892 + }, + { + "epoch": 4.204728132387707, + "grad_norm": 3.014472007751465, + "learning_rate": 1.0406488592471898e-06, + "loss": 0.3004, + "step": 8893 + }, + { + "epoch": 4.205200945626477, + "grad_norm": 3.1186721324920654, + "learning_rate": 1.0401423944126002e-06, + "loss": 0.4182, + "step": 8894 + }, + { + "epoch": 4.205673758865248, + "grad_norm": 3.166337013244629, + "learning_rate": 1.0396360204769426e-06, + "loss": 0.3303, + "step": 8895 + }, + { + "epoch": 4.206146572104019, + "grad_norm": 3.081855058670044, + "learning_rate": 1.0391297374717454e-06, + "loss": 0.3096, + "step": 8896 + }, + { + "epoch": 4.20661938534279, + "grad_norm": 3.0924830436706543, + "learning_rate": 1.0386235454285348e-06, + "loss": 0.3238, + "step": 8897 + }, + { + "epoch": 4.20709219858156, + "grad_norm": 3.043519973754883, + "learning_rate": 1.0381174443788277e-06, + "loss": 0.3322, + "step": 8898 + }, + { + "epoch": 4.207565011820331, + "grad_norm": 3.160785675048828, + "learning_rate": 1.0376114343541377e-06, + "loss": 0.3244, + "step": 8899 + }, + { + "epoch": 4.208037825059102, + "grad_norm": 2.9988417625427246, + "learning_rate": 1.037105515385971e-06, + "loss": 0.3386, + "step": 8900 + }, + { + "epoch": 4.208510638297873, + "grad_norm": 2.981959342956543, + "learning_rate": 1.0365996875058284e-06, + "loss": 0.3412, + "step": 8901 + }, + { + "epoch": 4.208983451536643, + "grad_norm": 3.144815683364868, + "learning_rate": 1.0360939507452075e-06, + "loss": 0.3716, + "step": 8902 + }, + { + "epoch": 4.2094562647754135, + "grad_norm": 2.9644055366516113, + "learning_rate": 1.0355883051355972e-06, + "loss": 0.3488, + "step": 8903 + }, + { + "epoch": 4.209929078014184, + "grad_norm": 3.3212029933929443, + "learning_rate": 1.035082750708481e-06, + "loss": 0.3048, + "step": 8904 + }, + { + "epoch": 4.210401891252955, + "grad_norm": 2.82843279838562, + "learning_rate": 1.034577287495339e-06, + "loss": 0.3141, + "step": 8905 + }, + { + "epoch": 4.210874704491726, + "grad_norm": 3.040215253829956, + "learning_rate": 1.034071915527643e-06, + "loss": 0.3517, + "step": 8906 + }, + { + "epoch": 4.211347517730497, + "grad_norm": 2.850985288619995, + "learning_rate": 1.033566634836862e-06, + "loss": 0.3556, + "step": 8907 + }, + { + "epoch": 4.2118203309692674, + "grad_norm": 3.522962808609009, + "learning_rate": 1.0330614454544564e-06, + "loss": 0.3432, + "step": 8908 + }, + { + "epoch": 4.212293144208038, + "grad_norm": 
3.0228631496429443, + "learning_rate": 1.032556347411881e-06, + "loss": 0.3165, + "step": 8909 + }, + { + "epoch": 4.212765957446808, + "grad_norm": 3.275134563446045, + "learning_rate": 1.0320513407405886e-06, + "loss": 0.413, + "step": 8910 + }, + { + "epoch": 4.213238770685579, + "grad_norm": 2.850020408630371, + "learning_rate": 1.0315464254720213e-06, + "loss": 0.3051, + "step": 8911 + }, + { + "epoch": 4.21371158392435, + "grad_norm": 3.153916597366333, + "learning_rate": 1.0310416016376203e-06, + "loss": 0.2973, + "step": 8912 + }, + { + "epoch": 4.2141843971631205, + "grad_norm": 3.423772096633911, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.302, + "step": 8913 + }, + { + "epoch": 4.214657210401891, + "grad_norm": 3.420687198638916, + "learning_rate": 1.0300322283970404e-06, + "loss": 0.3732, + "step": 8914 + }, + { + "epoch": 4.215130023640662, + "grad_norm": 3.2490479946136475, + "learning_rate": 1.02952767905371e-06, + "loss": 0.3793, + "step": 8915 + }, + { + "epoch": 4.215602836879433, + "grad_norm": 3.3043079376220703, + "learning_rate": 1.0290232212702438e-06, + "loss": 0.3472, + "step": 8916 + }, + { + "epoch": 4.216075650118204, + "grad_norm": 3.152435779571533, + "learning_rate": 1.0285188550780516e-06, + "loss": 0.3617, + "step": 8917 + }, + { + "epoch": 4.216548463356974, + "grad_norm": 3.311063766479492, + "learning_rate": 1.0280145805085384e-06, + "loss": 0.3681, + "step": 8918 + }, + { + "epoch": 4.217021276595744, + "grad_norm": 3.1113057136535645, + "learning_rate": 1.0275103975931016e-06, + "loss": 0.3526, + "step": 8919 + }, + { + "epoch": 4.217494089834515, + "grad_norm": 2.7904412746429443, + "learning_rate": 1.0270063063631369e-06, + "loss": 0.3125, + "step": 8920 + }, + { + "epoch": 4.217966903073286, + "grad_norm": 3.3566761016845703, + "learning_rate": 1.0265023068500293e-06, + "loss": 0.3305, + "step": 8921 + }, + { + "epoch": 4.218439716312057, + "grad_norm": 2.97943115234375, + "learning_rate": 1.0259983990851633e-06, + "loss": 0.3277, + "step": 8922 + }, + { + "epoch": 4.218912529550828, + "grad_norm": 3.1507925987243652, + "learning_rate": 1.0254945830999134e-06, + "loss": 0.385, + "step": 8923 + }, + { + "epoch": 4.219385342789598, + "grad_norm": 2.632859706878662, + "learning_rate": 1.0249908589256493e-06, + "loss": 0.2889, + "step": 8924 + }, + { + "epoch": 4.219858156028369, + "grad_norm": 2.9816136360168457, + "learning_rate": 1.0244872265937378e-06, + "loss": 0.2838, + "step": 8925 + }, + { + "epoch": 4.220330969267139, + "grad_norm": 2.751431465148926, + "learning_rate": 1.0239836861355369e-06, + "loss": 0.3069, + "step": 8926 + }, + { + "epoch": 4.22080378250591, + "grad_norm": 3.3390228748321533, + "learning_rate": 1.0234802375823985e-06, + "loss": 0.3074, + "step": 8927 + }, + { + "epoch": 4.221276595744681, + "grad_norm": 3.345242500305176, + "learning_rate": 1.0229768809656726e-06, + "loss": 0.3603, + "step": 8928 + }, + { + "epoch": 4.2217494089834515, + "grad_norm": 3.0684640407562256, + "learning_rate": 1.0224736163166984e-06, + "loss": 0.3343, + "step": 8929 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 3.2813572883605957, + "learning_rate": 1.0219704436668146e-06, + "loss": 0.3173, + "step": 8930 + }, + { + "epoch": 4.222695035460993, + "grad_norm": 3.135668992996216, + "learning_rate": 1.0214673630473504e-06, + "loss": 0.3608, + "step": 8931 + }, + { + "epoch": 4.223167848699764, + "grad_norm": 2.8798727989196777, + "learning_rate": 1.0209643744896303e-06, + "loss": 0.3317, + "step": 8932 + }, + { + "epoch": 
4.223640661938534, + "grad_norm": 3.2659590244293213, + "learning_rate": 1.0204614780249731e-06, + "loss": 0.3017, + "step": 8933 + }, + { + "epoch": 4.224113475177305, + "grad_norm": 2.988126516342163, + "learning_rate": 1.0199586736846911e-06, + "loss": 0.3555, + "step": 8934 + }, + { + "epoch": 4.224586288416075, + "grad_norm": 3.3775575160980225, + "learning_rate": 1.0194559615000937e-06, + "loss": 0.3966, + "step": 8935 + }, + { + "epoch": 4.225059101654846, + "grad_norm": 3.1004798412323, + "learning_rate": 1.0189533415024817e-06, + "loss": 0.3192, + "step": 8936 + }, + { + "epoch": 4.225531914893617, + "grad_norm": 3.1722211837768555, + "learning_rate": 1.0184508137231498e-06, + "loss": 0.3075, + "step": 8937 + }, + { + "epoch": 4.226004728132388, + "grad_norm": 3.0679538249969482, + "learning_rate": 1.0179483781933903e-06, + "loss": 0.3475, + "step": 8938 + }, + { + "epoch": 4.226477541371159, + "grad_norm": 3.079246759414673, + "learning_rate": 1.0174460349444857e-06, + "loss": 0.4054, + "step": 8939 + }, + { + "epoch": 4.226950354609929, + "grad_norm": 3.308229684829712, + "learning_rate": 1.0169437840077169e-06, + "loss": 0.3455, + "step": 8940 + }, + { + "epoch": 4.2274231678487, + "grad_norm": 3.363147258758545, + "learning_rate": 1.0164416254143552e-06, + "loss": 0.3538, + "step": 8941 + }, + { + "epoch": 4.22789598108747, + "grad_norm": 2.7227768898010254, + "learning_rate": 1.0159395591956677e-06, + "loss": 0.3206, + "step": 8942 + }, + { + "epoch": 4.228368794326241, + "grad_norm": 3.0010764598846436, + "learning_rate": 1.0154375853829175e-06, + "loss": 0.3593, + "step": 8943 + }, + { + "epoch": 4.228841607565012, + "grad_norm": 3.0478785037994385, + "learning_rate": 1.0149357040073581e-06, + "loss": 0.3808, + "step": 8944 + }, + { + "epoch": 4.2293144208037825, + "grad_norm": 2.804421901702881, + "learning_rate": 1.0144339151002416e-06, + "loss": 0.3633, + "step": 8945 + }, + { + "epoch": 4.229787234042553, + "grad_norm": 3.563140630722046, + "learning_rate": 1.013932218692811e-06, + "loss": 0.2843, + "step": 8946 + }, + { + "epoch": 4.230260047281324, + "grad_norm": 3.1959750652313232, + "learning_rate": 1.0134306148163051e-06, + "loss": 0.377, + "step": 8947 + }, + { + "epoch": 4.230732860520095, + "grad_norm": 3.0841214656829834, + "learning_rate": 1.0129291035019565e-06, + "loss": 0.3234, + "step": 8948 + }, + { + "epoch": 4.231205673758865, + "grad_norm": 3.3893179893493652, + "learning_rate": 1.0124276847809911e-06, + "loss": 0.3823, + "step": 8949 + }, + { + "epoch": 4.231678486997636, + "grad_norm": 3.2250518798828125, + "learning_rate": 1.0119263586846316e-06, + "loss": 0.35, + "step": 8950 + }, + { + "epoch": 4.232151300236406, + "grad_norm": 3.287285566329956, + "learning_rate": 1.0114251252440928e-06, + "loss": 0.3306, + "step": 8951 + }, + { + "epoch": 4.232624113475177, + "grad_norm": 3.5018274784088135, + "learning_rate": 1.0109239844905836e-06, + "loss": 0.378, + "step": 8952 + }, + { + "epoch": 4.233096926713948, + "grad_norm": 3.224838972091675, + "learning_rate": 1.0104229364553093e-06, + "loss": 0.3379, + "step": 8953 + }, + { + "epoch": 4.233569739952719, + "grad_norm": 3.2302494049072266, + "learning_rate": 1.0099219811694668e-06, + "loss": 0.358, + "step": 8954 + }, + { + "epoch": 4.23404255319149, + "grad_norm": 3.098205804824829, + "learning_rate": 1.0094211186642483e-06, + "loss": 0.3669, + "step": 8955 + }, + { + "epoch": 4.23451536643026, + "grad_norm": 3.0045907497406006, + "learning_rate": 1.0089203489708415e-06, + "loss": 0.3293, + "step": 
8956 + }, + { + "epoch": 4.234988179669031, + "grad_norm": 3.245818853378296, + "learning_rate": 1.0084196721204254e-06, + "loss": 0.3365, + "step": 8957 + }, + { + "epoch": 4.235460992907801, + "grad_norm": 2.8547208309173584, + "learning_rate": 1.007919088144177e-06, + "loss": 0.3072, + "step": 8958 + }, + { + "epoch": 4.235933806146572, + "grad_norm": 3.2914109230041504, + "learning_rate": 1.0074185970732642e-06, + "loss": 0.3734, + "step": 8959 + }, + { + "epoch": 4.236406619385343, + "grad_norm": 2.527096748352051, + "learning_rate": 1.0069181989388496e-06, + "loss": 0.3091, + "step": 8960 + }, + { + "epoch": 4.2368794326241135, + "grad_norm": 2.921369791030884, + "learning_rate": 1.006417893772093e-06, + "loss": 0.3162, + "step": 8961 + }, + { + "epoch": 4.237352245862884, + "grad_norm": 2.8698911666870117, + "learning_rate": 1.005917681604145e-06, + "loss": 0.3818, + "step": 8962 + }, + { + "epoch": 4.237825059101655, + "grad_norm": 2.958021402359009, + "learning_rate": 1.0054175624661514e-06, + "loss": 0.2934, + "step": 8963 + }, + { + "epoch": 4.238297872340426, + "grad_norm": 3.0945863723754883, + "learning_rate": 1.0049175363892527e-06, + "loss": 0.384, + "step": 8964 + }, + { + "epoch": 4.238770685579196, + "grad_norm": 2.890333890914917, + "learning_rate": 1.0044176034045822e-06, + "loss": 0.3312, + "step": 8965 + }, + { + "epoch": 4.239243498817967, + "grad_norm": 3.460975408554077, + "learning_rate": 1.0039177635432706e-06, + "loss": 0.4015, + "step": 8966 + }, + { + "epoch": 4.239716312056737, + "grad_norm": 3.5411946773529053, + "learning_rate": 1.003418016836439e-06, + "loss": 0.3307, + "step": 8967 + }, + { + "epoch": 4.240189125295508, + "grad_norm": 3.3310446739196777, + "learning_rate": 1.0029183633152061e-06, + "loss": 0.3154, + "step": 8968 + }, + { + "epoch": 4.240661938534279, + "grad_norm": 3.121110677719116, + "learning_rate": 1.0024188030106822e-06, + "loss": 0.3827, + "step": 8969 + }, + { + "epoch": 4.24113475177305, + "grad_norm": 3.421278715133667, + "learning_rate": 1.0019193359539717e-06, + "loss": 0.3677, + "step": 8970 + }, + { + "epoch": 4.241607565011821, + "grad_norm": 3.1790332794189453, + "learning_rate": 1.0014199621761761e-06, + "loss": 0.3219, + "step": 8971 + }, + { + "epoch": 4.242080378250591, + "grad_norm": 3.238412380218506, + "learning_rate": 1.0009206817083878e-06, + "loss": 0.3296, + "step": 8972 + }, + { + "epoch": 4.242553191489361, + "grad_norm": 3.0206923484802246, + "learning_rate": 1.0004214945816959e-06, + "loss": 0.3769, + "step": 8973 + }, + { + "epoch": 4.243026004728132, + "grad_norm": 3.2117667198181152, + "learning_rate": 9.999224008271822e-07, + "loss": 0.3876, + "step": 8974 + }, + { + "epoch": 4.243498817966903, + "grad_norm": 2.849250316619873, + "learning_rate": 9.99423400475922e-07, + "loss": 0.3267, + "step": 8975 + }, + { + "epoch": 4.243971631205674, + "grad_norm": 3.084845542907715, + "learning_rate": 9.989244935589878e-07, + "loss": 0.3074, + "step": 8976 + }, + { + "epoch": 4.2444444444444445, + "grad_norm": 3.0177342891693115, + "learning_rate": 9.984256801074434e-07, + "loss": 0.3524, + "step": 8977 + }, + { + "epoch": 4.244917257683215, + "grad_norm": 3.196692943572998, + "learning_rate": 9.979269601523477e-07, + "loss": 0.3943, + "step": 8978 + }, + { + "epoch": 4.245390070921986, + "grad_norm": 2.849760055541992, + "learning_rate": 9.97428333724753e-07, + "loss": 0.3145, + "step": 8979 + }, + { + "epoch": 4.245862884160757, + "grad_norm": 3.003265857696533, + "learning_rate": 9.969298008557083e-07, + "loss": 
0.3393, + "step": 8980 + }, + { + "epoch": 4.246335697399527, + "grad_norm": 2.925597667694092, + "learning_rate": 9.96431361576254e-07, + "loss": 0.331, + "step": 8981 + }, + { + "epoch": 4.246808510638298, + "grad_norm": 2.87599515914917, + "learning_rate": 9.959330159174257e-07, + "loss": 0.3102, + "step": 8982 + }, + { + "epoch": 4.247281323877068, + "grad_norm": 2.841588020324707, + "learning_rate": 9.954347639102528e-07, + "loss": 0.3261, + "step": 8983 + }, + { + "epoch": 4.247754137115839, + "grad_norm": 3.14918851852417, + "learning_rate": 9.949366055857605e-07, + "loss": 0.3366, + "step": 8984 + }, + { + "epoch": 4.24822695035461, + "grad_norm": 3.113927125930786, + "learning_rate": 9.944385409749654e-07, + "loss": 0.3532, + "step": 8985 + }, + { + "epoch": 4.248699763593381, + "grad_norm": 3.0749151706695557, + "learning_rate": 9.939405701088818e-07, + "loss": 0.3659, + "step": 8986 + }, + { + "epoch": 4.2491725768321515, + "grad_norm": 2.831846237182617, + "learning_rate": 9.934426930185145e-07, + "loss": 0.2965, + "step": 8987 + }, + { + "epoch": 4.249645390070922, + "grad_norm": 3.0280253887176514, + "learning_rate": 9.929449097348642e-07, + "loss": 0.3144, + "step": 8988 + }, + { + "epoch": 4.250118203309692, + "grad_norm": 3.250284433364868, + "learning_rate": 9.924472202889267e-07, + "loss": 0.3414, + "step": 8989 + }, + { + "epoch": 4.250591016548463, + "grad_norm": 3.582306146621704, + "learning_rate": 9.9194962471169e-07, + "loss": 0.3593, + "step": 8990 + }, + { + "epoch": 4.251063829787234, + "grad_norm": 2.8985490798950195, + "learning_rate": 9.914521230341382e-07, + "loss": 0.2948, + "step": 8991 + }, + { + "epoch": 4.251536643026005, + "grad_norm": 3.399209499359131, + "learning_rate": 9.909547152872476e-07, + "loss": 0.3942, + "step": 8992 + }, + { + "epoch": 4.2520094562647754, + "grad_norm": 3.344658613204956, + "learning_rate": 9.904574015019895e-07, + "loss": 0.3649, + "step": 8993 + }, + { + "epoch": 4.252482269503546, + "grad_norm": 3.057995319366455, + "learning_rate": 9.899601817093305e-07, + "loss": 0.317, + "step": 8994 + }, + { + "epoch": 4.252955082742317, + "grad_norm": 3.4610090255737305, + "learning_rate": 9.894630559402296e-07, + "loss": 0.3235, + "step": 8995 + }, + { + "epoch": 4.253427895981088, + "grad_norm": 3.6014657020568848, + "learning_rate": 9.889660242256407e-07, + "loss": 0.3057, + "step": 8996 + }, + { + "epoch": 4.253900709219858, + "grad_norm": 2.850391149520874, + "learning_rate": 9.884690865965118e-07, + "loss": 0.3584, + "step": 8997 + }, + { + "epoch": 4.2543735224586285, + "grad_norm": 3.100820541381836, + "learning_rate": 9.879722430837844e-07, + "loss": 0.3802, + "step": 8998 + }, + { + "epoch": 4.254846335697399, + "grad_norm": 3.1044704914093018, + "learning_rate": 9.874754937183962e-07, + "loss": 0.3293, + "step": 8999 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 2.750356912612915, + "learning_rate": 9.869788385312764e-07, + "loss": 0.3218, + "step": 9000 + }, + { + "epoch": 4.255791962174941, + "grad_norm": 3.1008687019348145, + "learning_rate": 9.864822775533494e-07, + "loss": 0.3316, + "step": 9001 + }, + { + "epoch": 4.256264775413712, + "grad_norm": 3.2051985263824463, + "learning_rate": 9.859858108155351e-07, + "loss": 0.3661, + "step": 9002 + }, + { + "epoch": 4.2567375886524825, + "grad_norm": 3.1303839683532715, + "learning_rate": 9.854894383487448e-07, + "loss": 0.3683, + "step": 9003 + }, + { + "epoch": 4.257210401891253, + "grad_norm": 3.0718302726745605, + "learning_rate": 9.84993160183887e-07, + 
"loss": 0.3284, + "step": 9004 + }, + { + "epoch": 4.257683215130023, + "grad_norm": 2.9759013652801514, + "learning_rate": 9.844969763518625e-07, + "loss": 0.3465, + "step": 9005 + }, + { + "epoch": 4.258156028368794, + "grad_norm": 3.1965582370758057, + "learning_rate": 9.840008868835647e-07, + "loss": 0.3593, + "step": 9006 + }, + { + "epoch": 4.258628841607565, + "grad_norm": 3.2931249141693115, + "learning_rate": 9.835048918098853e-07, + "loss": 0.3631, + "step": 9007 + }, + { + "epoch": 4.259101654846336, + "grad_norm": 3.070627450942993, + "learning_rate": 9.830089911617054e-07, + "loss": 0.3541, + "step": 9008 + }, + { + "epoch": 4.259574468085106, + "grad_norm": 3.209110736846924, + "learning_rate": 9.825131849699051e-07, + "loss": 0.344, + "step": 9009 + }, + { + "epoch": 4.260047281323877, + "grad_norm": 3.2239089012145996, + "learning_rate": 9.820174732653545e-07, + "loss": 0.3469, + "step": 9010 + }, + { + "epoch": 4.260520094562648, + "grad_norm": 3.0812292098999023, + "learning_rate": 9.815218560789199e-07, + "loss": 0.2898, + "step": 9011 + }, + { + "epoch": 4.260992907801419, + "grad_norm": 3.1709752082824707, + "learning_rate": 9.81026333441461e-07, + "loss": 0.381, + "step": 9012 + }, + { + "epoch": 4.261465721040189, + "grad_norm": 3.1551907062530518, + "learning_rate": 9.805309053838308e-07, + "loss": 0.2959, + "step": 9013 + }, + { + "epoch": 4.2619385342789595, + "grad_norm": 3.3751494884490967, + "learning_rate": 9.800355719368793e-07, + "loss": 0.3806, + "step": 9014 + }, + { + "epoch": 4.26241134751773, + "grad_norm": 3.2392799854278564, + "learning_rate": 9.795403331314479e-07, + "loss": 0.3006, + "step": 9015 + }, + { + "epoch": 4.262884160756501, + "grad_norm": 3.1428463459014893, + "learning_rate": 9.790451889983724e-07, + "loss": 0.3212, + "step": 9016 + }, + { + "epoch": 4.263356973995272, + "grad_norm": 3.353379726409912, + "learning_rate": 9.785501395684844e-07, + "loss": 0.3555, + "step": 9017 + }, + { + "epoch": 4.263829787234043, + "grad_norm": 3.3555281162261963, + "learning_rate": 9.780551848726068e-07, + "loss": 0.3729, + "step": 9018 + }, + { + "epoch": 4.2643026004728135, + "grad_norm": 3.0275049209594727, + "learning_rate": 9.775603249415606e-07, + "loss": 0.3579, + "step": 9019 + }, + { + "epoch": 4.264775413711584, + "grad_norm": 3.2631473541259766, + "learning_rate": 9.770655598061569e-07, + "loss": 0.3755, + "step": 9020 + }, + { + "epoch": 4.265248226950354, + "grad_norm": 2.9419705867767334, + "learning_rate": 9.76570889497202e-07, + "loss": 0.28, + "step": 9021 + }, + { + "epoch": 4.265721040189125, + "grad_norm": 2.931673288345337, + "learning_rate": 9.76076314045499e-07, + "loss": 0.3497, + "step": 9022 + }, + { + "epoch": 4.266193853427896, + "grad_norm": 3.218503952026367, + "learning_rate": 9.755818334818416e-07, + "loss": 0.3775, + "step": 9023 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 3.1422977447509766, + "learning_rate": 9.750874478370181e-07, + "loss": 0.3957, + "step": 9024 + }, + { + "epoch": 4.267139479905437, + "grad_norm": 3.066502571105957, + "learning_rate": 9.745931571418134e-07, + "loss": 0.3361, + "step": 9025 + }, + { + "epoch": 4.267612293144208, + "grad_norm": 3.186897039413452, + "learning_rate": 9.740989614270044e-07, + "loss": 0.3794, + "step": 9026 + }, + { + "epoch": 4.268085106382979, + "grad_norm": 3.2698588371276855, + "learning_rate": 9.736048607233623e-07, + "loss": 0.3595, + "step": 9027 + }, + { + "epoch": 4.26855791962175, + "grad_norm": 2.9609718322753906, + "learning_rate": 
9.731108550616523e-07, + "loss": 0.3387, + "step": 9028 + }, + { + "epoch": 4.26903073286052, + "grad_norm": 3.10768985748291, + "learning_rate": 9.72616944472633e-07, + "loss": 0.387, + "step": 9029 + }, + { + "epoch": 4.2695035460992905, + "grad_norm": 2.8060896396636963, + "learning_rate": 9.721231289870602e-07, + "loss": 0.3132, + "step": 9030 + }, + { + "epoch": 4.269976359338061, + "grad_norm": 3.0502681732177734, + "learning_rate": 9.716294086356801e-07, + "loss": 0.3246, + "step": 9031 + }, + { + "epoch": 4.270449172576832, + "grad_norm": 2.8298611640930176, + "learning_rate": 9.711357834492356e-07, + "loss": 0.2958, + "step": 9032 + }, + { + "epoch": 4.270921985815603, + "grad_norm": 2.693819761276245, + "learning_rate": 9.70642253458462e-07, + "loss": 0.325, + "step": 9033 + }, + { + "epoch": 4.271394799054374, + "grad_norm": 2.8179452419281006, + "learning_rate": 9.701488186940885e-07, + "loss": 0.3252, + "step": 9034 + }, + { + "epoch": 4.2718676122931445, + "grad_norm": 2.9885077476501465, + "learning_rate": 9.696554791868406e-07, + "loss": 0.3234, + "step": 9035 + }, + { + "epoch": 4.272340425531915, + "grad_norm": 4.8119378089904785, + "learning_rate": 9.691622349674349e-07, + "loss": 0.3814, + "step": 9036 + }, + { + "epoch": 4.272813238770685, + "grad_norm": 3.971498966217041, + "learning_rate": 9.68669086066585e-07, + "loss": 0.3684, + "step": 9037 + }, + { + "epoch": 4.273286052009456, + "grad_norm": 3.0153439044952393, + "learning_rate": 9.681760325149967e-07, + "loss": 0.3449, + "step": 9038 + }, + { + "epoch": 4.273758865248227, + "grad_norm": 3.4421799182891846, + "learning_rate": 9.676830743433688e-07, + "loss": 0.3578, + "step": 9039 + }, + { + "epoch": 4.274231678486998, + "grad_norm": 3.2896533012390137, + "learning_rate": 9.67190211582398e-07, + "loss": 0.3421, + "step": 9040 + }, + { + "epoch": 4.274704491725768, + "grad_norm": 3.388833522796631, + "learning_rate": 9.666974442627717e-07, + "loss": 0.4089, + "step": 9041 + }, + { + "epoch": 4.275177304964539, + "grad_norm": 3.1000685691833496, + "learning_rate": 9.662047724151718e-07, + "loss": 0.4046, + "step": 9042 + }, + { + "epoch": 4.27565011820331, + "grad_norm": 3.5651235580444336, + "learning_rate": 9.657121960702753e-07, + "loss": 0.4275, + "step": 9043 + }, + { + "epoch": 4.276122931442081, + "grad_norm": 2.944434881210327, + "learning_rate": 9.65219715258752e-07, + "loss": 0.3395, + "step": 9044 + }, + { + "epoch": 4.276595744680851, + "grad_norm": 2.7315311431884766, + "learning_rate": 9.64727330011268e-07, + "loss": 0.3305, + "step": 9045 + }, + { + "epoch": 4.2770685579196215, + "grad_norm": 3.423567533493042, + "learning_rate": 9.642350403584805e-07, + "loss": 0.3605, + "step": 9046 + }, + { + "epoch": 4.277541371158392, + "grad_norm": 3.239745616912842, + "learning_rate": 9.637428463310435e-07, + "loss": 0.3519, + "step": 9047 + }, + { + "epoch": 4.278014184397163, + "grad_norm": 3.388700008392334, + "learning_rate": 9.632507479596035e-07, + "loss": 0.359, + "step": 9048 + }, + { + "epoch": 4.278486997635934, + "grad_norm": 3.3524253368377686, + "learning_rate": 9.627587452747996e-07, + "loss": 0.3381, + "step": 9049 + }, + { + "epoch": 4.278959810874705, + "grad_norm": 3.1089365482330322, + "learning_rate": 9.622668383072695e-07, + "loss": 0.3143, + "step": 9050 + }, + { + "epoch": 4.2794326241134755, + "grad_norm": 3.3477213382720947, + "learning_rate": 9.617750270876402e-07, + "loss": 0.3788, + "step": 9051 + }, + { + "epoch": 4.279905437352246, + "grad_norm": 2.934818983078003, + 
"learning_rate": 9.612833116465342e-07, + "loss": 0.3589, + "step": 9052 + }, + { + "epoch": 4.280378250591016, + "grad_norm": 3.125014305114746, + "learning_rate": 9.607916920145704e-07, + "loss": 0.3181, + "step": 9053 + }, + { + "epoch": 4.280851063829787, + "grad_norm": 3.5860400199890137, + "learning_rate": 9.60300168222358e-07, + "loss": 0.3597, + "step": 9054 + }, + { + "epoch": 4.281323877068558, + "grad_norm": 3.1414008140563965, + "learning_rate": 9.598087403005032e-07, + "loss": 0.3186, + "step": 9055 + }, + { + "epoch": 4.281796690307329, + "grad_norm": 2.841228723526001, + "learning_rate": 9.593174082796046e-07, + "loss": 0.3547, + "step": 9056 + }, + { + "epoch": 4.282269503546099, + "grad_norm": 3.1145405769348145, + "learning_rate": 9.588261721902547e-07, + "loss": 0.3317, + "step": 9057 + }, + { + "epoch": 4.28274231678487, + "grad_norm": 2.9518024921417236, + "learning_rate": 9.58335032063042e-07, + "loss": 0.3723, + "step": 9058 + }, + { + "epoch": 4.283215130023641, + "grad_norm": 2.887479782104492, + "learning_rate": 9.578439879285467e-07, + "loss": 0.3288, + "step": 9059 + }, + { + "epoch": 4.283687943262412, + "grad_norm": 3.253427267074585, + "learning_rate": 9.573530398173444e-07, + "loss": 0.326, + "step": 9060 + }, + { + "epoch": 4.284160756501182, + "grad_norm": 3.0442020893096924, + "learning_rate": 9.568621877600038e-07, + "loss": 0.3807, + "step": 9061 + }, + { + "epoch": 4.2846335697399525, + "grad_norm": 2.928743600845337, + "learning_rate": 9.563714317870877e-07, + "loss": 0.2977, + "step": 9062 + }, + { + "epoch": 4.285106382978723, + "grad_norm": 3.2095022201538086, + "learning_rate": 9.558807719291543e-07, + "loss": 0.3571, + "step": 9063 + }, + { + "epoch": 4.285579196217494, + "grad_norm": 3.3752429485321045, + "learning_rate": 9.55390208216754e-07, + "loss": 0.3928, + "step": 9064 + }, + { + "epoch": 4.286052009456265, + "grad_norm": 3.125702381134033, + "learning_rate": 9.548997406804333e-07, + "loss": 0.3334, + "step": 9065 + }, + { + "epoch": 4.286524822695036, + "grad_norm": 3.058772563934326, + "learning_rate": 9.544093693507308e-07, + "loss": 0.3184, + "step": 9066 + }, + { + "epoch": 4.2869976359338064, + "grad_norm": 3.1085948944091797, + "learning_rate": 9.539190942581785e-07, + "loss": 0.3626, + "step": 9067 + }, + { + "epoch": 4.287470449172577, + "grad_norm": 2.8497378826141357, + "learning_rate": 9.53428915433306e-07, + "loss": 0.3451, + "step": 9068 + }, + { + "epoch": 4.287943262411347, + "grad_norm": 3.411508798599243, + "learning_rate": 9.529388329066325e-07, + "loss": 0.3608, + "step": 9069 + }, + { + "epoch": 4.288416075650118, + "grad_norm": 3.1312575340270996, + "learning_rate": 9.524488467086751e-07, + "loss": 0.3419, + "step": 9070 + }, + { + "epoch": 4.288888888888889, + "grad_norm": 3.1531126499176025, + "learning_rate": 9.519589568699419e-07, + "loss": 0.3261, + "step": 9071 + }, + { + "epoch": 4.2893617021276595, + "grad_norm": 2.8852546215057373, + "learning_rate": 9.514691634209361e-07, + "loss": 0.3258, + "step": 9072 + }, + { + "epoch": 4.28983451536643, + "grad_norm": 3.0486297607421875, + "learning_rate": 9.50979466392156e-07, + "loss": 0.3207, + "step": 9073 + }, + { + "epoch": 4.290307328605201, + "grad_norm": 3.017788887023926, + "learning_rate": 9.504898658140924e-07, + "loss": 0.3288, + "step": 9074 + }, + { + "epoch": 4.290780141843972, + "grad_norm": 3.24040150642395, + "learning_rate": 9.500003617172302e-07, + "loss": 0.2847, + "step": 9075 + }, + { + "epoch": 4.291252955082742, + "grad_norm": 
3.6793692111968994, + "learning_rate": 9.49510954132049e-07, + "loss": 0.425, + "step": 9076 + }, + { + "epoch": 4.291725768321513, + "grad_norm": 2.7292215824127197, + "learning_rate": 9.490216430890215e-07, + "loss": 0.3208, + "step": 9077 + }, + { + "epoch": 4.292198581560283, + "grad_norm": 2.650388479232788, + "learning_rate": 9.485324286186159e-07, + "loss": 0.2842, + "step": 9078 + }, + { + "epoch": 4.292671394799054, + "grad_norm": 3.1459171772003174, + "learning_rate": 9.480433107512932e-07, + "loss": 0.3287, + "step": 9079 + }, + { + "epoch": 4.293144208037825, + "grad_norm": 3.1777186393737793, + "learning_rate": 9.475542895175074e-07, + "loss": 0.3385, + "step": 9080 + }, + { + "epoch": 4.293617021276596, + "grad_norm": 3.5608465671539307, + "learning_rate": 9.470653649477096e-07, + "loss": 0.3574, + "step": 9081 + }, + { + "epoch": 4.294089834515367, + "grad_norm": 2.58306884765625, + "learning_rate": 9.465765370723415e-07, + "loss": 0.3156, + "step": 9082 + }, + { + "epoch": 4.294562647754137, + "grad_norm": 3.3265857696533203, + "learning_rate": 9.460878059218415e-07, + "loss": 0.3678, + "step": 9083 + }, + { + "epoch": 4.295035460992908, + "grad_norm": 3.259326696395874, + "learning_rate": 9.455991715266403e-07, + "loss": 0.3675, + "step": 9084 + }, + { + "epoch": 4.295508274231678, + "grad_norm": 3.430608034133911, + "learning_rate": 9.451106339171618e-07, + "loss": 0.3147, + "step": 9085 + }, + { + "epoch": 4.295981087470449, + "grad_norm": 3.2896342277526855, + "learning_rate": 9.44622193123827e-07, + "loss": 0.3482, + "step": 9086 + }, + { + "epoch": 4.29645390070922, + "grad_norm": 2.9680557250976562, + "learning_rate": 9.441338491770474e-07, + "loss": 0.3504, + "step": 9087 + }, + { + "epoch": 4.2969267139479905, + "grad_norm": 2.9656941890716553, + "learning_rate": 9.436456021072313e-07, + "loss": 0.3782, + "step": 9088 + }, + { + "epoch": 4.297399527186761, + "grad_norm": 3.463456630706787, + "learning_rate": 9.431574519447794e-07, + "loss": 0.3517, + "step": 9089 + }, + { + "epoch": 4.297872340425532, + "grad_norm": 3.3658525943756104, + "learning_rate": 9.426693987200864e-07, + "loss": 0.3535, + "step": 9090 + }, + { + "epoch": 4.298345153664303, + "grad_norm": 3.087533712387085, + "learning_rate": 9.421814424635414e-07, + "loss": 0.3007, + "step": 9091 + }, + { + "epoch": 4.298817966903073, + "grad_norm": 3.4596481323242188, + "learning_rate": 9.41693583205526e-07, + "loss": 0.3797, + "step": 9092 + }, + { + "epoch": 4.299290780141844, + "grad_norm": 3.647507667541504, + "learning_rate": 9.412058209764191e-07, + "loss": 0.3803, + "step": 9093 + }, + { + "epoch": 4.299763593380614, + "grad_norm": 2.9130196571350098, + "learning_rate": 9.407181558065909e-07, + "loss": 0.32, + "step": 9094 + }, + { + "epoch": 4.300236406619385, + "grad_norm": 3.2562668323516846, + "learning_rate": 9.402305877264048e-07, + "loss": 0.4103, + "step": 9095 + }, + { + "epoch": 4.300709219858156, + "grad_norm": 3.1416616439819336, + "learning_rate": 9.397431167662216e-07, + "loss": 0.3498, + "step": 9096 + }, + { + "epoch": 4.301182033096927, + "grad_norm": 2.9540042877197266, + "learning_rate": 9.392557429563929e-07, + "loss": 0.3073, + "step": 9097 + }, + { + "epoch": 4.301654846335698, + "grad_norm": 3.0450825691223145, + "learning_rate": 9.387684663272645e-07, + "loss": 0.3295, + "step": 9098 + }, + { + "epoch": 4.302127659574468, + "grad_norm": 3.1060359477996826, + "learning_rate": 9.38281286909179e-07, + "loss": 0.3582, + "step": 9099 + }, + { + "epoch": 4.302600472813239, + 
"grad_norm": 2.901136636734009, + "learning_rate": 9.377942047324687e-07, + "loss": 0.3162, + "step": 9100 + }, + { + "epoch": 4.303073286052009, + "grad_norm": 3.5618929862976074, + "learning_rate": 9.373072198274641e-07, + "loss": 0.2917, + "step": 9101 + }, + { + "epoch": 4.30354609929078, + "grad_norm": 3.0853395462036133, + "learning_rate": 9.368203322244871e-07, + "loss": 0.3124, + "step": 9102 + }, + { + "epoch": 4.304018912529551, + "grad_norm": 3.00398588180542, + "learning_rate": 9.363335419538524e-07, + "loss": 0.3167, + "step": 9103 + }, + { + "epoch": 4.3044917257683215, + "grad_norm": 3.4705588817596436, + "learning_rate": 9.358468490458725e-07, + "loss": 0.3188, + "step": 9104 + }, + { + "epoch": 4.304964539007092, + "grad_norm": 2.948302745819092, + "learning_rate": 9.353602535308509e-07, + "loss": 0.2739, + "step": 9105 + }, + { + "epoch": 4.305437352245863, + "grad_norm": 3.4512269496917725, + "learning_rate": 9.348737554390852e-07, + "loss": 0.3256, + "step": 9106 + }, + { + "epoch": 4.305910165484634, + "grad_norm": 2.9979147911071777, + "learning_rate": 9.343873548008684e-07, + "loss": 0.3184, + "step": 9107 + }, + { + "epoch": 4.306382978723404, + "grad_norm": 3.1008479595184326, + "learning_rate": 9.339010516464847e-07, + "loss": 0.3251, + "step": 9108 + }, + { + "epoch": 4.306855791962175, + "grad_norm": 2.86930775642395, + "learning_rate": 9.334148460062165e-07, + "loss": 0.3322, + "step": 9109 + }, + { + "epoch": 4.307328605200945, + "grad_norm": 3.2068963050842285, + "learning_rate": 9.329287379103355e-07, + "loss": 0.3845, + "step": 9110 + }, + { + "epoch": 4.307801418439716, + "grad_norm": 3.567309856414795, + "learning_rate": 9.324427273891115e-07, + "loss": 0.4037, + "step": 9111 + }, + { + "epoch": 4.308274231678487, + "grad_norm": 3.2064783573150635, + "learning_rate": 9.319568144728056e-07, + "loss": 0.3481, + "step": 9112 + }, + { + "epoch": 4.308747044917258, + "grad_norm": 3.2492294311523438, + "learning_rate": 9.314709991916721e-07, + "loss": 0.3657, + "step": 9113 + }, + { + "epoch": 4.309219858156029, + "grad_norm": 2.990755081176758, + "learning_rate": 9.309852815759626e-07, + "loss": 0.3582, + "step": 9114 + }, + { + "epoch": 4.309692671394799, + "grad_norm": 3.3375513553619385, + "learning_rate": 9.304996616559187e-07, + "loss": 0.3657, + "step": 9115 + }, + { + "epoch": 4.31016548463357, + "grad_norm": 2.945552349090576, + "learning_rate": 9.300141394617798e-07, + "loss": 0.3075, + "step": 9116 + }, + { + "epoch": 4.31063829787234, + "grad_norm": 3.5318517684936523, + "learning_rate": 9.295287150237764e-07, + "loss": 0.39, + "step": 9117 + }, + { + "epoch": 4.311111111111111, + "grad_norm": 3.452049732208252, + "learning_rate": 9.290433883721326e-07, + "loss": 0.3821, + "step": 9118 + }, + { + "epoch": 4.311583924349882, + "grad_norm": 3.0762388706207275, + "learning_rate": 9.285581595370693e-07, + "loss": 0.316, + "step": 9119 + }, + { + "epoch": 4.3120567375886525, + "grad_norm": 4.13551664352417, + "learning_rate": 9.28073028548799e-07, + "loss": 0.3661, + "step": 9120 + }, + { + "epoch": 4.312529550827423, + "grad_norm": 3.1915719509124756, + "learning_rate": 9.275879954375286e-07, + "loss": 0.371, + "step": 9121 + }, + { + "epoch": 4.313002364066194, + "grad_norm": 3.118861198425293, + "learning_rate": 9.271030602334577e-07, + "loss": 0.3943, + "step": 9122 + }, + { + "epoch": 4.313475177304965, + "grad_norm": 3.042757987976074, + "learning_rate": 9.266182229667836e-07, + "loss": 0.3779, + "step": 9123 + }, + { + "epoch": 4.313947990543735, 
+ "grad_norm": 2.949110746383667, + "learning_rate": 9.261334836676933e-07, + "loss": 0.3721, + "step": 9124 + }, + { + "epoch": 4.314420803782506, + "grad_norm": 2.982090950012207, + "learning_rate": 9.256488423663701e-07, + "loss": 0.2865, + "step": 9125 + }, + { + "epoch": 4.314893617021276, + "grad_norm": 3.6527535915374756, + "learning_rate": 9.25164299092989e-07, + "loss": 0.3689, + "step": 9126 + }, + { + "epoch": 4.315366430260047, + "grad_norm": 3.3310744762420654, + "learning_rate": 9.246798538777227e-07, + "loss": 0.3198, + "step": 9127 + }, + { + "epoch": 4.315839243498818, + "grad_norm": 2.8298583030700684, + "learning_rate": 9.241955067507332e-07, + "loss": 0.3711, + "step": 9128 + }, + { + "epoch": 4.316312056737589, + "grad_norm": 3.636894702911377, + "learning_rate": 9.237112577421809e-07, + "loss": 0.374, + "step": 9129 + }, + { + "epoch": 4.31678486997636, + "grad_norm": 2.896251678466797, + "learning_rate": 9.232271068822166e-07, + "loss": 0.3372, + "step": 9130 + }, + { + "epoch": 4.31725768321513, + "grad_norm": 3.2836971282958984, + "learning_rate": 9.227430542009854e-07, + "loss": 0.3584, + "step": 9131 + }, + { + "epoch": 4.317730496453901, + "grad_norm": 2.9452571868896484, + "learning_rate": 9.222590997286293e-07, + "loss": 0.3658, + "step": 9132 + }, + { + "epoch": 4.318203309692671, + "grad_norm": 2.88613224029541, + "learning_rate": 9.217752434952801e-07, + "loss": 0.3221, + "step": 9133 + }, + { + "epoch": 4.318676122931442, + "grad_norm": 2.7794570922851562, + "learning_rate": 9.212914855310667e-07, + "loss": 0.3142, + "step": 9134 + }, + { + "epoch": 4.319148936170213, + "grad_norm": 3.0195112228393555, + "learning_rate": 9.208078258661102e-07, + "loss": 0.3039, + "step": 9135 + }, + { + "epoch": 4.3196217494089835, + "grad_norm": 3.5178396701812744, + "learning_rate": 9.203242645305253e-07, + "loss": 0.3912, + "step": 9136 + }, + { + "epoch": 4.320094562647754, + "grad_norm": 3.145413875579834, + "learning_rate": 9.198408015544222e-07, + "loss": 0.3045, + "step": 9137 + }, + { + "epoch": 4.320567375886525, + "grad_norm": 3.151193380355835, + "learning_rate": 9.193574369679037e-07, + "loss": 0.341, + "step": 9138 + }, + { + "epoch": 4.321040189125296, + "grad_norm": 3.248255968093872, + "learning_rate": 9.188741708010668e-07, + "loss": 0.4344, + "step": 9139 + }, + { + "epoch": 4.321513002364066, + "grad_norm": 2.953218460083008, + "learning_rate": 9.183910030840021e-07, + "loss": 0.343, + "step": 9140 + }, + { + "epoch": 4.321985815602837, + "grad_norm": 3.1873161792755127, + "learning_rate": 9.179079338467936e-07, + "loss": 0.3082, + "step": 9141 + }, + { + "epoch": 4.322458628841607, + "grad_norm": 3.2587013244628906, + "learning_rate": 9.174249631195218e-07, + "loss": 0.3855, + "step": 9142 + }, + { + "epoch": 4.322931442080378, + "grad_norm": 2.956145763397217, + "learning_rate": 9.169420909322573e-07, + "loss": 0.3156, + "step": 9143 + }, + { + "epoch": 4.323404255319149, + "grad_norm": 3.1664650440216064, + "learning_rate": 9.164593173150683e-07, + "loss": 0.352, + "step": 9144 + }, + { + "epoch": 4.32387706855792, + "grad_norm": 3.2792744636535645, + "learning_rate": 9.159766422980138e-07, + "loss": 0.3963, + "step": 9145 + }, + { + "epoch": 4.3243498817966906, + "grad_norm": 3.1249687671661377, + "learning_rate": 9.154940659111472e-07, + "loss": 0.3405, + "step": 9146 + }, + { + "epoch": 4.324822695035461, + "grad_norm": 3.128340244293213, + "learning_rate": 9.150115881845181e-07, + "loss": 0.2733, + "step": 9147 + }, + { + "epoch": 
4.325295508274232, + "grad_norm": 3.1790847778320312, + "learning_rate": 9.145292091481675e-07, + "loss": 0.3676, + "step": 9148 + }, + { + "epoch": 4.325768321513002, + "grad_norm": 2.874678134918213, + "learning_rate": 9.1404692883213e-07, + "loss": 0.2937, + "step": 9149 + }, + { + "epoch": 4.326241134751773, + "grad_norm": 3.102196216583252, + "learning_rate": 9.135647472664369e-07, + "loss": 0.2772, + "step": 9150 + }, + { + "epoch": 4.326713947990544, + "grad_norm": 3.0027546882629395, + "learning_rate": 9.130826644811099e-07, + "loss": 0.3171, + "step": 9151 + }, + { + "epoch": 4.3271867612293144, + "grad_norm": 2.750152587890625, + "learning_rate": 9.126006805061679e-07, + "loss": 0.3689, + "step": 9152 + }, + { + "epoch": 4.327659574468085, + "grad_norm": 3.251054525375366, + "learning_rate": 9.12118795371621e-07, + "loss": 0.3463, + "step": 9153 + }, + { + "epoch": 4.328132387706856, + "grad_norm": 2.8849353790283203, + "learning_rate": 9.116370091074738e-07, + "loss": 0.3, + "step": 9154 + }, + { + "epoch": 4.328605200945627, + "grad_norm": 3.4823720455169678, + "learning_rate": 9.111553217437255e-07, + "loss": 0.336, + "step": 9155 + }, + { + "epoch": 4.329078014184397, + "grad_norm": 2.8170886039733887, + "learning_rate": 9.106737333103677e-07, + "loss": 0.3237, + "step": 9156 + }, + { + "epoch": 4.3295508274231675, + "grad_norm": 3.095379114151001, + "learning_rate": 9.101922438373881e-07, + "loss": 0.3438, + "step": 9157 + }, + { + "epoch": 4.330023640661938, + "grad_norm": 3.1764986515045166, + "learning_rate": 9.097108533547667e-07, + "loss": 0.3174, + "step": 9158 + }, + { + "epoch": 4.330496453900709, + "grad_norm": 3.3972036838531494, + "learning_rate": 9.092295618924763e-07, + "loss": 0.3118, + "step": 9159 + }, + { + "epoch": 4.33096926713948, + "grad_norm": 3.112926959991455, + "learning_rate": 9.087483694804863e-07, + "loss": 0.3521, + "step": 9160 + }, + { + "epoch": 4.331442080378251, + "grad_norm": 3.395550012588501, + "learning_rate": 9.082672761487573e-07, + "loss": 0.3423, + "step": 9161 + }, + { + "epoch": 4.3319148936170215, + "grad_norm": 3.486910343170166, + "learning_rate": 9.077862819272465e-07, + "loss": 0.3655, + "step": 9162 + }, + { + "epoch": 4.332387706855792, + "grad_norm": 3.0986499786376953, + "learning_rate": 9.07305386845902e-07, + "loss": 0.2865, + "step": 9163 + }, + { + "epoch": 4.332860520094562, + "grad_norm": 2.962139844894409, + "learning_rate": 9.068245909346665e-07, + "loss": 0.315, + "step": 9164 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 3.0887413024902344, + "learning_rate": 9.063438942234787e-07, + "loss": 0.3207, + "step": 9165 + }, + { + "epoch": 4.333806146572104, + "grad_norm": 2.909770965576172, + "learning_rate": 9.058632967422678e-07, + "loss": 0.3221, + "step": 9166 + }, + { + "epoch": 4.334278959810875, + "grad_norm": 3.1872079372406006, + "learning_rate": 9.053827985209604e-07, + "loss": 0.3856, + "step": 9167 + }, + { + "epoch": 4.334751773049645, + "grad_norm": 3.196985960006714, + "learning_rate": 9.049023995894738e-07, + "loss": 0.2994, + "step": 9168 + }, + { + "epoch": 4.335224586288416, + "grad_norm": 3.3150243759155273, + "learning_rate": 9.044220999777204e-07, + "loss": 0.3619, + "step": 9169 + }, + { + "epoch": 4.335697399527187, + "grad_norm": 3.1662707328796387, + "learning_rate": 9.039418997156066e-07, + "loss": 0.3347, + "step": 9170 + }, + { + "epoch": 4.336170212765958, + "grad_norm": 3.6789329051971436, + "learning_rate": 9.034617988330318e-07, + "loss": 0.4106, + "step": 9171 + }, + { + 
"epoch": 4.336643026004728, + "grad_norm": 3.5192553997039795, + "learning_rate": 9.029817973598898e-07, + "loss": 0.3561, + "step": 9172 + }, + { + "epoch": 4.3371158392434985, + "grad_norm": 3.4309239387512207, + "learning_rate": 9.025018953260692e-07, + "loss": 0.3739, + "step": 9173 + }, + { + "epoch": 4.337588652482269, + "grad_norm": 3.1840806007385254, + "learning_rate": 9.020220927614498e-07, + "loss": 0.3479, + "step": 9174 + }, + { + "epoch": 4.33806146572104, + "grad_norm": 3.6679139137268066, + "learning_rate": 9.015423896959088e-07, + "loss": 0.3739, + "step": 9175 + }, + { + "epoch": 4.338534278959811, + "grad_norm": 3.125296115875244, + "learning_rate": 9.010627861593143e-07, + "loss": 0.3046, + "step": 9176 + }, + { + "epoch": 4.339007092198582, + "grad_norm": 3.0710368156433105, + "learning_rate": 9.005832821815278e-07, + "loss": 0.322, + "step": 9177 + }, + { + "epoch": 4.3394799054373525, + "grad_norm": 3.068833351135254, + "learning_rate": 9.001038777924082e-07, + "loss": 0.3229, + "step": 9178 + }, + { + "epoch": 4.339952718676123, + "grad_norm": 3.2746002674102783, + "learning_rate": 8.996245730218037e-07, + "loss": 0.369, + "step": 9179 + }, + { + "epoch": 4.340425531914893, + "grad_norm": 3.1138477325439453, + "learning_rate": 8.991453678995607e-07, + "loss": 0.3456, + "step": 9180 + }, + { + "epoch": 4.340898345153664, + "grad_norm": 3.2195467948913574, + "learning_rate": 8.986662624555159e-07, + "loss": 0.377, + "step": 9181 + }, + { + "epoch": 4.341371158392435, + "grad_norm": 3.1197304725646973, + "learning_rate": 8.981872567195008e-07, + "loss": 0.3007, + "step": 9182 + }, + { + "epoch": 4.341843971631206, + "grad_norm": 3.295881748199463, + "learning_rate": 8.977083507213418e-07, + "loss": 0.4007, + "step": 9183 + }, + { + "epoch": 4.342316784869976, + "grad_norm": 3.8013954162597656, + "learning_rate": 8.972295444908582e-07, + "loss": 0.4322, + "step": 9184 + }, + { + "epoch": 4.342789598108747, + "grad_norm": 3.133434295654297, + "learning_rate": 8.967508380578633e-07, + "loss": 0.3379, + "step": 9185 + }, + { + "epoch": 4.343262411347518, + "grad_norm": 3.0942039489746094, + "learning_rate": 8.962722314521625e-07, + "loss": 0.3753, + "step": 9186 + }, + { + "epoch": 4.343735224586289, + "grad_norm": 2.8691020011901855, + "learning_rate": 8.957937247035583e-07, + "loss": 0.3003, + "step": 9187 + }, + { + "epoch": 4.344208037825059, + "grad_norm": 2.8353092670440674, + "learning_rate": 8.95315317841845e-07, + "loss": 0.3068, + "step": 9188 + }, + { + "epoch": 4.3446808510638295, + "grad_norm": 3.071207046508789, + "learning_rate": 8.948370108968097e-07, + "loss": 0.3147, + "step": 9189 + }, + { + "epoch": 4.3451536643026, + "grad_norm": 3.3605904579162598, + "learning_rate": 8.943588038982359e-07, + "loss": 0.3216, + "step": 9190 + }, + { + "epoch": 4.345626477541371, + "grad_norm": 3.0702717304229736, + "learning_rate": 8.93880696875899e-07, + "loss": 0.3507, + "step": 9191 + }, + { + "epoch": 4.346099290780142, + "grad_norm": 3.261456251144409, + "learning_rate": 8.934026898595675e-07, + "loss": 0.3677, + "step": 9192 + }, + { + "epoch": 4.346572104018913, + "grad_norm": 3.0827512741088867, + "learning_rate": 8.929247828790066e-07, + "loss": 0.3786, + "step": 9193 + }, + { + "epoch": 4.3470449172576835, + "grad_norm": 3.488949775695801, + "learning_rate": 8.924469759639728e-07, + "loss": 0.3685, + "step": 9194 + }, + { + "epoch": 4.347517730496454, + "grad_norm": 2.8565423488616943, + "learning_rate": 8.919692691442162e-07, + "loss": 0.3449, + "step": 9195 
+ }, + { + "epoch": 4.347990543735224, + "grad_norm": 4.654722213745117, + "learning_rate": 8.914916624494829e-07, + "loss": 0.344, + "step": 9196 + }, + { + "epoch": 4.348463356973995, + "grad_norm": 3.256714344024658, + "learning_rate": 8.910141559095098e-07, + "loss": 0.3487, + "step": 9197 + }, + { + "epoch": 4.348936170212766, + "grad_norm": 3.0921413898468018, + "learning_rate": 8.90536749554031e-07, + "loss": 0.3171, + "step": 9198 + }, + { + "epoch": 4.349408983451537, + "grad_norm": 3.1129112243652344, + "learning_rate": 8.900594434127712e-07, + "loss": 0.3501, + "step": 9199 + }, + { + "epoch": 4.349881796690307, + "grad_norm": 3.077688217163086, + "learning_rate": 8.8958223751545e-07, + "loss": 0.2781, + "step": 9200 + }, + { + "epoch": 4.350354609929078, + "grad_norm": 3.2839295864105225, + "learning_rate": 8.891051318917821e-07, + "loss": 0.3699, + "step": 9201 + }, + { + "epoch": 4.350827423167849, + "grad_norm": 3.0370850563049316, + "learning_rate": 8.886281265714741e-07, + "loss": 0.3344, + "step": 9202 + }, + { + "epoch": 4.35130023640662, + "grad_norm": 3.439702033996582, + "learning_rate": 8.88151221584227e-07, + "loss": 0.3865, + "step": 9203 + }, + { + "epoch": 4.35177304964539, + "grad_norm": 3.133317470550537, + "learning_rate": 8.876744169597357e-07, + "loss": 0.3352, + "step": 9204 + }, + { + "epoch": 4.3522458628841605, + "grad_norm": 3.2529115676879883, + "learning_rate": 8.871977127276876e-07, + "loss": 0.3708, + "step": 9205 + }, + { + "epoch": 4.352718676122931, + "grad_norm": 3.2149887084960938, + "learning_rate": 8.867211089177669e-07, + "loss": 0.3298, + "step": 9206 + }, + { + "epoch": 4.353191489361702, + "grad_norm": 2.778116464614868, + "learning_rate": 8.86244605559648e-07, + "loss": 0.3319, + "step": 9207 + }, + { + "epoch": 4.353664302600473, + "grad_norm": 3.206336736679077, + "learning_rate": 8.85768202683002e-07, + "loss": 0.3793, + "step": 9208 + }, + { + "epoch": 4.354137115839244, + "grad_norm": 3.4236080646514893, + "learning_rate": 8.852919003174921e-07, + "loss": 0.3341, + "step": 9209 + }, + { + "epoch": 4.3546099290780145, + "grad_norm": 3.049886703491211, + "learning_rate": 8.848156984927742e-07, + "loss": 0.3153, + "step": 9210 + }, + { + "epoch": 4.355082742316785, + "grad_norm": 4.048248291015625, + "learning_rate": 8.843395972385013e-07, + "loss": 0.3857, + "step": 9211 + }, + { + "epoch": 4.355555555555555, + "grad_norm": 3.3379292488098145, + "learning_rate": 8.838635965843165e-07, + "loss": 0.3167, + "step": 9212 + }, + { + "epoch": 4.356028368794326, + "grad_norm": 2.963364839553833, + "learning_rate": 8.833876965598598e-07, + "loss": 0.3427, + "step": 9213 + }, + { + "epoch": 4.356501182033097, + "grad_norm": 3.1309237480163574, + "learning_rate": 8.829118971947625e-07, + "loss": 0.3694, + "step": 9214 + }, + { + "epoch": 4.356973995271868, + "grad_norm": 3.4728028774261475, + "learning_rate": 8.824361985186497e-07, + "loss": 0.3769, + "step": 9215 + }, + { + "epoch": 4.357446808510638, + "grad_norm": 2.7183408737182617, + "learning_rate": 8.819606005611431e-07, + "loss": 0.3053, + "step": 9216 + }, + { + "epoch": 4.357919621749409, + "grad_norm": 3.34867262840271, + "learning_rate": 8.814851033518549e-07, + "loss": 0.3316, + "step": 9217 + }, + { + "epoch": 4.35839243498818, + "grad_norm": 3.288097858428955, + "learning_rate": 8.810097069203924e-07, + "loss": 0.362, + "step": 9218 + }, + { + "epoch": 4.358865248226951, + "grad_norm": 3.2768566608428955, + "learning_rate": 8.805344112963563e-07, + "loss": 0.3762, + "step": 9219 
+ }, + { + "epoch": 4.359338061465721, + "grad_norm": 2.730982542037964, + "learning_rate": 8.800592165093405e-07, + "loss": 0.3184, + "step": 9220 + }, + { + "epoch": 4.3598108747044915, + "grad_norm": 3.2347333431243896, + "learning_rate": 8.795841225889348e-07, + "loss": 0.3414, + "step": 9221 + }, + { + "epoch": 4.360283687943262, + "grad_norm": 2.8792049884796143, + "learning_rate": 8.791091295647208e-07, + "loss": 0.3312, + "step": 9222 + }, + { + "epoch": 4.360756501182033, + "grad_norm": 3.2037971019744873, + "learning_rate": 8.786342374662726e-07, + "loss": 0.3772, + "step": 9223 + }, + { + "epoch": 4.361229314420804, + "grad_norm": 3.765244245529175, + "learning_rate": 8.781594463231621e-07, + "loss": 0.3724, + "step": 9224 + }, + { + "epoch": 4.361702127659575, + "grad_norm": 3.085339069366455, + "learning_rate": 8.776847561649504e-07, + "loss": 0.3468, + "step": 9225 + }, + { + "epoch": 4.3621749408983455, + "grad_norm": 2.8031229972839355, + "learning_rate": 8.772101670211963e-07, + "loss": 0.3219, + "step": 9226 + }, + { + "epoch": 4.362647754137116, + "grad_norm": 2.667694091796875, + "learning_rate": 8.76735678921449e-07, + "loss": 0.3381, + "step": 9227 + }, + { + "epoch": 4.363120567375886, + "grad_norm": 2.898273229598999, + "learning_rate": 8.762612918952526e-07, + "loss": 0.3526, + "step": 9228 + }, + { + "epoch": 4.363593380614657, + "grad_norm": 3.1458849906921387, + "learning_rate": 8.757870059721465e-07, + "loss": 0.3516, + "step": 9229 + }, + { + "epoch": 4.364066193853428, + "grad_norm": 3.1719279289245605, + "learning_rate": 8.753128211816609e-07, + "loss": 0.328, + "step": 9230 + }, + { + "epoch": 4.3645390070921986, + "grad_norm": 3.0799217224121094, + "learning_rate": 8.748387375533224e-07, + "loss": 0.2802, + "step": 9231 + }, + { + "epoch": 4.365011820330969, + "grad_norm": 3.1218812465667725, + "learning_rate": 8.743647551166498e-07, + "loss": 0.3264, + "step": 9232 + }, + { + "epoch": 4.36548463356974, + "grad_norm": 3.231175184249878, + "learning_rate": 8.738908739011556e-07, + "loss": 0.3192, + "step": 9233 + }, + { + "epoch": 4.365957446808511, + "grad_norm": 3.088284730911255, + "learning_rate": 8.734170939363465e-07, + "loss": 0.3569, + "step": 9234 + }, + { + "epoch": 4.366430260047281, + "grad_norm": 3.2510828971862793, + "learning_rate": 8.729434152517217e-07, + "loss": 0.3977, + "step": 9235 + }, + { + "epoch": 4.366903073286052, + "grad_norm": 3.435762405395508, + "learning_rate": 8.724698378767768e-07, + "loss": 0.3201, + "step": 9236 + }, + { + "epoch": 4.3673758865248224, + "grad_norm": 3.6876676082611084, + "learning_rate": 8.719963618409985e-07, + "loss": 0.381, + "step": 9237 + }, + { + "epoch": 4.367848699763593, + "grad_norm": 2.7620339393615723, + "learning_rate": 8.715229871738676e-07, + "loss": 0.2939, + "step": 9238 + }, + { + "epoch": 4.368321513002364, + "grad_norm": 3.412893056869507, + "learning_rate": 8.710497139048604e-07, + "loss": 0.3592, + "step": 9239 + }, + { + "epoch": 4.368794326241135, + "grad_norm": 3.2498574256896973, + "learning_rate": 8.705765420634446e-07, + "loss": 0.4054, + "step": 9240 + }, + { + "epoch": 4.369267139479906, + "grad_norm": 3.138425827026367, + "learning_rate": 8.701034716790821e-07, + "loss": 0.3609, + "step": 9241 + }, + { + "epoch": 4.369739952718676, + "grad_norm": 2.7645158767700195, + "learning_rate": 8.696305027812301e-07, + "loss": 0.3085, + "step": 9242 + }, + { + "epoch": 4.370212765957447, + "grad_norm": 3.5948917865753174, + "learning_rate": 8.691576353993372e-07, + "loss": 0.3846, + 
"step": 9243 + }, + { + "epoch": 4.370685579196217, + "grad_norm": 3.2185158729553223, + "learning_rate": 8.68684869562848e-07, + "loss": 0.3516, + "step": 9244 + }, + { + "epoch": 4.371158392434988, + "grad_norm": 3.057281494140625, + "learning_rate": 8.68212205301199e-07, + "loss": 0.3197, + "step": 9245 + }, + { + "epoch": 4.371631205673759, + "grad_norm": 2.9788076877593994, + "learning_rate": 8.677396426438198e-07, + "loss": 0.3283, + "step": 9246 + }, + { + "epoch": 4.3721040189125295, + "grad_norm": 2.9246625900268555, + "learning_rate": 8.672671816201366e-07, + "loss": 0.3482, + "step": 9247 + }, + { + "epoch": 4.3725768321513, + "grad_norm": 2.9994964599609375, + "learning_rate": 8.667948222595671e-07, + "loss": 0.3802, + "step": 9248 + }, + { + "epoch": 4.373049645390071, + "grad_norm": 2.692626476287842, + "learning_rate": 8.663225645915222e-07, + "loss": 0.3045, + "step": 9249 + }, + { + "epoch": 4.373522458628842, + "grad_norm": 2.794236660003662, + "learning_rate": 8.658504086454078e-07, + "loss": 0.3056, + "step": 9250 + }, + { + "epoch": 4.373995271867612, + "grad_norm": 3.020534038543701, + "learning_rate": 8.653783544506222e-07, + "loss": 0.3341, + "step": 9251 + }, + { + "epoch": 4.374468085106383, + "grad_norm": 3.2142958641052246, + "learning_rate": 8.649064020365596e-07, + "loss": 0.3435, + "step": 9252 + }, + { + "epoch": 4.374940898345153, + "grad_norm": 3.3818624019622803, + "learning_rate": 8.644345514326049e-07, + "loss": 0.3744, + "step": 9253 + }, + { + "epoch": 4.375413711583924, + "grad_norm": 3.1566405296325684, + "learning_rate": 8.639628026681399e-07, + "loss": 0.3568, + "step": 9254 + }, + { + "epoch": 4.375886524822695, + "grad_norm": 3.3773083686828613, + "learning_rate": 8.63491155772537e-07, + "loss": 0.3523, + "step": 9255 + }, + { + "epoch": 4.376359338061466, + "grad_norm": 3.0850939750671387, + "learning_rate": 8.630196107751634e-07, + "loss": 0.3356, + "step": 9256 + }, + { + "epoch": 4.376832151300237, + "grad_norm": 3.361496686935425, + "learning_rate": 8.625481677053815e-07, + "loss": 0.3619, + "step": 9257 + }, + { + "epoch": 4.377304964539007, + "grad_norm": 3.026015043258667, + "learning_rate": 8.620768265925444e-07, + "loss": 0.3476, + "step": 9258 + }, + { + "epoch": 4.377777777777778, + "grad_norm": 3.142747640609741, + "learning_rate": 8.61605587466002e-07, + "loss": 0.3391, + "step": 9259 + }, + { + "epoch": 4.378250591016548, + "grad_norm": 3.0910356044769287, + "learning_rate": 8.611344503550956e-07, + "loss": 0.3201, + "step": 9260 + }, + { + "epoch": 4.378723404255319, + "grad_norm": 3.4462292194366455, + "learning_rate": 8.606634152891599e-07, + "loss": 0.4075, + "step": 9261 + }, + { + "epoch": 4.37919621749409, + "grad_norm": 2.984248638153076, + "learning_rate": 8.601924822975258e-07, + "loss": 0.3415, + "step": 9262 + }, + { + "epoch": 4.3796690307328605, + "grad_norm": 2.944971799850464, + "learning_rate": 8.597216514095155e-07, + "loss": 0.3163, + "step": 9263 + }, + { + "epoch": 4.380141843971631, + "grad_norm": 3.1562247276306152, + "learning_rate": 8.592509226544457e-07, + "loss": 0.3093, + "step": 9264 + }, + { + "epoch": 4.380614657210402, + "grad_norm": 2.911339282989502, + "learning_rate": 8.587802960616254e-07, + "loss": 0.3287, + "step": 9265 + }, + { + "epoch": 4.381087470449173, + "grad_norm": 3.5560295581817627, + "learning_rate": 8.583097716603605e-07, + "loss": 0.3763, + "step": 9266 + }, + { + "epoch": 4.381560283687943, + "grad_norm": 3.35855770111084, + "learning_rate": 8.578393494799478e-07, + "loss": 
0.3703, + "step": 9267 + }, + { + "epoch": 4.382033096926714, + "grad_norm": 3.0229954719543457, + "learning_rate": 8.573690295496778e-07, + "loss": 0.3421, + "step": 9268 + }, + { + "epoch": 4.382505910165484, + "grad_norm": 3.0842833518981934, + "learning_rate": 8.568988118988348e-07, + "loss": 0.3473, + "step": 9269 + }, + { + "epoch": 4.382978723404255, + "grad_norm": 3.2471694946289062, + "learning_rate": 8.564286965566989e-07, + "loss": 0.3025, + "step": 9270 + }, + { + "epoch": 4.383451536643026, + "grad_norm": 3.4435837268829346, + "learning_rate": 8.559586835525404e-07, + "loss": 0.394, + "step": 9271 + }, + { + "epoch": 4.383924349881797, + "grad_norm": 3.4572243690490723, + "learning_rate": 8.554887729156267e-07, + "loss": 0.3745, + "step": 9272 + }, + { + "epoch": 4.384397163120568, + "grad_norm": 3.3646514415740967, + "learning_rate": 8.550189646752161e-07, + "loss": 0.308, + "step": 9273 + }, + { + "epoch": 4.384869976359338, + "grad_norm": 2.794933319091797, + "learning_rate": 8.545492588605606e-07, + "loss": 0.3039, + "step": 9274 + }, + { + "epoch": 4.385342789598109, + "grad_norm": 2.969306707382202, + "learning_rate": 8.540796555009084e-07, + "loss": 0.3815, + "step": 9275 + }, + { + "epoch": 4.385815602836879, + "grad_norm": 2.9203877449035645, + "learning_rate": 8.536101546254982e-07, + "loss": 0.3143, + "step": 9276 + }, + { + "epoch": 4.38628841607565, + "grad_norm": 3.451172113418579, + "learning_rate": 8.531407562635655e-07, + "loss": 0.3673, + "step": 9277 + }, + { + "epoch": 4.386761229314421, + "grad_norm": 3.1196818351745605, + "learning_rate": 8.526714604443365e-07, + "loss": 0.3449, + "step": 9278 + }, + { + "epoch": 4.3872340425531915, + "grad_norm": 3.0087406635284424, + "learning_rate": 8.522022671970312e-07, + "loss": 0.2898, + "step": 9279 + }, + { + "epoch": 4.387706855791962, + "grad_norm": 2.885667085647583, + "learning_rate": 8.517331765508666e-07, + "loss": 0.3119, + "step": 9280 + }, + { + "epoch": 4.388179669030733, + "grad_norm": 3.115769624710083, + "learning_rate": 8.512641885350494e-07, + "loss": 0.3662, + "step": 9281 + }, + { + "epoch": 4.388652482269504, + "grad_norm": 2.935692071914673, + "learning_rate": 8.507953031787818e-07, + "loss": 0.2957, + "step": 9282 + }, + { + "epoch": 4.389125295508274, + "grad_norm": 2.96824312210083, + "learning_rate": 8.503265205112593e-07, + "loss": 0.301, + "step": 9283 + }, + { + "epoch": 4.389598108747045, + "grad_norm": 2.8329155445098877, + "learning_rate": 8.498578405616697e-07, + "loss": 0.3289, + "step": 9284 + }, + { + "epoch": 4.390070921985815, + "grad_norm": 3.3063509464263916, + "learning_rate": 8.493892633591976e-07, + "loss": 0.371, + "step": 9285 + }, + { + "epoch": 4.390543735224586, + "grad_norm": 3.036324977874756, + "learning_rate": 8.489207889330175e-07, + "loss": 0.3111, + "step": 9286 + }, + { + "epoch": 4.391016548463357, + "grad_norm": 3.221714496612549, + "learning_rate": 8.48452417312301e-07, + "loss": 0.3034, + "step": 9287 + }, + { + "epoch": 4.391489361702128, + "grad_norm": 2.956813097000122, + "learning_rate": 8.479841485262108e-07, + "loss": 0.2826, + "step": 9288 + }, + { + "epoch": 4.391962174940899, + "grad_norm": 3.3818461894989014, + "learning_rate": 8.475159826039028e-07, + "loss": 0.3701, + "step": 9289 + }, + { + "epoch": 4.392434988179669, + "grad_norm": 3.1623525619506836, + "learning_rate": 8.470479195745293e-07, + "loss": 0.3405, + "step": 9290 + }, + { + "epoch": 4.39290780141844, + "grad_norm": 3.8068127632141113, + "learning_rate": 8.465799594672342e-07, + 
"loss": 0.3498, + "step": 9291 + }, + { + "epoch": 4.39338061465721, + "grad_norm": 3.042862892150879, + "learning_rate": 8.461121023111541e-07, + "loss": 0.3025, + "step": 9292 + }, + { + "epoch": 4.393853427895981, + "grad_norm": 3.0237231254577637, + "learning_rate": 8.456443481354221e-07, + "loss": 0.3351, + "step": 9293 + }, + { + "epoch": 4.394326241134752, + "grad_norm": 3.233386754989624, + "learning_rate": 8.451766969691614e-07, + "loss": 0.3314, + "step": 9294 + }, + { + "epoch": 4.3947990543735225, + "grad_norm": 2.922518014907837, + "learning_rate": 8.447091488414924e-07, + "loss": 0.2876, + "step": 9295 + }, + { + "epoch": 4.395271867612293, + "grad_norm": 3.2621119022369385, + "learning_rate": 8.442417037815268e-07, + "loss": 0.3731, + "step": 9296 + }, + { + "epoch": 4.395744680851064, + "grad_norm": 3.4238440990448, + "learning_rate": 8.437743618183697e-07, + "loss": 0.3563, + "step": 9297 + }, + { + "epoch": 4.396217494089835, + "grad_norm": 3.607088804244995, + "learning_rate": 8.43307122981121e-07, + "loss": 0.315, + "step": 9298 + }, + { + "epoch": 4.396690307328605, + "grad_norm": 3.0737040042877197, + "learning_rate": 8.428399872988724e-07, + "loss": 0.2699, + "step": 9299 + }, + { + "epoch": 4.397163120567376, + "grad_norm": 3.2364611625671387, + "learning_rate": 8.423729548007123e-07, + "loss": 0.3054, + "step": 9300 + }, + { + "epoch": 4.397635933806146, + "grad_norm": 3.505194664001465, + "learning_rate": 8.419060255157199e-07, + "loss": 0.3377, + "step": 9301 + }, + { + "epoch": 4.398108747044917, + "grad_norm": 3.337815523147583, + "learning_rate": 8.414391994729676e-07, + "loss": 0.3709, + "step": 9302 + }, + { + "epoch": 4.398581560283688, + "grad_norm": 3.310739040374756, + "learning_rate": 8.409724767015248e-07, + "loss": 0.3125, + "step": 9303 + }, + { + "epoch": 4.399054373522459, + "grad_norm": 2.9035723209381104, + "learning_rate": 8.405058572304506e-07, + "loss": 0.3294, + "step": 9304 + }, + { + "epoch": 4.39952718676123, + "grad_norm": 3.162543535232544, + "learning_rate": 8.400393410888008e-07, + "loss": 0.3563, + "step": 9305 + }, + { + "epoch": 4.4, + "grad_norm": 3.320204973220825, + "learning_rate": 8.395729283056222e-07, + "loss": 0.3681, + "step": 9306 + }, + { + "epoch": 4.400472813238771, + "grad_norm": 3.2953343391418457, + "learning_rate": 8.391066189099562e-07, + "loss": 0.3574, + "step": 9307 + }, + { + "epoch": 4.400945626477541, + "grad_norm": 3.2041780948638916, + "learning_rate": 8.386404129308387e-07, + "loss": 0.3787, + "step": 9308 + }, + { + "epoch": 4.401418439716312, + "grad_norm": 3.493856906890869, + "learning_rate": 8.381743103972973e-07, + "loss": 0.3678, + "step": 9309 + }, + { + "epoch": 4.401891252955083, + "grad_norm": 3.3114027976989746, + "learning_rate": 8.377083113383553e-07, + "loss": 0.3853, + "step": 9310 + }, + { + "epoch": 4.4023640661938535, + "grad_norm": 3.148033857345581, + "learning_rate": 8.372424157830281e-07, + "loss": 0.41, + "step": 9311 + }, + { + "epoch": 4.402836879432624, + "grad_norm": 3.1810758113861084, + "learning_rate": 8.367766237603245e-07, + "loss": 0.3536, + "step": 9312 + }, + { + "epoch": 4.403309692671395, + "grad_norm": 3.110158920288086, + "learning_rate": 8.363109352992474e-07, + "loss": 0.3453, + "step": 9313 + }, + { + "epoch": 4.403782505910166, + "grad_norm": 3.140287399291992, + "learning_rate": 8.358453504287934e-07, + "loss": 0.3617, + "step": 9314 + }, + { + "epoch": 4.404255319148936, + "grad_norm": 3.0819156169891357, + "learning_rate": 8.353798691779516e-07, + "loss": 
0.3033, + "step": 9315 + }, + { + "epoch": 4.4047281323877066, + "grad_norm": 3.167506217956543, + "learning_rate": 8.349144915757071e-07, + "loss": 0.325, + "step": 9316 + }, + { + "epoch": 4.405200945626477, + "grad_norm": 3.0423221588134766, + "learning_rate": 8.34449217651035e-07, + "loss": 0.3213, + "step": 9317 + }, + { + "epoch": 4.405673758865248, + "grad_norm": 3.4442083835601807, + "learning_rate": 8.339840474329078e-07, + "loss": 0.3493, + "step": 9318 + }, + { + "epoch": 4.406146572104019, + "grad_norm": 3.2931764125823975, + "learning_rate": 8.335189809502886e-07, + "loss": 0.3947, + "step": 9319 + }, + { + "epoch": 4.40661938534279, + "grad_norm": 3.217146396636963, + "learning_rate": 8.330540182321345e-07, + "loss": 0.3622, + "step": 9320 + }, + { + "epoch": 4.4070921985815605, + "grad_norm": 3.0024805068969727, + "learning_rate": 8.325891593073981e-07, + "loss": 0.3333, + "step": 9321 + }, + { + "epoch": 4.407565011820331, + "grad_norm": 2.9128856658935547, + "learning_rate": 8.321244042050225e-07, + "loss": 0.3069, + "step": 9322 + }, + { + "epoch": 4.408037825059101, + "grad_norm": 3.1456804275512695, + "learning_rate": 8.316597529539477e-07, + "loss": 0.3749, + "step": 9323 + }, + { + "epoch": 4.408510638297872, + "grad_norm": 3.093379497528076, + "learning_rate": 8.31195205583105e-07, + "loss": 0.3899, + "step": 9324 + }, + { + "epoch": 4.408983451536643, + "grad_norm": 2.95357608795166, + "learning_rate": 8.307307621214181e-07, + "loss": 0.3525, + "step": 9325 + }, + { + "epoch": 4.409456264775414, + "grad_norm": 3.0832929611206055, + "learning_rate": 8.30266422597808e-07, + "loss": 0.3976, + "step": 9326 + }, + { + "epoch": 4.409929078014184, + "grad_norm": 3.203678846359253, + "learning_rate": 8.298021870411862e-07, + "loss": 0.3954, + "step": 9327 + }, + { + "epoch": 4.410401891252955, + "grad_norm": 3.0880157947540283, + "learning_rate": 8.293380554804586e-07, + "loss": 0.3153, + "step": 9328 + }, + { + "epoch": 4.410874704491726, + "grad_norm": 2.9109299182891846, + "learning_rate": 8.28874027944524e-07, + "loss": 0.3526, + "step": 9329 + }, + { + "epoch": 4.411347517730497, + "grad_norm": 3.4241647720336914, + "learning_rate": 8.284101044622767e-07, + "loss": 0.3288, + "step": 9330 + }, + { + "epoch": 4.411820330969267, + "grad_norm": 3.110163450241089, + "learning_rate": 8.279462850626024e-07, + "loss": 0.314, + "step": 9331 + }, + { + "epoch": 4.4122931442080375, + "grad_norm": 3.024353504180908, + "learning_rate": 8.274825697743805e-07, + "loss": 0.3107, + "step": 9332 + }, + { + "epoch": 4.412765957446808, + "grad_norm": 2.8271758556365967, + "learning_rate": 8.270189586264859e-07, + "loss": 0.3339, + "step": 9333 + }, + { + "epoch": 4.413238770685579, + "grad_norm": 3.179032325744629, + "learning_rate": 8.265554516477853e-07, + "loss": 0.3365, + "step": 9334 + }, + { + "epoch": 4.41371158392435, + "grad_norm": 2.9746336936950684, + "learning_rate": 8.260920488671376e-07, + "loss": 0.3127, + "step": 9335 + }, + { + "epoch": 4.414184397163121, + "grad_norm": 3.247529983520508, + "learning_rate": 8.256287503133992e-07, + "loss": 0.3251, + "step": 9336 + }, + { + "epoch": 4.4146572104018915, + "grad_norm": 3.3317253589630127, + "learning_rate": 8.251655560154168e-07, + "loss": 0.3246, + "step": 9337 + }, + { + "epoch": 4.415130023640662, + "grad_norm": 2.9890010356903076, + "learning_rate": 8.247024660020303e-07, + "loss": 0.3317, + "step": 9338 + }, + { + "epoch": 4.415602836879432, + "grad_norm": 3.3956406116485596, + "learning_rate": 8.242394803020759e-07, + 
"loss": 0.4055, + "step": 9339 + }, + { + "epoch": 4.416075650118203, + "grad_norm": 2.9918906688690186, + "learning_rate": 8.237765989443805e-07, + "loss": 0.3415, + "step": 9340 + }, + { + "epoch": 4.416548463356974, + "grad_norm": 3.4310927391052246, + "learning_rate": 8.233138219577671e-07, + "loss": 0.3654, + "step": 9341 + }, + { + "epoch": 4.417021276595745, + "grad_norm": 3.207947254180908, + "learning_rate": 8.2285114937105e-07, + "loss": 0.3773, + "step": 9342 + }, + { + "epoch": 4.417494089834515, + "grad_norm": 3.202953338623047, + "learning_rate": 8.223885812130367e-07, + "loss": 0.3476, + "step": 9343 + }, + { + "epoch": 4.417966903073286, + "grad_norm": 3.160951614379883, + "learning_rate": 8.219261175125315e-07, + "loss": 0.3583, + "step": 9344 + }, + { + "epoch": 4.418439716312057, + "grad_norm": 2.892636775970459, + "learning_rate": 8.214637582983284e-07, + "loss": 0.3241, + "step": 9345 + }, + { + "epoch": 4.418912529550828, + "grad_norm": 2.830085277557373, + "learning_rate": 8.210015035992172e-07, + "loss": 0.2921, + "step": 9346 + }, + { + "epoch": 4.419385342789598, + "grad_norm": 3.2846477031707764, + "learning_rate": 8.205393534439801e-07, + "loss": 0.4281, + "step": 9347 + }, + { + "epoch": 4.4198581560283685, + "grad_norm": 2.6153810024261475, + "learning_rate": 8.200773078613924e-07, + "loss": 0.2848, + "step": 9348 + }, + { + "epoch": 4.420330969267139, + "grad_norm": 3.0541396141052246, + "learning_rate": 8.196153668802253e-07, + "loss": 0.3619, + "step": 9349 + }, + { + "epoch": 4.42080378250591, + "grad_norm": 3.516235589981079, + "learning_rate": 8.191535305292406e-07, + "loss": 0.3996, + "step": 9350 + }, + { + "epoch": 4.421276595744681, + "grad_norm": 3.2205963134765625, + "learning_rate": 8.186917988371956e-07, + "loss": 0.3219, + "step": 9351 + }, + { + "epoch": 4.421749408983452, + "grad_norm": 3.2431082725524902, + "learning_rate": 8.1823017183284e-07, + "loss": 0.3033, + "step": 9352 + }, + { + "epoch": 4.4222222222222225, + "grad_norm": 3.337085485458374, + "learning_rate": 8.177686495449166e-07, + "loss": 0.3467, + "step": 9353 + }, + { + "epoch": 4.422695035460993, + "grad_norm": 3.2539291381835938, + "learning_rate": 8.173072320021641e-07, + "loss": 0.3565, + "step": 9354 + }, + { + "epoch": 4.423167848699763, + "grad_norm": 3.069993734359741, + "learning_rate": 8.168459192333105e-07, + "loss": 0.3627, + "step": 9355 + }, + { + "epoch": 4.423640661938534, + "grad_norm": 3.068195104598999, + "learning_rate": 8.163847112670826e-07, + "loss": 0.2945, + "step": 9356 + }, + { + "epoch": 4.424113475177305, + "grad_norm": 3.273607015609741, + "learning_rate": 8.159236081321959e-07, + "loss": 0.3451, + "step": 9357 + }, + { + "epoch": 4.424586288416076, + "grad_norm": 3.355647325515747, + "learning_rate": 8.154626098573607e-07, + "loss": 0.3469, + "step": 9358 + }, + { + "epoch": 4.425059101654846, + "grad_norm": 2.792948007583618, + "learning_rate": 8.150017164712831e-07, + "loss": 0.3393, + "step": 9359 + }, + { + "epoch": 4.425531914893617, + "grad_norm": 3.031167507171631, + "learning_rate": 8.145409280026607e-07, + "loss": 0.2924, + "step": 9360 + }, + { + "epoch": 4.426004728132388, + "grad_norm": 3.0713601112365723, + "learning_rate": 8.140802444801835e-07, + "loss": 0.3221, + "step": 9361 + }, + { + "epoch": 4.426477541371159, + "grad_norm": 3.225785493850708, + "learning_rate": 8.136196659325374e-07, + "loss": 0.3626, + "step": 9362 + }, + { + "epoch": 4.426950354609929, + "grad_norm": 2.9779045581817627, + "learning_rate": 
8.131591923883991e-07, + "loss": 0.3329, + "step": 9363 + }, + { + "epoch": 4.4274231678486995, + "grad_norm": 3.534536600112915, + "learning_rate": 8.126988238764422e-07, + "loss": 0.4029, + "step": 9364 + }, + { + "epoch": 4.42789598108747, + "grad_norm": 3.4237616062164307, + "learning_rate": 8.122385604253311e-07, + "loss": 0.3763, + "step": 9365 + }, + { + "epoch": 4.428368794326241, + "grad_norm": 2.8711681365966797, + "learning_rate": 8.117784020637231e-07, + "loss": 0.3141, + "step": 9366 + }, + { + "epoch": 4.428841607565012, + "grad_norm": 3.0277621746063232, + "learning_rate": 8.113183488202725e-07, + "loss": 0.3848, + "step": 9367 + }, + { + "epoch": 4.429314420803783, + "grad_norm": 3.1275761127471924, + "learning_rate": 8.108584007236226e-07, + "loss": 0.3343, + "step": 9368 + }, + { + "epoch": 4.4297872340425535, + "grad_norm": 3.2320117950439453, + "learning_rate": 8.103985578024143e-07, + "loss": 0.3211, + "step": 9369 + }, + { + "epoch": 4.430260047281324, + "grad_norm": 3.258829355239868, + "learning_rate": 8.099388200852792e-07, + "loss": 0.3473, + "step": 9370 + }, + { + "epoch": 4.430732860520094, + "grad_norm": 2.9609436988830566, + "learning_rate": 8.094791876008423e-07, + "loss": 0.318, + "step": 9371 + }, + { + "epoch": 4.431205673758865, + "grad_norm": 3.128053665161133, + "learning_rate": 8.090196603777245e-07, + "loss": 0.3372, + "step": 9372 + }, + { + "epoch": 4.431678486997636, + "grad_norm": 3.013979196548462, + "learning_rate": 8.085602384445368e-07, + "loss": 0.3098, + "step": 9373 + }, + { + "epoch": 4.432151300236407, + "grad_norm": 3.603433132171631, + "learning_rate": 8.081009218298871e-07, + "loss": 0.4016, + "step": 9374 + }, + { + "epoch": 4.432624113475177, + "grad_norm": 2.687730312347412, + "learning_rate": 8.076417105623743e-07, + "loss": 0.3173, + "step": 9375 + }, + { + "epoch": 4.433096926713948, + "grad_norm": 3.3575692176818848, + "learning_rate": 8.071826046705913e-07, + "loss": 0.3173, + "step": 9376 + }, + { + "epoch": 4.433569739952719, + "grad_norm": 3.3599679470062256, + "learning_rate": 8.06723604183125e-07, + "loss": 0.3466, + "step": 9377 + }, + { + "epoch": 4.43404255319149, + "grad_norm": 3.101696014404297, + "learning_rate": 8.062647091285542e-07, + "loss": 0.3322, + "step": 9378 + }, + { + "epoch": 4.43451536643026, + "grad_norm": 3.0010359287261963, + "learning_rate": 8.05805919535454e-07, + "loss": 0.3863, + "step": 9379 + }, + { + "epoch": 4.4349881796690305, + "grad_norm": 3.1428821086883545, + "learning_rate": 8.053472354323902e-07, + "loss": 0.3402, + "step": 9380 + }, + { + "epoch": 4.435460992907801, + "grad_norm": 3.416954517364502, + "learning_rate": 8.048886568479222e-07, + "loss": 0.3637, + "step": 9381 + }, + { + "epoch": 4.435933806146572, + "grad_norm": 3.015092611312866, + "learning_rate": 8.044301838106059e-07, + "loss": 0.3371, + "step": 9382 + }, + { + "epoch": 4.436406619385343, + "grad_norm": 2.6680097579956055, + "learning_rate": 8.039718163489862e-07, + "loss": 0.3148, + "step": 9383 + }, + { + "epoch": 4.436879432624114, + "grad_norm": 3.098219633102417, + "learning_rate": 8.035135544916056e-07, + "loss": 0.3348, + "step": 9384 + }, + { + "epoch": 4.4373522458628845, + "grad_norm": 3.5400390625, + "learning_rate": 8.030553982669969e-07, + "loss": 0.4211, + "step": 9385 + }, + { + "epoch": 4.437825059101655, + "grad_norm": 3.6074001789093018, + "learning_rate": 8.025973477036872e-07, + "loss": 0.3179, + "step": 9386 + }, + { + "epoch": 4.438297872340425, + "grad_norm": 3.111982583999634, + 
"learning_rate": 8.021394028301982e-07, + "loss": 0.3414, + "step": 9387 + }, + { + "epoch": 4.438770685579196, + "grad_norm": 3.5494184494018555, + "learning_rate": 8.016815636750439e-07, + "loss": 0.3847, + "step": 9388 + }, + { + "epoch": 4.439243498817967, + "grad_norm": 3.3602912425994873, + "learning_rate": 8.012238302667308e-07, + "loss": 0.3293, + "step": 9389 + }, + { + "epoch": 4.439716312056738, + "grad_norm": 3.1263039112091064, + "learning_rate": 8.007662026337617e-07, + "loss": 0.3675, + "step": 9390 + }, + { + "epoch": 4.440189125295508, + "grad_norm": 3.239863157272339, + "learning_rate": 8.003086808046304e-07, + "loss": 0.3445, + "step": 9391 + }, + { + "epoch": 4.440661938534279, + "grad_norm": 2.8646275997161865, + "learning_rate": 7.998512648078244e-07, + "loss": 0.2715, + "step": 9392 + }, + { + "epoch": 4.44113475177305, + "grad_norm": 2.9777262210845947, + "learning_rate": 7.993939546718255e-07, + "loss": 0.3143, + "step": 9393 + }, + { + "epoch": 4.441607565011821, + "grad_norm": 3.5436604022979736, + "learning_rate": 7.98936750425107e-07, + "loss": 0.406, + "step": 9394 + }, + { + "epoch": 4.442080378250591, + "grad_norm": 3.1395277976989746, + "learning_rate": 7.984796520961391e-07, + "loss": 0.3893, + "step": 9395 + }, + { + "epoch": 4.4425531914893615, + "grad_norm": 3.6379475593566895, + "learning_rate": 7.980226597133814e-07, + "loss": 0.3878, + "step": 9396 + }, + { + "epoch": 4.443026004728132, + "grad_norm": 3.3866498470306396, + "learning_rate": 7.975657733052908e-07, + "loss": 0.3399, + "step": 9397 + }, + { + "epoch": 4.443498817966903, + "grad_norm": 2.9472098350524902, + "learning_rate": 7.971089929003142e-07, + "loss": 0.3436, + "step": 9398 + }, + { + "epoch": 4.443971631205674, + "grad_norm": 3.314652442932129, + "learning_rate": 7.966523185268929e-07, + "loss": 0.305, + "step": 9399 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 3.8230092525482178, + "learning_rate": 7.961957502134638e-07, + "loss": 0.3632, + "step": 9400 + }, + { + "epoch": 4.444917257683215, + "grad_norm": 3.088292360305786, + "learning_rate": 7.957392879884534e-07, + "loss": 0.3373, + "step": 9401 + }, + { + "epoch": 4.445390070921986, + "grad_norm": 3.1412665843963623, + "learning_rate": 7.952829318802854e-07, + "loss": 0.3703, + "step": 9402 + }, + { + "epoch": 4.445862884160756, + "grad_norm": 3.464963674545288, + "learning_rate": 7.948266819173745e-07, + "loss": 0.3485, + "step": 9403 + }, + { + "epoch": 4.446335697399527, + "grad_norm": 3.2092626094818115, + "learning_rate": 7.943705381281281e-07, + "loss": 0.341, + "step": 9404 + }, + { + "epoch": 4.446808510638298, + "grad_norm": 2.5458641052246094, + "learning_rate": 7.939145005409502e-07, + "loss": 0.3059, + "step": 9405 + }, + { + "epoch": 4.4472813238770685, + "grad_norm": 4.1484150886535645, + "learning_rate": 7.934585691842353e-07, + "loss": 0.3405, + "step": 9406 + }, + { + "epoch": 4.447754137115839, + "grad_norm": 2.9794130325317383, + "learning_rate": 7.930027440863716e-07, + "loss": 0.359, + "step": 9407 + }, + { + "epoch": 4.44822695035461, + "grad_norm": 2.951674222946167, + "learning_rate": 7.92547025275743e-07, + "loss": 0.3087, + "step": 9408 + }, + { + "epoch": 4.448699763593381, + "grad_norm": 3.2857377529144287, + "learning_rate": 7.920914127807241e-07, + "loss": 0.3327, + "step": 9409 + }, + { + "epoch": 4.449172576832151, + "grad_norm": 3.4455840587615967, + "learning_rate": 7.916359066296839e-07, + "loss": 0.3731, + "step": 9410 + }, + { + "epoch": 4.449645390070922, + "grad_norm": 
3.392779588699341, + "learning_rate": 7.911805068509848e-07, + "loss": 0.3559, + "step": 9411 + }, + { + "epoch": 4.450118203309692, + "grad_norm": 3.3362300395965576, + "learning_rate": 7.90725213472982e-07, + "loss": 0.333, + "step": 9412 + }, + { + "epoch": 4.450591016548463, + "grad_norm": 3.0671608448028564, + "learning_rate": 7.902700265240259e-07, + "loss": 0.3342, + "step": 9413 + }, + { + "epoch": 4.451063829787234, + "grad_norm": 2.8350744247436523, + "learning_rate": 7.898149460324575e-07, + "loss": 0.3157, + "step": 9414 + }, + { + "epoch": 4.451536643026005, + "grad_norm": 2.932446241378784, + "learning_rate": 7.893599720266143e-07, + "loss": 0.379, + "step": 9415 + }, + { + "epoch": 4.452009456264776, + "grad_norm": 3.354112386703491, + "learning_rate": 7.889051045348245e-07, + "loss": 0.3001, + "step": 9416 + }, + { + "epoch": 4.452482269503546, + "grad_norm": 3.068276882171631, + "learning_rate": 7.884503435854104e-07, + "loss": 0.3466, + "step": 9417 + }, + { + "epoch": 4.452955082742317, + "grad_norm": 2.833534002304077, + "learning_rate": 7.879956892066892e-07, + "loss": 0.3278, + "step": 9418 + }, + { + "epoch": 4.453427895981087, + "grad_norm": 2.9622433185577393, + "learning_rate": 7.875411414269687e-07, + "loss": 0.3725, + "step": 9419 + }, + { + "epoch": 4.453900709219858, + "grad_norm": 3.2055954933166504, + "learning_rate": 7.870867002745533e-07, + "loss": 0.3215, + "step": 9420 + }, + { + "epoch": 4.454373522458629, + "grad_norm": 2.877063274383545, + "learning_rate": 7.86632365777738e-07, + "loss": 0.2845, + "step": 9421 + }, + { + "epoch": 4.4548463356973995, + "grad_norm": 3.2809367179870605, + "learning_rate": 7.861781379648117e-07, + "loss": 0.39, + "step": 9422 + }, + { + "epoch": 4.45531914893617, + "grad_norm": 3.404816150665283, + "learning_rate": 7.857240168640587e-07, + "loss": 0.3003, + "step": 9423 + }, + { + "epoch": 4.455791962174941, + "grad_norm": 3.367253303527832, + "learning_rate": 7.85270002503754e-07, + "loss": 0.3414, + "step": 9424 + }, + { + "epoch": 4.456264775413712, + "grad_norm": 3.1247670650482178, + "learning_rate": 7.848160949121678e-07, + "loss": 0.2922, + "step": 9425 + }, + { + "epoch": 4.456737588652482, + "grad_norm": 3.474435806274414, + "learning_rate": 7.843622941175624e-07, + "loss": 0.3601, + "step": 9426 + }, + { + "epoch": 4.457210401891253, + "grad_norm": 3.0552384853363037, + "learning_rate": 7.839086001481933e-07, + "loss": 0.3905, + "step": 9427 + }, + { + "epoch": 4.457683215130023, + "grad_norm": 3.3532586097717285, + "learning_rate": 7.834550130323115e-07, + "loss": 0.3783, + "step": 9428 + }, + { + "epoch": 4.458156028368794, + "grad_norm": 3.7321903705596924, + "learning_rate": 7.830015327981585e-07, + "loss": 0.3765, + "step": 9429 + }, + { + "epoch": 4.458628841607565, + "grad_norm": 3.070158004760742, + "learning_rate": 7.82548159473972e-07, + "loss": 0.3279, + "step": 9430 + }, + { + "epoch": 4.459101654846336, + "grad_norm": 3.498399257659912, + "learning_rate": 7.820948930879807e-07, + "loss": 0.3864, + "step": 9431 + }, + { + "epoch": 4.459574468085107, + "grad_norm": 3.0352776050567627, + "learning_rate": 7.816417336684071e-07, + "loss": 0.2963, + "step": 9432 + }, + { + "epoch": 4.460047281323877, + "grad_norm": 3.190154790878296, + "learning_rate": 7.811886812434686e-07, + "loss": 0.354, + "step": 9433 + }, + { + "epoch": 4.460520094562648, + "grad_norm": 3.1933085918426514, + "learning_rate": 7.807357358413742e-07, + "loss": 0.3613, + "step": 9434 + }, + { + "epoch": 4.460992907801418, + "grad_norm": 
4.0385637283325195, + "learning_rate": 7.80282897490326e-07, + "loss": 0.3257, + "step": 9435 + }, + { + "epoch": 4.461465721040189, + "grad_norm": 3.365485191345215, + "learning_rate": 7.798301662185218e-07, + "loss": 0.3093, + "step": 9436 + }, + { + "epoch": 4.46193853427896, + "grad_norm": 3.5345213413238525, + "learning_rate": 7.793775420541497e-07, + "loss": 0.3262, + "step": 9437 + }, + { + "epoch": 4.4624113475177305, + "grad_norm": 3.2894418239593506, + "learning_rate": 7.789250250253941e-07, + "loss": 0.3417, + "step": 9438 + }, + { + "epoch": 4.462884160756501, + "grad_norm": 2.972001791000366, + "learning_rate": 7.784726151604305e-07, + "loss": 0.3396, + "step": 9439 + }, + { + "epoch": 4.463356973995272, + "grad_norm": 3.161794424057007, + "learning_rate": 7.780203124874283e-07, + "loss": 0.3583, + "step": 9440 + }, + { + "epoch": 4.463829787234043, + "grad_norm": 3.0976521968841553, + "learning_rate": 7.775681170345508e-07, + "loss": 0.3743, + "step": 9441 + }, + { + "epoch": 4.464302600472813, + "grad_norm": 3.1454756259918213, + "learning_rate": 7.771160288299534e-07, + "loss": 0.3483, + "step": 9442 + }, + { + "epoch": 4.464775413711584, + "grad_norm": 3.467618942260742, + "learning_rate": 7.766640479017868e-07, + "loss": 0.3253, + "step": 9443 + }, + { + "epoch": 4.465248226950354, + "grad_norm": 3.3349552154541016, + "learning_rate": 7.762121742781933e-07, + "loss": 0.3579, + "step": 9444 + }, + { + "epoch": 4.465721040189125, + "grad_norm": 3.442701578140259, + "learning_rate": 7.757604079873085e-07, + "loss": 0.3854, + "step": 9445 + }, + { + "epoch": 4.466193853427896, + "grad_norm": 3.3095569610595703, + "learning_rate": 7.753087490572633e-07, + "loss": 0.3385, + "step": 9446 + }, + { + "epoch": 4.466666666666667, + "grad_norm": 3.0978634357452393, + "learning_rate": 7.748571975161786e-07, + "loss": 0.3511, + "step": 9447 + }, + { + "epoch": 4.467139479905438, + "grad_norm": 2.9801225662231445, + "learning_rate": 7.744057533921731e-07, + "loss": 0.3239, + "step": 9448 + }, + { + "epoch": 4.467612293144208, + "grad_norm": 3.116586923599243, + "learning_rate": 7.739544167133545e-07, + "loss": 0.3786, + "step": 9449 + }, + { + "epoch": 4.468085106382979, + "grad_norm": 3.2235381603240967, + "learning_rate": 7.73503187507825e-07, + "loss": 0.3285, + "step": 9450 + }, + { + "epoch": 4.468557919621749, + "grad_norm": 3.175649404525757, + "learning_rate": 7.730520658036825e-07, + "loss": 0.315, + "step": 9451 + }, + { + "epoch": 4.46903073286052, + "grad_norm": 3.013848066329956, + "learning_rate": 7.726010516290144e-07, + "loss": 0.3533, + "step": 9452 + }, + { + "epoch": 4.469503546099291, + "grad_norm": 2.87581467628479, + "learning_rate": 7.721501450119057e-07, + "loss": 0.2948, + "step": 9453 + }, + { + "epoch": 4.4699763593380615, + "grad_norm": 3.504119873046875, + "learning_rate": 7.716993459804306e-07, + "loss": 0.354, + "step": 9454 + }, + { + "epoch": 4.470449172576832, + "grad_norm": 3.2914042472839355, + "learning_rate": 7.712486545626591e-07, + "loss": 0.3724, + "step": 9455 + }, + { + "epoch": 4.470921985815603, + "grad_norm": 3.007551908493042, + "learning_rate": 7.707980707866533e-07, + "loss": 0.3923, + "step": 9456 + }, + { + "epoch": 4.471394799054374, + "grad_norm": 3.2758076190948486, + "learning_rate": 7.703475946804687e-07, + "loss": 0.4092, + "step": 9457 + }, + { + "epoch": 4.471867612293144, + "grad_norm": 3.265875816345215, + "learning_rate": 7.698972262721557e-07, + "loss": 0.4249, + "step": 9458 + }, + { + "epoch": 4.472340425531915, + 
"grad_norm": 3.0962677001953125, + "learning_rate": 7.694469655897565e-07, + "loss": 0.3273, + "step": 9459 + }, + { + "epoch": 4.472813238770685, + "grad_norm": 3.2247416973114014, + "learning_rate": 7.689968126613053e-07, + "loss": 0.366, + "step": 9460 + }, + { + "epoch": 4.473286052009456, + "grad_norm": 3.326211929321289, + "learning_rate": 7.685467675148334e-07, + "loss": 0.4044, + "step": 9461 + }, + { + "epoch": 4.473758865248227, + "grad_norm": 2.9795444011688232, + "learning_rate": 7.68096830178362e-07, + "loss": 0.3763, + "step": 9462 + }, + { + "epoch": 4.474231678486998, + "grad_norm": 3.0721724033355713, + "learning_rate": 7.676470006799061e-07, + "loss": 0.3586, + "step": 9463 + }, + { + "epoch": 4.474704491725769, + "grad_norm": 3.1191349029541016, + "learning_rate": 7.67197279047476e-07, + "loss": 0.3111, + "step": 9464 + }, + { + "epoch": 4.475177304964539, + "grad_norm": 3.2980053424835205, + "learning_rate": 7.667476653090727e-07, + "loss": 0.3413, + "step": 9465 + }, + { + "epoch": 4.47565011820331, + "grad_norm": 3.159794807434082, + "learning_rate": 7.662981594926927e-07, + "loss": 0.3559, + "step": 9466 + }, + { + "epoch": 4.47612293144208, + "grad_norm": 2.9250876903533936, + "learning_rate": 7.658487616263244e-07, + "loss": 0.3582, + "step": 9467 + }, + { + "epoch": 4.476595744680851, + "grad_norm": 2.915234088897705, + "learning_rate": 7.65399471737949e-07, + "loss": 0.3466, + "step": 9468 + }, + { + "epoch": 4.477068557919622, + "grad_norm": 3.2557425498962402, + "learning_rate": 7.649502898555431e-07, + "loss": 0.3772, + "step": 9469 + }, + { + "epoch": 4.4775413711583925, + "grad_norm": 3.1906673908233643, + "learning_rate": 7.645012160070748e-07, + "loss": 0.3379, + "step": 9470 + }, + { + "epoch": 4.478014184397163, + "grad_norm": 3.1513144969940186, + "learning_rate": 7.640522502205056e-07, + "loss": 0.3529, + "step": 9471 + }, + { + "epoch": 4.478486997635934, + "grad_norm": 3.1969199180603027, + "learning_rate": 7.636033925237904e-07, + "loss": 0.363, + "step": 9472 + }, + { + "epoch": 4.478959810874705, + "grad_norm": 3.4546799659729004, + "learning_rate": 7.631546429448785e-07, + "loss": 0.2944, + "step": 9473 + }, + { + "epoch": 4.479432624113475, + "grad_norm": 3.102057456970215, + "learning_rate": 7.627060015117116e-07, + "loss": 0.3309, + "step": 9474 + }, + { + "epoch": 4.479905437352246, + "grad_norm": 3.125751495361328, + "learning_rate": 7.622574682522232e-07, + "loss": 0.3576, + "step": 9475 + }, + { + "epoch": 4.480378250591016, + "grad_norm": 3.071798324584961, + "learning_rate": 7.618090431943432e-07, + "loss": 0.3298, + "step": 9476 + }, + { + "epoch": 4.480851063829787, + "grad_norm": 3.0060672760009766, + "learning_rate": 7.613607263659922e-07, + "loss": 0.3528, + "step": 9477 + }, + { + "epoch": 4.481323877068558, + "grad_norm": 3.254667043685913, + "learning_rate": 7.609125177950846e-07, + "loss": 0.3701, + "step": 9478 + }, + { + "epoch": 4.481796690307329, + "grad_norm": 3.282247304916382, + "learning_rate": 7.604644175095293e-07, + "loss": 0.4025, + "step": 9479 + }, + { + "epoch": 4.4822695035460995, + "grad_norm": 3.231097936630249, + "learning_rate": 7.600164255372266e-07, + "loss": 0.3395, + "step": 9480 + }, + { + "epoch": 4.48274231678487, + "grad_norm": 3.08368182182312, + "learning_rate": 7.595685419060722e-07, + "loss": 0.3356, + "step": 9481 + }, + { + "epoch": 4.48321513002364, + "grad_norm": 3.0406503677368164, + "learning_rate": 7.591207666439532e-07, + "loss": 0.2851, + "step": 9482 + }, + { + "epoch": 
4.483687943262411, + "grad_norm": 3.021157741546631, + "learning_rate": 7.586730997787495e-07, + "loss": 0.3691, + "step": 9483 + }, + { + "epoch": 4.484160756501182, + "grad_norm": 2.8793535232543945, + "learning_rate": 7.582255413383375e-07, + "loss": 0.332, + "step": 9484 + }, + { + "epoch": 4.484633569739953, + "grad_norm": 3.319021224975586, + "learning_rate": 7.577780913505833e-07, + "loss": 0.3489, + "step": 9485 + }, + { + "epoch": 4.485106382978723, + "grad_norm": 3.0267672538757324, + "learning_rate": 7.573307498433472e-07, + "loss": 0.2989, + "step": 9486 + }, + { + "epoch": 4.485579196217494, + "grad_norm": 2.8953561782836914, + "learning_rate": 7.568835168444849e-07, + "loss": 0.306, + "step": 9487 + }, + { + "epoch": 4.486052009456265, + "grad_norm": 3.0559732913970947, + "learning_rate": 7.564363923818424e-07, + "loss": 0.3122, + "step": 9488 + }, + { + "epoch": 4.486524822695036, + "grad_norm": 3.369352340698242, + "learning_rate": 7.559893764832607e-07, + "loss": 0.3999, + "step": 9489 + }, + { + "epoch": 4.486997635933806, + "grad_norm": 3.3339598178863525, + "learning_rate": 7.555424691765731e-07, + "loss": 0.3566, + "step": 9490 + }, + { + "epoch": 4.4874704491725765, + "grad_norm": 3.6563758850097656, + "learning_rate": 7.550956704896062e-07, + "loss": 0.3686, + "step": 9491 + }, + { + "epoch": 4.487943262411347, + "grad_norm": 3.492706537246704, + "learning_rate": 7.546489804501811e-07, + "loss": 0.3382, + "step": 9492 + }, + { + "epoch": 4.488416075650118, + "grad_norm": 3.1645941734313965, + "learning_rate": 7.542023990861106e-07, + "loss": 0.3583, + "step": 9493 + }, + { + "epoch": 4.488888888888889, + "grad_norm": 2.9827258586883545, + "learning_rate": 7.537559264252021e-07, + "loss": 0.3243, + "step": 9494 + }, + { + "epoch": 4.48936170212766, + "grad_norm": 3.2876698970794678, + "learning_rate": 7.533095624952547e-07, + "loss": 0.373, + "step": 9495 + }, + { + "epoch": 4.4898345153664305, + "grad_norm": 3.110868453979492, + "learning_rate": 7.528633073240616e-07, + "loss": 0.351, + "step": 9496 + }, + { + "epoch": 4.490307328605201, + "grad_norm": 3.3962604999542236, + "learning_rate": 7.524171609394099e-07, + "loss": 0.3686, + "step": 9497 + }, + { + "epoch": 4.490780141843971, + "grad_norm": 3.272610902786255, + "learning_rate": 7.519711233690777e-07, + "loss": 0.3667, + "step": 9498 + }, + { + "epoch": 4.491252955082742, + "grad_norm": 3.0907654762268066, + "learning_rate": 7.515251946408398e-07, + "loss": 0.3247, + "step": 9499 + }, + { + "epoch": 4.491725768321513, + "grad_norm": 4.225870609283447, + "learning_rate": 7.510793747824613e-07, + "loss": 0.3636, + "step": 9500 + }, + { + "epoch": 4.492198581560284, + "grad_norm": 3.1911606788635254, + "learning_rate": 7.506336638217004e-07, + "loss": 0.3661, + "step": 9501 + }, + { + "epoch": 4.492671394799054, + "grad_norm": 2.907573699951172, + "learning_rate": 7.501880617863114e-07, + "loss": 0.3285, + "step": 9502 + }, + { + "epoch": 4.493144208037825, + "grad_norm": 3.388460397720337, + "learning_rate": 7.497425687040388e-07, + "loss": 0.3354, + "step": 9503 + }, + { + "epoch": 4.493617021276596, + "grad_norm": 2.7236225605010986, + "learning_rate": 7.49297184602622e-07, + "loss": 0.3389, + "step": 9504 + }, + { + "epoch": 4.494089834515367, + "grad_norm": 3.1962947845458984, + "learning_rate": 7.488519095097929e-07, + "loss": 0.3377, + "step": 9505 + }, + { + "epoch": 4.494562647754137, + "grad_norm": 2.936845541000366, + "learning_rate": 7.484067434532763e-07, + "loss": 0.2893, + "step": 9506 + }, + { + 
"epoch": 4.4950354609929075, + "grad_norm": 2.8567588329315186, + "learning_rate": 7.47961686460792e-07, + "loss": 0.3026, + "step": 9507 + }, + { + "epoch": 4.495508274231678, + "grad_norm": 3.3522651195526123, + "learning_rate": 7.475167385600507e-07, + "loss": 0.3517, + "step": 9508 + }, + { + "epoch": 4.495981087470449, + "grad_norm": 3.338757276535034, + "learning_rate": 7.470718997787572e-07, + "loss": 0.3224, + "step": 9509 + }, + { + "epoch": 4.49645390070922, + "grad_norm": 3.1484947204589844, + "learning_rate": 7.466271701446107e-07, + "loss": 0.3872, + "step": 9510 + }, + { + "epoch": 4.496926713947991, + "grad_norm": 3.3275411128997803, + "learning_rate": 7.461825496853012e-07, + "loss": 0.3287, + "step": 9511 + }, + { + "epoch": 4.4973995271867615, + "grad_norm": 3.101416826248169, + "learning_rate": 7.457380384285151e-07, + "loss": 0.3223, + "step": 9512 + }, + { + "epoch": 4.497872340425532, + "grad_norm": 2.761810779571533, + "learning_rate": 7.45293636401929e-07, + "loss": 0.3264, + "step": 9513 + }, + { + "epoch": 4.498345153664302, + "grad_norm": 3.215078592300415, + "learning_rate": 7.448493436332132e-07, + "loss": 0.3548, + "step": 9514 + }, + { + "epoch": 4.498817966903073, + "grad_norm": 3.00111722946167, + "learning_rate": 7.444051601500335e-07, + "loss": 0.3271, + "step": 9515 + }, + { + "epoch": 4.499290780141844, + "grad_norm": 3.2428977489471436, + "learning_rate": 7.439610859800456e-07, + "loss": 0.3024, + "step": 9516 + }, + { + "epoch": 4.499763593380615, + "grad_norm": 2.7977585792541504, + "learning_rate": 7.435171211509018e-07, + "loss": 0.334, + "step": 9517 + }, + { + "epoch": 4.500236406619385, + "grad_norm": 3.273468494415283, + "learning_rate": 7.430732656902447e-07, + "loss": 0.337, + "step": 9518 + }, + { + "epoch": 4.500709219858156, + "grad_norm": 4.663364410400391, + "learning_rate": 7.426295196257116e-07, + "loss": 0.3492, + "step": 9519 + }, + { + "epoch": 4.501182033096927, + "grad_norm": 3.1147210597991943, + "learning_rate": 7.421858829849327e-07, + "loss": 0.3273, + "step": 9520 + }, + { + "epoch": 4.501654846335697, + "grad_norm": 3.1411445140838623, + "learning_rate": 7.4174235579553e-07, + "loss": 0.3413, + "step": 9521 + }, + { + "epoch": 4.502127659574468, + "grad_norm": 3.0361053943634033, + "learning_rate": 7.412989380851218e-07, + "loss": 0.2908, + "step": 9522 + }, + { + "epoch": 4.5026004728132385, + "grad_norm": 3.4923086166381836, + "learning_rate": 7.408556298813172e-07, + "loss": 0.3659, + "step": 9523 + }, + { + "epoch": 4.503073286052009, + "grad_norm": 3.6827056407928467, + "learning_rate": 7.40412431211718e-07, + "loss": 0.3485, + "step": 9524 + }, + { + "epoch": 4.50354609929078, + "grad_norm": 3.257322311401367, + "learning_rate": 7.399693421039219e-07, + "loss": 0.3592, + "step": 9525 + }, + { + "epoch": 4.504018912529551, + "grad_norm": 3.515291929244995, + "learning_rate": 7.395263625855167e-07, + "loss": 0.3662, + "step": 9526 + }, + { + "epoch": 4.504491725768322, + "grad_norm": 2.899764060974121, + "learning_rate": 7.390834926840865e-07, + "loss": 0.3564, + "step": 9527 + }, + { + "epoch": 4.5049645390070925, + "grad_norm": 3.2578322887420654, + "learning_rate": 7.386407324272055e-07, + "loss": 0.3074, + "step": 9528 + }, + { + "epoch": 4.505437352245863, + "grad_norm": 3.3826515674591064, + "learning_rate": 7.381980818424419e-07, + "loss": 0.3669, + "step": 9529 + }, + { + "epoch": 4.505910165484633, + "grad_norm": 3.470733404159546, + "learning_rate": 7.377555409573594e-07, + "loss": 0.3905, + "step": 9530 + }, + 
{ + "epoch": 4.506382978723404, + "grad_norm": 3.228917121887207, + "learning_rate": 7.373131097995123e-07, + "loss": 0.3336, + "step": 9531 + }, + { + "epoch": 4.506855791962175, + "grad_norm": 3.2193191051483154, + "learning_rate": 7.368707883964476e-07, + "loss": 0.3285, + "step": 9532 + }, + { + "epoch": 4.507328605200946, + "grad_norm": 3.19169020652771, + "learning_rate": 7.36428576775709e-07, + "loss": 0.4022, + "step": 9533 + }, + { + "epoch": 4.507801418439716, + "grad_norm": 3.1887755393981934, + "learning_rate": 7.359864749648296e-07, + "loss": 0.3749, + "step": 9534 + }, + { + "epoch": 4.508274231678487, + "grad_norm": 3.574314832687378, + "learning_rate": 7.355444829913375e-07, + "loss": 0.3549, + "step": 9535 + }, + { + "epoch": 4.508747044917258, + "grad_norm": 3.3482754230499268, + "learning_rate": 7.351026008827527e-07, + "loss": 0.311, + "step": 9536 + }, + { + "epoch": 4.509219858156028, + "grad_norm": 3.5075576305389404, + "learning_rate": 7.34660828666591e-07, + "loss": 0.3575, + "step": 9537 + }, + { + "epoch": 4.509692671394799, + "grad_norm": 3.233328104019165, + "learning_rate": 7.342191663703588e-07, + "loss": 0.3087, + "step": 9538 + }, + { + "epoch": 4.5101654846335695, + "grad_norm": 3.3704137802124023, + "learning_rate": 7.337776140215555e-07, + "loss": 0.356, + "step": 9539 + }, + { + "epoch": 4.51063829787234, + "grad_norm": 4.084654331207275, + "learning_rate": 7.333361716476761e-07, + "loss": 0.3382, + "step": 9540 + }, + { + "epoch": 4.511111111111111, + "grad_norm": 2.985344886779785, + "learning_rate": 7.32894839276207e-07, + "loss": 0.3476, + "step": 9541 + }, + { + "epoch": 4.511583924349882, + "grad_norm": 3.405877113342285, + "learning_rate": 7.324536169346269e-07, + "loss": 0.3402, + "step": 9542 + }, + { + "epoch": 4.512056737588653, + "grad_norm": 3.0168516635894775, + "learning_rate": 7.320125046504103e-07, + "loss": 0.2936, + "step": 9543 + }, + { + "epoch": 4.5125295508274235, + "grad_norm": 2.9846513271331787, + "learning_rate": 7.315715024510219e-07, + "loss": 0.3165, + "step": 9544 + }, + { + "epoch": 4.513002364066194, + "grad_norm": 3.1661694049835205, + "learning_rate": 7.311306103639224e-07, + "loss": 0.3372, + "step": 9545 + }, + { + "epoch": 4.513475177304964, + "grad_norm": 3.3390371799468994, + "learning_rate": 7.306898284165637e-07, + "loss": 0.3427, + "step": 9546 + }, + { + "epoch": 4.513947990543735, + "grad_norm": 3.1360137462615967, + "learning_rate": 7.302491566363904e-07, + "loss": 0.3786, + "step": 9547 + }, + { + "epoch": 4.514420803782506, + "grad_norm": 3.3442773818969727, + "learning_rate": 7.298085950508427e-07, + "loss": 0.3516, + "step": 9548 + }, + { + "epoch": 4.514893617021277, + "grad_norm": 3.1403257846832275, + "learning_rate": 7.293681436873518e-07, + "loss": 0.32, + "step": 9549 + }, + { + "epoch": 4.515366430260047, + "grad_norm": 5.084080696105957, + "learning_rate": 7.289278025733417e-07, + "loss": 0.3036, + "step": 9550 + }, + { + "epoch": 4.515839243498818, + "grad_norm": 3.055558919906616, + "learning_rate": 7.284875717362322e-07, + "loss": 0.3625, + "step": 9551 + }, + { + "epoch": 4.516312056737589, + "grad_norm": 3.3781931400299072, + "learning_rate": 7.280474512034338e-07, + "loss": 0.3759, + "step": 9552 + }, + { + "epoch": 4.516784869976359, + "grad_norm": 3.5266852378845215, + "learning_rate": 7.27607441002351e-07, + "loss": 0.4057, + "step": 9553 + }, + { + "epoch": 4.51725768321513, + "grad_norm": 3.290174961090088, + "learning_rate": 7.271675411603802e-07, + "loss": 0.3471, + "step": 9554 + }, 
+ { + "epoch": 4.5177304964539005, + "grad_norm": 3.399919033050537, + "learning_rate": 7.267277517049137e-07, + "loss": 0.394, + "step": 9555 + }, + { + "epoch": 4.518203309692671, + "grad_norm": 3.4410784244537354, + "learning_rate": 7.262880726633348e-07, + "loss": 0.4351, + "step": 9556 + }, + { + "epoch": 4.518676122931442, + "grad_norm": 2.7317543029785156, + "learning_rate": 7.258485040630192e-07, + "loss": 0.3039, + "step": 9557 + }, + { + "epoch": 4.519148936170213, + "grad_norm": 3.769446849822998, + "learning_rate": 7.254090459313384e-07, + "loss": 0.4061, + "step": 9558 + }, + { + "epoch": 4.519621749408984, + "grad_norm": 3.5625245571136475, + "learning_rate": 7.249696982956553e-07, + "loss": 0.4148, + "step": 9559 + }, + { + "epoch": 4.520094562647754, + "grad_norm": 3.1534764766693115, + "learning_rate": 7.245304611833248e-07, + "loss": 0.3369, + "step": 9560 + }, + { + "epoch": 4.520567375886525, + "grad_norm": 3.179197311401367, + "learning_rate": 7.240913346216982e-07, + "loss": 0.2798, + "step": 9561 + }, + { + "epoch": 4.521040189125295, + "grad_norm": 3.2608659267425537, + "learning_rate": 7.236523186381162e-07, + "loss": 0.3513, + "step": 9562 + }, + { + "epoch": 4.521513002364066, + "grad_norm": 3.055513381958008, + "learning_rate": 7.232134132599158e-07, + "loss": 0.3098, + "step": 9563 + }, + { + "epoch": 4.521985815602837, + "grad_norm": 3.159937620162964, + "learning_rate": 7.227746185144258e-07, + "loss": 0.3234, + "step": 9564 + }, + { + "epoch": 4.5224586288416075, + "grad_norm": 3.176802635192871, + "learning_rate": 7.22335934428966e-07, + "loss": 0.3547, + "step": 9565 + }, + { + "epoch": 4.522931442080378, + "grad_norm": 3.476203680038452, + "learning_rate": 7.218973610308538e-07, + "loss": 0.3659, + "step": 9566 + }, + { + "epoch": 4.523404255319149, + "grad_norm": 3.277595043182373, + "learning_rate": 7.214588983473964e-07, + "loss": 0.3448, + "step": 9567 + }, + { + "epoch": 4.52387706855792, + "grad_norm": 3.0068325996398926, + "learning_rate": 7.210205464058944e-07, + "loss": 0.3341, + "step": 9568 + }, + { + "epoch": 4.52434988179669, + "grad_norm": 3.3836655616760254, + "learning_rate": 7.205823052336425e-07, + "loss": 0.3253, + "step": 9569 + }, + { + "epoch": 4.524822695035461, + "grad_norm": 3.4170496463775635, + "learning_rate": 7.201441748579271e-07, + "loss": 0.3754, + "step": 9570 + }, + { + "epoch": 4.525295508274231, + "grad_norm": 3.5432863235473633, + "learning_rate": 7.197061553060303e-07, + "loss": 0.3265, + "step": 9571 + }, + { + "epoch": 4.525768321513002, + "grad_norm": 2.900520086288452, + "learning_rate": 7.192682466052243e-07, + "loss": 0.2961, + "step": 9572 + }, + { + "epoch": 4.526241134751773, + "grad_norm": 3.028733491897583, + "learning_rate": 7.188304487827768e-07, + "loss": 0.347, + "step": 9573 + }, + { + "epoch": 4.526713947990544, + "grad_norm": 2.8739330768585205, + "learning_rate": 7.183927618659473e-07, + "loss": 0.3265, + "step": 9574 + }, + { + "epoch": 4.527186761229315, + "grad_norm": 3.4727251529693604, + "learning_rate": 7.179551858819873e-07, + "loss": 0.3882, + "step": 9575 + }, + { + "epoch": 4.527659574468085, + "grad_norm": 2.950634002685547, + "learning_rate": 7.175177208581449e-07, + "loss": 0.2699, + "step": 9576 + }, + { + "epoch": 4.528132387706856, + "grad_norm": 3.035752773284912, + "learning_rate": 7.170803668216572e-07, + "loss": 0.3939, + "step": 9577 + }, + { + "epoch": 4.528605200945626, + "grad_norm": 2.9155373573303223, + "learning_rate": 7.166431237997579e-07, + "loss": 0.3112, + "step": 
9578 + }, + { + "epoch": 4.529078014184397, + "grad_norm": 3.2655560970306396, + "learning_rate": 7.162059918196715e-07, + "loss": 0.3605, + "step": 9579 + }, + { + "epoch": 4.529550827423168, + "grad_norm": 3.0889620780944824, + "learning_rate": 7.157689709086157e-07, + "loss": 0.3333, + "step": 9580 + }, + { + "epoch": 4.5300236406619385, + "grad_norm": 3.193974256515503, + "learning_rate": 7.153320610938031e-07, + "loss": 0.3206, + "step": 9581 + }, + { + "epoch": 4.530496453900709, + "grad_norm": 3.0833280086517334, + "learning_rate": 7.148952624024374e-07, + "loss": 0.3483, + "step": 9582 + }, + { + "epoch": 4.53096926713948, + "grad_norm": 2.8866562843322754, + "learning_rate": 7.144585748617163e-07, + "loss": 0.3147, + "step": 9583 + }, + { + "epoch": 4.531442080378251, + "grad_norm": 3.2411928176879883, + "learning_rate": 7.140219984988305e-07, + "loss": 0.336, + "step": 9584 + }, + { + "epoch": 4.531914893617021, + "grad_norm": 3.0993618965148926, + "learning_rate": 7.13585533340963e-07, + "loss": 0.3599, + "step": 9585 + }, + { + "epoch": 4.532387706855792, + "grad_norm": 3.361176013946533, + "learning_rate": 7.131491794152917e-07, + "loss": 0.3448, + "step": 9586 + }, + { + "epoch": 4.532860520094562, + "grad_norm": 3.0916879177093506, + "learning_rate": 7.12712936748986e-07, + "loss": 0.3479, + "step": 9587 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 3.254135847091675, + "learning_rate": 7.122768053692078e-07, + "loss": 0.3536, + "step": 9588 + }, + { + "epoch": 4.533806146572104, + "grad_norm": 3.120321035385132, + "learning_rate": 7.118407853031148e-07, + "loss": 0.3604, + "step": 9589 + }, + { + "epoch": 4.534278959810875, + "grad_norm": 3.0456507205963135, + "learning_rate": 7.114048765778544e-07, + "loss": 0.3473, + "step": 9590 + }, + { + "epoch": 4.534751773049646, + "grad_norm": 3.7177469730377197, + "learning_rate": 7.109690792205704e-07, + "loss": 0.374, + "step": 9591 + }, + { + "epoch": 4.535224586288416, + "grad_norm": 3.2694458961486816, + "learning_rate": 7.105333932583972e-07, + "loss": 0.4206, + "step": 9592 + }, + { + "epoch": 4.535697399527187, + "grad_norm": 3.506195068359375, + "learning_rate": 7.100978187184624e-07, + "loss": 0.3483, + "step": 9593 + }, + { + "epoch": 4.536170212765957, + "grad_norm": 3.569413661956787, + "learning_rate": 7.096623556278887e-07, + "loss": 0.3389, + "step": 9594 + }, + { + "epoch": 4.536643026004728, + "grad_norm": 3.2686502933502197, + "learning_rate": 7.092270040137886e-07, + "loss": 0.3571, + "step": 9595 + }, + { + "epoch": 4.537115839243499, + "grad_norm": 3.2042582035064697, + "learning_rate": 7.087917639032718e-07, + "loss": 0.3742, + "step": 9596 + }, + { + "epoch": 4.5375886524822695, + "grad_norm": 3.014989137649536, + "learning_rate": 7.083566353234375e-07, + "loss": 0.3294, + "step": 9597 + }, + { + "epoch": 4.53806146572104, + "grad_norm": 3.4535064697265625, + "learning_rate": 7.079216183013793e-07, + "loss": 0.3434, + "step": 9598 + }, + { + "epoch": 4.538534278959811, + "grad_norm": 3.123633623123169, + "learning_rate": 7.074867128641841e-07, + "loss": 0.357, + "step": 9599 + }, + { + "epoch": 4.539007092198582, + "grad_norm": 3.0646567344665527, + "learning_rate": 7.070519190389305e-07, + "loss": 0.3488, + "step": 9600 + }, + { + "epoch": 4.539479905437352, + "grad_norm": 2.951892852783203, + "learning_rate": 7.066172368526927e-07, + "loss": 0.329, + "step": 9601 + }, + { + "epoch": 4.539952718676123, + "grad_norm": 2.8071751594543457, + "learning_rate": 7.061826663325361e-07, + "loss": 0.2788, + 
"step": 9602 + }, + { + "epoch": 4.540425531914893, + "grad_norm": 3.9670250415802, + "learning_rate": 7.057482075055183e-07, + "loss": 0.3776, + "step": 9603 + }, + { + "epoch": 4.540898345153664, + "grad_norm": 2.683743476867676, + "learning_rate": 7.053138603986928e-07, + "loss": 0.3044, + "step": 9604 + }, + { + "epoch": 4.541371158392435, + "grad_norm": 2.9766221046447754, + "learning_rate": 7.048796250391038e-07, + "loss": 0.3542, + "step": 9605 + }, + { + "epoch": 4.541843971631206, + "grad_norm": 3.1156277656555176, + "learning_rate": 7.044455014537882e-07, + "loss": 0.3401, + "step": 9606 + }, + { + "epoch": 4.542316784869977, + "grad_norm": 2.8444416522979736, + "learning_rate": 7.040114896697789e-07, + "loss": 0.3437, + "step": 9607 + }, + { + "epoch": 4.542789598108747, + "grad_norm": 2.9964232444763184, + "learning_rate": 7.035775897140984e-07, + "loss": 0.331, + "step": 9608 + }, + { + "epoch": 4.543262411347518, + "grad_norm": 3.511500597000122, + "learning_rate": 7.031438016137648e-07, + "loss": 0.3685, + "step": 9609 + }, + { + "epoch": 4.543735224586288, + "grad_norm": 3.541271686553955, + "learning_rate": 7.027101253957877e-07, + "loss": 0.3945, + "step": 9610 + }, + { + "epoch": 4.544208037825059, + "grad_norm": 3.1483919620513916, + "learning_rate": 7.022765610871696e-07, + "loss": 0.3681, + "step": 9611 + }, + { + "epoch": 4.54468085106383, + "grad_norm": 2.908977508544922, + "learning_rate": 7.01843108714908e-07, + "loss": 0.3468, + "step": 9612 + }, + { + "epoch": 4.5451536643026005, + "grad_norm": 3.3107962608337402, + "learning_rate": 7.014097683059912e-07, + "loss": 0.419, + "step": 9613 + }, + { + "epoch": 4.545626477541371, + "grad_norm": 3.5597898960113525, + "learning_rate": 7.009765398874008e-07, + "loss": 0.3238, + "step": 9614 + }, + { + "epoch": 4.546099290780142, + "grad_norm": 3.091235399246216, + "learning_rate": 7.005434234861136e-07, + "loss": 0.3632, + "step": 9615 + }, + { + "epoch": 4.546572104018913, + "grad_norm": 3.279076099395752, + "learning_rate": 7.001104191290972e-07, + "loss": 0.4006, + "step": 9616 + }, + { + "epoch": 4.547044917257683, + "grad_norm": 3.3877902030944824, + "learning_rate": 6.996775268433126e-07, + "loss": 0.4183, + "step": 9617 + }, + { + "epoch": 4.547517730496454, + "grad_norm": 2.979999542236328, + "learning_rate": 6.992447466557134e-07, + "loss": 0.2921, + "step": 9618 + }, + { + "epoch": 4.547990543735224, + "grad_norm": 3.196361780166626, + "learning_rate": 6.988120785932484e-07, + "loss": 0.3352, + "step": 9619 + }, + { + "epoch": 4.548463356973995, + "grad_norm": 3.3237528800964355, + "learning_rate": 6.983795226828577e-07, + "loss": 0.3487, + "step": 9620 + }, + { + "epoch": 4.548936170212766, + "grad_norm": 3.0740649700164795, + "learning_rate": 6.979470789514731e-07, + "loss": 0.3497, + "step": 9621 + }, + { + "epoch": 4.549408983451537, + "grad_norm": 3.3443479537963867, + "learning_rate": 6.97514747426023e-07, + "loss": 0.3752, + "step": 9622 + }, + { + "epoch": 4.549881796690308, + "grad_norm": 3.450427293777466, + "learning_rate": 6.970825281334254e-07, + "loss": 0.3981, + "step": 9623 + }, + { + "epoch": 4.550354609929078, + "grad_norm": 3.4733047485351562, + "learning_rate": 6.966504211005937e-07, + "loss": 0.3953, + "step": 9624 + }, + { + "epoch": 4.550827423167849, + "grad_norm": 3.1651546955108643, + "learning_rate": 6.962184263544328e-07, + "loss": 0.3012, + "step": 9625 + }, + { + "epoch": 4.551300236406619, + "grad_norm": 2.9222865104675293, + "learning_rate": 6.957865439218405e-07, + "loss": 
0.2774, + "step": 9626 + }, + { + "epoch": 4.55177304964539, + "grad_norm": 2.972437620162964, + "learning_rate": 6.953547738297095e-07, + "loss": 0.3478, + "step": 9627 + }, + { + "epoch": 4.552245862884161, + "grad_norm": 3.2741193771362305, + "learning_rate": 6.949231161049239e-07, + "loss": 0.355, + "step": 9628 + }, + { + "epoch": 4.5527186761229315, + "grad_norm": 2.8715150356292725, + "learning_rate": 6.9449157077436e-07, + "loss": 0.3055, + "step": 9629 + }, + { + "epoch": 4.553191489361702, + "grad_norm": 4.50998592376709, + "learning_rate": 6.940601378648895e-07, + "loss": 0.3732, + "step": 9630 + }, + { + "epoch": 4.553664302600473, + "grad_norm": 2.9277849197387695, + "learning_rate": 6.936288174033757e-07, + "loss": 0.3367, + "step": 9631 + }, + { + "epoch": 4.554137115839244, + "grad_norm": 3.169978380203247, + "learning_rate": 6.931976094166746e-07, + "loss": 0.3529, + "step": 9632 + }, + { + "epoch": 4.554609929078014, + "grad_norm": 2.9629712104797363, + "learning_rate": 6.927665139316359e-07, + "loss": 0.3416, + "step": 9633 + }, + { + "epoch": 4.555082742316785, + "grad_norm": 3.1368603706359863, + "learning_rate": 6.923355309751012e-07, + "loss": 0.3267, + "step": 9634 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 2.9895052909851074, + "learning_rate": 6.919046605739071e-07, + "loss": 0.3411, + "step": 9635 + }, + { + "epoch": 4.556028368794326, + "grad_norm": 3.1592509746551514, + "learning_rate": 6.914739027548809e-07, + "loss": 0.3488, + "step": 9636 + }, + { + "epoch": 4.556501182033097, + "grad_norm": 3.0848731994628906, + "learning_rate": 6.910432575448456e-07, + "loss": 0.3732, + "step": 9637 + }, + { + "epoch": 4.556973995271868, + "grad_norm": 3.1475934982299805, + "learning_rate": 6.906127249706143e-07, + "loss": 0.3525, + "step": 9638 + }, + { + "epoch": 4.5574468085106385, + "grad_norm": 2.9435455799102783, + "learning_rate": 6.90182305058994e-07, + "loss": 0.3155, + "step": 9639 + }, + { + "epoch": 4.557919621749409, + "grad_norm": 3.4412894248962402, + "learning_rate": 6.897519978367867e-07, + "loss": 0.3511, + "step": 9640 + }, + { + "epoch": 4.55839243498818, + "grad_norm": 3.3600406646728516, + "learning_rate": 6.893218033307838e-07, + "loss": 0.4311, + "step": 9641 + }, + { + "epoch": 4.55886524822695, + "grad_norm": 3.35927414894104, + "learning_rate": 6.888917215677734e-07, + "loss": 0.387, + "step": 9642 + }, + { + "epoch": 4.559338061465721, + "grad_norm": 3.2481210231781006, + "learning_rate": 6.884617525745343e-07, + "loss": 0.3456, + "step": 9643 + }, + { + "epoch": 4.559810874704492, + "grad_norm": 3.661160469055176, + "learning_rate": 6.880318963778374e-07, + "loss": 0.4276, + "step": 9644 + }, + { + "epoch": 4.560283687943262, + "grad_norm": 3.038726806640625, + "learning_rate": 6.876021530044502e-07, + "loss": 0.3288, + "step": 9645 + }, + { + "epoch": 4.560756501182033, + "grad_norm": 3.0502963066101074, + "learning_rate": 6.871725224811296e-07, + "loss": 0.3334, + "step": 9646 + }, + { + "epoch": 4.561229314420804, + "grad_norm": 3.1810805797576904, + "learning_rate": 6.867430048346268e-07, + "loss": 0.3335, + "step": 9647 + }, + { + "epoch": 4.561702127659575, + "grad_norm": 3.028670072555542, + "learning_rate": 6.863136000916864e-07, + "loss": 0.3235, + "step": 9648 + }, + { + "epoch": 4.562174940898345, + "grad_norm": 2.805989980697632, + "learning_rate": 6.858843082790447e-07, + "loss": 0.3201, + "step": 9649 + }, + { + "epoch": 4.5626477541371155, + "grad_norm": 3.0792744159698486, + "learning_rate": 6.854551294234333e-07, + 
"loss": 0.3757, + "step": 9650 + }, + { + "epoch": 4.563120567375886, + "grad_norm": 3.115539312362671, + "learning_rate": 6.850260635515735e-07, + "loss": 0.3311, + "step": 9651 + }, + { + "epoch": 4.563593380614657, + "grad_norm": 3.003520965576172, + "learning_rate": 6.845971106901831e-07, + "loss": 0.35, + "step": 9652 + }, + { + "epoch": 4.564066193853428, + "grad_norm": 2.954759359359741, + "learning_rate": 6.841682708659702e-07, + "loss": 0.279, + "step": 9653 + }, + { + "epoch": 4.564539007092199, + "grad_norm": 3.1510894298553467, + "learning_rate": 6.83739544105636e-07, + "loss": 0.333, + "step": 9654 + }, + { + "epoch": 4.5650118203309695, + "grad_norm": 3.3958635330200195, + "learning_rate": 6.833109304358776e-07, + "loss": 0.3668, + "step": 9655 + }, + { + "epoch": 4.56548463356974, + "grad_norm": 3.493522882461548, + "learning_rate": 6.828824298833811e-07, + "loss": 0.3522, + "step": 9656 + }, + { + "epoch": 4.565957446808511, + "grad_norm": 3.217268705368042, + "learning_rate": 6.824540424748275e-07, + "loss": 0.4066, + "step": 9657 + }, + { + "epoch": 4.566430260047281, + "grad_norm": 3.148505210876465, + "learning_rate": 6.820257682368914e-07, + "loss": 0.3252, + "step": 9658 + }, + { + "epoch": 4.566903073286052, + "grad_norm": 3.070316791534424, + "learning_rate": 6.815976071962385e-07, + "loss": 0.3362, + "step": 9659 + }, + { + "epoch": 4.567375886524823, + "grad_norm": 3.0421791076660156, + "learning_rate": 6.811695593795301e-07, + "loss": 0.3894, + "step": 9660 + }, + { + "epoch": 4.567848699763593, + "grad_norm": 2.9165565967559814, + "learning_rate": 6.807416248134177e-07, + "loss": 0.3147, + "step": 9661 + }, + { + "epoch": 4.568321513002364, + "grad_norm": 3.361647129058838, + "learning_rate": 6.803138035245471e-07, + "loss": 0.3346, + "step": 9662 + }, + { + "epoch": 4.568794326241135, + "grad_norm": 3.0013155937194824, + "learning_rate": 6.79886095539557e-07, + "loss": 0.3476, + "step": 9663 + }, + { + "epoch": 4.569267139479906, + "grad_norm": 2.9030165672302246, + "learning_rate": 6.794585008850779e-07, + "loss": 0.3118, + "step": 9664 + }, + { + "epoch": 4.569739952718676, + "grad_norm": 3.229907989501953, + "learning_rate": 6.790310195877361e-07, + "loss": 0.3257, + "step": 9665 + }, + { + "epoch": 4.5702127659574465, + "grad_norm": 3.4075570106506348, + "learning_rate": 6.786036516741479e-07, + "loss": 0.3545, + "step": 9666 + }, + { + "epoch": 4.570685579196217, + "grad_norm": 2.9831581115722656, + "learning_rate": 6.781763971709229e-07, + "loss": 0.3173, + "step": 9667 + }, + { + "epoch": 4.571158392434988, + "grad_norm": 3.8512840270996094, + "learning_rate": 6.777492561046659e-07, + "loss": 0.288, + "step": 9668 + }, + { + "epoch": 4.571631205673759, + "grad_norm": 3.3054401874542236, + "learning_rate": 6.773222285019718e-07, + "loss": 0.369, + "step": 9669 + }, + { + "epoch": 4.57210401891253, + "grad_norm": 2.9155004024505615, + "learning_rate": 6.768953143894308e-07, + "loss": 0.3334, + "step": 9670 + }, + { + "epoch": 4.5725768321513005, + "grad_norm": 3.60557222366333, + "learning_rate": 6.764685137936247e-07, + "loss": 0.4094, + "step": 9671 + }, + { + "epoch": 4.573049645390071, + "grad_norm": 3.271256446838379, + "learning_rate": 6.760418267411275e-07, + "loss": 0.3646, + "step": 9672 + }, + { + "epoch": 4.573522458628842, + "grad_norm": 2.970238447189331, + "learning_rate": 6.756152532585086e-07, + "loss": 0.34, + "step": 9673 + }, + { + "epoch": 4.573995271867612, + "grad_norm": 3.412712574005127, + "learning_rate": 6.751887933723277e-07, + 
"loss": 0.3674, + "step": 9674 + }, + { + "epoch": 4.574468085106383, + "grad_norm": 2.9984517097473145, + "learning_rate": 6.747624471091396e-07, + "loss": 0.3579, + "step": 9675 + }, + { + "epoch": 4.574940898345154, + "grad_norm": 2.863788366317749, + "learning_rate": 6.743362144954907e-07, + "loss": 0.3234, + "step": 9676 + }, + { + "epoch": 4.575413711583924, + "grad_norm": 3.313793897628784, + "learning_rate": 6.739100955579203e-07, + "loss": 0.334, + "step": 9677 + }, + { + "epoch": 4.575886524822695, + "grad_norm": 3.5350630283355713, + "learning_rate": 6.734840903229611e-07, + "loss": 0.3682, + "step": 9678 + }, + { + "epoch": 4.576359338061466, + "grad_norm": 3.531888723373413, + "learning_rate": 6.730581988171378e-07, + "loss": 0.3434, + "step": 9679 + }, + { + "epoch": 4.576832151300237, + "grad_norm": 3.358574867248535, + "learning_rate": 6.726324210669702e-07, + "loss": 0.3751, + "step": 9680 + }, + { + "epoch": 4.577304964539007, + "grad_norm": 2.9723873138427734, + "learning_rate": 6.722067570989691e-07, + "loss": 0.3077, + "step": 9681 + }, + { + "epoch": 4.5777777777777775, + "grad_norm": 3.2287187576293945, + "learning_rate": 6.717812069396379e-07, + "loss": 0.3493, + "step": 9682 + }, + { + "epoch": 4.578250591016548, + "grad_norm": 2.9089417457580566, + "learning_rate": 6.71355770615475e-07, + "loss": 0.324, + "step": 9683 + }, + { + "epoch": 4.578723404255319, + "grad_norm": 3.2894415855407715, + "learning_rate": 6.709304481529703e-07, + "loss": 0.3066, + "step": 9684 + }, + { + "epoch": 4.57919621749409, + "grad_norm": 3.1914620399475098, + "learning_rate": 6.705052395786052e-07, + "loss": 0.3453, + "step": 9685 + }, + { + "epoch": 4.579669030732861, + "grad_norm": 3.1095924377441406, + "learning_rate": 6.700801449188577e-07, + "loss": 0.3678, + "step": 9686 + }, + { + "epoch": 4.5801418439716315, + "grad_norm": 3.416944980621338, + "learning_rate": 6.696551642001948e-07, + "loss": 0.3754, + "step": 9687 + }, + { + "epoch": 4.580614657210402, + "grad_norm": 3.7102952003479004, + "learning_rate": 6.692302974490797e-07, + "loss": 0.3723, + "step": 9688 + }, + { + "epoch": 4.581087470449172, + "grad_norm": 3.296607494354248, + "learning_rate": 6.688055446919664e-07, + "loss": 0.3607, + "step": 9689 + }, + { + "epoch": 4.581560283687943, + "grad_norm": 3.4449238777160645, + "learning_rate": 6.683809059553014e-07, + "loss": 0.3102, + "step": 9690 + }, + { + "epoch": 4.582033096926714, + "grad_norm": 3.202671766281128, + "learning_rate": 6.679563812655268e-07, + "loss": 0.3535, + "step": 9691 + }, + { + "epoch": 4.582505910165485, + "grad_norm": 3.25919771194458, + "learning_rate": 6.675319706490744e-07, + "loss": 0.3778, + "step": 9692 + }, + { + "epoch": 4.582978723404255, + "grad_norm": 3.316021680831909, + "learning_rate": 6.671076741323718e-07, + "loss": 0.2943, + "step": 9693 + }, + { + "epoch": 4.583451536643026, + "grad_norm": 3.2375826835632324, + "learning_rate": 6.666834917418371e-07, + "loss": 0.3249, + "step": 9694 + }, + { + "epoch": 4.583924349881797, + "grad_norm": 2.875436782836914, + "learning_rate": 6.662594235038827e-07, + "loss": 0.2991, + "step": 9695 + }, + { + "epoch": 4.584397163120567, + "grad_norm": 3.75874924659729, + "learning_rate": 6.658354694449134e-07, + "loss": 0.3718, + "step": 9696 + }, + { + "epoch": 4.584869976359338, + "grad_norm": 3.060943126678467, + "learning_rate": 6.65411629591326e-07, + "loss": 0.3183, + "step": 9697 + }, + { + "epoch": 4.5853427895981085, + "grad_norm": 3.024336576461792, + "learning_rate": 
6.649879039695126e-07, + "loss": 0.3118, + "step": 9698 + }, + { + "epoch": 4.585815602836879, + "grad_norm": 3.3640875816345215, + "learning_rate": 6.645642926058562e-07, + "loss": 0.3408, + "step": 9699 + }, + { + "epoch": 4.58628841607565, + "grad_norm": 2.8885910511016846, + "learning_rate": 6.641407955267326e-07, + "loss": 0.3304, + "step": 9700 + }, + { + "epoch": 4.586761229314421, + "grad_norm": 3.8225393295288086, + "learning_rate": 6.637174127585122e-07, + "loss": 0.3469, + "step": 9701 + }, + { + "epoch": 4.587234042553192, + "grad_norm": 3.0624778270721436, + "learning_rate": 6.632941443275567e-07, + "loss": 0.3177, + "step": 9702 + }, + { + "epoch": 4.5877068557919625, + "grad_norm": 3.1422903537750244, + "learning_rate": 6.628709902602204e-07, + "loss": 0.3205, + "step": 9703 + }, + { + "epoch": 4.588179669030733, + "grad_norm": 3.1315362453460693, + "learning_rate": 6.62447950582853e-07, + "loss": 0.3443, + "step": 9704 + }, + { + "epoch": 4.588652482269503, + "grad_norm": 3.096041202545166, + "learning_rate": 6.62025025321793e-07, + "loss": 0.3567, + "step": 9705 + }, + { + "epoch": 4.589125295508274, + "grad_norm": 3.225820302963257, + "learning_rate": 6.616022145033766e-07, + "loss": 0.3873, + "step": 9706 + }, + { + "epoch": 4.589598108747045, + "grad_norm": 3.3879058361053467, + "learning_rate": 6.611795181539288e-07, + "loss": 0.4379, + "step": 9707 + }, + { + "epoch": 4.590070921985816, + "grad_norm": 3.508265733718872, + "learning_rate": 6.60756936299769e-07, + "loss": 0.3182, + "step": 9708 + }, + { + "epoch": 4.590543735224586, + "grad_norm": 3.278857946395874, + "learning_rate": 6.603344689672106e-07, + "loss": 0.3508, + "step": 9709 + }, + { + "epoch": 4.591016548463357, + "grad_norm": 2.9961371421813965, + "learning_rate": 6.599121161825581e-07, + "loss": 0.3178, + "step": 9710 + }, + { + "epoch": 4.591489361702128, + "grad_norm": 3.413717269897461, + "learning_rate": 6.594898779721092e-07, + "loss": 0.363, + "step": 9711 + }, + { + "epoch": 4.591962174940898, + "grad_norm": 3.2014074325561523, + "learning_rate": 6.590677543621557e-07, + "loss": 0.392, + "step": 9712 + }, + { + "epoch": 4.592434988179669, + "grad_norm": 3.0421640872955322, + "learning_rate": 6.586457453789802e-07, + "loss": 0.3119, + "step": 9713 + }, + { + "epoch": 4.5929078014184395, + "grad_norm": 3.0515928268432617, + "learning_rate": 6.582238510488604e-07, + "loss": 0.3021, + "step": 9714 + }, + { + "epoch": 4.59338061465721, + "grad_norm": 3.0824668407440186, + "learning_rate": 6.578020713980648e-07, + "loss": 0.3551, + "step": 9715 + }, + { + "epoch": 4.593853427895981, + "grad_norm": 3.0002171993255615, + "learning_rate": 6.573804064528574e-07, + "loss": 0.3691, + "step": 9716 + }, + { + "epoch": 4.594326241134752, + "grad_norm": 3.0174765586853027, + "learning_rate": 6.569588562394924e-07, + "loss": 0.3289, + "step": 9717 + }, + { + "epoch": 4.594799054373523, + "grad_norm": 4.098819732666016, + "learning_rate": 6.565374207842171e-07, + "loss": 0.3637, + "step": 9718 + }, + { + "epoch": 4.5952718676122934, + "grad_norm": 3.396275281906128, + "learning_rate": 6.561161001132737e-07, + "loss": 0.348, + "step": 9719 + }, + { + "epoch": 4.595744680851064, + "grad_norm": 3.6430864334106445, + "learning_rate": 6.556948942528952e-07, + "loss": 0.3543, + "step": 9720 + }, + { + "epoch": 4.596217494089834, + "grad_norm": 3.170236587524414, + "learning_rate": 6.552738032293093e-07, + "loss": 0.3565, + "step": 9721 + }, + { + "epoch": 4.596690307328605, + "grad_norm": 3.402683734893799, + 
"learning_rate": 6.548528270687349e-07, + "loss": 0.4001, + "step": 9722 + }, + { + "epoch": 4.597163120567376, + "grad_norm": 2.861463785171509, + "learning_rate": 6.544319657973833e-07, + "loss": 0.3436, + "step": 9723 + }, + { + "epoch": 4.5976359338061465, + "grad_norm": 3.209259510040283, + "learning_rate": 6.540112194414613e-07, + "loss": 0.3317, + "step": 9724 + }, + { + "epoch": 4.598108747044917, + "grad_norm": 3.099533796310425, + "learning_rate": 6.535905880271662e-07, + "loss": 0.3416, + "step": 9725 + }, + { + "epoch": 4.598581560283688, + "grad_norm": 3.3558053970336914, + "learning_rate": 6.531700715806891e-07, + "loss": 0.3567, + "step": 9726 + }, + { + "epoch": 4.599054373522459, + "grad_norm": 3.1330227851867676, + "learning_rate": 6.527496701282135e-07, + "loss": 0.3025, + "step": 9727 + }, + { + "epoch": 4.599527186761229, + "grad_norm": 3.140184164047241, + "learning_rate": 6.523293836959152e-07, + "loss": 0.3195, + "step": 9728 + }, + { + "epoch": 4.6, + "grad_norm": 3.246844530105591, + "learning_rate": 6.519092123099652e-07, + "loss": 0.3367, + "step": 9729 + }, + { + "epoch": 4.60047281323877, + "grad_norm": 3.1590709686279297, + "learning_rate": 6.51489155996525e-07, + "loss": 0.367, + "step": 9730 + }, + { + "epoch": 4.600945626477541, + "grad_norm": 3.122746467590332, + "learning_rate": 6.510692147817488e-07, + "loss": 0.3401, + "step": 9731 + }, + { + "epoch": 4.601418439716312, + "grad_norm": 3.0418715476989746, + "learning_rate": 6.506493886917859e-07, + "loss": 0.2958, + "step": 9732 + }, + { + "epoch": 4.601891252955083, + "grad_norm": 3.06303334236145, + "learning_rate": 6.502296777527756e-07, + "loss": 0.3459, + "step": 9733 + }, + { + "epoch": 4.602364066193854, + "grad_norm": 3.0807206630706787, + "learning_rate": 6.498100819908532e-07, + "loss": 0.3473, + "step": 9734 + }, + { + "epoch": 4.602836879432624, + "grad_norm": 3.790008306503296, + "learning_rate": 6.493906014321441e-07, + "loss": 0.3541, + "step": 9735 + }, + { + "epoch": 4.603309692671395, + "grad_norm": 3.3040049076080322, + "learning_rate": 6.489712361027667e-07, + "loss": 0.3317, + "step": 9736 + }, + { + "epoch": 4.603782505910165, + "grad_norm": 3.3145735263824463, + "learning_rate": 6.485519860288347e-07, + "loss": 0.3351, + "step": 9737 + }, + { + "epoch": 4.604255319148936, + "grad_norm": 3.1374423503875732, + "learning_rate": 6.481328512364515e-07, + "loss": 0.3544, + "step": 9738 + }, + { + "epoch": 4.604728132387707, + "grad_norm": 3.3598453998565674, + "learning_rate": 6.477138317517162e-07, + "loss": 0.4219, + "step": 9739 + }, + { + "epoch": 4.6052009456264775, + "grad_norm": 3.227466583251953, + "learning_rate": 6.472949276007187e-07, + "loss": 0.3179, + "step": 9740 + }, + { + "epoch": 4.605673758865248, + "grad_norm": 2.9815897941589355, + "learning_rate": 6.46876138809542e-07, + "loss": 0.3753, + "step": 9741 + }, + { + "epoch": 4.606146572104019, + "grad_norm": 3.072967290878296, + "learning_rate": 6.464574654042624e-07, + "loss": 0.3288, + "step": 9742 + }, + { + "epoch": 4.60661938534279, + "grad_norm": 3.695613145828247, + "learning_rate": 6.460389074109482e-07, + "loss": 0.3305, + "step": 9743 + }, + { + "epoch": 4.60709219858156, + "grad_norm": 3.205684185028076, + "learning_rate": 6.456204648556628e-07, + "loss": 0.3305, + "step": 9744 + }, + { + "epoch": 4.607565011820331, + "grad_norm": 3.216615915298462, + "learning_rate": 6.452021377644596e-07, + "loss": 0.3416, + "step": 9745 + }, + { + "epoch": 4.608037825059101, + "grad_norm": 3.2224013805389404, + 
"learning_rate": 6.447839261633856e-07, + "loss": 0.3773, + "step": 9746 + }, + { + "epoch": 4.608510638297872, + "grad_norm": 3.2811145782470703, + "learning_rate": 6.443658300784824e-07, + "loss": 0.3292, + "step": 9747 + }, + { + "epoch": 4.608983451536643, + "grad_norm": 3.8610804080963135, + "learning_rate": 6.439478495357815e-07, + "loss": 0.3975, + "step": 9748 + }, + { + "epoch": 4.609456264775414, + "grad_norm": 3.2154266834259033, + "learning_rate": 6.435299845613102e-07, + "loss": 0.3367, + "step": 9749 + }, + { + "epoch": 4.609929078014185, + "grad_norm": 3.18072509765625, + "learning_rate": 6.431122351810862e-07, + "loss": 0.3972, + "step": 9750 + }, + { + "epoch": 4.610401891252955, + "grad_norm": 3.513521194458008, + "learning_rate": 6.426946014211205e-07, + "loss": 0.374, + "step": 9751 + }, + { + "epoch": 4.610874704491726, + "grad_norm": 3.2900753021240234, + "learning_rate": 6.422770833074188e-07, + "loss": 0.3823, + "step": 9752 + }, + { + "epoch": 4.611347517730496, + "grad_norm": 2.791400194168091, + "learning_rate": 6.418596808659772e-07, + "loss": 0.3187, + "step": 9753 + }, + { + "epoch": 4.611820330969267, + "grad_norm": 3.042336940765381, + "learning_rate": 6.414423941227846e-07, + "loss": 0.3832, + "step": 9754 + }, + { + "epoch": 4.612293144208038, + "grad_norm": 3.130197286605835, + "learning_rate": 6.410252231038255e-07, + "loss": 0.3152, + "step": 9755 + }, + { + "epoch": 4.6127659574468085, + "grad_norm": 3.28125262260437, + "learning_rate": 6.406081678350745e-07, + "loss": 0.3082, + "step": 9756 + }, + { + "epoch": 4.613238770685579, + "grad_norm": 3.5695526599884033, + "learning_rate": 6.401912283424988e-07, + "loss": 0.4303, + "step": 9757 + }, + { + "epoch": 4.61371158392435, + "grad_norm": 2.9045464992523193, + "learning_rate": 6.397744046520612e-07, + "loss": 0.3392, + "step": 9758 + }, + { + "epoch": 4.614184397163121, + "grad_norm": 3.4325780868530273, + "learning_rate": 6.393576967897145e-07, + "loss": 0.3446, + "step": 9759 + }, + { + "epoch": 4.614657210401891, + "grad_norm": 3.1146414279937744, + "learning_rate": 6.389411047814053e-07, + "loss": 0.3444, + "step": 9760 + }, + { + "epoch": 4.615130023640662, + "grad_norm": 3.9922995567321777, + "learning_rate": 6.385246286530722e-07, + "loss": 0.3431, + "step": 9761 + }, + { + "epoch": 4.615602836879432, + "grad_norm": 2.868818759918213, + "learning_rate": 6.381082684306491e-07, + "loss": 0.2819, + "step": 9762 + }, + { + "epoch": 4.616075650118203, + "grad_norm": 3.1957287788391113, + "learning_rate": 6.376920241400597e-07, + "loss": 0.315, + "step": 9763 + }, + { + "epoch": 4.616548463356974, + "grad_norm": 3.327913999557495, + "learning_rate": 6.372758958072215e-07, + "loss": 0.3224, + "step": 9764 + }, + { + "epoch": 4.617021276595745, + "grad_norm": 3.2451798915863037, + "learning_rate": 6.368598834580461e-07, + "loss": 0.3219, + "step": 9765 + }, + { + "epoch": 4.617494089834516, + "grad_norm": 3.328977346420288, + "learning_rate": 6.364439871184355e-07, + "loss": 0.3123, + "step": 9766 + }, + { + "epoch": 4.617966903073286, + "grad_norm": 2.929624557495117, + "learning_rate": 6.36028206814287e-07, + "loss": 0.3137, + "step": 9767 + }, + { + "epoch": 4.618439716312057, + "grad_norm": 3.2356855869293213, + "learning_rate": 6.356125425714888e-07, + "loss": 0.3672, + "step": 9768 + }, + { + "epoch": 4.618912529550827, + "grad_norm": 3.099452018737793, + "learning_rate": 6.351969944159217e-07, + "loss": 0.3875, + "step": 9769 + }, + { + "epoch": 4.619385342789598, + "grad_norm": 
4.037657260894775, + "learning_rate": 6.347815623734616e-07, + "loss": 0.3984, + "step": 9770 + }, + { + "epoch": 4.619858156028369, + "grad_norm": 3.350639581680298, + "learning_rate": 6.343662464699743e-07, + "loss": 0.3325, + "step": 9771 + }, + { + "epoch": 4.6203309692671395, + "grad_norm": 3.3933796882629395, + "learning_rate": 6.339510467313206e-07, + "loss": 0.3922, + "step": 9772 + }, + { + "epoch": 4.62080378250591, + "grad_norm": 2.8599045276641846, + "learning_rate": 6.335359631833532e-07, + "loss": 0.2677, + "step": 9773 + }, + { + "epoch": 4.621276595744681, + "grad_norm": 3.0792534351348877, + "learning_rate": 6.331209958519172e-07, + "loss": 0.3784, + "step": 9774 + }, + { + "epoch": 4.621749408983452, + "grad_norm": 3.1678860187530518, + "learning_rate": 6.327061447628507e-07, + "loss": 0.3698, + "step": 9775 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 3.500584602355957, + "learning_rate": 6.322914099419846e-07, + "loss": 0.281, + "step": 9776 + }, + { + "epoch": 4.622695035460993, + "grad_norm": 3.089900016784668, + "learning_rate": 6.318767914151422e-07, + "loss": 0.3202, + "step": 9777 + }, + { + "epoch": 4.623167848699763, + "grad_norm": 3.353118896484375, + "learning_rate": 6.31462289208141e-07, + "loss": 0.3584, + "step": 9778 + }, + { + "epoch": 4.623640661938534, + "grad_norm": 3.1742143630981445, + "learning_rate": 6.310479033467893e-07, + "loss": 0.3309, + "step": 9779 + }, + { + "epoch": 4.624113475177305, + "grad_norm": 3.5430498123168945, + "learning_rate": 6.306336338568903e-07, + "loss": 0.3972, + "step": 9780 + }, + { + "epoch": 4.624586288416076, + "grad_norm": 3.141406774520874, + "learning_rate": 6.302194807642379e-07, + "loss": 0.3875, + "step": 9781 + }, + { + "epoch": 4.625059101654847, + "grad_norm": 3.1661601066589355, + "learning_rate": 6.298054440946188e-07, + "loss": 0.3969, + "step": 9782 + }, + { + "epoch": 4.625531914893617, + "grad_norm": 2.9834651947021484, + "learning_rate": 6.293915238738149e-07, + "loss": 0.3357, + "step": 9783 + }, + { + "epoch": 4.626004728132388, + "grad_norm": 3.497030258178711, + "learning_rate": 6.289777201275979e-07, + "loss": 0.3683, + "step": 9784 + }, + { + "epoch": 4.626477541371158, + "grad_norm": 3.519390106201172, + "learning_rate": 6.285640328817347e-07, + "loss": 0.3647, + "step": 9785 + }, + { + "epoch": 4.626950354609929, + "grad_norm": 3.0032200813293457, + "learning_rate": 6.281504621619833e-07, + "loss": 0.2854, + "step": 9786 + }, + { + "epoch": 4.6274231678487, + "grad_norm": 2.9891152381896973, + "learning_rate": 6.277370079940939e-07, + "loss": 0.3771, + "step": 9787 + }, + { + "epoch": 4.6278959810874705, + "grad_norm": 3.379671812057495, + "learning_rate": 6.273236704038122e-07, + "loss": 0.3916, + "step": 9788 + }, + { + "epoch": 4.628368794326241, + "grad_norm": 2.9964048862457275, + "learning_rate": 6.26910449416874e-07, + "loss": 0.3618, + "step": 9789 + }, + { + "epoch": 4.628841607565012, + "grad_norm": 3.0143628120422363, + "learning_rate": 6.264973450590089e-07, + "loss": 0.336, + "step": 9790 + }, + { + "epoch": 4.629314420803783, + "grad_norm": 2.956737756729126, + "learning_rate": 6.260843573559392e-07, + "loss": 0.3657, + "step": 9791 + }, + { + "epoch": 4.629787234042553, + "grad_norm": 3.057551145553589, + "learning_rate": 6.256714863333787e-07, + "loss": 0.3475, + "step": 9792 + }, + { + "epoch": 4.630260047281324, + "grad_norm": 3.9289608001708984, + "learning_rate": 6.25258732017037e-07, + "loss": 0.3679, + "step": 9793 + }, + { + "epoch": 4.630732860520094, + 
"grad_norm": 3.8519062995910645, + "learning_rate": 6.248460944326129e-07, + "loss": 0.4182, + "step": 9794 + }, + { + "epoch": 4.631205673758865, + "grad_norm": 3.6360673904418945, + "learning_rate": 6.244335736058007e-07, + "loss": 0.3836, + "step": 9795 + }, + { + "epoch": 4.631678486997636, + "grad_norm": 3.1905548572540283, + "learning_rate": 6.240211695622861e-07, + "loss": 0.357, + "step": 9796 + }, + { + "epoch": 4.632151300236407, + "grad_norm": 3.3542017936706543, + "learning_rate": 6.236088823277465e-07, + "loss": 0.3191, + "step": 9797 + }, + { + "epoch": 4.6326241134751776, + "grad_norm": 3.453275442123413, + "learning_rate": 6.231967119278546e-07, + "loss": 0.3346, + "step": 9798 + }, + { + "epoch": 4.633096926713948, + "grad_norm": 3.559972047805786, + "learning_rate": 6.227846583882741e-07, + "loss": 0.365, + "step": 9799 + }, + { + "epoch": 4.633569739952719, + "grad_norm": 2.795891046524048, + "learning_rate": 6.223727217346606e-07, + "loss": 0.3346, + "step": 9800 + }, + { + "epoch": 4.634042553191489, + "grad_norm": 3.176762342453003, + "learning_rate": 6.219609019926653e-07, + "loss": 0.3692, + "step": 9801 + }, + { + "epoch": 4.63451536643026, + "grad_norm": 3.490229845046997, + "learning_rate": 6.215491991879294e-07, + "loss": 0.3334, + "step": 9802 + }, + { + "epoch": 4.634988179669031, + "grad_norm": 3.27502179145813, + "learning_rate": 6.211376133460884e-07, + "loss": 0.3484, + "step": 9803 + }, + { + "epoch": 4.6354609929078014, + "grad_norm": 2.9768311977386475, + "learning_rate": 6.207261444927698e-07, + "loss": 0.3342, + "step": 9804 + }, + { + "epoch": 4.635933806146572, + "grad_norm": 3.1726930141448975, + "learning_rate": 6.203147926535938e-07, + "loss": 0.3187, + "step": 9805 + }, + { + "epoch": 4.636406619385343, + "grad_norm": 3.1797916889190674, + "learning_rate": 6.199035578541737e-07, + "loss": 0.3418, + "step": 9806 + }, + { + "epoch": 4.636879432624114, + "grad_norm": 3.1262030601501465, + "learning_rate": 6.194924401201141e-07, + "loss": 0.3099, + "step": 9807 + }, + { + "epoch": 4.637352245862884, + "grad_norm": 3.556866407394409, + "learning_rate": 6.190814394770153e-07, + "loss": 0.2879, + "step": 9808 + }, + { + "epoch": 4.6378250591016545, + "grad_norm": 3.508984327316284, + "learning_rate": 6.186705559504678e-07, + "loss": 0.3414, + "step": 9809 + }, + { + "epoch": 4.638297872340425, + "grad_norm": 3.266221761703491, + "learning_rate": 6.182597895660544e-07, + "loss": 0.3281, + "step": 9810 + }, + { + "epoch": 4.638770685579196, + "grad_norm": 3.3781862258911133, + "learning_rate": 6.178491403493537e-07, + "loss": 0.3583, + "step": 9811 + }, + { + "epoch": 4.639243498817967, + "grad_norm": 3.4480984210968018, + "learning_rate": 6.174386083259329e-07, + "loss": 0.3704, + "step": 9812 + }, + { + "epoch": 4.639716312056738, + "grad_norm": 3.2882535457611084, + "learning_rate": 6.170281935213563e-07, + "loss": 0.3515, + "step": 9813 + }, + { + "epoch": 4.6401891252955085, + "grad_norm": 2.852627992630005, + "learning_rate": 6.166178959611774e-07, + "loss": 0.3266, + "step": 9814 + }, + { + "epoch": 4.640661938534279, + "grad_norm": 3.5469841957092285, + "learning_rate": 6.162077156709431e-07, + "loss": 0.3374, + "step": 9815 + }, + { + "epoch": 4.64113475177305, + "grad_norm": 3.343583345413208, + "learning_rate": 6.157976526761947e-07, + "loss": 0.3084, + "step": 9816 + }, + { + "epoch": 4.64160756501182, + "grad_norm": 3.028337001800537, + "learning_rate": 6.153877070024639e-07, + "loss": 0.3083, + "step": 9817 + }, + { + "epoch": 
4.642080378250591, + "grad_norm": 3.1543455123901367, + "learning_rate": 6.149778786752775e-07, + "loss": 0.3273, + "step": 9818 + }, + { + "epoch": 4.642553191489362, + "grad_norm": 3.2126576900482178, + "learning_rate": 6.145681677201529e-07, + "loss": 0.3107, + "step": 9819 + }, + { + "epoch": 4.643026004728132, + "grad_norm": 3.4443142414093018, + "learning_rate": 6.141585741626014e-07, + "loss": 0.3193, + "step": 9820 + }, + { + "epoch": 4.643498817966903, + "grad_norm": 3.1558680534362793, + "learning_rate": 6.137490980281255e-07, + "loss": 0.3855, + "step": 9821 + }, + { + "epoch": 4.643971631205674, + "grad_norm": 3.370654821395874, + "learning_rate": 6.133397393422228e-07, + "loss": 0.309, + "step": 9822 + }, + { + "epoch": 4.644444444444445, + "grad_norm": 3.0980682373046875, + "learning_rate": 6.129304981303822e-07, + "loss": 0.2784, + "step": 9823 + }, + { + "epoch": 4.644917257683215, + "grad_norm": 3.102229356765747, + "learning_rate": 6.125213744180844e-07, + "loss": 0.3064, + "step": 9824 + }, + { + "epoch": 4.6453900709219855, + "grad_norm": 2.9737658500671387, + "learning_rate": 6.121123682308039e-07, + "loss": 0.2926, + "step": 9825 + }, + { + "epoch": 4.645862884160756, + "grad_norm": 3.3927671909332275, + "learning_rate": 6.117034795940089e-07, + "loss": 0.404, + "step": 9826 + }, + { + "epoch": 4.646335697399527, + "grad_norm": 2.885082721710205, + "learning_rate": 6.112947085331581e-07, + "loss": 0.3375, + "step": 9827 + }, + { + "epoch": 4.646808510638298, + "grad_norm": 2.9711341857910156, + "learning_rate": 6.108860550737034e-07, + "loss": 0.3051, + "step": 9828 + }, + { + "epoch": 4.647281323877069, + "grad_norm": 3.1437952518463135, + "learning_rate": 6.104775192410911e-07, + "loss": 0.3408, + "step": 9829 + }, + { + "epoch": 4.6477541371158395, + "grad_norm": 3.055950164794922, + "learning_rate": 6.100691010607579e-07, + "loss": 0.316, + "step": 9830 + }, + { + "epoch": 4.64822695035461, + "grad_norm": 3.515423536300659, + "learning_rate": 6.096608005581353e-07, + "loss": 0.3994, + "step": 9831 + }, + { + "epoch": 4.648699763593381, + "grad_norm": 3.1165153980255127, + "learning_rate": 6.092526177586455e-07, + "loss": 0.3908, + "step": 9832 + }, + { + "epoch": 4.649172576832151, + "grad_norm": 3.504673719406128, + "learning_rate": 6.088445526877043e-07, + "loss": 0.3328, + "step": 9833 + }, + { + "epoch": 4.649645390070922, + "grad_norm": 3.4175243377685547, + "learning_rate": 6.084366053707208e-07, + "loss": 0.3234, + "step": 9834 + }, + { + "epoch": 4.650118203309693, + "grad_norm": 3.14725661277771, + "learning_rate": 6.080287758330946e-07, + "loss": 0.3118, + "step": 9835 + }, + { + "epoch": 4.650591016548463, + "grad_norm": 3.7654550075531006, + "learning_rate": 6.076210641002217e-07, + "loss": 0.4177, + "step": 9836 + }, + { + "epoch": 4.651063829787234, + "grad_norm": 3.188804864883423, + "learning_rate": 6.072134701974871e-07, + "loss": 0.3468, + "step": 9837 + }, + { + "epoch": 4.651536643026005, + "grad_norm": 3.2176342010498047, + "learning_rate": 6.068059941502702e-07, + "loss": 0.3486, + "step": 9838 + }, + { + "epoch": 4.652009456264776, + "grad_norm": 4.188257217407227, + "learning_rate": 6.063986359839424e-07, + "loss": 0.3973, + "step": 9839 + }, + { + "epoch": 4.652482269503546, + "grad_norm": 3.206559896469116, + "learning_rate": 6.059913957238678e-07, + "loss": 0.3088, + "step": 9840 + }, + { + "epoch": 4.6529550827423165, + "grad_norm": 3.033918857574463, + "learning_rate": 6.055842733954048e-07, + "loss": 0.3331, + "step": 9841 + }, + { + 
"epoch": 4.653427895981087, + "grad_norm": 3.2453384399414062, + "learning_rate": 6.051772690239022e-07, + "loss": 0.323, + "step": 9842 + }, + { + "epoch": 4.653900709219858, + "grad_norm": 3.001999855041504, + "learning_rate": 6.047703826347017e-07, + "loss": 0.3763, + "step": 9843 + }, + { + "epoch": 4.654373522458629, + "grad_norm": 3.845486640930176, + "learning_rate": 6.043636142531401e-07, + "loss": 0.3595, + "step": 9844 + }, + { + "epoch": 4.6548463356974, + "grad_norm": 3.347628593444824, + "learning_rate": 6.039569639045434e-07, + "loss": 0.3388, + "step": 9845 + }, + { + "epoch": 4.6553191489361705, + "grad_norm": 3.1889400482177734, + "learning_rate": 6.035504316142333e-07, + "loss": 0.335, + "step": 9846 + }, + { + "epoch": 4.655791962174941, + "grad_norm": 3.3385977745056152, + "learning_rate": 6.031440174075221e-07, + "loss": 0.3985, + "step": 9847 + }, + { + "epoch": 4.656264775413711, + "grad_norm": 2.849853277206421, + "learning_rate": 6.027377213097146e-07, + "loss": 0.3604, + "step": 9848 + }, + { + "epoch": 4.656737588652482, + "grad_norm": 3.243053436279297, + "learning_rate": 6.02331543346111e-07, + "loss": 0.3257, + "step": 9849 + }, + { + "epoch": 4.657210401891253, + "grad_norm": 3.344167709350586, + "learning_rate": 6.01925483542001e-07, + "loss": 0.3511, + "step": 9850 + }, + { + "epoch": 4.657683215130024, + "grad_norm": 2.9741430282592773, + "learning_rate": 6.015195419226677e-07, + "loss": 0.3303, + "step": 9851 + }, + { + "epoch": 4.658156028368794, + "grad_norm": 3.0257937908172607, + "learning_rate": 6.011137185133883e-07, + "loss": 0.3716, + "step": 9852 + }, + { + "epoch": 4.658628841607565, + "grad_norm": 3.5770089626312256, + "learning_rate": 6.007080133394316e-07, + "loss": 0.3258, + "step": 9853 + }, + { + "epoch": 4.659101654846336, + "grad_norm": 3.363703489303589, + "learning_rate": 6.003024264260587e-07, + "loss": 0.3924, + "step": 9854 + }, + { + "epoch": 4.659574468085106, + "grad_norm": 3.3533787727355957, + "learning_rate": 5.998969577985239e-07, + "loss": 0.3242, + "step": 9855 + }, + { + "epoch": 4.660047281323877, + "grad_norm": 2.7335259914398193, + "learning_rate": 5.994916074820731e-07, + "loss": 0.3269, + "step": 9856 + }, + { + "epoch": 4.6605200945626475, + "grad_norm": 3.7654764652252197, + "learning_rate": 5.990863755019471e-07, + "loss": 0.4, + "step": 9857 + }, + { + "epoch": 4.660992907801418, + "grad_norm": 3.372542381286621, + "learning_rate": 5.986812618833765e-07, + "loss": 0.3423, + "step": 9858 + }, + { + "epoch": 4.661465721040189, + "grad_norm": 2.797814130783081, + "learning_rate": 5.982762666515873e-07, + "loss": 0.3228, + "step": 9859 + }, + { + "epoch": 4.66193853427896, + "grad_norm": 3.0121023654937744, + "learning_rate": 5.978713898317964e-07, + "loss": 0.3063, + "step": 9860 + }, + { + "epoch": 4.662411347517731, + "grad_norm": 3.052292823791504, + "learning_rate": 5.974666314492126e-07, + "loss": 0.3029, + "step": 9861 + }, + { + "epoch": 4.6628841607565015, + "grad_norm": 2.990906238555908, + "learning_rate": 5.970619915290399e-07, + "loss": 0.3788, + "step": 9862 + }, + { + "epoch": 4.663356973995272, + "grad_norm": 3.214334726333618, + "learning_rate": 5.966574700964722e-07, + "loss": 0.3138, + "step": 9863 + }, + { + "epoch": 4.663829787234042, + "grad_norm": 3.5982940196990967, + "learning_rate": 5.962530671766989e-07, + "loss": 0.3685, + "step": 9864 + }, + { + "epoch": 4.664302600472813, + "grad_norm": 3.2522151470184326, + "learning_rate": 5.958487827948991e-07, + "loss": 0.3086, + "step": 9865 + }, + { 
+ "epoch": 4.664775413711584, + "grad_norm": 3.070181131362915, + "learning_rate": 5.954446169762457e-07, + "loss": 0.3534, + "step": 9866 + }, + { + "epoch": 4.665248226950355, + "grad_norm": 3.3051350116729736, + "learning_rate": 5.950405697459055e-07, + "loss": 0.3871, + "step": 9867 + }, + { + "epoch": 4.665721040189125, + "grad_norm": 2.8587753772735596, + "learning_rate": 5.946366411290358e-07, + "loss": 0.3157, + "step": 9868 + }, + { + "epoch": 4.666193853427896, + "grad_norm": 3.154926061630249, + "learning_rate": 5.942328311507878e-07, + "loss": 0.2967, + "step": 9869 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 3.0322320461273193, + "learning_rate": 5.938291398363049e-07, + "loss": 0.323, + "step": 9870 + }, + { + "epoch": 4.667139479905437, + "grad_norm": 3.0678954124450684, + "learning_rate": 5.934255672107222e-07, + "loss": 0.3337, + "step": 9871 + }, + { + "epoch": 4.667612293144208, + "grad_norm": 3.4822635650634766, + "learning_rate": 5.930221132991704e-07, + "loss": 0.4052, + "step": 9872 + }, + { + "epoch": 4.6680851063829785, + "grad_norm": 3.26842999458313, + "learning_rate": 5.926187781267695e-07, + "loss": 0.3501, + "step": 9873 + }, + { + "epoch": 4.668557919621749, + "grad_norm": 2.911407709121704, + "learning_rate": 5.922155617186332e-07, + "loss": 0.3544, + "step": 9874 + }, + { + "epoch": 4.66903073286052, + "grad_norm": 3.1876001358032227, + "learning_rate": 5.91812464099869e-07, + "loss": 0.3389, + "step": 9875 + }, + { + "epoch": 4.669503546099291, + "grad_norm": 3.4954607486724854, + "learning_rate": 5.914094852955749e-07, + "loss": 0.3461, + "step": 9876 + }, + { + "epoch": 4.669976359338062, + "grad_norm": 3.8845367431640625, + "learning_rate": 5.910066253308439e-07, + "loss": 0.3868, + "step": 9877 + }, + { + "epoch": 4.6704491725768325, + "grad_norm": 3.18038272857666, + "learning_rate": 5.906038842307598e-07, + "loss": 0.3311, + "step": 9878 + }, + { + "epoch": 4.670921985815603, + "grad_norm": 3.5944042205810547, + "learning_rate": 5.902012620203984e-07, + "loss": 0.3246, + "step": 9879 + }, + { + "epoch": 4.671394799054373, + "grad_norm": 2.980142116546631, + "learning_rate": 5.897987587248311e-07, + "loss": 0.3361, + "step": 9880 + }, + { + "epoch": 4.671867612293144, + "grad_norm": 4.3120269775390625, + "learning_rate": 5.893963743691183e-07, + "loss": 0.3213, + "step": 9881 + }, + { + "epoch": 4.672340425531915, + "grad_norm": 3.42366361618042, + "learning_rate": 5.889941089783163e-07, + "loss": 0.3515, + "step": 9882 + }, + { + "epoch": 4.6728132387706856, + "grad_norm": 2.910720109939575, + "learning_rate": 5.885919625774716e-07, + "loss": 0.3417, + "step": 9883 + }, + { + "epoch": 4.673286052009456, + "grad_norm": 3.122042179107666, + "learning_rate": 5.881899351916242e-07, + "loss": 0.3714, + "step": 9884 + }, + { + "epoch": 4.673758865248227, + "grad_norm": 3.0564188957214355, + "learning_rate": 5.877880268458064e-07, + "loss": 0.3146, + "step": 9885 + }, + { + "epoch": 4.674231678486998, + "grad_norm": 3.303421974182129, + "learning_rate": 5.873862375650427e-07, + "loss": 0.3476, + "step": 9886 + }, + { + "epoch": 4.674704491725768, + "grad_norm": 3.3057096004486084, + "learning_rate": 5.869845673743521e-07, + "loss": 0.3237, + "step": 9887 + }, + { + "epoch": 4.675177304964539, + "grad_norm": 3.1843838691711426, + "learning_rate": 5.865830162987443e-07, + "loss": 0.3789, + "step": 9888 + }, + { + "epoch": 4.6756501182033094, + "grad_norm": 2.865844964981079, + "learning_rate": 5.861815843632213e-07, + "loss": 0.3173, + "step": 9889 
+ }, + { + "epoch": 4.67612293144208, + "grad_norm": 2.986262083053589, + "learning_rate": 5.857802715927796e-07, + "loss": 0.2697, + "step": 9890 + }, + { + "epoch": 4.676595744680851, + "grad_norm": 3.2936089038848877, + "learning_rate": 5.853790780124063e-07, + "loss": 0.3839, + "step": 9891 + }, + { + "epoch": 4.677068557919622, + "grad_norm": 2.7130303382873535, + "learning_rate": 5.849780036470831e-07, + "loss": 0.3004, + "step": 9892 + }, + { + "epoch": 4.677541371158393, + "grad_norm": 3.0076770782470703, + "learning_rate": 5.845770485217827e-07, + "loss": 0.3467, + "step": 9893 + }, + { + "epoch": 4.678014184397163, + "grad_norm": 3.5340375900268555, + "learning_rate": 5.841762126614697e-07, + "loss": 0.3759, + "step": 9894 + }, + { + "epoch": 4.678486997635934, + "grad_norm": 3.0034375190734863, + "learning_rate": 5.837754960911041e-07, + "loss": 0.3099, + "step": 9895 + }, + { + "epoch": 4.678959810874704, + "grad_norm": 3.576899766921997, + "learning_rate": 5.833748988356358e-07, + "loss": 0.3612, + "step": 9896 + }, + { + "epoch": 4.679432624113475, + "grad_norm": 3.0961546897888184, + "learning_rate": 5.829744209200077e-07, + "loss": 0.3098, + "step": 9897 + }, + { + "epoch": 4.679905437352246, + "grad_norm": 3.1387925148010254, + "learning_rate": 5.825740623691576e-07, + "loss": 0.3538, + "step": 9898 + }, + { + "epoch": 4.6803782505910165, + "grad_norm": 3.4131572246551514, + "learning_rate": 5.821738232080127e-07, + "loss": 0.3984, + "step": 9899 + }, + { + "epoch": 4.680851063829787, + "grad_norm": 3.1346065998077393, + "learning_rate": 5.817737034614934e-07, + "loss": 0.3585, + "step": 9900 + }, + { + "epoch": 4.681323877068558, + "grad_norm": 3.148144483566284, + "learning_rate": 5.813737031545155e-07, + "loss": 0.3774, + "step": 9901 + }, + { + "epoch": 4.681796690307329, + "grad_norm": 3.2461299896240234, + "learning_rate": 5.809738223119843e-07, + "loss": 0.3181, + "step": 9902 + }, + { + "epoch": 4.682269503546099, + "grad_norm": 3.1998214721679688, + "learning_rate": 5.805740609587981e-07, + "loss": 0.3452, + "step": 9903 + }, + { + "epoch": 4.68274231678487, + "grad_norm": 2.897399425506592, + "learning_rate": 5.801744191198483e-07, + "loss": 0.3247, + "step": 9904 + }, + { + "epoch": 4.68321513002364, + "grad_norm": 2.921877384185791, + "learning_rate": 5.797748968200198e-07, + "loss": 0.2842, + "step": 9905 + }, + { + "epoch": 4.683687943262411, + "grad_norm": 3.17667818069458, + "learning_rate": 5.793754940841887e-07, + "loss": 0.3218, + "step": 9906 + }, + { + "epoch": 4.684160756501182, + "grad_norm": 3.499068260192871, + "learning_rate": 5.78976210937223e-07, + "loss": 0.3352, + "step": 9907 + }, + { + "epoch": 4.684633569739953, + "grad_norm": 3.2782368659973145, + "learning_rate": 5.785770474039859e-07, + "loss": 0.3671, + "step": 9908 + }, + { + "epoch": 4.685106382978724, + "grad_norm": 3.089757204055786, + "learning_rate": 5.781780035093304e-07, + "loss": 0.3613, + "step": 9909 + }, + { + "epoch": 4.685579196217494, + "grad_norm": 3.082561492919922, + "learning_rate": 5.77779079278104e-07, + "loss": 0.3351, + "step": 9910 + }, + { + "epoch": 4.686052009456265, + "grad_norm": 3.6009864807128906, + "learning_rate": 5.773802747351462e-07, + "loss": 0.3545, + "step": 9911 + }, + { + "epoch": 4.686524822695035, + "grad_norm": 3.488717555999756, + "learning_rate": 5.769815899052872e-07, + "loss": 0.3926, + "step": 9912 + }, + { + "epoch": 4.686997635933806, + "grad_norm": 3.5619056224823, + "learning_rate": 5.765830248133531e-07, + "loss": 0.4171, + "step": 
9913 + }, + { + "epoch": 4.687470449172577, + "grad_norm": 3.30653977394104, + "learning_rate": 5.761845794841594e-07, + "loss": 0.3713, + "step": 9914 + }, + { + "epoch": 4.6879432624113475, + "grad_norm": 2.8256847858428955, + "learning_rate": 5.757862539425171e-07, + "loss": 0.2633, + "step": 9915 + }, + { + "epoch": 4.688416075650118, + "grad_norm": 3.0387041568756104, + "learning_rate": 5.753880482132274e-07, + "loss": 0.3169, + "step": 9916 + }, + { + "epoch": 4.688888888888889, + "grad_norm": 3.2312963008880615, + "learning_rate": 5.749899623210845e-07, + "loss": 0.3238, + "step": 9917 + }, + { + "epoch": 4.68936170212766, + "grad_norm": 3.077155351638794, + "learning_rate": 5.74591996290876e-07, + "loss": 0.3101, + "step": 9918 + }, + { + "epoch": 4.68983451536643, + "grad_norm": 3.461580991744995, + "learning_rate": 5.741941501473811e-07, + "loss": 0.3756, + "step": 9919 + }, + { + "epoch": 4.690307328605201, + "grad_norm": 3.8845605850219727, + "learning_rate": 5.737964239153712e-07, + "loss": 0.3747, + "step": 9920 + }, + { + "epoch": 4.690780141843971, + "grad_norm": 3.1688292026519775, + "learning_rate": 5.733988176196129e-07, + "loss": 0.3663, + "step": 9921 + }, + { + "epoch": 4.691252955082742, + "grad_norm": 3.2730917930603027, + "learning_rate": 5.730013312848614e-07, + "loss": 0.3697, + "step": 9922 + }, + { + "epoch": 4.691725768321513, + "grad_norm": 3.093761682510376, + "learning_rate": 5.726039649358681e-07, + "loss": 0.3215, + "step": 9923 + }, + { + "epoch": 4.692198581560284, + "grad_norm": 3.1679420471191406, + "learning_rate": 5.722067185973746e-07, + "loss": 0.3019, + "step": 9924 + }, + { + "epoch": 4.692671394799055, + "grad_norm": 3.4821531772613525, + "learning_rate": 5.718095922941147e-07, + "loss": 0.3659, + "step": 9925 + }, + { + "epoch": 4.693144208037825, + "grad_norm": 2.985276699066162, + "learning_rate": 5.714125860508177e-07, + "loss": 0.3293, + "step": 9926 + }, + { + "epoch": 4.693617021276596, + "grad_norm": 3.171663999557495, + "learning_rate": 5.710156998922015e-07, + "loss": 0.3647, + "step": 9927 + }, + { + "epoch": 4.694089834515366, + "grad_norm": 3.3699564933776855, + "learning_rate": 5.706189338429798e-07, + "loss": 0.4021, + "step": 9928 + }, + { + "epoch": 4.694562647754137, + "grad_norm": 3.0827202796936035, + "learning_rate": 5.702222879278571e-07, + "loss": 0.329, + "step": 9929 + }, + { + "epoch": 4.695035460992908, + "grad_norm": 3.5798332691192627, + "learning_rate": 5.698257621715303e-07, + "loss": 0.3777, + "step": 9930 + }, + { + "epoch": 4.6955082742316785, + "grad_norm": 2.741230010986328, + "learning_rate": 5.6942935659869e-07, + "loss": 0.31, + "step": 9931 + }, + { + "epoch": 4.695981087470449, + "grad_norm": 2.9929327964782715, + "learning_rate": 5.690330712340187e-07, + "loss": 0.3132, + "step": 9932 + }, + { + "epoch": 4.69645390070922, + "grad_norm": 3.062685489654541, + "learning_rate": 5.68636906102191e-07, + "loss": 0.3204, + "step": 9933 + }, + { + "epoch": 4.696926713947991, + "grad_norm": 3.166281223297119, + "learning_rate": 5.682408612278742e-07, + "loss": 0.3444, + "step": 9934 + }, + { + "epoch": 4.697399527186761, + "grad_norm": 3.0413401126861572, + "learning_rate": 5.678449366357278e-07, + "loss": 0.3506, + "step": 9935 + }, + { + "epoch": 4.697872340425532, + "grad_norm": 3.7843124866485596, + "learning_rate": 5.674491323504059e-07, + "loss": 0.3349, + "step": 9936 + }, + { + "epoch": 4.698345153664302, + "grad_norm": 2.9070212841033936, + "learning_rate": 5.670534483965514e-07, + "loss": 0.2954, + 
"step": 9937 + }, + { + "epoch": 4.698817966903073, + "grad_norm": 2.924229383468628, + "learning_rate": 5.666578847988041e-07, + "loss": 0.3392, + "step": 9938 + }, + { + "epoch": 4.699290780141844, + "grad_norm": 3.1302332878112793, + "learning_rate": 5.662624415817924e-07, + "loss": 0.3198, + "step": 9939 + }, + { + "epoch": 4.699763593380615, + "grad_norm": 3.163005828857422, + "learning_rate": 5.65867118770139e-07, + "loss": 0.3817, + "step": 9940 + }, + { + "epoch": 4.700236406619386, + "grad_norm": 3.4002792835235596, + "learning_rate": 5.654719163884598e-07, + "loss": 0.3961, + "step": 9941 + }, + { + "epoch": 4.700709219858156, + "grad_norm": 3.9756014347076416, + "learning_rate": 5.650768344613616e-07, + "loss": 0.4011, + "step": 9942 + }, + { + "epoch": 4.701182033096927, + "grad_norm": 3.118243455886841, + "learning_rate": 5.64681873013444e-07, + "loss": 0.3675, + "step": 9943 + }, + { + "epoch": 4.701654846335697, + "grad_norm": 3.0520825386047363, + "learning_rate": 5.642870320693005e-07, + "loss": 0.2782, + "step": 9944 + }, + { + "epoch": 4.702127659574468, + "grad_norm": 3.380565643310547, + "learning_rate": 5.638923116535152e-07, + "loss": 0.3632, + "step": 9945 + }, + { + "epoch": 4.702600472813239, + "grad_norm": 3.2340569496154785, + "learning_rate": 5.634977117906668e-07, + "loss": 0.3754, + "step": 9946 + }, + { + "epoch": 4.7030732860520095, + "grad_norm": 3.0068717002868652, + "learning_rate": 5.631032325053243e-07, + "loss": 0.3879, + "step": 9947 + }, + { + "epoch": 4.70354609929078, + "grad_norm": 3.4717891216278076, + "learning_rate": 5.627088738220507e-07, + "loss": 0.4053, + "step": 9948 + }, + { + "epoch": 4.704018912529551, + "grad_norm": 4.362999439239502, + "learning_rate": 5.623146357654008e-07, + "loss": 0.3115, + "step": 9949 + }, + { + "epoch": 4.704491725768322, + "grad_norm": 3.2190041542053223, + "learning_rate": 5.619205183599211e-07, + "loss": 0.3267, + "step": 9950 + }, + { + "epoch": 4.704964539007092, + "grad_norm": 3.413800001144409, + "learning_rate": 5.615265216301532e-07, + "loss": 0.4012, + "step": 9951 + }, + { + "epoch": 4.705437352245863, + "grad_norm": 3.5244312286376953, + "learning_rate": 5.611326456006291e-07, + "loss": 0.3484, + "step": 9952 + }, + { + "epoch": 4.705910165484633, + "grad_norm": 3.055433511734009, + "learning_rate": 5.607388902958727e-07, + "loss": 0.3637, + "step": 9953 + }, + { + "epoch": 4.706382978723404, + "grad_norm": 3.4459595680236816, + "learning_rate": 5.603452557404029e-07, + "loss": 0.3632, + "step": 9954 + }, + { + "epoch": 4.706855791962175, + "grad_norm": 2.972321033477783, + "learning_rate": 5.59951741958728e-07, + "loss": 0.3411, + "step": 9955 + }, + { + "epoch": 4.707328605200946, + "grad_norm": 3.2460532188415527, + "learning_rate": 5.595583489753523e-07, + "loss": 0.3779, + "step": 9956 + }, + { + "epoch": 4.707801418439717, + "grad_norm": 3.514521837234497, + "learning_rate": 5.591650768147694e-07, + "loss": 0.3313, + "step": 9957 + }, + { + "epoch": 4.708274231678487, + "grad_norm": 2.8473336696624756, + "learning_rate": 5.587719255014662e-07, + "loss": 0.3078, + "step": 9958 + }, + { + "epoch": 4.708747044917258, + "grad_norm": 3.309263229370117, + "learning_rate": 5.583788950599239e-07, + "loss": 0.3905, + "step": 9959 + }, + { + "epoch": 4.709219858156028, + "grad_norm": 3.435980796813965, + "learning_rate": 5.579859855146133e-07, + "loss": 0.3507, + "step": 9960 + }, + { + "epoch": 4.709692671394799, + "grad_norm": 3.0237598419189453, + "learning_rate": 5.575931968900006e-07, + "loss": 
0.3349, + "step": 9961 + }, + { + "epoch": 4.71016548463357, + "grad_norm": 3.6978237628936768, + "learning_rate": 5.572005292105426e-07, + "loss": 0.3672, + "step": 9962 + }, + { + "epoch": 4.7106382978723405, + "grad_norm": 3.3029704093933105, + "learning_rate": 5.568079825006883e-07, + "loss": 0.3438, + "step": 9963 + }, + { + "epoch": 4.711111111111111, + "grad_norm": 2.9121241569519043, + "learning_rate": 5.5641555678488e-07, + "loss": 0.3299, + "step": 9964 + }, + { + "epoch": 4.711583924349882, + "grad_norm": 3.2730703353881836, + "learning_rate": 5.56023252087553e-07, + "loss": 0.3572, + "step": 9965 + }, + { + "epoch": 4.712056737588653, + "grad_norm": 3.316593885421753, + "learning_rate": 5.556310684331343e-07, + "loss": 0.3139, + "step": 9966 + }, + { + "epoch": 4.712529550827423, + "grad_norm": 3.1281843185424805, + "learning_rate": 5.552390058460427e-07, + "loss": 0.3362, + "step": 9967 + }, + { + "epoch": 4.7130023640661936, + "grad_norm": 3.3069980144500732, + "learning_rate": 5.548470643506904e-07, + "loss": 0.3839, + "step": 9968 + }, + { + "epoch": 4.713475177304964, + "grad_norm": 4.4018354415893555, + "learning_rate": 5.544552439714826e-07, + "loss": 0.2954, + "step": 9969 + }, + { + "epoch": 4.713947990543735, + "grad_norm": 2.797149658203125, + "learning_rate": 5.540635447328161e-07, + "loss": 0.3253, + "step": 9970 + }, + { + "epoch": 4.714420803782506, + "grad_norm": 3.0065677165985107, + "learning_rate": 5.536719666590792e-07, + "loss": 0.3376, + "step": 9971 + }, + { + "epoch": 4.714893617021277, + "grad_norm": 3.1383140087127686, + "learning_rate": 5.532805097746552e-07, + "loss": 0.3444, + "step": 9972 + }, + { + "epoch": 4.7153664302600475, + "grad_norm": 2.983229398727417, + "learning_rate": 5.528891741039169e-07, + "loss": 0.3173, + "step": 9973 + }, + { + "epoch": 4.715839243498818, + "grad_norm": 3.119361162185669, + "learning_rate": 5.524979596712326e-07, + "loss": 0.3829, + "step": 9974 + }, + { + "epoch": 4.716312056737589, + "grad_norm": 3.4099128246307373, + "learning_rate": 5.52106866500961e-07, + "loss": 0.3363, + "step": 9975 + }, + { + "epoch": 4.716784869976359, + "grad_norm": 2.818964719772339, + "learning_rate": 5.517158946174528e-07, + "loss": 0.321, + "step": 9976 + }, + { + "epoch": 4.71725768321513, + "grad_norm": 3.4968421459198, + "learning_rate": 5.513250440450538e-07, + "loss": 0.3973, + "step": 9977 + }, + { + "epoch": 4.717730496453901, + "grad_norm": 3.3777382373809814, + "learning_rate": 5.509343148080987e-07, + "loss": 0.3607, + "step": 9978 + }, + { + "epoch": 4.718203309692671, + "grad_norm": 2.95882511138916, + "learning_rate": 5.50543706930918e-07, + "loss": 0.3483, + "step": 9979 + }, + { + "epoch": 4.718676122931442, + "grad_norm": 2.8768858909606934, + "learning_rate": 5.501532204378327e-07, + "loss": 0.3488, + "step": 9980 + }, + { + "epoch": 4.719148936170213, + "grad_norm": 2.9310572147369385, + "learning_rate": 5.497628553531565e-07, + "loss": 0.3174, + "step": 9981 + }, + { + "epoch": 4.719621749408984, + "grad_norm": 3.1057486534118652, + "learning_rate": 5.493726117011957e-07, + "loss": 0.346, + "step": 9982 + }, + { + "epoch": 4.720094562647754, + "grad_norm": 3.681593418121338, + "learning_rate": 5.489824895062487e-07, + "loss": 0.3371, + "step": 9983 + }, + { + "epoch": 4.7205673758865245, + "grad_norm": 3.0641729831695557, + "learning_rate": 5.485924887926075e-07, + "loss": 0.3614, + "step": 9984 + }, + { + "epoch": 4.721040189125295, + "grad_norm": 3.2925705909729004, + "learning_rate": 5.482026095845555e-07, + 
"loss": 0.3023, + "step": 9985 + }, + { + "epoch": 4.721513002364066, + "grad_norm": 2.963693141937256, + "learning_rate": 5.47812851906368e-07, + "loss": 0.3706, + "step": 9986 + }, + { + "epoch": 4.721985815602837, + "grad_norm": 3.187870740890503, + "learning_rate": 5.474232157823147e-07, + "loss": 0.3332, + "step": 9987 + }, + { + "epoch": 4.722458628841608, + "grad_norm": 3.9346799850463867, + "learning_rate": 5.470337012366556e-07, + "loss": 0.3738, + "step": 9988 + }, + { + "epoch": 4.7229314420803785, + "grad_norm": 3.385035753250122, + "learning_rate": 5.466443082936446e-07, + "loss": 0.3194, + "step": 9989 + }, + { + "epoch": 4.723404255319149, + "grad_norm": 3.0829477310180664, + "learning_rate": 5.462550369775277e-07, + "loss": 0.2877, + "step": 9990 + }, + { + "epoch": 4.72387706855792, + "grad_norm": 2.8730506896972656, + "learning_rate": 5.458658873125419e-07, + "loss": 0.3352, + "step": 9991 + }, + { + "epoch": 4.72434988179669, + "grad_norm": 3.198498249053955, + "learning_rate": 5.454768593229193e-07, + "loss": 0.3697, + "step": 9992 + }, + { + "epoch": 4.724822695035461, + "grad_norm": 3.37144136428833, + "learning_rate": 5.450879530328824e-07, + "loss": 0.4245, + "step": 9993 + }, + { + "epoch": 4.725295508274232, + "grad_norm": 3.6235079765319824, + "learning_rate": 5.446991684666461e-07, + "loss": 0.3707, + "step": 9994 + }, + { + "epoch": 4.725768321513002, + "grad_norm": 3.5587494373321533, + "learning_rate": 5.443105056484194e-07, + "loss": 0.3297, + "step": 9995 + }, + { + "epoch": 4.726241134751773, + "grad_norm": 3.5308549404144287, + "learning_rate": 5.439219646024018e-07, + "loss": 0.3521, + "step": 9996 + }, + { + "epoch": 4.726713947990544, + "grad_norm": 3.16542649269104, + "learning_rate": 5.435335453527868e-07, + "loss": 0.3499, + "step": 9997 + }, + { + "epoch": 4.727186761229315, + "grad_norm": 3.2565104961395264, + "learning_rate": 5.431452479237586e-07, + "loss": 0.338, + "step": 9998 + }, + { + "epoch": 4.727659574468085, + "grad_norm": 3.371232032775879, + "learning_rate": 5.427570723394951e-07, + "loss": 0.3641, + "step": 9999 + }, + { + "epoch": 4.7281323877068555, + "grad_norm": 2.9784507751464844, + "learning_rate": 5.423690186241668e-07, + "loss": 0.3667, + "step": 10000 + }, + { + "epoch": 4.728605200945626, + "grad_norm": 3.0877480506896973, + "learning_rate": 5.419810868019351e-07, + "loss": 0.3098, + "step": 10001 + }, + { + "epoch": 4.729078014184397, + "grad_norm": 4.132823467254639, + "learning_rate": 5.415932768969562e-07, + "loss": 0.3712, + "step": 10002 + }, + { + "epoch": 4.729550827423168, + "grad_norm": 2.8105905055999756, + "learning_rate": 5.412055889333767e-07, + "loss": 0.2829, + "step": 10003 + }, + { + "epoch": 4.730023640661939, + "grad_norm": 3.543795585632324, + "learning_rate": 5.408180229353352e-07, + "loss": 0.3101, + "step": 10004 + }, + { + "epoch": 4.7304964539007095, + "grad_norm": 3.307525157928467, + "learning_rate": 5.404305789269657e-07, + "loss": 0.3585, + "step": 10005 + }, + { + "epoch": 4.73096926713948, + "grad_norm": 3.0976414680480957, + "learning_rate": 5.400432569323905e-07, + "loss": 0.3202, + "step": 10006 + }, + { + "epoch": 4.73144208037825, + "grad_norm": 3.0249791145324707, + "learning_rate": 5.396560569757284e-07, + "loss": 0.3468, + "step": 10007 + }, + { + "epoch": 4.731914893617021, + "grad_norm": 3.0199971199035645, + "learning_rate": 5.392689790810879e-07, + "loss": 0.3483, + "step": 10008 + }, + { + "epoch": 4.732387706855792, + "grad_norm": 3.177297592163086, + "learning_rate": 
5.388820232725697e-07, + "loss": 0.3333, + "step": 10009 + }, + { + "epoch": 4.732860520094563, + "grad_norm": 3.247121572494507, + "learning_rate": 5.384951895742693e-07, + "loss": 0.2881, + "step": 10010 + }, + { + "epoch": 4.733333333333333, + "grad_norm": 3.513106346130371, + "learning_rate": 5.381084780102727e-07, + "loss": 0.3786, + "step": 10011 + }, + { + "epoch": 4.733806146572104, + "grad_norm": 2.8936305046081543, + "learning_rate": 5.377218886046584e-07, + "loss": 0.3174, + "step": 10012 + }, + { + "epoch": 4.734278959810875, + "grad_norm": 3.1088016033172607, + "learning_rate": 5.373354213814977e-07, + "loss": 0.3108, + "step": 10013 + }, + { + "epoch": 4.734751773049645, + "grad_norm": 2.693617343902588, + "learning_rate": 5.369490763648539e-07, + "loss": 0.3441, + "step": 10014 + }, + { + "epoch": 4.735224586288416, + "grad_norm": 3.4399259090423584, + "learning_rate": 5.365628535787837e-07, + "loss": 0.3937, + "step": 10015 + }, + { + "epoch": 4.7356973995271865, + "grad_norm": 3.28714919090271, + "learning_rate": 5.361767530473355e-07, + "loss": 0.2993, + "step": 10016 + }, + { + "epoch": 4.736170212765957, + "grad_norm": 3.1407346725463867, + "learning_rate": 5.35790774794549e-07, + "loss": 0.3605, + "step": 10017 + }, + { + "epoch": 4.736643026004728, + "grad_norm": 3.464386224746704, + "learning_rate": 5.354049188444588e-07, + "loss": 0.382, + "step": 10018 + }, + { + "epoch": 4.737115839243499, + "grad_norm": 3.303809881210327, + "learning_rate": 5.350191852210889e-07, + "loss": 0.3438, + "step": 10019 + }, + { + "epoch": 4.73758865248227, + "grad_norm": 3.3727755546569824, + "learning_rate": 5.346335739484593e-07, + "loss": 0.3524, + "step": 10020 + }, + { + "epoch": 4.7380614657210405, + "grad_norm": 3.125762939453125, + "learning_rate": 5.342480850505788e-07, + "loss": 0.3762, + "step": 10021 + }, + { + "epoch": 4.738534278959811, + "grad_norm": 3.32598876953125, + "learning_rate": 5.3386271855145e-07, + "loss": 0.345, + "step": 10022 + }, + { + "epoch": 4.739007092198581, + "grad_norm": 2.889338970184326, + "learning_rate": 5.334774744750692e-07, + "loss": 0.3245, + "step": 10023 + }, + { + "epoch": 4.739479905437352, + "grad_norm": 3.3369252681732178, + "learning_rate": 5.330923528454223e-07, + "loss": 0.366, + "step": 10024 + }, + { + "epoch": 4.739952718676123, + "grad_norm": 3.008836269378662, + "learning_rate": 5.327073536864908e-07, + "loss": 0.358, + "step": 10025 + }, + { + "epoch": 4.740425531914894, + "grad_norm": 3.1076738834381104, + "learning_rate": 5.323224770222457e-07, + "loss": 0.3398, + "step": 10026 + }, + { + "epoch": 4.740898345153664, + "grad_norm": 3.269164800643921, + "learning_rate": 5.319377228766523e-07, + "loss": 0.3364, + "step": 10027 + }, + { + "epoch": 4.741371158392435, + "grad_norm": 3.3928871154785156, + "learning_rate": 5.315530912736671e-07, + "loss": 0.3376, + "step": 10028 + }, + { + "epoch": 4.741843971631206, + "grad_norm": 2.7413101196289062, + "learning_rate": 5.31168582237239e-07, + "loss": 0.3551, + "step": 10029 + }, + { + "epoch": 4.742316784869976, + "grad_norm": 2.837280035018921, + "learning_rate": 5.307841957913104e-07, + "loss": 0.316, + "step": 10030 + }, + { + "epoch": 4.742789598108747, + "grad_norm": 3.140482187271118, + "learning_rate": 5.303999319598158e-07, + "loss": 0.3951, + "step": 10031 + }, + { + "epoch": 4.7432624113475175, + "grad_norm": 2.978053331375122, + "learning_rate": 5.3001579076668e-07, + "loss": 0.3328, + "step": 10032 + }, + { + "epoch": 4.743735224586288, + "grad_norm": 
3.3469338417053223, + "learning_rate": 5.296317722358235e-07, + "loss": 0.328, + "step": 10033 + }, + { + "epoch": 4.744208037825059, + "grad_norm": 3.1574513912200928, + "learning_rate": 5.29247876391156e-07, + "loss": 0.3375, + "step": 10034 + }, + { + "epoch": 4.74468085106383, + "grad_norm": 2.9314582347869873, + "learning_rate": 5.288641032565825e-07, + "loss": 0.3025, + "step": 10035 + }, + { + "epoch": 4.745153664302601, + "grad_norm": 3.298856258392334, + "learning_rate": 5.284804528559981e-07, + "loss": 0.3071, + "step": 10036 + }, + { + "epoch": 4.7456264775413715, + "grad_norm": 3.489758014678955, + "learning_rate": 5.280969252132903e-07, + "loss": 0.3392, + "step": 10037 + }, + { + "epoch": 4.746099290780142, + "grad_norm": 3.1727964878082275, + "learning_rate": 5.277135203523412e-07, + "loss": 0.3472, + "step": 10038 + }, + { + "epoch": 4.746572104018912, + "grad_norm": 3.267204761505127, + "learning_rate": 5.27330238297023e-07, + "loss": 0.3555, + "step": 10039 + }, + { + "epoch": 4.747044917257683, + "grad_norm": 3.376077175140381, + "learning_rate": 5.269470790712003e-07, + "loss": 0.4018, + "step": 10040 + }, + { + "epoch": 4.747517730496454, + "grad_norm": 3.2389678955078125, + "learning_rate": 5.265640426987321e-07, + "loss": 0.3742, + "step": 10041 + }, + { + "epoch": 4.7479905437352246, + "grad_norm": 3.0280439853668213, + "learning_rate": 5.261811292034668e-07, + "loss": 0.3254, + "step": 10042 + }, + { + "epoch": 4.748463356973995, + "grad_norm": 3.1756322383880615, + "learning_rate": 5.257983386092486e-07, + "loss": 0.3434, + "step": 10043 + }, + { + "epoch": 4.748936170212766, + "grad_norm": 3.220245599746704, + "learning_rate": 5.254156709399111e-07, + "loss": 0.3795, + "step": 10044 + }, + { + "epoch": 4.749408983451537, + "grad_norm": 3.4887516498565674, + "learning_rate": 5.250331262192815e-07, + "loss": 0.353, + "step": 10045 + }, + { + "epoch": 4.749881796690307, + "grad_norm": 3.1106226444244385, + "learning_rate": 5.246507044711791e-07, + "loss": 0.3329, + "step": 10046 + }, + { + "epoch": 4.750354609929078, + "grad_norm": 3.0493836402893066, + "learning_rate": 5.24268405719415e-07, + "loss": 0.3372, + "step": 10047 + }, + { + "epoch": 4.7508274231678485, + "grad_norm": 3.0885660648345947, + "learning_rate": 5.238862299877948e-07, + "loss": 0.3583, + "step": 10048 + }, + { + "epoch": 4.751300236406619, + "grad_norm": 3.194566011428833, + "learning_rate": 5.23504177300114e-07, + "loss": 0.3886, + "step": 10049 + }, + { + "epoch": 4.75177304964539, + "grad_norm": 2.9062368869781494, + "learning_rate": 5.231222476801606e-07, + "loss": 0.3267, + "step": 10050 + }, + { + "epoch": 4.752245862884161, + "grad_norm": 2.9814155101776123, + "learning_rate": 5.227404411517173e-07, + "loss": 0.3817, + "step": 10051 + }, + { + "epoch": 4.752718676122932, + "grad_norm": 3.526301383972168, + "learning_rate": 5.22358757738556e-07, + "loss": 0.3405, + "step": 10052 + }, + { + "epoch": 4.753191489361702, + "grad_norm": 3.2342031002044678, + "learning_rate": 5.219771974644439e-07, + "loss": 0.3429, + "step": 10053 + }, + { + "epoch": 4.753664302600473, + "grad_norm": 3.0213656425476074, + "learning_rate": 5.215957603531383e-07, + "loss": 0.3482, + "step": 10054 + }, + { + "epoch": 4.754137115839243, + "grad_norm": 3.566260576248169, + "learning_rate": 5.212144464283889e-07, + "loss": 0.3633, + "step": 10055 + }, + { + "epoch": 4.754609929078014, + "grad_norm": 3.3363420963287354, + "learning_rate": 5.208332557139398e-07, + "loss": 0.3528, + "step": 10056 + }, + { + "epoch": 
4.755082742316785, + "grad_norm": 3.3407959938049316, + "learning_rate": 5.204521882335251e-07, + "loss": 0.3219, + "step": 10057 + }, + { + "epoch": 4.7555555555555555, + "grad_norm": 2.9756882190704346, + "learning_rate": 5.200712440108729e-07, + "loss": 0.3141, + "step": 10058 + }, + { + "epoch": 4.756028368794326, + "grad_norm": 3.7191832065582275, + "learning_rate": 5.19690423069703e-07, + "loss": 0.3657, + "step": 10059 + }, + { + "epoch": 4.756501182033097, + "grad_norm": 3.175494432449341, + "learning_rate": 5.193097254337268e-07, + "loss": 0.2922, + "step": 10060 + }, + { + "epoch": 4.756973995271868, + "grad_norm": 2.9288907051086426, + "learning_rate": 5.189291511266489e-07, + "loss": 0.3097, + "step": 10061 + }, + { + "epoch": 4.757446808510638, + "grad_norm": 3.1014389991760254, + "learning_rate": 5.185487001721656e-07, + "loss": 0.3443, + "step": 10062 + }, + { + "epoch": 4.757919621749409, + "grad_norm": 3.3224666118621826, + "learning_rate": 5.181683725939668e-07, + "loss": 0.3408, + "step": 10063 + }, + { + "epoch": 4.758392434988179, + "grad_norm": 3.248089075088501, + "learning_rate": 5.177881684157335e-07, + "loss": 0.366, + "step": 10064 + }, + { + "epoch": 4.75886524822695, + "grad_norm": 3.3183906078338623, + "learning_rate": 5.174080876611385e-07, + "loss": 0.3774, + "step": 10065 + }, + { + "epoch": 4.759338061465721, + "grad_norm": 3.1653311252593994, + "learning_rate": 5.17028130353849e-07, + "loss": 0.3208, + "step": 10066 + }, + { + "epoch": 4.759810874704492, + "grad_norm": 2.9300882816314697, + "learning_rate": 5.166482965175229e-07, + "loss": 0.3494, + "step": 10067 + }, + { + "epoch": 4.760283687943263, + "grad_norm": 3.504225254058838, + "learning_rate": 5.162685861758099e-07, + "loss": 0.3777, + "step": 10068 + }, + { + "epoch": 4.760756501182033, + "grad_norm": 3.3933908939361572, + "learning_rate": 5.158889993523544e-07, + "loss": 0.3575, + "step": 10069 + }, + { + "epoch": 4.761229314420804, + "grad_norm": 4.30021333694458, + "learning_rate": 5.155095360707901e-07, + "loss": 0.3435, + "step": 10070 + }, + { + "epoch": 4.761702127659574, + "grad_norm": 3.226658582687378, + "learning_rate": 5.151301963547462e-07, + "loss": 0.3473, + "step": 10071 + }, + { + "epoch": 4.762174940898345, + "grad_norm": 3.222884178161621, + "learning_rate": 5.14750980227841e-07, + "loss": 0.314, + "step": 10072 + }, + { + "epoch": 4.762647754137116, + "grad_norm": 3.077139377593994, + "learning_rate": 5.143718877136872e-07, + "loss": 0.2929, + "step": 10073 + }, + { + "epoch": 4.7631205673758865, + "grad_norm": 2.9789531230926514, + "learning_rate": 5.139929188358894e-07, + "loss": 0.3594, + "step": 10074 + }, + { + "epoch": 4.763593380614657, + "grad_norm": 3.558417797088623, + "learning_rate": 5.136140736180445e-07, + "loss": 0.356, + "step": 10075 + }, + { + "epoch": 4.764066193853428, + "grad_norm": 2.8887953758239746, + "learning_rate": 5.13235352083741e-07, + "loss": 0.2957, + "step": 10076 + }, + { + "epoch": 4.764539007092199, + "grad_norm": 3.187857151031494, + "learning_rate": 5.128567542565605e-07, + "loss": 0.3879, + "step": 10077 + }, + { + "epoch": 4.765011820330969, + "grad_norm": 3.761465072631836, + "learning_rate": 5.124782801600758e-07, + "loss": 0.3163, + "step": 10078 + }, + { + "epoch": 4.76548463356974, + "grad_norm": 3.4338560104370117, + "learning_rate": 5.120999298178541e-07, + "loss": 0.3924, + "step": 10079 + }, + { + "epoch": 4.76595744680851, + "grad_norm": 2.8551666736602783, + "learning_rate": 5.117217032534528e-07, + "loss": 0.329, + "step": 
10080 + }, + { + "epoch": 4.766430260047281, + "grad_norm": 3.4713878631591797, + "learning_rate": 5.113436004904232e-07, + "loss": 0.3802, + "step": 10081 + }, + { + "epoch": 4.766903073286052, + "grad_norm": 3.1913888454437256, + "learning_rate": 5.109656215523076e-07, + "loss": 0.3273, + "step": 10082 + }, + { + "epoch": 4.767375886524823, + "grad_norm": 2.8070812225341797, + "learning_rate": 5.105877664626402e-07, + "loss": 0.3398, + "step": 10083 + }, + { + "epoch": 4.767848699763594, + "grad_norm": 3.316321849822998, + "learning_rate": 5.102100352449502e-07, + "loss": 0.3649, + "step": 10084 + }, + { + "epoch": 4.768321513002364, + "grad_norm": 3.3555870056152344, + "learning_rate": 5.098324279227557e-07, + "loss": 0.333, + "step": 10085 + }, + { + "epoch": 4.768794326241135, + "grad_norm": 3.0964810848236084, + "learning_rate": 5.094549445195699e-07, + "loss": 0.3384, + "step": 10086 + }, + { + "epoch": 4.769267139479905, + "grad_norm": 3.0406007766723633, + "learning_rate": 5.090775850588963e-07, + "loss": 0.3582, + "step": 10087 + }, + { + "epoch": 4.769739952718676, + "grad_norm": 2.934340238571167, + "learning_rate": 5.087003495642309e-07, + "loss": 0.3306, + "step": 10088 + }, + { + "epoch": 4.770212765957447, + "grad_norm": 3.441734552383423, + "learning_rate": 5.083232380590641e-07, + "loss": 0.386, + "step": 10089 + }, + { + "epoch": 4.7706855791962175, + "grad_norm": 3.176483631134033, + "learning_rate": 5.079462505668758e-07, + "loss": 0.3516, + "step": 10090 + }, + { + "epoch": 4.771158392434988, + "grad_norm": 3.1490824222564697, + "learning_rate": 5.075693871111395e-07, + "loss": 0.3233, + "step": 10091 + }, + { + "epoch": 4.771631205673759, + "grad_norm": 3.300335645675659, + "learning_rate": 5.07192647715321e-07, + "loss": 0.2975, + "step": 10092 + }, + { + "epoch": 4.77210401891253, + "grad_norm": 3.199085235595703, + "learning_rate": 5.068160324028776e-07, + "loss": 0.3468, + "step": 10093 + }, + { + "epoch": 4.7725768321513, + "grad_norm": 3.4611270427703857, + "learning_rate": 5.064395411972605e-07, + "loss": 0.3319, + "step": 10094 + }, + { + "epoch": 4.773049645390071, + "grad_norm": 3.0549957752227783, + "learning_rate": 5.060631741219119e-07, + "loss": 0.3542, + "step": 10095 + }, + { + "epoch": 4.773522458628841, + "grad_norm": 3.085744619369507, + "learning_rate": 5.056869312002655e-07, + "loss": 0.3611, + "step": 10096 + }, + { + "epoch": 4.773995271867612, + "grad_norm": 3.4383676052093506, + "learning_rate": 5.053108124557496e-07, + "loss": 0.3606, + "step": 10097 + }, + { + "epoch": 4.774468085106383, + "grad_norm": 2.8119592666625977, + "learning_rate": 5.049348179117825e-07, + "loss": 0.3192, + "step": 10098 + }, + { + "epoch": 4.774940898345154, + "grad_norm": 2.8554961681365967, + "learning_rate": 5.045589475917767e-07, + "loss": 0.321, + "step": 10099 + }, + { + "epoch": 4.775413711583925, + "grad_norm": 3.612732410430908, + "learning_rate": 5.041832015191356e-07, + "loss": 0.3385, + "step": 10100 + }, + { + "epoch": 4.775886524822695, + "grad_norm": 3.432650327682495, + "learning_rate": 5.038075797172543e-07, + "loss": 0.3494, + "step": 10101 + }, + { + "epoch": 4.776359338061466, + "grad_norm": 3.241612672805786, + "learning_rate": 5.034320822095228e-07, + "loss": 0.3377, + "step": 10102 + }, + { + "epoch": 4.776832151300236, + "grad_norm": 3.5062692165374756, + "learning_rate": 5.030567090193203e-07, + "loss": 0.4038, + "step": 10103 + }, + { + "epoch": 4.777304964539007, + "grad_norm": 2.9015917778015137, + "learning_rate": 
5.026814601700205e-07, + "loss": 0.2987, + "step": 10104 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 3.0691189765930176, + "learning_rate": 5.023063356849886e-07, + "loss": 0.3725, + "step": 10105 + }, + { + "epoch": 4.7782505910165485, + "grad_norm": 3.1556789875030518, + "learning_rate": 5.019313355875813e-07, + "loss": 0.3554, + "step": 10106 + }, + { + "epoch": 4.778723404255319, + "grad_norm": 2.84529447555542, + "learning_rate": 5.01556459901148e-07, + "loss": 0.3369, + "step": 10107 + }, + { + "epoch": 4.77919621749409, + "grad_norm": 3.322565793991089, + "learning_rate": 5.011817086490315e-07, + "loss": 0.3641, + "step": 10108 + }, + { + "epoch": 4.779669030732861, + "grad_norm": 3.011988639831543, + "learning_rate": 5.008070818545654e-07, + "loss": 0.3153, + "step": 10109 + }, + { + "epoch": 4.780141843971631, + "grad_norm": 2.937770128250122, + "learning_rate": 5.004325795410764e-07, + "loss": 0.3235, + "step": 10110 + }, + { + "epoch": 4.780614657210402, + "grad_norm": 3.0186142921447754, + "learning_rate": 5.00058201731882e-07, + "loss": 0.3443, + "step": 10111 + }, + { + "epoch": 4.781087470449172, + "grad_norm": 3.1810684204101562, + "learning_rate": 4.996839484502946e-07, + "loss": 0.3511, + "step": 10112 + }, + { + "epoch": 4.781560283687943, + "grad_norm": 3.5470240116119385, + "learning_rate": 4.993098197196167e-07, + "loss": 0.4096, + "step": 10113 + }, + { + "epoch": 4.782033096926714, + "grad_norm": 3.1422345638275146, + "learning_rate": 4.989358155631427e-07, + "loss": 0.3566, + "step": 10114 + }, + { + "epoch": 4.782505910165485, + "grad_norm": 3.3392271995544434, + "learning_rate": 4.985619360041619e-07, + "loss": 0.3278, + "step": 10115 + }, + { + "epoch": 4.782978723404256, + "grad_norm": 3.020026206970215, + "learning_rate": 4.981881810659525e-07, + "loss": 0.3349, + "step": 10116 + }, + { + "epoch": 4.783451536643026, + "grad_norm": 3.061652660369873, + "learning_rate": 4.97814550771788e-07, + "loss": 0.3275, + "step": 10117 + }, + { + "epoch": 4.783924349881797, + "grad_norm": 3.5875346660614014, + "learning_rate": 4.974410451449321e-07, + "loss": 0.3694, + "step": 10118 + }, + { + "epoch": 4.784397163120567, + "grad_norm": 3.848348379135132, + "learning_rate": 4.970676642086408e-07, + "loss": 0.3539, + "step": 10119 + }, + { + "epoch": 4.784869976359338, + "grad_norm": 3.237959146499634, + "learning_rate": 4.966944079861641e-07, + "loss": 0.3468, + "step": 10120 + }, + { + "epoch": 4.785342789598109, + "grad_norm": 3.4829745292663574, + "learning_rate": 4.96321276500742e-07, + "loss": 0.348, + "step": 10121 + }, + { + "epoch": 4.7858156028368795, + "grad_norm": 3.29961895942688, + "learning_rate": 4.959482697756085e-07, + "loss": 0.3499, + "step": 10122 + }, + { + "epoch": 4.78628841607565, + "grad_norm": 3.291260242462158, + "learning_rate": 4.955753878339886e-07, + "loss": 0.3525, + "step": 10123 + }, + { + "epoch": 4.786761229314421, + "grad_norm": 3.543893575668335, + "learning_rate": 4.952026306991004e-07, + "loss": 0.4274, + "step": 10124 + }, + { + "epoch": 4.787234042553192, + "grad_norm": 3.551354169845581, + "learning_rate": 4.948299983941534e-07, + "loss": 0.3116, + "step": 10125 + }, + { + "epoch": 4.787706855791962, + "grad_norm": 3.1988296508789062, + "learning_rate": 4.944574909423497e-07, + "loss": 0.3273, + "step": 10126 + }, + { + "epoch": 4.7881796690307326, + "grad_norm": 2.8899428844451904, + "learning_rate": 4.940851083668843e-07, + "loss": 0.3518, + "step": 10127 + }, + { + "epoch": 4.788652482269503, + "grad_norm": 
3.279688835144043, + "learning_rate": 4.937128506909439e-07, + "loss": 0.3735, + "step": 10128 + }, + { + "epoch": 4.789125295508274, + "grad_norm": 3.0784502029418945, + "learning_rate": 4.933407179377059e-07, + "loss": 0.327, + "step": 10129 + }, + { + "epoch": 4.789598108747045, + "grad_norm": 3.390169858932495, + "learning_rate": 4.929687101303435e-07, + "loss": 0.3895, + "step": 10130 + }, + { + "epoch": 4.790070921985816, + "grad_norm": 3.72928524017334, + "learning_rate": 4.925968272920181e-07, + "loss": 0.3598, + "step": 10131 + }, + { + "epoch": 4.7905437352245865, + "grad_norm": 3.3786826133728027, + "learning_rate": 4.922250694458866e-07, + "loss": 0.363, + "step": 10132 + }, + { + "epoch": 4.791016548463357, + "grad_norm": 3.086150884628296, + "learning_rate": 4.918534366150965e-07, + "loss": 0.2877, + "step": 10133 + }, + { + "epoch": 4.791489361702128, + "grad_norm": 3.3568673133850098, + "learning_rate": 4.914819288227865e-07, + "loss": 0.3153, + "step": 10134 + }, + { + "epoch": 4.791962174940898, + "grad_norm": 3.294382095336914, + "learning_rate": 4.911105460920904e-07, + "loss": 0.3327, + "step": 10135 + }, + { + "epoch": 4.792434988179669, + "grad_norm": 3.0562479496002197, + "learning_rate": 4.907392884461321e-07, + "loss": 0.3368, + "step": 10136 + }, + { + "epoch": 4.79290780141844, + "grad_norm": 2.928912878036499, + "learning_rate": 4.90368155908027e-07, + "loss": 0.295, + "step": 10137 + }, + { + "epoch": 4.79338061465721, + "grad_norm": 3.0252797603607178, + "learning_rate": 4.899971485008858e-07, + "loss": 0.2985, + "step": 10138 + }, + { + "epoch": 4.793853427895981, + "grad_norm": 2.830035924911499, + "learning_rate": 4.896262662478085e-07, + "loss": 0.3518, + "step": 10139 + }, + { + "epoch": 4.794326241134752, + "grad_norm": 3.042524576187134, + "learning_rate": 4.892555091718884e-07, + "loss": 0.2871, + "step": 10140 + }, + { + "epoch": 4.794799054373523, + "grad_norm": 2.920741558074951, + "learning_rate": 4.888848772962107e-07, + "loss": 0.3234, + "step": 10141 + }, + { + "epoch": 4.795271867612293, + "grad_norm": 3.3935956954956055, + "learning_rate": 4.885143706438527e-07, + "loss": 0.3612, + "step": 10142 + }, + { + "epoch": 4.7957446808510635, + "grad_norm": 3.1501455307006836, + "learning_rate": 4.881439892378853e-07, + "loss": 0.3239, + "step": 10143 + }, + { + "epoch": 4.796217494089834, + "grad_norm": 3.233794927597046, + "learning_rate": 4.877737331013696e-07, + "loss": 0.3185, + "step": 10144 + }, + { + "epoch": 4.796690307328605, + "grad_norm": 3.1155240535736084, + "learning_rate": 4.874036022573605e-07, + "loss": 0.3128, + "step": 10145 + }, + { + "epoch": 4.797163120567376, + "grad_norm": 3.313546895980835, + "learning_rate": 4.870335967289042e-07, + "loss": 0.3136, + "step": 10146 + }, + { + "epoch": 4.797635933806147, + "grad_norm": 5.024696350097656, + "learning_rate": 4.866637165390387e-07, + "loss": 0.4032, + "step": 10147 + }, + { + "epoch": 4.7981087470449175, + "grad_norm": 3.109086275100708, + "learning_rate": 4.862939617107959e-07, + "loss": 0.3822, + "step": 10148 + }, + { + "epoch": 4.798581560283688, + "grad_norm": 3.144777536392212, + "learning_rate": 4.859243322671978e-07, + "loss": 0.3362, + "step": 10149 + }, + { + "epoch": 4.799054373522459, + "grad_norm": 3.402974843978882, + "learning_rate": 4.855548282312605e-07, + "loss": 0.3803, + "step": 10150 + }, + { + "epoch": 4.799527186761229, + "grad_norm": 2.6077685356140137, + "learning_rate": 4.851854496259911e-07, + "loss": 0.3043, + "step": 10151 + }, + { + "epoch": 4.8, + 
"grad_norm": 3.38386607170105, + "learning_rate": 4.848161964743883e-07, + "loss": 0.3129, + "step": 10152 + }, + { + "epoch": 4.800472813238771, + "grad_norm": 3.193723440170288, + "learning_rate": 4.844470687994454e-07, + "loss": 0.3544, + "step": 10153 + }, + { + "epoch": 4.800945626477541, + "grad_norm": 2.9620895385742188, + "learning_rate": 4.840780666241457e-07, + "loss": 0.3376, + "step": 10154 + }, + { + "epoch": 4.801418439716312, + "grad_norm": 4.192742824554443, + "learning_rate": 4.83709189971465e-07, + "loss": 0.3325, + "step": 10155 + }, + { + "epoch": 4.801891252955083, + "grad_norm": 2.996617555618286, + "learning_rate": 4.83340438864372e-07, + "loss": 0.3237, + "step": 10156 + }, + { + "epoch": 4.802364066193854, + "grad_norm": 3.255037307739258, + "learning_rate": 4.829718133258263e-07, + "loss": 0.3575, + "step": 10157 + }, + { + "epoch": 4.802836879432624, + "grad_norm": 3.1065316200256348, + "learning_rate": 4.826033133787822e-07, + "loss": 0.347, + "step": 10158 + }, + { + "epoch": 4.8033096926713945, + "grad_norm": 3.321096420288086, + "learning_rate": 4.822349390461831e-07, + "loss": 0.3628, + "step": 10159 + }, + { + "epoch": 4.803782505910165, + "grad_norm": 3.549182653427124, + "learning_rate": 4.818666903509672e-07, + "loss": 0.3539, + "step": 10160 + }, + { + "epoch": 4.804255319148936, + "grad_norm": 2.9063286781311035, + "learning_rate": 4.814985673160633e-07, + "loss": 0.2956, + "step": 10161 + }, + { + "epoch": 4.804728132387707, + "grad_norm": 3.1669399738311768, + "learning_rate": 4.81130569964392e-07, + "loss": 0.3263, + "step": 10162 + }, + { + "epoch": 4.805200945626478, + "grad_norm": 3.667128562927246, + "learning_rate": 4.807626983188684e-07, + "loss": 0.3514, + "step": 10163 + }, + { + "epoch": 4.8056737588652485, + "grad_norm": 3.1469576358795166, + "learning_rate": 4.803949524023976e-07, + "loss": 0.3273, + "step": 10164 + }, + { + "epoch": 4.806146572104019, + "grad_norm": 3.6988110542297363, + "learning_rate": 4.800273322378768e-07, + "loss": 0.3293, + "step": 10165 + }, + { + "epoch": 4.80661938534279, + "grad_norm": 3.6419219970703125, + "learning_rate": 4.79659837848197e-07, + "loss": 0.3696, + "step": 10166 + }, + { + "epoch": 4.80709219858156, + "grad_norm": 3.4860944747924805, + "learning_rate": 4.792924692562398e-07, + "loss": 0.3372, + "step": 10167 + }, + { + "epoch": 4.807565011820331, + "grad_norm": 2.879600763320923, + "learning_rate": 4.789252264848806e-07, + "loss": 0.3192, + "step": 10168 + }, + { + "epoch": 4.808037825059102, + "grad_norm": 3.4475104808807373, + "learning_rate": 4.785581095569855e-07, + "loss": 0.3285, + "step": 10169 + }, + { + "epoch": 4.808510638297872, + "grad_norm": 3.528397560119629, + "learning_rate": 4.78191118495413e-07, + "loss": 0.3612, + "step": 10170 + }, + { + "epoch": 4.808983451536643, + "grad_norm": 3.056796073913574, + "learning_rate": 4.778242533230138e-07, + "loss": 0.3077, + "step": 10171 + }, + { + "epoch": 4.809456264775414, + "grad_norm": 3.302171230316162, + "learning_rate": 4.774575140626317e-07, + "loss": 0.2963, + "step": 10172 + }, + { + "epoch": 4.809929078014184, + "grad_norm": 3.1446237564086914, + "learning_rate": 4.770909007371016e-07, + "loss": 0.3438, + "step": 10173 + }, + { + "epoch": 4.810401891252955, + "grad_norm": 2.917919635772705, + "learning_rate": 4.767244133692511e-07, + "loss": 0.3353, + "step": 10174 + }, + { + "epoch": 4.8108747044917255, + "grad_norm": 3.0808987617492676, + "learning_rate": 4.763580519818989e-07, + "loss": 0.3574, + "step": 10175 + }, + { + 
"epoch": 4.811347517730496, + "grad_norm": 3.2861616611480713, + "learning_rate": 4.75991816597858e-07, + "loss": 0.3891, + "step": 10176 + }, + { + "epoch": 4.811820330969267, + "grad_norm": 2.835925340652466, + "learning_rate": 4.7562570723993116e-07, + "loss": 0.3785, + "step": 10177 + }, + { + "epoch": 4.812293144208038, + "grad_norm": 3.5441393852233887, + "learning_rate": 4.7525972393091534e-07, + "loss": 0.3914, + "step": 10178 + }, + { + "epoch": 4.812765957446809, + "grad_norm": 3.94022798538208, + "learning_rate": 4.748938666935984e-07, + "loss": 0.3564, + "step": 10179 + }, + { + "epoch": 4.8132387706855795, + "grad_norm": 3.1686532497406006, + "learning_rate": 4.7452813555076e-07, + "loss": 0.3348, + "step": 10180 + }, + { + "epoch": 4.81371158392435, + "grad_norm": 3.534032106399536, + "learning_rate": 4.7416253052517374e-07, + "loss": 0.3165, + "step": 10181 + }, + { + "epoch": 4.81418439716312, + "grad_norm": 3.1169021129608154, + "learning_rate": 4.7379705163960317e-07, + "loss": 0.3515, + "step": 10182 + }, + { + "epoch": 4.814657210401891, + "grad_norm": 3.564509391784668, + "learning_rate": 4.7343169891680585e-07, + "loss": 0.4195, + "step": 10183 + }, + { + "epoch": 4.815130023640662, + "grad_norm": 3.2813005447387695, + "learning_rate": 4.7306647237953085e-07, + "loss": 0.3574, + "step": 10184 + }, + { + "epoch": 4.815602836879433, + "grad_norm": 3.053349018096924, + "learning_rate": 4.727013720505177e-07, + "loss": 0.3792, + "step": 10185 + }, + { + "epoch": 4.816075650118203, + "grad_norm": 3.069258689880371, + "learning_rate": 4.723363979525017e-07, + "loss": 0.3377, + "step": 10186 + }, + { + "epoch": 4.816548463356974, + "grad_norm": 2.999802350997925, + "learning_rate": 4.71971550108207e-07, + "loss": 0.3232, + "step": 10187 + }, + { + "epoch": 4.817021276595745, + "grad_norm": 2.941810131072998, + "learning_rate": 4.7160682854035107e-07, + "loss": 0.342, + "step": 10188 + }, + { + "epoch": 4.817494089834515, + "grad_norm": 3.407975196838379, + "learning_rate": 4.71242233271644e-07, + "loss": 0.3697, + "step": 10189 + }, + { + "epoch": 4.817966903073286, + "grad_norm": 3.148359537124634, + "learning_rate": 4.708777643247864e-07, + "loss": 0.3297, + "step": 10190 + }, + { + "epoch": 4.8184397163120565, + "grad_norm": 2.9067797660827637, + "learning_rate": 4.7051342172247354e-07, + "loss": 0.2646, + "step": 10191 + }, + { + "epoch": 4.818912529550827, + "grad_norm": 3.4185385704040527, + "learning_rate": 4.70149205487391e-07, + "loss": 0.3296, + "step": 10192 + }, + { + "epoch": 4.819385342789598, + "grad_norm": 3.363966464996338, + "learning_rate": 4.697851156422162e-07, + "loss": 0.3744, + "step": 10193 + }, + { + "epoch": 4.819858156028369, + "grad_norm": 2.944939613342285, + "learning_rate": 4.6942115220962067e-07, + "loss": 0.3311, + "step": 10194 + }, + { + "epoch": 4.82033096926714, + "grad_norm": 3.2023603916168213, + "learning_rate": 4.6905731521226544e-07, + "loss": 0.3114, + "step": 10195 + }, + { + "epoch": 4.8208037825059105, + "grad_norm": 2.9747812747955322, + "learning_rate": 4.686936046728063e-07, + "loss": 0.2891, + "step": 10196 + }, + { + "epoch": 4.821276595744681, + "grad_norm": 3.6693246364593506, + "learning_rate": 4.6833002061388965e-07, + "loss": 0.3758, + "step": 10197 + }, + { + "epoch": 4.821749408983451, + "grad_norm": 3.4812891483306885, + "learning_rate": 4.679665630581534e-07, + "loss": 0.3274, + "step": 10198 + }, + { + "epoch": 4.822222222222222, + "grad_norm": 2.888956308364868, + "learning_rate": 4.676032320282295e-07, + "loss": 
0.3304, + "step": 10199 + }, + { + "epoch": 4.822695035460993, + "grad_norm": 3.2659964561462402, + "learning_rate": 4.6724002754674006e-07, + "loss": 0.3267, + "step": 10200 + }, + { + "epoch": 4.823167848699764, + "grad_norm": 3.2733213901519775, + "learning_rate": 4.6687694963630127e-07, + "loss": 0.3067, + "step": 10201 + }, + { + "epoch": 4.823640661938534, + "grad_norm": 3.0957846641540527, + "learning_rate": 4.6651399831951995e-07, + "loss": 0.3586, + "step": 10202 + }, + { + "epoch": 4.824113475177305, + "grad_norm": 2.9597535133361816, + "learning_rate": 4.6615117361899526e-07, + "loss": 0.3409, + "step": 10203 + }, + { + "epoch": 4.824586288416076, + "grad_norm": 3.0622851848602295, + "learning_rate": 4.657884755573189e-07, + "loss": 0.3112, + "step": 10204 + }, + { + "epoch": 4.825059101654846, + "grad_norm": 3.088568925857544, + "learning_rate": 4.6542590415707355e-07, + "loss": 0.3161, + "step": 10205 + }, + { + "epoch": 4.825531914893617, + "grad_norm": 3.2927064895629883, + "learning_rate": 4.650634594408368e-07, + "loss": 0.3368, + "step": 10206 + }, + { + "epoch": 4.8260047281323875, + "grad_norm": 2.9728758335113525, + "learning_rate": 4.647011414311753e-07, + "loss": 0.3615, + "step": 10207 + }, + { + "epoch": 4.826477541371158, + "grad_norm": 3.301173686981201, + "learning_rate": 4.643389501506487e-07, + "loss": 0.3597, + "step": 10208 + }, + { + "epoch": 4.826950354609929, + "grad_norm": 3.421177864074707, + "learning_rate": 4.639768856218102e-07, + "loss": 0.3087, + "step": 10209 + }, + { + "epoch": 4.8274231678487, + "grad_norm": 3.1131463050842285, + "learning_rate": 4.636149478672031e-07, + "loss": 0.3776, + "step": 10210 + }, + { + "epoch": 4.827895981087471, + "grad_norm": 3.5807228088378906, + "learning_rate": 4.6325313690936347e-07, + "loss": 0.3556, + "step": 10211 + }, + { + "epoch": 4.828368794326241, + "grad_norm": 3.2873311042785645, + "learning_rate": 4.6289145277082085e-07, + "loss": 0.377, + "step": 10212 + }, + { + "epoch": 4.828841607565012, + "grad_norm": 3.502228021621704, + "learning_rate": 4.6252989547409423e-07, + "loss": 0.3699, + "step": 10213 + }, + { + "epoch": 4.829314420803782, + "grad_norm": 3.8895792961120605, + "learning_rate": 4.621684650416977e-07, + "loss": 0.3594, + "step": 10214 + }, + { + "epoch": 4.829787234042553, + "grad_norm": 3.11706805229187, + "learning_rate": 4.6180716149613505e-07, + "loss": 0.3404, + "step": 10215 + }, + { + "epoch": 4.830260047281324, + "grad_norm": 3.174584150314331, + "learning_rate": 4.614459848599029e-07, + "loss": 0.3684, + "step": 10216 + }, + { + "epoch": 4.8307328605200945, + "grad_norm": 3.0028135776519775, + "learning_rate": 4.610849351554908e-07, + "loss": 0.3505, + "step": 10217 + }, + { + "epoch": 4.831205673758865, + "grad_norm": 3.053354024887085, + "learning_rate": 4.6072401240537965e-07, + "loss": 0.392, + "step": 10218 + }, + { + "epoch": 4.831678486997636, + "grad_norm": 3.2726800441741943, + "learning_rate": 4.603632166320424e-07, + "loss": 0.3506, + "step": 10219 + }, + { + "epoch": 4.832151300236407, + "grad_norm": 3.5746219158172607, + "learning_rate": 4.600025478579437e-07, + "loss": 0.3585, + "step": 10220 + }, + { + "epoch": 4.832624113475177, + "grad_norm": 3.0742499828338623, + "learning_rate": 4.596420061055409e-07, + "loss": 0.332, + "step": 10221 + }, + { + "epoch": 4.833096926713948, + "grad_norm": 3.161022186279297, + "learning_rate": 4.5928159139728426e-07, + "loss": 0.3303, + "step": 10222 + }, + { + "epoch": 4.833569739952718, + "grad_norm": 3.2312185764312744, + 
"learning_rate": 4.5892130375561395e-07, + "loss": 0.3532, + "step": 10223 + }, + { + "epoch": 4.834042553191489, + "grad_norm": 3.501893997192383, + "learning_rate": 4.585611432029649e-07, + "loss": 0.3862, + "step": 10224 + }, + { + "epoch": 4.83451536643026, + "grad_norm": 3.2005560398101807, + "learning_rate": 4.5820110976176194e-07, + "loss": 0.3626, + "step": 10225 + }, + { + "epoch": 4.834988179669031, + "grad_norm": 3.4039556980133057, + "learning_rate": 4.578412034544225e-07, + "loss": 0.3789, + "step": 10226 + }, + { + "epoch": 4.835460992907802, + "grad_norm": 3.4461448192596436, + "learning_rate": 4.574814243033571e-07, + "loss": 0.3714, + "step": 10227 + }, + { + "epoch": 4.835933806146572, + "grad_norm": 3.435886859893799, + "learning_rate": 4.571217723309665e-07, + "loss": 0.3015, + "step": 10228 + }, + { + "epoch": 4.836406619385343, + "grad_norm": 3.612645149230957, + "learning_rate": 4.567622475596462e-07, + "loss": 0.3738, + "step": 10229 + }, + { + "epoch": 4.836879432624113, + "grad_norm": 3.1911067962646484, + "learning_rate": 4.564028500117815e-07, + "loss": 0.2994, + "step": 10230 + }, + { + "epoch": 4.837352245862884, + "grad_norm": 2.9745163917541504, + "learning_rate": 4.5604357970974956e-07, + "loss": 0.3353, + "step": 10231 + }, + { + "epoch": 4.837825059101655, + "grad_norm": 3.4999606609344482, + "learning_rate": 4.556844366759222e-07, + "loss": 0.3796, + "step": 10232 + }, + { + "epoch": 4.8382978723404255, + "grad_norm": 2.8130152225494385, + "learning_rate": 4.553254209326607e-07, + "loss": 0.2964, + "step": 10233 + }, + { + "epoch": 4.838770685579196, + "grad_norm": 3.5461673736572266, + "learning_rate": 4.5496653250232005e-07, + "loss": 0.3626, + "step": 10234 + }, + { + "epoch": 4.839243498817967, + "grad_norm": 3.3498404026031494, + "learning_rate": 4.546077714072458e-07, + "loss": 0.2982, + "step": 10235 + }, + { + "epoch": 4.839716312056738, + "grad_norm": 2.8942501544952393, + "learning_rate": 4.5424913766977635e-07, + "loss": 0.2447, + "step": 10236 + }, + { + "epoch": 4.840189125295508, + "grad_norm": 3.3506743907928467, + "learning_rate": 4.5389063131224346e-07, + "loss": 0.2908, + "step": 10237 + }, + { + "epoch": 4.840661938534279, + "grad_norm": 3.058872699737549, + "learning_rate": 4.535322523569691e-07, + "loss": 0.3275, + "step": 10238 + }, + { + "epoch": 4.841134751773049, + "grad_norm": 3.0573856830596924, + "learning_rate": 4.5317400082626696e-07, + "loss": 0.3096, + "step": 10239 + }, + { + "epoch": 4.84160756501182, + "grad_norm": 3.3260257244110107, + "learning_rate": 4.5281587674244563e-07, + "loss": 0.3334, + "step": 10240 + }, + { + "epoch": 4.842080378250591, + "grad_norm": 3.265740156173706, + "learning_rate": 4.5245788012780234e-07, + "loss": 0.3698, + "step": 10241 + }, + { + "epoch": 4.842553191489362, + "grad_norm": 3.4116036891937256, + "learning_rate": 4.521000110046292e-07, + "loss": 0.4159, + "step": 10242 + }, + { + "epoch": 4.843026004728133, + "grad_norm": 3.3263189792633057, + "learning_rate": 4.5174226939520865e-07, + "loss": 0.3579, + "step": 10243 + }, + { + "epoch": 4.843498817966903, + "grad_norm": 3.4223177433013916, + "learning_rate": 4.5138465532181514e-07, + "loss": 0.3539, + "step": 10244 + }, + { + "epoch": 4.843971631205674, + "grad_norm": 3.481016159057617, + "learning_rate": 4.5102716880671665e-07, + "loss": 0.3527, + "step": 10245 + }, + { + "epoch": 4.844444444444444, + "grad_norm": 2.830122232437134, + "learning_rate": 4.5066980987217124e-07, + "loss": 0.3339, + "step": 10246 + }, + { + "epoch": 
4.844917257683215, + "grad_norm": 2.895792007446289, + "learning_rate": 4.5031257854043163e-07, + "loss": 0.3056, + "step": 10247 + }, + { + "epoch": 4.845390070921986, + "grad_norm": 2.9748036861419678, + "learning_rate": 4.499554748337398e-07, + "loss": 0.2794, + "step": 10248 + }, + { + "epoch": 4.8458628841607565, + "grad_norm": 3.223539113998413, + "learning_rate": 4.49598498774331e-07, + "loss": 0.3756, + "step": 10249 + }, + { + "epoch": 4.846335697399527, + "grad_norm": 3.491365432739258, + "learning_rate": 4.492416503844335e-07, + "loss": 0.378, + "step": 10250 + }, + { + "epoch": 4.846808510638298, + "grad_norm": 2.7236695289611816, + "learning_rate": 4.48884929686266e-07, + "loss": 0.297, + "step": 10251 + }, + { + "epoch": 4.847281323877069, + "grad_norm": 3.3814051151275635, + "learning_rate": 4.4852833670204045e-07, + "loss": 0.2923, + "step": 10252 + }, + { + "epoch": 4.847754137115839, + "grad_norm": 3.168334722518921, + "learning_rate": 4.4817187145395956e-07, + "loss": 0.3455, + "step": 10253 + }, + { + "epoch": 4.84822695035461, + "grad_norm": 3.0346829891204834, + "learning_rate": 4.4781553396421873e-07, + "loss": 0.3416, + "step": 10254 + }, + { + "epoch": 4.84869976359338, + "grad_norm": 3.1232426166534424, + "learning_rate": 4.4745932425500657e-07, + "loss": 0.3494, + "step": 10255 + }, + { + "epoch": 4.849172576832151, + "grad_norm": 3.0737383365631104, + "learning_rate": 4.471032423485017e-07, + "loss": 0.3246, + "step": 10256 + }, + { + "epoch": 4.849645390070922, + "grad_norm": 3.421461582183838, + "learning_rate": 4.467472882668769e-07, + "loss": 0.3807, + "step": 10257 + }, + { + "epoch": 4.850118203309693, + "grad_norm": 3.3846490383148193, + "learning_rate": 4.463914620322951e-07, + "loss": 0.3695, + "step": 10258 + }, + { + "epoch": 4.850591016548464, + "grad_norm": 4.0876007080078125, + "learning_rate": 4.460357636669116e-07, + "loss": 0.3913, + "step": 10259 + }, + { + "epoch": 4.851063829787234, + "grad_norm": 3.2078847885131836, + "learning_rate": 4.456801931928753e-07, + "loss": 0.3424, + "step": 10260 + }, + { + "epoch": 4.851536643026005, + "grad_norm": 3.4163241386413574, + "learning_rate": 4.453247506323255e-07, + "loss": 0.3907, + "step": 10261 + }, + { + "epoch": 4.852009456264775, + "grad_norm": 2.989793539047241, + "learning_rate": 4.449694360073931e-07, + "loss": 0.3313, + "step": 10262 + }, + { + "epoch": 4.852482269503546, + "grad_norm": 3.291537284851074, + "learning_rate": 4.446142493402039e-07, + "loss": 0.3594, + "step": 10263 + }, + { + "epoch": 4.852955082742317, + "grad_norm": 3.6327221393585205, + "learning_rate": 4.4425919065287204e-07, + "loss": 0.3844, + "step": 10264 + }, + { + "epoch": 4.8534278959810875, + "grad_norm": 3.486333131790161, + "learning_rate": 4.439042599675067e-07, + "loss": 0.3666, + "step": 10265 + }, + { + "epoch": 4.853900709219858, + "grad_norm": 3.7585315704345703, + "learning_rate": 4.435494573062074e-07, + "loss": 0.3287, + "step": 10266 + }, + { + "epoch": 4.854373522458629, + "grad_norm": 3.3496108055114746, + "learning_rate": 4.4319478269106625e-07, + "loss": 0.4021, + "step": 10267 + }, + { + "epoch": 4.8548463356974, + "grad_norm": 3.4681267738342285, + "learning_rate": 4.428402361441672e-07, + "loss": 0.3119, + "step": 10268 + }, + { + "epoch": 4.85531914893617, + "grad_norm": 2.9935829639434814, + "learning_rate": 4.4248581768758567e-07, + "loss": 0.305, + "step": 10269 + }, + { + "epoch": 4.855791962174941, + "grad_norm": 3.5839056968688965, + "learning_rate": 4.42131527343391e-07, + "loss": 0.4095, 
+ "step": 10270 + }, + { + "epoch": 4.856264775413711, + "grad_norm": 3.088690757751465, + "learning_rate": 4.4177736513364237e-07, + "loss": 0.3391, + "step": 10271 + }, + { + "epoch": 4.856737588652482, + "grad_norm": 3.2721431255340576, + "learning_rate": 4.414233310803917e-07, + "loss": 0.3741, + "step": 10272 + }, + { + "epoch": 4.857210401891253, + "grad_norm": 3.108041524887085, + "learning_rate": 4.4106942520568437e-07, + "loss": 0.4041, + "step": 10273 + }, + { + "epoch": 4.857683215130024, + "grad_norm": 3.0035696029663086, + "learning_rate": 4.407156475315549e-07, + "loss": 0.3408, + "step": 10274 + }, + { + "epoch": 4.858156028368795, + "grad_norm": 3.0572783946990967, + "learning_rate": 4.4036199808003334e-07, + "loss": 0.3207, + "step": 10275 + }, + { + "epoch": 4.858628841607565, + "grad_norm": 3.1695926189422607, + "learning_rate": 4.4000847687313857e-07, + "loss": 0.3605, + "step": 10276 + }, + { + "epoch": 4.859101654846336, + "grad_norm": 3.690382957458496, + "learning_rate": 4.396550839328828e-07, + "loss": 0.4076, + "step": 10277 + }, + { + "epoch": 4.859574468085106, + "grad_norm": 3.271988868713379, + "learning_rate": 4.393018192812712e-07, + "loss": 0.4169, + "step": 10278 + }, + { + "epoch": 4.860047281323877, + "grad_norm": 2.8622982501983643, + "learning_rate": 4.389486829402986e-07, + "loss": 0.3114, + "step": 10279 + }, + { + "epoch": 4.860520094562648, + "grad_norm": 3.3875632286071777, + "learning_rate": 4.385956749319548e-07, + "loss": 0.3664, + "step": 10280 + }, + { + "epoch": 4.8609929078014185, + "grad_norm": 2.98962664604187, + "learning_rate": 4.382427952782195e-07, + "loss": 0.314, + "step": 10281 + }, + { + "epoch": 4.861465721040189, + "grad_norm": 2.899529457092285, + "learning_rate": 4.3789004400106473e-07, + "loss": 0.3588, + "step": 10282 + }, + { + "epoch": 4.86193853427896, + "grad_norm": 3.11767578125, + "learning_rate": 4.3753742112245476e-07, + "loss": 0.3311, + "step": 10283 + }, + { + "epoch": 4.862411347517731, + "grad_norm": 2.9610254764556885, + "learning_rate": 4.3718492666434576e-07, + "loss": 0.3234, + "step": 10284 + }, + { + "epoch": 4.862884160756501, + "grad_norm": 2.9350297451019287, + "learning_rate": 4.368325606486859e-07, + "loss": 0.3086, + "step": 10285 + }, + { + "epoch": 4.863356973995272, + "grad_norm": 3.0126571655273438, + "learning_rate": 4.3648032309741626e-07, + "loss": 0.3033, + "step": 10286 + }, + { + "epoch": 4.863829787234042, + "grad_norm": 3.0580496788024902, + "learning_rate": 4.3612821403246795e-07, + "loss": 0.3631, + "step": 10287 + }, + { + "epoch": 4.864302600472813, + "grad_norm": 2.9186129570007324, + "learning_rate": 4.3577623347576676e-07, + "loss": 0.3449, + "step": 10288 + }, + { + "epoch": 4.864775413711584, + "grad_norm": 3.146562099456787, + "learning_rate": 4.354243814492282e-07, + "loss": 0.369, + "step": 10289 + }, + { + "epoch": 4.865248226950355, + "grad_norm": 2.646812915802002, + "learning_rate": 4.350726579747597e-07, + "loss": 0.331, + "step": 10290 + }, + { + "epoch": 4.8657210401891255, + "grad_norm": 3.2851274013519287, + "learning_rate": 4.3472106307426293e-07, + "loss": 0.3445, + "step": 10291 + }, + { + "epoch": 4.866193853427896, + "grad_norm": 3.144446849822998, + "learning_rate": 4.34369596769629e-07, + "loss": 0.3687, + "step": 10292 + }, + { + "epoch": 4.866666666666667, + "grad_norm": 3.01517915725708, + "learning_rate": 4.3401825908274353e-07, + "loss": 0.3282, + "step": 10293 + }, + { + "epoch": 4.867139479905437, + "grad_norm": 3.171759605407715, + "learning_rate": 
4.33667050035482e-07, + "loss": 0.351, + "step": 10294 + }, + { + "epoch": 4.867612293144208, + "grad_norm": 3.5374269485473633, + "learning_rate": 4.333159696497119e-07, + "loss": 0.3586, + "step": 10295 + }, + { + "epoch": 4.868085106382979, + "grad_norm": 3.4506356716156006, + "learning_rate": 4.3296501794729494e-07, + "loss": 0.4076, + "step": 10296 + }, + { + "epoch": 4.868557919621749, + "grad_norm": 3.348048448562622, + "learning_rate": 4.326141949500826e-07, + "loss": 0.3256, + "step": 10297 + }, + { + "epoch": 4.86903073286052, + "grad_norm": 3.235438108444214, + "learning_rate": 4.322635006799192e-07, + "loss": 0.3215, + "step": 10298 + }, + { + "epoch": 4.869503546099291, + "grad_norm": 3.2025554180145264, + "learning_rate": 4.319129351586407e-07, + "loss": 0.335, + "step": 10299 + }, + { + "epoch": 4.869976359338062, + "grad_norm": 3.0318121910095215, + "learning_rate": 4.315624984080749e-07, + "loss": 0.3304, + "step": 10300 + }, + { + "epoch": 4.870449172576832, + "grad_norm": 2.9115359783172607, + "learning_rate": 4.312121904500433e-07, + "loss": 0.3459, + "step": 10301 + }, + { + "epoch": 4.8709219858156025, + "grad_norm": 3.41164493560791, + "learning_rate": 4.3086201130635633e-07, + "loss": 0.3846, + "step": 10302 + }, + { + "epoch": 4.871394799054373, + "grad_norm": 3.5832016468048096, + "learning_rate": 4.305119609988198e-07, + "loss": 0.3422, + "step": 10303 + }, + { + "epoch": 4.871867612293144, + "grad_norm": 3.5244979858398438, + "learning_rate": 4.30162039549229e-07, + "loss": 0.3862, + "step": 10304 + }, + { + "epoch": 4.872340425531915, + "grad_norm": 3.0881710052490234, + "learning_rate": 4.298122469793714e-07, + "loss": 0.358, + "step": 10305 + }, + { + "epoch": 4.872813238770686, + "grad_norm": 3.3237557411193848, + "learning_rate": 4.294625833110283e-07, + "loss": 0.3742, + "step": 10306 + }, + { + "epoch": 4.8732860520094565, + "grad_norm": 3.1959686279296875, + "learning_rate": 4.291130485659711e-07, + "loss": 0.3426, + "step": 10307 + }, + { + "epoch": 4.873758865248227, + "grad_norm": 3.1890714168548584, + "learning_rate": 4.2876364276596333e-07, + "loss": 0.3131, + "step": 10308 + }, + { + "epoch": 4.874231678486998, + "grad_norm": 2.9387660026550293, + "learning_rate": 4.284143659327619e-07, + "loss": 0.3227, + "step": 10309 + }, + { + "epoch": 4.874704491725768, + "grad_norm": 3.6868603229522705, + "learning_rate": 4.2806521808811367e-07, + "loss": 0.3159, + "step": 10310 + }, + { + "epoch": 4.875177304964539, + "grad_norm": 3.1396310329437256, + "learning_rate": 4.277161992537596e-07, + "loss": 0.3757, + "step": 10311 + }, + { + "epoch": 4.87565011820331, + "grad_norm": 3.4745748043060303, + "learning_rate": 4.273673094514313e-07, + "loss": 0.347, + "step": 10312 + }, + { + "epoch": 4.87612293144208, + "grad_norm": 3.1869146823883057, + "learning_rate": 4.270185487028525e-07, + "loss": 0.3364, + "step": 10313 + }, + { + "epoch": 4.876595744680851, + "grad_norm": 2.8646297454833984, + "learning_rate": 4.2666991702973807e-07, + "loss": 0.2987, + "step": 10314 + }, + { + "epoch": 4.877068557919622, + "grad_norm": 3.3483452796936035, + "learning_rate": 4.263214144537975e-07, + "loss": 0.307, + "step": 10315 + }, + { + "epoch": 4.877541371158393, + "grad_norm": 2.8557562828063965, + "learning_rate": 4.259730409967294e-07, + "loss": 0.3406, + "step": 10316 + }, + { + "epoch": 4.878014184397163, + "grad_norm": 3.351121664047241, + "learning_rate": 4.256247966802257e-07, + "loss": 0.3571, + "step": 10317 + }, + { + "epoch": 4.8784869976359335, + "grad_norm": 
3.1691417694091797, + "learning_rate": 4.252766815259696e-07, + "loss": 0.3686, + "step": 10318 + }, + { + "epoch": 4.878959810874704, + "grad_norm": 2.957632303237915, + "learning_rate": 4.249286955556378e-07, + "loss": 0.3055, + "step": 10319 + }, + { + "epoch": 4.879432624113475, + "grad_norm": 3.234708070755005, + "learning_rate": 4.2458083879089645e-07, + "loss": 0.3733, + "step": 10320 + }, + { + "epoch": 4.879905437352246, + "grad_norm": 3.469207525253296, + "learning_rate": 4.242331112534065e-07, + "loss": 0.3758, + "step": 10321 + }, + { + "epoch": 4.880378250591017, + "grad_norm": 3.2442891597747803, + "learning_rate": 4.2388551296481896e-07, + "loss": 0.3515, + "step": 10322 + }, + { + "epoch": 4.8808510638297875, + "grad_norm": 3.3709537982940674, + "learning_rate": 4.235380439467762e-07, + "loss": 0.421, + "step": 10323 + }, + { + "epoch": 4.881323877068558, + "grad_norm": 2.730891227722168, + "learning_rate": 4.231907042209149e-07, + "loss": 0.3105, + "step": 10324 + }, + { + "epoch": 4.881796690307329, + "grad_norm": 3.6933813095092773, + "learning_rate": 4.228434938088616e-07, + "loss": 0.338, + "step": 10325 + }, + { + "epoch": 4.882269503546099, + "grad_norm": 3.2480294704437256, + "learning_rate": 4.224964127322362e-07, + "loss": 0.3695, + "step": 10326 + }, + { + "epoch": 4.88274231678487, + "grad_norm": 3.229762554168701, + "learning_rate": 4.2214946101264976e-07, + "loss": 0.3768, + "step": 10327 + }, + { + "epoch": 4.883215130023641, + "grad_norm": 3.3844475746154785, + "learning_rate": 4.218026386717047e-07, + "loss": 0.3441, + "step": 10328 + }, + { + "epoch": 4.883687943262411, + "grad_norm": 3.159759283065796, + "learning_rate": 4.2145594573099745e-07, + "loss": 0.3459, + "step": 10329 + }, + { + "epoch": 4.884160756501182, + "grad_norm": 3.5672366619110107, + "learning_rate": 4.21109382212114e-07, + "loss": 0.3908, + "step": 10330 + }, + { + "epoch": 4.884633569739953, + "grad_norm": 3.2481353282928467, + "learning_rate": 4.2076294813663405e-07, + "loss": 0.3778, + "step": 10331 + }, + { + "epoch": 4.885106382978723, + "grad_norm": 3.3311941623687744, + "learning_rate": 4.2041664352612785e-07, + "loss": 0.3171, + "step": 10332 + }, + { + "epoch": 4.885579196217494, + "grad_norm": 3.4712841510772705, + "learning_rate": 4.2007046840215783e-07, + "loss": 0.3858, + "step": 10333 + }, + { + "epoch": 4.8860520094562645, + "grad_norm": 3.1591062545776367, + "learning_rate": 4.197244227862804e-07, + "loss": 0.327, + "step": 10334 + }, + { + "epoch": 4.886524822695035, + "grad_norm": 3.400400400161743, + "learning_rate": 4.1937850670004136e-07, + "loss": 0.3231, + "step": 10335 + }, + { + "epoch": 4.886997635933806, + "grad_norm": 2.9156908988952637, + "learning_rate": 4.190327201649788e-07, + "loss": 0.2834, + "step": 10336 + }, + { + "epoch": 4.887470449172577, + "grad_norm": 3.0125153064727783, + "learning_rate": 4.1868706320262467e-07, + "loss": 0.3143, + "step": 10337 + }, + { + "epoch": 4.887943262411348, + "grad_norm": 2.656107187271118, + "learning_rate": 4.183415358345003e-07, + "loss": 0.3348, + "step": 10338 + }, + { + "epoch": 4.8884160756501185, + "grad_norm": 3.0910565853118896, + "learning_rate": 4.17996138082121e-07, + "loss": 0.3212, + "step": 10339 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 3.1303164958953857, + "learning_rate": 4.1765086996699315e-07, + "loss": 0.3573, + "step": 10340 + }, + { + "epoch": 4.889361702127659, + "grad_norm": 3.504901885986328, + "learning_rate": 4.173057315106141e-07, + "loss": 0.3912, + "step": 10341 + }, + { + 
"epoch": 4.88983451536643, + "grad_norm": 2.994338035583496, + "learning_rate": 4.1696072273447547e-07, + "loss": 0.3896, + "step": 10342 + }, + { + "epoch": 4.890307328605201, + "grad_norm": 3.0409624576568604, + "learning_rate": 4.1661584366005814e-07, + "loss": 0.3109, + "step": 10343 + }, + { + "epoch": 4.890780141843972, + "grad_norm": 3.479952096939087, + "learning_rate": 4.1627109430883743e-07, + "loss": 0.3265, + "step": 10344 + }, + { + "epoch": 4.891252955082742, + "grad_norm": 3.0288894176483154, + "learning_rate": 4.159264747022787e-07, + "loss": 0.3345, + "step": 10345 + }, + { + "epoch": 4.891725768321513, + "grad_norm": 3.7433063983917236, + "learning_rate": 4.1558198486184005e-07, + "loss": 0.3888, + "step": 10346 + }, + { + "epoch": 4.892198581560284, + "grad_norm": 3.431964635848999, + "learning_rate": 4.152376248089715e-07, + "loss": 0.3062, + "step": 10347 + }, + { + "epoch": 4.892671394799054, + "grad_norm": 3.3993113040924072, + "learning_rate": 4.1489339456511376e-07, + "loss": 0.3955, + "step": 10348 + }, + { + "epoch": 4.893144208037825, + "grad_norm": 3.09287428855896, + "learning_rate": 4.145492941517024e-07, + "loss": 0.2857, + "step": 10349 + }, + { + "epoch": 4.8936170212765955, + "grad_norm": 3.355915069580078, + "learning_rate": 4.1420532359016166e-07, + "loss": 0.3403, + "step": 10350 + }, + { + "epoch": 4.894089834515366, + "grad_norm": 4.00920295715332, + "learning_rate": 4.1386148290190915e-07, + "loss": 0.3455, + "step": 10351 + }, + { + "epoch": 4.894562647754137, + "grad_norm": 3.408311605453491, + "learning_rate": 4.1351777210835524e-07, + "loss": 0.3606, + "step": 10352 + }, + { + "epoch": 4.895035460992908, + "grad_norm": 3.031616449356079, + "learning_rate": 4.1317419123090007e-07, + "loss": 0.3696, + "step": 10353 + }, + { + "epoch": 4.895508274231679, + "grad_norm": 3.555751085281372, + "learning_rate": 4.1283074029093814e-07, + "loss": 0.3197, + "step": 10354 + }, + { + "epoch": 4.8959810874704495, + "grad_norm": 3.3839752674102783, + "learning_rate": 4.124874193098541e-07, + "loss": 0.3744, + "step": 10355 + }, + { + "epoch": 4.89645390070922, + "grad_norm": 3.514296531677246, + "learning_rate": 4.1214422830902406e-07, + "loss": 0.29, + "step": 10356 + }, + { + "epoch": 4.89692671394799, + "grad_norm": 3.056325674057007, + "learning_rate": 4.1180116730981905e-07, + "loss": 0.371, + "step": 10357 + }, + { + "epoch": 4.897399527186761, + "grad_norm": 3.7567055225372314, + "learning_rate": 4.1145823633359865e-07, + "loss": 0.4105, + "step": 10358 + }, + { + "epoch": 4.897872340425532, + "grad_norm": 3.0050766468048096, + "learning_rate": 4.111154354017152e-07, + "loss": 0.3262, + "step": 10359 + }, + { + "epoch": 4.898345153664303, + "grad_norm": 3.2767333984375, + "learning_rate": 4.1077276453551476e-07, + "loss": 0.3253, + "step": 10360 + }, + { + "epoch": 4.898817966903073, + "grad_norm": 4.133147239685059, + "learning_rate": 4.1043022375633347e-07, + "loss": 0.4549, + "step": 10361 + }, + { + "epoch": 4.899290780141844, + "grad_norm": 3.372962236404419, + "learning_rate": 4.1008781308549934e-07, + "loss": 0.315, + "step": 10362 + }, + { + "epoch": 4.899763593380615, + "grad_norm": 3.4167628288269043, + "learning_rate": 4.0974553254433335e-07, + "loss": 0.3832, + "step": 10363 + }, + { + "epoch": 4.900236406619385, + "grad_norm": 3.103311061859131, + "learning_rate": 4.094033821541468e-07, + "loss": 0.3347, + "step": 10364 + }, + { + "epoch": 4.900709219858156, + "grad_norm": 2.95872163772583, + "learning_rate": 4.0906136193624547e-07, + 
"loss": 0.2861, + "step": 10365 + }, + { + "epoch": 4.9011820330969265, + "grad_norm": 3.1035397052764893, + "learning_rate": 4.087194719119239e-07, + "loss": 0.3089, + "step": 10366 + }, + { + "epoch": 4.901654846335697, + "grad_norm": 3.0228095054626465, + "learning_rate": 4.083777121024715e-07, + "loss": 0.38, + "step": 10367 + }, + { + "epoch": 4.902127659574468, + "grad_norm": 3.3528707027435303, + "learning_rate": 4.080360825291674e-07, + "loss": 0.3614, + "step": 10368 + }, + { + "epoch": 4.902600472813239, + "grad_norm": 3.5866968631744385, + "learning_rate": 4.076945832132828e-07, + "loss": 0.3751, + "step": 10369 + }, + { + "epoch": 4.90307328605201, + "grad_norm": 3.388880729675293, + "learning_rate": 4.0735321417608276e-07, + "loss": 0.3358, + "step": 10370 + }, + { + "epoch": 4.9035460992907804, + "grad_norm": 3.5489447116851807, + "learning_rate": 4.070119754388213e-07, + "loss": 0.3437, + "step": 10371 + }, + { + "epoch": 4.904018912529551, + "grad_norm": 2.8234825134277344, + "learning_rate": 4.0667086702274733e-07, + "loss": 0.3164, + "step": 10372 + }, + { + "epoch": 4.904491725768321, + "grad_norm": 3.337445020675659, + "learning_rate": 4.0632988894909965e-07, + "loss": 0.3213, + "step": 10373 + }, + { + "epoch": 4.904964539007092, + "grad_norm": 3.639477491378784, + "learning_rate": 4.0598904123910847e-07, + "loss": 0.3642, + "step": 10374 + }, + { + "epoch": 4.905437352245863, + "grad_norm": 3.101829767227173, + "learning_rate": 4.0564832391399857e-07, + "loss": 0.3415, + "step": 10375 + }, + { + "epoch": 4.9059101654846335, + "grad_norm": 2.8291256427764893, + "learning_rate": 4.05307736994984e-07, + "loss": 0.3014, + "step": 10376 + }, + { + "epoch": 4.906382978723404, + "grad_norm": 2.8689401149749756, + "learning_rate": 4.049672805032717e-07, + "loss": 0.3151, + "step": 10377 + }, + { + "epoch": 4.906855791962175, + "grad_norm": 3.468038320541382, + "learning_rate": 4.046269544600598e-07, + "loss": 0.3956, + "step": 10378 + }, + { + "epoch": 4.907328605200946, + "grad_norm": 3.5246312618255615, + "learning_rate": 4.042867588865401e-07, + "loss": 0.3003, + "step": 10379 + }, + { + "epoch": 4.907801418439716, + "grad_norm": 3.273010730743408, + "learning_rate": 4.039466938038944e-07, + "loss": 0.3036, + "step": 10380 + }, + { + "epoch": 4.908274231678487, + "grad_norm": 3.064718008041382, + "learning_rate": 4.0360675923329733e-07, + "loss": 0.3353, + "step": 10381 + }, + { + "epoch": 4.908747044917257, + "grad_norm": 3.413242816925049, + "learning_rate": 4.032669551959142e-07, + "loss": 0.354, + "step": 10382 + }, + { + "epoch": 4.909219858156028, + "grad_norm": 3.136293411254883, + "learning_rate": 4.029272817129046e-07, + "loss": 0.331, + "step": 10383 + }, + { + "epoch": 4.909692671394799, + "grad_norm": 3.0966274738311768, + "learning_rate": 4.025877388054172e-07, + "loss": 0.2878, + "step": 10384 + }, + { + "epoch": 4.91016548463357, + "grad_norm": 3.334113836288452, + "learning_rate": 4.022483264945948e-07, + "loss": 0.3276, + "step": 10385 + }, + { + "epoch": 4.910638297872341, + "grad_norm": 3.2662229537963867, + "learning_rate": 4.019090448015711e-07, + "loss": 0.3265, + "step": 10386 + }, + { + "epoch": 4.911111111111111, + "grad_norm": 3.134220838546753, + "learning_rate": 4.0156989374747047e-07, + "loss": 0.3684, + "step": 10387 + }, + { + "epoch": 4.911583924349882, + "grad_norm": 3.803694725036621, + "learning_rate": 4.012308733534118e-07, + "loss": 0.3394, + "step": 10388 + }, + { + "epoch": 4.912056737588652, + "grad_norm": 2.788388252258301, + 
"learning_rate": 4.008919836405034e-07, + "loss": 0.2835, + "step": 10389 + }, + { + "epoch": 4.912529550827423, + "grad_norm": 3.3408966064453125, + "learning_rate": 4.005532246298474e-07, + "loss": 0.3694, + "step": 10390 + }, + { + "epoch": 4.913002364066194, + "grad_norm": 2.913114547729492, + "learning_rate": 4.0021459634253605e-07, + "loss": 0.3456, + "step": 10391 + }, + { + "epoch": 4.9134751773049645, + "grad_norm": 3.778111457824707, + "learning_rate": 3.9987609879965414e-07, + "loss": 0.3887, + "step": 10392 + }, + { + "epoch": 4.913947990543735, + "grad_norm": 2.871978282928467, + "learning_rate": 3.995377320222796e-07, + "loss": 0.28, + "step": 10393 + }, + { + "epoch": 4.914420803782506, + "grad_norm": 3.5189783573150635, + "learning_rate": 3.9919949603147987e-07, + "loss": 0.3802, + "step": 10394 + }, + { + "epoch": 4.914893617021277, + "grad_norm": 3.381014585494995, + "learning_rate": 3.9886139084831607e-07, + "loss": 0.3661, + "step": 10395 + }, + { + "epoch": 4.915366430260047, + "grad_norm": 2.908207654953003, + "learning_rate": 3.9852341649384006e-07, + "loss": 0.3228, + "step": 10396 + }, + { + "epoch": 4.915839243498818, + "grad_norm": 3.4134814739227295, + "learning_rate": 3.981855729890957e-07, + "loss": 0.3149, + "step": 10397 + }, + { + "epoch": 4.916312056737588, + "grad_norm": 4.496891975402832, + "learning_rate": 3.9784786035512004e-07, + "loss": 0.3516, + "step": 10398 + }, + { + "epoch": 4.916784869976359, + "grad_norm": 3.2910919189453125, + "learning_rate": 3.975102786129398e-07, + "loss": 0.3329, + "step": 10399 + }, + { + "epoch": 4.91725768321513, + "grad_norm": 3.6607260704040527, + "learning_rate": 3.97172827783576e-07, + "loss": 0.3878, + "step": 10400 + }, + { + "epoch": 4.917730496453901, + "grad_norm": 3.1500742435455322, + "learning_rate": 3.9683550788803983e-07, + "loss": 0.3323, + "step": 10401 + }, + { + "epoch": 4.918203309692672, + "grad_norm": 3.263714075088501, + "learning_rate": 3.964983189473337e-07, + "loss": 0.352, + "step": 10402 + }, + { + "epoch": 4.918676122931442, + "grad_norm": 3.433868408203125, + "learning_rate": 3.961612609824542e-07, + "loss": 0.3308, + "step": 10403 + }, + { + "epoch": 4.919148936170213, + "grad_norm": 3.3086423873901367, + "learning_rate": 3.95824334014388e-07, + "loss": 0.3641, + "step": 10404 + }, + { + "epoch": 4.919621749408983, + "grad_norm": 3.2854621410369873, + "learning_rate": 3.954875380641135e-07, + "loss": 0.3405, + "step": 10405 + }, + { + "epoch": 4.920094562647754, + "grad_norm": 3.1408650875091553, + "learning_rate": 3.9515087315260244e-07, + "loss": 0.3103, + "step": 10406 + }, + { + "epoch": 4.920567375886525, + "grad_norm": 2.9340312480926514, + "learning_rate": 3.948143393008164e-07, + "loss": 0.3405, + "step": 10407 + }, + { + "epoch": 4.9210401891252955, + "grad_norm": 3.525876522064209, + "learning_rate": 3.944779365297113e-07, + "loss": 0.3464, + "step": 10408 + }, + { + "epoch": 4.921513002364066, + "grad_norm": 3.26991605758667, + "learning_rate": 3.9414166486023253e-07, + "loss": 0.3529, + "step": 10409 + }, + { + "epoch": 4.921985815602837, + "grad_norm": 2.7669694423675537, + "learning_rate": 3.938055243133182e-07, + "loss": 0.3242, + "step": 10410 + }, + { + "epoch": 4.922458628841608, + "grad_norm": 2.8268136978149414, + "learning_rate": 3.934695149098988e-07, + "loss": 0.3086, + "step": 10411 + }, + { + "epoch": 4.922931442080378, + "grad_norm": 3.119053602218628, + "learning_rate": 3.931336366708952e-07, + "loss": 0.3065, + "step": 10412 + }, + { + "epoch": 
4.923404255319149, + "grad_norm": 3.1537275314331055, + "learning_rate": 3.9279788961722215e-07, + "loss": 0.3325, + "step": 10413 + }, + { + "epoch": 4.923877068557919, + "grad_norm": 3.1365256309509277, + "learning_rate": 3.9246227376978476e-07, + "loss": 0.4139, + "step": 10414 + }, + { + "epoch": 4.92434988179669, + "grad_norm": 3.3495218753814697, + "learning_rate": 3.921267891494798e-07, + "loss": 0.3463, + "step": 10415 + }, + { + "epoch": 4.924822695035461, + "grad_norm": 3.2402634620666504, + "learning_rate": 3.9179143577719736e-07, + "loss": 0.3499, + "step": 10416 + }, + { + "epoch": 4.925295508274232, + "grad_norm": 2.986429452896118, + "learning_rate": 3.914562136738176e-07, + "loss": 0.3326, + "step": 10417 + }, + { + "epoch": 4.925768321513003, + "grad_norm": 3.1276674270629883, + "learning_rate": 3.9112112286021407e-07, + "loss": 0.3087, + "step": 10418 + }, + { + "epoch": 4.926241134751773, + "grad_norm": 3.1767871379852295, + "learning_rate": 3.9078616335725126e-07, + "loss": 0.3804, + "step": 10419 + }, + { + "epoch": 4.926713947990544, + "grad_norm": 3.1657216548919678, + "learning_rate": 3.904513351857847e-07, + "loss": 0.333, + "step": 10420 + }, + { + "epoch": 4.927186761229314, + "grad_norm": 3.005009174346924, + "learning_rate": 3.901166383666641e-07, + "loss": 0.3583, + "step": 10421 + }, + { + "epoch": 4.927659574468085, + "grad_norm": 2.900146722793579, + "learning_rate": 3.897820729207283e-07, + "loss": 0.3061, + "step": 10422 + }, + { + "epoch": 4.928132387706856, + "grad_norm": 3.2418317794799805, + "learning_rate": 3.8944763886881037e-07, + "loss": 0.3822, + "step": 10423 + }, + { + "epoch": 4.9286052009456265, + "grad_norm": 3.1222848892211914, + "learning_rate": 3.8911333623173344e-07, + "loss": 0.3167, + "step": 10424 + }, + { + "epoch": 4.929078014184397, + "grad_norm": 2.727388858795166, + "learning_rate": 3.8877916503031325e-07, + "loss": 0.2977, + "step": 10425 + }, + { + "epoch": 4.929550827423168, + "grad_norm": 3.190159797668457, + "learning_rate": 3.884451252853569e-07, + "loss": 0.3399, + "step": 10426 + }, + { + "epoch": 4.930023640661939, + "grad_norm": 3.253791570663452, + "learning_rate": 3.8811121701766373e-07, + "loss": 0.3806, + "step": 10427 + }, + { + "epoch": 4.930496453900709, + "grad_norm": 3.4284887313842773, + "learning_rate": 3.8777744024802414e-07, + "loss": 0.3437, + "step": 10428 + }, + { + "epoch": 4.93096926713948, + "grad_norm": 3.3665032386779785, + "learning_rate": 3.874437949972221e-07, + "loss": 0.375, + "step": 10429 + }, + { + "epoch": 4.93144208037825, + "grad_norm": 3.1024677753448486, + "learning_rate": 3.8711028128603084e-07, + "loss": 0.3493, + "step": 10430 + }, + { + "epoch": 4.931914893617021, + "grad_norm": 3.599743604660034, + "learning_rate": 3.867768991352186e-07, + "loss": 0.3852, + "step": 10431 + }, + { + "epoch": 4.932387706855792, + "grad_norm": 3.3676376342773438, + "learning_rate": 3.8644364856554236e-07, + "loss": 0.3489, + "step": 10432 + }, + { + "epoch": 4.932860520094563, + "grad_norm": 3.2186801433563232, + "learning_rate": 3.861105295977521e-07, + "loss": 0.3413, + "step": 10433 + }, + { + "epoch": 4.933333333333334, + "grad_norm": 3.3672704696655273, + "learning_rate": 3.8577754225259055e-07, + "loss": 0.3392, + "step": 10434 + }, + { + "epoch": 4.933806146572104, + "grad_norm": 3.4285950660705566, + "learning_rate": 3.854446865507902e-07, + "loss": 0.3188, + "step": 10435 + }, + { + "epoch": 4.934278959810875, + "grad_norm": 3.187617063522339, + "learning_rate": 3.8511196251307783e-07, + 
"loss": 0.3258, + "step": 10436 + }, + { + "epoch": 4.934751773049645, + "grad_norm": 2.9744882583618164, + "learning_rate": 3.847793701601699e-07, + "loss": 0.3879, + "step": 10437 + }, + { + "epoch": 4.935224586288416, + "grad_norm": 2.9745848178863525, + "learning_rate": 3.844469095127751e-07, + "loss": 0.3178, + "step": 10438 + }, + { + "epoch": 4.935697399527187, + "grad_norm": 3.7419471740722656, + "learning_rate": 3.841145805915955e-07, + "loss": 0.3167, + "step": 10439 + }, + { + "epoch": 4.9361702127659575, + "grad_norm": 4.295339107513428, + "learning_rate": 3.837823834173232e-07, + "loss": 0.3209, + "step": 10440 + }, + { + "epoch": 4.936643026004728, + "grad_norm": 3.4612984657287598, + "learning_rate": 3.8345031801064217e-07, + "loss": 0.3351, + "step": 10441 + }, + { + "epoch": 4.937115839243499, + "grad_norm": 3.0626909732818604, + "learning_rate": 3.8311838439222953e-07, + "loss": 0.3395, + "step": 10442 + }, + { + "epoch": 4.93758865248227, + "grad_norm": 3.606682538986206, + "learning_rate": 3.827865825827518e-07, + "loss": 0.4112, + "step": 10443 + }, + { + "epoch": 4.93806146572104, + "grad_norm": 3.3908627033233643, + "learning_rate": 3.8245491260287064e-07, + "loss": 0.3367, + "step": 10444 + }, + { + "epoch": 4.938534278959811, + "grad_norm": 2.8598084449768066, + "learning_rate": 3.821233744732364e-07, + "loss": 0.3408, + "step": 10445 + }, + { + "epoch": 4.939007092198581, + "grad_norm": 3.207010269165039, + "learning_rate": 3.8179196821449354e-07, + "loss": 0.3301, + "step": 10446 + }, + { + "epoch": 4.939479905437352, + "grad_norm": 3.018414258956909, + "learning_rate": 3.8146069384727674e-07, + "loss": 0.3622, + "step": 10447 + }, + { + "epoch": 4.939952718676123, + "grad_norm": 3.399415969848633, + "learning_rate": 3.811295513922125e-07, + "loss": 0.3525, + "step": 10448 + }, + { + "epoch": 4.940425531914894, + "grad_norm": 3.175705671310425, + "learning_rate": 3.807985408699208e-07, + "loss": 0.322, + "step": 10449 + }, + { + "epoch": 4.9408983451536646, + "grad_norm": 2.906064033508301, + "learning_rate": 3.804676623010109e-07, + "loss": 0.3246, + "step": 10450 + }, + { + "epoch": 4.941371158392435, + "grad_norm": 3.1224400997161865, + "learning_rate": 3.8013691570608634e-07, + "loss": 0.3607, + "step": 10451 + }, + { + "epoch": 4.941843971631206, + "grad_norm": 3.4386677742004395, + "learning_rate": 3.7980630110574067e-07, + "loss": 0.3315, + "step": 10452 + }, + { + "epoch": 4.942316784869976, + "grad_norm": 3.432509183883667, + "learning_rate": 3.794758185205594e-07, + "loss": 0.3713, + "step": 10453 + }, + { + "epoch": 4.942789598108747, + "grad_norm": 3.314802646636963, + "learning_rate": 3.7914546797112097e-07, + "loss": 0.3587, + "step": 10454 + }, + { + "epoch": 4.943262411347518, + "grad_norm": 2.9151065349578857, + "learning_rate": 3.788152494779948e-07, + "loss": 0.367, + "step": 10455 + }, + { + "epoch": 4.9437352245862884, + "grad_norm": 3.3444712162017822, + "learning_rate": 3.784851630617414e-07, + "loss": 0.4009, + "step": 10456 + }, + { + "epoch": 4.944208037825059, + "grad_norm": 3.2677152156829834, + "learning_rate": 3.7815520874291494e-07, + "loss": 0.3553, + "step": 10457 + }, + { + "epoch": 4.94468085106383, + "grad_norm": 3.2326159477233887, + "learning_rate": 3.7782538654205946e-07, + "loss": 0.4008, + "step": 10458 + }, + { + "epoch": 4.945153664302601, + "grad_norm": 3.3304033279418945, + "learning_rate": 3.774956964797119e-07, + "loss": 0.3591, + "step": 10459 + }, + { + "epoch": 4.945626477541371, + "grad_norm": 3.038605213165283, 
+ "learning_rate": 3.7716613857640026e-07, + "loss": 0.2907, + "step": 10460 + }, + { + "epoch": 4.9460992907801415, + "grad_norm": 3.016227960586548, + "learning_rate": 3.768367128526443e-07, + "loss": 0.2898, + "step": 10461 + }, + { + "epoch": 4.946572104018912, + "grad_norm": 3.354973316192627, + "learning_rate": 3.76507419328957e-07, + "loss": 0.3498, + "step": 10462 + }, + { + "epoch": 4.947044917257683, + "grad_norm": 3.5561892986297607, + "learning_rate": 3.761782580258408e-07, + "loss": 0.4384, + "step": 10463 + }, + { + "epoch": 4.947517730496454, + "grad_norm": 3.2498281002044678, + "learning_rate": 3.7584922896379244e-07, + "loss": 0.3289, + "step": 10464 + }, + { + "epoch": 4.947990543735225, + "grad_norm": 3.250598907470703, + "learning_rate": 3.755203321632986e-07, + "loss": 0.4104, + "step": 10465 + }, + { + "epoch": 4.9484633569739955, + "grad_norm": 2.8788363933563232, + "learning_rate": 3.7519156764483727e-07, + "loss": 0.2896, + "step": 10466 + }, + { + "epoch": 4.948936170212766, + "grad_norm": 3.068180561065674, + "learning_rate": 3.7486293542888075e-07, + "loss": 0.3346, + "step": 10467 + }, + { + "epoch": 4.949408983451537, + "grad_norm": 3.4533181190490723, + "learning_rate": 3.7453443553589043e-07, + "loss": 0.3917, + "step": 10468 + }, + { + "epoch": 4.949881796690307, + "grad_norm": 2.8812358379364014, + "learning_rate": 3.7420606798632104e-07, + "loss": 0.3276, + "step": 10469 + }, + { + "epoch": 4.950354609929078, + "grad_norm": 3.0952184200286865, + "learning_rate": 3.7387783280061875e-07, + "loss": 0.3261, + "step": 10470 + }, + { + "epoch": 4.950827423167849, + "grad_norm": 3.2409560680389404, + "learning_rate": 3.735497299992205e-07, + "loss": 0.3504, + "step": 10471 + }, + { + "epoch": 4.951300236406619, + "grad_norm": 3.3790557384490967, + "learning_rate": 3.73221759602557e-07, + "loss": 0.3316, + "step": 10472 + }, + { + "epoch": 4.95177304964539, + "grad_norm": 3.2161364555358887, + "learning_rate": 3.728939216310487e-07, + "loss": 0.3364, + "step": 10473 + }, + { + "epoch": 4.952245862884161, + "grad_norm": 3.3514342308044434, + "learning_rate": 3.7256621610510884e-07, + "loss": 0.3912, + "step": 10474 + }, + { + "epoch": 4.952718676122932, + "grad_norm": 2.7333486080169678, + "learning_rate": 3.722386430451422e-07, + "loss": 0.3145, + "step": 10475 + }, + { + "epoch": 4.953191489361702, + "grad_norm": 3.104905128479004, + "learning_rate": 3.719112024715449e-07, + "loss": 0.3599, + "step": 10476 + }, + { + "epoch": 4.9536643026004725, + "grad_norm": 3.16666579246521, + "learning_rate": 3.715838944047059e-07, + "loss": 0.3462, + "step": 10477 + }, + { + "epoch": 4.954137115839243, + "grad_norm": 3.078171491622925, + "learning_rate": 3.7125671886500514e-07, + "loss": 0.3119, + "step": 10478 + }, + { + "epoch": 4.954609929078014, + "grad_norm": 3.261456251144409, + "learning_rate": 3.709296758728137e-07, + "loss": 0.3959, + "step": 10479 + }, + { + "epoch": 4.955082742316785, + "grad_norm": 3.0302278995513916, + "learning_rate": 3.706027654484962e-07, + "loss": 0.3526, + "step": 10480 + }, + { + "epoch": 4.955555555555556, + "grad_norm": 3.175342559814453, + "learning_rate": 3.702759876124068e-07, + "loss": 0.3237, + "step": 10481 + }, + { + "epoch": 4.9560283687943265, + "grad_norm": 3.4779844284057617, + "learning_rate": 3.699493423848938e-07, + "loss": 0.3075, + "step": 10482 + }, + { + "epoch": 4.956501182033097, + "grad_norm": 2.809904098510742, + "learning_rate": 3.69622829786295e-07, + "loss": 0.3238, + "step": 10483 + }, + { + "epoch": 
4.956973995271868, + "grad_norm": 3.092604875564575, + "learning_rate": 3.692964498369406e-07, + "loss": 0.3344, + "step": 10484 + }, + { + "epoch": 4.957446808510638, + "grad_norm": 3.477560520172119, + "learning_rate": 3.689702025571543e-07, + "loss": 0.3525, + "step": 10485 + }, + { + "epoch": 4.957919621749409, + "grad_norm": 4.119097709655762, + "learning_rate": 3.6864408796724815e-07, + "loss": 0.3953, + "step": 10486 + }, + { + "epoch": 4.95839243498818, + "grad_norm": 3.1418824195861816, + "learning_rate": 3.6831810608752986e-07, + "loss": 0.3689, + "step": 10487 + }, + { + "epoch": 4.95886524822695, + "grad_norm": 3.1947824954986572, + "learning_rate": 3.6799225693829596e-07, + "loss": 0.3427, + "step": 10488 + }, + { + "epoch": 4.959338061465721, + "grad_norm": 3.196894884109497, + "learning_rate": 3.6766654053983554e-07, + "loss": 0.3138, + "step": 10489 + }, + { + "epoch": 4.959810874704492, + "grad_norm": 2.9747161865234375, + "learning_rate": 3.6734095691242975e-07, + "loss": 0.3336, + "step": 10490 + }, + { + "epoch": 4.960283687943263, + "grad_norm": 3.2788970470428467, + "learning_rate": 3.670155060763503e-07, + "loss": 0.3418, + "step": 10491 + }, + { + "epoch": 4.960756501182033, + "grad_norm": 3.1619482040405273, + "learning_rate": 3.6669018805186335e-07, + "loss": 0.3173, + "step": 10492 + }, + { + "epoch": 4.9612293144208035, + "grad_norm": 2.9894869327545166, + "learning_rate": 3.6636500285922386e-07, + "loss": 0.3057, + "step": 10493 + }, + { + "epoch": 4.961702127659574, + "grad_norm": 3.1162378787994385, + "learning_rate": 3.660399505186793e-07, + "loss": 0.3404, + "step": 10494 + }, + { + "epoch": 4.962174940898345, + "grad_norm": 2.811485528945923, + "learning_rate": 3.657150310504706e-07, + "loss": 0.3199, + "step": 10495 + }, + { + "epoch": 4.962647754137116, + "grad_norm": 2.8914854526519775, + "learning_rate": 3.653902444748278e-07, + "loss": 0.3666, + "step": 10496 + }, + { + "epoch": 4.963120567375887, + "grad_norm": 4.075942516326904, + "learning_rate": 3.6506559081197517e-07, + "loss": 0.3737, + "step": 10497 + }, + { + "epoch": 4.9635933806146575, + "grad_norm": 4.395053863525391, + "learning_rate": 3.647410700821266e-07, + "loss": 0.338, + "step": 10498 + }, + { + "epoch": 4.964066193853428, + "grad_norm": 2.89145565032959, + "learning_rate": 3.644166823054884e-07, + "loss": 0.2893, + "step": 10499 + }, + { + "epoch": 4.964539007092198, + "grad_norm": 2.8189663887023926, + "learning_rate": 3.640924275022595e-07, + "loss": 0.289, + "step": 10500 + }, + { + "epoch": 4.965011820330969, + "grad_norm": 3.0912365913391113, + "learning_rate": 3.6376830569262946e-07, + "loss": 0.3474, + "step": 10501 + }, + { + "epoch": 4.96548463356974, + "grad_norm": 3.3087918758392334, + "learning_rate": 3.634443168967797e-07, + "loss": 0.3104, + "step": 10502 + }, + { + "epoch": 4.965957446808511, + "grad_norm": 2.855022430419922, + "learning_rate": 3.6312046113488403e-07, + "loss": 0.3328, + "step": 10503 + }, + { + "epoch": 4.966430260047281, + "grad_norm": 3.5445404052734375, + "learning_rate": 3.627967384271072e-07, + "loss": 0.322, + "step": 10504 + }, + { + "epoch": 4.966903073286052, + "grad_norm": 3.526319742202759, + "learning_rate": 3.624731487936065e-07, + "loss": 0.3264, + "step": 10505 + }, + { + "epoch": 4.967375886524823, + "grad_norm": 3.521204948425293, + "learning_rate": 3.621496922545298e-07, + "loss": 0.369, + "step": 10506 + }, + { + "epoch": 4.967848699763593, + "grad_norm": 2.8956806659698486, + "learning_rate": 3.618263688300172e-07, + "loss": 
0.3396, + "step": 10507 + }, + { + "epoch": 4.968321513002364, + "grad_norm": 3.155200958251953, + "learning_rate": 3.615031785402015e-07, + "loss": 0.354, + "step": 10508 + }, + { + "epoch": 4.9687943262411345, + "grad_norm": 3.2896533012390137, + "learning_rate": 3.611801214052052e-07, + "loss": 0.3034, + "step": 10509 + }, + { + "epoch": 4.969267139479905, + "grad_norm": 3.0860259532928467, + "learning_rate": 3.608571974451447e-07, + "loss": 0.3354, + "step": 10510 + }, + { + "epoch": 4.969739952718676, + "grad_norm": 3.3194656372070312, + "learning_rate": 3.6053440668012697e-07, + "loss": 0.3714, + "step": 10511 + }, + { + "epoch": 4.970212765957447, + "grad_norm": 2.9831063747406006, + "learning_rate": 3.602117491302498e-07, + "loss": 0.311, + "step": 10512 + }, + { + "epoch": 4.970685579196218, + "grad_norm": 3.175940752029419, + "learning_rate": 3.59889224815605e-07, + "loss": 0.3658, + "step": 10513 + }, + { + "epoch": 4.9711583924349885, + "grad_norm": 3.051496982574463, + "learning_rate": 3.5956683375627324e-07, + "loss": 0.3458, + "step": 10514 + }, + { + "epoch": 4.971631205673759, + "grad_norm": 3.0264453887939453, + "learning_rate": 3.592445759723298e-07, + "loss": 0.2843, + "step": 10515 + }, + { + "epoch": 4.972104018912529, + "grad_norm": 3.404376745223999, + "learning_rate": 3.589224514838399e-07, + "loss": 0.366, + "step": 10516 + }, + { + "epoch": 4.9725768321513, + "grad_norm": 3.640212297439575, + "learning_rate": 3.586004603108598e-07, + "loss": 0.3248, + "step": 10517 + }, + { + "epoch": 4.973049645390071, + "grad_norm": 3.0829873085021973, + "learning_rate": 3.5827860247344e-07, + "loss": 0.3613, + "step": 10518 + }, + { + "epoch": 4.973522458628842, + "grad_norm": 3.6157045364379883, + "learning_rate": 3.5795687799162064e-07, + "loss": 0.3599, + "step": 10519 + }, + { + "epoch": 4.973995271867612, + "grad_norm": 3.150632619857788, + "learning_rate": 3.576352868854335e-07, + "loss": 0.3242, + "step": 10520 + }, + { + "epoch": 4.974468085106383, + "grad_norm": 3.04829740524292, + "learning_rate": 3.5731382917490286e-07, + "loss": 0.3819, + "step": 10521 + }, + { + "epoch": 4.974940898345154, + "grad_norm": 3.216092348098755, + "learning_rate": 3.5699250488004516e-07, + "loss": 0.3538, + "step": 10522 + }, + { + "epoch": 4.975413711583924, + "grad_norm": 3.36538028717041, + "learning_rate": 3.5667131402086717e-07, + "loss": 0.3381, + "step": 10523 + }, + { + "epoch": 4.975886524822695, + "grad_norm": 3.3398420810699463, + "learning_rate": 3.563502566173685e-07, + "loss": 0.3085, + "step": 10524 + }, + { + "epoch": 4.9763593380614655, + "grad_norm": 3.10583233833313, + "learning_rate": 3.5602933268953893e-07, + "loss": 0.3023, + "step": 10525 + }, + { + "epoch": 4.976832151300236, + "grad_norm": 3.422929525375366, + "learning_rate": 3.557085422573625e-07, + "loss": 0.3319, + "step": 10526 + }, + { + "epoch": 4.977304964539007, + "grad_norm": 3.7357773780822754, + "learning_rate": 3.5538788534081214e-07, + "loss": 0.3762, + "step": 10527 + }, + { + "epoch": 4.977777777777778, + "grad_norm": 3.0172133445739746, + "learning_rate": 3.550673619598549e-07, + "loss": 0.3292, + "step": 10528 + }, + { + "epoch": 4.978250591016549, + "grad_norm": 3.2497189044952393, + "learning_rate": 3.5474697213444763e-07, + "loss": 0.3292, + "step": 10529 + }, + { + "epoch": 4.9787234042553195, + "grad_norm": 2.8510115146636963, + "learning_rate": 3.544267158845394e-07, + "loss": 0.3717, + "step": 10530 + }, + { + "epoch": 4.97919621749409, + "grad_norm": 3.2559750080108643, + 
"learning_rate": 3.541065932300719e-07, + "loss": 0.3656, + "step": 10531 + }, + { + "epoch": 4.97966903073286, + "grad_norm": 3.3215935230255127, + "learning_rate": 3.537866041909768e-07, + "loss": 0.364, + "step": 10532 + }, + { + "epoch": 4.980141843971631, + "grad_norm": 3.4923696517944336, + "learning_rate": 3.5346674878717954e-07, + "loss": 0.3464, + "step": 10533 + }, + { + "epoch": 4.980614657210402, + "grad_norm": 3.5320425033569336, + "learning_rate": 3.531470270385959e-07, + "loss": 0.3506, + "step": 10534 + }, + { + "epoch": 4.9810874704491725, + "grad_norm": 3.290199041366577, + "learning_rate": 3.528274389651323e-07, + "loss": 0.4092, + "step": 10535 + }, + { + "epoch": 4.981560283687943, + "grad_norm": 3.108628034591675, + "learning_rate": 3.5250798458668966e-07, + "loss": 0.3522, + "step": 10536 + }, + { + "epoch": 4.982033096926714, + "grad_norm": 3.3015148639678955, + "learning_rate": 3.521886639231584e-07, + "loss": 0.3609, + "step": 10537 + }, + { + "epoch": 4.982505910165485, + "grad_norm": 3.506431818008423, + "learning_rate": 3.518694769944211e-07, + "loss": 0.3458, + "step": 10538 + }, + { + "epoch": 4.982978723404255, + "grad_norm": 3.560453414916992, + "learning_rate": 3.5155042382035236e-07, + "loss": 0.3803, + "step": 10539 + }, + { + "epoch": 4.983451536643026, + "grad_norm": 3.1382486820220947, + "learning_rate": 3.5123150442081757e-07, + "loss": 0.3209, + "step": 10540 + }, + { + "epoch": 4.9839243498817964, + "grad_norm": 4.326927661895752, + "learning_rate": 3.5091271881567523e-07, + "loss": 0.3649, + "step": 10541 + }, + { + "epoch": 4.984397163120567, + "grad_norm": 3.0951757431030273, + "learning_rate": 3.50594067024774e-07, + "loss": 0.3808, + "step": 10542 + }, + { + "epoch": 4.984869976359338, + "grad_norm": 3.264277458190918, + "learning_rate": 3.5027554906795574e-07, + "loss": 0.3408, + "step": 10543 + }, + { + "epoch": 4.985342789598109, + "grad_norm": 3.3679237365722656, + "learning_rate": 3.4995716496505293e-07, + "loss": 0.3746, + "step": 10544 + }, + { + "epoch": 4.98581560283688, + "grad_norm": 3.489201545715332, + "learning_rate": 3.496389147358892e-07, + "loss": 0.3725, + "step": 10545 + }, + { + "epoch": 4.98628841607565, + "grad_norm": 2.8233766555786133, + "learning_rate": 3.4932079840028193e-07, + "loss": 0.3178, + "step": 10546 + }, + { + "epoch": 4.986761229314421, + "grad_norm": 3.1723084449768066, + "learning_rate": 3.490028159780373e-07, + "loss": 0.348, + "step": 10547 + }, + { + "epoch": 4.987234042553191, + "grad_norm": 3.2631607055664062, + "learning_rate": 3.4868496748895616e-07, + "loss": 0.3608, + "step": 10548 + }, + { + "epoch": 4.987706855791962, + "grad_norm": 3.4170608520507812, + "learning_rate": 3.483672529528287e-07, + "loss": 0.3819, + "step": 10549 + }, + { + "epoch": 4.988179669030733, + "grad_norm": 3.002686023712158, + "learning_rate": 3.480496723894375e-07, + "loss": 0.2695, + "step": 10550 + }, + { + "epoch": 4.9886524822695035, + "grad_norm": 3.051232099533081, + "learning_rate": 3.4773222581855753e-07, + "loss": 0.3638, + "step": 10551 + }, + { + "epoch": 4.989125295508274, + "grad_norm": 2.959977149963379, + "learning_rate": 3.474149132599544e-07, + "loss": 0.3338, + "step": 10552 + }, + { + "epoch": 4.989598108747045, + "grad_norm": 2.925457000732422, + "learning_rate": 3.470977347333859e-07, + "loss": 0.3212, + "step": 10553 + }, + { + "epoch": 4.990070921985816, + "grad_norm": 3.0996408462524414, + "learning_rate": 3.4678069025860154e-07, + "loss": 0.3447, + "step": 10554 + }, + { + "epoch": 
4.990543735224586, + "grad_norm": 2.8487865924835205, + "learning_rate": 3.4646377985534106e-07, + "loss": 0.3434, + "step": 10555 + }, + { + "epoch": 4.991016548463357, + "grad_norm": 2.8337016105651855, + "learning_rate": 3.461470035433387e-07, + "loss": 0.342, + "step": 10556 + }, + { + "epoch": 4.991489361702127, + "grad_norm": 2.9243876934051514, + "learning_rate": 3.4583036134231805e-07, + "loss": 0.3256, + "step": 10557 + }, + { + "epoch": 4.991962174940898, + "grad_norm": 3.2548747062683105, + "learning_rate": 3.455138532719948e-07, + "loss": 0.3313, + "step": 10558 + }, + { + "epoch": 4.992434988179669, + "grad_norm": 3.03932523727417, + "learning_rate": 3.451974793520771e-07, + "loss": 0.3854, + "step": 10559 + }, + { + "epoch": 4.99290780141844, + "grad_norm": 3.4757370948791504, + "learning_rate": 3.44881239602263e-07, + "loss": 0.3909, + "step": 10560 + }, + { + "epoch": 4.993380614657211, + "grad_norm": 2.9729294776916504, + "learning_rate": 3.4456513404224513e-07, + "loss": 0.3645, + "step": 10561 + }, + { + "epoch": 4.993853427895981, + "grad_norm": 3.2144060134887695, + "learning_rate": 3.4424916269170495e-07, + "loss": 0.3236, + "step": 10562 + }, + { + "epoch": 4.994326241134752, + "grad_norm": 3.742386817932129, + "learning_rate": 3.4393332557031615e-07, + "loss": 0.332, + "step": 10563 + }, + { + "epoch": 4.994799054373522, + "grad_norm": 3.2569401264190674, + "learning_rate": 3.4361762269774557e-07, + "loss": 0.3774, + "step": 10564 + }, + { + "epoch": 4.995271867612293, + "grad_norm": 2.91739821434021, + "learning_rate": 3.433020540936499e-07, + "loss": 0.3061, + "step": 10565 + }, + { + "epoch": 4.995744680851064, + "grad_norm": 3.534137487411499, + "learning_rate": 3.429866197776788e-07, + "loss": 0.357, + "step": 10566 + }, + { + "epoch": 4.9962174940898345, + "grad_norm": 3.215837001800537, + "learning_rate": 3.4267131976947284e-07, + "loss": 0.3395, + "step": 10567 + }, + { + "epoch": 4.996690307328605, + "grad_norm": 3.294857978820801, + "learning_rate": 3.4235615408866384e-07, + "loss": 0.3273, + "step": 10568 + }, + { + "epoch": 4.997163120567376, + "grad_norm": 3.519171953201294, + "learning_rate": 3.4204112275487646e-07, + "loss": 0.3712, + "step": 10569 + }, + { + "epoch": 4.997635933806147, + "grad_norm": 3.037527084350586, + "learning_rate": 3.4172622578772544e-07, + "loss": 0.2949, + "step": 10570 + }, + { + "epoch": 4.998108747044917, + "grad_norm": 3.309682846069336, + "learning_rate": 3.4141146320681913e-07, + "loss": 0.3068, + "step": 10571 + }, + { + "epoch": 4.998581560283688, + "grad_norm": 3.2197179794311523, + "learning_rate": 3.410968350317559e-07, + "loss": 0.3725, + "step": 10572 + }, + { + "epoch": 4.999054373522458, + "grad_norm": 3.0465641021728516, + "learning_rate": 3.4078234128212537e-07, + "loss": 0.3505, + "step": 10573 + }, + { + "epoch": 4.999527186761229, + "grad_norm": 3.066941022872925, + "learning_rate": 3.404679819775114e-07, + "loss": 0.3435, + "step": 10574 + }, + { + "epoch": 5.0, + "grad_norm": 3.3947532176971436, + "learning_rate": 3.401537571374869e-07, + "loss": 0.3344, + "step": 10575 + }, + { + "epoch": 5.000472813238771, + "grad_norm": 3.2180113792419434, + "learning_rate": 3.398396667816167e-07, + "loss": 0.2986, + "step": 10576 + }, + { + "epoch": 5.000945626477542, + "grad_norm": 3.039257049560547, + "learning_rate": 3.395257109294592e-07, + "loss": 0.3435, + "step": 10577 + }, + { + "epoch": 5.001418439716312, + "grad_norm": 2.675401210784912, + "learning_rate": 3.392118896005614e-07, + "loss": 0.2722, + "step": 
10578 + }, + { + "epoch": 5.001891252955082, + "grad_norm": 3.395113468170166, + "learning_rate": 3.388982028144652e-07, + "loss": 0.3091, + "step": 10579 + }, + { + "epoch": 5.002364066193853, + "grad_norm": 2.859116315841675, + "learning_rate": 3.385846505907017e-07, + "loss": 0.2849, + "step": 10580 + }, + { + "epoch": 5.002836879432624, + "grad_norm": 2.703989028930664, + "learning_rate": 3.3827123294879416e-07, + "loss": 0.3236, + "step": 10581 + }, + { + "epoch": 5.003309692671395, + "grad_norm": 3.112179756164551, + "learning_rate": 3.379579499082583e-07, + "loss": 0.36, + "step": 10582 + }, + { + "epoch": 5.0037825059101655, + "grad_norm": 3.0279061794281006, + "learning_rate": 3.376448014886008e-07, + "loss": 0.3079, + "step": 10583 + }, + { + "epoch": 5.004255319148936, + "grad_norm": 3.638592481613159, + "learning_rate": 3.373317877093199e-07, + "loss": 0.3535, + "step": 10584 + }, + { + "epoch": 5.004728132387707, + "grad_norm": 3.7547621726989746, + "learning_rate": 3.3701890858990476e-07, + "loss": 0.3489, + "step": 10585 + }, + { + "epoch": 5.005200945626478, + "grad_norm": 3.151641607284546, + "learning_rate": 3.367061641498387e-07, + "loss": 0.2844, + "step": 10586 + }, + { + "epoch": 5.005673758865248, + "grad_norm": 2.8997435569763184, + "learning_rate": 3.363935544085939e-07, + "loss": 0.297, + "step": 10587 + }, + { + "epoch": 5.006146572104019, + "grad_norm": 3.1785173416137695, + "learning_rate": 3.360810793856345e-07, + "loss": 0.3055, + "step": 10588 + }, + { + "epoch": 5.006619385342789, + "grad_norm": 2.8876235485076904, + "learning_rate": 3.357687391004186e-07, + "loss": 0.3005, + "step": 10589 + }, + { + "epoch": 5.00709219858156, + "grad_norm": 2.9282939434051514, + "learning_rate": 3.354565335723936e-07, + "loss": 0.2367, + "step": 10590 + }, + { + "epoch": 5.007565011820331, + "grad_norm": 3.283669948577881, + "learning_rate": 3.3514446282099823e-07, + "loss": 0.3386, + "step": 10591 + }, + { + "epoch": 5.008037825059102, + "grad_norm": 3.2054243087768555, + "learning_rate": 3.348325268656652e-07, + "loss": 0.3192, + "step": 10592 + }, + { + "epoch": 5.008510638297873, + "grad_norm": 2.9525279998779297, + "learning_rate": 3.345207257258162e-07, + "loss": 0.3742, + "step": 10593 + }, + { + "epoch": 5.008983451536643, + "grad_norm": 2.8707218170166016, + "learning_rate": 3.3420905942086664e-07, + "loss": 0.2783, + "step": 10594 + }, + { + "epoch": 5.009456264775413, + "grad_norm": 2.9152603149414062, + "learning_rate": 3.3389752797022253e-07, + "loss": 0.3267, + "step": 10595 + }, + { + "epoch": 5.009929078014184, + "grad_norm": 3.2035791873931885, + "learning_rate": 3.3358613139328063e-07, + "loss": 0.3519, + "step": 10596 + }, + { + "epoch": 5.010401891252955, + "grad_norm": 3.4502341747283936, + "learning_rate": 3.3327486970943123e-07, + "loss": 0.3766, + "step": 10597 + }, + { + "epoch": 5.010874704491726, + "grad_norm": 2.740959405899048, + "learning_rate": 3.32963742938055e-07, + "loss": 0.2985, + "step": 10598 + }, + { + "epoch": 5.0113475177304965, + "grad_norm": 2.6652681827545166, + "learning_rate": 3.326527510985239e-07, + "loss": 0.3016, + "step": 10599 + }, + { + "epoch": 5.011820330969267, + "grad_norm": 2.980694532394409, + "learning_rate": 3.3234189421020304e-07, + "loss": 0.3407, + "step": 10600 + }, + { + "epoch": 5.012293144208038, + "grad_norm": 2.976670742034912, + "learning_rate": 3.320311722924474e-07, + "loss": 0.3156, + "step": 10601 + }, + { + "epoch": 5.012765957446809, + "grad_norm": 3.8425865173339844, + "learning_rate": 
3.317205853646044e-07, + "loss": 0.3362, + "step": 10602 + }, + { + "epoch": 5.013238770685579, + "grad_norm": 3.247138023376465, + "learning_rate": 3.314101334460129e-07, + "loss": 0.2753, + "step": 10603 + }, + { + "epoch": 5.01371158392435, + "grad_norm": 3.321345567703247, + "learning_rate": 3.310998165560031e-07, + "loss": 0.3464, + "step": 10604 + }, + { + "epoch": 5.01418439716312, + "grad_norm": 3.3347668647766113, + "learning_rate": 3.307896347138978e-07, + "loss": 0.3349, + "step": 10605 + }, + { + "epoch": 5.014657210401891, + "grad_norm": 2.7465898990631104, + "learning_rate": 3.304795879390096e-07, + "loss": 0.3201, + "step": 10606 + }, + { + "epoch": 5.015130023640662, + "grad_norm": 2.986417293548584, + "learning_rate": 3.301696762506448e-07, + "loss": 0.345, + "step": 10607 + }, + { + "epoch": 5.015602836879433, + "grad_norm": 2.928632974624634, + "learning_rate": 3.2985989966810017e-07, + "loss": 0.3285, + "step": 10608 + }, + { + "epoch": 5.0160756501182036, + "grad_norm": 3.1549665927886963, + "learning_rate": 3.29550258210663e-07, + "loss": 0.3129, + "step": 10609 + }, + { + "epoch": 5.016548463356974, + "grad_norm": 3.2318291664123535, + "learning_rate": 3.292407518976148e-07, + "loss": 0.3152, + "step": 10610 + }, + { + "epoch": 5.017021276595744, + "grad_norm": 2.8423120975494385, + "learning_rate": 3.2893138074822613e-07, + "loss": 0.3186, + "step": 10611 + }, + { + "epoch": 5.017494089834515, + "grad_norm": 3.5018017292022705, + "learning_rate": 3.2862214478176084e-07, + "loss": 0.3586, + "step": 10612 + }, + { + "epoch": 5.017966903073286, + "grad_norm": 3.0157501697540283, + "learning_rate": 3.283130440174734e-07, + "loss": 0.2825, + "step": 10613 + }, + { + "epoch": 5.018439716312057, + "grad_norm": 2.756171226501465, + "learning_rate": 3.280040784746097e-07, + "loss": 0.2934, + "step": 10614 + }, + { + "epoch": 5.0189125295508275, + "grad_norm": 3.8830623626708984, + "learning_rate": 3.2769524817240855e-07, + "loss": 0.3515, + "step": 10615 + }, + { + "epoch": 5.019385342789598, + "grad_norm": 3.1528213024139404, + "learning_rate": 3.2738655313009903e-07, + "loss": 0.37, + "step": 10616 + }, + { + "epoch": 5.019858156028369, + "grad_norm": 2.943558692932129, + "learning_rate": 3.270779933669022e-07, + "loss": 0.2928, + "step": 10617 + }, + { + "epoch": 5.02033096926714, + "grad_norm": 3.229234457015991, + "learning_rate": 3.267695689020309e-07, + "loss": 0.3264, + "step": 10618 + }, + { + "epoch": 5.02080378250591, + "grad_norm": 3.3662054538726807, + "learning_rate": 3.264612797546884e-07, + "loss": 0.3564, + "step": 10619 + }, + { + "epoch": 5.0212765957446805, + "grad_norm": 2.895679235458374, + "learning_rate": 3.2615312594407227e-07, + "loss": 0.3013, + "step": 10620 + }, + { + "epoch": 5.021749408983451, + "grad_norm": 3.1217849254608154, + "learning_rate": 3.258451074893687e-07, + "loss": 0.3122, + "step": 10621 + }, + { + "epoch": 5.022222222222222, + "grad_norm": 3.1026153564453125, + "learning_rate": 3.255372244097563e-07, + "loss": 0.3466, + "step": 10622 + }, + { + "epoch": 5.022695035460993, + "grad_norm": 2.99145245552063, + "learning_rate": 3.2522947672440684e-07, + "loss": 0.2763, + "step": 10623 + }, + { + "epoch": 5.023167848699764, + "grad_norm": 3.6011345386505127, + "learning_rate": 3.2492186445248125e-07, + "loss": 0.3652, + "step": 10624 + }, + { + "epoch": 5.0236406619385345, + "grad_norm": 3.3117830753326416, + "learning_rate": 3.246143876131344e-07, + "loss": 0.3419, + "step": 10625 + }, + { + "epoch": 5.024113475177305, + "grad_norm": 
3.467041492462158, + "learning_rate": 3.243070462255107e-07, + "loss": 0.3495, + "step": 10626 + }, + { + "epoch": 5.024586288416075, + "grad_norm": 3.3493213653564453, + "learning_rate": 3.2399984030874657e-07, + "loss": 0.3214, + "step": 10627 + }, + { + "epoch": 5.025059101654846, + "grad_norm": 2.9000561237335205, + "learning_rate": 3.236927698819714e-07, + "loss": 0.3358, + "step": 10628 + }, + { + "epoch": 5.025531914893617, + "grad_norm": 3.1759841442108154, + "learning_rate": 3.233858349643043e-07, + "loss": 0.3122, + "step": 10629 + }, + { + "epoch": 5.026004728132388, + "grad_norm": 3.0218966007232666, + "learning_rate": 3.2307903557485754e-07, + "loss": 0.3051, + "step": 10630 + }, + { + "epoch": 5.026477541371158, + "grad_norm": 3.0489871501922607, + "learning_rate": 3.227723717327336e-07, + "loss": 0.3551, + "step": 10631 + }, + { + "epoch": 5.026950354609929, + "grad_norm": 3.190955638885498, + "learning_rate": 3.224658434570271e-07, + "loss": 0.3282, + "step": 10632 + }, + { + "epoch": 5.0274231678487, + "grad_norm": 3.4504268169403076, + "learning_rate": 3.2215945076682463e-07, + "loss": 0.3401, + "step": 10633 + }, + { + "epoch": 5.027895981087471, + "grad_norm": 2.847717761993408, + "learning_rate": 3.2185319368120304e-07, + "loss": 0.3045, + "step": 10634 + }, + { + "epoch": 5.028368794326241, + "grad_norm": 3.450645923614502, + "learning_rate": 3.2154707221923265e-07, + "loss": 0.3026, + "step": 10635 + }, + { + "epoch": 5.0288416075650115, + "grad_norm": 3.739544630050659, + "learning_rate": 3.212410863999738e-07, + "loss": 0.3375, + "step": 10636 + }, + { + "epoch": 5.029314420803782, + "grad_norm": 3.1687986850738525, + "learning_rate": 3.2093523624247837e-07, + "loss": 0.2995, + "step": 10637 + }, + { + "epoch": 5.029787234042553, + "grad_norm": 3.5549468994140625, + "learning_rate": 3.2062952176579147e-07, + "loss": 0.3618, + "step": 10638 + }, + { + "epoch": 5.030260047281324, + "grad_norm": 3.1378657817840576, + "learning_rate": 3.2032394298894766e-07, + "loss": 0.3621, + "step": 10639 + }, + { + "epoch": 5.030732860520095, + "grad_norm": 2.9576902389526367, + "learning_rate": 3.200184999309747e-07, + "loss": 0.2966, + "step": 10640 + }, + { + "epoch": 5.0312056737588655, + "grad_norm": 2.8072168827056885, + "learning_rate": 3.197131926108907e-07, + "loss": 0.3185, + "step": 10641 + }, + { + "epoch": 5.031678486997636, + "grad_norm": 2.894251585006714, + "learning_rate": 3.1940802104770547e-07, + "loss": 0.3022, + "step": 10642 + }, + { + "epoch": 5.032151300236406, + "grad_norm": 3.3244433403015137, + "learning_rate": 3.191029852604216e-07, + "loss": 0.2689, + "step": 10643 + }, + { + "epoch": 5.032624113475177, + "grad_norm": 2.8501358032226562, + "learning_rate": 3.187980852680314e-07, + "loss": 0.2812, + "step": 10644 + }, + { + "epoch": 5.033096926713948, + "grad_norm": 2.975888252258301, + "learning_rate": 3.184933210895208e-07, + "loss": 0.2849, + "step": 10645 + }, + { + "epoch": 5.033569739952719, + "grad_norm": 3.391071081161499, + "learning_rate": 3.1818869274386543e-07, + "loss": 0.3458, + "step": 10646 + }, + { + "epoch": 5.034042553191489, + "grad_norm": 2.735366106033325, + "learning_rate": 3.178842002500332e-07, + "loss": 0.32, + "step": 10647 + }, + { + "epoch": 5.03451536643026, + "grad_norm": 3.1227850914001465, + "learning_rate": 3.175798436269836e-07, + "loss": 0.268, + "step": 10648 + }, + { + "epoch": 5.034988179669031, + "grad_norm": 3.4545602798461914, + "learning_rate": 3.172756228936674e-07, + "loss": 0.3474, + "step": 10649 + }, + { + 
"epoch": 5.035460992907802, + "grad_norm": 3.0225183963775635, + "learning_rate": 3.1697153806902665e-07, + "loss": 0.3511, + "step": 10650 + }, + { + "epoch": 5.035933806146572, + "grad_norm": 3.1183130741119385, + "learning_rate": 3.166675891719967e-07, + "loss": 0.3074, + "step": 10651 + }, + { + "epoch": 5.0364066193853425, + "grad_norm": 4.368288040161133, + "learning_rate": 3.163637762215016e-07, + "loss": 0.3359, + "step": 10652 + }, + { + "epoch": 5.036879432624113, + "grad_norm": 3.097292184829712, + "learning_rate": 3.1606009923645986e-07, + "loss": 0.2928, + "step": 10653 + }, + { + "epoch": 5.037352245862884, + "grad_norm": 3.81937837600708, + "learning_rate": 3.157565582357794e-07, + "loss": 0.3097, + "step": 10654 + }, + { + "epoch": 5.037825059101655, + "grad_norm": 3.05619215965271, + "learning_rate": 3.1545315323835977e-07, + "loss": 0.3061, + "step": 10655 + }, + { + "epoch": 5.038297872340426, + "grad_norm": 3.4275028705596924, + "learning_rate": 3.15149884263094e-07, + "loss": 0.3396, + "step": 10656 + }, + { + "epoch": 5.0387706855791965, + "grad_norm": 2.899402379989624, + "learning_rate": 3.148467513288639e-07, + "loss": 0.3049, + "step": 10657 + }, + { + "epoch": 5.039243498817967, + "grad_norm": 2.9791312217712402, + "learning_rate": 3.145437544545457e-07, + "loss": 0.286, + "step": 10658 + }, + { + "epoch": 5.039716312056737, + "grad_norm": 2.78840708732605, + "learning_rate": 3.1424089365900524e-07, + "loss": 0.3045, + "step": 10659 + }, + { + "epoch": 5.040189125295508, + "grad_norm": 3.066490650177002, + "learning_rate": 3.139381689610993e-07, + "loss": 0.3093, + "step": 10660 + }, + { + "epoch": 5.040661938534279, + "grad_norm": 3.0271544456481934, + "learning_rate": 3.136355803796784e-07, + "loss": 0.3018, + "step": 10661 + }, + { + "epoch": 5.04113475177305, + "grad_norm": 3.203864336013794, + "learning_rate": 3.13333127933583e-07, + "loss": 0.3668, + "step": 10662 + }, + { + "epoch": 5.04160756501182, + "grad_norm": 3.2266061305999756, + "learning_rate": 3.13030811641645e-07, + "loss": 0.3551, + "step": 10663 + }, + { + "epoch": 5.042080378250591, + "grad_norm": 3.262101650238037, + "learning_rate": 3.1272863152268935e-07, + "loss": 0.346, + "step": 10664 + }, + { + "epoch": 5.042553191489362, + "grad_norm": 3.701680898666382, + "learning_rate": 3.12426587595531e-07, + "loss": 0.2847, + "step": 10665 + }, + { + "epoch": 5.043026004728133, + "grad_norm": 3.3046317100524902, + "learning_rate": 3.121246798789768e-07, + "loss": 0.3787, + "step": 10666 + }, + { + "epoch": 5.043498817966903, + "grad_norm": 3.1488304138183594, + "learning_rate": 3.118229083918245e-07, + "loss": 0.2567, + "step": 10667 + }, + { + "epoch": 5.0439716312056735, + "grad_norm": 3.2503244876861572, + "learning_rate": 3.115212731528655e-07, + "loss": 0.3322, + "step": 10668 + }, + { + "epoch": 5.044444444444444, + "grad_norm": 3.071131706237793, + "learning_rate": 3.112197741808809e-07, + "loss": 0.2992, + "step": 10669 + }, + { + "epoch": 5.044917257683215, + "grad_norm": 3.5055527687072754, + "learning_rate": 3.1091841149464256e-07, + "loss": 0.3817, + "step": 10670 + }, + { + "epoch": 5.045390070921986, + "grad_norm": 3.1892800331115723, + "learning_rate": 3.106171851129164e-07, + "loss": 0.3413, + "step": 10671 + }, + { + "epoch": 5.045862884160757, + "grad_norm": 3.267735481262207, + "learning_rate": 3.1031609505445814e-07, + "loss": 0.3352, + "step": 10672 + }, + { + "epoch": 5.0463356973995275, + "grad_norm": 3.1450655460357666, + "learning_rate": 3.100151413380145e-07, + "loss": 
0.3002, + "step": 10673 + }, + { + "epoch": 5.046808510638298, + "grad_norm": 3.761446475982666, + "learning_rate": 3.097143239823261e-07, + "loss": 0.366, + "step": 10674 + }, + { + "epoch": 5.047281323877068, + "grad_norm": 3.272583246231079, + "learning_rate": 3.0941364300612177e-07, + "loss": 0.3651, + "step": 10675 + }, + { + "epoch": 5.047754137115839, + "grad_norm": 3.0774614810943604, + "learning_rate": 3.0911309842812487e-07, + "loss": 0.3441, + "step": 10676 + }, + { + "epoch": 5.04822695035461, + "grad_norm": 3.3999059200286865, + "learning_rate": 3.088126902670488e-07, + "loss": 0.3242, + "step": 10677 + }, + { + "epoch": 5.048699763593381, + "grad_norm": 3.13442325592041, + "learning_rate": 3.0851241854159804e-07, + "loss": 0.3085, + "step": 10678 + }, + { + "epoch": 5.049172576832151, + "grad_norm": 3.2339842319488525, + "learning_rate": 3.082122832704698e-07, + "loss": 0.3315, + "step": 10679 + }, + { + "epoch": 5.049645390070922, + "grad_norm": 2.742600202560425, + "learning_rate": 3.079122844723523e-07, + "loss": 0.2974, + "step": 10680 + }, + { + "epoch": 5.050118203309693, + "grad_norm": 2.778710126876831, + "learning_rate": 3.0761242216592463e-07, + "loss": 0.317, + "step": 10681 + }, + { + "epoch": 5.050591016548464, + "grad_norm": 3.219449520111084, + "learning_rate": 3.0731269636985805e-07, + "loss": 0.3692, + "step": 10682 + }, + { + "epoch": 5.051063829787234, + "grad_norm": 3.0930869579315186, + "learning_rate": 3.0701310710281486e-07, + "loss": 0.3278, + "step": 10683 + }, + { + "epoch": 5.0515366430260045, + "grad_norm": 3.427046298980713, + "learning_rate": 3.0671365438345006e-07, + "loss": 0.3162, + "step": 10684 + }, + { + "epoch": 5.052009456264775, + "grad_norm": 3.582806348800659, + "learning_rate": 3.06414338230408e-07, + "loss": 0.3586, + "step": 10685 + }, + { + "epoch": 5.052482269503546, + "grad_norm": 3.381739377975464, + "learning_rate": 3.0611515866232704e-07, + "loss": 0.3436, + "step": 10686 + }, + { + "epoch": 5.052955082742317, + "grad_norm": 3.377415418624878, + "learning_rate": 3.058161156978356e-07, + "loss": 0.3767, + "step": 10687 + }, + { + "epoch": 5.053427895981088, + "grad_norm": 3.4613475799560547, + "learning_rate": 3.055172093555525e-07, + "loss": 0.3364, + "step": 10688 + }, + { + "epoch": 5.0539007092198585, + "grad_norm": 2.8797378540039062, + "learning_rate": 3.052184396540911e-07, + "loss": 0.2771, + "step": 10689 + }, + { + "epoch": 5.054373522458629, + "grad_norm": 3.4278573989868164, + "learning_rate": 3.0491980661205287e-07, + "loss": 0.355, + "step": 10690 + }, + { + "epoch": 5.054846335697399, + "grad_norm": 2.910229444503784, + "learning_rate": 3.046213102480339e-07, + "loss": 0.2778, + "step": 10691 + }, + { + "epoch": 5.05531914893617, + "grad_norm": 2.9667794704437256, + "learning_rate": 3.0432295058061945e-07, + "loss": 0.2809, + "step": 10692 + }, + { + "epoch": 5.055791962174941, + "grad_norm": 3.119274854660034, + "learning_rate": 3.040247276283864e-07, + "loss": 0.3429, + "step": 10693 + }, + { + "epoch": 5.0562647754137116, + "grad_norm": 3.1743738651275635, + "learning_rate": 3.037266414099052e-07, + "loss": 0.2842, + "step": 10694 + }, + { + "epoch": 5.056737588652482, + "grad_norm": 2.9281697273254395, + "learning_rate": 3.0342869194373544e-07, + "loss": 0.3083, + "step": 10695 + }, + { + "epoch": 5.057210401891253, + "grad_norm": 2.8758695125579834, + "learning_rate": 3.0313087924842974e-07, + "loss": 0.3253, + "step": 10696 + }, + { + "epoch": 5.057683215130024, + "grad_norm": 3.0644049644470215, + 
"learning_rate": 3.0283320334253074e-07, + "loss": 0.3108, + "step": 10697 + }, + { + "epoch": 5.058156028368795, + "grad_norm": 3.1170527935028076, + "learning_rate": 3.025356642445737e-07, + "loss": 0.3578, + "step": 10698 + }, + { + "epoch": 5.058628841607565, + "grad_norm": 3.1002635955810547, + "learning_rate": 3.022382619730857e-07, + "loss": 0.2711, + "step": 10699 + }, + { + "epoch": 5.0591016548463354, + "grad_norm": 3.3275279998779297, + "learning_rate": 3.0194099654658414e-07, + "loss": 0.3266, + "step": 10700 + }, + { + "epoch": 5.059574468085106, + "grad_norm": 3.003758192062378, + "learning_rate": 3.0164386798357814e-07, + "loss": 0.325, + "step": 10701 + }, + { + "epoch": 5.060047281323877, + "grad_norm": 3.391663074493408, + "learning_rate": 3.013468763025695e-07, + "loss": 0.3394, + "step": 10702 + }, + { + "epoch": 5.060520094562648, + "grad_norm": 3.3918750286102295, + "learning_rate": 3.0105002152204936e-07, + "loss": 0.3408, + "step": 10703 + }, + { + "epoch": 5.060992907801419, + "grad_norm": 3.5883846282958984, + "learning_rate": 3.0075330366050317e-07, + "loss": 0.3174, + "step": 10704 + }, + { + "epoch": 5.061465721040189, + "grad_norm": 2.7748823165893555, + "learning_rate": 3.00456722736405e-07, + "loss": 0.2628, + "step": 10705 + }, + { + "epoch": 5.06193853427896, + "grad_norm": 3.1777453422546387, + "learning_rate": 3.0016027876822147e-07, + "loss": 0.314, + "step": 10706 + }, + { + "epoch": 5.06241134751773, + "grad_norm": 2.857104778289795, + "learning_rate": 2.99863971774412e-07, + "loss": 0.2587, + "step": 10707 + }, + { + "epoch": 5.062884160756501, + "grad_norm": 3.162203550338745, + "learning_rate": 2.995678017734252e-07, + "loss": 0.3167, + "step": 10708 + }, + { + "epoch": 5.063356973995272, + "grad_norm": 2.9197676181793213, + "learning_rate": 2.992717687837032e-07, + "loss": 0.2971, + "step": 10709 + }, + { + "epoch": 5.0638297872340425, + "grad_norm": 3.4100844860076904, + "learning_rate": 2.9897587282367856e-07, + "loss": 0.2964, + "step": 10710 + }, + { + "epoch": 5.064302600472813, + "grad_norm": 3.562899351119995, + "learning_rate": 2.986801139117748e-07, + "loss": 0.3344, + "step": 10711 + }, + { + "epoch": 5.064775413711584, + "grad_norm": 2.8942716121673584, + "learning_rate": 2.9838449206640806e-07, + "loss": 0.3082, + "step": 10712 + }, + { + "epoch": 5.065248226950355, + "grad_norm": 3.159891366958618, + "learning_rate": 2.9808900730598445e-07, + "loss": 0.3353, + "step": 10713 + }, + { + "epoch": 5.065721040189126, + "grad_norm": 2.9876580238342285, + "learning_rate": 2.9779365964890395e-07, + "loss": 0.3224, + "step": 10714 + }, + { + "epoch": 5.066193853427896, + "grad_norm": 3.038656234741211, + "learning_rate": 2.974984491135557e-07, + "loss": 0.347, + "step": 10715 + }, + { + "epoch": 5.066666666666666, + "grad_norm": 3.407076358795166, + "learning_rate": 2.9720337571832126e-07, + "loss": 0.3057, + "step": 10716 + }, + { + "epoch": 5.067139479905437, + "grad_norm": 3.6783857345581055, + "learning_rate": 2.9690843948157384e-07, + "loss": 0.3764, + "step": 10717 + }, + { + "epoch": 5.067612293144208, + "grad_norm": 3.081132173538208, + "learning_rate": 2.9661364042167765e-07, + "loss": 0.326, + "step": 10718 + }, + { + "epoch": 5.068085106382979, + "grad_norm": 3.4150638580322266, + "learning_rate": 2.9631897855698784e-07, + "loss": 0.3286, + "step": 10719 + }, + { + "epoch": 5.06855791962175, + "grad_norm": 3.6125104427337646, + "learning_rate": 2.960244539058532e-07, + "loss": 0.3523, + "step": 10720 + }, + { + "epoch": 
5.06903073286052, + "grad_norm": 3.38793683052063, + "learning_rate": 2.9573006648661084e-07, + "loss": 0.3552, + "step": 10721 + }, + { + "epoch": 5.069503546099291, + "grad_norm": 3.3725569248199463, + "learning_rate": 2.9543581631759243e-07, + "loss": 0.3445, + "step": 10722 + }, + { + "epoch": 5.069976359338061, + "grad_norm": 3.1193649768829346, + "learning_rate": 2.9514170341711917e-07, + "loss": 0.312, + "step": 10723 + }, + { + "epoch": 5.070449172576832, + "grad_norm": 3.3182318210601807, + "learning_rate": 2.948477278035033e-07, + "loss": 0.357, + "step": 10724 + }, + { + "epoch": 5.070921985815603, + "grad_norm": 2.945859909057617, + "learning_rate": 2.9455388949505087e-07, + "loss": 0.2992, + "step": 10725 + }, + { + "epoch": 5.0713947990543735, + "grad_norm": 3.269683599472046, + "learning_rate": 2.942601885100571e-07, + "loss": 0.3415, + "step": 10726 + }, + { + "epoch": 5.071867612293144, + "grad_norm": 3.268453598022461, + "learning_rate": 2.939666248668094e-07, + "loss": 0.3655, + "step": 10727 + }, + { + "epoch": 5.072340425531915, + "grad_norm": 3.0961155891418457, + "learning_rate": 2.936731985835864e-07, + "loss": 0.3281, + "step": 10728 + }, + { + "epoch": 5.072813238770686, + "grad_norm": 2.9241912364959717, + "learning_rate": 2.9337990967865935e-07, + "loss": 0.3229, + "step": 10729 + }, + { + "epoch": 5.073286052009456, + "grad_norm": 3.4160847663879395, + "learning_rate": 2.9308675817028955e-07, + "loss": 0.3711, + "step": 10730 + }, + { + "epoch": 5.073758865248227, + "grad_norm": 3.7135009765625, + "learning_rate": 2.927937440767298e-07, + "loss": 0.3194, + "step": 10731 + }, + { + "epoch": 5.074231678486997, + "grad_norm": 3.281792163848877, + "learning_rate": 2.925008674162258e-07, + "loss": 0.3679, + "step": 10732 + }, + { + "epoch": 5.074704491725768, + "grad_norm": 3.290614366531372, + "learning_rate": 2.922081282070133e-07, + "loss": 0.3811, + "step": 10733 + }, + { + "epoch": 5.075177304964539, + "grad_norm": 3.0560739040374756, + "learning_rate": 2.9191552646731904e-07, + "loss": 0.3258, + "step": 10734 + }, + { + "epoch": 5.07565011820331, + "grad_norm": 3.7462680339813232, + "learning_rate": 2.916230622153635e-07, + "loss": 0.3541, + "step": 10735 + }, + { + "epoch": 5.076122931442081, + "grad_norm": 3.315908193588257, + "learning_rate": 2.9133073546935564e-07, + "loss": 0.3232, + "step": 10736 + }, + { + "epoch": 5.076595744680851, + "grad_norm": 5.690020561218262, + "learning_rate": 2.9103854624749907e-07, + "loss": 0.2898, + "step": 10737 + }, + { + "epoch": 5.077068557919622, + "grad_norm": 2.9824652671813965, + "learning_rate": 2.907464945679861e-07, + "loss": 0.3538, + "step": 10738 + }, + { + "epoch": 5.077541371158392, + "grad_norm": 3.5233614444732666, + "learning_rate": 2.9045458044900084e-07, + "loss": 0.3549, + "step": 10739 + }, + { + "epoch": 5.078014184397163, + "grad_norm": 3.1448633670806885, + "learning_rate": 2.901628039087212e-07, + "loss": 0.3548, + "step": 10740 + }, + { + "epoch": 5.078486997635934, + "grad_norm": 3.2169477939605713, + "learning_rate": 2.8987116496531356e-07, + "loss": 0.371, + "step": 10741 + }, + { + "epoch": 5.0789598108747045, + "grad_norm": 2.862016201019287, + "learning_rate": 2.895796636369372e-07, + "loss": 0.3063, + "step": 10742 + }, + { + "epoch": 5.079432624113475, + "grad_norm": 2.875854253768921, + "learning_rate": 2.892882999417429e-07, + "loss": 0.3384, + "step": 10743 + }, + { + "epoch": 5.079905437352246, + "grad_norm": 2.9410934448242188, + "learning_rate": 2.8899707389787285e-07, + "loss": 
0.2969, + "step": 10744 + }, + { + "epoch": 5.080378250591017, + "grad_norm": 3.3200604915618896, + "learning_rate": 2.8870598552345973e-07, + "loss": 0.3632, + "step": 10745 + }, + { + "epoch": 5.080851063829787, + "grad_norm": 2.933659315109253, + "learning_rate": 2.884150348366288e-07, + "loss": 0.2858, + "step": 10746 + }, + { + "epoch": 5.081323877068558, + "grad_norm": 3.1934702396392822, + "learning_rate": 2.8812422185549584e-07, + "loss": 0.322, + "step": 10747 + }, + { + "epoch": 5.081796690307328, + "grad_norm": 3.283770799636841, + "learning_rate": 2.878335465981691e-07, + "loss": 0.3101, + "step": 10748 + }, + { + "epoch": 5.082269503546099, + "grad_norm": 3.2978734970092773, + "learning_rate": 2.8754300908274665e-07, + "loss": 0.3595, + "step": 10749 + }, + { + "epoch": 5.08274231678487, + "grad_norm": 3.335786819458008, + "learning_rate": 2.872526093273206e-07, + "loss": 0.3647, + "step": 10750 + }, + { + "epoch": 5.083215130023641, + "grad_norm": 3.0334603786468506, + "learning_rate": 2.8696234734997163e-07, + "loss": 0.2988, + "step": 10751 + }, + { + "epoch": 5.083687943262412, + "grad_norm": 4.127038478851318, + "learning_rate": 2.86672223168773e-07, + "loss": 0.3852, + "step": 10752 + }, + { + "epoch": 5.084160756501182, + "grad_norm": 2.993657350540161, + "learning_rate": 2.8638223680179044e-07, + "loss": 0.3036, + "step": 10753 + }, + { + "epoch": 5.084633569739952, + "grad_norm": 3.19315505027771, + "learning_rate": 2.860923882670791e-07, + "loss": 0.3314, + "step": 10754 + }, + { + "epoch": 5.085106382978723, + "grad_norm": 3.3229005336761475, + "learning_rate": 2.8580267758268747e-07, + "loss": 0.3511, + "step": 10755 + }, + { + "epoch": 5.085579196217494, + "grad_norm": 2.942401885986328, + "learning_rate": 2.855131047666543e-07, + "loss": 0.2576, + "step": 10756 + }, + { + "epoch": 5.086052009456265, + "grad_norm": 3.43926739692688, + "learning_rate": 2.8522366983700924e-07, + "loss": 0.315, + "step": 10757 + }, + { + "epoch": 5.0865248226950355, + "grad_norm": 3.041480541229248, + "learning_rate": 2.849343728117754e-07, + "loss": 0.323, + "step": 10758 + }, + { + "epoch": 5.086997635933806, + "grad_norm": 3.1940221786499023, + "learning_rate": 2.846452137089653e-07, + "loss": 0.3232, + "step": 10759 + }, + { + "epoch": 5.087470449172577, + "grad_norm": 3.105175018310547, + "learning_rate": 2.8435619254658347e-07, + "loss": 0.3276, + "step": 10760 + }, + { + "epoch": 5.087943262411348, + "grad_norm": 3.0791914463043213, + "learning_rate": 2.840673093426266e-07, + "loss": 0.3319, + "step": 10761 + }, + { + "epoch": 5.088416075650118, + "grad_norm": 3.131887435913086, + "learning_rate": 2.837785641150811e-07, + "loss": 0.3716, + "step": 10762 + }, + { + "epoch": 5.088888888888889, + "grad_norm": 3.0012013912200928, + "learning_rate": 2.8348995688192716e-07, + "loss": 0.3748, + "step": 10763 + }, + { + "epoch": 5.089361702127659, + "grad_norm": 3.021074056625366, + "learning_rate": 2.832014876611339e-07, + "loss": 0.3179, + "step": 10764 + }, + { + "epoch": 5.08983451536643, + "grad_norm": 3.4271838665008545, + "learning_rate": 2.829131564706641e-07, + "loss": 0.3445, + "step": 10765 + }, + { + "epoch": 5.090307328605201, + "grad_norm": 3.4268107414245605, + "learning_rate": 2.8262496332847057e-07, + "loss": 0.3633, + "step": 10766 + }, + { + "epoch": 5.090780141843972, + "grad_norm": 2.9987759590148926, + "learning_rate": 2.823369082524971e-07, + "loss": 0.2973, + "step": 10767 + }, + { + "epoch": 5.091252955082743, + "grad_norm": 3.0510613918304443, + 
"learning_rate": 2.8204899126068054e-07, + "loss": 0.3014, + "step": 10768 + }, + { + "epoch": 5.091725768321513, + "grad_norm": 3.7389485836029053, + "learning_rate": 2.817612123709482e-07, + "loss": 0.3693, + "step": 10769 + }, + { + "epoch": 5.092198581560283, + "grad_norm": 3.183263063430786, + "learning_rate": 2.814735716012176e-07, + "loss": 0.2895, + "step": 10770 + }, + { + "epoch": 5.092671394799054, + "grad_norm": 3.173675537109375, + "learning_rate": 2.8118606896940043e-07, + "loss": 0.3107, + "step": 10771 + }, + { + "epoch": 5.093144208037825, + "grad_norm": 3.2929723262786865, + "learning_rate": 2.808987044933972e-07, + "loss": 0.3406, + "step": 10772 + }, + { + "epoch": 5.093617021276596, + "grad_norm": 3.4711458683013916, + "learning_rate": 2.806114781911015e-07, + "loss": 0.372, + "step": 10773 + }, + { + "epoch": 5.0940898345153665, + "grad_norm": 3.105397939682007, + "learning_rate": 2.803243900803973e-07, + "loss": 0.328, + "step": 10774 + }, + { + "epoch": 5.094562647754137, + "grad_norm": 2.959874391555786, + "learning_rate": 2.800374401791606e-07, + "loss": 0.2888, + "step": 10775 + }, + { + "epoch": 5.095035460992908, + "grad_norm": 2.6287355422973633, + "learning_rate": 2.7975062850525785e-07, + "loss": 0.2786, + "step": 10776 + }, + { + "epoch": 5.095508274231679, + "grad_norm": 3.32594633102417, + "learning_rate": 2.794639550765477e-07, + "loss": 0.333, + "step": 10777 + }, + { + "epoch": 5.095981087470449, + "grad_norm": 3.1033830642700195, + "learning_rate": 2.7917741991088096e-07, + "loss": 0.3285, + "step": 10778 + }, + { + "epoch": 5.0964539007092196, + "grad_norm": 2.793583869934082, + "learning_rate": 2.78891023026098e-07, + "loss": 0.3097, + "step": 10779 + }, + { + "epoch": 5.09692671394799, + "grad_norm": 3.6989400386810303, + "learning_rate": 2.786047644400314e-07, + "loss": 0.3181, + "step": 10780 + }, + { + "epoch": 5.097399527186761, + "grad_norm": 3.064781427383423, + "learning_rate": 2.78318644170506e-07, + "loss": 0.332, + "step": 10781 + }, + { + "epoch": 5.097872340425532, + "grad_norm": 3.2844104766845703, + "learning_rate": 2.7803266223533633e-07, + "loss": 0.2825, + "step": 10782 + }, + { + "epoch": 5.098345153664303, + "grad_norm": 2.9916985034942627, + "learning_rate": 2.777468186523305e-07, + "loss": 0.3234, + "step": 10783 + }, + { + "epoch": 5.0988179669030735, + "grad_norm": 3.1486685276031494, + "learning_rate": 2.774611134392857e-07, + "loss": 0.3577, + "step": 10784 + }, + { + "epoch": 5.099290780141844, + "grad_norm": 3.140198230743408, + "learning_rate": 2.7717554661399135e-07, + "loss": 0.3689, + "step": 10785 + }, + { + "epoch": 5.099763593380614, + "grad_norm": 3.095646619796753, + "learning_rate": 2.768901181942296e-07, + "loss": 0.2729, + "step": 10786 + }, + { + "epoch": 5.100236406619385, + "grad_norm": 3.2754757404327393, + "learning_rate": 2.766048281977715e-07, + "loss": 0.3382, + "step": 10787 + }, + { + "epoch": 5.100709219858156, + "grad_norm": 3.1195523738861084, + "learning_rate": 2.76319676642382e-07, + "loss": 0.3374, + "step": 10788 + }, + { + "epoch": 5.101182033096927, + "grad_norm": 3.3726742267608643, + "learning_rate": 2.760346635458158e-07, + "loss": 0.3821, + "step": 10789 + }, + { + "epoch": 5.101654846335697, + "grad_norm": 3.679518461227417, + "learning_rate": 2.757497889258193e-07, + "loss": 0.3251, + "step": 10790 + }, + { + "epoch": 5.102127659574468, + "grad_norm": 3.3941988945007324, + "learning_rate": 2.7546505280013015e-07, + "loss": 0.339, + "step": 10791 + }, + { + "epoch": 5.102600472813239, + 
"grad_norm": 3.409128427505493, + "learning_rate": 2.7518045518647755e-07, + "loss": 0.3833, + "step": 10792 + }, + { + "epoch": 5.10307328605201, + "grad_norm": 3.2601206302642822, + "learning_rate": 2.7489599610258293e-07, + "loss": 0.3334, + "step": 10793 + }, + { + "epoch": 5.10354609929078, + "grad_norm": 3.1256062984466553, + "learning_rate": 2.746116755661579e-07, + "loss": 0.3033, + "step": 10794 + }, + { + "epoch": 5.1040189125295505, + "grad_norm": 3.135474443435669, + "learning_rate": 2.74327493594905e-07, + "loss": 0.3405, + "step": 10795 + }, + { + "epoch": 5.104491725768321, + "grad_norm": 3.2788431644439697, + "learning_rate": 2.740434502065206e-07, + "loss": 0.324, + "step": 10796 + }, + { + "epoch": 5.104964539007092, + "grad_norm": 2.980475902557373, + "learning_rate": 2.7375954541869005e-07, + "loss": 0.3592, + "step": 10797 + }, + { + "epoch": 5.105437352245863, + "grad_norm": 3.1811654567718506, + "learning_rate": 2.7347577924909023e-07, + "loss": 0.3358, + "step": 10798 + }, + { + "epoch": 5.105910165484634, + "grad_norm": 3.5174286365509033, + "learning_rate": 2.7319215171539126e-07, + "loss": 0.3746, + "step": 10799 + }, + { + "epoch": 5.1063829787234045, + "grad_norm": 3.340458869934082, + "learning_rate": 2.7290866283525203e-07, + "loss": 0.328, + "step": 10800 + }, + { + "epoch": 5.106855791962175, + "grad_norm": 2.9427425861358643, + "learning_rate": 2.7262531262632535e-07, + "loss": 0.3039, + "step": 10801 + }, + { + "epoch": 5.107328605200945, + "grad_norm": 3.4074652194976807, + "learning_rate": 2.723421011062541e-07, + "loss": 0.2857, + "step": 10802 + }, + { + "epoch": 5.107801418439716, + "grad_norm": 3.5216293334960938, + "learning_rate": 2.7205902829267136e-07, + "loss": 0.3802, + "step": 10803 + }, + { + "epoch": 5.108274231678487, + "grad_norm": 3.243743658065796, + "learning_rate": 2.717760942032041e-07, + "loss": 0.3502, + "step": 10804 + }, + { + "epoch": 5.108747044917258, + "grad_norm": 2.869746685028076, + "learning_rate": 2.7149329885546945e-07, + "loss": 0.2878, + "step": 10805 + }, + { + "epoch": 5.109219858156028, + "grad_norm": 3.643076181411743, + "learning_rate": 2.712106422670743e-07, + "loss": 0.334, + "step": 10806 + }, + { + "epoch": 5.109692671394799, + "grad_norm": 3.114863157272339, + "learning_rate": 2.7092812445562053e-07, + "loss": 0.3094, + "step": 10807 + }, + { + "epoch": 5.11016548463357, + "grad_norm": 3.081373691558838, + "learning_rate": 2.706457454386979e-07, + "loss": 0.3218, + "step": 10808 + }, + { + "epoch": 5.110638297872341, + "grad_norm": 3.2016193866729736, + "learning_rate": 2.703635052338896e-07, + "loss": 0.307, + "step": 10809 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 3.39798641204834, + "learning_rate": 2.700814038587685e-07, + "loss": 0.3047, + "step": 10810 + }, + { + "epoch": 5.1115839243498815, + "grad_norm": 3.0165305137634277, + "learning_rate": 2.6979944133090076e-07, + "loss": 0.3074, + "step": 10811 + }, + { + "epoch": 5.112056737588652, + "grad_norm": 3.063884735107422, + "learning_rate": 2.6951761766784295e-07, + "loss": 0.3493, + "step": 10812 + }, + { + "epoch": 5.112529550827423, + "grad_norm": 3.4540178775787354, + "learning_rate": 2.692359328871422e-07, + "loss": 0.3983, + "step": 10813 + }, + { + "epoch": 5.113002364066194, + "grad_norm": 3.297858476638794, + "learning_rate": 2.689543870063388e-07, + "loss": 0.3209, + "step": 10814 + }, + { + "epoch": 5.113475177304965, + "grad_norm": 3.200965642929077, + "learning_rate": 2.686729800429627e-07, + "loss": 0.3495, + "step": 10815 + 
}, + { + "epoch": 5.1139479905437355, + "grad_norm": 3.497377395629883, + "learning_rate": 2.683917120145357e-07, + "loss": 0.3548, + "step": 10816 + }, + { + "epoch": 5.114420803782506, + "grad_norm": 3.0813331604003906, + "learning_rate": 2.681105829385719e-07, + "loss": 0.3473, + "step": 10817 + }, + { + "epoch": 5.114893617021276, + "grad_norm": 3.73785138130188, + "learning_rate": 2.6782959283257525e-07, + "loss": 0.3621, + "step": 10818 + }, + { + "epoch": 5.115366430260047, + "grad_norm": 2.8933136463165283, + "learning_rate": 2.675487417140424e-07, + "loss": 0.3183, + "step": 10819 + }, + { + "epoch": 5.115839243498818, + "grad_norm": 3.239682197570801, + "learning_rate": 2.6726802960046037e-07, + "loss": 0.289, + "step": 10820 + }, + { + "epoch": 5.116312056737589, + "grad_norm": 3.0757391452789307, + "learning_rate": 2.6698745650930754e-07, + "loss": 0.303, + "step": 10821 + }, + { + "epoch": 5.116784869976359, + "grad_norm": 3.6242589950561523, + "learning_rate": 2.667070224580548e-07, + "loss": 0.3479, + "step": 10822 + }, + { + "epoch": 5.11725768321513, + "grad_norm": 3.0780205726623535, + "learning_rate": 2.6642672746416296e-07, + "loss": 0.3395, + "step": 10823 + }, + { + "epoch": 5.117730496453901, + "grad_norm": 3.202171564102173, + "learning_rate": 2.6614657154508506e-07, + "loss": 0.3551, + "step": 10824 + }, + { + "epoch": 5.118203309692672, + "grad_norm": 3.312523365020752, + "learning_rate": 2.658665547182651e-07, + "loss": 0.3553, + "step": 10825 + }, + { + "epoch": 5.118676122931442, + "grad_norm": 3.0124878883361816, + "learning_rate": 2.6558667700113757e-07, + "loss": 0.3094, + "step": 10826 + }, + { + "epoch": 5.1191489361702125, + "grad_norm": 3.496133804321289, + "learning_rate": 2.653069384111306e-07, + "loss": 0.3639, + "step": 10827 + }, + { + "epoch": 5.119621749408983, + "grad_norm": 3.0873615741729736, + "learning_rate": 2.6502733896566157e-07, + "loss": 0.3423, + "step": 10828 + }, + { + "epoch": 5.120094562647754, + "grad_norm": 3.3728535175323486, + "learning_rate": 2.647478786821403e-07, + "loss": 0.3022, + "step": 10829 + }, + { + "epoch": 5.120567375886525, + "grad_norm": 2.9433000087738037, + "learning_rate": 2.6446855757796736e-07, + "loss": 0.325, + "step": 10830 + }, + { + "epoch": 5.121040189125296, + "grad_norm": 3.447678804397583, + "learning_rate": 2.6418937567053436e-07, + "loss": 0.3677, + "step": 10831 + }, + { + "epoch": 5.1215130023640665, + "grad_norm": 3.1059327125549316, + "learning_rate": 2.6391033297722554e-07, + "loss": 0.3491, + "step": 10832 + }, + { + "epoch": 5.121985815602837, + "grad_norm": 3.0744543075561523, + "learning_rate": 2.636314295154152e-07, + "loss": 0.3571, + "step": 10833 + }, + { + "epoch": 5.122458628841607, + "grad_norm": 3.4055752754211426, + "learning_rate": 2.633526653024698e-07, + "loss": 0.3139, + "step": 10834 + }, + { + "epoch": 5.122931442080378, + "grad_norm": 3.4030723571777344, + "learning_rate": 2.630740403557466e-07, + "loss": 0.3373, + "step": 10835 + }, + { + "epoch": 5.123404255319149, + "grad_norm": 3.2269482612609863, + "learning_rate": 2.6279555469259366e-07, + "loss": 0.3636, + "step": 10836 + }, + { + "epoch": 5.12387706855792, + "grad_norm": 3.069843053817749, + "learning_rate": 2.625172083303523e-07, + "loss": 0.2865, + "step": 10837 + }, + { + "epoch": 5.12434988179669, + "grad_norm": 3.2536098957061768, + "learning_rate": 2.622390012863532e-07, + "loss": 0.3573, + "step": 10838 + }, + { + "epoch": 5.124822695035461, + "grad_norm": 3.369499683380127, + "learning_rate": 
2.619609335779194e-07, + "loss": 0.3227, + "step": 10839 + }, + { + "epoch": 5.125295508274232, + "grad_norm": 3.2754480838775635, + "learning_rate": 2.6168300522236456e-07, + "loss": 0.3547, + "step": 10840 + }, + { + "epoch": 5.125768321513003, + "grad_norm": 3.356863498687744, + "learning_rate": 2.61405216236994e-07, + "loss": 0.2894, + "step": 10841 + }, + { + "epoch": 5.126241134751773, + "grad_norm": 2.770984172821045, + "learning_rate": 2.61127566639105e-07, + "loss": 0.3068, + "step": 10842 + }, + { + "epoch": 5.1267139479905435, + "grad_norm": 3.1126701831817627, + "learning_rate": 2.608500564459851e-07, + "loss": 0.3644, + "step": 10843 + }, + { + "epoch": 5.127186761229314, + "grad_norm": 3.6737382411956787, + "learning_rate": 2.605726856749136e-07, + "loss": 0.3572, + "step": 10844 + }, + { + "epoch": 5.127659574468085, + "grad_norm": 3.1673147678375244, + "learning_rate": 2.6029545434316184e-07, + "loss": 0.3328, + "step": 10845 + }, + { + "epoch": 5.128132387706856, + "grad_norm": 3.1279728412628174, + "learning_rate": 2.6001836246799085e-07, + "loss": 0.3002, + "step": 10846 + }, + { + "epoch": 5.128605200945627, + "grad_norm": 3.544572591781616, + "learning_rate": 2.597414100666548e-07, + "loss": 0.332, + "step": 10847 + }, + { + "epoch": 5.1290780141843975, + "grad_norm": 3.3752872943878174, + "learning_rate": 2.5946459715639777e-07, + "loss": 0.3625, + "step": 10848 + }, + { + "epoch": 5.129550827423168, + "grad_norm": 3.279550075531006, + "learning_rate": 2.591879237544556e-07, + "loss": 0.3792, + "step": 10849 + }, + { + "epoch": 5.130023640661938, + "grad_norm": 3.277784824371338, + "learning_rate": 2.58911389878056e-07, + "loss": 0.2996, + "step": 10850 + }, + { + "epoch": 5.130496453900709, + "grad_norm": 3.387439012527466, + "learning_rate": 2.5863499554441703e-07, + "loss": 0.2971, + "step": 10851 + }, + { + "epoch": 5.13096926713948, + "grad_norm": 3.296769618988037, + "learning_rate": 2.5835874077074913e-07, + "loss": 0.2694, + "step": 10852 + }, + { + "epoch": 5.131442080378251, + "grad_norm": 3.693293333053589, + "learning_rate": 2.580826255742533e-07, + "loss": 0.4239, + "step": 10853 + }, + { + "epoch": 5.131914893617021, + "grad_norm": 3.611297845840454, + "learning_rate": 2.578066499721216e-07, + "loss": 0.3555, + "step": 10854 + }, + { + "epoch": 5.132387706855792, + "grad_norm": 3.4243459701538086, + "learning_rate": 2.575308139815383e-07, + "loss": 0.3395, + "step": 10855 + }, + { + "epoch": 5.132860520094563, + "grad_norm": 2.9317305088043213, + "learning_rate": 2.5725511761967775e-07, + "loss": 0.2877, + "step": 10856 + }, + { + "epoch": 5.133333333333334, + "grad_norm": 6.177371501922607, + "learning_rate": 2.5697956090370735e-07, + "loss": 0.3329, + "step": 10857 + }, + { + "epoch": 5.133806146572104, + "grad_norm": 3.071885108947754, + "learning_rate": 2.567041438507842e-07, + "loss": 0.3357, + "step": 10858 + }, + { + "epoch": 5.1342789598108745, + "grad_norm": 3.5805132389068604, + "learning_rate": 2.56428866478057e-07, + "loss": 0.317, + "step": 10859 + }, + { + "epoch": 5.134751773049645, + "grad_norm": 3.055856704711914, + "learning_rate": 2.561537288026672e-07, + "loss": 0.3098, + "step": 10860 + }, + { + "epoch": 5.135224586288416, + "grad_norm": 2.9992854595184326, + "learning_rate": 2.558787308417451e-07, + "loss": 0.3025, + "step": 10861 + }, + { + "epoch": 5.135697399527187, + "grad_norm": 2.9692001342773438, + "learning_rate": 2.5560387261241493e-07, + "loss": 0.2837, + "step": 10862 + }, + { + "epoch": 5.136170212765958, + "grad_norm": 
3.0246338844299316, + "learning_rate": 2.553291541317901e-07, + "loss": 0.2664, + "step": 10863 + }, + { + "epoch": 5.136643026004728, + "grad_norm": 3.2685534954071045, + "learning_rate": 2.5505457541697587e-07, + "loss": 0.3348, + "step": 10864 + }, + { + "epoch": 5.137115839243499, + "grad_norm": 3.8202273845672607, + "learning_rate": 2.5478013648506964e-07, + "loss": 0.369, + "step": 10865 + }, + { + "epoch": 5.137588652482269, + "grad_norm": 3.5275821685791016, + "learning_rate": 2.545058373531595e-07, + "loss": 0.3216, + "step": 10866 + }, + { + "epoch": 5.13806146572104, + "grad_norm": 3.353627920150757, + "learning_rate": 2.542316780383242e-07, + "loss": 0.3581, + "step": 10867 + }, + { + "epoch": 5.138534278959811, + "grad_norm": 3.2958984375, + "learning_rate": 2.539576585576356e-07, + "loss": 0.3056, + "step": 10868 + }, + { + "epoch": 5.1390070921985815, + "grad_norm": 3.8927693367004395, + "learning_rate": 2.536837789281546e-07, + "loss": 0.3333, + "step": 10869 + }, + { + "epoch": 5.139479905437352, + "grad_norm": 2.967616319656372, + "learning_rate": 2.5341003916693503e-07, + "loss": 0.3337, + "step": 10870 + }, + { + "epoch": 5.139952718676123, + "grad_norm": 3.290929079055786, + "learning_rate": 2.53136439291021e-07, + "loss": 0.2962, + "step": 10871 + }, + { + "epoch": 5.140425531914894, + "grad_norm": 3.2634384632110596, + "learning_rate": 2.5286297931744916e-07, + "loss": 0.2564, + "step": 10872 + }, + { + "epoch": 5.140898345153665, + "grad_norm": 2.955380439758301, + "learning_rate": 2.52589659263246e-07, + "loss": 0.3439, + "step": 10873 + }, + { + "epoch": 5.141371158392435, + "grad_norm": 3.039886951446533, + "learning_rate": 2.523164791454297e-07, + "loss": 0.3105, + "step": 10874 + }, + { + "epoch": 5.141843971631205, + "grad_norm": 3.562945604324341, + "learning_rate": 2.520434389810111e-07, + "loss": 0.323, + "step": 10875 + }, + { + "epoch": 5.142316784869976, + "grad_norm": 3.2454209327697754, + "learning_rate": 2.5177053878699036e-07, + "loss": 0.3365, + "step": 10876 + }, + { + "epoch": 5.142789598108747, + "grad_norm": 3.3156983852386475, + "learning_rate": 2.5149777858035964e-07, + "loss": 0.4163, + "step": 10877 + }, + { + "epoch": 5.143262411347518, + "grad_norm": 3.329939126968384, + "learning_rate": 2.5122515837810335e-07, + "loss": 0.3355, + "step": 10878 + }, + { + "epoch": 5.143735224586289, + "grad_norm": 3.1439883708953857, + "learning_rate": 2.509526781971955e-07, + "loss": 0.3297, + "step": 10879 + }, + { + "epoch": 5.144208037825059, + "grad_norm": 3.3979413509368896, + "learning_rate": 2.5068033805460274e-07, + "loss": 0.3214, + "step": 10880 + }, + { + "epoch": 5.14468085106383, + "grad_norm": 3.2689614295959473, + "learning_rate": 2.5040813796728266e-07, + "loss": 0.3377, + "step": 10881 + }, + { + "epoch": 5.1451536643026, + "grad_norm": 3.1197023391723633, + "learning_rate": 2.5013607795218303e-07, + "loss": 0.3081, + "step": 10882 + }, + { + "epoch": 5.145626477541371, + "grad_norm": 2.990771770477295, + "learning_rate": 2.498641580262451e-07, + "loss": 0.3112, + "step": 10883 + }, + { + "epoch": 5.146099290780142, + "grad_norm": 3.3050241470336914, + "learning_rate": 2.4959237820639884e-07, + "loss": 0.2987, + "step": 10884 + }, + { + "epoch": 5.1465721040189125, + "grad_norm": 3.4153940677642822, + "learning_rate": 2.493207385095681e-07, + "loss": 0.3271, + "step": 10885 + }, + { + "epoch": 5.147044917257683, + "grad_norm": 3.5337910652160645, + "learning_rate": 2.490492389526658e-07, + "loss": 0.3203, + "step": 10886 + }, + { + 
"epoch": 5.147517730496454, + "grad_norm": 3.0076584815979004, + "learning_rate": 2.487778795525975e-07, + "loss": 0.303, + "step": 10887 + }, + { + "epoch": 5.147990543735225, + "grad_norm": 3.292414665222168, + "learning_rate": 2.4850666032625925e-07, + "loss": 0.3296, + "step": 10888 + }, + { + "epoch": 5.148463356973995, + "grad_norm": 2.9681763648986816, + "learning_rate": 2.482355812905388e-07, + "loss": 0.3118, + "step": 10889 + }, + { + "epoch": 5.148936170212766, + "grad_norm": 2.911698818206787, + "learning_rate": 2.4796464246231446e-07, + "loss": 0.3224, + "step": 10890 + }, + { + "epoch": 5.149408983451536, + "grad_norm": 3.751932144165039, + "learning_rate": 2.4769384385845723e-07, + "loss": 0.3493, + "step": 10891 + }, + { + "epoch": 5.149881796690307, + "grad_norm": 3.595027208328247, + "learning_rate": 2.474231854958281e-07, + "loss": 0.364, + "step": 10892 + }, + { + "epoch": 5.150354609929078, + "grad_norm": 3.140770196914673, + "learning_rate": 2.4715266739128054e-07, + "loss": 0.3421, + "step": 10893 + }, + { + "epoch": 5.150827423167849, + "grad_norm": 3.0348970890045166, + "learning_rate": 2.4688228956165764e-07, + "loss": 0.2917, + "step": 10894 + }, + { + "epoch": 5.15130023640662, + "grad_norm": 3.466560125350952, + "learning_rate": 2.466120520237947e-07, + "loss": 0.3694, + "step": 10895 + }, + { + "epoch": 5.15177304964539, + "grad_norm": 3.148052930831909, + "learning_rate": 2.4634195479451867e-07, + "loss": 0.3096, + "step": 10896 + }, + { + "epoch": 5.152245862884161, + "grad_norm": 3.451205253601074, + "learning_rate": 2.46071997890647e-07, + "loss": 0.3961, + "step": 10897 + }, + { + "epoch": 5.152718676122931, + "grad_norm": 3.2556569576263428, + "learning_rate": 2.4580218132898904e-07, + "loss": 0.2722, + "step": 10898 + }, + { + "epoch": 5.153191489361702, + "grad_norm": 2.98459529876709, + "learning_rate": 2.455325051263449e-07, + "loss": 0.3148, + "step": 10899 + }, + { + "epoch": 5.153664302600473, + "grad_norm": 3.6685876846313477, + "learning_rate": 2.4526296929950565e-07, + "loss": 0.3534, + "step": 10900 + }, + { + "epoch": 5.1541371158392435, + "grad_norm": 3.508319139480591, + "learning_rate": 2.4499357386525534e-07, + "loss": 0.3375, + "step": 10901 + }, + { + "epoch": 5.154609929078014, + "grad_norm": 2.869081497192383, + "learning_rate": 2.4472431884036693e-07, + "loss": 0.3068, + "step": 10902 + }, + { + "epoch": 5.155082742316785, + "grad_norm": 3.3256514072418213, + "learning_rate": 2.444552042416062e-07, + "loss": 0.3254, + "step": 10903 + }, + { + "epoch": 5.155555555555556, + "grad_norm": 3.066772699356079, + "learning_rate": 2.441862300857298e-07, + "loss": 0.2726, + "step": 10904 + }, + { + "epoch": 5.156028368794326, + "grad_norm": 3.2044811248779297, + "learning_rate": 2.4391739638948476e-07, + "loss": 0.3227, + "step": 10905 + }, + { + "epoch": 5.156501182033097, + "grad_norm": 3.2946972846984863, + "learning_rate": 2.4364870316961135e-07, + "loss": 0.3755, + "step": 10906 + }, + { + "epoch": 5.156973995271867, + "grad_norm": 3.8998544216156006, + "learning_rate": 2.43380150442839e-07, + "loss": 0.2534, + "step": 10907 + }, + { + "epoch": 5.157446808510638, + "grad_norm": 3.953619956970215, + "learning_rate": 2.431117382258902e-07, + "loss": 0.2846, + "step": 10908 + }, + { + "epoch": 5.157919621749409, + "grad_norm": 3.1574389934539795, + "learning_rate": 2.42843466535477e-07, + "loss": 0.2833, + "step": 10909 + }, + { + "epoch": 5.15839243498818, + "grad_norm": 3.2663321495056152, + "learning_rate": 2.425753353883037e-07, + 
"loss": 0.3256, + "step": 10910 + }, + { + "epoch": 5.158865248226951, + "grad_norm": 3.3251729011535645, + "learning_rate": 2.42307344801066e-07, + "loss": 0.3335, + "step": 10911 + }, + { + "epoch": 5.159338061465721, + "grad_norm": 2.856858253479004, + "learning_rate": 2.420394947904503e-07, + "loss": 0.3167, + "step": 10912 + }, + { + "epoch": 5.159810874704492, + "grad_norm": 3.1201767921447754, + "learning_rate": 2.4177178537313383e-07, + "loss": 0.3299, + "step": 10913 + }, + { + "epoch": 5.160283687943262, + "grad_norm": 3.229917049407959, + "learning_rate": 2.4150421656578686e-07, + "loss": 0.3436, + "step": 10914 + }, + { + "epoch": 5.160756501182033, + "grad_norm": 3.437685012817383, + "learning_rate": 2.412367883850686e-07, + "loss": 0.3346, + "step": 10915 + }, + { + "epoch": 5.161229314420804, + "grad_norm": 3.7792129516601562, + "learning_rate": 2.4096950084763175e-07, + "loss": 0.378, + "step": 10916 + }, + { + "epoch": 5.1617021276595745, + "grad_norm": 3.606621503829956, + "learning_rate": 2.4070235397011837e-07, + "loss": 0.3108, + "step": 10917 + }, + { + "epoch": 5.162174940898345, + "grad_norm": 3.6572203636169434, + "learning_rate": 2.404353477691626e-07, + "loss": 0.3648, + "step": 10918 + }, + { + "epoch": 5.162647754137116, + "grad_norm": 2.890166759490967, + "learning_rate": 2.4016848226139004e-07, + "loss": 0.3349, + "step": 10919 + }, + { + "epoch": 5.163120567375887, + "grad_norm": 3.198918342590332, + "learning_rate": 2.399017574634166e-07, + "loss": 0.3144, + "step": 10920 + }, + { + "epoch": 5.163593380614657, + "grad_norm": 3.5493834018707275, + "learning_rate": 2.3963517339185063e-07, + "loss": 0.3892, + "step": 10921 + }, + { + "epoch": 5.164066193853428, + "grad_norm": 3.3824760913848877, + "learning_rate": 2.393687300632913e-07, + "loss": 0.3246, + "step": 10922 + }, + { + "epoch": 5.164539007092198, + "grad_norm": 2.872697591781616, + "learning_rate": 2.3910242749432793e-07, + "loss": 0.2936, + "step": 10923 + }, + { + "epoch": 5.165011820330969, + "grad_norm": 3.121612548828125, + "learning_rate": 2.3883626570154333e-07, + "loss": 0.3245, + "step": 10924 + }, + { + "epoch": 5.16548463356974, + "grad_norm": 3.810166835784912, + "learning_rate": 2.3857024470150896e-07, + "loss": 0.362, + "step": 10925 + }, + { + "epoch": 5.165957446808511, + "grad_norm": 2.994041681289673, + "learning_rate": 2.3830436451078992e-07, + "loss": 0.2917, + "step": 10926 + }, + { + "epoch": 5.166430260047282, + "grad_norm": 3.181262254714966, + "learning_rate": 2.3803862514594073e-07, + "loss": 0.3083, + "step": 10927 + }, + { + "epoch": 5.166903073286052, + "grad_norm": 3.463592052459717, + "learning_rate": 2.377730266235076e-07, + "loss": 0.3156, + "step": 10928 + }, + { + "epoch": 5.167375886524822, + "grad_norm": 3.687401056289673, + "learning_rate": 2.3750756896002898e-07, + "loss": 0.3474, + "step": 10929 + }, + { + "epoch": 5.167848699763593, + "grad_norm": 3.6340384483337402, + "learning_rate": 2.3724225217203302e-07, + "loss": 0.3602, + "step": 10930 + }, + { + "epoch": 5.168321513002364, + "grad_norm": 2.7620654106140137, + "learning_rate": 2.369770762760404e-07, + "loss": 0.3001, + "step": 10931 + }, + { + "epoch": 5.168794326241135, + "grad_norm": 3.4769256114959717, + "learning_rate": 2.3671204128856207e-07, + "loss": 0.3518, + "step": 10932 + }, + { + "epoch": 5.1692671394799055, + "grad_norm": 3.533514976501465, + "learning_rate": 2.3644714722610097e-07, + "loss": 0.3631, + "step": 10933 + }, + { + "epoch": 5.169739952718676, + "grad_norm": 2.992804765701294, 
+ "learning_rate": 2.3618239410515053e-07, + "loss": 0.3175, + "step": 10934 + }, + { + "epoch": 5.170212765957447, + "grad_norm": 2.7561144828796387, + "learning_rate": 2.3591778194219539e-07, + "loss": 0.3038, + "step": 10935 + }, + { + "epoch": 5.170685579196218, + "grad_norm": 3.2408576011657715, + "learning_rate": 2.356533107537126e-07, + "loss": 0.3015, + "step": 10936 + }, + { + "epoch": 5.171158392434988, + "grad_norm": 3.2628087997436523, + "learning_rate": 2.3538898055616932e-07, + "loss": 0.3084, + "step": 10937 + }, + { + "epoch": 5.171631205673759, + "grad_norm": 3.4984447956085205, + "learning_rate": 2.35124791366024e-07, + "loss": 0.354, + "step": 10938 + }, + { + "epoch": 5.172104018912529, + "grad_norm": 3.113236904144287, + "learning_rate": 2.3486074319972684e-07, + "loss": 0.3384, + "step": 10939 + }, + { + "epoch": 5.1725768321513, + "grad_norm": 3.231961250305176, + "learning_rate": 2.3459683607371914e-07, + "loss": 0.3153, + "step": 10940 + }, + { + "epoch": 5.173049645390071, + "grad_norm": 3.6173667907714844, + "learning_rate": 2.3433307000443244e-07, + "loss": 0.3237, + "step": 10941 + }, + { + "epoch": 5.173522458628842, + "grad_norm": 3.2845072746276855, + "learning_rate": 2.3406944500829116e-07, + "loss": 0.3672, + "step": 10942 + }, + { + "epoch": 5.1739952718676125, + "grad_norm": 3.26304030418396, + "learning_rate": 2.3380596110170932e-07, + "loss": 0.3485, + "step": 10943 + }, + { + "epoch": 5.174468085106383, + "grad_norm": 3.4171030521392822, + "learning_rate": 2.3354261830109388e-07, + "loss": 0.3617, + "step": 10944 + }, + { + "epoch": 5.174940898345153, + "grad_norm": 3.248331308364868, + "learning_rate": 2.3327941662284136e-07, + "loss": 0.3254, + "step": 10945 + }, + { + "epoch": 5.175413711583924, + "grad_norm": 2.9847238063812256, + "learning_rate": 2.3301635608333983e-07, + "loss": 0.2844, + "step": 10946 + }, + { + "epoch": 5.175886524822695, + "grad_norm": 3.0677788257598877, + "learning_rate": 2.3275343669897004e-07, + "loss": 0.3071, + "step": 10947 + }, + { + "epoch": 5.176359338061466, + "grad_norm": 3.5211243629455566, + "learning_rate": 2.3249065848610198e-07, + "loss": 0.2984, + "step": 10948 + }, + { + "epoch": 5.176832151300236, + "grad_norm": 3.17779803276062, + "learning_rate": 2.3222802146109753e-07, + "loss": 0.2971, + "step": 10949 + }, + { + "epoch": 5.177304964539007, + "grad_norm": 2.7590653896331787, + "learning_rate": 2.3196552564031082e-07, + "loss": 0.2786, + "step": 10950 + }, + { + "epoch": 5.177777777777778, + "grad_norm": 2.994401693344116, + "learning_rate": 2.317031710400855e-07, + "loss": 0.2987, + "step": 10951 + }, + { + "epoch": 5.178250591016549, + "grad_norm": 3.2990148067474365, + "learning_rate": 2.314409576767579e-07, + "loss": 0.3324, + "step": 10952 + }, + { + "epoch": 5.178723404255319, + "grad_norm": 2.8851301670074463, + "learning_rate": 2.3117888556665386e-07, + "loss": 0.3067, + "step": 10953 + }, + { + "epoch": 5.1791962174940895, + "grad_norm": 3.1797075271606445, + "learning_rate": 2.3091695472609255e-07, + "loss": 0.2849, + "step": 10954 + }, + { + "epoch": 5.17966903073286, + "grad_norm": 3.2816412448883057, + "learning_rate": 2.3065516517138286e-07, + "loss": 0.3587, + "step": 10955 + }, + { + "epoch": 5.180141843971631, + "grad_norm": 3.3883779048919678, + "learning_rate": 2.3039351691882511e-07, + "loss": 0.386, + "step": 10956 + }, + { + "epoch": 5.180614657210402, + "grad_norm": 3.026245355606079, + "learning_rate": 2.3013200998471124e-07, + "loss": 0.2924, + "step": 10957 + }, + { + "epoch": 
5.181087470449173, + "grad_norm": 3.2695093154907227, + "learning_rate": 2.298706443853238e-07, + "loss": 0.3446, + "step": 10958 + }, + { + "epoch": 5.1815602836879435, + "grad_norm": 3.4874610900878906, + "learning_rate": 2.2960942013693727e-07, + "loss": 0.3506, + "step": 10959 + }, + { + "epoch": 5.182033096926714, + "grad_norm": 3.31487774848938, + "learning_rate": 2.2934833725581695e-07, + "loss": 0.2892, + "step": 10960 + }, + { + "epoch": 5.182505910165484, + "grad_norm": 3.3388876914978027, + "learning_rate": 2.2908739575821876e-07, + "loss": 0.3508, + "step": 10961 + }, + { + "epoch": 5.182978723404255, + "grad_norm": 3.380781888961792, + "learning_rate": 2.2882659566039133e-07, + "loss": 0.317, + "step": 10962 + }, + { + "epoch": 5.183451536643026, + "grad_norm": 3.557917356491089, + "learning_rate": 2.285659369785728e-07, + "loss": 0.4085, + "step": 10963 + }, + { + "epoch": 5.183924349881797, + "grad_norm": 2.9980416297912598, + "learning_rate": 2.28305419728993e-07, + "loss": 0.3078, + "step": 10964 + }, + { + "epoch": 5.184397163120567, + "grad_norm": 3.1981256008148193, + "learning_rate": 2.280450439278742e-07, + "loss": 0.3257, + "step": 10965 + }, + { + "epoch": 5.184869976359338, + "grad_norm": 3.2910351753234863, + "learning_rate": 2.2778480959142841e-07, + "loss": 0.3591, + "step": 10966 + }, + { + "epoch": 5.185342789598109, + "grad_norm": 3.043903112411499, + "learning_rate": 2.2752471673585878e-07, + "loss": 0.316, + "step": 10967 + }, + { + "epoch": 5.18581560283688, + "grad_norm": 2.654751777648926, + "learning_rate": 2.2726476537736076e-07, + "loss": 0.2766, + "step": 10968 + }, + { + "epoch": 5.18628841607565, + "grad_norm": 2.8760786056518555, + "learning_rate": 2.2700495553211966e-07, + "loss": 0.2834, + "step": 10969 + }, + { + "epoch": 5.1867612293144205, + "grad_norm": 3.352128505706787, + "learning_rate": 2.2674528721631368e-07, + "loss": 0.3811, + "step": 10970 + }, + { + "epoch": 5.187234042553191, + "grad_norm": 3.2096493244171143, + "learning_rate": 2.2648576044611043e-07, + "loss": 0.3684, + "step": 10971 + }, + { + "epoch": 5.187706855791962, + "grad_norm": 3.0004148483276367, + "learning_rate": 2.2622637523767005e-07, + "loss": 0.285, + "step": 10972 + }, + { + "epoch": 5.188179669030733, + "grad_norm": 3.2039647102355957, + "learning_rate": 2.2596713160714324e-07, + "loss": 0.3276, + "step": 10973 + }, + { + "epoch": 5.188652482269504, + "grad_norm": 3.132408857345581, + "learning_rate": 2.2570802957067124e-07, + "loss": 0.3127, + "step": 10974 + }, + { + "epoch": 5.1891252955082745, + "grad_norm": 3.1553800106048584, + "learning_rate": 2.2544906914438808e-07, + "loss": 0.3257, + "step": 10975 + }, + { + "epoch": 5.189598108747045, + "grad_norm": 2.853139877319336, + "learning_rate": 2.2519025034441755e-07, + "loss": 0.2917, + "step": 10976 + }, + { + "epoch": 5.190070921985815, + "grad_norm": 3.3710334300994873, + "learning_rate": 2.249315731868759e-07, + "loss": 0.2887, + "step": 10977 + }, + { + "epoch": 5.190543735224586, + "grad_norm": 2.9652931690216064, + "learning_rate": 2.246730376878692e-07, + "loss": 0.3032, + "step": 10978 + }, + { + "epoch": 5.191016548463357, + "grad_norm": 3.727853536605835, + "learning_rate": 2.2441464386349505e-07, + "loss": 0.3195, + "step": 10979 + }, + { + "epoch": 5.191489361702128, + "grad_norm": 3.270364999771118, + "learning_rate": 2.241563917298431e-07, + "loss": 0.3178, + "step": 10980 + }, + { + "epoch": 5.191962174940898, + "grad_norm": 3.409079074859619, + "learning_rate": 2.2389828130299358e-07, + 
"loss": 0.3831, + "step": 10981 + }, + { + "epoch": 5.192434988179669, + "grad_norm": 3.069596529006958, + "learning_rate": 2.2364031259901776e-07, + "loss": 0.2832, + "step": 10982 + }, + { + "epoch": 5.19290780141844, + "grad_norm": 3.050508499145508, + "learning_rate": 2.233824856339778e-07, + "loss": 0.3265, + "step": 10983 + }, + { + "epoch": 5.193380614657211, + "grad_norm": 3.175166130065918, + "learning_rate": 2.2312480042392755e-07, + "loss": 0.3027, + "step": 10984 + }, + { + "epoch": 5.193853427895981, + "grad_norm": 3.2861170768737793, + "learning_rate": 2.2286725698491274e-07, + "loss": 0.3604, + "step": 10985 + }, + { + "epoch": 5.1943262411347515, + "grad_norm": 3.084089756011963, + "learning_rate": 2.2260985533296893e-07, + "loss": 0.2893, + "step": 10986 + }, + { + "epoch": 5.194799054373522, + "grad_norm": 2.9948923587799072, + "learning_rate": 2.2235259548412268e-07, + "loss": 0.2897, + "step": 10987 + }, + { + "epoch": 5.195271867612293, + "grad_norm": 2.8823280334472656, + "learning_rate": 2.2209547745439375e-07, + "loss": 0.3006, + "step": 10988 + }, + { + "epoch": 5.195744680851064, + "grad_norm": 3.556781053543091, + "learning_rate": 2.2183850125979066e-07, + "loss": 0.3501, + "step": 10989 + }, + { + "epoch": 5.196217494089835, + "grad_norm": 3.4599523544311523, + "learning_rate": 2.2158166691631483e-07, + "loss": 0.3735, + "step": 10990 + }, + { + "epoch": 5.1966903073286055, + "grad_norm": 2.926719903945923, + "learning_rate": 2.2132497443995843e-07, + "loss": 0.3082, + "step": 10991 + }, + { + "epoch": 5.197163120567376, + "grad_norm": 3.20700740814209, + "learning_rate": 2.210684238467037e-07, + "loss": 0.2761, + "step": 10992 + }, + { + "epoch": 5.197635933806146, + "grad_norm": 2.722289800643921, + "learning_rate": 2.208120151525256e-07, + "loss": 0.2892, + "step": 10993 + }, + { + "epoch": 5.198108747044917, + "grad_norm": 2.9280753135681152, + "learning_rate": 2.2055574837338916e-07, + "loss": 0.3129, + "step": 10994 + }, + { + "epoch": 5.198581560283688, + "grad_norm": 3.241697311401367, + "learning_rate": 2.2029962352525137e-07, + "loss": 0.3287, + "step": 10995 + }, + { + "epoch": 5.199054373522459, + "grad_norm": 3.356199264526367, + "learning_rate": 2.2004364062405997e-07, + "loss": 0.2941, + "step": 10996 + }, + { + "epoch": 5.199527186761229, + "grad_norm": 3.0350935459136963, + "learning_rate": 2.1978779968575392e-07, + "loss": 0.2941, + "step": 10997 + }, + { + "epoch": 5.2, + "grad_norm": 2.901796340942383, + "learning_rate": 2.1953210072626297e-07, + "loss": 0.2897, + "step": 10998 + }, + { + "epoch": 5.200472813238771, + "grad_norm": 3.085937976837158, + "learning_rate": 2.1927654376150826e-07, + "loss": 0.2953, + "step": 10999 + }, + { + "epoch": 5.200945626477542, + "grad_norm": 3.3783273696899414, + "learning_rate": 2.190211288074029e-07, + "loss": 0.3429, + "step": 11000 + }, + { + "epoch": 5.201418439716312, + "grad_norm": 3.5308191776275635, + "learning_rate": 2.1876585587984999e-07, + "loss": 0.3742, + "step": 11001 + }, + { + "epoch": 5.2018912529550825, + "grad_norm": 3.7020623683929443, + "learning_rate": 2.185107249947438e-07, + "loss": 0.3477, + "step": 11002 + }, + { + "epoch": 5.202364066193853, + "grad_norm": 3.2107656002044678, + "learning_rate": 2.182557361679713e-07, + "loss": 0.3137, + "step": 11003 + }, + { + "epoch": 5.202836879432624, + "grad_norm": 3.5506410598754883, + "learning_rate": 2.180008894154087e-07, + "loss": 0.3594, + "step": 11004 + }, + { + "epoch": 5.203309692671395, + "grad_norm": 3.173083543777466, + 
"learning_rate": 2.177461847529247e-07, + "loss": 0.3096, + "step": 11005 + }, + { + "epoch": 5.203782505910166, + "grad_norm": 3.69927716255188, + "learning_rate": 2.1749162219637827e-07, + "loss": 0.3383, + "step": 11006 + }, + { + "epoch": 5.2042553191489365, + "grad_norm": 3.133638620376587, + "learning_rate": 2.1723720176161978e-07, + "loss": 0.3013, + "step": 11007 + }, + { + "epoch": 5.204728132387707, + "grad_norm": 3.6174583435058594, + "learning_rate": 2.1698292346449158e-07, + "loss": 0.3753, + "step": 11008 + }, + { + "epoch": 5.205200945626477, + "grad_norm": 3.191314458847046, + "learning_rate": 2.1672878732082596e-07, + "loss": 0.3121, + "step": 11009 + }, + { + "epoch": 5.205673758865248, + "grad_norm": 3.505059242248535, + "learning_rate": 2.1647479334644646e-07, + "loss": 0.3372, + "step": 11010 + }, + { + "epoch": 5.206146572104019, + "grad_norm": 3.329693555831909, + "learning_rate": 2.1622094155716894e-07, + "loss": 0.3221, + "step": 11011 + }, + { + "epoch": 5.20661938534279, + "grad_norm": 3.3008434772491455, + "learning_rate": 2.1596723196879942e-07, + "loss": 0.2895, + "step": 11012 + }, + { + "epoch": 5.20709219858156, + "grad_norm": 3.349616527557373, + "learning_rate": 2.1571366459713472e-07, + "loss": 0.3413, + "step": 11013 + }, + { + "epoch": 5.207565011820331, + "grad_norm": 2.940938949584961, + "learning_rate": 2.154602394579644e-07, + "loss": 0.2919, + "step": 11014 + }, + { + "epoch": 5.208037825059102, + "grad_norm": 3.792189598083496, + "learning_rate": 2.1520695656706725e-07, + "loss": 0.318, + "step": 11015 + }, + { + "epoch": 5.208510638297873, + "grad_norm": 3.4563612937927246, + "learning_rate": 2.149538159402148e-07, + "loss": 0.3264, + "step": 11016 + }, + { + "epoch": 5.208983451536643, + "grad_norm": 2.9630939960479736, + "learning_rate": 2.1470081759316779e-07, + "loss": 0.2884, + "step": 11017 + }, + { + "epoch": 5.2094562647754135, + "grad_norm": 3.3553643226623535, + "learning_rate": 2.1444796154168086e-07, + "loss": 0.2755, + "step": 11018 + }, + { + "epoch": 5.209929078014184, + "grad_norm": 3.1282877922058105, + "learning_rate": 2.1419524780149748e-07, + "loss": 0.3648, + "step": 11019 + }, + { + "epoch": 5.210401891252955, + "grad_norm": 3.1164352893829346, + "learning_rate": 2.139426763883523e-07, + "loss": 0.3363, + "step": 11020 + }, + { + "epoch": 5.210874704491726, + "grad_norm": 3.6382009983062744, + "learning_rate": 2.1369024731797334e-07, + "loss": 0.3696, + "step": 11021 + }, + { + "epoch": 5.211347517730497, + "grad_norm": 3.323014497756958, + "learning_rate": 2.1343796060607714e-07, + "loss": 0.3995, + "step": 11022 + }, + { + "epoch": 5.2118203309692674, + "grad_norm": 3.161573648452759, + "learning_rate": 2.1318581626837308e-07, + "loss": 0.304, + "step": 11023 + }, + { + "epoch": 5.212293144208038, + "grad_norm": 3.022866725921631, + "learning_rate": 2.1293381432056116e-07, + "loss": 0.3012, + "step": 11024 + }, + { + "epoch": 5.212765957446808, + "grad_norm": 3.100696563720703, + "learning_rate": 2.1268195477833152e-07, + "loss": 0.2727, + "step": 11025 + }, + { + "epoch": 5.213238770685579, + "grad_norm": 3.2253260612487793, + "learning_rate": 2.1243023765736725e-07, + "loss": 0.2991, + "step": 11026 + }, + { + "epoch": 5.21371158392435, + "grad_norm": 2.9985997676849365, + "learning_rate": 2.1217866297334105e-07, + "loss": 0.3228, + "step": 11027 + }, + { + "epoch": 5.2141843971631205, + "grad_norm": 3.197525978088379, + "learning_rate": 2.1192723074191817e-07, + "loss": 0.3843, + "step": 11028 + }, + { + "epoch": 
5.214657210401891, + "grad_norm": 3.6619510650634766, + "learning_rate": 2.116759409787539e-07, + "loss": 0.3678, + "step": 11029 + }, + { + "epoch": 5.215130023640662, + "grad_norm": 3.445089340209961, + "learning_rate": 2.1142479369949454e-07, + "loss": 0.3212, + "step": 11030 + }, + { + "epoch": 5.215602836879433, + "grad_norm": 3.347994565963745, + "learning_rate": 2.1117378891977847e-07, + "loss": 0.3505, + "step": 11031 + }, + { + "epoch": 5.216075650118204, + "grad_norm": 3.0974318981170654, + "learning_rate": 2.10922926655234e-07, + "loss": 0.312, + "step": 11032 + }, + { + "epoch": 5.216548463356974, + "grad_norm": 3.206122398376465, + "learning_rate": 2.1067220692148143e-07, + "loss": 0.3245, + "step": 11033 + }, + { + "epoch": 5.217021276595744, + "grad_norm": 3.105717897415161, + "learning_rate": 2.1042162973413244e-07, + "loss": 0.3045, + "step": 11034 + }, + { + "epoch": 5.217494089834515, + "grad_norm": 3.065800905227661, + "learning_rate": 2.101711951087887e-07, + "loss": 0.3078, + "step": 11035 + }, + { + "epoch": 5.217966903073286, + "grad_norm": 3.374640464782715, + "learning_rate": 2.099209030610444e-07, + "loss": 0.3105, + "step": 11036 + }, + { + "epoch": 5.218439716312057, + "grad_norm": 3.3342158794403076, + "learning_rate": 2.0967075360648375e-07, + "loss": 0.3075, + "step": 11037 + }, + { + "epoch": 5.218912529550828, + "grad_norm": 2.6468522548675537, + "learning_rate": 2.0942074676068208e-07, + "loss": 0.3, + "step": 11038 + }, + { + "epoch": 5.219385342789598, + "grad_norm": 2.8999321460723877, + "learning_rate": 2.091708825392072e-07, + "loss": 0.325, + "step": 11039 + }, + { + "epoch": 5.219858156028369, + "grad_norm": 3.4187169075012207, + "learning_rate": 2.0892116095761584e-07, + "loss": 0.3048, + "step": 11040 + }, + { + "epoch": 5.220330969267139, + "grad_norm": 4.025125503540039, + "learning_rate": 2.0867158203145832e-07, + "loss": 0.3412, + "step": 11041 + }, + { + "epoch": 5.22080378250591, + "grad_norm": 3.3094186782836914, + "learning_rate": 2.0842214577627418e-07, + "loss": 0.3174, + "step": 11042 + }, + { + "epoch": 5.221276595744681, + "grad_norm": 3.254117488861084, + "learning_rate": 2.0817285220759432e-07, + "loss": 0.3346, + "step": 11043 + }, + { + "epoch": 5.2217494089834515, + "grad_norm": 3.184781074523926, + "learning_rate": 2.0792370134094186e-07, + "loss": 0.3554, + "step": 11044 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 3.116020917892456, + "learning_rate": 2.0767469319183026e-07, + "loss": 0.338, + "step": 11045 + }, + { + "epoch": 5.222695035460993, + "grad_norm": 3.0979223251342773, + "learning_rate": 2.0742582777576376e-07, + "loss": 0.3223, + "step": 11046 + }, + { + "epoch": 5.223167848699764, + "grad_norm": 3.083552837371826, + "learning_rate": 2.0717710510823857e-07, + "loss": 0.2609, + "step": 11047 + }, + { + "epoch": 5.223640661938534, + "grad_norm": 3.228064775466919, + "learning_rate": 2.0692852520474066e-07, + "loss": 0.3362, + "step": 11048 + }, + { + "epoch": 5.224113475177305, + "grad_norm": 3.6322951316833496, + "learning_rate": 2.066800880807493e-07, + "loss": 0.3319, + "step": 11049 + }, + { + "epoch": 5.224586288416075, + "grad_norm": 3.0053210258483887, + "learning_rate": 2.064317937517324e-07, + "loss": 0.2936, + "step": 11050 + }, + { + "epoch": 5.225059101654846, + "grad_norm": 3.0131611824035645, + "learning_rate": 2.0618364223315118e-07, + "loss": 0.3449, + "step": 11051 + }, + { + "epoch": 5.225531914893617, + "grad_norm": 2.7796759605407715, + "learning_rate": 2.0593563354045638e-07, + "loss": 
0.3093, + "step": 11052 + }, + { + "epoch": 5.226004728132388, + "grad_norm": 3.0809054374694824, + "learning_rate": 2.0568776768909032e-07, + "loss": 0.3143, + "step": 11053 + }, + { + "epoch": 5.226477541371159, + "grad_norm": 3.3395402431488037, + "learning_rate": 2.054400446944868e-07, + "loss": 0.3055, + "step": 11054 + }, + { + "epoch": 5.226950354609929, + "grad_norm": 3.47823166847229, + "learning_rate": 2.0519246457207016e-07, + "loss": 0.3361, + "step": 11055 + }, + { + "epoch": 5.2274231678487, + "grad_norm": 3.4772236347198486, + "learning_rate": 2.0494502733725663e-07, + "loss": 0.2907, + "step": 11056 + }, + { + "epoch": 5.22789598108747, + "grad_norm": 3.7166171073913574, + "learning_rate": 2.046977330054528e-07, + "loss": 0.3157, + "step": 11057 + }, + { + "epoch": 5.228368794326241, + "grad_norm": 3.654662609100342, + "learning_rate": 2.044505815920564e-07, + "loss": 0.3212, + "step": 11058 + }, + { + "epoch": 5.228841607565012, + "grad_norm": 3.1599886417388916, + "learning_rate": 2.042035731124567e-07, + "loss": 0.3675, + "step": 11059 + }, + { + "epoch": 5.2293144208037825, + "grad_norm": 3.1967053413391113, + "learning_rate": 2.039567075820337e-07, + "loss": 0.3512, + "step": 11060 + }, + { + "epoch": 5.229787234042553, + "grad_norm": 3.4255285263061523, + "learning_rate": 2.0370998501615862e-07, + "loss": 0.3127, + "step": 11061 + }, + { + "epoch": 5.230260047281324, + "grad_norm": 3.137645959854126, + "learning_rate": 2.03463405430194e-07, + "loss": 0.3265, + "step": 11062 + }, + { + "epoch": 5.230732860520095, + "grad_norm": 4.049947261810303, + "learning_rate": 2.0321696883949277e-07, + "loss": 0.413, + "step": 11063 + }, + { + "epoch": 5.231205673758865, + "grad_norm": 3.052638530731201, + "learning_rate": 2.0297067525940017e-07, + "loss": 0.3051, + "step": 11064 + }, + { + "epoch": 5.231678486997636, + "grad_norm": 3.339456796646118, + "learning_rate": 2.0272452470525144e-07, + "loss": 0.2821, + "step": 11065 + }, + { + "epoch": 5.232151300236406, + "grad_norm": 3.2044780254364014, + "learning_rate": 2.0247851719237266e-07, + "loss": 0.2997, + "step": 11066 + }, + { + "epoch": 5.232624113475177, + "grad_norm": 2.9565982818603516, + "learning_rate": 2.0223265273608295e-07, + "loss": 0.3197, + "step": 11067 + }, + { + "epoch": 5.233096926713948, + "grad_norm": 3.1882998943328857, + "learning_rate": 2.0198693135169007e-07, + "loss": 0.3151, + "step": 11068 + }, + { + "epoch": 5.233569739952719, + "grad_norm": 3.3345093727111816, + "learning_rate": 2.017413530544951e-07, + "loss": 0.3441, + "step": 11069 + }, + { + "epoch": 5.23404255319149, + "grad_norm": 2.9199023246765137, + "learning_rate": 2.0149591785978835e-07, + "loss": 0.2556, + "step": 11070 + }, + { + "epoch": 5.23451536643026, + "grad_norm": 3.177485466003418, + "learning_rate": 2.0125062578285143e-07, + "loss": 0.3017, + "step": 11071 + }, + { + "epoch": 5.234988179669031, + "grad_norm": 3.4713990688323975, + "learning_rate": 2.010054768389591e-07, + "loss": 0.3658, + "step": 11072 + }, + { + "epoch": 5.235460992907801, + "grad_norm": 3.084087371826172, + "learning_rate": 2.0076047104337443e-07, + "loss": 0.315, + "step": 11073 + }, + { + "epoch": 5.235933806146572, + "grad_norm": 3.1807777881622314, + "learning_rate": 2.0051560841135382e-07, + "loss": 0.3464, + "step": 11074 + }, + { + "epoch": 5.236406619385343, + "grad_norm": 2.933351993560791, + "learning_rate": 2.002708889581434e-07, + "loss": 0.2666, + "step": 11075 + }, + { + "epoch": 5.2368794326241135, + "grad_norm": 3.0530927181243896, + 
"learning_rate": 2.000263126989807e-07, + "loss": 0.3033, + "step": 11076 + }, + { + "epoch": 5.237352245862884, + "grad_norm": 2.93255615234375, + "learning_rate": 1.997818796490944e-07, + "loss": 0.2745, + "step": 11077 + }, + { + "epoch": 5.237825059101655, + "grad_norm": 3.159719944000244, + "learning_rate": 1.995375898237037e-07, + "loss": 0.2827, + "step": 11078 + }, + { + "epoch": 5.238297872340426, + "grad_norm": 2.9349069595336914, + "learning_rate": 1.9929344323802057e-07, + "loss": 0.2904, + "step": 11079 + }, + { + "epoch": 5.238770685579196, + "grad_norm": 3.3207859992980957, + "learning_rate": 1.990494399072465e-07, + "loss": 0.314, + "step": 11080 + }, + { + "epoch": 5.239243498817967, + "grad_norm": 3.2512917518615723, + "learning_rate": 1.9880557984657406e-07, + "loss": 0.3448, + "step": 11081 + }, + { + "epoch": 5.239716312056737, + "grad_norm": 3.1114752292633057, + "learning_rate": 1.985618630711883e-07, + "loss": 0.298, + "step": 11082 + }, + { + "epoch": 5.240189125295508, + "grad_norm": 3.120422840118408, + "learning_rate": 1.9831828959626348e-07, + "loss": 0.3168, + "step": 11083 + }, + { + "epoch": 5.240661938534279, + "grad_norm": 3.0151801109313965, + "learning_rate": 1.9807485943696608e-07, + "loss": 0.3034, + "step": 11084 + }, + { + "epoch": 5.24113475177305, + "grad_norm": 3.2933578491210938, + "learning_rate": 1.978315726084537e-07, + "loss": 0.3404, + "step": 11085 + }, + { + "epoch": 5.241607565011821, + "grad_norm": 2.931190252304077, + "learning_rate": 1.9758842912587423e-07, + "loss": 0.3329, + "step": 11086 + }, + { + "epoch": 5.242080378250591, + "grad_norm": 3.5645482540130615, + "learning_rate": 1.9734542900436804e-07, + "loss": 0.3416, + "step": 11087 + }, + { + "epoch": 5.242553191489361, + "grad_norm": 2.992359161376953, + "learning_rate": 1.9710257225906497e-07, + "loss": 0.3187, + "step": 11088 + }, + { + "epoch": 5.243026004728132, + "grad_norm": 2.9263901710510254, + "learning_rate": 1.9685985890508657e-07, + "loss": 0.2672, + "step": 11089 + }, + { + "epoch": 5.243498817966903, + "grad_norm": 3.1584653854370117, + "learning_rate": 1.966172889575463e-07, + "loss": 0.2854, + "step": 11090 + }, + { + "epoch": 5.243971631205674, + "grad_norm": 3.632632255554199, + "learning_rate": 1.9637486243154676e-07, + "loss": 0.328, + "step": 11091 + }, + { + "epoch": 5.2444444444444445, + "grad_norm": 3.4155561923980713, + "learning_rate": 1.9613257934218393e-07, + "loss": 0.321, + "step": 11092 + }, + { + "epoch": 5.244917257683215, + "grad_norm": 2.820805549621582, + "learning_rate": 1.958904397045433e-07, + "loss": 0.3102, + "step": 11093 + }, + { + "epoch": 5.245390070921986, + "grad_norm": 3.3208353519439697, + "learning_rate": 1.9564844353370167e-07, + "loss": 0.363, + "step": 11094 + }, + { + "epoch": 5.245862884160757, + "grad_norm": 3.082474708557129, + "learning_rate": 1.9540659084472724e-07, + "loss": 0.3116, + "step": 11095 + }, + { + "epoch": 5.246335697399527, + "grad_norm": 3.274022102355957, + "learning_rate": 1.9516488165267854e-07, + "loss": 0.31, + "step": 11096 + }, + { + "epoch": 5.246808510638298, + "grad_norm": 3.6933083534240723, + "learning_rate": 1.949233159726069e-07, + "loss": 0.3488, + "step": 11097 + }, + { + "epoch": 5.247281323877068, + "grad_norm": 3.25183367729187, + "learning_rate": 1.9468189381955272e-07, + "loss": 0.346, + "step": 11098 + }, + { + "epoch": 5.247754137115839, + "grad_norm": 3.847667694091797, + "learning_rate": 1.944406152085482e-07, + "loss": 0.3151, + "step": 11099 + }, + { + "epoch": 5.24822695035461, 
+ "grad_norm": 7.3193559646606445, + "learning_rate": 1.9419948015461737e-07, + "loss": 0.3774, + "step": 11100 + }, + { + "epoch": 5.248699763593381, + "grad_norm": 2.9225029945373535, + "learning_rate": 1.9395848867277383e-07, + "loss": 0.2671, + "step": 11101 + }, + { + "epoch": 5.2491725768321515, + "grad_norm": 3.179107189178467, + "learning_rate": 1.9371764077802417e-07, + "loss": 0.3126, + "step": 11102 + }, + { + "epoch": 5.249645390070922, + "grad_norm": 3.3329577445983887, + "learning_rate": 1.9347693648536392e-07, + "loss": 0.3322, + "step": 11103 + }, + { + "epoch": 5.250118203309692, + "grad_norm": 3.384031057357788, + "learning_rate": 1.9323637580978076e-07, + "loss": 0.3786, + "step": 11104 + }, + { + "epoch": 5.250591016548463, + "grad_norm": 3.2742369174957275, + "learning_rate": 1.9299595876625416e-07, + "loss": 0.2924, + "step": 11105 + }, + { + "epoch": 5.251063829787234, + "grad_norm": 3.4118731021881104, + "learning_rate": 1.927556853697532e-07, + "loss": 0.452, + "step": 11106 + }, + { + "epoch": 5.251536643026005, + "grad_norm": 3.110896110534668, + "learning_rate": 1.9251555563523816e-07, + "loss": 0.3204, + "step": 11107 + }, + { + "epoch": 5.2520094562647754, + "grad_norm": 3.403810977935791, + "learning_rate": 1.922755695776621e-07, + "loss": 0.357, + "step": 11108 + }, + { + "epoch": 5.252482269503546, + "grad_norm": 3.437980890274048, + "learning_rate": 1.9203572721196716e-07, + "loss": 0.3091, + "step": 11109 + }, + { + "epoch": 5.252955082742317, + "grad_norm": 3.0351603031158447, + "learning_rate": 1.917960285530876e-07, + "loss": 0.3458, + "step": 11110 + }, + { + "epoch": 5.253427895981088, + "grad_norm": 3.085078477859497, + "learning_rate": 1.9155647361594782e-07, + "loss": 0.3112, + "step": 11111 + }, + { + "epoch": 5.253900709219858, + "grad_norm": 3.1089236736297607, + "learning_rate": 1.9131706241546395e-07, + "loss": 0.3101, + "step": 11112 + }, + { + "epoch": 5.2543735224586285, + "grad_norm": 3.224823236465454, + "learning_rate": 1.9107779496654377e-07, + "loss": 0.3156, + "step": 11113 + }, + { + "epoch": 5.254846335697399, + "grad_norm": 3.4039080142974854, + "learning_rate": 1.9083867128408456e-07, + "loss": 0.2953, + "step": 11114 + }, + { + "epoch": 5.25531914893617, + "grad_norm": 2.9961628913879395, + "learning_rate": 1.9059969138297636e-07, + "loss": 0.2958, + "step": 11115 + }, + { + "epoch": 5.255791962174941, + "grad_norm": 3.0796079635620117, + "learning_rate": 1.9036085527809889e-07, + "loss": 0.3311, + "step": 11116 + }, + { + "epoch": 5.256264775413712, + "grad_norm": 3.2201931476593018, + "learning_rate": 1.9012216298432306e-07, + "loss": 0.3793, + "step": 11117 + }, + { + "epoch": 5.2567375886524825, + "grad_norm": 2.967545509338379, + "learning_rate": 1.89883614516512e-07, + "loss": 0.3513, + "step": 11118 + }, + { + "epoch": 5.257210401891253, + "grad_norm": 3.1672980785369873, + "learning_rate": 1.8964520988951852e-07, + "loss": 0.3064, + "step": 11119 + }, + { + "epoch": 5.257683215130023, + "grad_norm": 3.426481246948242, + "learning_rate": 1.894069491181874e-07, + "loss": 0.3429, + "step": 11120 + }, + { + "epoch": 5.258156028368794, + "grad_norm": 2.934448480606079, + "learning_rate": 1.8916883221735404e-07, + "loss": 0.3262, + "step": 11121 + }, + { + "epoch": 5.258628841607565, + "grad_norm": 2.945082426071167, + "learning_rate": 1.8893085920184433e-07, + "loss": 0.3318, + "step": 11122 + }, + { + "epoch": 5.259101654846336, + "grad_norm": 3.222621440887451, + "learning_rate": 1.886930300864767e-07, + "loss": 0.3538, + 
"step": 11123 + }, + { + "epoch": 5.259574468085106, + "grad_norm": 2.8572630882263184, + "learning_rate": 1.8845534488605933e-07, + "loss": 0.2634, + "step": 11124 + }, + { + "epoch": 5.260047281323877, + "grad_norm": 3.195322036743164, + "learning_rate": 1.88217803615392e-07, + "loss": 0.3027, + "step": 11125 + }, + { + "epoch": 5.260520094562648, + "grad_norm": 3.427511692047119, + "learning_rate": 1.8798040628926483e-07, + "loss": 0.3469, + "step": 11126 + }, + { + "epoch": 5.260992907801419, + "grad_norm": 3.2642531394958496, + "learning_rate": 1.8774315292245965e-07, + "loss": 0.3377, + "step": 11127 + }, + { + "epoch": 5.261465721040189, + "grad_norm": 3.043724298477173, + "learning_rate": 1.8750604352974987e-07, + "loss": 0.303, + "step": 11128 + }, + { + "epoch": 5.2619385342789595, + "grad_norm": 3.303520441055298, + "learning_rate": 1.8726907812589874e-07, + "loss": 0.3193, + "step": 11129 + }, + { + "epoch": 5.26241134751773, + "grad_norm": 4.076786994934082, + "learning_rate": 1.870322567256605e-07, + "loss": 0.3555, + "step": 11130 + }, + { + "epoch": 5.262884160756501, + "grad_norm": 3.092057466506958, + "learning_rate": 1.8679557934378202e-07, + "loss": 0.3223, + "step": 11131 + }, + { + "epoch": 5.263356973995272, + "grad_norm": 3.3769257068634033, + "learning_rate": 1.8655904599499952e-07, + "loss": 0.3498, + "step": 11132 + }, + { + "epoch": 5.263829787234043, + "grad_norm": 3.1874232292175293, + "learning_rate": 1.8632265669404126e-07, + "loss": 0.3292, + "step": 11133 + }, + { + "epoch": 5.2643026004728135, + "grad_norm": 3.0761327743530273, + "learning_rate": 1.8608641145562627e-07, + "loss": 0.293, + "step": 11134 + }, + { + "epoch": 5.264775413711584, + "grad_norm": 3.3254477977752686, + "learning_rate": 1.8585031029446364e-07, + "loss": 0.3414, + "step": 11135 + }, + { + "epoch": 5.265248226950354, + "grad_norm": 3.382084608078003, + "learning_rate": 1.8561435322525552e-07, + "loss": 0.3533, + "step": 11136 + }, + { + "epoch": 5.265721040189125, + "grad_norm": 3.598961114883423, + "learning_rate": 1.8537854026269259e-07, + "loss": 0.3581, + "step": 11137 + }, + { + "epoch": 5.266193853427896, + "grad_norm": 3.4922592639923096, + "learning_rate": 1.8514287142145926e-07, + "loss": 0.2492, + "step": 11138 + }, + { + "epoch": 5.266666666666667, + "grad_norm": 2.9022061824798584, + "learning_rate": 1.8490734671622905e-07, + "loss": 0.2886, + "step": 11139 + }, + { + "epoch": 5.267139479905437, + "grad_norm": 2.8640410900115967, + "learning_rate": 1.8467196616166694e-07, + "loss": 0.276, + "step": 11140 + }, + { + "epoch": 5.267612293144208, + "grad_norm": 3.756704092025757, + "learning_rate": 1.8443672977242922e-07, + "loss": 0.3003, + "step": 11141 + }, + { + "epoch": 5.268085106382979, + "grad_norm": 3.145064353942871, + "learning_rate": 1.8420163756316222e-07, + "loss": 0.2707, + "step": 11142 + }, + { + "epoch": 5.26855791962175, + "grad_norm": 3.542193651199341, + "learning_rate": 1.8396668954850538e-07, + "loss": 0.369, + "step": 11143 + }, + { + "epoch": 5.26903073286052, + "grad_norm": 3.2682089805603027, + "learning_rate": 1.8373188574308726e-07, + "loss": 0.3045, + "step": 11144 + }, + { + "epoch": 5.2695035460992905, + "grad_norm": 3.3174448013305664, + "learning_rate": 1.8349722616152753e-07, + "loss": 0.313, + "step": 11145 + }, + { + "epoch": 5.269976359338061, + "grad_norm": 5.076638221740723, + "learning_rate": 1.8326271081843873e-07, + "loss": 0.3667, + "step": 11146 + }, + { + "epoch": 5.270449172576832, + "grad_norm": 3.259685516357422, + 
"learning_rate": 1.830283397284216e-07, + "loss": 0.3415, + "step": 11147 + }, + { + "epoch": 5.270921985815603, + "grad_norm": 3.1790366172790527, + "learning_rate": 1.8279411290607064e-07, + "loss": 0.3193, + "step": 11148 + }, + { + "epoch": 5.271394799054374, + "grad_norm": 3.091585636138916, + "learning_rate": 1.8256003036596942e-07, + "loss": 0.3042, + "step": 11149 + }, + { + "epoch": 5.2718676122931445, + "grad_norm": 3.1135261058807373, + "learning_rate": 1.8232609212269326e-07, + "loss": 0.357, + "step": 11150 + }, + { + "epoch": 5.272340425531915, + "grad_norm": 3.1233057975769043, + "learning_rate": 1.8209229819080908e-07, + "loss": 0.3221, + "step": 11151 + }, + { + "epoch": 5.272813238770685, + "grad_norm": 3.1838364601135254, + "learning_rate": 1.8185864858487335e-07, + "loss": 0.3301, + "step": 11152 + }, + { + "epoch": 5.273286052009456, + "grad_norm": 3.0430169105529785, + "learning_rate": 1.816251433194352e-07, + "loss": 0.3051, + "step": 11153 + }, + { + "epoch": 5.273758865248227, + "grad_norm": 3.4865505695343018, + "learning_rate": 1.813917824090336e-07, + "loss": 0.3492, + "step": 11154 + }, + { + "epoch": 5.274231678486998, + "grad_norm": 3.3414416313171387, + "learning_rate": 1.8115856586819884e-07, + "loss": 0.3322, + "step": 11155 + }, + { + "epoch": 5.274704491725768, + "grad_norm": 3.0825676918029785, + "learning_rate": 1.8092549371145212e-07, + "loss": 0.3356, + "step": 11156 + }, + { + "epoch": 5.275177304964539, + "grad_norm": 3.4176392555236816, + "learning_rate": 1.8069256595330654e-07, + "loss": 0.3219, + "step": 11157 + }, + { + "epoch": 5.27565011820331, + "grad_norm": 3.1088545322418213, + "learning_rate": 1.8045978260826491e-07, + "loss": 0.3039, + "step": 11158 + }, + { + "epoch": 5.276122931442081, + "grad_norm": 3.428644895553589, + "learning_rate": 1.802271436908215e-07, + "loss": 0.3175, + "step": 11159 + }, + { + "epoch": 5.276595744680851, + "grad_norm": 2.932908296585083, + "learning_rate": 1.7999464921546196e-07, + "loss": 0.3166, + "step": 11160 + }, + { + "epoch": 5.2770685579196215, + "grad_norm": 3.0005784034729004, + "learning_rate": 1.7976229919666272e-07, + "loss": 0.2563, + "step": 11161 + }, + { + "epoch": 5.277541371158392, + "grad_norm": 2.9349775314331055, + "learning_rate": 1.7953009364889141e-07, + "loss": 0.3195, + "step": 11162 + }, + { + "epoch": 5.278014184397163, + "grad_norm": 3.6350808143615723, + "learning_rate": 1.792980325866059e-07, + "loss": 0.3435, + "step": 11163 + }, + { + "epoch": 5.278486997635934, + "grad_norm": 3.0954113006591797, + "learning_rate": 1.79066116024256e-07, + "loss": 0.3151, + "step": 11164 + }, + { + "epoch": 5.278959810874705, + "grad_norm": 3.31296443939209, + "learning_rate": 1.7883434397628186e-07, + "loss": 0.3618, + "step": 11165 + }, + { + "epoch": 5.2794326241134755, + "grad_norm": 2.9701640605926514, + "learning_rate": 1.7860271645711552e-07, + "loss": 0.3066, + "step": 11166 + }, + { + "epoch": 5.279905437352246, + "grad_norm": 2.9575884342193604, + "learning_rate": 1.7837123348117906e-07, + "loss": 0.3099, + "step": 11167 + }, + { + "epoch": 5.280378250591016, + "grad_norm": 3.3828840255737305, + "learning_rate": 1.7813989506288542e-07, + "loss": 0.3156, + "step": 11168 + }, + { + "epoch": 5.280851063829787, + "grad_norm": 3.2371773719787598, + "learning_rate": 1.7790870121663972e-07, + "loss": 0.3153, + "step": 11169 + }, + { + "epoch": 5.281323877068558, + "grad_norm": 3.176521062850952, + "learning_rate": 1.7767765195683684e-07, + "loss": 0.3007, + "step": 11170 + }, + { + 
"epoch": 5.281796690307329, + "grad_norm": 3.5833663940429688, + "learning_rate": 1.7744674729786387e-07, + "loss": 0.3289, + "step": 11171 + }, + { + "epoch": 5.282269503546099, + "grad_norm": 3.3106162548065186, + "learning_rate": 1.7721598725409794e-07, + "loss": 0.3204, + "step": 11172 + }, + { + "epoch": 5.28274231678487, + "grad_norm": 3.2758400440216064, + "learning_rate": 1.7698537183990727e-07, + "loss": 0.3158, + "step": 11173 + }, + { + "epoch": 5.283215130023641, + "grad_norm": 3.1461472511291504, + "learning_rate": 1.7675490106965147e-07, + "loss": 0.2968, + "step": 11174 + }, + { + "epoch": 5.283687943262412, + "grad_norm": 3.1824328899383545, + "learning_rate": 1.765245749576805e-07, + "loss": 0.327, + "step": 11175 + }, + { + "epoch": 5.284160756501182, + "grad_norm": 3.493117332458496, + "learning_rate": 1.7629439351833644e-07, + "loss": 0.3457, + "step": 11176 + }, + { + "epoch": 5.2846335697399525, + "grad_norm": 2.929245948791504, + "learning_rate": 1.7606435676595146e-07, + "loss": 0.3446, + "step": 11177 + }, + { + "epoch": 5.285106382978723, + "grad_norm": 3.2307543754577637, + "learning_rate": 1.7583446471484827e-07, + "loss": 0.3238, + "step": 11178 + }, + { + "epoch": 5.285579196217494, + "grad_norm": 3.2233543395996094, + "learning_rate": 1.7560471737934238e-07, + "loss": 0.3217, + "step": 11179 + }, + { + "epoch": 5.286052009456265, + "grad_norm": 3.2125415802001953, + "learning_rate": 1.7537511477373843e-07, + "loss": 0.3346, + "step": 11180 + }, + { + "epoch": 5.286524822695036, + "grad_norm": 3.675832986831665, + "learning_rate": 1.7514565691233249e-07, + "loss": 0.3461, + "step": 11181 + }, + { + "epoch": 5.2869976359338064, + "grad_norm": 3.942467451095581, + "learning_rate": 1.749163438094126e-07, + "loss": 0.3889, + "step": 11182 + }, + { + "epoch": 5.287470449172577, + "grad_norm": 3.043288230895996, + "learning_rate": 1.7468717547925673e-07, + "loss": 0.3068, + "step": 11183 + }, + { + "epoch": 5.287943262411347, + "grad_norm": 2.9957966804504395, + "learning_rate": 1.7445815193613436e-07, + "loss": 0.2928, + "step": 11184 + }, + { + "epoch": 5.288416075650118, + "grad_norm": 5.408661365509033, + "learning_rate": 1.742292731943057e-07, + "loss": 0.3067, + "step": 11185 + }, + { + "epoch": 5.288888888888889, + "grad_norm": 2.905014753341675, + "learning_rate": 1.7400053926802192e-07, + "loss": 0.2998, + "step": 11186 + }, + { + "epoch": 5.2893617021276595, + "grad_norm": 2.9951369762420654, + "learning_rate": 1.7377195017152542e-07, + "loss": 0.2876, + "step": 11187 + }, + { + "epoch": 5.28983451536643, + "grad_norm": 3.637373208999634, + "learning_rate": 1.735435059190496e-07, + "loss": 0.3146, + "step": 11188 + }, + { + "epoch": 5.290307328605201, + "grad_norm": 3.262631893157959, + "learning_rate": 1.733152065248181e-07, + "loss": 0.3384, + "step": 11189 + }, + { + "epoch": 5.290780141843972, + "grad_norm": 3.2098207473754883, + "learning_rate": 1.7308705200304699e-07, + "loss": 0.3152, + "step": 11190 + }, + { + "epoch": 5.291252955082742, + "grad_norm": 3.331793785095215, + "learning_rate": 1.7285904236794105e-07, + "loss": 0.3168, + "step": 11191 + }, + { + "epoch": 5.291725768321513, + "grad_norm": 3.5311834812164307, + "learning_rate": 1.726311776336992e-07, + "loss": 0.3262, + "step": 11192 + }, + { + "epoch": 5.292198581560283, + "grad_norm": 3.2608871459960938, + "learning_rate": 1.7240345781450784e-07, + "loss": 0.3193, + "step": 11193 + }, + { + "epoch": 5.292671394799054, + "grad_norm": 3.322798013687134, + "learning_rate": 
1.7217588292454762e-07, + "loss": 0.3669, + "step": 11194 + }, + { + "epoch": 5.293144208037825, + "grad_norm": 4.009716033935547, + "learning_rate": 1.7194845297798802e-07, + "loss": 0.3765, + "step": 11195 + }, + { + "epoch": 5.293617021276596, + "grad_norm": 3.0177745819091797, + "learning_rate": 1.7172116798898964e-07, + "loss": 0.3112, + "step": 11196 + }, + { + "epoch": 5.294089834515367, + "grad_norm": 2.870234251022339, + "learning_rate": 1.7149402797170532e-07, + "loss": 0.2938, + "step": 11197 + }, + { + "epoch": 5.294562647754137, + "grad_norm": 3.297724723815918, + "learning_rate": 1.7126703294027714e-07, + "loss": 0.3328, + "step": 11198 + }, + { + "epoch": 5.295035460992908, + "grad_norm": 3.5743629932403564, + "learning_rate": 1.7104018290884012e-07, + "loss": 0.3555, + "step": 11199 + }, + { + "epoch": 5.295508274231678, + "grad_norm": 3.3210482597351074, + "learning_rate": 1.7081347789151882e-07, + "loss": 0.2825, + "step": 11200 + }, + { + "epoch": 5.295981087470449, + "grad_norm": 3.647606611251831, + "learning_rate": 1.705869179024286e-07, + "loss": 0.3557, + "step": 11201 + }, + { + "epoch": 5.29645390070922, + "grad_norm": 3.379843235015869, + "learning_rate": 1.7036050295567736e-07, + "loss": 0.3648, + "step": 11202 + }, + { + "epoch": 5.2969267139479905, + "grad_norm": 3.3235721588134766, + "learning_rate": 1.7013423306536215e-07, + "loss": 0.3344, + "step": 11203 + }, + { + "epoch": 5.297399527186761, + "grad_norm": 3.673867702484131, + "learning_rate": 1.6990810824557225e-07, + "loss": 0.3667, + "step": 11204 + }, + { + "epoch": 5.297872340425532, + "grad_norm": 3.368685245513916, + "learning_rate": 1.6968212851038723e-07, + "loss": 0.3315, + "step": 11205 + }, + { + "epoch": 5.298345153664303, + "grad_norm": 3.5644888877868652, + "learning_rate": 1.6945629387387779e-07, + "loss": 0.3407, + "step": 11206 + }, + { + "epoch": 5.298817966903073, + "grad_norm": 3.110358715057373, + "learning_rate": 1.6923060435010602e-07, + "loss": 0.2986, + "step": 11207 + }, + { + "epoch": 5.299290780141844, + "grad_norm": 3.757765293121338, + "learning_rate": 1.6900505995312427e-07, + "loss": 0.3656, + "step": 11208 + }, + { + "epoch": 5.299763593380614, + "grad_norm": 3.415292501449585, + "learning_rate": 1.6877966069697605e-07, + "loss": 0.3133, + "step": 11209 + }, + { + "epoch": 5.300236406619385, + "grad_norm": 3.2273685932159424, + "learning_rate": 1.6855440659569678e-07, + "loss": 0.2993, + "step": 11210 + }, + { + "epoch": 5.300709219858156, + "grad_norm": 3.3077101707458496, + "learning_rate": 1.6832929766331107e-07, + "loss": 0.3175, + "step": 11211 + }, + { + "epoch": 5.301182033096927, + "grad_norm": 3.5129048824310303, + "learning_rate": 1.6810433391383634e-07, + "loss": 0.35, + "step": 11212 + }, + { + "epoch": 5.301654846335698, + "grad_norm": 3.1101224422454834, + "learning_rate": 1.6787951536127944e-07, + "loss": 0.3582, + "step": 11213 + }, + { + "epoch": 5.302127659574468, + "grad_norm": 3.20463490486145, + "learning_rate": 1.676548420196389e-07, + "loss": 0.3495, + "step": 11214 + }, + { + "epoch": 5.302600472813239, + "grad_norm": 3.0679001808166504, + "learning_rate": 1.6743031390290486e-07, + "loss": 0.2934, + "step": 11215 + }, + { + "epoch": 5.303073286052009, + "grad_norm": 3.2553749084472656, + "learning_rate": 1.672059310250565e-07, + "loss": 0.2841, + "step": 11216 + }, + { + "epoch": 5.30354609929078, + "grad_norm": 3.304185152053833, + "learning_rate": 1.669816934000662e-07, + "loss": 0.2784, + "step": 11217 + }, + { + "epoch": 5.304018912529551, + 
"grad_norm": 3.133167028427124, + "learning_rate": 1.6675760104189615e-07, + "loss": 0.3408, + "step": 11218 + }, + { + "epoch": 5.3044917257683215, + "grad_norm": 3.26667857170105, + "learning_rate": 1.6653365396449907e-07, + "loss": 0.3421, + "step": 11219 + }, + { + "epoch": 5.304964539007092, + "grad_norm": 3.201523542404175, + "learning_rate": 1.6630985218181904e-07, + "loss": 0.3737, + "step": 11220 + }, + { + "epoch": 5.305437352245863, + "grad_norm": 3.4249019622802734, + "learning_rate": 1.660861957077922e-07, + "loss": 0.3158, + "step": 11221 + }, + { + "epoch": 5.305910165484634, + "grad_norm": 3.1601033210754395, + "learning_rate": 1.658626845563438e-07, + "loss": 0.2986, + "step": 11222 + }, + { + "epoch": 5.306382978723404, + "grad_norm": 3.339919090270996, + "learning_rate": 1.6563931874139127e-07, + "loss": 0.2935, + "step": 11223 + }, + { + "epoch": 5.306855791962175, + "grad_norm": 3.149475574493408, + "learning_rate": 1.6541609827684212e-07, + "loss": 0.3152, + "step": 11224 + }, + { + "epoch": 5.307328605200945, + "grad_norm": 3.6351609230041504, + "learning_rate": 1.6519302317659607e-07, + "loss": 0.3369, + "step": 11225 + }, + { + "epoch": 5.307801418439716, + "grad_norm": 2.76548433303833, + "learning_rate": 1.6497009345454257e-07, + "loss": 0.2826, + "step": 11226 + }, + { + "epoch": 5.308274231678487, + "grad_norm": 3.4355554580688477, + "learning_rate": 1.6474730912456217e-07, + "loss": 0.3589, + "step": 11227 + }, + { + "epoch": 5.308747044917258, + "grad_norm": 3.1523945331573486, + "learning_rate": 1.6452467020052766e-07, + "loss": 0.3197, + "step": 11228 + }, + { + "epoch": 5.309219858156029, + "grad_norm": 2.867628574371338, + "learning_rate": 1.6430217669630043e-07, + "loss": 0.2954, + "step": 11229 + }, + { + "epoch": 5.309692671394799, + "grad_norm": 3.791811227798462, + "learning_rate": 1.6407982862573557e-07, + "loss": 0.3447, + "step": 11230 + }, + { + "epoch": 5.31016548463357, + "grad_norm": 3.406834602355957, + "learning_rate": 1.6385762600267697e-07, + "loss": 0.3429, + "step": 11231 + }, + { + "epoch": 5.31063829787234, + "grad_norm": 3.3508188724517822, + "learning_rate": 1.6363556884096022e-07, + "loss": 0.3246, + "step": 11232 + }, + { + "epoch": 5.311111111111111, + "grad_norm": 3.1348748207092285, + "learning_rate": 1.6341365715441205e-07, + "loss": 0.3299, + "step": 11233 + }, + { + "epoch": 5.311583924349882, + "grad_norm": 3.1326138973236084, + "learning_rate": 1.6319189095684944e-07, + "loss": 0.3769, + "step": 11234 + }, + { + "epoch": 5.3120567375886525, + "grad_norm": 3.0878756046295166, + "learning_rate": 1.6297027026208166e-07, + "loss": 0.3454, + "step": 11235 + }, + { + "epoch": 5.312529550827423, + "grad_norm": 3.303295612335205, + "learning_rate": 1.627487950839074e-07, + "loss": 0.3706, + "step": 11236 + }, + { + "epoch": 5.313002364066194, + "grad_norm": 3.2477850914001465, + "learning_rate": 1.6252746543611726e-07, + "loss": 0.2839, + "step": 11237 + }, + { + "epoch": 5.313475177304965, + "grad_norm": 3.166362762451172, + "learning_rate": 1.6230628133249244e-07, + "loss": 0.3596, + "step": 11238 + }, + { + "epoch": 5.313947990543735, + "grad_norm": 3.155400276184082, + "learning_rate": 1.6208524278680442e-07, + "loss": 0.2984, + "step": 11239 + }, + { + "epoch": 5.314420803782506, + "grad_norm": 3.271667718887329, + "learning_rate": 1.6186434981281752e-07, + "loss": 0.3046, + "step": 11240 + }, + { + "epoch": 5.314893617021276, + "grad_norm": 3.2715513706207275, + "learning_rate": 1.6164360242428513e-07, + "loss": 0.3416, + 
"step": 11241 + }, + { + "epoch": 5.315366430260047, + "grad_norm": 3.2481918334960938, + "learning_rate": 1.6142300063495154e-07, + "loss": 0.2906, + "step": 11242 + }, + { + "epoch": 5.315839243498818, + "grad_norm": 3.1383323669433594, + "learning_rate": 1.612025444585541e-07, + "loss": 0.2907, + "step": 11243 + }, + { + "epoch": 5.316312056737589, + "grad_norm": 3.3941144943237305, + "learning_rate": 1.6098223390881823e-07, + "loss": 0.2903, + "step": 11244 + }, + { + "epoch": 5.31678486997636, + "grad_norm": 3.858114004135132, + "learning_rate": 1.6076206899946267e-07, + "loss": 0.3112, + "step": 11245 + }, + { + "epoch": 5.31725768321513, + "grad_norm": 2.7895848751068115, + "learning_rate": 1.605420497441962e-07, + "loss": 0.3045, + "step": 11246 + }, + { + "epoch": 5.317730496453901, + "grad_norm": 2.904611349105835, + "learning_rate": 1.6032217615671753e-07, + "loss": 0.2947, + "step": 11247 + }, + { + "epoch": 5.318203309692671, + "grad_norm": 3.3088219165802, + "learning_rate": 1.6010244825071796e-07, + "loss": 0.3159, + "step": 11248 + }, + { + "epoch": 5.318676122931442, + "grad_norm": 3.478278160095215, + "learning_rate": 1.5988286603987852e-07, + "loss": 0.3566, + "step": 11249 + }, + { + "epoch": 5.319148936170213, + "grad_norm": 3.9452574253082275, + "learning_rate": 1.596634295378724e-07, + "loss": 0.3653, + "step": 11250 + }, + { + "epoch": 5.3196217494089835, + "grad_norm": 3.097100257873535, + "learning_rate": 1.5944413875836255e-07, + "loss": 0.3187, + "step": 11251 + }, + { + "epoch": 5.320094562647754, + "grad_norm": 3.021803379058838, + "learning_rate": 1.5922499371500282e-07, + "loss": 0.2922, + "step": 11252 + }, + { + "epoch": 5.320567375886525, + "grad_norm": 3.598921775817871, + "learning_rate": 1.5900599442143893e-07, + "loss": 0.3554, + "step": 11253 + }, + { + "epoch": 5.321040189125296, + "grad_norm": 3.154602527618408, + "learning_rate": 1.5878714089130692e-07, + "loss": 0.3184, + "step": 11254 + }, + { + "epoch": 5.321513002364066, + "grad_norm": 3.1292645931243896, + "learning_rate": 1.585684331382334e-07, + "loss": 0.311, + "step": 11255 + }, + { + "epoch": 5.321985815602837, + "grad_norm": 3.6723337173461914, + "learning_rate": 1.583498711758369e-07, + "loss": 0.3814, + "step": 11256 + }, + { + "epoch": 5.322458628841607, + "grad_norm": 2.859097480773926, + "learning_rate": 1.581314550177257e-07, + "loss": 0.247, + "step": 11257 + }, + { + "epoch": 5.322931442080378, + "grad_norm": 3.692857265472412, + "learning_rate": 1.5791318467750033e-07, + "loss": 0.338, + "step": 11258 + }, + { + "epoch": 5.323404255319149, + "grad_norm": 3.3693666458129883, + "learning_rate": 1.576950601687513e-07, + "loss": 0.305, + "step": 11259 + }, + { + "epoch": 5.32387706855792, + "grad_norm": 3.567326784133911, + "learning_rate": 1.574770815050597e-07, + "loss": 0.321, + "step": 11260 + }, + { + "epoch": 5.3243498817966906, + "grad_norm": 3.062784194946289, + "learning_rate": 1.5725924869999908e-07, + "loss": 0.2773, + "step": 11261 + }, + { + "epoch": 5.324822695035461, + "grad_norm": 3.2731950283050537, + "learning_rate": 1.5704156176713197e-07, + "loss": 0.2954, + "step": 11262 + }, + { + "epoch": 5.325295508274232, + "grad_norm": 3.111539840698242, + "learning_rate": 1.5682402072001363e-07, + "loss": 0.3003, + "step": 11263 + }, + { + "epoch": 5.325768321513002, + "grad_norm": 3.278005361557007, + "learning_rate": 1.5660662557218903e-07, + "loss": 0.349, + "step": 11264 + }, + { + "epoch": 5.326241134751773, + "grad_norm": 3.2765769958496094, + "learning_rate": 
1.5638937633719402e-07, + "loss": 0.3284, + "step": 11265 + }, + { + "epoch": 5.326713947990544, + "grad_norm": 3.123499631881714, + "learning_rate": 1.561722730285567e-07, + "loss": 0.2833, + "step": 11266 + }, + { + "epoch": 5.3271867612293144, + "grad_norm": 3.308847188949585, + "learning_rate": 1.5595531565979428e-07, + "loss": 0.3134, + "step": 11267 + }, + { + "epoch": 5.327659574468085, + "grad_norm": 3.1868162155151367, + "learning_rate": 1.5573850424441622e-07, + "loss": 0.3164, + "step": 11268 + }, + { + "epoch": 5.328132387706856, + "grad_norm": 3.9105653762817383, + "learning_rate": 1.5552183879592203e-07, + "loss": 0.2944, + "step": 11269 + }, + { + "epoch": 5.328605200945627, + "grad_norm": 3.2461166381835938, + "learning_rate": 1.553053193278023e-07, + "loss": 0.2594, + "step": 11270 + }, + { + "epoch": 5.329078014184397, + "grad_norm": 3.391753673553467, + "learning_rate": 1.5508894585353983e-07, + "loss": 0.3062, + "step": 11271 + }, + { + "epoch": 5.3295508274231675, + "grad_norm": 3.1136906147003174, + "learning_rate": 1.5487271838660584e-07, + "loss": 0.3046, + "step": 11272 + }, + { + "epoch": 5.330023640661938, + "grad_norm": 3.557689905166626, + "learning_rate": 1.5465663694046535e-07, + "loss": 0.3514, + "step": 11273 + }, + { + "epoch": 5.330496453900709, + "grad_norm": 3.394179105758667, + "learning_rate": 1.5444070152857177e-07, + "loss": 0.303, + "step": 11274 + }, + { + "epoch": 5.33096926713948, + "grad_norm": 3.120568037033081, + "learning_rate": 1.5422491216437047e-07, + "loss": 0.3611, + "step": 11275 + }, + { + "epoch": 5.331442080378251, + "grad_norm": 3.091876983642578, + "learning_rate": 1.5400926886129847e-07, + "loss": 0.2758, + "step": 11276 + }, + { + "epoch": 5.3319148936170215, + "grad_norm": 3.1780812740325928, + "learning_rate": 1.5379377163278226e-07, + "loss": 0.299, + "step": 11277 + }, + { + "epoch": 5.332387706855792, + "grad_norm": 3.105471134185791, + "learning_rate": 1.5357842049223969e-07, + "loss": 0.3318, + "step": 11278 + }, + { + "epoch": 5.332860520094562, + "grad_norm": 3.2104663848876953, + "learning_rate": 1.5336321545308063e-07, + "loss": 0.312, + "step": 11279 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 3.0989763736724854, + "learning_rate": 1.5314815652870407e-07, + "loss": 0.2851, + "step": 11280 + }, + { + "epoch": 5.333806146572104, + "grad_norm": 3.057668685913086, + "learning_rate": 1.529332437325015e-07, + "loss": 0.3174, + "step": 11281 + }, + { + "epoch": 5.334278959810875, + "grad_norm": 3.587592840194702, + "learning_rate": 1.527184770778542e-07, + "loss": 0.4058, + "step": 11282 + }, + { + "epoch": 5.334751773049645, + "grad_norm": 3.0807034969329834, + "learning_rate": 1.5250385657813478e-07, + "loss": 0.3145, + "step": 11283 + }, + { + "epoch": 5.335224586288416, + "grad_norm": 3.3721396923065186, + "learning_rate": 1.5228938224670698e-07, + "loss": 0.338, + "step": 11284 + }, + { + "epoch": 5.335697399527187, + "grad_norm": 3.361618995666504, + "learning_rate": 1.5207505409692431e-07, + "loss": 0.3018, + "step": 11285 + }, + { + "epoch": 5.336170212765958, + "grad_norm": 3.107353448867798, + "learning_rate": 1.518608721421333e-07, + "loss": 0.3612, + "step": 11286 + }, + { + "epoch": 5.336643026004728, + "grad_norm": 2.945077896118164, + "learning_rate": 1.5164683639566941e-07, + "loss": 0.2925, + "step": 11287 + }, + { + "epoch": 5.3371158392434985, + "grad_norm": 3.1644442081451416, + "learning_rate": 1.5143294687085918e-07, + "loss": 0.3336, + "step": 11288 + }, + { + "epoch": 5.337588652482269, + 
"grad_norm": 3.0085582733154297, + "learning_rate": 1.5121920358102172e-07, + "loss": 0.2928, + "step": 11289 + }, + { + "epoch": 5.33806146572104, + "grad_norm": 3.3779399394989014, + "learning_rate": 1.5100560653946522e-07, + "loss": 0.3453, + "step": 11290 + }, + { + "epoch": 5.338534278959811, + "grad_norm": 3.1938469409942627, + "learning_rate": 1.5079215575948963e-07, + "loss": 0.2911, + "step": 11291 + }, + { + "epoch": 5.339007092198582, + "grad_norm": 3.22524094581604, + "learning_rate": 1.505788512543857e-07, + "loss": 0.321, + "step": 11292 + }, + { + "epoch": 5.3394799054373525, + "grad_norm": 3.5342066287994385, + "learning_rate": 1.5036569303743447e-07, + "loss": 0.3417, + "step": 11293 + }, + { + "epoch": 5.339952718676123, + "grad_norm": 3.2065529823303223, + "learning_rate": 1.5015268112190894e-07, + "loss": 0.3537, + "step": 11294 + }, + { + "epoch": 5.340425531914893, + "grad_norm": 3.2699520587921143, + "learning_rate": 1.4993981552107185e-07, + "loss": 0.2918, + "step": 11295 + }, + { + "epoch": 5.340898345153664, + "grad_norm": 2.932130813598633, + "learning_rate": 1.4972709624817812e-07, + "loss": 0.2711, + "step": 11296 + }, + { + "epoch": 5.341371158392435, + "grad_norm": 3.151578187942505, + "learning_rate": 1.4951452331647244e-07, + "loss": 0.3254, + "step": 11297 + }, + { + "epoch": 5.341843971631206, + "grad_norm": 3.479327440261841, + "learning_rate": 1.4930209673919062e-07, + "loss": 0.3184, + "step": 11298 + }, + { + "epoch": 5.342316784869976, + "grad_norm": 3.1652722358703613, + "learning_rate": 1.4908981652955985e-07, + "loss": 0.3118, + "step": 11299 + }, + { + "epoch": 5.342789598108747, + "grad_norm": 3.445828437805176, + "learning_rate": 1.488776827007979e-07, + "loss": 0.3279, + "step": 11300 + }, + { + "epoch": 5.343262411347518, + "grad_norm": 2.9347822666168213, + "learning_rate": 1.4866569526611334e-07, + "loss": 0.2942, + "step": 11301 + }, + { + "epoch": 5.343735224586289, + "grad_norm": 3.1422226428985596, + "learning_rate": 1.4845385423870568e-07, + "loss": 0.3397, + "step": 11302 + }, + { + "epoch": 5.344208037825059, + "grad_norm": 3.2264926433563232, + "learning_rate": 1.4824215963176485e-07, + "loss": 0.3245, + "step": 11303 + }, + { + "epoch": 5.3446808510638295, + "grad_norm": 3.4117064476013184, + "learning_rate": 1.4803061145847286e-07, + "loss": 0.3367, + "step": 11304 + }, + { + "epoch": 5.3451536643026, + "grad_norm": 3.502324342727661, + "learning_rate": 1.4781920973200164e-07, + "loss": 0.3258, + "step": 11305 + }, + { + "epoch": 5.345626477541371, + "grad_norm": 3.0881006717681885, + "learning_rate": 1.47607954465514e-07, + "loss": 0.2901, + "step": 11306 + }, + { + "epoch": 5.346099290780142, + "grad_norm": 3.305974006652832, + "learning_rate": 1.4739684567216445e-07, + "loss": 0.3963, + "step": 11307 + }, + { + "epoch": 5.346572104018913, + "grad_norm": 3.3296890258789062, + "learning_rate": 1.4718588336509714e-07, + "loss": 0.2985, + "step": 11308 + }, + { + "epoch": 5.3470449172576835, + "grad_norm": 3.5175423622131348, + "learning_rate": 1.4697506755744827e-07, + "loss": 0.3539, + "step": 11309 + }, + { + "epoch": 5.347517730496454, + "grad_norm": 3.3303592205047607, + "learning_rate": 1.4676439826234428e-07, + "loss": 0.3071, + "step": 11310 + }, + { + "epoch": 5.347990543735224, + "grad_norm": 2.9972405433654785, + "learning_rate": 1.4655387549290245e-07, + "loss": 0.3161, + "step": 11311 + }, + { + "epoch": 5.348463356973995, + "grad_norm": 3.5059444904327393, + "learning_rate": 1.4634349926223146e-07, + "loss": 0.2829, 
+ "step": 11312 + }, + { + "epoch": 5.348936170212766, + "grad_norm": 3.2246110439300537, + "learning_rate": 1.4613326958342973e-07, + "loss": 0.278, + "step": 11313 + }, + { + "epoch": 5.349408983451537, + "grad_norm": 3.0903613567352295, + "learning_rate": 1.459231864695884e-07, + "loss": 0.2877, + "step": 11314 + }, + { + "epoch": 5.349881796690307, + "grad_norm": 3.164947748184204, + "learning_rate": 1.457132499337882e-07, + "loss": 0.3533, + "step": 11315 + }, + { + "epoch": 5.350354609929078, + "grad_norm": 3.2235376834869385, + "learning_rate": 1.455034599891003e-07, + "loss": 0.3384, + "step": 11316 + }, + { + "epoch": 5.350827423167849, + "grad_norm": 3.2294938564300537, + "learning_rate": 1.4529381664858816e-07, + "loss": 0.2604, + "step": 11317 + }, + { + "epoch": 5.35130023640662, + "grad_norm": 3.0628116130828857, + "learning_rate": 1.4508431992530437e-07, + "loss": 0.3166, + "step": 11318 + }, + { + "epoch": 5.35177304964539, + "grad_norm": 2.876624584197998, + "learning_rate": 1.448749698322943e-07, + "loss": 0.3192, + "step": 11319 + }, + { + "epoch": 5.3522458628841605, + "grad_norm": 3.4103848934173584, + "learning_rate": 1.4466576638259317e-07, + "loss": 0.3254, + "step": 11320 + }, + { + "epoch": 5.352718676122931, + "grad_norm": 3.0813376903533936, + "learning_rate": 1.444567095892263e-07, + "loss": 0.3265, + "step": 11321 + }, + { + "epoch": 5.353191489361702, + "grad_norm": 4.374908447265625, + "learning_rate": 1.4424779946521196e-07, + "loss": 0.2742, + "step": 11322 + }, + { + "epoch": 5.353664302600473, + "grad_norm": 3.1611711978912354, + "learning_rate": 1.4403903602355745e-07, + "loss": 0.3475, + "step": 11323 + }, + { + "epoch": 5.354137115839244, + "grad_norm": 3.284346103668213, + "learning_rate": 1.4383041927726104e-07, + "loss": 0.3245, + "step": 11324 + }, + { + "epoch": 5.3546099290780145, + "grad_norm": 3.416294813156128, + "learning_rate": 1.4362194923931345e-07, + "loss": 0.3363, + "step": 11325 + }, + { + "epoch": 5.355082742316785, + "grad_norm": 3.465210437774658, + "learning_rate": 1.4341362592269454e-07, + "loss": 0.3639, + "step": 11326 + }, + { + "epoch": 5.355555555555555, + "grad_norm": 3.2791237831115723, + "learning_rate": 1.4320544934037588e-07, + "loss": 0.368, + "step": 11327 + }, + { + "epoch": 5.356028368794326, + "grad_norm": 3.206949234008789, + "learning_rate": 1.4299741950531964e-07, + "loss": 0.3332, + "step": 11328 + }, + { + "epoch": 5.356501182033097, + "grad_norm": 3.33746600151062, + "learning_rate": 1.4278953643047877e-07, + "loss": 0.3373, + "step": 11329 + }, + { + "epoch": 5.356973995271868, + "grad_norm": 3.0807836055755615, + "learning_rate": 1.4258180012879762e-07, + "loss": 0.3308, + "step": 11330 + }, + { + "epoch": 5.357446808510638, + "grad_norm": 3.2733700275421143, + "learning_rate": 1.4237421061321087e-07, + "loss": 0.3612, + "step": 11331 + }, + { + "epoch": 5.357919621749409, + "grad_norm": 3.3739452362060547, + "learning_rate": 1.4216676789664402e-07, + "loss": 0.3428, + "step": 11332 + }, + { + "epoch": 5.35839243498818, + "grad_norm": 3.522418975830078, + "learning_rate": 1.4195947199201393e-07, + "loss": 0.3346, + "step": 11333 + }, + { + "epoch": 5.358865248226951, + "grad_norm": 3.081836223602295, + "learning_rate": 1.4175232291222723e-07, + "loss": 0.3529, + "step": 11334 + }, + { + "epoch": 5.359338061465721, + "grad_norm": 3.188351631164551, + "learning_rate": 1.4154532067018333e-07, + "loss": 0.2999, + "step": 11335 + }, + { + "epoch": 5.3598108747044915, + "grad_norm": 3.2901625633239746, + 
"learning_rate": 1.4133846527877054e-07, + "loss": 0.3404, + "step": 11336 + }, + { + "epoch": 5.360283687943262, + "grad_norm": 3.5738883018493652, + "learning_rate": 1.411317567508694e-07, + "loss": 0.2881, + "step": 11337 + }, + { + "epoch": 5.360756501182033, + "grad_norm": 3.4825687408447266, + "learning_rate": 1.409251950993501e-07, + "loss": 0.3163, + "step": 11338 + }, + { + "epoch": 5.361229314420804, + "grad_norm": 4.5908918380737305, + "learning_rate": 1.4071878033707464e-07, + "loss": 0.3113, + "step": 11339 + }, + { + "epoch": 5.361702127659575, + "grad_norm": 3.299997091293335, + "learning_rate": 1.405125124768958e-07, + "loss": 0.3187, + "step": 11340 + }, + { + "epoch": 5.3621749408983455, + "grad_norm": 3.676666021347046, + "learning_rate": 1.403063915316566e-07, + "loss": 0.3299, + "step": 11341 + }, + { + "epoch": 5.362647754137116, + "grad_norm": 3.4060325622558594, + "learning_rate": 1.401004175141918e-07, + "loss": 0.2792, + "step": 11342 + }, + { + "epoch": 5.363120567375886, + "grad_norm": 2.99606990814209, + "learning_rate": 1.3989459043732583e-07, + "loss": 0.2889, + "step": 11343 + }, + { + "epoch": 5.363593380614657, + "grad_norm": 3.141608715057373, + "learning_rate": 1.3968891031387488e-07, + "loss": 0.2819, + "step": 11344 + }, + { + "epoch": 5.364066193853428, + "grad_norm": 4.1492085456848145, + "learning_rate": 1.3948337715664617e-07, + "loss": 0.3781, + "step": 11345 + }, + { + "epoch": 5.3645390070921986, + "grad_norm": 2.9972193241119385, + "learning_rate": 1.3927799097843698e-07, + "loss": 0.2949, + "step": 11346 + }, + { + "epoch": 5.365011820330969, + "grad_norm": 4.015519618988037, + "learning_rate": 1.3907275179203593e-07, + "loss": 0.3573, + "step": 11347 + }, + { + "epoch": 5.36548463356974, + "grad_norm": 3.056424617767334, + "learning_rate": 1.3886765961022202e-07, + "loss": 0.3216, + "step": 11348 + }, + { + "epoch": 5.365957446808511, + "grad_norm": 3.364178419113159, + "learning_rate": 1.3866271444576556e-07, + "loss": 0.3519, + "step": 11349 + }, + { + "epoch": 5.366430260047281, + "grad_norm": 3.101644515991211, + "learning_rate": 1.3845791631142802e-07, + "loss": 0.3582, + "step": 11350 + }, + { + "epoch": 5.366903073286052, + "grad_norm": 3.060175895690918, + "learning_rate": 1.3825326521996114e-07, + "loss": 0.3359, + "step": 11351 + }, + { + "epoch": 5.3673758865248224, + "grad_norm": 3.402256488800049, + "learning_rate": 1.3804876118410693e-07, + "loss": 0.3383, + "step": 11352 + }, + { + "epoch": 5.367848699763593, + "grad_norm": 3.2685110569000244, + "learning_rate": 1.3784440421659968e-07, + "loss": 0.2726, + "step": 11353 + }, + { + "epoch": 5.368321513002364, + "grad_norm": 3.367396593093872, + "learning_rate": 1.3764019433016362e-07, + "loss": 0.35, + "step": 11354 + }, + { + "epoch": 5.368794326241135, + "grad_norm": 3.225039005279541, + "learning_rate": 1.3743613153751418e-07, + "loss": 0.2712, + "step": 11355 + }, + { + "epoch": 5.369267139479906, + "grad_norm": 3.262666940689087, + "learning_rate": 1.3723221585135753e-07, + "loss": 0.2581, + "step": 11356 + }, + { + "epoch": 5.369739952718676, + "grad_norm": 3.117643117904663, + "learning_rate": 1.3702844728438968e-07, + "loss": 0.3097, + "step": 11357 + }, + { + "epoch": 5.370212765957447, + "grad_norm": 3.1812870502471924, + "learning_rate": 1.3682482584929962e-07, + "loss": 0.3229, + "step": 11358 + }, + { + "epoch": 5.370685579196217, + "grad_norm": 3.211583137512207, + "learning_rate": 1.3662135155876526e-07, + "loss": 0.372, + "step": 11359 + }, + { + "epoch": 
5.371158392434988, + "grad_norm": 3.5000617504119873, + "learning_rate": 1.3641802442545648e-07, + "loss": 0.3151, + "step": 11360 + }, + { + "epoch": 5.371631205673759, + "grad_norm": 3.493748426437378, + "learning_rate": 1.3621484446203315e-07, + "loss": 0.2983, + "step": 11361 + }, + { + "epoch": 5.3721040189125295, + "grad_norm": 3.1911051273345947, + "learning_rate": 1.360118116811468e-07, + "loss": 0.2753, + "step": 11362 + }, + { + "epoch": 5.3725768321513, + "grad_norm": 3.3586859703063965, + "learning_rate": 1.3580892609543871e-07, + "loss": 0.2857, + "step": 11363 + }, + { + "epoch": 5.373049645390071, + "grad_norm": 3.4908719062805176, + "learning_rate": 1.356061877175424e-07, + "loss": 0.333, + "step": 11364 + }, + { + "epoch": 5.373522458628842, + "grad_norm": 3.3023927211761475, + "learning_rate": 1.354035965600814e-07, + "loss": 0.3248, + "step": 11365 + }, + { + "epoch": 5.373995271867612, + "grad_norm": 3.048675060272217, + "learning_rate": 1.3520115263566974e-07, + "loss": 0.3392, + "step": 11366 + }, + { + "epoch": 5.374468085106383, + "grad_norm": 3.3775367736816406, + "learning_rate": 1.3499885595691293e-07, + "loss": 0.332, + "step": 11367 + }, + { + "epoch": 5.374940898345153, + "grad_norm": 3.3747823238372803, + "learning_rate": 1.3479670653640726e-07, + "loss": 0.4079, + "step": 11368 + }, + { + "epoch": 5.375413711583924, + "grad_norm": 3.4345645904541016, + "learning_rate": 1.3459470438673932e-07, + "loss": 0.3924, + "step": 11369 + }, + { + "epoch": 5.375886524822695, + "grad_norm": 3.0778472423553467, + "learning_rate": 1.343928495204874e-07, + "loss": 0.2865, + "step": 11370 + }, + { + "epoch": 5.376359338061466, + "grad_norm": 3.4024085998535156, + "learning_rate": 1.3419114195022008e-07, + "loss": 0.2823, + "step": 11371 + }, + { + "epoch": 5.376832151300237, + "grad_norm": 3.141794443130493, + "learning_rate": 1.3398958168849619e-07, + "loss": 0.2678, + "step": 11372 + }, + { + "epoch": 5.377304964539007, + "grad_norm": 3.4689128398895264, + "learning_rate": 1.337881687478665e-07, + "loss": 0.3944, + "step": 11373 + }, + { + "epoch": 5.377777777777778, + "grad_norm": 2.932950973510742, + "learning_rate": 1.3358690314087236e-07, + "loss": 0.3301, + "step": 11374 + }, + { + "epoch": 5.378250591016548, + "grad_norm": 3.4097161293029785, + "learning_rate": 1.333857848800446e-07, + "loss": 0.3421, + "step": 11375 + }, + { + "epoch": 5.378723404255319, + "grad_norm": 2.732140064239502, + "learning_rate": 1.331848139779074e-07, + "loss": 0.3288, + "step": 11376 + }, + { + "epoch": 5.37919621749409, + "grad_norm": 3.342212677001953, + "learning_rate": 1.3298399044697318e-07, + "loss": 0.3651, + "step": 11377 + }, + { + "epoch": 5.3796690307328605, + "grad_norm": 3.049415111541748, + "learning_rate": 1.3278331429974728e-07, + "loss": 0.3287, + "step": 11378 + }, + { + "epoch": 5.380141843971631, + "grad_norm": 2.870662212371826, + "learning_rate": 1.3258278554872439e-07, + "loss": 0.3344, + "step": 11379 + }, + { + "epoch": 5.380614657210402, + "grad_norm": 3.2381114959716797, + "learning_rate": 1.3238240420639065e-07, + "loss": 0.3337, + "step": 11380 + }, + { + "epoch": 5.381087470449173, + "grad_norm": 3.0187833309173584, + "learning_rate": 1.3218217028522272e-07, + "loss": 0.2675, + "step": 11381 + }, + { + "epoch": 5.381560283687943, + "grad_norm": 3.0431411266326904, + "learning_rate": 1.3198208379768844e-07, + "loss": 0.2848, + "step": 11382 + }, + { + "epoch": 5.382033096926714, + "grad_norm": 3.216615676879883, + "learning_rate": 1.3178214475624645e-07, + 
"loss": 0.3268, + "step": 11383 + }, + { + "epoch": 5.382505910165484, + "grad_norm": 3.2666940689086914, + "learning_rate": 1.3158235317334622e-07, + "loss": 0.3412, + "step": 11384 + }, + { + "epoch": 5.382978723404255, + "grad_norm": 3.4213311672210693, + "learning_rate": 1.3138270906142726e-07, + "loss": 0.3205, + "step": 11385 + }, + { + "epoch": 5.383451536643026, + "grad_norm": 3.3023040294647217, + "learning_rate": 1.31183212432921e-07, + "loss": 0.2655, + "step": 11386 + }, + { + "epoch": 5.383924349881797, + "grad_norm": 3.4553420543670654, + "learning_rate": 1.3098386330024915e-07, + "loss": 0.3463, + "step": 11387 + }, + { + "epoch": 5.384397163120568, + "grad_norm": 3.350341558456421, + "learning_rate": 1.307846616758246e-07, + "loss": 0.3076, + "step": 11388 + }, + { + "epoch": 5.384869976359338, + "grad_norm": 3.283419132232666, + "learning_rate": 1.305856075720502e-07, + "loss": 0.3357, + "step": 11389 + }, + { + "epoch": 5.385342789598109, + "grad_norm": 3.3644227981567383, + "learning_rate": 1.3038670100132046e-07, + "loss": 0.3462, + "step": 11390 + }, + { + "epoch": 5.385815602836879, + "grad_norm": 3.1411266326904297, + "learning_rate": 1.3018794197602047e-07, + "loss": 0.3337, + "step": 11391 + }, + { + "epoch": 5.38628841607565, + "grad_norm": 3.033926486968994, + "learning_rate": 1.299893305085259e-07, + "loss": 0.2947, + "step": 11392 + }, + { + "epoch": 5.386761229314421, + "grad_norm": 3.4088027477264404, + "learning_rate": 1.2979086661120356e-07, + "loss": 0.332, + "step": 11393 + }, + { + "epoch": 5.3872340425531915, + "grad_norm": 3.315154790878296, + "learning_rate": 1.2959255029641126e-07, + "loss": 0.3148, + "step": 11394 + }, + { + "epoch": 5.387706855791962, + "grad_norm": 3.702969789505005, + "learning_rate": 1.293943815764967e-07, + "loss": 0.3644, + "step": 11395 + }, + { + "epoch": 5.388179669030733, + "grad_norm": 2.868751049041748, + "learning_rate": 1.291963604637994e-07, + "loss": 0.3484, + "step": 11396 + }, + { + "epoch": 5.388652482269504, + "grad_norm": 3.31673264503479, + "learning_rate": 1.2899848697064898e-07, + "loss": 0.3058, + "step": 11397 + }, + { + "epoch": 5.389125295508274, + "grad_norm": 3.311613082885742, + "learning_rate": 1.2880076110936585e-07, + "loss": 0.3186, + "step": 11398 + }, + { + "epoch": 5.389598108747045, + "grad_norm": 3.3614962100982666, + "learning_rate": 1.286031828922621e-07, + "loss": 0.3101, + "step": 11399 + }, + { + "epoch": 5.390070921985815, + "grad_norm": 3.302133321762085, + "learning_rate": 1.2840575233163983e-07, + "loss": 0.3511, + "step": 11400 + }, + { + "epoch": 5.390543735224586, + "grad_norm": 3.287588357925415, + "learning_rate": 1.2820846943979254e-07, + "loss": 0.3495, + "step": 11401 + }, + { + "epoch": 5.391016548463357, + "grad_norm": 3.3546504974365234, + "learning_rate": 1.2801133422900374e-07, + "loss": 0.3508, + "step": 11402 + }, + { + "epoch": 5.391489361702128, + "grad_norm": 3.441850423812866, + "learning_rate": 1.278143467115481e-07, + "loss": 0.3296, + "step": 11403 + }, + { + "epoch": 5.391962174940899, + "grad_norm": 3.529294490814209, + "learning_rate": 1.2761750689969154e-07, + "loss": 0.3393, + "step": 11404 + }, + { + "epoch": 5.392434988179669, + "grad_norm": 3.0237176418304443, + "learning_rate": 1.274208148056902e-07, + "loss": 0.344, + "step": 11405 + }, + { + "epoch": 5.39290780141844, + "grad_norm": 3.643930435180664, + "learning_rate": 1.2722427044179142e-07, + "loss": 0.3439, + "step": 11406 + }, + { + "epoch": 5.39338061465721, + "grad_norm": 3.300459623336792, + 
"learning_rate": 1.2702787382023296e-07, + "loss": 0.3344, + "step": 11407 + }, + { + "epoch": 5.393853427895981, + "grad_norm": 3.3594553470611572, + "learning_rate": 1.2683162495324335e-07, + "loss": 0.3267, + "step": 11408 + }, + { + "epoch": 5.394326241134752, + "grad_norm": 3.8660781383514404, + "learning_rate": 1.2663552385304284e-07, + "loss": 0.3595, + "step": 11409 + }, + { + "epoch": 5.3947990543735225, + "grad_norm": 3.7904021739959717, + "learning_rate": 1.2643957053184107e-07, + "loss": 0.3893, + "step": 11410 + }, + { + "epoch": 5.395271867612293, + "grad_norm": 2.8576457500457764, + "learning_rate": 1.2624376500183973e-07, + "loss": 0.2404, + "step": 11411 + }, + { + "epoch": 5.395744680851064, + "grad_norm": 4.3170013427734375, + "learning_rate": 1.2604810727523066e-07, + "loss": 0.3223, + "step": 11412 + }, + { + "epoch": 5.396217494089835, + "grad_norm": 3.2365522384643555, + "learning_rate": 1.258525973641958e-07, + "loss": 0.2327, + "step": 11413 + }, + { + "epoch": 5.396690307328605, + "grad_norm": 3.4322235584259033, + "learning_rate": 1.2565723528091017e-07, + "loss": 0.3513, + "step": 11414 + }, + { + "epoch": 5.397163120567376, + "grad_norm": 3.695038080215454, + "learning_rate": 1.2546202103753652e-07, + "loss": 0.3993, + "step": 11415 + }, + { + "epoch": 5.397635933806146, + "grad_norm": 3.412743091583252, + "learning_rate": 1.252669546462315e-07, + "loss": 0.3392, + "step": 11416 + }, + { + "epoch": 5.398108747044917, + "grad_norm": 3.074951171875, + "learning_rate": 1.2507203611914016e-07, + "loss": 0.3231, + "step": 11417 + }, + { + "epoch": 5.398581560283688, + "grad_norm": 3.4410054683685303, + "learning_rate": 1.2487726546839884e-07, + "loss": 0.3044, + "step": 11418 + }, + { + "epoch": 5.399054373522459, + "grad_norm": 3.304527521133423, + "learning_rate": 1.2468264270613622e-07, + "loss": 0.3058, + "step": 11419 + }, + { + "epoch": 5.39952718676123, + "grad_norm": 3.286989212036133, + "learning_rate": 1.2448816784446982e-07, + "loss": 0.3508, + "step": 11420 + }, + { + "epoch": 5.4, + "grad_norm": 3.223222494125366, + "learning_rate": 1.2429384089550884e-07, + "loss": 0.3327, + "step": 11421 + }, + { + "epoch": 5.400472813238771, + "grad_norm": 3.333634853363037, + "learning_rate": 1.240996618713533e-07, + "loss": 0.3509, + "step": 11422 + }, + { + "epoch": 5.400945626477541, + "grad_norm": 2.9547529220581055, + "learning_rate": 1.2390563078409356e-07, + "loss": 0.27, + "step": 11423 + }, + { + "epoch": 5.401418439716312, + "grad_norm": 3.4548802375793457, + "learning_rate": 1.2371174764581161e-07, + "loss": 0.3565, + "step": 11424 + }, + { + "epoch": 5.401891252955083, + "grad_norm": 3.143460273742676, + "learning_rate": 1.2351801246857947e-07, + "loss": 0.2743, + "step": 11425 + }, + { + "epoch": 5.4023640661938535, + "grad_norm": 3.1186251640319824, + "learning_rate": 1.2332442526445997e-07, + "loss": 0.3026, + "step": 11426 + }, + { + "epoch": 5.402836879432624, + "grad_norm": 3.14628529548645, + "learning_rate": 1.231309860455071e-07, + "loss": 0.3026, + "step": 11427 + }, + { + "epoch": 5.403309692671395, + "grad_norm": 3.3630118370056152, + "learning_rate": 1.2293769482376565e-07, + "loss": 0.3164, + "step": 11428 + }, + { + "epoch": 5.403782505910166, + "grad_norm": 2.9600207805633545, + "learning_rate": 1.2274455161127074e-07, + "loss": 0.2775, + "step": 11429 + }, + { + "epoch": 5.404255319148936, + "grad_norm": 3.3323616981506348, + "learning_rate": 1.2255155642004885e-07, + "loss": 0.2931, + "step": 11430 + }, + { + "epoch": 
5.4047281323877066, + "grad_norm": 3.339144468307495, + "learning_rate": 1.223587092621162e-07, + "loss": 0.3357, + "step": 11431 + }, + { + "epoch": 5.405200945626477, + "grad_norm": 3.590852737426758, + "learning_rate": 1.2216601014948148e-07, + "loss": 0.296, + "step": 11432 + }, + { + "epoch": 5.405673758865248, + "grad_norm": 3.0115602016448975, + "learning_rate": 1.2197345909414237e-07, + "loss": 0.2851, + "step": 11433 + }, + { + "epoch": 5.406146572104019, + "grad_norm": 3.1093897819519043, + "learning_rate": 1.2178105610808928e-07, + "loss": 0.3334, + "step": 11434 + }, + { + "epoch": 5.40661938534279, + "grad_norm": 3.2710671424865723, + "learning_rate": 1.2158880120330147e-07, + "loss": 0.3456, + "step": 11435 + }, + { + "epoch": 5.4070921985815605, + "grad_norm": 3.035571336746216, + "learning_rate": 1.2139669439174968e-07, + "loss": 0.2914, + "step": 11436 + }, + { + "epoch": 5.407565011820331, + "grad_norm": 2.946021795272827, + "learning_rate": 1.2120473568539598e-07, + "loss": 0.274, + "step": 11437 + }, + { + "epoch": 5.408037825059101, + "grad_norm": 3.0848777294158936, + "learning_rate": 1.210129250961925e-07, + "loss": 0.2875, + "step": 11438 + }, + { + "epoch": 5.408510638297872, + "grad_norm": 3.231348752975464, + "learning_rate": 1.20821262636083e-07, + "loss": 0.3107, + "step": 11439 + }, + { + "epoch": 5.408983451536643, + "grad_norm": 3.4804441928863525, + "learning_rate": 1.2062974831700131e-07, + "loss": 0.34, + "step": 11440 + }, + { + "epoch": 5.409456264775414, + "grad_norm": 3.396928310394287, + "learning_rate": 1.2043838215087144e-07, + "loss": 0.3577, + "step": 11441 + }, + { + "epoch": 5.409929078014184, + "grad_norm": 3.4840028285980225, + "learning_rate": 1.2024716414960975e-07, + "loss": 0.296, + "step": 11442 + }, + { + "epoch": 5.410401891252955, + "grad_norm": 3.8356549739837646, + "learning_rate": 1.200560943251222e-07, + "loss": 0.3444, + "step": 11443 + }, + { + "epoch": 5.410874704491726, + "grad_norm": 3.8281893730163574, + "learning_rate": 1.1986517268930603e-07, + "loss": 0.3962, + "step": 11444 + }, + { + "epoch": 5.411347517730497, + "grad_norm": 3.0665910243988037, + "learning_rate": 1.1967439925404888e-07, + "loss": 0.3336, + "step": 11445 + }, + { + "epoch": 5.411820330969267, + "grad_norm": 3.4650213718414307, + "learning_rate": 1.1948377403122906e-07, + "loss": 0.3278, + "step": 11446 + }, + { + "epoch": 5.4122931442080375, + "grad_norm": 3.654869794845581, + "learning_rate": 1.1929329703271707e-07, + "loss": 0.3093, + "step": 11447 + }, + { + "epoch": 5.412765957446808, + "grad_norm": 3.377669095993042, + "learning_rate": 1.1910296827037204e-07, + "loss": 0.3239, + "step": 11448 + }, + { + "epoch": 5.413238770685579, + "grad_norm": 3.475485324859619, + "learning_rate": 1.1891278775604503e-07, + "loss": 0.3307, + "step": 11449 + }, + { + "epoch": 5.41371158392435, + "grad_norm": 2.9799742698669434, + "learning_rate": 1.1872275550157824e-07, + "loss": 0.3012, + "step": 11450 + }, + { + "epoch": 5.414184397163121, + "grad_norm": 2.9590864181518555, + "learning_rate": 1.185328715188036e-07, + "loss": 0.2913, + "step": 11451 + }, + { + "epoch": 5.4146572104018915, + "grad_norm": 3.7972922325134277, + "learning_rate": 1.1834313581954498e-07, + "loss": 0.3938, + "step": 11452 + }, + { + "epoch": 5.415130023640662, + "grad_norm": 3.234405279159546, + "learning_rate": 1.1815354841561627e-07, + "loss": 0.3548, + "step": 11453 + }, + { + "epoch": 5.415602836879432, + "grad_norm": 3.439035654067993, + "learning_rate": 1.1796410931882136e-07, + 
"loss": 0.316, + "step": 11454 + }, + { + "epoch": 5.416075650118203, + "grad_norm": 3.5416979789733887, + "learning_rate": 1.177748185409569e-07, + "loss": 0.2477, + "step": 11455 + }, + { + "epoch": 5.416548463356974, + "grad_norm": 3.2233688831329346, + "learning_rate": 1.175856760938085e-07, + "loss": 0.2967, + "step": 11456 + }, + { + "epoch": 5.417021276595745, + "grad_norm": 3.328793525695801, + "learning_rate": 1.1739668198915366e-07, + "loss": 0.3869, + "step": 11457 + }, + { + "epoch": 5.417494089834515, + "grad_norm": 3.4466869831085205, + "learning_rate": 1.1720783623875991e-07, + "loss": 0.3281, + "step": 11458 + }, + { + "epoch": 5.417966903073286, + "grad_norm": 2.856494665145874, + "learning_rate": 1.1701913885438621e-07, + "loss": 0.2944, + "step": 11459 + }, + { + "epoch": 5.418439716312057, + "grad_norm": 3.5706748962402344, + "learning_rate": 1.1683058984778172e-07, + "loss": 0.302, + "step": 11460 + }, + { + "epoch": 5.418912529550828, + "grad_norm": 3.267441511154175, + "learning_rate": 1.1664218923068599e-07, + "loss": 0.2935, + "step": 11461 + }, + { + "epoch": 5.419385342789598, + "grad_norm": 2.9759864807128906, + "learning_rate": 1.1645393701483099e-07, + "loss": 0.3258, + "step": 11462 + }, + { + "epoch": 5.4198581560283685, + "grad_norm": 3.288041353225708, + "learning_rate": 1.1626583321193763e-07, + "loss": 0.2809, + "step": 11463 + }, + { + "epoch": 5.420330969267139, + "grad_norm": 3.2921197414398193, + "learning_rate": 1.1607787783371794e-07, + "loss": 0.3365, + "step": 11464 + }, + { + "epoch": 5.42080378250591, + "grad_norm": 3.1783058643341064, + "learning_rate": 1.1589007089187615e-07, + "loss": 0.3345, + "step": 11465 + }, + { + "epoch": 5.421276595744681, + "grad_norm": 3.314274787902832, + "learning_rate": 1.1570241239810542e-07, + "loss": 0.35, + "step": 11466 + }, + { + "epoch": 5.421749408983452, + "grad_norm": 3.0419068336486816, + "learning_rate": 1.1551490236409085e-07, + "loss": 0.3057, + "step": 11467 + }, + { + "epoch": 5.4222222222222225, + "grad_norm": 3.2282440662384033, + "learning_rate": 1.1532754080150782e-07, + "loss": 0.3441, + "step": 11468 + }, + { + "epoch": 5.422695035460993, + "grad_norm": 2.974402666091919, + "learning_rate": 1.1514032772202172e-07, + "loss": 0.2715, + "step": 11469 + }, + { + "epoch": 5.423167848699763, + "grad_norm": 2.95684814453125, + "learning_rate": 1.1495326313729071e-07, + "loss": 0.2794, + "step": 11470 + }, + { + "epoch": 5.423640661938534, + "grad_norm": 3.2254786491394043, + "learning_rate": 1.1476634705896162e-07, + "loss": 0.3483, + "step": 11471 + }, + { + "epoch": 5.424113475177305, + "grad_norm": 3.2899491786956787, + "learning_rate": 1.1457957949867316e-07, + "loss": 0.3176, + "step": 11472 + }, + { + "epoch": 5.424586288416076, + "grad_norm": 3.2927584648132324, + "learning_rate": 1.1439296046805464e-07, + "loss": 0.321, + "step": 11473 + }, + { + "epoch": 5.425059101654846, + "grad_norm": 2.9653894901275635, + "learning_rate": 1.1420648997872596e-07, + "loss": 0.3307, + "step": 11474 + }, + { + "epoch": 5.425531914893617, + "grad_norm": 3.266310214996338, + "learning_rate": 1.1402016804229782e-07, + "loss": 0.3411, + "step": 11475 + }, + { + "epoch": 5.426004728132388, + "grad_norm": 3.2980828285217285, + "learning_rate": 1.1383399467037148e-07, + "loss": 0.3512, + "step": 11476 + }, + { + "epoch": 5.426477541371159, + "grad_norm": 3.2396881580352783, + "learning_rate": 1.1364796987453908e-07, + "loss": 0.3564, + "step": 11477 + }, + { + "epoch": 5.426950354609929, + "grad_norm": 
3.1427674293518066, + "learning_rate": 1.1346209366638383e-07, + "loss": 0.3109, + "step": 11478 + }, + { + "epoch": 5.4274231678486995, + "grad_norm": 3.4141311645507812, + "learning_rate": 1.1327636605747926e-07, + "loss": 0.3694, + "step": 11479 + }, + { + "epoch": 5.42789598108747, + "grad_norm": 3.2203590869903564, + "learning_rate": 1.1309078705939031e-07, + "loss": 0.3153, + "step": 11480 + }, + { + "epoch": 5.428368794326241, + "grad_norm": 3.506516456604004, + "learning_rate": 1.1290535668367159e-07, + "loss": 0.3423, + "step": 11481 + }, + { + "epoch": 5.428841607565012, + "grad_norm": 4.035604953765869, + "learning_rate": 1.1272007494186887e-07, + "loss": 0.3452, + "step": 11482 + }, + { + "epoch": 5.429314420803783, + "grad_norm": 3.4485437870025635, + "learning_rate": 1.1253494184551961e-07, + "loss": 0.3539, + "step": 11483 + }, + { + "epoch": 5.4297872340425535, + "grad_norm": 3.707043409347534, + "learning_rate": 1.123499574061504e-07, + "loss": 0.3519, + "step": 11484 + }, + { + "epoch": 5.430260047281324, + "grad_norm": 4.046982765197754, + "learning_rate": 1.121651216352801e-07, + "loss": 0.333, + "step": 11485 + }, + { + "epoch": 5.430732860520094, + "grad_norm": 2.920891046524048, + "learning_rate": 1.1198043454441754e-07, + "loss": 0.3112, + "step": 11486 + }, + { + "epoch": 5.431205673758865, + "grad_norm": 3.5038676261901855, + "learning_rate": 1.1179589614506159e-07, + "loss": 0.3393, + "step": 11487 + }, + { + "epoch": 5.431678486997636, + "grad_norm": 3.349278211593628, + "learning_rate": 1.1161150644870389e-07, + "loss": 0.3814, + "step": 11488 + }, + { + "epoch": 5.432151300236407, + "grad_norm": 3.5188348293304443, + "learning_rate": 1.1142726546682469e-07, + "loss": 0.3791, + "step": 11489 + }, + { + "epoch": 5.432624113475177, + "grad_norm": 3.6900806427001953, + "learning_rate": 1.1124317321089595e-07, + "loss": 0.286, + "step": 11490 + }, + { + "epoch": 5.433096926713948, + "grad_norm": 2.804166793823242, + "learning_rate": 1.110592296923807e-07, + "loss": 0.2746, + "step": 11491 + }, + { + "epoch": 5.433569739952719, + "grad_norm": 3.1634747982025146, + "learning_rate": 1.1087543492273145e-07, + "loss": 0.3192, + "step": 11492 + }, + { + "epoch": 5.43404255319149, + "grad_norm": 3.8363192081451416, + "learning_rate": 1.106917889133935e-07, + "loss": 0.3442, + "step": 11493 + }, + { + "epoch": 5.43451536643026, + "grad_norm": 3.2438740730285645, + "learning_rate": 1.1050829167580073e-07, + "loss": 0.3487, + "step": 11494 + }, + { + "epoch": 5.4349881796690305, + "grad_norm": 3.2521681785583496, + "learning_rate": 1.1032494322137877e-07, + "loss": 0.3285, + "step": 11495 + }, + { + "epoch": 5.435460992907801, + "grad_norm": 2.9817872047424316, + "learning_rate": 1.1014174356154484e-07, + "loss": 0.2932, + "step": 11496 + }, + { + "epoch": 5.435933806146572, + "grad_norm": 3.4062857627868652, + "learning_rate": 1.0995869270770454e-07, + "loss": 0.3418, + "step": 11497 + }, + { + "epoch": 5.436406619385343, + "grad_norm": 3.4945690631866455, + "learning_rate": 1.097757906712571e-07, + "loss": 0.3411, + "step": 11498 + }, + { + "epoch": 5.436879432624114, + "grad_norm": 3.2375268936157227, + "learning_rate": 1.0959303746359007e-07, + "loss": 0.3243, + "step": 11499 + }, + { + "epoch": 5.4373522458628845, + "grad_norm": 3.2625772953033447, + "learning_rate": 1.0941043309608295e-07, + "loss": 0.3195, + "step": 11500 + }, + { + "epoch": 5.437825059101655, + "grad_norm": 3.473390579223633, + "learning_rate": 1.0922797758010584e-07, + "loss": 0.3755, + "step": 
11501 + }, + { + "epoch": 5.438297872340425, + "grad_norm": 3.1895859241485596, + "learning_rate": 1.0904567092701907e-07, + "loss": 0.2959, + "step": 11502 + }, + { + "epoch": 5.438770685579196, + "grad_norm": 3.3767261505126953, + "learning_rate": 1.0886351314817467e-07, + "loss": 0.3274, + "step": 11503 + }, + { + "epoch": 5.439243498817967, + "grad_norm": 3.0629384517669678, + "learning_rate": 1.0868150425491469e-07, + "loss": 0.3347, + "step": 11504 + }, + { + "epoch": 5.439716312056738, + "grad_norm": 3.68630051612854, + "learning_rate": 1.0849964425857174e-07, + "loss": 0.3857, + "step": 11505 + }, + { + "epoch": 5.440189125295508, + "grad_norm": 3.170816659927368, + "learning_rate": 1.0831793317046895e-07, + "loss": 0.3009, + "step": 11506 + }, + { + "epoch": 5.440661938534279, + "grad_norm": 3.1225314140319824, + "learning_rate": 1.0813637100192176e-07, + "loss": 0.3531, + "step": 11507 + }, + { + "epoch": 5.44113475177305, + "grad_norm": 3.2253336906433105, + "learning_rate": 1.079549577642347e-07, + "loss": 0.3218, + "step": 11508 + }, + { + "epoch": 5.441607565011821, + "grad_norm": 2.8939125537872314, + "learning_rate": 1.0777369346870376e-07, + "loss": 0.2759, + "step": 11509 + }, + { + "epoch": 5.442080378250591, + "grad_norm": 3.3873422145843506, + "learning_rate": 1.0759257812661489e-07, + "loss": 0.3115, + "step": 11510 + }, + { + "epoch": 5.4425531914893615, + "grad_norm": 3.2324378490448, + "learning_rate": 1.0741161174924603e-07, + "loss": 0.3287, + "step": 11511 + }, + { + "epoch": 5.443026004728132, + "grad_norm": 3.369802713394165, + "learning_rate": 1.0723079434786482e-07, + "loss": 0.3218, + "step": 11512 + }, + { + "epoch": 5.443498817966903, + "grad_norm": 3.3094370365142822, + "learning_rate": 1.0705012593373032e-07, + "loss": 0.3079, + "step": 11513 + }, + { + "epoch": 5.443971631205674, + "grad_norm": 3.8659892082214355, + "learning_rate": 1.0686960651809158e-07, + "loss": 0.3199, + "step": 11514 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 3.1500043869018555, + "learning_rate": 1.0668923611218851e-07, + "loss": 0.3224, + "step": 11515 + }, + { + "epoch": 5.444917257683215, + "grad_norm": 3.008122444152832, + "learning_rate": 1.0650901472725295e-07, + "loss": 0.2918, + "step": 11516 + }, + { + "epoch": 5.445390070921986, + "grad_norm": 3.1699118614196777, + "learning_rate": 1.0632894237450564e-07, + "loss": 0.3058, + "step": 11517 + }, + { + "epoch": 5.445862884160756, + "grad_norm": 2.907656192779541, + "learning_rate": 1.0614901906515901e-07, + "loss": 0.2993, + "step": 11518 + }, + { + "epoch": 5.446335697399527, + "grad_norm": 3.3137612342834473, + "learning_rate": 1.059692448104166e-07, + "loss": 0.2962, + "step": 11519 + }, + { + "epoch": 5.446808510638298, + "grad_norm": 3.1978371143341064, + "learning_rate": 1.0578961962147139e-07, + "loss": 0.3076, + "step": 11520 + }, + { + "epoch": 5.4472813238770685, + "grad_norm": 2.8629150390625, + "learning_rate": 1.056101435095086e-07, + "loss": 0.2902, + "step": 11521 + }, + { + "epoch": 5.447754137115839, + "grad_norm": 3.287623643875122, + "learning_rate": 1.0543081648570291e-07, + "loss": 0.2816, + "step": 11522 + }, + { + "epoch": 5.44822695035461, + "grad_norm": 3.0980396270751953, + "learning_rate": 1.0525163856122068e-07, + "loss": 0.3373, + "step": 11523 + }, + { + "epoch": 5.448699763593381, + "grad_norm": 3.444204092025757, + "learning_rate": 1.0507260974721795e-07, + "loss": 0.2918, + "step": 11524 + }, + { + "epoch": 5.449172576832151, + "grad_norm": 3.946662187576294, + "learning_rate": 
1.0489373005484248e-07, + "loss": 0.3052, + "step": 11525 + }, + { + "epoch": 5.449645390070922, + "grad_norm": 3.6034319400787354, + "learning_rate": 1.0471499949523234e-07, + "loss": 0.319, + "step": 11526 + }, + { + "epoch": 5.450118203309692, + "grad_norm": 3.5294625759124756, + "learning_rate": 1.0453641807951609e-07, + "loss": 0.3523, + "step": 11527 + }, + { + "epoch": 5.450591016548463, + "grad_norm": 3.2164146900177, + "learning_rate": 1.043579858188129e-07, + "loss": 0.3397, + "step": 11528 + }, + { + "epoch": 5.451063829787234, + "grad_norm": 3.060518741607666, + "learning_rate": 1.0417970272423388e-07, + "loss": 0.3483, + "step": 11529 + }, + { + "epoch": 5.451536643026005, + "grad_norm": 3.1823830604553223, + "learning_rate": 1.0400156880687906e-07, + "loss": 0.3493, + "step": 11530 + }, + { + "epoch": 5.452009456264776, + "grad_norm": 3.1154868602752686, + "learning_rate": 1.0382358407784065e-07, + "loss": 0.2982, + "step": 11531 + }, + { + "epoch": 5.452482269503546, + "grad_norm": 3.1093451976776123, + "learning_rate": 1.0364574854820064e-07, + "loss": 0.2735, + "step": 11532 + }, + { + "epoch": 5.452955082742317, + "grad_norm": 3.1191015243530273, + "learning_rate": 1.0346806222903211e-07, + "loss": 0.3361, + "step": 11533 + }, + { + "epoch": 5.453427895981087, + "grad_norm": 3.328355312347412, + "learning_rate": 1.0329052513139898e-07, + "loss": 0.296, + "step": 11534 + }, + { + "epoch": 5.453900709219858, + "grad_norm": 2.975257158279419, + "learning_rate": 1.0311313726635546e-07, + "loss": 0.2943, + "step": 11535 + }, + { + "epoch": 5.454373522458629, + "grad_norm": 2.889234781265259, + "learning_rate": 1.029358986449469e-07, + "loss": 0.3056, + "step": 11536 + }, + { + "epoch": 5.4548463356973995, + "grad_norm": 3.158823251724243, + "learning_rate": 1.0275880927820948e-07, + "loss": 0.3092, + "step": 11537 + }, + { + "epoch": 5.45531914893617, + "grad_norm": 3.256237506866455, + "learning_rate": 1.0258186917716906e-07, + "loss": 0.3353, + "step": 11538 + }, + { + "epoch": 5.455791962174941, + "grad_norm": 3.170668125152588, + "learning_rate": 1.0240507835284353e-07, + "loss": 0.2833, + "step": 11539 + }, + { + "epoch": 5.456264775413712, + "grad_norm": 3.2530853748321533, + "learning_rate": 1.0222843681624045e-07, + "loss": 0.3327, + "step": 11540 + }, + { + "epoch": 5.456737588652482, + "grad_norm": 3.224292516708374, + "learning_rate": 1.0205194457835854e-07, + "loss": 0.3421, + "step": 11541 + }, + { + "epoch": 5.457210401891253, + "grad_norm": 3.392624855041504, + "learning_rate": 1.0187560165018761e-07, + "loss": 0.4025, + "step": 11542 + }, + { + "epoch": 5.457683215130023, + "grad_norm": 3.517279863357544, + "learning_rate": 1.0169940804270723e-07, + "loss": 0.3634, + "step": 11543 + }, + { + "epoch": 5.458156028368794, + "grad_norm": 3.68953275680542, + "learning_rate": 1.0152336376688887e-07, + "loss": 0.3512, + "step": 11544 + }, + { + "epoch": 5.458628841607565, + "grad_norm": 3.429471492767334, + "learning_rate": 1.013474688336935e-07, + "loss": 0.3601, + "step": 11545 + }, + { + "epoch": 5.459101654846336, + "grad_norm": 3.0927700996398926, + "learning_rate": 1.0117172325407321e-07, + "loss": 0.3181, + "step": 11546 + }, + { + "epoch": 5.459574468085107, + "grad_norm": 3.3231961727142334, + "learning_rate": 1.0099612703897144e-07, + "loss": 0.3376, + "step": 11547 + }, + { + "epoch": 5.460047281323877, + "grad_norm": 3.1489665508270264, + "learning_rate": 1.008206801993214e-07, + "loss": 0.2726, + "step": 11548 + }, + { + "epoch": 5.460520094562648, + 
"grad_norm": 2.998042345046997, + "learning_rate": 1.006453827460474e-07, + "loss": 0.3258, + "step": 11549 + }, + { + "epoch": 5.460992907801418, + "grad_norm": 3.317922592163086, + "learning_rate": 1.0047023469006489e-07, + "loss": 0.3156, + "step": 11550 + }, + { + "epoch": 5.461465721040189, + "grad_norm": 3.0401973724365234, + "learning_rate": 1.0029523604227904e-07, + "loss": 0.3063, + "step": 11551 + }, + { + "epoch": 5.46193853427896, + "grad_norm": 2.92132568359375, + "learning_rate": 1.0012038681358638e-07, + "loss": 0.3335, + "step": 11552 + }, + { + "epoch": 5.4624113475177305, + "grad_norm": 3.06042742729187, + "learning_rate": 9.994568701487434e-08, + "loss": 0.3317, + "step": 11553 + }, + { + "epoch": 5.462884160756501, + "grad_norm": 2.959724187850952, + "learning_rate": 9.977113665702059e-08, + "loss": 0.3015, + "step": 11554 + }, + { + "epoch": 5.463356973995272, + "grad_norm": 3.12992787361145, + "learning_rate": 9.959673575089313e-08, + "loss": 0.366, + "step": 11555 + }, + { + "epoch": 5.463829787234043, + "grad_norm": 3.3487796783447266, + "learning_rate": 9.94224843073513e-08, + "loss": 0.3029, + "step": 11556 + }, + { + "epoch": 5.464302600472813, + "grad_norm": 3.027540445327759, + "learning_rate": 9.92483823372456e-08, + "loss": 0.3269, + "step": 11557 + }, + { + "epoch": 5.464775413711584, + "grad_norm": 3.2602925300598145, + "learning_rate": 9.907442985141569e-08, + "loss": 0.3376, + "step": 11558 + }, + { + "epoch": 5.465248226950354, + "grad_norm": 3.346432685852051, + "learning_rate": 9.890062686069374e-08, + "loss": 0.3409, + "step": 11559 + }, + { + "epoch": 5.465721040189125, + "grad_norm": 3.288007974624634, + "learning_rate": 9.872697337590109e-08, + "loss": 0.3229, + "step": 11560 + }, + { + "epoch": 5.466193853427896, + "grad_norm": 3.5697832107543945, + "learning_rate": 9.85534694078502e-08, + "loss": 0.4162, + "step": 11561 + }, + { + "epoch": 5.466666666666667, + "grad_norm": 3.2786145210266113, + "learning_rate": 9.838011496734523e-08, + "loss": 0.3339, + "step": 11562 + }, + { + "epoch": 5.467139479905438, + "grad_norm": 3.6333465576171875, + "learning_rate": 9.820691006517947e-08, + "loss": 0.2769, + "step": 11563 + }, + { + "epoch": 5.467612293144208, + "grad_norm": 3.409446954727173, + "learning_rate": 9.803385471213788e-08, + "loss": 0.314, + "step": 11564 + }, + { + "epoch": 5.468085106382979, + "grad_norm": 3.376220941543579, + "learning_rate": 9.786094891899606e-08, + "loss": 0.3211, + "step": 11565 + }, + { + "epoch": 5.468557919621749, + "grad_norm": 3.073551893234253, + "learning_rate": 9.768819269651952e-08, + "loss": 0.2867, + "step": 11566 + }, + { + "epoch": 5.46903073286052, + "grad_norm": 2.9466781616210938, + "learning_rate": 9.75155860554658e-08, + "loss": 0.3323, + "step": 11567 + }, + { + "epoch": 5.469503546099291, + "grad_norm": 2.8090531826019287, + "learning_rate": 9.734312900658182e-08, + "loss": 0.2598, + "step": 11568 + }, + { + "epoch": 5.4699763593380615, + "grad_norm": 2.9262332916259766, + "learning_rate": 9.717082156060598e-08, + "loss": 0.3059, + "step": 11569 + }, + { + "epoch": 5.470449172576832, + "grad_norm": 3.2151377201080322, + "learning_rate": 9.699866372826661e-08, + "loss": 0.2855, + "step": 11570 + }, + { + "epoch": 5.470921985815603, + "grad_norm": 3.633512020111084, + "learning_rate": 9.682665552028404e-08, + "loss": 0.3495, + "step": 11571 + }, + { + "epoch": 5.471394799054374, + "grad_norm": 3.3621714115142822, + "learning_rate": 9.665479694736773e-08, + "loss": 0.2937, + "step": 11572 + }, + { + 
"epoch": 5.471867612293144, + "grad_norm": 2.9446847438812256, + "learning_rate": 9.648308802021916e-08, + "loss": 0.297, + "step": 11573 + }, + { + "epoch": 5.472340425531915, + "grad_norm": 3.4728331565856934, + "learning_rate": 9.63115287495292e-08, + "loss": 0.3365, + "step": 11574 + }, + { + "epoch": 5.472813238770685, + "grad_norm": 3.0443341732025146, + "learning_rate": 9.614011914598071e-08, + "loss": 0.3059, + "step": 11575 + }, + { + "epoch": 5.473286052009456, + "grad_norm": 3.659133195877075, + "learning_rate": 9.596885922024623e-08, + "loss": 0.3561, + "step": 11576 + }, + { + "epoch": 5.473758865248227, + "grad_norm": 3.166276216506958, + "learning_rate": 9.579774898298976e-08, + "loss": 0.2852, + "step": 11577 + }, + { + "epoch": 5.474231678486998, + "grad_norm": 3.019780158996582, + "learning_rate": 9.562678844486528e-08, + "loss": 0.3586, + "step": 11578 + }, + { + "epoch": 5.474704491725769, + "grad_norm": 3.729064464569092, + "learning_rate": 9.545597761651759e-08, + "loss": 0.3311, + "step": 11579 + }, + { + "epoch": 5.475177304964539, + "grad_norm": 3.478250026702881, + "learning_rate": 9.528531650858291e-08, + "loss": 0.3097, + "step": 11580 + }, + { + "epoch": 5.47565011820331, + "grad_norm": 2.939131021499634, + "learning_rate": 9.511480513168691e-08, + "loss": 0.2958, + "step": 11581 + }, + { + "epoch": 5.47612293144208, + "grad_norm": 3.405210256576538, + "learning_rate": 9.494444349644721e-08, + "loss": 0.2855, + "step": 11582 + }, + { + "epoch": 5.476595744680851, + "grad_norm": 3.166900873184204, + "learning_rate": 9.477423161347088e-08, + "loss": 0.278, + "step": 11583 + }, + { + "epoch": 5.477068557919622, + "grad_norm": 3.497921943664551, + "learning_rate": 9.460416949335665e-08, + "loss": 0.3507, + "step": 11584 + }, + { + "epoch": 5.4775413711583925, + "grad_norm": 2.780923843383789, + "learning_rate": 9.443425714669358e-08, + "loss": 0.2639, + "step": 11585 + }, + { + "epoch": 5.478014184397163, + "grad_norm": 3.697463274002075, + "learning_rate": 9.426449458406123e-08, + "loss": 0.3468, + "step": 11586 + }, + { + "epoch": 5.478486997635934, + "grad_norm": 3.344977617263794, + "learning_rate": 9.409488181602977e-08, + "loss": 0.3299, + "step": 11587 + }, + { + "epoch": 5.478959810874705, + "grad_norm": 3.280545949935913, + "learning_rate": 9.392541885316075e-08, + "loss": 0.3453, + "step": 11588 + }, + { + "epoch": 5.479432624113475, + "grad_norm": 4.446796894073486, + "learning_rate": 9.375610570600518e-08, + "loss": 0.3241, + "step": 11589 + }, + { + "epoch": 5.479905437352246, + "grad_norm": 3.322175979614258, + "learning_rate": 9.358694238510602e-08, + "loss": 0.3399, + "step": 11590 + }, + { + "epoch": 5.480378250591016, + "grad_norm": 3.0891571044921875, + "learning_rate": 9.341792890099621e-08, + "loss": 0.3048, + "step": 11591 + }, + { + "epoch": 5.480851063829787, + "grad_norm": 2.99324631690979, + "learning_rate": 9.32490652641993e-08, + "loss": 0.2799, + "step": 11592 + }, + { + "epoch": 5.481323877068558, + "grad_norm": 3.1435763835906982, + "learning_rate": 9.308035148522993e-08, + "loss": 0.3421, + "step": 11593 + }, + { + "epoch": 5.481796690307329, + "grad_norm": 3.177278518676758, + "learning_rate": 9.291178757459274e-08, + "loss": 0.3049, + "step": 11594 + }, + { + "epoch": 5.4822695035460995, + "grad_norm": 3.1422131061553955, + "learning_rate": 9.274337354278406e-08, + "loss": 0.3157, + "step": 11595 + }, + { + "epoch": 5.48274231678487, + "grad_norm": 3.261337995529175, + "learning_rate": 9.257510940029024e-08, + "loss": 0.3353, + 
"step": 11596 + }, + { + "epoch": 5.48321513002364, + "grad_norm": 3.239210367202759, + "learning_rate": 9.24069951575876e-08, + "loss": 0.3122, + "step": 11597 + }, + { + "epoch": 5.483687943262411, + "grad_norm": 2.982391595840454, + "learning_rate": 9.223903082514474e-08, + "loss": 0.318, + "step": 11598 + }, + { + "epoch": 5.484160756501182, + "grad_norm": 3.305668592453003, + "learning_rate": 9.207121641341937e-08, + "loss": 0.3537, + "step": 11599 + }, + { + "epoch": 5.484633569739953, + "grad_norm": 3.434743881225586, + "learning_rate": 9.190355193286121e-08, + "loss": 0.3408, + "step": 11600 + }, + { + "epoch": 5.485106382978723, + "grad_norm": 3.5249133110046387, + "learning_rate": 9.173603739390996e-08, + "loss": 0.2927, + "step": 11601 + }, + { + "epoch": 5.485579196217494, + "grad_norm": 3.4672904014587402, + "learning_rate": 9.156867280699533e-08, + "loss": 0.3742, + "step": 11602 + }, + { + "epoch": 5.486052009456265, + "grad_norm": 3.588812828063965, + "learning_rate": 9.140145818253898e-08, + "loss": 0.3527, + "step": 11603 + }, + { + "epoch": 5.486524822695036, + "grad_norm": 2.998471260070801, + "learning_rate": 9.123439353095231e-08, + "loss": 0.3108, + "step": 11604 + }, + { + "epoch": 5.486997635933806, + "grad_norm": 3.70161509513855, + "learning_rate": 9.106747886263784e-08, + "loss": 0.3732, + "step": 11605 + }, + { + "epoch": 5.4874704491725765, + "grad_norm": 3.575035333633423, + "learning_rate": 9.090071418798862e-08, + "loss": 0.3097, + "step": 11606 + }, + { + "epoch": 5.487943262411347, + "grad_norm": 2.9582679271698, + "learning_rate": 9.073409951738832e-08, + "loss": 0.31, + "step": 11607 + }, + { + "epoch": 5.488416075650118, + "grad_norm": 2.889258861541748, + "learning_rate": 9.05676348612114e-08, + "loss": 0.321, + "step": 11608 + }, + { + "epoch": 5.488888888888889, + "grad_norm": 3.2909512519836426, + "learning_rate": 9.040132022982262e-08, + "loss": 0.2967, + "step": 11609 + }, + { + "epoch": 5.48936170212766, + "grad_norm": 3.4053189754486084, + "learning_rate": 9.023515563357815e-08, + "loss": 0.3731, + "step": 11610 + }, + { + "epoch": 5.4898345153664305, + "grad_norm": 3.4993951320648193, + "learning_rate": 9.006914108282388e-08, + "loss": 0.3544, + "step": 11611 + }, + { + "epoch": 5.490307328605201, + "grad_norm": 3.1488454341888428, + "learning_rate": 8.990327658789683e-08, + "loss": 0.3326, + "step": 11612 + }, + { + "epoch": 5.490780141843971, + "grad_norm": 3.155266523361206, + "learning_rate": 8.97375621591251e-08, + "loss": 0.284, + "step": 11613 + }, + { + "epoch": 5.491252955082742, + "grad_norm": 3.5206151008605957, + "learning_rate": 8.957199780682657e-08, + "loss": 0.3104, + "step": 11614 + }, + { + "epoch": 5.491725768321513, + "grad_norm": 3.3629000186920166, + "learning_rate": 8.94065835413102e-08, + "loss": 0.3524, + "step": 11615 + }, + { + "epoch": 5.492198581560284, + "grad_norm": 2.872361183166504, + "learning_rate": 8.924131937287583e-08, + "loss": 0.2656, + "step": 11616 + }, + { + "epoch": 5.492671394799054, + "grad_norm": 3.2643532752990723, + "learning_rate": 8.907620531181382e-08, + "loss": 0.3039, + "step": 11617 + }, + { + "epoch": 5.493144208037825, + "grad_norm": 3.0820932388305664, + "learning_rate": 8.891124136840484e-08, + "loss": 0.2849, + "step": 11618 + }, + { + "epoch": 5.493617021276596, + "grad_norm": 3.398268222808838, + "learning_rate": 8.87464275529204e-08, + "loss": 0.275, + "step": 11619 + }, + { + "epoch": 5.494089834515367, + "grad_norm": 3.5962584018707275, + "learning_rate": 8.858176387562284e-08, + 
"loss": 0.3416, + "step": 11620 + }, + { + "epoch": 5.494562647754137, + "grad_norm": 3.0873095989227295, + "learning_rate": 8.841725034676535e-08, + "loss": 0.3669, + "step": 11621 + }, + { + "epoch": 5.4950354609929075, + "grad_norm": 3.2669544219970703, + "learning_rate": 8.825288697659084e-08, + "loss": 0.3739, + "step": 11622 + }, + { + "epoch": 5.495508274231678, + "grad_norm": 3.4504950046539307, + "learning_rate": 8.80886737753342e-08, + "loss": 0.295, + "step": 11623 + }, + { + "epoch": 5.495981087470449, + "grad_norm": 3.5143895149230957, + "learning_rate": 8.792461075322e-08, + "loss": 0.3655, + "step": 11624 + }, + { + "epoch": 5.49645390070922, + "grad_norm": 3.2914931774139404, + "learning_rate": 8.776069792046316e-08, + "loss": 0.2812, + "step": 11625 + }, + { + "epoch": 5.496926713947991, + "grad_norm": 3.2811596393585205, + "learning_rate": 8.759693528727075e-08, + "loss": 0.3118, + "step": 11626 + }, + { + "epoch": 5.4973995271867615, + "grad_norm": 2.9466712474823, + "learning_rate": 8.743332286383882e-08, + "loss": 0.3293, + "step": 11627 + }, + { + "epoch": 5.497872340425532, + "grad_norm": 3.4942102432250977, + "learning_rate": 8.726986066035531e-08, + "loss": 0.3452, + "step": 11628 + }, + { + "epoch": 5.498345153664302, + "grad_norm": 3.010925054550171, + "learning_rate": 8.71065486869982e-08, + "loss": 0.2651, + "step": 11629 + }, + { + "epoch": 5.498817966903073, + "grad_norm": 3.4343719482421875, + "learning_rate": 8.694338695393573e-08, + "loss": 0.2914, + "step": 11630 + }, + { + "epoch": 5.499290780141844, + "grad_norm": 3.32430362701416, + "learning_rate": 8.678037547132784e-08, + "loss": 0.3505, + "step": 11631 + }, + { + "epoch": 5.499763593380615, + "grad_norm": 3.294867992401123, + "learning_rate": 8.661751424932419e-08, + "loss": 0.3154, + "step": 11632 + }, + { + "epoch": 5.500236406619385, + "grad_norm": 3.3598666191101074, + "learning_rate": 8.645480329806583e-08, + "loss": 0.3787, + "step": 11633 + }, + { + "epoch": 5.500709219858156, + "grad_norm": 3.0285251140594482, + "learning_rate": 8.629224262768382e-08, + "loss": 0.3218, + "step": 11634 + }, + { + "epoch": 5.501182033096927, + "grad_norm": 3.117607355117798, + "learning_rate": 8.612983224829952e-08, + "loss": 0.3229, + "step": 11635 + }, + { + "epoch": 5.501654846335697, + "grad_norm": 3.201794385910034, + "learning_rate": 8.596757217002649e-08, + "loss": 0.3228, + "step": 11636 + }, + { + "epoch": 5.502127659574468, + "grad_norm": 3.0462050437927246, + "learning_rate": 8.58054624029675e-08, + "loss": 0.334, + "step": 11637 + }, + { + "epoch": 5.5026004728132385, + "grad_norm": 3.228955030441284, + "learning_rate": 8.564350295721613e-08, + "loss": 0.365, + "step": 11638 + }, + { + "epoch": 5.503073286052009, + "grad_norm": 3.2337913513183594, + "learning_rate": 8.548169384285765e-08, + "loss": 0.3145, + "step": 11639 + }, + { + "epoch": 5.50354609929078, + "grad_norm": 3.1510117053985596, + "learning_rate": 8.532003506996623e-08, + "loss": 0.3456, + "step": 11640 + }, + { + "epoch": 5.504018912529551, + "grad_norm": 3.1749494075775146, + "learning_rate": 8.51585266486088e-08, + "loss": 0.3133, + "step": 11641 + }, + { + "epoch": 5.504491725768322, + "grad_norm": 3.043325185775757, + "learning_rate": 8.499716858884094e-08, + "loss": 0.2652, + "step": 11642 + }, + { + "epoch": 5.5049645390070925, + "grad_norm": 3.722120761871338, + "learning_rate": 8.483596090070962e-08, + "loss": 0.3797, + "step": 11643 + }, + { + "epoch": 5.505437352245863, + "grad_norm": 3.287811756134033, + "learning_rate": 
8.46749035942529e-08, + "loss": 0.3523, + "step": 11644 + }, + { + "epoch": 5.505910165484633, + "grad_norm": 3.303345203399658, + "learning_rate": 8.451399667949917e-08, + "loss": 0.2991, + "step": 11645 + }, + { + "epoch": 5.506382978723404, + "grad_norm": 3.18681001663208, + "learning_rate": 8.435324016646734e-08, + "loss": 0.2794, + "step": 11646 + }, + { + "epoch": 5.506855791962175, + "grad_norm": 3.742612361907959, + "learning_rate": 8.419263406516692e-08, + "loss": 0.319, + "step": 11647 + }, + { + "epoch": 5.507328605200946, + "grad_norm": 3.2310562133789062, + "learning_rate": 8.403217838559796e-08, + "loss": 0.2719, + "step": 11648 + }, + { + "epoch": 5.507801418439716, + "grad_norm": 3.409172773361206, + "learning_rate": 8.387187313775191e-08, + "loss": 0.3047, + "step": 11649 + }, + { + "epoch": 5.508274231678487, + "grad_norm": 2.940403699874878, + "learning_rate": 8.371171833160996e-08, + "loss": 0.2628, + "step": 11650 + }, + { + "epoch": 5.508747044917258, + "grad_norm": 3.0617125034332275, + "learning_rate": 8.355171397714413e-08, + "loss": 0.3272, + "step": 11651 + }, + { + "epoch": 5.509219858156028, + "grad_norm": 3.500164270401001, + "learning_rate": 8.339186008431726e-08, + "loss": 0.3373, + "step": 11652 + }, + { + "epoch": 5.509692671394799, + "grad_norm": 2.9918899536132812, + "learning_rate": 8.323215666308227e-08, + "loss": 0.2424, + "step": 11653 + }, + { + "epoch": 5.5101654846335695, + "grad_norm": 3.025858163833618, + "learning_rate": 8.307260372338421e-08, + "loss": 0.3076, + "step": 11654 + }, + { + "epoch": 5.51063829787234, + "grad_norm": 3.5850777626037598, + "learning_rate": 8.291320127515684e-08, + "loss": 0.4006, + "step": 11655 + }, + { + "epoch": 5.511111111111111, + "grad_norm": 3.1886472702026367, + "learning_rate": 8.275394932832609e-08, + "loss": 0.3171, + "step": 11656 + }, + { + "epoch": 5.511583924349882, + "grad_norm": 3.2190792560577393, + "learning_rate": 8.259484789280763e-08, + "loss": 0.346, + "step": 11657 + }, + { + "epoch": 5.512056737588653, + "grad_norm": 3.3583483695983887, + "learning_rate": 8.243589697850774e-08, + "loss": 0.3635, + "step": 11658 + }, + { + "epoch": 5.5125295508274235, + "grad_norm": 3.2143733501434326, + "learning_rate": 8.227709659532429e-08, + "loss": 0.2806, + "step": 11659 + }, + { + "epoch": 5.513002364066194, + "grad_norm": 3.288947582244873, + "learning_rate": 8.211844675314412e-08, + "loss": 0.2995, + "step": 11660 + }, + { + "epoch": 5.513475177304964, + "grad_norm": 3.106860637664795, + "learning_rate": 8.195994746184655e-08, + "loss": 0.3263, + "step": 11661 + }, + { + "epoch": 5.513947990543735, + "grad_norm": 3.416684627532959, + "learning_rate": 8.180159873130006e-08, + "loss": 0.327, + "step": 11662 + }, + { + "epoch": 5.514420803782506, + "grad_norm": 3.3379695415496826, + "learning_rate": 8.164340057136455e-08, + "loss": 0.3585, + "step": 11663 + }, + { + "epoch": 5.514893617021277, + "grad_norm": 3.3482465744018555, + "learning_rate": 8.148535299189048e-08, + "loss": 0.3341, + "step": 11664 + }, + { + "epoch": 5.515366430260047, + "grad_norm": 3.6307144165039062, + "learning_rate": 8.132745600271858e-08, + "loss": 0.3141, + "step": 11665 + }, + { + "epoch": 5.515839243498818, + "grad_norm": 3.4326119422912598, + "learning_rate": 8.116970961368042e-08, + "loss": 0.3056, + "step": 11666 + }, + { + "epoch": 5.516312056737589, + "grad_norm": 3.681443452835083, + "learning_rate": 8.101211383459817e-08, + "loss": 0.2637, + "step": 11667 + }, + { + "epoch": 5.516784869976359, + "grad_norm": 
3.742844343185425, + "learning_rate": 8.085466867528452e-08, + "loss": 0.3315, + "step": 11668 + }, + { + "epoch": 5.51725768321513, + "grad_norm": 3.017211675643921, + "learning_rate": 8.069737414554302e-08, + "loss": 0.3134, + "step": 11669 + }, + { + "epoch": 5.5177304964539005, + "grad_norm": 3.2987756729125977, + "learning_rate": 8.054023025516805e-08, + "loss": 0.3359, + "step": 11670 + }, + { + "epoch": 5.518203309692671, + "grad_norm": 3.357034683227539, + "learning_rate": 8.038323701394346e-08, + "loss": 0.2902, + "step": 11671 + }, + { + "epoch": 5.518676122931442, + "grad_norm": 3.073740005493164, + "learning_rate": 8.022639443164531e-08, + "loss": 0.3432, + "step": 11672 + }, + { + "epoch": 5.519148936170213, + "grad_norm": 3.571768045425415, + "learning_rate": 8.006970251803886e-08, + "loss": 0.3142, + "step": 11673 + }, + { + "epoch": 5.519621749408984, + "grad_norm": 3.1152892112731934, + "learning_rate": 7.991316128288129e-08, + "loss": 0.2887, + "step": 11674 + }, + { + "epoch": 5.520094562647754, + "grad_norm": 3.6066105365753174, + "learning_rate": 7.975677073591925e-08, + "loss": 0.3432, + "step": 11675 + }, + { + "epoch": 5.520567375886525, + "grad_norm": 3.2668449878692627, + "learning_rate": 7.960053088689052e-08, + "loss": 0.3382, + "step": 11676 + }, + { + "epoch": 5.521040189125295, + "grad_norm": 2.9738216400146484, + "learning_rate": 7.944444174552395e-08, + "loss": 0.3304, + "step": 11677 + }, + { + "epoch": 5.521513002364066, + "grad_norm": 3.436067819595337, + "learning_rate": 7.928850332153793e-08, + "loss": 0.2707, + "step": 11678 + }, + { + "epoch": 5.521985815602837, + "grad_norm": 3.2292418479919434, + "learning_rate": 7.913271562464242e-08, + "loss": 0.3526, + "step": 11679 + }, + { + "epoch": 5.5224586288416075, + "grad_norm": 3.575700044631958, + "learning_rate": 7.897707866453746e-08, + "loss": 0.3719, + "step": 11680 + }, + { + "epoch": 5.522931442080378, + "grad_norm": 3.5845069885253906, + "learning_rate": 7.88215924509142e-08, + "loss": 0.3398, + "step": 11681 + }, + { + "epoch": 5.523404255319149, + "grad_norm": 3.5110747814178467, + "learning_rate": 7.866625699345376e-08, + "loss": 0.3762, + "step": 11682 + }, + { + "epoch": 5.52387706855792, + "grad_norm": 3.4815001487731934, + "learning_rate": 7.851107230182786e-08, + "loss": 0.3138, + "step": 11683 + }, + { + "epoch": 5.52434988179669, + "grad_norm": 3.2306344509124756, + "learning_rate": 7.835603838569989e-08, + "loss": 0.3055, + "step": 11684 + }, + { + "epoch": 5.524822695035461, + "grad_norm": 3.54038143157959, + "learning_rate": 7.820115525472294e-08, + "loss": 0.3291, + "step": 11685 + }, + { + "epoch": 5.525295508274231, + "grad_norm": 3.423480987548828, + "learning_rate": 7.804642291854043e-08, + "loss": 0.3283, + "step": 11686 + }, + { + "epoch": 5.525768321513002, + "grad_norm": 3.666048765182495, + "learning_rate": 7.78918413867874e-08, + "loss": 0.3376, + "step": 11687 + }, + { + "epoch": 5.526241134751773, + "grad_norm": 2.820924997329712, + "learning_rate": 7.773741066908896e-08, + "loss": 0.3115, + "step": 11688 + }, + { + "epoch": 5.526713947990544, + "grad_norm": 3.3126847743988037, + "learning_rate": 7.758313077506018e-08, + "loss": 0.3428, + "step": 11689 + }, + { + "epoch": 5.527186761229315, + "grad_norm": 3.6306893825531006, + "learning_rate": 7.742900171430839e-08, + "loss": 0.3362, + "step": 11690 + }, + { + "epoch": 5.527659574468085, + "grad_norm": 3.184955358505249, + "learning_rate": 7.727502349642952e-08, + "loss": 0.3129, + "step": 11691 + }, + { + "epoch": 
5.528132387706856, + "grad_norm": 3.0288753509521484, + "learning_rate": 7.712119613101199e-08, + "loss": 0.2864, + "step": 11692 + }, + { + "epoch": 5.528605200945626, + "grad_norm": 3.0057592391967773, + "learning_rate": 7.696751962763343e-08, + "loss": 0.3508, + "step": 11693 + }, + { + "epoch": 5.529078014184397, + "grad_norm": 3.4207417964935303, + "learning_rate": 7.68139939958626e-08, + "loss": 0.3392, + "step": 11694 + }, + { + "epoch": 5.529550827423168, + "grad_norm": 3.4315693378448486, + "learning_rate": 7.66606192452593e-08, + "loss": 0.3296, + "step": 11695 + }, + { + "epoch": 5.5300236406619385, + "grad_norm": 3.496645927429199, + "learning_rate": 7.650739538537317e-08, + "loss": 0.3628, + "step": 11696 + }, + { + "epoch": 5.530496453900709, + "grad_norm": 3.160109281539917, + "learning_rate": 7.635432242574486e-08, + "loss": 0.3529, + "step": 11697 + }, + { + "epoch": 5.53096926713948, + "grad_norm": 3.434227228164673, + "learning_rate": 7.62014003759054e-08, + "loss": 0.3261, + "step": 11698 + }, + { + "epoch": 5.531442080378251, + "grad_norm": 3.334365129470825, + "learning_rate": 7.604862924537659e-08, + "loss": 0.3512, + "step": 11699 + }, + { + "epoch": 5.531914893617021, + "grad_norm": 3.228555917739868, + "learning_rate": 7.589600904367111e-08, + "loss": 0.3433, + "step": 11700 + }, + { + "epoch": 5.532387706855792, + "grad_norm": 3.4976108074188232, + "learning_rate": 7.574353978029164e-08, + "loss": 0.3406, + "step": 11701 + }, + { + "epoch": 5.532860520094562, + "grad_norm": 3.207275152206421, + "learning_rate": 7.559122146473197e-08, + "loss": 0.2936, + "step": 11702 + }, + { + "epoch": 5.533333333333333, + "grad_norm": 3.077650547027588, + "learning_rate": 7.543905410647645e-08, + "loss": 0.3193, + "step": 11703 + }, + { + "epoch": 5.533806146572104, + "grad_norm": 3.0646450519561768, + "learning_rate": 7.528703771499918e-08, + "loss": 0.2966, + "step": 11704 + }, + { + "epoch": 5.534278959810875, + "grad_norm": 3.450676202774048, + "learning_rate": 7.513517229976646e-08, + "loss": 0.335, + "step": 11705 + }, + { + "epoch": 5.534751773049646, + "grad_norm": 3.3997511863708496, + "learning_rate": 7.498345787023353e-08, + "loss": 0.3309, + "step": 11706 + }, + { + "epoch": 5.535224586288416, + "grad_norm": 3.0745410919189453, + "learning_rate": 7.483189443584754e-08, + "loss": 0.3025, + "step": 11707 + }, + { + "epoch": 5.535697399527187, + "grad_norm": 3.0367109775543213, + "learning_rate": 7.468048200604539e-08, + "loss": 0.3248, + "step": 11708 + }, + { + "epoch": 5.536170212765957, + "grad_norm": 3.1814827919006348, + "learning_rate": 7.45292205902548e-08, + "loss": 0.32, + "step": 11709 + }, + { + "epoch": 5.536643026004728, + "grad_norm": 3.1039483547210693, + "learning_rate": 7.437811019789437e-08, + "loss": 0.2985, + "step": 11710 + }, + { + "epoch": 5.537115839243499, + "grad_norm": 3.4284253120422363, + "learning_rate": 7.422715083837296e-08, + "loss": 0.3141, + "step": 11711 + }, + { + "epoch": 5.5375886524822695, + "grad_norm": 2.922107458114624, + "learning_rate": 7.407634252109025e-08, + "loss": 0.3227, + "step": 11712 + }, + { + "epoch": 5.53806146572104, + "grad_norm": 2.925189733505249, + "learning_rate": 7.392568525543625e-08, + "loss": 0.2847, + "step": 11713 + }, + { + "epoch": 5.538534278959811, + "grad_norm": 3.3388655185699463, + "learning_rate": 7.377517905079179e-08, + "loss": 0.3311, + "step": 11714 + }, + { + "epoch": 5.539007092198582, + "grad_norm": 3.5923469066619873, + "learning_rate": 7.362482391652853e-08, + "loss": 0.2915, + 
"step": 11715 + }, + { + "epoch": 5.539479905437352, + "grad_norm": 3.1141269207000732, + "learning_rate": 7.347461986200788e-08, + "loss": 0.2816, + "step": 11716 + }, + { + "epoch": 5.539952718676123, + "grad_norm": 3.2038733959198, + "learning_rate": 7.332456689658263e-08, + "loss": 0.3387, + "step": 11717 + }, + { + "epoch": 5.540425531914893, + "grad_norm": 3.1562764644622803, + "learning_rate": 7.317466502959585e-08, + "loss": 0.3439, + "step": 11718 + }, + { + "epoch": 5.540898345153664, + "grad_norm": 3.1141879558563232, + "learning_rate": 7.302491427038149e-08, + "loss": 0.3039, + "step": 11719 + }, + { + "epoch": 5.541371158392435, + "grad_norm": 3.2877326011657715, + "learning_rate": 7.287531462826375e-08, + "loss": 0.2694, + "step": 11720 + }, + { + "epoch": 5.541843971631206, + "grad_norm": 3.3915371894836426, + "learning_rate": 7.272586611255766e-08, + "loss": 0.3022, + "step": 11721 + }, + { + "epoch": 5.542316784869977, + "grad_norm": 2.9387362003326416, + "learning_rate": 7.257656873256858e-08, + "loss": 0.3702, + "step": 11722 + }, + { + "epoch": 5.542789598108747, + "grad_norm": 3.7734436988830566, + "learning_rate": 7.242742249759265e-08, + "loss": 0.3237, + "step": 11723 + }, + { + "epoch": 5.543262411347518, + "grad_norm": 3.3691086769104004, + "learning_rate": 7.22784274169161e-08, + "loss": 0.3418, + "step": 11724 + }, + { + "epoch": 5.543735224586288, + "grad_norm": 3.0379209518432617, + "learning_rate": 7.21295834998173e-08, + "loss": 0.3512, + "step": 11725 + }, + { + "epoch": 5.544208037825059, + "grad_norm": 3.5555078983306885, + "learning_rate": 7.198089075556302e-08, + "loss": 0.3474, + "step": 11726 + }, + { + "epoch": 5.54468085106383, + "grad_norm": 3.1998801231384277, + "learning_rate": 7.183234919341226e-08, + "loss": 0.3437, + "step": 11727 + }, + { + "epoch": 5.5451536643026005, + "grad_norm": 3.2798357009887695, + "learning_rate": 7.168395882261397e-08, + "loss": 0.3047, + "step": 11728 + }, + { + "epoch": 5.545626477541371, + "grad_norm": 3.1311612129211426, + "learning_rate": 7.153571965240774e-08, + "loss": 0.3099, + "step": 11729 + }, + { + "epoch": 5.546099290780142, + "grad_norm": 3.3740992546081543, + "learning_rate": 7.138763169202367e-08, + "loss": 0.3337, + "step": 11730 + }, + { + "epoch": 5.546572104018913, + "grad_norm": 3.4915285110473633, + "learning_rate": 7.123969495068272e-08, + "loss": 0.3462, + "step": 11731 + }, + { + "epoch": 5.547044917257683, + "grad_norm": 3.236283540725708, + "learning_rate": 7.109190943759615e-08, + "loss": 0.2688, + "step": 11732 + }, + { + "epoch": 5.547517730496454, + "grad_norm": 2.8927080631256104, + "learning_rate": 7.094427516196601e-08, + "loss": 0.3078, + "step": 11733 + }, + { + "epoch": 5.547990543735224, + "grad_norm": 2.8691484928131104, + "learning_rate": 7.079679213298468e-08, + "loss": 0.265, + "step": 11734 + }, + { + "epoch": 5.548463356973995, + "grad_norm": 2.9787049293518066, + "learning_rate": 7.064946035983511e-08, + "loss": 0.2901, + "step": 11735 + }, + { + "epoch": 5.548936170212766, + "grad_norm": 2.8145298957824707, + "learning_rate": 7.050227985169161e-08, + "loss": 0.2754, + "step": 11736 + }, + { + "epoch": 5.549408983451537, + "grad_norm": 3.2856605052948, + "learning_rate": 7.035525061771769e-08, + "loss": 0.355, + "step": 11737 + }, + { + "epoch": 5.549881796690308, + "grad_norm": 2.770224094390869, + "learning_rate": 7.020837266706909e-08, + "loss": 0.2802, + "step": 11738 + }, + { + "epoch": 5.550354609929078, + "grad_norm": 2.7485382556915283, + "learning_rate": 
7.00616460088907e-08, + "loss": 0.2839, + "step": 11739 + }, + { + "epoch": 5.550827423167849, + "grad_norm": 3.6368305683135986, + "learning_rate": 6.991507065231857e-08, + "loss": 0.3458, + "step": 11740 + }, + { + "epoch": 5.551300236406619, + "grad_norm": 3.0573692321777344, + "learning_rate": 6.976864660647925e-08, + "loss": 0.3257, + "step": 11741 + }, + { + "epoch": 5.55177304964539, + "grad_norm": 3.5397047996520996, + "learning_rate": 6.96223738804902e-08, + "loss": 0.3507, + "step": 11742 + }, + { + "epoch": 5.552245862884161, + "grad_norm": 3.4442644119262695, + "learning_rate": 6.947625248345912e-08, + "loss": 0.3329, + "step": 11743 + }, + { + "epoch": 5.5527186761229315, + "grad_norm": 3.613973617553711, + "learning_rate": 6.93302824244843e-08, + "loss": 0.3207, + "step": 11744 + }, + { + "epoch": 5.553191489361702, + "grad_norm": 3.704705238342285, + "learning_rate": 6.918446371265458e-08, + "loss": 0.2735, + "step": 11745 + }, + { + "epoch": 5.553664302600473, + "grad_norm": 3.4365522861480713, + "learning_rate": 6.903879635704963e-08, + "loss": 0.3157, + "step": 11746 + }, + { + "epoch": 5.554137115839244, + "grad_norm": 3.4732930660247803, + "learning_rate": 6.889328036673914e-08, + "loss": 0.3197, + "step": 11747 + }, + { + "epoch": 5.554609929078014, + "grad_norm": 3.4623477458953857, + "learning_rate": 6.874791575078421e-08, + "loss": 0.3476, + "step": 11748 + }, + { + "epoch": 5.555082742316785, + "grad_norm": 3.1602671146392822, + "learning_rate": 6.860270251823564e-08, + "loss": 0.3074, + "step": 11749 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 4.155186653137207, + "learning_rate": 6.845764067813538e-08, + "loss": 0.3359, + "step": 11750 + }, + { + "epoch": 5.556028368794326, + "grad_norm": 3.526486396789551, + "learning_rate": 6.831273023951618e-08, + "loss": 0.3285, + "step": 11751 + }, + { + "epoch": 5.556501182033097, + "grad_norm": 3.4824352264404297, + "learning_rate": 6.816797121140029e-08, + "loss": 0.315, + "step": 11752 + }, + { + "epoch": 5.556973995271868, + "grad_norm": 3.783975601196289, + "learning_rate": 6.802336360280187e-08, + "loss": 0.3279, + "step": 11753 + }, + { + "epoch": 5.5574468085106385, + "grad_norm": 3.435145378112793, + "learning_rate": 6.787890742272457e-08, + "loss": 0.3309, + "step": 11754 + }, + { + "epoch": 5.557919621749409, + "grad_norm": 3.1065540313720703, + "learning_rate": 6.773460268016285e-08, + "loss": 0.3235, + "step": 11755 + }, + { + "epoch": 5.55839243498818, + "grad_norm": 3.195692539215088, + "learning_rate": 6.759044938410287e-08, + "loss": 0.2965, + "step": 11756 + }, + { + "epoch": 5.55886524822695, + "grad_norm": 3.553255796432495, + "learning_rate": 6.744644754351909e-08, + "loss": 0.3367, + "step": 11757 + }, + { + "epoch": 5.559338061465721, + "grad_norm": 3.1541762351989746, + "learning_rate": 6.730259716737908e-08, + "loss": 0.3525, + "step": 11758 + }, + { + "epoch": 5.559810874704492, + "grad_norm": 3.8875532150268555, + "learning_rate": 6.715889826463956e-08, + "loss": 0.3558, + "step": 11759 + }, + { + "epoch": 5.560283687943262, + "grad_norm": 3.260861396789551, + "learning_rate": 6.701535084424726e-08, + "loss": 0.29, + "step": 11760 + }, + { + "epoch": 5.560756501182033, + "grad_norm": 3.1773221492767334, + "learning_rate": 6.687195491514114e-08, + "loss": 0.2756, + "step": 11761 + }, + { + "epoch": 5.561229314420804, + "grad_norm": 3.6206512451171875, + "learning_rate": 6.672871048624934e-08, + "loss": 0.3412, + "step": 11762 + }, + { + "epoch": 5.561702127659575, + "grad_norm": 
3.5456814765930176, + "learning_rate": 6.658561756649084e-08, + "loss": 0.3552, + "step": 11763 + }, + { + "epoch": 5.562174940898345, + "grad_norm": 3.60017991065979, + "learning_rate": 6.6442676164776e-08, + "loss": 0.358, + "step": 11764 + }, + { + "epoch": 5.5626477541371155, + "grad_norm": 3.1833202838897705, + "learning_rate": 6.629988629000467e-08, + "loss": 0.2945, + "step": 11765 + }, + { + "epoch": 5.563120567375886, + "grad_norm": 4.464661598205566, + "learning_rate": 6.615724795106803e-08, + "loss": 0.3482, + "step": 11766 + }, + { + "epoch": 5.563593380614657, + "grad_norm": 3.1042027473449707, + "learning_rate": 6.601476115684762e-08, + "loss": 0.358, + "step": 11767 + }, + { + "epoch": 5.564066193853428, + "grad_norm": 3.243957281112671, + "learning_rate": 6.587242591621524e-08, + "loss": 0.2892, + "step": 11768 + }, + { + "epoch": 5.564539007092199, + "grad_norm": 5.185178279876709, + "learning_rate": 6.573024223803349e-08, + "loss": 0.2877, + "step": 11769 + }, + { + "epoch": 5.5650118203309695, + "grad_norm": 3.488581657409668, + "learning_rate": 6.558821013115557e-08, + "loss": 0.3234, + "step": 11770 + }, + { + "epoch": 5.56548463356974, + "grad_norm": 2.9220681190490723, + "learning_rate": 6.544632960442554e-08, + "loss": 0.2763, + "step": 11771 + }, + { + "epoch": 5.565957446808511, + "grad_norm": 3.5609130859375, + "learning_rate": 6.530460066667715e-08, + "loss": 0.3491, + "step": 11772 + }, + { + "epoch": 5.566430260047281, + "grad_norm": 3.2538235187530518, + "learning_rate": 6.516302332673558e-08, + "loss": 0.3427, + "step": 11773 + }, + { + "epoch": 5.566903073286052, + "grad_norm": 3.371915817260742, + "learning_rate": 6.502159759341598e-08, + "loss": 0.33, + "step": 11774 + }, + { + "epoch": 5.567375886524823, + "grad_norm": 3.090161085128784, + "learning_rate": 6.488032347552464e-08, + "loss": 0.306, + "step": 11775 + }, + { + "epoch": 5.567848699763593, + "grad_norm": 3.563584566116333, + "learning_rate": 6.473920098185787e-08, + "loss": 0.3073, + "step": 11776 + }, + { + "epoch": 5.568321513002364, + "grad_norm": 3.434272050857544, + "learning_rate": 6.459823012120226e-08, + "loss": 0.3244, + "step": 11777 + }, + { + "epoch": 5.568794326241135, + "grad_norm": 3.05387544631958, + "learning_rate": 6.445741090233659e-08, + "loss": 0.3239, + "step": 11778 + }, + { + "epoch": 5.569267139479906, + "grad_norm": 3.3350319862365723, + "learning_rate": 6.431674333402804e-08, + "loss": 0.2885, + "step": 11779 + }, + { + "epoch": 5.569739952718676, + "grad_norm": 3.7317748069763184, + "learning_rate": 6.417622742503571e-08, + "loss": 0.4105, + "step": 11780 + }, + { + "epoch": 5.5702127659574465, + "grad_norm": 3.36405873298645, + "learning_rate": 6.403586318410899e-08, + "loss": 0.2809, + "step": 11781 + }, + { + "epoch": 5.570685579196217, + "grad_norm": 3.437410831451416, + "learning_rate": 6.389565061998781e-08, + "loss": 0.3327, + "step": 11782 + }, + { + "epoch": 5.571158392434988, + "grad_norm": 3.6284446716308594, + "learning_rate": 6.375558974140244e-08, + "loss": 0.3734, + "step": 11783 + }, + { + "epoch": 5.571631205673759, + "grad_norm": 3.400245428085327, + "learning_rate": 6.361568055707367e-08, + "loss": 0.3404, + "step": 11784 + }, + { + "epoch": 5.57210401891253, + "grad_norm": 3.2299258708953857, + "learning_rate": 6.347592307571371e-08, + "loss": 0.2886, + "step": 11785 + }, + { + "epoch": 5.5725768321513005, + "grad_norm": 3.069549798965454, + "learning_rate": 6.333631730602364e-08, + "loss": 0.3234, + "step": 11786 + }, + { + "epoch": 
5.573049645390071, + "grad_norm": 3.2106802463531494, + "learning_rate": 6.319686325669705e-08, + "loss": 0.3116, + "step": 11787 + }, + { + "epoch": 5.573522458628842, + "grad_norm": 3.651440143585205, + "learning_rate": 6.305756093641646e-08, + "loss": 0.3248, + "step": 11788 + }, + { + "epoch": 5.573995271867612, + "grad_norm": 3.5746328830718994, + "learning_rate": 6.291841035385605e-08, + "loss": 0.3589, + "step": 11789 + }, + { + "epoch": 5.574468085106383, + "grad_norm": 3.3536272048950195, + "learning_rate": 6.277941151767997e-08, + "loss": 0.3187, + "step": 11790 + }, + { + "epoch": 5.574940898345154, + "grad_norm": 3.221473455429077, + "learning_rate": 6.264056443654271e-08, + "loss": 0.3281, + "step": 11791 + }, + { + "epoch": 5.575413711583924, + "grad_norm": 3.3866493701934814, + "learning_rate": 6.25018691190904e-08, + "loss": 0.3696, + "step": 11792 + }, + { + "epoch": 5.575886524822695, + "grad_norm": 4.225553512573242, + "learning_rate": 6.236332557395835e-08, + "loss": 0.3851, + "step": 11793 + }, + { + "epoch": 5.576359338061466, + "grad_norm": 2.9920523166656494, + "learning_rate": 6.222493380977357e-08, + "loss": 0.3036, + "step": 11794 + }, + { + "epoch": 5.576832151300237, + "grad_norm": 3.26487135887146, + "learning_rate": 6.208669383515276e-08, + "loss": 0.3429, + "step": 11795 + }, + { + "epoch": 5.577304964539007, + "grad_norm": 3.1462907791137695, + "learning_rate": 6.19486056587032e-08, + "loss": 0.3068, + "step": 11796 + }, + { + "epoch": 5.5777777777777775, + "grad_norm": 2.806856155395508, + "learning_rate": 6.181066928902385e-08, + "loss": 0.2884, + "step": 11797 + }, + { + "epoch": 5.578250591016548, + "grad_norm": 3.231782913208008, + "learning_rate": 6.167288473470284e-08, + "loss": 0.3355, + "step": 11798 + }, + { + "epoch": 5.578723404255319, + "grad_norm": 3.313009023666382, + "learning_rate": 6.15352520043197e-08, + "loss": 0.3334, + "step": 11799 + }, + { + "epoch": 5.57919621749409, + "grad_norm": 3.366187572479248, + "learning_rate": 6.139777110644397e-08, + "loss": 0.336, + "step": 11800 + }, + { + "epoch": 5.579669030732861, + "grad_norm": 3.2782907485961914, + "learning_rate": 6.126044204963572e-08, + "loss": 0.3261, + "step": 11801 + }, + { + "epoch": 5.5801418439716315, + "grad_norm": 2.9769949913024902, + "learning_rate": 6.112326484244674e-08, + "loss": 0.3018, + "step": 11802 + }, + { + "epoch": 5.580614657210402, + "grad_norm": 3.3499436378479004, + "learning_rate": 6.098623949341743e-08, + "loss": 0.3115, + "step": 11803 + }, + { + "epoch": 5.581087470449172, + "grad_norm": 3.1725752353668213, + "learning_rate": 6.084936601108066e-08, + "loss": 0.3284, + "step": 11804 + }, + { + "epoch": 5.581560283687943, + "grad_norm": 3.5985076427459717, + "learning_rate": 6.071264440395852e-08, + "loss": 0.3121, + "step": 11805 + }, + { + "epoch": 5.582033096926714, + "grad_norm": 3.3965110778808594, + "learning_rate": 6.057607468056365e-08, + "loss": 0.2935, + "step": 11806 + }, + { + "epoch": 5.582505910165485, + "grad_norm": 3.3994903564453125, + "learning_rate": 6.043965684940034e-08, + "loss": 0.3089, + "step": 11807 + }, + { + "epoch": 5.582978723404255, + "grad_norm": 3.0140533447265625, + "learning_rate": 6.030339091896265e-08, + "loss": 0.2828, + "step": 11808 + }, + { + "epoch": 5.583451536643026, + "grad_norm": 3.218672752380371, + "learning_rate": 6.016727689773488e-08, + "loss": 0.3021, + "step": 11809 + }, + { + "epoch": 5.583924349881797, + "grad_norm": 3.5624725818634033, + "learning_rate": 6.00313147941925e-08, + "loss": 0.3645, + 
"step": 11810 + }, + { + "epoch": 5.584397163120567, + "grad_norm": 3.484443187713623, + "learning_rate": 5.989550461680093e-08, + "loss": 0.3595, + "step": 11811 + }, + { + "epoch": 5.584869976359338, + "grad_norm": 3.805352210998535, + "learning_rate": 5.975984637401677e-08, + "loss": 0.3161, + "step": 11812 + }, + { + "epoch": 5.5853427895981085, + "grad_norm": 3.4212005138397217, + "learning_rate": 5.962434007428714e-08, + "loss": 0.3195, + "step": 11813 + }, + { + "epoch": 5.585815602836879, + "grad_norm": 3.2629356384277344, + "learning_rate": 5.9488985726048885e-08, + "loss": 0.3264, + "step": 11814 + }, + { + "epoch": 5.58628841607565, + "grad_norm": 2.9457015991210938, + "learning_rate": 5.9353783337730284e-08, + "loss": 0.3001, + "step": 11815 + }, + { + "epoch": 5.586761229314421, + "grad_norm": 3.5118699073791504, + "learning_rate": 5.921873291774932e-08, + "loss": 0.3388, + "step": 11816 + }, + { + "epoch": 5.587234042553192, + "grad_norm": 3.5850439071655273, + "learning_rate": 5.908383447451593e-08, + "loss": 0.2999, + "step": 11817 + }, + { + "epoch": 5.5877068557919625, + "grad_norm": 3.5888917446136475, + "learning_rate": 5.8949088016428954e-08, + "loss": 0.3797, + "step": 11818 + }, + { + "epoch": 5.588179669030733, + "grad_norm": 3.3090274333953857, + "learning_rate": 5.881449355187807e-08, + "loss": 0.2644, + "step": 11819 + }, + { + "epoch": 5.588652482269503, + "grad_norm": 3.3722126483917236, + "learning_rate": 5.8680051089244906e-08, + "loss": 0.3319, + "step": 11820 + }, + { + "epoch": 5.589125295508274, + "grad_norm": 3.2284746170043945, + "learning_rate": 5.8545760636899985e-08, + "loss": 0.3166, + "step": 11821 + }, + { + "epoch": 5.589598108747045, + "grad_norm": 3.022043228149414, + "learning_rate": 5.841162220320496e-08, + "loss": 0.3077, + "step": 11822 + }, + { + "epoch": 5.590070921985816, + "grad_norm": 3.0781049728393555, + "learning_rate": 5.8277635796512574e-08, + "loss": 0.3499, + "step": 11823 + }, + { + "epoch": 5.590543735224586, + "grad_norm": 2.954606294631958, + "learning_rate": 5.8143801425164794e-08, + "loss": 0.2903, + "step": 11824 + }, + { + "epoch": 5.591016548463357, + "grad_norm": 3.109799861907959, + "learning_rate": 5.801011909749549e-08, + "loss": 0.3418, + "step": 11825 + }, + { + "epoch": 5.591489361702128, + "grad_norm": 3.096015214920044, + "learning_rate": 5.7876588821828003e-08, + "loss": 0.296, + "step": 11826 + }, + { + "epoch": 5.591962174940898, + "grad_norm": 3.375032663345337, + "learning_rate": 5.774321060647708e-08, + "loss": 0.2903, + "step": 11827 + }, + { + "epoch": 5.592434988179669, + "grad_norm": 3.2566537857055664, + "learning_rate": 5.7609984459747745e-08, + "loss": 0.3486, + "step": 11828 + }, + { + "epoch": 5.5929078014184395, + "grad_norm": 3.127394914627075, + "learning_rate": 5.747691038993475e-08, + "loss": 0.3028, + "step": 11829 + }, + { + "epoch": 5.59338061465721, + "grad_norm": 3.6006057262420654, + "learning_rate": 5.734398840532451e-08, + "loss": 0.3155, + "step": 11830 + }, + { + "epoch": 5.593853427895981, + "grad_norm": 3.31854510307312, + "learning_rate": 5.721121851419348e-08, + "loss": 0.3526, + "step": 11831 + }, + { + "epoch": 5.594326241134752, + "grad_norm": 3.2922966480255127, + "learning_rate": 5.7078600724808365e-08, + "loss": 0.2947, + "step": 11832 + }, + { + "epoch": 5.594799054373523, + "grad_norm": 3.170687198638916, + "learning_rate": 5.694613504542701e-08, + "loss": 0.3703, + "step": 11833 + }, + { + "epoch": 5.5952718676122934, + "grad_norm": 3.654935836791992, + 
"learning_rate": 5.6813821484296985e-08, + "loss": 0.2927, + "step": 11834 + }, + { + "epoch": 5.595744680851064, + "grad_norm": 3.3022713661193848, + "learning_rate": 5.6681660049657805e-08, + "loss": 0.2826, + "step": 11835 + }, + { + "epoch": 5.596217494089834, + "grad_norm": 3.0262742042541504, + "learning_rate": 5.6549650749737893e-08, + "loss": 0.2905, + "step": 11836 + }, + { + "epoch": 5.596690307328605, + "grad_norm": 3.2589111328125, + "learning_rate": 5.6417793592756786e-08, + "loss": 0.3173, + "step": 11837 + }, + { + "epoch": 5.597163120567376, + "grad_norm": 3.257753849029541, + "learning_rate": 5.628608858692514e-08, + "loss": 0.3277, + "step": 11838 + }, + { + "epoch": 5.5976359338061465, + "grad_norm": 2.7493152618408203, + "learning_rate": 5.615453574044333e-08, + "loss": 0.2915, + "step": 11839 + }, + { + "epoch": 5.598108747044917, + "grad_norm": 3.183833360671997, + "learning_rate": 5.602313506150286e-08, + "loss": 0.2871, + "step": 11840 + }, + { + "epoch": 5.598581560283688, + "grad_norm": 3.640209197998047, + "learning_rate": 5.589188655828498e-08, + "loss": 0.3322, + "step": 11841 + }, + { + "epoch": 5.599054373522459, + "grad_norm": 3.1197102069854736, + "learning_rate": 5.576079023896203e-08, + "loss": 0.2604, + "step": 11842 + }, + { + "epoch": 5.599527186761229, + "grad_norm": 3.4404499530792236, + "learning_rate": 5.5629846111697473e-08, + "loss": 0.3492, + "step": 11843 + }, + { + "epoch": 5.6, + "grad_norm": 3.158811330795288, + "learning_rate": 5.5499054184643683e-08, + "loss": 0.2998, + "step": 11844 + }, + { + "epoch": 5.60047281323877, + "grad_norm": 3.6570982933044434, + "learning_rate": 5.5368414465945263e-08, + "loss": 0.3322, + "step": 11845 + }, + { + "epoch": 5.600945626477541, + "grad_norm": 3.2857882976531982, + "learning_rate": 5.523792696373626e-08, + "loss": 0.3372, + "step": 11846 + }, + { + "epoch": 5.601418439716312, + "grad_norm": 2.7943503856658936, + "learning_rate": 5.5107591686141545e-08, + "loss": 0.2956, + "step": 11847 + }, + { + "epoch": 5.601891252955083, + "grad_norm": 3.353516101837158, + "learning_rate": 5.4977408641276595e-08, + "loss": 0.3247, + "step": 11848 + }, + { + "epoch": 5.602364066193854, + "grad_norm": 3.132786273956299, + "learning_rate": 5.484737783724714e-08, + "loss": 0.3129, + "step": 11849 + }, + { + "epoch": 5.602836879432624, + "grad_norm": 3.406188488006592, + "learning_rate": 5.471749928215003e-08, + "loss": 0.3419, + "step": 11850 + }, + { + "epoch": 5.603309692671395, + "grad_norm": 3.224926233291626, + "learning_rate": 5.4587772984071866e-08, + "loss": 0.3356, + "step": 11851 + }, + { + "epoch": 5.603782505910165, + "grad_norm": 3.068500518798828, + "learning_rate": 5.4458198951090337e-08, + "loss": 0.3135, + "step": 11852 + }, + { + "epoch": 5.604255319148936, + "grad_norm": 3.1098759174346924, + "learning_rate": 5.432877719127344e-08, + "loss": 0.35, + "step": 11853 + }, + { + "epoch": 5.604728132387707, + "grad_norm": 3.4230198860168457, + "learning_rate": 5.419950771267973e-08, + "loss": 0.3183, + "step": 11854 + }, + { + "epoch": 5.6052009456264775, + "grad_norm": 3.6213667392730713, + "learning_rate": 5.4070390523357775e-08, + "loss": 0.3438, + "step": 11855 + }, + { + "epoch": 5.605673758865248, + "grad_norm": 3.7087268829345703, + "learning_rate": 5.39414256313478e-08, + "loss": 0.3632, + "step": 11856 + }, + { + "epoch": 5.606146572104019, + "grad_norm": 3.3394203186035156, + "learning_rate": 5.38126130446795e-08, + "loss": 0.2977, + "step": 11857 + }, + { + "epoch": 5.60661938534279, + 
"grad_norm": 3.342027425765991, + "learning_rate": 5.368395277137367e-08, + "loss": 0.2957, + "step": 11858 + }, + { + "epoch": 5.60709219858156, + "grad_norm": 3.2655093669891357, + "learning_rate": 5.355544481944141e-08, + "loss": 0.3546, + "step": 11859 + }, + { + "epoch": 5.607565011820331, + "grad_norm": 3.1710312366485596, + "learning_rate": 5.3427089196884104e-08, + "loss": 0.3428, + "step": 11860 + }, + { + "epoch": 5.608037825059101, + "grad_norm": 3.242621898651123, + "learning_rate": 5.3298885911694244e-08, + "loss": 0.269, + "step": 11861 + }, + { + "epoch": 5.608510638297872, + "grad_norm": 3.228151798248291, + "learning_rate": 5.317083497185377e-08, + "loss": 0.3415, + "step": 11862 + }, + { + "epoch": 5.608983451536643, + "grad_norm": 3.1227009296417236, + "learning_rate": 5.304293638533686e-08, + "loss": 0.2991, + "step": 11863 + }, + { + "epoch": 5.609456264775414, + "grad_norm": 2.8883254528045654, + "learning_rate": 5.29151901601066e-08, + "loss": 0.3063, + "step": 11864 + }, + { + "epoch": 5.609929078014185, + "grad_norm": 2.9985411167144775, + "learning_rate": 5.2787596304117174e-08, + "loss": 0.2947, + "step": 11865 + }, + { + "epoch": 5.610401891252955, + "grad_norm": 3.177762985229492, + "learning_rate": 5.266015482531389e-08, + "loss": 0.3063, + "step": 11866 + }, + { + "epoch": 5.610874704491726, + "grad_norm": 2.8067426681518555, + "learning_rate": 5.2532865731630966e-08, + "loss": 0.2996, + "step": 11867 + }, + { + "epoch": 5.611347517730496, + "grad_norm": 3.2768452167510986, + "learning_rate": 5.240572903099484e-08, + "loss": 0.289, + "step": 11868 + }, + { + "epoch": 5.611820330969267, + "grad_norm": 3.130014419555664, + "learning_rate": 5.227874473132166e-08, + "loss": 0.2907, + "step": 11869 + }, + { + "epoch": 5.612293144208038, + "grad_norm": 3.2271652221679688, + "learning_rate": 5.21519128405179e-08, + "loss": 0.319, + "step": 11870 + }, + { + "epoch": 5.6127659574468085, + "grad_norm": 3.3185455799102783, + "learning_rate": 5.202523336648141e-08, + "loss": 0.3638, + "step": 11871 + }, + { + "epoch": 5.613238770685579, + "grad_norm": 3.3324077129364014, + "learning_rate": 5.189870631709948e-08, + "loss": 0.3147, + "step": 11872 + }, + { + "epoch": 5.61371158392435, + "grad_norm": 3.229886770248413, + "learning_rate": 5.1772331700250533e-08, + "loss": 0.3706, + "step": 11873 + }, + { + "epoch": 5.614184397163121, + "grad_norm": 3.1973488330841064, + "learning_rate": 5.164610952380328e-08, + "loss": 0.3078, + "step": 11874 + }, + { + "epoch": 5.614657210401891, + "grad_norm": 3.336578130722046, + "learning_rate": 5.152003979561671e-08, + "loss": 0.3573, + "step": 11875 + }, + { + "epoch": 5.615130023640662, + "grad_norm": 4.805183410644531, + "learning_rate": 5.139412252354148e-08, + "loss": 0.3777, + "step": 11876 + }, + { + "epoch": 5.615602836879432, + "grad_norm": 3.332260847091675, + "learning_rate": 5.126835771541716e-08, + "loss": 0.2983, + "step": 11877 + }, + { + "epoch": 5.616075650118203, + "grad_norm": 3.1888558864593506, + "learning_rate": 5.114274537907499e-08, + "loss": 0.3025, + "step": 11878 + }, + { + "epoch": 5.616548463356974, + "grad_norm": 3.5608468055725098, + "learning_rate": 5.10172855223362e-08, + "loss": 0.3507, + "step": 11879 + }, + { + "epoch": 5.617021276595745, + "grad_norm": 3.5171632766723633, + "learning_rate": 5.0891978153012336e-08, + "loss": 0.3073, + "step": 11880 + }, + { + "epoch": 5.617494089834516, + "grad_norm": 3.1395633220672607, + "learning_rate": 5.076682327890603e-08, + "loss": 0.2811, + "step": 11881 + }, 
+ { + "epoch": 5.617966903073286, + "grad_norm": 3.0650174617767334, + "learning_rate": 5.064182090781022e-08, + "loss": 0.307, + "step": 11882 + }, + { + "epoch": 5.618439716312057, + "grad_norm": 2.9738526344299316, + "learning_rate": 5.0516971047508135e-08, + "loss": 0.328, + "step": 11883 + }, + { + "epoch": 5.618912529550827, + "grad_norm": 3.3862271308898926, + "learning_rate": 5.0392273705773544e-08, + "loss": 0.3554, + "step": 11884 + }, + { + "epoch": 5.619385342789598, + "grad_norm": 3.0164403915405273, + "learning_rate": 5.026772889037052e-08, + "loss": 0.2704, + "step": 11885 + }, + { + "epoch": 5.619858156028369, + "grad_norm": 3.204324245452881, + "learning_rate": 5.0143336609054795e-08, + "loss": 0.334, + "step": 11886 + }, + { + "epoch": 5.6203309692671395, + "grad_norm": 3.560706853866577, + "learning_rate": 5.0019096869571015e-08, + "loss": 0.3248, + "step": 11887 + }, + { + "epoch": 5.62080378250591, + "grad_norm": 3.8754491806030273, + "learning_rate": 4.989500967965522e-08, + "loss": 0.3389, + "step": 11888 + }, + { + "epoch": 5.621276595744681, + "grad_norm": 3.0526723861694336, + "learning_rate": 4.977107504703399e-08, + "loss": 0.2743, + "step": 11889 + }, + { + "epoch": 5.621749408983452, + "grad_norm": 3.337905168533325, + "learning_rate": 4.964729297942339e-08, + "loss": 0.2813, + "step": 11890 + }, + { + "epoch": 5.622222222222222, + "grad_norm": 3.149933099746704, + "learning_rate": 4.952366348453197e-08, + "loss": 0.3346, + "step": 11891 + }, + { + "epoch": 5.622695035460993, + "grad_norm": 3.5325448513031006, + "learning_rate": 4.9400186570056904e-08, + "loss": 0.4096, + "step": 11892 + }, + { + "epoch": 5.623167848699763, + "grad_norm": 2.5968239307403564, + "learning_rate": 4.927686224368622e-08, + "loss": 0.2666, + "step": 11893 + }, + { + "epoch": 5.623640661938534, + "grad_norm": 3.005018711090088, + "learning_rate": 4.9153690513099874e-08, + "loss": 0.267, + "step": 11894 + }, + { + "epoch": 5.624113475177305, + "grad_norm": 3.210495710372925, + "learning_rate": 4.90306713859659e-08, + "loss": 0.3286, + "step": 11895 + }, + { + "epoch": 5.624586288416076, + "grad_norm": 3.488629102706909, + "learning_rate": 4.8907804869945394e-08, + "loss": 0.3275, + "step": 11896 + }, + { + "epoch": 5.625059101654847, + "grad_norm": 3.438709020614624, + "learning_rate": 4.8785090972688073e-08, + "loss": 0.3774, + "step": 11897 + }, + { + "epoch": 5.625531914893617, + "grad_norm": 3.4456019401550293, + "learning_rate": 4.8662529701834496e-08, + "loss": 0.3146, + "step": 11898 + }, + { + "epoch": 5.626004728132388, + "grad_norm": 3.248830556869507, + "learning_rate": 4.854012106501688e-08, + "loss": 0.2914, + "step": 11899 + }, + { + "epoch": 5.626477541371158, + "grad_norm": 3.252931833267212, + "learning_rate": 4.841786506985635e-08, + "loss": 0.355, + "step": 11900 + }, + { + "epoch": 5.626950354609929, + "grad_norm": 3.1938722133636475, + "learning_rate": 4.829576172396544e-08, + "loss": 0.3503, + "step": 11901 + }, + { + "epoch": 5.6274231678487, + "grad_norm": 3.425889492034912, + "learning_rate": 4.8173811034947224e-08, + "loss": 0.334, + "step": 11902 + }, + { + "epoch": 5.6278959810874705, + "grad_norm": 2.86318302154541, + "learning_rate": 4.805201301039508e-08, + "loss": 0.2957, + "step": 11903 + }, + { + "epoch": 5.628368794326241, + "grad_norm": 3.708336591720581, + "learning_rate": 4.7930367657892384e-08, + "loss": 0.3276, + "step": 11904 + }, + { + "epoch": 5.628841607565012, + "grad_norm": 3.3034868240356445, + "learning_rate": 4.780887498501363e-08, + 
"loss": 0.3087, + "step": 11905 + }, + { + "epoch": 5.629314420803783, + "grad_norm": 3.2360143661499023, + "learning_rate": 4.76875349993236e-08, + "loss": 0.3501, + "step": 11906 + }, + { + "epoch": 5.629787234042553, + "grad_norm": 3.3256373405456543, + "learning_rate": 4.756634770837793e-08, + "loss": 0.3206, + "step": 11907 + }, + { + "epoch": 5.630260047281324, + "grad_norm": 2.9592063426971436, + "learning_rate": 4.744531311972195e-08, + "loss": 0.3043, + "step": 11908 + }, + { + "epoch": 5.630732860520094, + "grad_norm": 3.3246355056762695, + "learning_rate": 4.732443124089214e-08, + "loss": 0.3046, + "step": 11909 + }, + { + "epoch": 5.631205673758865, + "grad_norm": 3.342833995819092, + "learning_rate": 4.7203702079415825e-08, + "loss": 0.348, + "step": 11910 + }, + { + "epoch": 5.631678486997636, + "grad_norm": 3.3094377517700195, + "learning_rate": 4.70831256428092e-08, + "loss": 0.3195, + "step": 11911 + }, + { + "epoch": 5.632151300236407, + "grad_norm": 3.05210018157959, + "learning_rate": 4.696270193858099e-08, + "loss": 0.3044, + "step": 11912 + }, + { + "epoch": 5.6326241134751776, + "grad_norm": 2.889557123184204, + "learning_rate": 4.68424309742288e-08, + "loss": 0.2952, + "step": 11913 + }, + { + "epoch": 5.633096926713948, + "grad_norm": 3.7573699951171875, + "learning_rate": 4.672231275724193e-08, + "loss": 0.3667, + "step": 11914 + }, + { + "epoch": 5.633569739952719, + "grad_norm": 3.1630661487579346, + "learning_rate": 4.660234729509938e-08, + "loss": 0.3001, + "step": 11915 + }, + { + "epoch": 5.634042553191489, + "grad_norm": 3.4200289249420166, + "learning_rate": 4.6482534595270466e-08, + "loss": 0.332, + "step": 11916 + }, + { + "epoch": 5.63451536643026, + "grad_norm": 3.23447847366333, + "learning_rate": 4.636287466521616e-08, + "loss": 0.3547, + "step": 11917 + }, + { + "epoch": 5.634988179669031, + "grad_norm": 3.3752806186676025, + "learning_rate": 4.624336751238689e-08, + "loss": 0.3021, + "step": 11918 + }, + { + "epoch": 5.6354609929078014, + "grad_norm": 3.221673011779785, + "learning_rate": 4.612401314422338e-08, + "loss": 0.3359, + "step": 11919 + }, + { + "epoch": 5.635933806146572, + "grad_norm": 3.3513898849487305, + "learning_rate": 4.600481156815773e-08, + "loss": 0.2969, + "step": 11920 + }, + { + "epoch": 5.636406619385343, + "grad_norm": 3.0128650665283203, + "learning_rate": 4.588576279161205e-08, + "loss": 0.2816, + "step": 11921 + }, + { + "epoch": 5.636879432624114, + "grad_norm": 5.3624725341796875, + "learning_rate": 4.5766866821999046e-08, + "loss": 0.3359, + "step": 11922 + }, + { + "epoch": 5.637352245862884, + "grad_norm": 3.1999800205230713, + "learning_rate": 4.5648123666721665e-08, + "loss": 0.3015, + "step": 11923 + }, + { + "epoch": 5.6378250591016545, + "grad_norm": 3.211670398712158, + "learning_rate": 4.5529533333173446e-08, + "loss": 0.2871, + "step": 11924 + }, + { + "epoch": 5.638297872340425, + "grad_norm": 3.2410199642181396, + "learning_rate": 4.5411095828739037e-08, + "loss": 0.2818, + "step": 11925 + }, + { + "epoch": 5.638770685579196, + "grad_norm": 3.5467281341552734, + "learning_rate": 4.529281116079226e-08, + "loss": 0.3765, + "step": 11926 + }, + { + "epoch": 5.639243498817967, + "grad_norm": 3.2093350887298584, + "learning_rate": 4.517467933669889e-08, + "loss": 0.3477, + "step": 11927 + }, + { + "epoch": 5.639716312056738, + "grad_norm": 3.8581385612487793, + "learning_rate": 4.5056700363813874e-08, + "loss": 0.3879, + "step": 11928 + }, + { + "epoch": 5.6401891252955085, + "grad_norm": 3.2723116874694824, + 
"learning_rate": 4.4938874249483565e-08, + "loss": 0.3368, + "step": 11929 + }, + { + "epoch": 5.640661938534279, + "grad_norm": 3.173933982849121, + "learning_rate": 4.482120100104459e-08, + "loss": 0.2887, + "step": 11930 + }, + { + "epoch": 5.64113475177305, + "grad_norm": 3.4990296363830566, + "learning_rate": 4.470368062582359e-08, + "loss": 0.3602, + "step": 11931 + }, + { + "epoch": 5.64160756501182, + "grad_norm": 3.5351905822753906, + "learning_rate": 4.458631313113831e-08, + "loss": 0.3043, + "step": 11932 + }, + { + "epoch": 5.642080378250591, + "grad_norm": 3.6162710189819336, + "learning_rate": 4.4469098524296815e-08, + "loss": 0.311, + "step": 11933 + }, + { + "epoch": 5.642553191489362, + "grad_norm": 3.2108359336853027, + "learning_rate": 4.4352036812597144e-08, + "loss": 0.2995, + "step": 11934 + }, + { + "epoch": 5.643026004728132, + "grad_norm": 3.0152573585510254, + "learning_rate": 4.423512800332874e-08, + "loss": 0.2889, + "step": 11935 + }, + { + "epoch": 5.643498817966903, + "grad_norm": 3.6868278980255127, + "learning_rate": 4.4118372103770514e-08, + "loss": 0.2943, + "step": 11936 + }, + { + "epoch": 5.643971631205674, + "grad_norm": 3.021240711212158, + "learning_rate": 4.400176912119275e-08, + "loss": 0.3377, + "step": 11937 + }, + { + "epoch": 5.644444444444445, + "grad_norm": 3.117161989212036, + "learning_rate": 4.388531906285548e-08, + "loss": 0.3161, + "step": 11938 + }, + { + "epoch": 5.644917257683215, + "grad_norm": 3.1031835079193115, + "learning_rate": 4.3769021936009295e-08, + "loss": 0.2707, + "step": 11939 + }, + { + "epoch": 5.6453900709219855, + "grad_norm": 3.2828316688537598, + "learning_rate": 4.365287774789617e-08, + "loss": 0.3368, + "step": 11940 + }, + { + "epoch": 5.645862884160756, + "grad_norm": 2.9725148677825928, + "learning_rate": 4.3536886505747555e-08, + "loss": 0.3074, + "step": 11941 + }, + { + "epoch": 5.646335697399527, + "grad_norm": 3.046449661254883, + "learning_rate": 4.342104821678572e-08, + "loss": 0.3057, + "step": 11942 + }, + { + "epoch": 5.646808510638298, + "grad_norm": 3.3085083961486816, + "learning_rate": 4.330536288822379e-08, + "loss": 0.3137, + "step": 11943 + }, + { + "epoch": 5.647281323877069, + "grad_norm": 3.4323384761810303, + "learning_rate": 4.318983052726406e-08, + "loss": 0.3116, + "step": 11944 + }, + { + "epoch": 5.6477541371158395, + "grad_norm": 3.387890577316284, + "learning_rate": 4.307445114110104e-08, + "loss": 0.2882, + "step": 11945 + }, + { + "epoch": 5.64822695035461, + "grad_norm": 3.2343697547912598, + "learning_rate": 4.295922473691872e-08, + "loss": 0.3324, + "step": 11946 + }, + { + "epoch": 5.648699763593381, + "grad_norm": 3.0639030933380127, + "learning_rate": 4.2844151321891626e-08, + "loss": 0.3739, + "step": 11947 + }, + { + "epoch": 5.649172576832151, + "grad_norm": 3.201939105987549, + "learning_rate": 4.272923090318487e-08, + "loss": 0.3241, + "step": 11948 + }, + { + "epoch": 5.649645390070922, + "grad_norm": 3.60882830619812, + "learning_rate": 4.2614463487954114e-08, + "loss": 0.274, + "step": 11949 + }, + { + "epoch": 5.650118203309693, + "grad_norm": 3.4274656772613525, + "learning_rate": 4.249984908334559e-08, + "loss": 0.3164, + "step": 11950 + }, + { + "epoch": 5.650591016548463, + "grad_norm": 3.332573175430298, + "learning_rate": 4.2385387696495525e-08, + "loss": 0.2875, + "step": 11951 + }, + { + "epoch": 5.651063829787234, + "grad_norm": 3.1913769245147705, + "learning_rate": 4.227107933453101e-08, + "loss": 0.3334, + "step": 11952 + }, + { + "epoch": 
5.651536643026005, + "grad_norm": 3.683591842651367, + "learning_rate": 4.2156924004569664e-08, + "loss": 0.3544, + "step": 11953 + }, + { + "epoch": 5.652009456264776, + "grad_norm": 3.415381669998169, + "learning_rate": 4.204292171371915e-08, + "loss": 0.3764, + "step": 11954 + }, + { + "epoch": 5.652482269503546, + "grad_norm": 3.341519832611084, + "learning_rate": 4.192907246907824e-08, + "loss": 0.3001, + "step": 11955 + }, + { + "epoch": 5.6529550827423165, + "grad_norm": 3.161620855331421, + "learning_rate": 4.1815376277735686e-08, + "loss": 0.3118, + "step": 11956 + }, + { + "epoch": 5.653427895981087, + "grad_norm": 2.9771671295166016, + "learning_rate": 4.170183314677084e-08, + "loss": 0.2941, + "step": 11957 + }, + { + "epoch": 5.653900709219858, + "grad_norm": 3.0971219539642334, + "learning_rate": 4.1588443083253605e-08, + "loss": 0.2684, + "step": 11958 + }, + { + "epoch": 5.654373522458629, + "grad_norm": 3.714979410171509, + "learning_rate": 4.147520609424388e-08, + "loss": 0.3968, + "step": 11959 + }, + { + "epoch": 5.6548463356974, + "grad_norm": 2.89011549949646, + "learning_rate": 4.1362122186792976e-08, + "loss": 0.3221, + "step": 11960 + }, + { + "epoch": 5.6553191489361705, + "grad_norm": 3.0199105739593506, + "learning_rate": 4.124919136794192e-08, + "loss": 0.2991, + "step": 11961 + }, + { + "epoch": 5.655791962174941, + "grad_norm": 3.516275644302368, + "learning_rate": 4.113641364472232e-08, + "loss": 0.3635, + "step": 11962 + }, + { + "epoch": 5.656264775413711, + "grad_norm": 3.062115430831909, + "learning_rate": 4.1023789024156605e-08, + "loss": 0.3347, + "step": 11963 + }, + { + "epoch": 5.656737588652482, + "grad_norm": 3.2312190532684326, + "learning_rate": 4.091131751325694e-08, + "loss": 0.2771, + "step": 11964 + }, + { + "epoch": 5.657210401891253, + "grad_norm": 3.2892868518829346, + "learning_rate": 4.0798999119027184e-08, + "loss": 0.3055, + "step": 11965 + }, + { + "epoch": 5.657683215130024, + "grad_norm": 2.9591119289398193, + "learning_rate": 4.068683384846034e-08, + "loss": 0.2923, + "step": 11966 + }, + { + "epoch": 5.658156028368794, + "grad_norm": 3.423375129699707, + "learning_rate": 4.057482170854055e-08, + "loss": 0.3126, + "step": 11967 + }, + { + "epoch": 5.658628841607565, + "grad_norm": 2.994536876678467, + "learning_rate": 4.04629627062425e-08, + "loss": 0.3032, + "step": 11968 + }, + { + "epoch": 5.659101654846336, + "grad_norm": 3.4704277515411377, + "learning_rate": 4.035125684853064e-08, + "loss": 0.3722, + "step": 11969 + }, + { + "epoch": 5.659574468085106, + "grad_norm": 3.5841195583343506, + "learning_rate": 4.023970414236134e-08, + "loss": 0.3954, + "step": 11970 + }, + { + "epoch": 5.660047281323877, + "grad_norm": 3.0709569454193115, + "learning_rate": 4.012830459467959e-08, + "loss": 0.3378, + "step": 11971 + }, + { + "epoch": 5.6605200945626475, + "grad_norm": 3.0199382305145264, + "learning_rate": 4.00170582124218e-08, + "loss": 0.3131, + "step": 11972 + }, + { + "epoch": 5.660992907801418, + "grad_norm": 2.95902419090271, + "learning_rate": 3.990596500251576e-08, + "loss": 0.269, + "step": 11973 + }, + { + "epoch": 5.661465721040189, + "grad_norm": 3.6806280612945557, + "learning_rate": 3.979502497187759e-08, + "loss": 0.3914, + "step": 11974 + }, + { + "epoch": 5.66193853427896, + "grad_norm": 3.4212417602539062, + "learning_rate": 3.9684238127415675e-08, + "loss": 0.3453, + "step": 11975 + }, + { + "epoch": 5.662411347517731, + "grad_norm": 2.9063239097595215, + "learning_rate": 3.9573604476028096e-08, + "loss": 
0.3388, + "step": 11976 + }, + { + "epoch": 5.6628841607565015, + "grad_norm": 3.5526041984558105, + "learning_rate": 3.946312402460351e-08, + "loss": 0.3707, + "step": 11977 + }, + { + "epoch": 5.663356973995272, + "grad_norm": 3.1709578037261963, + "learning_rate": 3.935279678002113e-08, + "loss": 0.2742, + "step": 11978 + }, + { + "epoch": 5.663829787234042, + "grad_norm": 3.9405689239501953, + "learning_rate": 3.924262274915047e-08, + "loss": 0.3712, + "step": 11979 + }, + { + "epoch": 5.664302600472813, + "grad_norm": 3.2250664234161377, + "learning_rate": 3.9132601938851324e-08, + "loss": 0.3081, + "step": 11980 + }, + { + "epoch": 5.664775413711584, + "grad_norm": 3.6658518314361572, + "learning_rate": 3.9022734355974866e-08, + "loss": 0.3464, + "step": 11981 + }, + { + "epoch": 5.665248226950355, + "grad_norm": 3.3371896743774414, + "learning_rate": 3.891302000736175e-08, + "loss": 0.3253, + "step": 11982 + }, + { + "epoch": 5.665721040189125, + "grad_norm": 3.1414871215820312, + "learning_rate": 3.880345889984316e-08, + "loss": 0.2796, + "step": 11983 + }, + { + "epoch": 5.666193853427896, + "grad_norm": 3.2845115661621094, + "learning_rate": 3.8694051040240876e-08, + "loss": 0.328, + "step": 11984 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 3.565127372741699, + "learning_rate": 3.858479643536778e-08, + "loss": 0.3527, + "step": 11985 + }, + { + "epoch": 5.667139479905437, + "grad_norm": 3.017859935760498, + "learning_rate": 3.8475695092026476e-08, + "loss": 0.3379, + "step": 11986 + }, + { + "epoch": 5.667612293144208, + "grad_norm": 3.2508249282836914, + "learning_rate": 3.836674701701015e-08, + "loss": 0.3073, + "step": 11987 + }, + { + "epoch": 5.6680851063829785, + "grad_norm": 3.1742143630981445, + "learning_rate": 3.825795221710255e-08, + "loss": 0.3184, + "step": 11988 + }, + { + "epoch": 5.668557919621749, + "grad_norm": 3.096966028213501, + "learning_rate": 3.8149310699077956e-08, + "loss": 0.2686, + "step": 11989 + }, + { + "epoch": 5.66903073286052, + "grad_norm": 3.26834774017334, + "learning_rate": 3.80408224697007e-08, + "loss": 0.3532, + "step": 11990 + }, + { + "epoch": 5.669503546099291, + "grad_norm": 2.8902487754821777, + "learning_rate": 3.7932487535725924e-08, + "loss": 0.3226, + "step": 11991 + }, + { + "epoch": 5.669976359338062, + "grad_norm": 3.4881465435028076, + "learning_rate": 3.7824305903899626e-08, + "loss": 0.3187, + "step": 11992 + }, + { + "epoch": 5.6704491725768325, + "grad_norm": 3.0470924377441406, + "learning_rate": 3.771627758095725e-08, + "loss": 0.249, + "step": 11993 + }, + { + "epoch": 5.670921985815603, + "grad_norm": 3.4965827465057373, + "learning_rate": 3.760840257362564e-08, + "loss": 0.3688, + "step": 11994 + }, + { + "epoch": 5.671394799054373, + "grad_norm": 3.615798234939575, + "learning_rate": 3.7500680888621355e-08, + "loss": 0.3235, + "step": 11995 + }, + { + "epoch": 5.671867612293144, + "grad_norm": 3.8858070373535156, + "learning_rate": 3.739311253265237e-08, + "loss": 0.3749, + "step": 11996 + }, + { + "epoch": 5.672340425531915, + "grad_norm": 3.0234808921813965, + "learning_rate": 3.7285697512415844e-08, + "loss": 0.2837, + "step": 11997 + }, + { + "epoch": 5.6728132387706856, + "grad_norm": 4.143755912780762, + "learning_rate": 3.717843583460001e-08, + "loss": 0.3399, + "step": 11998 + }, + { + "epoch": 5.673286052009456, + "grad_norm": 3.064124822616577, + "learning_rate": 3.707132750588399e-08, + "loss": 0.3285, + "step": 11999 + }, + { + "epoch": 5.673758865248227, + "grad_norm": 3.348433256149292, + 
"learning_rate": 3.696437253293689e-08, + "loss": 0.3401, + "step": 12000 + }, + { + "epoch": 5.674231678486998, + "grad_norm": 3.011606216430664, + "learning_rate": 3.68575709224181e-08, + "loss": 0.2829, + "step": 12001 + }, + { + "epoch": 5.674704491725768, + "grad_norm": 2.935004234313965, + "learning_rate": 3.675092268097785e-08, + "loss": 0.3275, + "step": 12002 + }, + { + "epoch": 5.675177304964539, + "grad_norm": 3.3137762546539307, + "learning_rate": 3.664442781525668e-08, + "loss": 0.3274, + "step": 12003 + }, + { + "epoch": 5.6756501182033094, + "grad_norm": 3.837202310562134, + "learning_rate": 3.653808633188538e-08, + "loss": 0.324, + "step": 12004 + }, + { + "epoch": 5.67612293144208, + "grad_norm": 2.972749710083008, + "learning_rate": 3.643189823748561e-08, + "loss": 0.2805, + "step": 12005 + }, + { + "epoch": 5.676595744680851, + "grad_norm": 4.356303691864014, + "learning_rate": 3.632586353866902e-08, + "loss": 0.3509, + "step": 12006 + }, + { + "epoch": 5.677068557919622, + "grad_norm": 3.2707748413085938, + "learning_rate": 3.62199822420381e-08, + "loss": 0.3416, + "step": 12007 + }, + { + "epoch": 5.677541371158393, + "grad_norm": 3.460958957672119, + "learning_rate": 3.611425435418536e-08, + "loss": 0.323, + "step": 12008 + }, + { + "epoch": 5.678014184397163, + "grad_norm": 3.1501171588897705, + "learning_rate": 3.600867988169443e-08, + "loss": 0.3542, + "step": 12009 + }, + { + "epoch": 5.678486997635934, + "grad_norm": 3.2512545585632324, + "learning_rate": 3.590325883113838e-08, + "loss": 0.3031, + "step": 12010 + }, + { + "epoch": 5.678959810874704, + "grad_norm": 3.1285507678985596, + "learning_rate": 3.579799120908195e-08, + "loss": 0.3128, + "step": 12011 + }, + { + "epoch": 5.679432624113475, + "grad_norm": 3.4275474548339844, + "learning_rate": 3.569287702207963e-08, + "loss": 0.3152, + "step": 12012 + }, + { + "epoch": 5.679905437352246, + "grad_norm": 3.314131498336792, + "learning_rate": 3.558791627667563e-08, + "loss": 0.3164, + "step": 12013 + }, + { + "epoch": 5.6803782505910165, + "grad_norm": 3.040032148361206, + "learning_rate": 3.548310897940638e-08, + "loss": 0.2703, + "step": 12014 + }, + { + "epoch": 5.680851063829787, + "grad_norm": 3.3257627487182617, + "learning_rate": 3.537845513679722e-08, + "loss": 0.2802, + "step": 12015 + }, + { + "epoch": 5.681323877068558, + "grad_norm": 3.1988141536712646, + "learning_rate": 3.527395475536488e-08, + "loss": 0.3512, + "step": 12016 + }, + { + "epoch": 5.681796690307329, + "grad_norm": 3.1683738231658936, + "learning_rate": 3.516960784161555e-08, + "loss": 0.2829, + "step": 12017 + }, + { + "epoch": 5.682269503546099, + "grad_norm": 3.181068181991577, + "learning_rate": 3.5065414402046805e-08, + "loss": 0.2705, + "step": 12018 + }, + { + "epoch": 5.68274231678487, + "grad_norm": 3.3845975399017334, + "learning_rate": 3.4961374443146235e-08, + "loss": 0.3357, + "step": 12019 + }, + { + "epoch": 5.68321513002364, + "grad_norm": 3.1401607990264893, + "learning_rate": 3.485748797139199e-08, + "loss": 0.342, + "step": 12020 + }, + { + "epoch": 5.683687943262411, + "grad_norm": 2.891299247741699, + "learning_rate": 3.4753754993253064e-08, + "loss": 0.3359, + "step": 12021 + }, + { + "epoch": 5.684160756501182, + "grad_norm": 3.0465214252471924, + "learning_rate": 3.465017551518762e-08, + "loss": 0.3198, + "step": 12022 + }, + { + "epoch": 5.684633569739953, + "grad_norm": 3.44197678565979, + "learning_rate": 3.4546749543645506e-08, + "loss": 0.3345, + "step": 12023 + }, + { + "epoch": 5.685106382978724, + 
"grad_norm": 3.3884904384613037, + "learning_rate": 3.4443477085066847e-08, + "loss": 0.316, + "step": 12024 + }, + { + "epoch": 5.685579196217494, + "grad_norm": 3.266526937484741, + "learning_rate": 3.434035814588177e-08, + "loss": 0.2804, + "step": 12025 + }, + { + "epoch": 5.686052009456265, + "grad_norm": 3.4479098320007324, + "learning_rate": 3.42373927325107e-08, + "loss": 0.3282, + "step": 12026 + }, + { + "epoch": 5.686524822695035, + "grad_norm": 3.1241097450256348, + "learning_rate": 3.4134580851365176e-08, + "loss": 0.3045, + "step": 12027 + }, + { + "epoch": 5.686997635933806, + "grad_norm": 3.032581090927124, + "learning_rate": 3.4031922508847014e-08, + "loss": 0.3118, + "step": 12028 + }, + { + "epoch": 5.687470449172577, + "grad_norm": 3.0940909385681152, + "learning_rate": 3.392941771134806e-08, + "loss": 0.3088, + "step": 12029 + }, + { + "epoch": 5.6879432624113475, + "grad_norm": 3.195840835571289, + "learning_rate": 3.382706646525069e-08, + "loss": 0.3332, + "step": 12030 + }, + { + "epoch": 5.688416075650118, + "grad_norm": 3.198594093322754, + "learning_rate": 3.3724868776928434e-08, + "loss": 0.3845, + "step": 12031 + }, + { + "epoch": 5.688888888888889, + "grad_norm": 3.1207070350646973, + "learning_rate": 3.362282465274397e-08, + "loss": 0.3079, + "step": 12032 + }, + { + "epoch": 5.68936170212766, + "grad_norm": 3.221534252166748, + "learning_rate": 3.352093409905138e-08, + "loss": 0.2855, + "step": 12033 + }, + { + "epoch": 5.68983451536643, + "grad_norm": 3.2485337257385254, + "learning_rate": 3.3419197122195325e-08, + "loss": 0.3146, + "step": 12034 + }, + { + "epoch": 5.690307328605201, + "grad_norm": 3.200204372406006, + "learning_rate": 3.331761372851017e-08, + "loss": 0.3504, + "step": 12035 + }, + { + "epoch": 5.690780141843971, + "grad_norm": 3.3995420932769775, + "learning_rate": 3.321618392432085e-08, + "loss": 0.3378, + "step": 12036 + }, + { + "epoch": 5.691252955082742, + "grad_norm": 2.949434757232666, + "learning_rate": 3.311490771594372e-08, + "loss": 0.3069, + "step": 12037 + }, + { + "epoch": 5.691725768321513, + "grad_norm": 3.4283101558685303, + "learning_rate": 3.3013785109684005e-08, + "loss": 0.3318, + "step": 12038 + }, + { + "epoch": 5.692198581560284, + "grad_norm": 3.3977811336517334, + "learning_rate": 3.291281611183861e-08, + "loss": 0.3437, + "step": 12039 + }, + { + "epoch": 5.692671394799055, + "grad_norm": 3.0468637943267822, + "learning_rate": 3.281200072869417e-08, + "loss": 0.2615, + "step": 12040 + }, + { + "epoch": 5.693144208037825, + "grad_norm": 3.370964527130127, + "learning_rate": 3.271133896652817e-08, + "loss": 0.2964, + "step": 12041 + }, + { + "epoch": 5.693617021276596, + "grad_norm": 3.3542773723602295, + "learning_rate": 3.261083083160838e-08, + "loss": 0.3612, + "step": 12042 + }, + { + "epoch": 5.694089834515366, + "grad_norm": 3.369004964828491, + "learning_rate": 3.251047633019283e-08, + "loss": 0.3721, + "step": 12043 + }, + { + "epoch": 5.694562647754137, + "grad_norm": 3.358621835708618, + "learning_rate": 3.2410275468530705e-08, + "loss": 0.3454, + "step": 12044 + }, + { + "epoch": 5.695035460992908, + "grad_norm": 2.8612661361694336, + "learning_rate": 3.231022825286034e-08, + "loss": 0.3404, + "step": 12045 + }, + { + "epoch": 5.6955082742316785, + "grad_norm": 3.1585562229156494, + "learning_rate": 3.221033468941176e-08, + "loss": 0.3204, + "step": 12046 + }, + { + "epoch": 5.695981087470449, + "grad_norm": 3.128363609313965, + "learning_rate": 3.2110594784404706e-08, + "loss": 0.3188, + "step": 12047 
+ }, + { + "epoch": 5.69645390070922, + "grad_norm": 3.4005603790283203, + "learning_rate": 3.2011008544049206e-08, + "loss": 0.3185, + "step": 12048 + }, + { + "epoch": 5.696926713947991, + "grad_norm": 3.1740269660949707, + "learning_rate": 3.191157597454669e-08, + "loss": 0.2825, + "step": 12049 + }, + { + "epoch": 5.697399527186761, + "grad_norm": 3.5193979740142822, + "learning_rate": 3.181229708208833e-08, + "loss": 0.3448, + "step": 12050 + }, + { + "epoch": 5.697872340425532, + "grad_norm": 3.2826907634735107, + "learning_rate": 3.1713171872855e-08, + "loss": 0.2779, + "step": 12051 + }, + { + "epoch": 5.698345153664302, + "grad_norm": 2.7100746631622314, + "learning_rate": 3.161420035301982e-08, + "loss": 0.2536, + "step": 12052 + }, + { + "epoch": 5.698817966903073, + "grad_norm": 3.387242317199707, + "learning_rate": 3.151538252874453e-08, + "loss": 0.3291, + "step": 12053 + }, + { + "epoch": 5.699290780141844, + "grad_norm": 3.3984251022338867, + "learning_rate": 3.1416718406182545e-08, + "loss": 0.2843, + "step": 12054 + }, + { + "epoch": 5.699763593380615, + "grad_norm": 3.0349318981170654, + "learning_rate": 3.1318207991477e-08, + "loss": 0.3216, + "step": 12055 + }, + { + "epoch": 5.700236406619386, + "grad_norm": 3.406585693359375, + "learning_rate": 3.12198512907616e-08, + "loss": 0.3496, + "step": 12056 + }, + { + "epoch": 5.700709219858156, + "grad_norm": 3.709542989730835, + "learning_rate": 3.112164831016118e-08, + "loss": 0.3285, + "step": 12057 + }, + { + "epoch": 5.701182033096927, + "grad_norm": 3.145622491836548, + "learning_rate": 3.102359905579e-08, + "loss": 0.3323, + "step": 12058 + }, + { + "epoch": 5.701654846335697, + "grad_norm": 3.2133283615112305, + "learning_rate": 3.0925703533752905e-08, + "loss": 0.2981, + "step": 12059 + }, + { + "epoch": 5.702127659574468, + "grad_norm": 3.4080936908721924, + "learning_rate": 3.082796175014585e-08, + "loss": 0.3176, + "step": 12060 + }, + { + "epoch": 5.702600472813239, + "grad_norm": 3.290942668914795, + "learning_rate": 3.073037371105453e-08, + "loss": 0.361, + "step": 12061 + }, + { + "epoch": 5.7030732860520095, + "grad_norm": 2.5797150135040283, + "learning_rate": 3.0632939422555466e-08, + "loss": 0.2767, + "step": 12062 + }, + { + "epoch": 5.70354609929078, + "grad_norm": 3.083340883255005, + "learning_rate": 3.053565889071547e-08, + "loss": 0.347, + "step": 12063 + }, + { + "epoch": 5.704018912529551, + "grad_norm": 3.1182093620300293, + "learning_rate": 3.043853212159165e-08, + "loss": 0.2469, + "step": 12064 + }, + { + "epoch": 5.704491725768322, + "grad_norm": 3.1092607975006104, + "learning_rate": 3.034155912123193e-08, + "loss": 0.3342, + "step": 12065 + }, + { + "epoch": 5.704964539007092, + "grad_norm": 3.1561546325683594, + "learning_rate": 3.0244739895673725e-08, + "loss": 0.3184, + "step": 12066 + }, + { + "epoch": 5.705437352245863, + "grad_norm": 3.974445343017578, + "learning_rate": 3.014807445094636e-08, + "loss": 0.3318, + "step": 12067 + }, + { + "epoch": 5.705910165484633, + "grad_norm": 2.7729475498199463, + "learning_rate": 3.0051562793068646e-08, + "loss": 0.2742, + "step": 12068 + }, + { + "epoch": 5.706382978723404, + "grad_norm": 3.2135114669799805, + "learning_rate": 2.99552049280491e-08, + "loss": 0.3302, + "step": 12069 + }, + { + "epoch": 5.706855791962175, + "grad_norm": 3.0425431728363037, + "learning_rate": 2.9859000861888757e-08, + "loss": 0.2774, + "step": 12070 + }, + { + "epoch": 5.707328605200946, + "grad_norm": 3.1506264209747314, + "learning_rate": 2.976295060057671e-08, 
+ "loss": 0.3346, + "step": 12071 + }, + { + "epoch": 5.707801418439717, + "grad_norm": 3.517425537109375, + "learning_rate": 2.96670541500943e-08, + "loss": 0.3034, + "step": 12072 + }, + { + "epoch": 5.708274231678487, + "grad_norm": 3.0956437587738037, + "learning_rate": 2.9571311516412295e-08, + "loss": 0.3602, + "step": 12073 + }, + { + "epoch": 5.708747044917258, + "grad_norm": 3.1720614433288574, + "learning_rate": 2.9475722705492037e-08, + "loss": 0.3173, + "step": 12074 + }, + { + "epoch": 5.709219858156028, + "grad_norm": 2.999795436859131, + "learning_rate": 2.9380287723285994e-08, + "loss": 0.3113, + "step": 12075 + }, + { + "epoch": 5.709692671394799, + "grad_norm": 3.511803150177002, + "learning_rate": 2.9285006575735798e-08, + "loss": 0.2905, + "step": 12076 + }, + { + "epoch": 5.71016548463357, + "grad_norm": 3.7468740940093994, + "learning_rate": 2.918987926877448e-08, + "loss": 0.3837, + "step": 12077 + }, + { + "epoch": 5.7106382978723405, + "grad_norm": 3.5052263736724854, + "learning_rate": 2.9094905808325358e-08, + "loss": 0.3343, + "step": 12078 + }, + { + "epoch": 5.711111111111111, + "grad_norm": 3.2323622703552246, + "learning_rate": 2.9000086200302036e-08, + "loss": 0.3023, + "step": 12079 + }, + { + "epoch": 5.711583924349882, + "grad_norm": 2.9456934928894043, + "learning_rate": 2.8905420450608122e-08, + "loss": 0.3137, + "step": 12080 + }, + { + "epoch": 5.712056737588653, + "grad_norm": 3.1025915145874023, + "learning_rate": 2.881090856513835e-08, + "loss": 0.3041, + "step": 12081 + }, + { + "epoch": 5.712529550827423, + "grad_norm": 2.798013687133789, + "learning_rate": 2.8716550549777177e-08, + "loss": 0.2928, + "step": 12082 + }, + { + "epoch": 5.7130023640661936, + "grad_norm": 3.002685070037842, + "learning_rate": 2.8622346410400183e-08, + "loss": 0.2916, + "step": 12083 + }, + { + "epoch": 5.713475177304964, + "grad_norm": 3.4977736473083496, + "learning_rate": 2.8528296152873225e-08, + "loss": 0.3135, + "step": 12084 + }, + { + "epoch": 5.713947990543735, + "grad_norm": 3.023202419281006, + "learning_rate": 2.843439978305218e-08, + "loss": 0.3818, + "step": 12085 + }, + { + "epoch": 5.714420803782506, + "grad_norm": 3.6928040981292725, + "learning_rate": 2.834065730678348e-08, + "loss": 0.3388, + "step": 12086 + }, + { + "epoch": 5.714893617021277, + "grad_norm": 3.3528921604156494, + "learning_rate": 2.8247068729904126e-08, + "loss": 0.3263, + "step": 12087 + }, + { + "epoch": 5.7153664302600475, + "grad_norm": 3.0671277046203613, + "learning_rate": 2.8153634058241397e-08, + "loss": 0.3563, + "step": 12088 + }, + { + "epoch": 5.715839243498818, + "grad_norm": 3.4238786697387695, + "learning_rate": 2.806035329761314e-08, + "loss": 0.3039, + "step": 12089 + }, + { + "epoch": 5.716312056737589, + "grad_norm": 3.6761975288391113, + "learning_rate": 2.7967226453827767e-08, + "loss": 0.2712, + "step": 12090 + }, + { + "epoch": 5.716784869976359, + "grad_norm": 3.2580909729003906, + "learning_rate": 2.787425353268369e-08, + "loss": 0.2774, + "step": 12091 + }, + { + "epoch": 5.71725768321513, + "grad_norm": 3.3871915340423584, + "learning_rate": 2.7781434539969897e-08, + "loss": 0.3026, + "step": 12092 + }, + { + "epoch": 5.717730496453901, + "grad_norm": 3.752911329269409, + "learning_rate": 2.768876948146565e-08, + "loss": 0.3212, + "step": 12093 + }, + { + "epoch": 5.718203309692671, + "grad_norm": 3.4740591049194336, + "learning_rate": 2.7596258362941052e-08, + "loss": 0.377, + "step": 12094 + }, + { + "epoch": 5.718676122931442, + "grad_norm": 
3.0362772941589355, + "learning_rate": 2.7503901190156502e-08, + "loss": 0.3247, + "step": 12095 + }, + { + "epoch": 5.719148936170213, + "grad_norm": 3.3042995929718018, + "learning_rate": 2.7411697968862117e-08, + "loss": 0.3184, + "step": 12096 + }, + { + "epoch": 5.719621749408984, + "grad_norm": 2.989022731781006, + "learning_rate": 2.731964870479942e-08, + "loss": 0.3252, + "step": 12097 + }, + { + "epoch": 5.720094562647754, + "grad_norm": 3.2194690704345703, + "learning_rate": 2.722775340369993e-08, + "loss": 0.3424, + "step": 12098 + }, + { + "epoch": 5.7205673758865245, + "grad_norm": 3.509066581726074, + "learning_rate": 2.713601207128547e-08, + "loss": 0.3128, + "step": 12099 + }, + { + "epoch": 5.721040189125295, + "grad_norm": 3.277519941329956, + "learning_rate": 2.7044424713268124e-08, + "loss": 0.3411, + "step": 12100 + }, + { + "epoch": 5.721513002364066, + "grad_norm": 3.0034539699554443, + "learning_rate": 2.695299133535112e-08, + "loss": 0.3048, + "step": 12101 + }, + { + "epoch": 5.721985815602837, + "grad_norm": 3.2064356803894043, + "learning_rate": 2.686171194322712e-08, + "loss": 0.3154, + "step": 12102 + }, + { + "epoch": 5.722458628841608, + "grad_norm": 3.3201537132263184, + "learning_rate": 2.6770586542579914e-08, + "loss": 0.3004, + "step": 12103 + }, + { + "epoch": 5.7229314420803785, + "grad_norm": 2.984123706817627, + "learning_rate": 2.6679615139083847e-08, + "loss": 0.2802, + "step": 12104 + }, + { + "epoch": 5.723404255319149, + "grad_norm": 3.4404940605163574, + "learning_rate": 2.658879773840245e-08, + "loss": 0.3032, + "step": 12105 + }, + { + "epoch": 5.72387706855792, + "grad_norm": 3.38144850730896, + "learning_rate": 2.6498134346191474e-08, + "loss": 0.3569, + "step": 12106 + }, + { + "epoch": 5.72434988179669, + "grad_norm": 3.053532838821411, + "learning_rate": 2.6407624968095568e-08, + "loss": 0.3277, + "step": 12107 + }, + { + "epoch": 5.724822695035461, + "grad_norm": 3.94144606590271, + "learning_rate": 2.6317269609750507e-08, + "loss": 0.355, + "step": 12108 + }, + { + "epoch": 5.725295508274232, + "grad_norm": 2.9377691745758057, + "learning_rate": 2.622706827678234e-08, + "loss": 0.3036, + "step": 12109 + }, + { + "epoch": 5.725768321513002, + "grad_norm": 3.1034493446350098, + "learning_rate": 2.6137020974807415e-08, + "loss": 0.2807, + "step": 12110 + }, + { + "epoch": 5.726241134751773, + "grad_norm": 3.27827787399292, + "learning_rate": 2.604712770943263e-08, + "loss": 0.3221, + "step": 12111 + }, + { + "epoch": 5.726713947990544, + "grad_norm": 3.3290865421295166, + "learning_rate": 2.5957388486255175e-08, + "loss": 0.3486, + "step": 12112 + }, + { + "epoch": 5.727186761229315, + "grad_norm": 3.1709330081939697, + "learning_rate": 2.5867803310862806e-08, + "loss": 0.3082, + "step": 12113 + }, + { + "epoch": 5.727659574468085, + "grad_norm": 3.3823163509368896, + "learning_rate": 2.5778372188833555e-08, + "loss": 0.3593, + "step": 12114 + }, + { + "epoch": 5.7281323877068555, + "grad_norm": 3.1143476963043213, + "learning_rate": 2.568909512573603e-08, + "loss": 0.3166, + "step": 12115 + }, + { + "epoch": 5.728605200945626, + "grad_norm": 3.666964054107666, + "learning_rate": 2.5599972127128834e-08, + "loss": 0.3397, + "step": 12116 + }, + { + "epoch": 5.729078014184397, + "grad_norm": 3.3389058113098145, + "learning_rate": 2.5511003198561424e-08, + "loss": 0.3055, + "step": 12117 + }, + { + "epoch": 5.729550827423168, + "grad_norm": 2.8332576751708984, + "learning_rate": 2.5422188345573807e-08, + "loss": 0.3106, + "step": 12118 + }, 
+ { + "epoch": 5.730023640661939, + "grad_norm": 3.3573875427246094, + "learning_rate": 2.5333527573695728e-08, + "loss": 0.3222, + "step": 12119 + }, + { + "epoch": 5.7304964539007095, + "grad_norm": 3.2714033126831055, + "learning_rate": 2.5245020888447768e-08, + "loss": 0.3462, + "step": 12120 + }, + { + "epoch": 5.73096926713948, + "grad_norm": 2.817124605178833, + "learning_rate": 2.515666829534108e-08, + "loss": 0.2802, + "step": 12121 + }, + { + "epoch": 5.73144208037825, + "grad_norm": 3.2658917903900146, + "learning_rate": 2.506846979987654e-08, + "loss": 0.3319, + "step": 12122 + }, + { + "epoch": 5.731914893617021, + "grad_norm": 3.310659408569336, + "learning_rate": 2.498042540754614e-08, + "loss": 0.3271, + "step": 12123 + }, + { + "epoch": 5.732387706855792, + "grad_norm": 3.0620813369750977, + "learning_rate": 2.489253512383216e-08, + "loss": 0.338, + "step": 12124 + }, + { + "epoch": 5.732860520094563, + "grad_norm": 2.6192173957824707, + "learning_rate": 2.48047989542069e-08, + "loss": 0.2672, + "step": 12125 + }, + { + "epoch": 5.733333333333333, + "grad_norm": 3.0177063941955566, + "learning_rate": 2.4717216904133478e-08, + "loss": 0.3229, + "step": 12126 + }, + { + "epoch": 5.733806146572104, + "grad_norm": 3.2077372074127197, + "learning_rate": 2.462978897906504e-08, + "loss": 0.3353, + "step": 12127 + }, + { + "epoch": 5.734278959810875, + "grad_norm": 3.0624101161956787, + "learning_rate": 2.4542515184445836e-08, + "loss": 0.2807, + "step": 12128 + }, + { + "epoch": 5.734751773049645, + "grad_norm": 3.681156635284424, + "learning_rate": 2.44553955257093e-08, + "loss": 0.3494, + "step": 12129 + }, + { + "epoch": 5.735224586288416, + "grad_norm": 3.5133650302886963, + "learning_rate": 2.4368430008280265e-08, + "loss": 0.3255, + "step": 12130 + }, + { + "epoch": 5.7356973995271865, + "grad_norm": 3.5622475147247314, + "learning_rate": 2.4281618637574113e-08, + "loss": 0.3481, + "step": 12131 + }, + { + "epoch": 5.736170212765957, + "grad_norm": 3.804569959640503, + "learning_rate": 2.4194961418995687e-08, + "loss": 0.3104, + "step": 12132 + }, + { + "epoch": 5.736643026004728, + "grad_norm": 3.2981555461883545, + "learning_rate": 2.4108458357940957e-08, + "loss": 0.3233, + "step": 12133 + }, + { + "epoch": 5.737115839243499, + "grad_norm": 3.428004026412964, + "learning_rate": 2.4022109459796162e-08, + "loss": 0.3342, + "step": 12134 + }, + { + "epoch": 5.73758865248227, + "grad_norm": 2.9092800617218018, + "learning_rate": 2.393591472993756e-08, + "loss": 0.3019, + "step": 12135 + }, + { + "epoch": 5.7380614657210405, + "grad_norm": 3.9165987968444824, + "learning_rate": 2.3849874173732246e-08, + "loss": 0.3334, + "step": 12136 + }, + { + "epoch": 5.738534278959811, + "grad_norm": 3.541109561920166, + "learning_rate": 2.3763987796538156e-08, + "loss": 0.2841, + "step": 12137 + }, + { + "epoch": 5.739007092198581, + "grad_norm": 3.3009915351867676, + "learning_rate": 2.3678255603702127e-08, + "loss": 0.3493, + "step": 12138 + }, + { + "epoch": 5.739479905437352, + "grad_norm": 3.619457721710205, + "learning_rate": 2.359267760056294e-08, + "loss": 0.3453, + "step": 12139 + }, + { + "epoch": 5.739952718676123, + "grad_norm": 2.9422662258148193, + "learning_rate": 2.3507253792448835e-08, + "loss": 0.2815, + "step": 12140 + }, + { + "epoch": 5.740425531914894, + "grad_norm": 3.0050137042999268, + "learning_rate": 2.342198418467917e-08, + "loss": 0.3371, + "step": 12141 + }, + { + "epoch": 5.740898345153664, + "grad_norm": 3.315720796585083, + "learning_rate": 
2.3336868782563026e-08, + "loss": 0.2809, + "step": 12142 + }, + { + "epoch": 5.741371158392435, + "grad_norm": 3.035090684890747, + "learning_rate": 2.3251907591400335e-08, + "loss": 0.2758, + "step": 12143 + }, + { + "epoch": 5.741843971631206, + "grad_norm": 3.3625571727752686, + "learning_rate": 2.316710061648103e-08, + "loss": 0.3082, + "step": 12144 + }, + { + "epoch": 5.742316784869976, + "grad_norm": 3.2533843517303467, + "learning_rate": 2.3082447863085887e-08, + "loss": 0.36, + "step": 12145 + }, + { + "epoch": 5.742789598108747, + "grad_norm": 3.4281437397003174, + "learning_rate": 2.2997949336485693e-08, + "loss": 0.3048, + "step": 12146 + }, + { + "epoch": 5.7432624113475175, + "grad_norm": 3.467841148376465, + "learning_rate": 2.291360504194179e-08, + "loss": 0.3086, + "step": 12147 + }, + { + "epoch": 5.743735224586288, + "grad_norm": 3.471487283706665, + "learning_rate": 2.282941498470581e-08, + "loss": 0.3201, + "step": 12148 + }, + { + "epoch": 5.744208037825059, + "grad_norm": 3.9886951446533203, + "learning_rate": 2.274537917002051e-08, + "loss": 0.3493, + "step": 12149 + }, + { + "epoch": 5.74468085106383, + "grad_norm": 3.6634814739227295, + "learning_rate": 2.266149760311781e-08, + "loss": 0.2909, + "step": 12150 + }, + { + "epoch": 5.745153664302601, + "grad_norm": 3.017439126968384, + "learning_rate": 2.2577770289220758e-08, + "loss": 0.3173, + "step": 12151 + }, + { + "epoch": 5.7456264775413715, + "grad_norm": 3.0666286945343018, + "learning_rate": 2.2494197233542957e-08, + "loss": 0.3455, + "step": 12152 + }, + { + "epoch": 5.746099290780142, + "grad_norm": 3.200896978378296, + "learning_rate": 2.241077844128775e-08, + "loss": 0.373, + "step": 12153 + }, + { + "epoch": 5.746572104018912, + "grad_norm": 3.3661558628082275, + "learning_rate": 2.232751391764959e-08, + "loss": 0.3274, + "step": 12154 + }, + { + "epoch": 5.747044917257683, + "grad_norm": 3.8031365871429443, + "learning_rate": 2.2244403667812942e-08, + "loss": 0.383, + "step": 12155 + }, + { + "epoch": 5.747517730496454, + "grad_norm": 3.1112916469573975, + "learning_rate": 2.2161447696952555e-08, + "loss": 0.3553, + "step": 12156 + }, + { + "epoch": 5.7479905437352246, + "grad_norm": 3.2407584190368652, + "learning_rate": 2.2078646010234016e-08, + "loss": 0.3317, + "step": 12157 + }, + { + "epoch": 5.748463356973995, + "grad_norm": 3.176625967025757, + "learning_rate": 2.1995998612812654e-08, + "loss": 0.3131, + "step": 12158 + }, + { + "epoch": 5.748936170212766, + "grad_norm": 3.056150197982788, + "learning_rate": 2.19135055098349e-08, + "loss": 0.3119, + "step": 12159 + }, + { + "epoch": 5.749408983451537, + "grad_norm": 3.1598548889160156, + "learning_rate": 2.1831166706436925e-08, + "loss": 0.2552, + "step": 12160 + }, + { + "epoch": 5.749881796690307, + "grad_norm": 3.460984230041504, + "learning_rate": 2.174898220774574e-08, + "loss": 0.3239, + "step": 12161 + }, + { + "epoch": 5.750354609929078, + "grad_norm": 3.1156563758850098, + "learning_rate": 2.166695201887864e-08, + "loss": 0.3393, + "step": 12162 + }, + { + "epoch": 5.7508274231678485, + "grad_norm": 3.162257671356201, + "learning_rate": 2.1585076144943208e-08, + "loss": 0.3006, + "step": 12163 + }, + { + "epoch": 5.751300236406619, + "grad_norm": 3.323511838912964, + "learning_rate": 2.1503354591037585e-08, + "loss": 0.2765, + "step": 12164 + }, + { + "epoch": 5.75177304964539, + "grad_norm": 3.358081817626953, + "learning_rate": 2.1421787362250478e-08, + "loss": 0.3604, + "step": 12165 + }, + { + "epoch": 5.752245862884161, + 
"grad_norm": 3.3027939796447754, + "learning_rate": 2.1340374463660053e-08, + "loss": 0.345, + "step": 12166 + }, + { + "epoch": 5.752718676122932, + "grad_norm": 3.702924966812134, + "learning_rate": 2.1259115900336137e-08, + "loss": 0.3644, + "step": 12167 + }, + { + "epoch": 5.753191489361702, + "grad_norm": 3.689192056655884, + "learning_rate": 2.117801167733774e-08, + "loss": 0.3382, + "step": 12168 + }, + { + "epoch": 5.753664302600473, + "grad_norm": 3.0726840496063232, + "learning_rate": 2.1097061799715824e-08, + "loss": 0.3438, + "step": 12169 + }, + { + "epoch": 5.754137115839243, + "grad_norm": 3.3648977279663086, + "learning_rate": 2.1016266272509967e-08, + "loss": 0.3409, + "step": 12170 + }, + { + "epoch": 5.754609929078014, + "grad_norm": 4.000081539154053, + "learning_rate": 2.0935625100751144e-08, + "loss": 0.3031, + "step": 12171 + }, + { + "epoch": 5.755082742316785, + "grad_norm": 3.1993863582611084, + "learning_rate": 2.085513828946062e-08, + "loss": 0.3335, + "step": 12172 + }, + { + "epoch": 5.7555555555555555, + "grad_norm": 4.029566287994385, + "learning_rate": 2.0774805843649937e-08, + "loss": 0.3427, + "step": 12173 + }, + { + "epoch": 5.756028368794326, + "grad_norm": 3.3171730041503906, + "learning_rate": 2.069462776832093e-08, + "loss": 0.2913, + "step": 12174 + }, + { + "epoch": 5.756501182033097, + "grad_norm": 2.9666988849639893, + "learning_rate": 2.0614604068465993e-08, + "loss": 0.3242, + "step": 12175 + }, + { + "epoch": 5.756973995271868, + "grad_norm": 3.070720672607422, + "learning_rate": 2.0534734749068087e-08, + "loss": 0.2847, + "step": 12176 + }, + { + "epoch": 5.757446808510638, + "grad_norm": 3.260769844055176, + "learning_rate": 2.0455019815099897e-08, + "loss": 0.3313, + "step": 12177 + }, + { + "epoch": 5.757919621749409, + "grad_norm": 3.9504573345184326, + "learning_rate": 2.0375459271525232e-08, + "loss": 0.3834, + "step": 12178 + }, + { + "epoch": 5.758392434988179, + "grad_norm": 3.1657321453094482, + "learning_rate": 2.029605312329791e-08, + "loss": 0.3396, + "step": 12179 + }, + { + "epoch": 5.75886524822695, + "grad_norm": 3.1386959552764893, + "learning_rate": 2.0216801375362304e-08, + "loss": 0.2789, + "step": 12180 + }, + { + "epoch": 5.759338061465721, + "grad_norm": 3.144196033477783, + "learning_rate": 2.0137704032652528e-08, + "loss": 0.3203, + "step": 12181 + }, + { + "epoch": 5.759810874704492, + "grad_norm": 3.669904947280884, + "learning_rate": 2.005876110009436e-08, + "loss": 0.3259, + "step": 12182 + }, + { + "epoch": 5.760283687943263, + "grad_norm": 3.2438392639160156, + "learning_rate": 1.9979972582603034e-08, + "loss": 0.3269, + "step": 12183 + }, + { + "epoch": 5.760756501182033, + "grad_norm": 3.5162949562072754, + "learning_rate": 1.9901338485084075e-08, + "loss": 0.3881, + "step": 12184 + }, + { + "epoch": 5.761229314420804, + "grad_norm": 3.1763012409210205, + "learning_rate": 1.9822858812433844e-08, + "loss": 0.3194, + "step": 12185 + }, + { + "epoch": 5.761702127659574, + "grad_norm": 2.9922640323638916, + "learning_rate": 1.974453356953898e-08, + "loss": 0.336, + "step": 12186 + }, + { + "epoch": 5.762174940898345, + "grad_norm": 3.721003293991089, + "learning_rate": 1.9666362761276425e-08, + "loss": 0.3112, + "step": 12187 + }, + { + "epoch": 5.762647754137116, + "grad_norm": 3.0573668479919434, + "learning_rate": 1.958834639251367e-08, + "loss": 0.2955, + "step": 12188 + }, + { + "epoch": 5.7631205673758865, + "grad_norm": 3.172262191772461, + "learning_rate": 1.9510484468108215e-08, + "loss": 0.3028, + 
"step": 12189 + }, + { + "epoch": 5.763593380614657, + "grad_norm": 3.4394562244415283, + "learning_rate": 1.943277699290841e-08, + "loss": 0.3573, + "step": 12190 + }, + { + "epoch": 5.764066193853428, + "grad_norm": 3.005544900894165, + "learning_rate": 1.9355223971752324e-08, + "loss": 0.2809, + "step": 12191 + }, + { + "epoch": 5.764539007092199, + "grad_norm": 3.416400194168091, + "learning_rate": 1.927782540946943e-08, + "loss": 0.2926, + "step": 12192 + }, + { + "epoch": 5.765011820330969, + "grad_norm": 2.908393144607544, + "learning_rate": 1.9200581310878373e-08, + "loss": 0.3074, + "step": 12193 + }, + { + "epoch": 5.76548463356974, + "grad_norm": 2.976839542388916, + "learning_rate": 1.9123491680789473e-08, + "loss": 0.285, + "step": 12194 + }, + { + "epoch": 5.76595744680851, + "grad_norm": 3.4792447090148926, + "learning_rate": 1.9046556524002225e-08, + "loss": 0.367, + "step": 12195 + }, + { + "epoch": 5.766430260047281, + "grad_norm": 3.3577182292938232, + "learning_rate": 1.8969775845307238e-08, + "loss": 0.3373, + "step": 12196 + }, + { + "epoch": 5.766903073286052, + "grad_norm": 3.4297826290130615, + "learning_rate": 1.889314964948541e-08, + "loss": 0.3967, + "step": 12197 + }, + { + "epoch": 5.767375886524823, + "grad_norm": 3.2004456520080566, + "learning_rate": 1.8816677941307925e-08, + "loss": 0.32, + "step": 12198 + }, + { + "epoch": 5.767848699763594, + "grad_norm": 3.613758087158203, + "learning_rate": 1.8740360725535977e-08, + "loss": 0.3547, + "step": 12199 + }, + { + "epoch": 5.768321513002364, + "grad_norm": 3.1576809883117676, + "learning_rate": 1.8664198006921875e-08, + "loss": 0.3024, + "step": 12200 + }, + { + "epoch": 5.768794326241135, + "grad_norm": 3.2107057571411133, + "learning_rate": 1.8588189790207934e-08, + "loss": 0.3262, + "step": 12201 + }, + { + "epoch": 5.769267139479905, + "grad_norm": 3.1241042613983154, + "learning_rate": 1.851233608012648e-08, + "loss": 0.3157, + "step": 12202 + }, + { + "epoch": 5.769739952718676, + "grad_norm": 3.1932709217071533, + "learning_rate": 1.8436636881401238e-08, + "loss": 0.3348, + "step": 12203 + }, + { + "epoch": 5.770212765957447, + "grad_norm": 3.298868417739868, + "learning_rate": 1.8361092198745102e-08, + "loss": 0.3168, + "step": 12204 + }, + { + "epoch": 5.7706855791962175, + "grad_norm": 3.7978901863098145, + "learning_rate": 1.8285702036862086e-08, + "loss": 0.3432, + "step": 12205 + }, + { + "epoch": 5.771158392434988, + "grad_norm": 3.0930938720703125, + "learning_rate": 1.821046640044649e-08, + "loss": 0.3154, + "step": 12206 + }, + { + "epoch": 5.771631205673759, + "grad_norm": 3.205141067504883, + "learning_rate": 1.8135385294182904e-08, + "loss": 0.3347, + "step": 12207 + }, + { + "epoch": 5.77210401891253, + "grad_norm": 3.1829981803894043, + "learning_rate": 1.80604587227462e-08, + "loss": 0.2739, + "step": 12208 + }, + { + "epoch": 5.7725768321513, + "grad_norm": 3.0086073875427246, + "learning_rate": 1.798568669080153e-08, + "loss": 0.2861, + "step": 12209 + }, + { + "epoch": 5.773049645390071, + "grad_norm": 3.0506770610809326, + "learning_rate": 1.7911069203004895e-08, + "loss": 0.2738, + "step": 12210 + }, + { + "epoch": 5.773522458628841, + "grad_norm": 3.29156231880188, + "learning_rate": 1.7836606264002577e-08, + "loss": 0.34, + "step": 12211 + }, + { + "epoch": 5.773995271867612, + "grad_norm": 3.8883163928985596, + "learning_rate": 1.776229787843059e-08, + "loss": 0.2994, + "step": 12212 + }, + { + "epoch": 5.774468085106383, + "grad_norm": 3.5105197429656982, + "learning_rate": 
1.7688144050916066e-08, + "loss": 0.3567, + "step": 12213 + }, + { + "epoch": 5.774940898345154, + "grad_norm": 3.268084764480591, + "learning_rate": 1.761414478607615e-08, + "loss": 0.2715, + "step": 12214 + }, + { + "epoch": 5.775413711583925, + "grad_norm": 3.4444260597229004, + "learning_rate": 1.7540300088518814e-08, + "loss": 0.3666, + "step": 12215 + }, + { + "epoch": 5.775886524822695, + "grad_norm": 3.3592841625213623, + "learning_rate": 1.7466609962841497e-08, + "loss": 0.3023, + "step": 12216 + }, + { + "epoch": 5.776359338061466, + "grad_norm": 2.869286060333252, + "learning_rate": 1.739307441363275e-08, + "loss": 0.2475, + "step": 12217 + }, + { + "epoch": 5.776832151300236, + "grad_norm": 3.429243326187134, + "learning_rate": 1.731969344547141e-08, + "loss": 0.3671, + "step": 12218 + }, + { + "epoch": 5.777304964539007, + "grad_norm": 3.57275128364563, + "learning_rate": 1.7246467062926598e-08, + "loss": 0.3131, + "step": 12219 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 3.172421455383301, + "learning_rate": 1.7173395270557445e-08, + "loss": 0.339, + "step": 12220 + }, + { + "epoch": 5.7782505910165485, + "grad_norm": 3.8230342864990234, + "learning_rate": 1.7100478072914483e-08, + "loss": 0.3199, + "step": 12221 + }, + { + "epoch": 5.778723404255319, + "grad_norm": 2.9650866985321045, + "learning_rate": 1.702771547453741e-08, + "loss": 0.3053, + "step": 12222 + }, + { + "epoch": 5.77919621749409, + "grad_norm": 3.665842056274414, + "learning_rate": 1.6955107479957045e-08, + "loss": 0.3845, + "step": 12223 + }, + { + "epoch": 5.779669030732861, + "grad_norm": 3.350053548812866, + "learning_rate": 1.6882654093694495e-08, + "loss": 0.3402, + "step": 12224 + }, + { + "epoch": 5.780141843971631, + "grad_norm": 3.5756731033325195, + "learning_rate": 1.6810355320260597e-08, + "loss": 0.3388, + "step": 12225 + }, + { + "epoch": 5.780614657210402, + "grad_norm": 3.5228781700134277, + "learning_rate": 1.673821116415758e-08, + "loss": 0.3381, + "step": 12226 + }, + { + "epoch": 5.781087470449172, + "grad_norm": 3.166459798812866, + "learning_rate": 1.666622162987713e-08, + "loss": 0.3039, + "step": 12227 + }, + { + "epoch": 5.781560283687943, + "grad_norm": 3.1998205184936523, + "learning_rate": 1.6594386721902335e-08, + "loss": 0.2946, + "step": 12228 + }, + { + "epoch": 5.782033096926714, + "grad_norm": 3.009943962097168, + "learning_rate": 1.652270644470544e-08, + "loss": 0.2878, + "step": 12229 + }, + { + "epoch": 5.782505910165485, + "grad_norm": 3.5119593143463135, + "learning_rate": 1.645118080274982e-08, + "loss": 0.3112, + "step": 12230 + }, + { + "epoch": 5.782978723404256, + "grad_norm": 3.491492986679077, + "learning_rate": 1.637980980048942e-08, + "loss": 0.3286, + "step": 12231 + }, + { + "epoch": 5.783451536643026, + "grad_norm": 3.200819730758667, + "learning_rate": 1.6308593442367625e-08, + "loss": 0.3425, + "step": 12232 + }, + { + "epoch": 5.783924349881797, + "grad_norm": 3.3669025897979736, + "learning_rate": 1.6237531732819222e-08, + "loss": 0.346, + "step": 12233 + }, + { + "epoch": 5.784397163120567, + "grad_norm": 3.1921918392181396, + "learning_rate": 1.6166624676268727e-08, + "loss": 0.3124, + "step": 12234 + }, + { + "epoch": 5.784869976359338, + "grad_norm": 2.852695941925049, + "learning_rate": 1.609587227713122e-08, + "loss": 0.3012, + "step": 12235 + }, + { + "epoch": 5.785342789598109, + "grad_norm": 3.1297335624694824, + "learning_rate": 1.602527453981234e-08, + "loss": 0.3346, + "step": 12236 + }, + { + "epoch": 5.7858156028368795, + 
"grad_norm": 3.3038461208343506, + "learning_rate": 1.5954831468707467e-08, + "loss": 0.2996, + "step": 12237 + }, + { + "epoch": 5.78628841607565, + "grad_norm": 3.753922462463379, + "learning_rate": 1.5884543068203086e-08, + "loss": 0.3139, + "step": 12238 + }, + { + "epoch": 5.786761229314421, + "grad_norm": 3.3022499084472656, + "learning_rate": 1.5814409342675695e-08, + "loss": 0.3602, + "step": 12239 + }, + { + "epoch": 5.787234042553192, + "grad_norm": 3.618237257003784, + "learning_rate": 1.5744430296492084e-08, + "loss": 0.3555, + "step": 12240 + }, + { + "epoch": 5.787706855791962, + "grad_norm": 3.0856997966766357, + "learning_rate": 1.567460593400988e-08, + "loss": 0.2944, + "step": 12241 + }, + { + "epoch": 5.7881796690307326, + "grad_norm": 3.40259051322937, + "learning_rate": 1.5604936259576432e-08, + "loss": 0.3086, + "step": 12242 + }, + { + "epoch": 5.788652482269503, + "grad_norm": 3.634979248046875, + "learning_rate": 1.553542127752994e-08, + "loss": 0.3264, + "step": 12243 + }, + { + "epoch": 5.789125295508274, + "grad_norm": 3.5231590270996094, + "learning_rate": 1.5466060992198605e-08, + "loss": 0.3841, + "step": 12244 + }, + { + "epoch": 5.789598108747045, + "grad_norm": 3.14682936668396, + "learning_rate": 1.5396855407901202e-08, + "loss": 0.3367, + "step": 12245 + }, + { + "epoch": 5.790070921985816, + "grad_norm": 3.124629020690918, + "learning_rate": 1.5327804528947333e-08, + "loss": 0.3139, + "step": 12246 + }, + { + "epoch": 5.7905437352245865, + "grad_norm": 3.116682291030884, + "learning_rate": 1.525890835963578e-08, + "loss": 0.304, + "step": 12247 + }, + { + "epoch": 5.791016548463357, + "grad_norm": 3.085913896560669, + "learning_rate": 1.5190166904256732e-08, + "loss": 0.2882, + "step": 12248 + }, + { + "epoch": 5.791489361702128, + "grad_norm": 3.3703277111053467, + "learning_rate": 1.5121580167090642e-08, + "loss": 0.3308, + "step": 12249 + }, + { + "epoch": 5.791962174940898, + "grad_norm": 3.1782917976379395, + "learning_rate": 1.5053148152407714e-08, + "loss": 0.3246, + "step": 12250 + }, + { + "epoch": 5.792434988179669, + "grad_norm": 3.1751880645751953, + "learning_rate": 1.498487086446926e-08, + "loss": 0.3074, + "step": 12251 + }, + { + "epoch": 5.79290780141844, + "grad_norm": 3.3677165508270264, + "learning_rate": 1.491674830752632e-08, + "loss": 0.3022, + "step": 12252 + }, + { + "epoch": 5.79338061465721, + "grad_norm": 2.866220474243164, + "learning_rate": 1.484878048582078e-08, + "loss": 0.2842, + "step": 12253 + }, + { + "epoch": 5.793853427895981, + "grad_norm": 3.148810386657715, + "learning_rate": 1.4780967403584534e-08, + "loss": 0.3069, + "step": 12254 + }, + { + "epoch": 5.794326241134752, + "grad_norm": 3.145711898803711, + "learning_rate": 1.4713309065040039e-08, + "loss": 0.3266, + "step": 12255 + }, + { + "epoch": 5.794799054373523, + "grad_norm": 3.4532599449157715, + "learning_rate": 1.4645805474400032e-08, + "loss": 0.3314, + "step": 12256 + }, + { + "epoch": 5.795271867612293, + "grad_norm": 2.9855432510375977, + "learning_rate": 1.45784566358681e-08, + "loss": 0.3385, + "step": 12257 + }, + { + "epoch": 5.7957446808510635, + "grad_norm": 3.3268442153930664, + "learning_rate": 1.4511262553637274e-08, + "loss": 0.3553, + "step": 12258 + }, + { + "epoch": 5.796217494089834, + "grad_norm": 2.8287642002105713, + "learning_rate": 1.444422323189143e-08, + "loss": 0.2748, + "step": 12259 + }, + { + "epoch": 5.796690307328605, + "grad_norm": 3.328403949737549, + "learning_rate": 1.4377338674805007e-08, + "loss": 0.3055, + "step": 
12260 + }, + { + "epoch": 5.797163120567376, + "grad_norm": 3.004183530807495, + "learning_rate": 1.4310608886542732e-08, + "loss": 0.3025, + "step": 12261 + }, + { + "epoch": 5.797635933806147, + "grad_norm": 3.0779333114624023, + "learning_rate": 1.424403387125961e-08, + "loss": 0.276, + "step": 12262 + }, + { + "epoch": 5.7981087470449175, + "grad_norm": 3.425168037414551, + "learning_rate": 1.4177613633100384e-08, + "loss": 0.3771, + "step": 12263 + }, + { + "epoch": 5.798581560283688, + "grad_norm": 3.3431761264801025, + "learning_rate": 1.411134817620119e-08, + "loss": 0.3604, + "step": 12264 + }, + { + "epoch": 5.799054373522459, + "grad_norm": 3.708122491836548, + "learning_rate": 1.404523750468817e-08, + "loss": 0.3647, + "step": 12265 + }, + { + "epoch": 5.799527186761229, + "grad_norm": 3.414480447769165, + "learning_rate": 1.3979281622677753e-08, + "loss": 0.3148, + "step": 12266 + }, + { + "epoch": 5.8, + "grad_norm": 3.335441827774048, + "learning_rate": 1.391348053427638e-08, + "loss": 0.2949, + "step": 12267 + }, + { + "epoch": 5.800472813238771, + "grad_norm": 3.2817869186401367, + "learning_rate": 1.3847834243581603e-08, + "loss": 0.3031, + "step": 12268 + }, + { + "epoch": 5.800945626477541, + "grad_norm": 3.3932013511657715, + "learning_rate": 1.378234275468071e-08, + "loss": 0.3035, + "step": 12269 + }, + { + "epoch": 5.801418439716312, + "grad_norm": 3.355982780456543, + "learning_rate": 1.3717006071651551e-08, + "loss": 0.3207, + "step": 12270 + }, + { + "epoch": 5.801891252955083, + "grad_norm": 3.441032886505127, + "learning_rate": 1.3651824198562258e-08, + "loss": 0.2979, + "step": 12271 + }, + { + "epoch": 5.802364066193854, + "grad_norm": 3.2429022789001465, + "learning_rate": 1.3586797139471807e-08, + "loss": 0.315, + "step": 12272 + }, + { + "epoch": 5.802836879432624, + "grad_norm": 2.954455852508545, + "learning_rate": 1.352192489842863e-08, + "loss": 0.3364, + "step": 12273 + }, + { + "epoch": 5.8033096926713945, + "grad_norm": 3.57834792137146, + "learning_rate": 1.3457207479472545e-08, + "loss": 0.3548, + "step": 12274 + }, + { + "epoch": 5.803782505910165, + "grad_norm": 3.009730577468872, + "learning_rate": 1.3392644886633111e-08, + "loss": 0.2929, + "step": 12275 + }, + { + "epoch": 5.804255319148936, + "grad_norm": 3.3504159450531006, + "learning_rate": 1.3328237123929888e-08, + "loss": 0.3276, + "step": 12276 + }, + { + "epoch": 5.804728132387707, + "grad_norm": 3.3094518184661865, + "learning_rate": 1.3263984195373836e-08, + "loss": 0.3314, + "step": 12277 + }, + { + "epoch": 5.805200945626478, + "grad_norm": 3.4735429286956787, + "learning_rate": 1.3199886104965088e-08, + "loss": 0.3502, + "step": 12278 + }, + { + "epoch": 5.8056737588652485, + "grad_norm": 3.1584722995758057, + "learning_rate": 1.3135942856695728e-08, + "loss": 0.3202, + "step": 12279 + }, + { + "epoch": 5.806146572104019, + "grad_norm": 3.2173585891723633, + "learning_rate": 1.3072154454546181e-08, + "loss": 0.3285, + "step": 12280 + }, + { + "epoch": 5.80661938534279, + "grad_norm": 3.024911403656006, + "learning_rate": 1.300852090248883e-08, + "loss": 0.3164, + "step": 12281 + }, + { + "epoch": 5.80709219858156, + "grad_norm": 3.0423598289489746, + "learning_rate": 1.2945042204486058e-08, + "loss": 0.3111, + "step": 12282 + }, + { + "epoch": 5.807565011820331, + "grad_norm": 3.4012279510498047, + "learning_rate": 1.2881718364489704e-08, + "loss": 0.3103, + "step": 12283 + }, + { + "epoch": 5.808037825059102, + "grad_norm": 3.214341878890991, + "learning_rate": 
1.281854938644328e-08, + "loss": 0.3463, + "step": 12284 + }, + { + "epoch": 5.808510638297872, + "grad_norm": 3.1404919624328613, + "learning_rate": 1.2755535274279751e-08, + "loss": 0.3568, + "step": 12285 + }, + { + "epoch": 5.808983451536643, + "grad_norm": 2.9638466835021973, + "learning_rate": 1.2692676031922924e-08, + "loss": 0.2977, + "step": 12286 + }, + { + "epoch": 5.809456264775414, + "grad_norm": 3.213879108428955, + "learning_rate": 1.2629971663286611e-08, + "loss": 0.317, + "step": 12287 + }, + { + "epoch": 5.809929078014184, + "grad_norm": 3.2124996185302734, + "learning_rate": 1.2567422172274912e-08, + "loss": 0.3242, + "step": 12288 + }, + { + "epoch": 5.810401891252955, + "grad_norm": 3.4042906761169434, + "learning_rate": 1.2505027562783046e-08, + "loss": 0.3923, + "step": 12289 + }, + { + "epoch": 5.8108747044917255, + "grad_norm": 3.2831614017486572, + "learning_rate": 1.244278783869568e-08, + "loss": 0.3182, + "step": 12290 + }, + { + "epoch": 5.811347517730496, + "grad_norm": 3.221754789352417, + "learning_rate": 1.2380703003888328e-08, + "loss": 0.3148, + "step": 12291 + }, + { + "epoch": 5.811820330969267, + "grad_norm": 3.3793656826019287, + "learning_rate": 1.2318773062226786e-08, + "loss": 0.302, + "step": 12292 + }, + { + "epoch": 5.812293144208038, + "grad_norm": 3.3612375259399414, + "learning_rate": 1.2256998017566857e-08, + "loss": 0.3948, + "step": 12293 + }, + { + "epoch": 5.812765957446809, + "grad_norm": 3.12095308303833, + "learning_rate": 1.2195377873755466e-08, + "loss": 0.3008, + "step": 12294 + }, + { + "epoch": 5.8132387706855795, + "grad_norm": 3.474125385284424, + "learning_rate": 1.2133912634628987e-08, + "loss": 0.3286, + "step": 12295 + }, + { + "epoch": 5.81371158392435, + "grad_norm": 3.325859308242798, + "learning_rate": 1.2072602304014914e-08, + "loss": 0.3191, + "step": 12296 + }, + { + "epoch": 5.81418439716312, + "grad_norm": 3.4525535106658936, + "learning_rate": 1.2011446885730748e-08, + "loss": 0.3264, + "step": 12297 + }, + { + "epoch": 5.814657210401891, + "grad_norm": 3.212451934814453, + "learning_rate": 1.195044638358428e-08, + "loss": 0.3474, + "step": 12298 + }, + { + "epoch": 5.815130023640662, + "grad_norm": 3.5907411575317383, + "learning_rate": 1.1889600801373579e-08, + "loss": 0.3518, + "step": 12299 + }, + { + "epoch": 5.815602836879433, + "grad_norm": 3.051485538482666, + "learning_rate": 1.1828910142887562e-08, + "loss": 0.3446, + "step": 12300 + }, + { + "epoch": 5.816075650118203, + "grad_norm": 2.894104480743408, + "learning_rate": 1.1768374411905147e-08, + "loss": 0.2675, + "step": 12301 + }, + { + "epoch": 5.816548463356974, + "grad_norm": 3.1474084854125977, + "learning_rate": 1.1707993612195267e-08, + "loss": 0.3322, + "step": 12302 + }, + { + "epoch": 5.817021276595745, + "grad_norm": 3.2870020866394043, + "learning_rate": 1.1647767747517691e-08, + "loss": 0.3422, + "step": 12303 + }, + { + "epoch": 5.817494089834515, + "grad_norm": 3.2131540775299072, + "learning_rate": 1.1587696821622751e-08, + "loss": 0.3539, + "step": 12304 + }, + { + "epoch": 5.817966903073286, + "grad_norm": 3.316063165664673, + "learning_rate": 1.1527780838250513e-08, + "loss": 0.3317, + "step": 12305 + }, + { + "epoch": 5.8184397163120565, + "grad_norm": 3.1365461349487305, + "learning_rate": 1.1468019801131603e-08, + "loss": 0.2794, + "step": 12306 + }, + { + "epoch": 5.818912529550827, + "grad_norm": 3.479088544845581, + "learning_rate": 1.140841371398721e-08, + "loss": 0.3171, + "step": 12307 + }, + { + "epoch": 5.819385342789598, 
+ "grad_norm": 3.3644347190856934, + "learning_rate": 1.1348962580529087e-08, + "loss": 0.3226, + "step": 12308 + }, + { + "epoch": 5.819858156028369, + "grad_norm": 3.183668375015259, + "learning_rate": 1.1289666404458166e-08, + "loss": 0.3176, + "step": 12309 + }, + { + "epoch": 5.82033096926714, + "grad_norm": 3.087696075439453, + "learning_rate": 1.1230525189467323e-08, + "loss": 0.3031, + "step": 12310 + }, + { + "epoch": 5.8208037825059105, + "grad_norm": 3.1411244869232178, + "learning_rate": 1.1171538939238614e-08, + "loss": 0.3757, + "step": 12311 + }, + { + "epoch": 5.821276595744681, + "grad_norm": 3.7644147872924805, + "learning_rate": 1.111270765744521e-08, + "loss": 0.3617, + "step": 12312 + }, + { + "epoch": 5.821749408983451, + "grad_norm": 3.530128240585327, + "learning_rate": 1.1054031347750016e-08, + "loss": 0.2842, + "step": 12313 + }, + { + "epoch": 5.822222222222222, + "grad_norm": 3.054417848587036, + "learning_rate": 1.0995510013806499e-08, + "loss": 0.2845, + "step": 12314 + }, + { + "epoch": 5.822695035460993, + "grad_norm": 3.606204032897949, + "learning_rate": 1.0937143659258686e-08, + "loss": 0.3339, + "step": 12315 + }, + { + "epoch": 5.823167848699764, + "grad_norm": 3.1252198219299316, + "learning_rate": 1.0878932287740618e-08, + "loss": 0.3387, + "step": 12316 + }, + { + "epoch": 5.823640661938534, + "grad_norm": 3.3949391841888428, + "learning_rate": 1.082087590287717e-08, + "loss": 0.292, + "step": 12317 + }, + { + "epoch": 5.824113475177305, + "grad_norm": 3.2029218673706055, + "learning_rate": 1.0762974508282954e-08, + "loss": 0.3053, + "step": 12318 + }, + { + "epoch": 5.824586288416076, + "grad_norm": 3.2031943798065186, + "learning_rate": 1.0705228107563138e-08, + "loss": 0.3362, + "step": 12319 + }, + { + "epoch": 5.825059101654846, + "grad_norm": 3.107954502105713, + "learning_rate": 1.0647636704313736e-08, + "loss": 0.3196, + "step": 12320 + }, + { + "epoch": 5.825531914893617, + "grad_norm": 3.4354865550994873, + "learning_rate": 1.0590200302120767e-08, + "loss": 0.3165, + "step": 12321 + }, + { + "epoch": 5.8260047281323875, + "grad_norm": 3.4960227012634277, + "learning_rate": 1.0532918904559709e-08, + "loss": 0.3052, + "step": 12322 + }, + { + "epoch": 5.826477541371158, + "grad_norm": 3.085123062133789, + "learning_rate": 1.0475792515198258e-08, + "loss": 0.3575, + "step": 12323 + }, + { + "epoch": 5.826950354609929, + "grad_norm": 3.4668917655944824, + "learning_rate": 1.0418821137592738e-08, + "loss": 0.3975, + "step": 12324 + }, + { + "epoch": 5.8274231678487, + "grad_norm": 4.228496074676514, + "learning_rate": 1.0362004775290868e-08, + "loss": 0.3641, + "step": 12325 + }, + { + "epoch": 5.827895981087471, + "grad_norm": 3.3614673614501953, + "learning_rate": 1.0305343431830372e-08, + "loss": 0.3568, + "step": 12326 + }, + { + "epoch": 5.828368794326241, + "grad_norm": 3.3814778327941895, + "learning_rate": 1.0248837110738708e-08, + "loss": 0.3703, + "step": 12327 + }, + { + "epoch": 5.828841607565012, + "grad_norm": 3.4079811573028564, + "learning_rate": 1.0192485815535003e-08, + "loss": 0.3243, + "step": 12328 + }, + { + "epoch": 5.829314420803782, + "grad_norm": 2.895949363708496, + "learning_rate": 1.013628954972784e-08, + "loss": 0.2907, + "step": 12329 + }, + { + "epoch": 5.829787234042553, + "grad_norm": 3.183469295501709, + "learning_rate": 1.0080248316816089e-08, + "loss": 0.2966, + "step": 12330 + }, + { + "epoch": 5.830260047281324, + "grad_norm": 3.2694790363311768, + "learning_rate": 1.0024362120289178e-08, + "loss": 0.337, + 
"step": 12331 + }, + { + "epoch": 5.8307328605200945, + "grad_norm": 3.0339701175689697, + "learning_rate": 9.968630963627101e-09, + "loss": 0.3384, + "step": 12332 + }, + { + "epoch": 5.831205673758865, + "grad_norm": 3.1445910930633545, + "learning_rate": 9.913054850300142e-09, + "loss": 0.3344, + "step": 12333 + }, + { + "epoch": 5.831678486997636, + "grad_norm": 2.9692158699035645, + "learning_rate": 9.857633783768306e-09, + "loss": 0.3272, + "step": 12334 + }, + { + "epoch": 5.832151300236407, + "grad_norm": 2.62898588180542, + "learning_rate": 9.802367767482723e-09, + "loss": 0.2624, + "step": 12335 + }, + { + "epoch": 5.832624113475177, + "grad_norm": 3.2836642265319824, + "learning_rate": 9.74725680488453e-09, + "loss": 0.2886, + "step": 12336 + }, + { + "epoch": 5.833096926713948, + "grad_norm": 3.3103747367858887, + "learning_rate": 9.69230089940515e-09, + "loss": 0.3477, + "step": 12337 + }, + { + "epoch": 5.833569739952718, + "grad_norm": 3.3658392429351807, + "learning_rate": 9.637500054466564e-09, + "loss": 0.3145, + "step": 12338 + }, + { + "epoch": 5.834042553191489, + "grad_norm": 2.9636175632476807, + "learning_rate": 9.582854273481047e-09, + "loss": 0.3039, + "step": 12339 + }, + { + "epoch": 5.83451536643026, + "grad_norm": 3.365917444229126, + "learning_rate": 9.528363559850873e-09, + "loss": 0.3193, + "step": 12340 + }, + { + "epoch": 5.834988179669031, + "grad_norm": 3.360271453857422, + "learning_rate": 9.474027916969164e-09, + "loss": 0.3712, + "step": 12341 + }, + { + "epoch": 5.835460992907802, + "grad_norm": 3.8133535385131836, + "learning_rate": 9.419847348219047e-09, + "loss": 0.3263, + "step": 12342 + }, + { + "epoch": 5.835933806146572, + "grad_norm": 3.439425468444824, + "learning_rate": 9.365821856974489e-09, + "loss": 0.3077, + "step": 12343 + }, + { + "epoch": 5.836406619385343, + "grad_norm": 3.4898314476013184, + "learning_rate": 9.31195144659891e-09, + "loss": 0.3472, + "step": 12344 + }, + { + "epoch": 5.836879432624113, + "grad_norm": 2.860374689102173, + "learning_rate": 9.258236120446573e-09, + "loss": 0.3145, + "step": 12345 + }, + { + "epoch": 5.837352245862884, + "grad_norm": 3.079913854598999, + "learning_rate": 9.204675881862579e-09, + "loss": 0.3144, + "step": 12346 + }, + { + "epoch": 5.837825059101655, + "grad_norm": 3.398139715194702, + "learning_rate": 9.151270734181482e-09, + "loss": 0.2983, + "step": 12347 + }, + { + "epoch": 5.8382978723404255, + "grad_norm": 3.1991748809814453, + "learning_rate": 9.09802068072868e-09, + "loss": 0.3142, + "step": 12348 + }, + { + "epoch": 5.838770685579196, + "grad_norm": 4.70399808883667, + "learning_rate": 9.044925724819852e-09, + "loss": 0.3238, + "step": 12349 + }, + { + "epoch": 5.839243498817967, + "grad_norm": 3.3638081550598145, + "learning_rate": 8.991985869760966e-09, + "loss": 0.3358, + "step": 12350 + }, + { + "epoch": 5.839716312056738, + "grad_norm": 2.6990113258361816, + "learning_rate": 8.93920111884855e-09, + "loss": 0.292, + "step": 12351 + }, + { + "epoch": 5.840189125295508, + "grad_norm": 3.119417428970337, + "learning_rate": 8.886571475368865e-09, + "loss": 0.3018, + "step": 12352 + }, + { + "epoch": 5.840661938534279, + "grad_norm": 3.0563628673553467, + "learning_rate": 8.834096942599568e-09, + "loss": 0.303, + "step": 12353 + }, + { + "epoch": 5.841134751773049, + "grad_norm": 3.306051015853882, + "learning_rate": 8.781777523807212e-09, + "loss": 0.3207, + "step": 12354 + }, + { + "epoch": 5.84160756501182, + "grad_norm": 3.476424217224121, + "learning_rate": 
8.729613222250022e-09, + "loss": 0.2788, + "step": 12355 + }, + { + "epoch": 5.842080378250591, + "grad_norm": 2.975752353668213, + "learning_rate": 8.677604041175957e-09, + "loss": 0.3066, + "step": 12356 + }, + { + "epoch": 5.842553191489362, + "grad_norm": 2.9331040382385254, + "learning_rate": 8.625749983823261e-09, + "loss": 0.28, + "step": 12357 + }, + { + "epoch": 5.843026004728133, + "grad_norm": 3.269192695617676, + "learning_rate": 8.574051053421017e-09, + "loss": 0.3116, + "step": 12358 + }, + { + "epoch": 5.843498817966903, + "grad_norm": 3.3965625762939453, + "learning_rate": 8.522507253188039e-09, + "loss": 0.337, + "step": 12359 + }, + { + "epoch": 5.843971631205674, + "grad_norm": 3.294217348098755, + "learning_rate": 8.471118586333426e-09, + "loss": 0.3315, + "step": 12360 + }, + { + "epoch": 5.844444444444444, + "grad_norm": 3.662712812423706, + "learning_rate": 8.419885056057398e-09, + "loss": 0.3138, + "step": 12361 + }, + { + "epoch": 5.844917257683215, + "grad_norm": 2.9847161769866943, + "learning_rate": 8.3688066655499e-09, + "loss": 0.306, + "step": 12362 + }, + { + "epoch": 5.845390070921986, + "grad_norm": 3.5540273189544678, + "learning_rate": 8.317883417991168e-09, + "loss": 0.3661, + "step": 12363 + }, + { + "epoch": 5.8458628841607565, + "grad_norm": 3.339153528213501, + "learning_rate": 8.267115316552555e-09, + "loss": 0.4123, + "step": 12364 + }, + { + "epoch": 5.846335697399527, + "grad_norm": 2.979498863220215, + "learning_rate": 8.216502364394307e-09, + "loss": 0.2785, + "step": 12365 + }, + { + "epoch": 5.846808510638298, + "grad_norm": 3.3412187099456787, + "learning_rate": 8.166044564668629e-09, + "loss": 0.2999, + "step": 12366 + }, + { + "epoch": 5.847281323877069, + "grad_norm": 3.178144693374634, + "learning_rate": 8.115741920516895e-09, + "loss": 0.2632, + "step": 12367 + }, + { + "epoch": 5.847754137115839, + "grad_norm": 3.2854394912719727, + "learning_rate": 8.065594435071044e-09, + "loss": 0.4104, + "step": 12368 + }, + { + "epoch": 5.84822695035461, + "grad_norm": 3.0959322452545166, + "learning_rate": 8.015602111454135e-09, + "loss": 0.279, + "step": 12369 + }, + { + "epoch": 5.84869976359338, + "grad_norm": 3.244652509689331, + "learning_rate": 7.965764952778677e-09, + "loss": 0.3887, + "step": 12370 + }, + { + "epoch": 5.849172576832151, + "grad_norm": 2.9431116580963135, + "learning_rate": 7.916082962147742e-09, + "loss": 0.2984, + "step": 12371 + }, + { + "epoch": 5.849645390070922, + "grad_norm": 3.488229513168335, + "learning_rate": 7.866556142654691e-09, + "loss": 0.3402, + "step": 12372 + }, + { + "epoch": 5.850118203309693, + "grad_norm": 3.5097646713256836, + "learning_rate": 7.817184497383723e-09, + "loss": 0.3445, + "step": 12373 + }, + { + "epoch": 5.850591016548464, + "grad_norm": 3.3530054092407227, + "learning_rate": 7.767968029408767e-09, + "loss": 0.3033, + "step": 12374 + }, + { + "epoch": 5.851063829787234, + "grad_norm": 3.4520263671875, + "learning_rate": 7.718906741794041e-09, + "loss": 0.3666, + "step": 12375 + }, + { + "epoch": 5.851536643026005, + "grad_norm": 3.006080389022827, + "learning_rate": 7.670000637594876e-09, + "loss": 0.2951, + "step": 12376 + }, + { + "epoch": 5.852009456264775, + "grad_norm": 3.80802059173584, + "learning_rate": 7.621249719856338e-09, + "loss": 0.3493, + "step": 12377 + }, + { + "epoch": 5.852482269503546, + "grad_norm": 2.9159436225891113, + "learning_rate": 7.572653991613777e-09, + "loss": 0.3175, + "step": 12378 + }, + { + "epoch": 5.852955082742317, + "grad_norm": 
3.0945873260498047, + "learning_rate": 7.524213455893104e-09, + "loss": 0.3494, + "step": 12379 + }, + { + "epoch": 5.8534278959810875, + "grad_norm": 3.1333816051483154, + "learning_rate": 7.475928115710518e-09, + "loss": 0.3044, + "step": 12380 + }, + { + "epoch": 5.853900709219858, + "grad_norm": 3.30086350440979, + "learning_rate": 7.427797974072226e-09, + "loss": 0.3642, + "step": 12381 + }, + { + "epoch": 5.854373522458629, + "grad_norm": 3.21052622795105, + "learning_rate": 7.37982303397583e-09, + "loss": 0.364, + "step": 12382 + }, + { + "epoch": 5.8548463356974, + "grad_norm": 2.969539165496826, + "learning_rate": 7.3320032984075526e-09, + "loss": 0.2997, + "step": 12383 + }, + { + "epoch": 5.85531914893617, + "grad_norm": 3.7157652378082275, + "learning_rate": 7.284338770345567e-09, + "loss": 0.3246, + "step": 12384 + }, + { + "epoch": 5.855791962174941, + "grad_norm": 3.2974917888641357, + "learning_rate": 7.236829452757776e-09, + "loss": 0.3296, + "step": 12385 + }, + { + "epoch": 5.856264775413711, + "grad_norm": 3.3709514141082764, + "learning_rate": 7.189475348601815e-09, + "loss": 0.3505, + "step": 12386 + }, + { + "epoch": 5.856737588652482, + "grad_norm": 3.266540050506592, + "learning_rate": 7.142276460826991e-09, + "loss": 0.327, + "step": 12387 + }, + { + "epoch": 5.857210401891253, + "grad_norm": 3.181105613708496, + "learning_rate": 7.095232792371509e-09, + "loss": 0.3706, + "step": 12388 + }, + { + "epoch": 5.857683215130024, + "grad_norm": 3.1525869369506836, + "learning_rate": 7.048344346164693e-09, + "loss": 0.3277, + "step": 12389 + }, + { + "epoch": 5.858156028368795, + "grad_norm": 2.794084310531616, + "learning_rate": 7.001611125126429e-09, + "loss": 0.2824, + "step": 12390 + }, + { + "epoch": 5.858628841607565, + "grad_norm": 3.000612258911133, + "learning_rate": 6.955033132166333e-09, + "loss": 0.2764, + "step": 12391 + }, + { + "epoch": 5.859101654846336, + "grad_norm": 3.364813804626465, + "learning_rate": 6.908610370184587e-09, + "loss": 0.3611, + "step": 12392 + }, + { + "epoch": 5.859574468085106, + "grad_norm": 3.3845462799072266, + "learning_rate": 6.862342842071934e-09, + "loss": 0.2859, + "step": 12393 + }, + { + "epoch": 5.860047281323877, + "grad_norm": 3.277588367462158, + "learning_rate": 6.816230550709124e-09, + "loss": 0.2971, + "step": 12394 + }, + { + "epoch": 5.860520094562648, + "grad_norm": 3.191347599029541, + "learning_rate": 6.770273498967195e-09, + "loss": 0.345, + "step": 12395 + }, + { + "epoch": 5.8609929078014185, + "grad_norm": 3.713665246963501, + "learning_rate": 6.724471689708023e-09, + "loss": 0.2481, + "step": 12396 + }, + { + "epoch": 5.861465721040189, + "grad_norm": 4.008725166320801, + "learning_rate": 6.678825125783217e-09, + "loss": 0.3994, + "step": 12397 + }, + { + "epoch": 5.86193853427896, + "grad_norm": 3.227405548095703, + "learning_rate": 6.633333810034948e-09, + "loss": 0.3209, + "step": 12398 + }, + { + "epoch": 5.862411347517731, + "grad_norm": 3.2580344676971436, + "learning_rate": 6.587997745295949e-09, + "loss": 0.2656, + "step": 12399 + }, + { + "epoch": 5.862884160756501, + "grad_norm": 3.393422842025757, + "learning_rate": 6.54281693438924e-09, + "loss": 0.3324, + "step": 12400 + }, + { + "epoch": 5.863356973995272, + "grad_norm": 3.3529460430145264, + "learning_rate": 6.497791380127572e-09, + "loss": 0.3522, + "step": 12401 + }, + { + "epoch": 5.863829787234042, + "grad_norm": 3.7341339588165283, + "learning_rate": 6.4529210853150895e-09, + "loss": 0.3344, + "step": 12402 + }, + { + "epoch": 
5.864302600472813, + "grad_norm": 3.214383363723755, + "learning_rate": 6.408206052745114e-09, + "loss": 0.3159, + "step": 12403 + }, + { + "epoch": 5.864775413711584, + "grad_norm": 3.2047109603881836, + "learning_rate": 6.363646285202085e-09, + "loss": 0.332, + "step": 12404 + }, + { + "epoch": 5.865248226950355, + "grad_norm": 3.943801164627075, + "learning_rate": 6.319241785460728e-09, + "loss": 0.2709, + "step": 12405 + }, + { + "epoch": 5.8657210401891255, + "grad_norm": 3.622441053390503, + "learning_rate": 6.274992556285497e-09, + "loss": 0.3297, + "step": 12406 + }, + { + "epoch": 5.866193853427896, + "grad_norm": 3.2042336463928223, + "learning_rate": 6.230898600431967e-09, + "loss": 0.3415, + "step": 12407 + }, + { + "epoch": 5.866666666666667, + "grad_norm": 3.1624488830566406, + "learning_rate": 6.186959920645163e-09, + "loss": 0.2946, + "step": 12408 + }, + { + "epoch": 5.867139479905437, + "grad_norm": 3.4413647651672363, + "learning_rate": 6.143176519661786e-09, + "loss": 0.3362, + "step": 12409 + }, + { + "epoch": 5.867612293144208, + "grad_norm": 2.8282558917999268, + "learning_rate": 6.099548400207434e-09, + "loss": 0.3343, + "step": 12410 + }, + { + "epoch": 5.868085106382979, + "grad_norm": 3.3495492935180664, + "learning_rate": 6.05607556499882e-09, + "loss": 0.3359, + "step": 12411 + }, + { + "epoch": 5.868557919621749, + "grad_norm": 3.4601523876190186, + "learning_rate": 6.01275801674267e-09, + "loss": 0.3307, + "step": 12412 + }, + { + "epoch": 5.86903073286052, + "grad_norm": 3.019951581954956, + "learning_rate": 5.969595758136271e-09, + "loss": 0.2836, + "step": 12413 + }, + { + "epoch": 5.869503546099291, + "grad_norm": 3.4495139122009277, + "learning_rate": 5.926588791867194e-09, + "loss": 0.3655, + "step": 12414 + }, + { + "epoch": 5.869976359338062, + "grad_norm": 3.377807140350342, + "learning_rate": 5.8837371206132975e-09, + "loss": 0.3818, + "step": 12415 + }, + { + "epoch": 5.870449172576832, + "grad_norm": 3.2344791889190674, + "learning_rate": 5.841040747042448e-09, + "loss": 0.3216, + "step": 12416 + }, + { + "epoch": 5.8709219858156025, + "grad_norm": 3.171966791152954, + "learning_rate": 5.798499673813629e-09, + "loss": 0.337, + "step": 12417 + }, + { + "epoch": 5.871394799054373, + "grad_norm": 3.327817440032959, + "learning_rate": 5.7561139035755555e-09, + "loss": 0.3246, + "step": 12418 + }, + { + "epoch": 5.871867612293144, + "grad_norm": 2.9902989864349365, + "learning_rate": 5.713883438967227e-09, + "loss": 0.2921, + "step": 12419 + }, + { + "epoch": 5.872340425531915, + "grad_norm": 3.0194108486175537, + "learning_rate": 5.671808282618485e-09, + "loss": 0.3122, + "step": 12420 + }, + { + "epoch": 5.872813238770686, + "grad_norm": 3.165776252746582, + "learning_rate": 5.629888437148623e-09, + "loss": 0.3049, + "step": 12421 + }, + { + "epoch": 5.8732860520094565, + "grad_norm": 3.222511053085327, + "learning_rate": 5.58812390516833e-09, + "loss": 0.3259, + "step": 12422 + }, + { + "epoch": 5.873758865248227, + "grad_norm": 3.37233567237854, + "learning_rate": 5.546514689277749e-09, + "loss": 0.3604, + "step": 12423 + }, + { + "epoch": 5.874231678486998, + "grad_norm": 3.3566384315490723, + "learning_rate": 5.50506079206814e-09, + "loss": 0.3313, + "step": 12424 + }, + { + "epoch": 5.874704491725768, + "grad_norm": 3.334174394607544, + "learning_rate": 5.463762216119939e-09, + "loss": 0.3089, + "step": 12425 + }, + { + "epoch": 5.875177304964539, + "grad_norm": 3.4556798934936523, + "learning_rate": 5.422618964005255e-09, + "loss": 0.3692, + 
"step": 12426 + }, + { + "epoch": 5.87565011820331, + "grad_norm": 3.3695571422576904, + "learning_rate": 5.3816310382859286e-09, + "loss": 0.3467, + "step": 12427 + }, + { + "epoch": 5.87612293144208, + "grad_norm": 3.069835662841797, + "learning_rate": 5.340798441513528e-09, + "loss": 0.2695, + "step": 12428 + }, + { + "epoch": 5.876595744680851, + "grad_norm": 3.415329694747925, + "learning_rate": 5.300121176231021e-09, + "loss": 0.3901, + "step": 12429 + }, + { + "epoch": 5.877068557919622, + "grad_norm": 3.3829095363616943, + "learning_rate": 5.2595992449711034e-09, + "loss": 0.308, + "step": 12430 + }, + { + "epoch": 5.877541371158393, + "grad_norm": 3.1284217834472656, + "learning_rate": 5.219232650256756e-09, + "loss": 0.2772, + "step": 12431 + }, + { + "epoch": 5.878014184397163, + "grad_norm": 3.5533947944641113, + "learning_rate": 5.179021394601525e-09, + "loss": 0.3212, + "step": 12432 + }, + { + "epoch": 5.8784869976359335, + "grad_norm": 3.0174765586853027, + "learning_rate": 5.1389654805089616e-09, + "loss": 0.3086, + "step": 12433 + }, + { + "epoch": 5.878959810874704, + "grad_norm": 2.8360655307769775, + "learning_rate": 5.099064910473461e-09, + "loss": 0.2882, + "step": 12434 + }, + { + "epoch": 5.879432624113475, + "grad_norm": 3.238159418106079, + "learning_rate": 5.0593196869797025e-09, + "loss": 0.3694, + "step": 12435 + }, + { + "epoch": 5.879905437352246, + "grad_norm": 3.080636739730835, + "learning_rate": 5.019729812501817e-09, + "loss": 0.3548, + "step": 12436 + }, + { + "epoch": 5.880378250591017, + "grad_norm": 3.242600202560425, + "learning_rate": 4.9802952895050546e-09, + "loss": 0.3184, + "step": 12437 + }, + { + "epoch": 5.8808510638297875, + "grad_norm": 3.2375903129577637, + "learning_rate": 4.94101612044523e-09, + "loss": 0.3342, + "step": 12438 + }, + { + "epoch": 5.881323877068558, + "grad_norm": 3.0890920162200928, + "learning_rate": 4.901892307767886e-09, + "loss": 0.3014, + "step": 12439 + }, + { + "epoch": 5.881796690307329, + "grad_norm": 3.177412271499634, + "learning_rate": 4.862923853908852e-09, + "loss": 0.3068, + "step": 12440 + }, + { + "epoch": 5.882269503546099, + "grad_norm": 3.2599844932556152, + "learning_rate": 4.824110761294798e-09, + "loss": 0.3313, + "step": 12441 + }, + { + "epoch": 5.88274231678487, + "grad_norm": 3.3482279777526855, + "learning_rate": 4.785453032342402e-09, + "loss": 0.2724, + "step": 12442 + }, + { + "epoch": 5.883215130023641, + "grad_norm": 3.0876619815826416, + "learning_rate": 4.74695066945835e-09, + "loss": 0.2917, + "step": 12443 + }, + { + "epoch": 5.883687943262411, + "grad_norm": 3.6275599002838135, + "learning_rate": 4.708603675040724e-09, + "loss": 0.3604, + "step": 12444 + }, + { + "epoch": 5.884160756501182, + "grad_norm": 3.1014959812164307, + "learning_rate": 4.670412051476503e-09, + "loss": 0.3094, + "step": 12445 + }, + { + "epoch": 5.884633569739953, + "grad_norm": 3.465449810028076, + "learning_rate": 4.6323758011443394e-09, + "loss": 0.3155, + "step": 12446 + }, + { + "epoch": 5.885106382978723, + "grad_norm": 3.210188150405884, + "learning_rate": 4.594494926412063e-09, + "loss": 0.3525, + "step": 12447 + }, + { + "epoch": 5.885579196217494, + "grad_norm": 3.4170849323272705, + "learning_rate": 4.556769429638619e-09, + "loss": 0.328, + "step": 12448 + }, + { + "epoch": 5.8860520094562645, + "grad_norm": 3.739753246307373, + "learning_rate": 4.519199313172962e-09, + "loss": 0.3263, + "step": 12449 + }, + { + "epoch": 5.886524822695035, + "grad_norm": 2.84963321685791, + "learning_rate": 
4.481784579354331e-09, + "loss": 0.2709, + "step": 12450 + }, + { + "epoch": 5.886997635933806, + "grad_norm": 3.2073376178741455, + "learning_rate": 4.444525230512531e-09, + "loss": 0.3467, + "step": 12451 + }, + { + "epoch": 5.887470449172577, + "grad_norm": 3.142899751663208, + "learning_rate": 4.407421268967371e-09, + "loss": 0.2971, + "step": 12452 + }, + { + "epoch": 5.887943262411348, + "grad_norm": 3.3176493644714355, + "learning_rate": 4.370472697029504e-09, + "loss": 0.3345, + "step": 12453 + }, + { + "epoch": 5.8884160756501185, + "grad_norm": 3.3850505352020264, + "learning_rate": 4.3336795169990344e-09, + "loss": 0.3284, + "step": 12454 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 3.3981900215148926, + "learning_rate": 4.297041731167184e-09, + "loss": 0.2875, + "step": 12455 + }, + { + "epoch": 5.889361702127659, + "grad_norm": 3.187206983566284, + "learning_rate": 4.2605593418154625e-09, + "loss": 0.337, + "step": 12456 + }, + { + "epoch": 5.88983451536643, + "grad_norm": 3.8661656379699707, + "learning_rate": 4.22423235121483e-09, + "loss": 0.3013, + "step": 12457 + }, + { + "epoch": 5.890307328605201, + "grad_norm": 3.324201822280884, + "learning_rate": 4.1880607616279214e-09, + "loss": 0.3417, + "step": 12458 + }, + { + "epoch": 5.890780141843972, + "grad_norm": 2.9349443912506104, + "learning_rate": 4.152044575306546e-09, + "loss": 0.2964, + "step": 12459 + }, + { + "epoch": 5.891252955082742, + "grad_norm": 3.042742967605591, + "learning_rate": 4.116183794493633e-09, + "loss": 0.3165, + "step": 12460 + }, + { + "epoch": 5.891725768321513, + "grad_norm": 3.0013630390167236, + "learning_rate": 4.080478421421563e-09, + "loss": 0.3267, + "step": 12461 + }, + { + "epoch": 5.892198581560284, + "grad_norm": 2.738849401473999, + "learning_rate": 4.044928458313835e-09, + "loss": 0.268, + "step": 12462 + }, + { + "epoch": 5.892671394799054, + "grad_norm": 3.13663649559021, + "learning_rate": 4.0095339073839554e-09, + "loss": 0.2789, + "step": 12463 + }, + { + "epoch": 5.893144208037825, + "grad_norm": 3.3638975620269775, + "learning_rate": 3.974294770835996e-09, + "loss": 0.3157, + "step": 12464 + }, + { + "epoch": 5.8936170212765955, + "grad_norm": 2.984937906265259, + "learning_rate": 3.939211050863756e-09, + "loss": 0.2691, + "step": 12465 + }, + { + "epoch": 5.894089834515366, + "grad_norm": 3.2892327308654785, + "learning_rate": 3.9042827496518775e-09, + "loss": 0.3298, + "step": 12466 + }, + { + "epoch": 5.894562647754137, + "grad_norm": 3.8666255474090576, + "learning_rate": 3.869509869375565e-09, + "loss": 0.3105, + "step": 12467 + }, + { + "epoch": 5.895035460992908, + "grad_norm": 3.3934621810913086, + "learning_rate": 3.834892412199476e-09, + "loss": 0.3226, + "step": 12468 + }, + { + "epoch": 5.895508274231679, + "grad_norm": 3.4978415966033936, + "learning_rate": 3.8004303802793855e-09, + "loss": 0.2955, + "step": 12469 + }, + { + "epoch": 5.8959810874704495, + "grad_norm": 3.097461462020874, + "learning_rate": 3.766123775760799e-09, + "loss": 0.3478, + "step": 12470 + }, + { + "epoch": 5.89645390070922, + "grad_norm": 2.8790197372436523, + "learning_rate": 3.731972600780065e-09, + "loss": 0.262, + "step": 12471 + }, + { + "epoch": 5.89692671394799, + "grad_norm": 3.2017362117767334, + "learning_rate": 3.6979768574638144e-09, + "loss": 0.2947, + "step": 12472 + }, + { + "epoch": 5.897399527186761, + "grad_norm": 3.7744874954223633, + "learning_rate": 3.6641365479284096e-09, + "loss": 0.4205, + "step": 12473 + }, + { + "epoch": 5.897872340425532, + 
"grad_norm": 4.276632308959961, + "learning_rate": 3.630451674281055e-09, + "loss": 0.3479, + "step": 12474 + }, + { + "epoch": 5.898345153664303, + "grad_norm": 3.855257511138916, + "learning_rate": 3.596922238619516e-09, + "loss": 0.3854, + "step": 12475 + }, + { + "epoch": 5.898817966903073, + "grad_norm": 3.5090348720550537, + "learning_rate": 3.563548243031012e-09, + "loss": 0.3254, + "step": 12476 + }, + { + "epoch": 5.899290780141844, + "grad_norm": 3.134324550628662, + "learning_rate": 3.530329689593881e-09, + "loss": 0.3049, + "step": 12477 + }, + { + "epoch": 5.899763593380615, + "grad_norm": 3.6900384426116943, + "learning_rate": 3.497266580376191e-09, + "loss": 0.3574, + "step": 12478 + }, + { + "epoch": 5.900236406619385, + "grad_norm": 3.2664496898651123, + "learning_rate": 3.4643589174371272e-09, + "loss": 0.3132, + "step": 12479 + }, + { + "epoch": 5.900709219858156, + "grad_norm": 3.4255833625793457, + "learning_rate": 3.4316067028253298e-09, + "loss": 0.389, + "step": 12480 + }, + { + "epoch": 5.9011820330969265, + "grad_norm": 3.3752331733703613, + "learning_rate": 3.399009938580555e-09, + "loss": 0.3262, + "step": 12481 + }, + { + "epoch": 5.901654846335697, + "grad_norm": 3.054422378540039, + "learning_rate": 3.3665686267317364e-09, + "loss": 0.3246, + "step": 12482 + }, + { + "epoch": 5.902127659574468, + "grad_norm": 3.2461447715759277, + "learning_rate": 3.3342827692994793e-09, + "loss": 0.3379, + "step": 12483 + }, + { + "epoch": 5.902600472813239, + "grad_norm": 3.4133243560791016, + "learning_rate": 3.30215236829412e-09, + "loss": 0.3372, + "step": 12484 + }, + { + "epoch": 5.90307328605201, + "grad_norm": 3.128157377243042, + "learning_rate": 3.270177425715726e-09, + "loss": 0.2789, + "step": 12485 + }, + { + "epoch": 5.9035460992907804, + "grad_norm": 3.051811933517456, + "learning_rate": 3.238357943555481e-09, + "loss": 0.304, + "step": 12486 + }, + { + "epoch": 5.904018912529551, + "grad_norm": 3.2219812870025635, + "learning_rate": 3.2066939237951343e-09, + "loss": 0.3286, + "step": 12487 + }, + { + "epoch": 5.904491725768321, + "grad_norm": 3.207859754562378, + "learning_rate": 3.1751853684053314e-09, + "loss": 0.3245, + "step": 12488 + }, + { + "epoch": 5.904964539007092, + "grad_norm": 3.4851181507110596, + "learning_rate": 3.143832279348946e-09, + "loss": 0.3181, + "step": 12489 + }, + { + "epoch": 5.905437352245863, + "grad_norm": 4.016768455505371, + "learning_rate": 3.1126346585774734e-09, + "loss": 0.3812, + "step": 12490 + }, + { + "epoch": 5.9059101654846335, + "grad_norm": 2.883434772491455, + "learning_rate": 3.081592508033804e-09, + "loss": 0.2807, + "step": 12491 + }, + { + "epoch": 5.906382978723404, + "grad_norm": 3.171082019805908, + "learning_rate": 3.0507058296505575e-09, + "loss": 0.2997, + "step": 12492 + }, + { + "epoch": 5.906855791962175, + "grad_norm": 3.3019914627075195, + "learning_rate": 3.019974625351196e-09, + "loss": 0.3439, + "step": 12493 + }, + { + "epoch": 5.907328605200946, + "grad_norm": 3.309513568878174, + "learning_rate": 2.9893988970491896e-09, + "loss": 0.3314, + "step": 12494 + }, + { + "epoch": 5.907801418439716, + "grad_norm": 3.239665985107422, + "learning_rate": 2.958978646648292e-09, + "loss": 0.3184, + "step": 12495 + }, + { + "epoch": 5.908274231678487, + "grad_norm": 2.879519462585449, + "learning_rate": 2.928713876042266e-09, + "loss": 0.3068, + "step": 12496 + }, + { + "epoch": 5.908747044917257, + "grad_norm": 3.4216079711914062, + "learning_rate": 2.8986045871162717e-09, + "loss": 0.3702, + "step": 12497 
+ }, + { + "epoch": 5.909219858156028, + "grad_norm": 3.076555013656616, + "learning_rate": 2.8686507817443644e-09, + "loss": 0.2686, + "step": 12498 + }, + { + "epoch": 5.909692671394799, + "grad_norm": 3.1961801052093506, + "learning_rate": 2.8388524617922743e-09, + "loss": 0.3345, + "step": 12499 + }, + { + "epoch": 5.91016548463357, + "grad_norm": 3.0233850479125977, + "learning_rate": 2.8092096291149063e-09, + "loss": 0.2938, + "step": 12500 + }, + { + "epoch": 5.910638297872341, + "grad_norm": 3.181387186050415, + "learning_rate": 2.7797222855582838e-09, + "loss": 0.3442, + "step": 12501 + }, + { + "epoch": 5.911111111111111, + "grad_norm": 2.9414830207824707, + "learning_rate": 2.7503904329584385e-09, + "loss": 0.3352, + "step": 12502 + }, + { + "epoch": 5.911583924349882, + "grad_norm": 2.9812917709350586, + "learning_rate": 2.7212140731414095e-09, + "loss": 0.3527, + "step": 12503 + }, + { + "epoch": 5.912056737588652, + "grad_norm": 3.1165359020233154, + "learning_rate": 2.692193207924354e-09, + "loss": 0.3205, + "step": 12504 + }, + { + "epoch": 5.912529550827423, + "grad_norm": 3.0312206745147705, + "learning_rate": 2.6633278391141603e-09, + "loss": 0.3348, + "step": 12505 + }, + { + "epoch": 5.913002364066194, + "grad_norm": 3.4665310382843018, + "learning_rate": 2.634617968508002e-09, + "loss": 0.3353, + "step": 12506 + }, + { + "epoch": 5.9134751773049645, + "grad_norm": 3.696239948272705, + "learning_rate": 2.606063597893338e-09, + "loss": 0.3089, + "step": 12507 + }, + { + "epoch": 5.913947990543735, + "grad_norm": 4.07038688659668, + "learning_rate": 2.5776647290487453e-09, + "loss": 0.3589, + "step": 12508 + }, + { + "epoch": 5.914420803782506, + "grad_norm": 3.50469970703125, + "learning_rate": 2.5494213637416997e-09, + "loss": 0.3651, + "step": 12509 + }, + { + "epoch": 5.914893617021277, + "grad_norm": 3.255824565887451, + "learning_rate": 2.521333503731349e-09, + "loss": 0.332, + "step": 12510 + }, + { + "epoch": 5.915366430260047, + "grad_norm": 3.1768665313720703, + "learning_rate": 2.4934011507665723e-09, + "loss": 0.3498, + "step": 12511 + }, + { + "epoch": 5.915839243498818, + "grad_norm": 3.471471071243286, + "learning_rate": 2.465624306586256e-09, + "loss": 0.3494, + "step": 12512 + }, + { + "epoch": 5.916312056737588, + "grad_norm": 2.9033734798431396, + "learning_rate": 2.4380029729204057e-09, + "loss": 0.2507, + "step": 12513 + }, + { + "epoch": 5.916784869976359, + "grad_norm": 3.356685161590576, + "learning_rate": 2.410537151488479e-09, + "loss": 0.2937, + "step": 12514 + }, + { + "epoch": 5.91725768321513, + "grad_norm": 2.9482526779174805, + "learning_rate": 2.3832268440007743e-09, + "loss": 0.3046, + "step": 12515 + }, + { + "epoch": 5.917730496453901, + "grad_norm": 3.0559420585632324, + "learning_rate": 2.3560720521578763e-09, + "loss": 0.3166, + "step": 12516 + }, + { + "epoch": 5.918203309692672, + "grad_norm": 3.1434214115142822, + "learning_rate": 2.329072777650376e-09, + "loss": 0.3308, + "step": 12517 + }, + { + "epoch": 5.918676122931442, + "grad_norm": 3.422903299331665, + "learning_rate": 2.3022290221597078e-09, + "loss": 0.3738, + "step": 12518 + }, + { + "epoch": 5.919148936170213, + "grad_norm": 2.9760549068450928, + "learning_rate": 2.2755407873570332e-09, + "loss": 0.3015, + "step": 12519 + }, + { + "epoch": 5.919621749408983, + "grad_norm": 3.3623647689819336, + "learning_rate": 2.2490080749043573e-09, + "loss": 0.2757, + "step": 12520 + }, + { + "epoch": 5.920094562647754, + "grad_norm": 2.9579522609710693, + "learning_rate": 
2.2226308864536917e-09, + "loss": 0.2836, + "step": 12521 + }, + { + "epoch": 5.920567375886525, + "grad_norm": 4.006960868835449, + "learning_rate": 2.1964092236473335e-09, + "loss": 0.3794, + "step": 12522 + }, + { + "epoch": 5.9210401891252955, + "grad_norm": 3.3528733253479004, + "learning_rate": 2.170343088118143e-09, + "loss": 0.2683, + "step": 12523 + }, + { + "epoch": 5.921513002364066, + "grad_norm": 3.1904187202453613, + "learning_rate": 2.1444324814887118e-09, + "loss": 0.305, + "step": 12524 + }, + { + "epoch": 5.921985815602837, + "grad_norm": 3.4845130443573, + "learning_rate": 2.1186774053730265e-09, + "loss": 0.3027, + "step": 12525 + }, + { + "epoch": 5.922458628841608, + "grad_norm": 2.857910394668579, + "learning_rate": 2.0930778613742485e-09, + "loss": 0.2847, + "step": 12526 + }, + { + "epoch": 5.922931442080378, + "grad_norm": 2.882591485977173, + "learning_rate": 2.067633851086659e-09, + "loss": 0.3152, + "step": 12527 + }, + { + "epoch": 5.923404255319149, + "grad_norm": 3.1664819717407227, + "learning_rate": 2.042345376094268e-09, + "loss": 0.3165, + "step": 12528 + }, + { + "epoch": 5.923877068557919, + "grad_norm": 3.645685911178589, + "learning_rate": 2.0172124379716498e-09, + "loss": 0.3531, + "step": 12529 + }, + { + "epoch": 5.92434988179669, + "grad_norm": 3.2750864028930664, + "learning_rate": 1.992235038284218e-09, + "loss": 0.3481, + "step": 12530 + }, + { + "epoch": 5.924822695035461, + "grad_norm": 3.969918727874756, + "learning_rate": 1.967413178586841e-09, + "loss": 0.3035, + "step": 12531 + }, + { + "epoch": 5.925295508274232, + "grad_norm": 3.5722131729125977, + "learning_rate": 1.942746860424949e-09, + "loss": 0.3565, + "step": 12532 + }, + { + "epoch": 5.925768321513003, + "grad_norm": 3.3106608390808105, + "learning_rate": 1.918236085334535e-09, + "loss": 0.3029, + "step": 12533 + }, + { + "epoch": 5.926241134751773, + "grad_norm": 2.830862522125244, + "learning_rate": 1.8938808548418785e-09, + "loss": 0.3048, + "step": 12534 + }, + { + "epoch": 5.926713947990544, + "grad_norm": 3.785337209701538, + "learning_rate": 1.8696811704635443e-09, + "loss": 0.3596, + "step": 12535 + }, + { + "epoch": 5.927186761229314, + "grad_norm": 2.95487904548645, + "learning_rate": 1.845637033705827e-09, + "loss": 0.3138, + "step": 12536 + }, + { + "epoch": 5.927659574468085, + "grad_norm": 3.2796683311462402, + "learning_rate": 1.821748446066418e-09, + "loss": 0.2851, + "step": 12537 + }, + { + "epoch": 5.928132387706856, + "grad_norm": 3.2564337253570557, + "learning_rate": 1.7980154090327384e-09, + "loss": 0.3185, + "step": 12538 + }, + { + "epoch": 5.9286052009456265, + "grad_norm": 3.140939950942993, + "learning_rate": 1.7744379240822173e-09, + "loss": 0.3562, + "step": 12539 + }, + { + "epoch": 5.929078014184397, + "grad_norm": 3.1424221992492676, + "learning_rate": 1.7510159926828473e-09, + "loss": 0.3174, + "step": 12540 + }, + { + "epoch": 5.929550827423168, + "grad_norm": 3.8264408111572266, + "learning_rate": 1.7277496162934615e-09, + "loss": 0.3059, + "step": 12541 + }, + { + "epoch": 5.930023640661939, + "grad_norm": 3.426374673843384, + "learning_rate": 1.7046387963626232e-09, + "loss": 0.335, + "step": 12542 + }, + { + "epoch": 5.930496453900709, + "grad_norm": 3.537398099899292, + "learning_rate": 1.6816835343289039e-09, + "loss": 0.3797, + "step": 12543 + }, + { + "epoch": 5.93096926713948, + "grad_norm": 3.5586562156677246, + "learning_rate": 1.6588838316219935e-09, + "loss": 0.4037, + "step": 12544 + }, + { + "epoch": 5.93144208037825, + 
"grad_norm": 3.411766290664673, + "learning_rate": 1.6362396896618672e-09, + "loss": 0.3927, + "step": 12545 + }, + { + "epoch": 5.931914893617021, + "grad_norm": 3.1392691135406494, + "learning_rate": 1.6137511098576752e-09, + "loss": 0.2881, + "step": 12546 + }, + { + "epoch": 5.932387706855792, + "grad_norm": 3.751774787902832, + "learning_rate": 1.5914180936102418e-09, + "loss": 0.3058, + "step": 12547 + }, + { + "epoch": 5.932860520094563, + "grad_norm": 3.5063114166259766, + "learning_rate": 1.5692406423101215e-09, + "loss": 0.2882, + "step": 12548 + }, + { + "epoch": 5.933333333333334, + "grad_norm": 3.547232151031494, + "learning_rate": 1.5472187573378762e-09, + "loss": 0.2838, + "step": 12549 + }, + { + "epoch": 5.933806146572104, + "grad_norm": 3.0414838790893555, + "learning_rate": 1.5253524400651864e-09, + "loss": 0.2985, + "step": 12550 + }, + { + "epoch": 5.934278959810875, + "grad_norm": 3.165259599685669, + "learning_rate": 1.503641691853186e-09, + "loss": 0.304, + "step": 12551 + }, + { + "epoch": 5.934751773049645, + "grad_norm": 2.8662290573120117, + "learning_rate": 1.4820865140538488e-09, + "loss": 0.2988, + "step": 12552 + }, + { + "epoch": 5.935224586288416, + "grad_norm": 3.483565330505371, + "learning_rate": 1.4606869080091567e-09, + "loss": 0.3328, + "step": 12553 + }, + { + "epoch": 5.935697399527187, + "grad_norm": 3.4734973907470703, + "learning_rate": 1.439442875051933e-09, + "loss": 0.3263, + "step": 12554 + }, + { + "epoch": 5.9361702127659575, + "grad_norm": 3.7281854152679443, + "learning_rate": 1.4183544165047303e-09, + "loss": 0.3849, + "step": 12555 + }, + { + "epoch": 5.936643026004728, + "grad_norm": 3.1552608013153076, + "learning_rate": 1.3974215336806652e-09, + "loss": 0.3537, + "step": 12556 + }, + { + "epoch": 5.937115839243499, + "grad_norm": 3.2095046043395996, + "learning_rate": 1.3766442278831393e-09, + "loss": 0.3154, + "step": 12557 + }, + { + "epoch": 5.93758865248227, + "grad_norm": 3.242431402206421, + "learning_rate": 1.3560225004055627e-09, + "loss": 0.2952, + "step": 12558 + }, + { + "epoch": 5.93806146572104, + "grad_norm": 3.4177846908569336, + "learning_rate": 1.3355563525324632e-09, + "loss": 0.373, + "step": 12559 + }, + { + "epoch": 5.938534278959811, + "grad_norm": 3.2495975494384766, + "learning_rate": 1.3152457855380995e-09, + "loss": 0.3472, + "step": 12560 + }, + { + "epoch": 5.939007092198581, + "grad_norm": 3.5553295612335205, + "learning_rate": 1.295090800686738e-09, + "loss": 0.302, + "step": 12561 + }, + { + "epoch": 5.939479905437352, + "grad_norm": 3.769275188446045, + "learning_rate": 1.2750913992337632e-09, + "loss": 0.3566, + "step": 12562 + }, + { + "epoch": 5.939952718676123, + "grad_norm": 3.7617647647857666, + "learning_rate": 1.2552475824240128e-09, + "loss": 0.3565, + "step": 12563 + }, + { + "epoch": 5.940425531914894, + "grad_norm": 3.5219485759735107, + "learning_rate": 1.2355593514934428e-09, + "loss": 0.3704, + "step": 12564 + }, + { + "epoch": 5.9408983451536646, + "grad_norm": 3.0995635986328125, + "learning_rate": 1.2160267076680166e-09, + "loss": 0.3038, + "step": 12565 + }, + { + "epoch": 5.941371158392435, + "grad_norm": 3.5231125354766846, + "learning_rate": 1.196649652163706e-09, + "loss": 0.3078, + "step": 12566 + }, + { + "epoch": 5.941843971631206, + "grad_norm": 3.571253776550293, + "learning_rate": 1.1774281861867687e-09, + "loss": 0.3179, + "step": 12567 + }, + { + "epoch": 5.942316784869976, + "grad_norm": 3.3609511852264404, + "learning_rate": 1.1583623109348575e-09, + "loss": 0.3057, + 
"step": 12568 + }, + { + "epoch": 5.942789598108747, + "grad_norm": 3.4012279510498047, + "learning_rate": 1.1394520275942455e-09, + "loss": 0.328, + "step": 12569 + }, + { + "epoch": 5.943262411347518, + "grad_norm": 3.3525643348693848, + "learning_rate": 1.1206973373428798e-09, + "loss": 0.325, + "step": 12570 + }, + { + "epoch": 5.9437352245862884, + "grad_norm": 3.5196216106414795, + "learning_rate": 1.1020982413487146e-09, + "loss": 0.3484, + "step": 12571 + }, + { + "epoch": 5.944208037825059, + "grad_norm": 3.3853707313537598, + "learning_rate": 1.0836547407691577e-09, + "loss": 0.3082, + "step": 12572 + }, + { + "epoch": 5.94468085106383, + "grad_norm": 3.011427640914917, + "learning_rate": 1.0653668367532898e-09, + "loss": 0.3214, + "step": 12573 + }, + { + "epoch": 5.945153664302601, + "grad_norm": 3.4044291973114014, + "learning_rate": 1.0472345304393671e-09, + "loss": 0.2829, + "step": 12574 + }, + { + "epoch": 5.945626477541371, + "grad_norm": 3.5128118991851807, + "learning_rate": 1.0292578229564864e-09, + "loss": 0.3547, + "step": 12575 + }, + { + "epoch": 5.9460992907801415, + "grad_norm": 3.8453915119171143, + "learning_rate": 1.0114367154243076e-09, + "loss": 0.3614, + "step": 12576 + }, + { + "epoch": 5.946572104018912, + "grad_norm": 3.520470380783081, + "learning_rate": 9.93771208952221e-10, + "loss": 0.3365, + "step": 12577 + }, + { + "epoch": 5.947044917257683, + "grad_norm": 2.913461208343506, + "learning_rate": 9.762613046399029e-10, + "loss": 0.3205, + "step": 12578 + }, + { + "epoch": 5.947517730496454, + "grad_norm": 3.097443103790283, + "learning_rate": 9.589070035781468e-10, + "loss": 0.3293, + "step": 12579 + }, + { + "epoch": 5.947990543735225, + "grad_norm": 3.0502073764801025, + "learning_rate": 9.417083068472e-10, + "loss": 0.2959, + "step": 12580 + }, + { + "epoch": 5.9484633569739955, + "grad_norm": 3.49462628364563, + "learning_rate": 9.246652155181501e-10, + "loss": 0.3158, + "step": 12581 + }, + { + "epoch": 5.948936170212766, + "grad_norm": 3.5302231311798096, + "learning_rate": 9.077777306518154e-10, + "loss": 0.3397, + "step": 12582 + }, + { + "epoch": 5.949408983451537, + "grad_norm": 3.1912498474121094, + "learning_rate": 8.910458533004096e-10, + "loss": 0.3169, + "step": 12583 + }, + { + "epoch": 5.949881796690307, + "grad_norm": 3.133894205093384, + "learning_rate": 8.744695845050444e-10, + "loss": 0.3104, + "step": 12584 + }, + { + "epoch": 5.950354609929078, + "grad_norm": 3.1867423057556152, + "learning_rate": 8.580489252979495e-10, + "loss": 0.2583, + "step": 12585 + }, + { + "epoch": 5.950827423167849, + "grad_norm": 3.6315085887908936, + "learning_rate": 8.417838767019182e-10, + "loss": 0.3268, + "step": 12586 + }, + { + "epoch": 5.951300236406619, + "grad_norm": 3.5569820404052734, + "learning_rate": 8.256744397294736e-10, + "loss": 0.3381, + "step": 12587 + }, + { + "epoch": 5.95177304964539, + "grad_norm": 3.262516736984253, + "learning_rate": 8.097206153839798e-10, + "loss": 0.3579, + "step": 12588 + }, + { + "epoch": 5.952245862884161, + "grad_norm": 3.626657485961914, + "learning_rate": 7.939224046582539e-10, + "loss": 0.3547, + "step": 12589 + }, + { + "epoch": 5.952718676122932, + "grad_norm": 3.1696643829345703, + "learning_rate": 7.782798085365084e-10, + "loss": 0.3459, + "step": 12590 + }, + { + "epoch": 5.953191489361702, + "grad_norm": 3.387317419052124, + "learning_rate": 7.627928279924091e-10, + "loss": 0.3752, + "step": 12591 + }, + { + "epoch": 5.9536643026004725, + "grad_norm": 3.160543441772461, + "learning_rate": 
7.474614639904621e-10, + "loss": 0.3343, + "step": 12592 + }, + { + "epoch": 5.954137115839243, + "grad_norm": 3.4762585163116455, + "learning_rate": 7.32285717484904e-10, + "loss": 0.3407, + "step": 12593 + }, + { + "epoch": 5.954609929078014, + "grad_norm": 3.5852251052856445, + "learning_rate": 7.172655894213676e-10, + "loss": 0.2727, + "step": 12594 + }, + { + "epoch": 5.955082742316785, + "grad_norm": 3.3408043384552, + "learning_rate": 7.024010807343829e-10, + "loss": 0.3203, + "step": 12595 + }, + { + "epoch": 5.955555555555556, + "grad_norm": 4.2062087059021, + "learning_rate": 6.876921923498758e-10, + "loss": 0.3682, + "step": 12596 + }, + { + "epoch": 5.9560283687943265, + "grad_norm": 4.376652717590332, + "learning_rate": 6.73138925183503e-10, + "loss": 0.3384, + "step": 12597 + }, + { + "epoch": 5.956501182033097, + "grad_norm": 3.4473304748535156, + "learning_rate": 6.587412801417614e-10, + "loss": 0.2883, + "step": 12598 + }, + { + "epoch": 5.956973995271868, + "grad_norm": 3.453704357147217, + "learning_rate": 6.444992581208786e-10, + "loss": 0.3558, + "step": 12599 + }, + { + "epoch": 5.957446808510638, + "grad_norm": 3.6638705730438232, + "learning_rate": 6.304128600076453e-10, + "loss": 0.3481, + "step": 12600 + }, + { + "epoch": 5.957919621749409, + "grad_norm": 3.319033145904541, + "learning_rate": 6.164820866791377e-10, + "loss": 0.3474, + "step": 12601 + }, + { + "epoch": 5.95839243498818, + "grad_norm": 3.0885703563690186, + "learning_rate": 6.02706939002995e-10, + "loss": 0.3023, + "step": 12602 + }, + { + "epoch": 5.95886524822695, + "grad_norm": 3.0238006114959717, + "learning_rate": 5.890874178365868e-10, + "loss": 0.3123, + "step": 12603 + }, + { + "epoch": 5.959338061465721, + "grad_norm": 3.1967854499816895, + "learning_rate": 5.756235240281238e-10, + "loss": 0.2907, + "step": 12604 + }, + { + "epoch": 5.959810874704492, + "grad_norm": 3.2789015769958496, + "learning_rate": 5.623152584161018e-10, + "loss": 0.3304, + "step": 12605 + }, + { + "epoch": 5.960283687943263, + "grad_norm": 2.9390599727630615, + "learning_rate": 5.491626218290247e-10, + "loss": 0.2878, + "step": 12606 + }, + { + "epoch": 5.960756501182033, + "grad_norm": 3.0502469539642334, + "learning_rate": 5.36165615085682e-10, + "loss": 0.2774, + "step": 12607 + }, + { + "epoch": 5.9612293144208035, + "grad_norm": 3.309628963470459, + "learning_rate": 5.233242389954262e-10, + "loss": 0.3192, + "step": 12608 + }, + { + "epoch": 5.961702127659574, + "grad_norm": 3.4259536266326904, + "learning_rate": 5.106384943578957e-10, + "loss": 0.303, + "step": 12609 + }, + { + "epoch": 5.962174940898345, + "grad_norm": 3.8046417236328125, + "learning_rate": 4.981083819630139e-10, + "loss": 0.3016, + "step": 12610 + }, + { + "epoch": 5.962647754137116, + "grad_norm": 3.7150895595550537, + "learning_rate": 4.857339025909902e-10, + "loss": 0.3508, + "step": 12611 + }, + { + "epoch": 5.963120567375887, + "grad_norm": 3.202986717224121, + "learning_rate": 4.735150570123192e-10, + "loss": 0.2981, + "step": 12612 + }, + { + "epoch": 5.9635933806146575, + "grad_norm": 3.3235676288604736, + "learning_rate": 4.614518459877815e-10, + "loss": 0.3602, + "step": 12613 + }, + { + "epoch": 5.964066193853428, + "grad_norm": 3.443566083908081, + "learning_rate": 4.4954427026844273e-10, + "loss": 0.3364, + "step": 12614 + }, + { + "epoch": 5.964539007092198, + "grad_norm": 3.0922253131866455, + "learning_rate": 4.377923305956544e-10, + "loss": 0.3483, + "step": 12615 + }, + { + "epoch": 5.965011820330969, + "grad_norm": 
3.164511203765869, + "learning_rate": 4.261960277013311e-10, + "loss": 0.3303, + "step": 12616 + }, + { + "epoch": 5.96548463356974, + "grad_norm": 3.2433879375457764, + "learning_rate": 4.1475536230767275e-10, + "loss": 0.3382, + "step": 12617 + }, + { + "epoch": 5.965957446808511, + "grad_norm": 3.1344754695892334, + "learning_rate": 4.0347033512661007e-10, + "loss": 0.3195, + "step": 12618 + }, + { + "epoch": 5.966430260047281, + "grad_norm": 3.2667789459228516, + "learning_rate": 3.923409468611916e-10, + "loss": 0.3353, + "step": 12619 + }, + { + "epoch": 5.966903073286052, + "grad_norm": 3.0841329097747803, + "learning_rate": 3.8136719820419666e-10, + "loss": 0.3321, + "step": 12620 + }, + { + "epoch": 5.967375886524823, + "grad_norm": 3.5874977111816406, + "learning_rate": 3.7054908983896743e-10, + "loss": 0.339, + "step": 12621 + }, + { + "epoch": 5.967848699763593, + "grad_norm": 3.3214657306671143, + "learning_rate": 3.5988662243913174e-10, + "loss": 0.2722, + "step": 12622 + }, + { + "epoch": 5.968321513002364, + "grad_norm": 3.529088258743286, + "learning_rate": 3.49379796668603e-10, + "loss": 0.3119, + "step": 12623 + }, + { + "epoch": 5.9687943262411345, + "grad_norm": 3.0304949283599854, + "learning_rate": 3.390286131815801e-10, + "loss": 0.2982, + "step": 12624 + }, + { + "epoch": 5.969267139479905, + "grad_norm": 3.099729061126709, + "learning_rate": 3.288330726225475e-10, + "loss": 0.3886, + "step": 12625 + }, + { + "epoch": 5.969739952718676, + "grad_norm": 4.111376762390137, + "learning_rate": 3.187931756262752e-10, + "loss": 0.3365, + "step": 12626 + }, + { + "epoch": 5.970212765957447, + "grad_norm": 3.641390562057495, + "learning_rate": 3.089089228178188e-10, + "loss": 0.3272, + "step": 12627 + }, + { + "epoch": 5.970685579196218, + "grad_norm": 3.644512891769409, + "learning_rate": 2.991803148130745e-10, + "loss": 0.3679, + "step": 12628 + }, + { + "epoch": 5.9711583924349885, + "grad_norm": 3.3023669719696045, + "learning_rate": 2.8960735221739146e-10, + "loss": 0.3153, + "step": 12629 + }, + { + "epoch": 5.971631205673759, + "grad_norm": 3.5878617763519287, + "learning_rate": 2.8019003562695937e-10, + "loss": 0.3384, + "step": 12630 + }, + { + "epoch": 5.972104018912529, + "grad_norm": 3.2044191360473633, + "learning_rate": 2.709283656282535e-10, + "loss": 0.3238, + "step": 12631 + }, + { + "epoch": 5.9725768321513, + "grad_norm": 3.1008009910583496, + "learning_rate": 2.6182234279775733e-10, + "loss": 0.3038, + "step": 12632 + }, + { + "epoch": 5.973049645390071, + "grad_norm": 3.578662872314453, + "learning_rate": 2.528719677025171e-10, + "loss": 0.3529, + "step": 12633 + }, + { + "epoch": 5.973522458628842, + "grad_norm": 3.211794137954712, + "learning_rate": 2.4407724089986483e-10, + "loss": 0.3326, + "step": 12634 + }, + { + "epoch": 5.973995271867612, + "grad_norm": 3.3977439403533936, + "learning_rate": 2.3543816293741807e-10, + "loss": 0.3552, + "step": 12635 + }, + { + "epoch": 5.974468085106383, + "grad_norm": 3.276383638381958, + "learning_rate": 2.2695473435335735e-10, + "loss": 0.323, + "step": 12636 + }, + { + "epoch": 5.974940898345154, + "grad_norm": 3.3910670280456543, + "learning_rate": 2.1862695567531623e-10, + "loss": 0.3771, + "step": 12637 + }, + { + "epoch": 5.975413711583924, + "grad_norm": 3.0644946098327637, + "learning_rate": 2.1045482742232392e-10, + "loss": 0.328, + "step": 12638 + }, + { + "epoch": 5.975886524822695, + "grad_norm": 3.269826650619507, + "learning_rate": 2.0243835010314016e-10, + "loss": 0.3167, + "step": 12639 + }, + { 
+ "epoch": 5.9763593380614655, + "grad_norm": 3.083881378173828, + "learning_rate": 1.945775242168102e-10, + "loss": 0.299, + "step": 12640 + }, + { + "epoch": 5.976832151300236, + "grad_norm": 3.8910908699035645, + "learning_rate": 1.8687235025266482e-10, + "loss": 0.3703, + "step": 12641 + }, + { + "epoch": 5.977304964539007, + "grad_norm": 3.2514829635620117, + "learning_rate": 1.793228286905979e-10, + "loss": 0.3372, + "step": 12642 + }, + { + "epoch": 5.977777777777778, + "grad_norm": 3.677476167678833, + "learning_rate": 1.7192896000078896e-10, + "loss": 0.3025, + "step": 12643 + }, + { + "epoch": 5.978250591016549, + "grad_norm": 3.4517319202423096, + "learning_rate": 1.6469074464370295e-10, + "loss": 0.3534, + "step": 12644 + }, + { + "epoch": 5.9787234042553195, + "grad_norm": 3.403575897216797, + "learning_rate": 1.5760818306981286e-10, + "loss": 0.3381, + "step": 12645 + }, + { + "epoch": 5.97919621749409, + "grad_norm": 3.3433964252471924, + "learning_rate": 1.5068127572015477e-10, + "loss": 0.3, + "step": 12646 + }, + { + "epoch": 5.97966903073286, + "grad_norm": 3.4642858505249023, + "learning_rate": 1.439100230260504e-10, + "loss": 0.3366, + "step": 12647 + }, + { + "epoch": 5.980141843971631, + "grad_norm": 3.233149290084839, + "learning_rate": 1.3729442540910687e-10, + "loss": 0.3518, + "step": 12648 + }, + { + "epoch": 5.980614657210402, + "grad_norm": 4.615501880645752, + "learning_rate": 1.3083448328121694e-10, + "loss": 0.3188, + "step": 12649 + }, + { + "epoch": 5.9810874704491725, + "grad_norm": 3.344639778137207, + "learning_rate": 1.2453019704483648e-10, + "loss": 0.3275, + "step": 12650 + }, + { + "epoch": 5.981560283687943, + "grad_norm": 3.085968017578125, + "learning_rate": 1.1838156709215176e-10, + "loss": 0.3019, + "step": 12651 + }, + { + "epoch": 5.982033096926714, + "grad_norm": 3.3743207454681396, + "learning_rate": 1.1238859380618971e-10, + "loss": 0.3372, + "step": 12652 + }, + { + "epoch": 5.982505910165485, + "grad_norm": 3.885718822479248, + "learning_rate": 1.0655127756026285e-10, + "loss": 0.3697, + "step": 12653 + }, + { + "epoch": 5.982978723404255, + "grad_norm": 3.1877360343933105, + "learning_rate": 1.0086961871769163e-10, + "loss": 0.314, + "step": 12654 + }, + { + "epoch": 5.983451536643026, + "grad_norm": 3.0923852920532227, + "learning_rate": 9.534361763208211e-11, + "loss": 0.3071, + "step": 12655 + }, + { + "epoch": 5.9839243498817964, + "grad_norm": 3.0377893447875977, + "learning_rate": 8.997327464788097e-11, + "loss": 0.2921, + "step": 12656 + }, + { + "epoch": 5.984397163120567, + "grad_norm": 3.351410150527954, + "learning_rate": 8.475859009898779e-11, + "loss": 0.3607, + "step": 12657 + }, + { + "epoch": 5.984869976359338, + "grad_norm": 3.2637524604797363, + "learning_rate": 7.969956431069792e-11, + "loss": 0.3243, + "step": 12658 + }, + { + "epoch": 5.985342789598109, + "grad_norm": 3.276374101638794, + "learning_rate": 7.479619759748203e-11, + "loss": 0.3364, + "step": 12659 + }, + { + "epoch": 5.98581560283688, + "grad_norm": 3.1528098583221436, + "learning_rate": 7.004849026492899e-11, + "loss": 0.3108, + "step": 12660 + }, + { + "epoch": 5.98628841607565, + "grad_norm": 3.1490159034729004, + "learning_rate": 6.545644260863571e-11, + "loss": 0.2671, + "step": 12661 + }, + { + "epoch": 5.986761229314421, + "grad_norm": 3.179043769836426, + "learning_rate": 6.10200549144846e-11, + "loss": 0.3132, + "step": 12662 + }, + { + "epoch": 5.987234042553191, + "grad_norm": 3.195889711380005, + "learning_rate": 5.673932745864363e-11, + 
"loss": 0.3239, + "step": 12663 + }, + { + "epoch": 5.987706855791962, + "grad_norm": 4.262112140655518, + "learning_rate": 5.261426050756635e-11, + "loss": 0.3875, + "step": 12664 + }, + { + "epoch": 5.988179669030733, + "grad_norm": 3.3035454750061035, + "learning_rate": 4.864485431854693e-11, + "loss": 0.3001, + "step": 12665 + }, + { + "epoch": 5.9886524822695035, + "grad_norm": 3.0973124504089355, + "learning_rate": 4.483110913833244e-11, + "loss": 0.2993, + "step": 12666 + }, + { + "epoch": 5.989125295508274, + "grad_norm": 3.536430835723877, + "learning_rate": 4.117302520451061e-11, + "loss": 0.3625, + "step": 12667 + }, + { + "epoch": 5.989598108747045, + "grad_norm": 3.1173746585845947, + "learning_rate": 3.767060274495471e-11, + "loss": 0.3317, + "step": 12668 + }, + { + "epoch": 5.990070921985816, + "grad_norm": 2.9591150283813477, + "learning_rate": 3.432384197754601e-11, + "loss": 0.2764, + "step": 12669 + }, + { + "epoch": 5.990543735224586, + "grad_norm": 3.2574095726013184, + "learning_rate": 3.113274311072889e-11, + "loss": 0.2925, + "step": 12670 + }, + { + "epoch": 5.991016548463357, + "grad_norm": 3.135549306869507, + "learning_rate": 2.8097306343233265e-11, + "loss": 0.3145, + "step": 12671 + }, + { + "epoch": 5.991489361702127, + "grad_norm": 3.5706069469451904, + "learning_rate": 2.5217531864074607e-11, + "loss": 0.3425, + "step": 12672 + }, + { + "epoch": 5.991962174940898, + "grad_norm": 3.5643908977508545, + "learning_rate": 2.2493419852831487e-11, + "loss": 0.3576, + "step": 12673 + }, + { + "epoch": 5.992434988179669, + "grad_norm": 3.684077739715576, + "learning_rate": 1.9924970478535365e-11, + "loss": 0.3748, + "step": 12674 + }, + { + "epoch": 5.99290780141844, + "grad_norm": 3.212735891342163, + "learning_rate": 1.7512183901613468e-11, + "loss": 0.2961, + "step": 12675 + }, + { + "epoch": 5.993380614657211, + "grad_norm": 2.776723623275757, + "learning_rate": 1.5255060271945897e-11, + "loss": 0.3052, + "step": 12676 + }, + { + "epoch": 5.993853427895981, + "grad_norm": 3.1812119483947754, + "learning_rate": 1.315359973025343e-11, + "loss": 0.2784, + "step": 12677 + }, + { + "epoch": 5.994326241134752, + "grad_norm": 3.5751991271972656, + "learning_rate": 1.1207802407542379e-11, + "loss": 0.309, + "step": 12678 + }, + { + "epoch": 5.994799054373522, + "grad_norm": 3.6249916553497314, + "learning_rate": 9.417668424827054e-12, + "loss": 0.3345, + "step": 12679 + }, + { + "epoch": 5.995271867612293, + "grad_norm": 3.166501522064209, + "learning_rate": 7.783197893407313e-12, + "loss": 0.3697, + "step": 12680 + }, + { + "epoch": 5.995744680851064, + "grad_norm": 2.945590019226074, + "learning_rate": 6.304390915146119e-12, + "loss": 0.2907, + "step": 12681 + }, + { + "epoch": 5.9962174940898345, + "grad_norm": 3.273108959197998, + "learning_rate": 4.981247582191984e-12, + "loss": 0.3229, + "step": 12682 + }, + { + "epoch": 5.996690307328605, + "grad_norm": 3.1620328426361084, + "learning_rate": 3.813767976978966e-12, + "loss": 0.3279, + "step": 12683 + }, + { + "epoch": 5.997163120567376, + "grad_norm": 3.197380304336548, + "learning_rate": 2.8019521719491182e-12, + "loss": 0.3382, + "step": 12684 + }, + { + "epoch": 5.997635933806147, + "grad_norm": 3.36995005607605, + "learning_rate": 1.945800230662709e-12, + "loss": 0.2993, + "step": 12685 + }, + { + "epoch": 5.998108747044917, + "grad_norm": 3.0389227867126465, + "learning_rate": 1.2453122055777755e-12, + "loss": 0.3357, + "step": 12686 + }, + { + "epoch": 5.998581560283688, + "grad_norm": 3.352140426635742, + 
"learning_rate": 7.004881411032394e-13, + "loss": 0.3167, + "step": 12687 + }, + { + "epoch": 5.999054373522458, + "grad_norm": 3.273684024810791, + "learning_rate": 3.1132807082334683e-13, + "loss": 0.3124, + "step": 12688 + }, + { + "epoch": 5.999527186761229, + "grad_norm": 3.941155433654785, + "learning_rate": 7.783201888544867e-14, + "loss": 0.3472, + "step": 12689 + }, + { + "epoch": 6.0, + "grad_norm": 2.634655237197876, + "learning_rate": 0.0, + "loss": 0.3022, + "step": 12690 + } + ], + "logging_steps": 1, + "max_steps": 12690, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 2115, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.2054353166598144e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12690/training_args.bin b/checkpoint-12690/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc --- /dev/null +++ b/checkpoint-12690/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6 +size 8056 diff --git a/checkpoint-12690/zero_to_fp32.py b/checkpoint-12690/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-12690/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + 
state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + 
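+    # Note: under ZeRO-1/2 the frozen parameters are not partitioned across ranks, so the
+    # fragments saved by rank 0 already hold the full tensors and are copied into the
+    # reconstructed state dict as-is below.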
frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * 
world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. 
i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-2115/README.md b/checkpoint-2115/README.md new file mode 100644 index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4 --- /dev/null +++ b/checkpoint-2115/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
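+As a minimal sketch (not an official snippet from this repository), the LoRA adapter saved in this checkpoint could be attached to the base model with PEFT roughly as follows; the local path `./checkpoint-2115`, the device settings, and the example prompt are assumptions for illustration only:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+# Load the Llama-3.1-8B base weights (device_map="auto" assumes `accelerate`
+# is installed; adjust to your hardware).
+base = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B",
+    torch_dtype="auto",
+    device_map="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
+
+# Attach the adapter weights from this checkpoint folder (path is illustrative).
+model = PeftModel.from_pretrained(base, "./checkpoint-2115")
+model.eval()
+
+prompt = "Summarize the key eligibility criteria of the trial in one sentence."
+inputs = tokenizer(prompt, return_tensors="pt").to(base.device)
+outputs = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```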
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-2115/adapter_config.json b/checkpoint-2115/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a --- /dev/null +++ b/checkpoint-2115/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2115/adapter_model.safetensors b/checkpoint-2115/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4da763432abb5296cfd6c4ab09bc27dd7dba8c11 --- /dev/null +++ b/checkpoint-2115/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:026f3c7889342f0fe43b51086158facacc03251e62f9af75a2561c3bd540fe59 +size 3443586272 diff --git a/checkpoint-2115/global_step2115/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 
b/checkpoint-2115/global_step2115/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d809445a1e9ab5ad0f90d907f08223e2bca9493 --- /dev/null +++ b/checkpoint-2115/global_step2115/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec14e35a08f2f89de6b56dfedb73f3de522db79770e7c655e61fb822a23a2942 +size 20661195036 diff --git a/checkpoint-2115/global_step2115/mp_rank_00_model_states.pt b/checkpoint-2115/global_step2115/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dd07acf4b4fbfa964e3f99ed0a94aa23b91cb56 --- /dev/null +++ b/checkpoint-2115/global_step2115/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aebb51cfec455d55d4ac9fb1829aeb3b1cb2978ad761fc8fb52f73a313fc76a6 +size 3555326777 diff --git a/checkpoint-2115/latest b/checkpoint-2115/latest new file mode 100644 index 0000000000000000000000000000000000000000..eab2857ea42f429150d4483575e33a16db9abe77 --- /dev/null +++ b/checkpoint-2115/latest @@ -0,0 +1 @@ +global_step2115 \ No newline at end of file diff --git a/checkpoint-2115/rng_state.pth b/checkpoint-2115/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7435620051efd12b77e0ed7e5c44cebaacb7bbea --- /dev/null +++ b/checkpoint-2115/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3626e2b3ca653eb16e81523f796a5be969ddc5e19daefb968419425ad5a11285 +size 14244 diff --git a/checkpoint-2115/scheduler.pt b/checkpoint-2115/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..22828224539edb6b797a5020a5994a7ead0dd2e4 --- /dev/null +++ b/checkpoint-2115/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6f065ea9d96632a81e5e933322d38152bef3ca141882db2bf749d2021df436b +size 1064 diff --git a/checkpoint-2115/special_tokens_map.json b/checkpoint-2115/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-2115/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2115/tokenizer.json b/checkpoint-2115/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-2115/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-2115/tokenizer_config.json b/checkpoint-2115/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5 --- /dev/null +++ b/checkpoint-2115/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": 
"<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": 
"<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": 
"<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": 
"<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": 
"<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": 
"<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": 
"<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": 
"<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": 
"<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": 
"<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-2115/trainer_state.json b/checkpoint-2115/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d54d1ea9b36cf3c6ea365748311c184de9beea1f --- /dev/null +++ b/checkpoint-2115/trainer_state.json @@ -0,0 +1,14838 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2115, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00047281323877068556, + "grad_norm": 5.163570880889893, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.4628, + "step": 1 + }, + { + "epoch": 0.0009456264775413711, + "grad_norm": 6.298020839691162, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.5003, + "step": 2 + }, + { + "epoch": 0.0014184397163120568, + "grad_norm": 5.853623390197754, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.4495, + "step": 3 + }, + { + "epoch": 0.0018912529550827422, + "grad_norm": 5.456025123596191, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.3798, + "step": 4 + }, + { + "epoch": 0.002364066193853428, + "grad_norm": 5.757407188415527, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.4515, + "step": 5 + }, + { + "epoch": 0.0028368794326241137, + "grad_norm": 5.872277736663818, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4424, + "step": 6 + }, + { + "epoch": 0.003309692671394799, + "grad_norm": 6.7816009521484375, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.4004, + "step": 7 + }, + { + "epoch": 0.0037825059101654845, + "grad_norm": 6.229667663574219, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.4494, + "step": 8 + }, + { + "epoch": 0.00425531914893617, + "grad_norm": 5.336202621459961, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.3916, + "step": 9 + }, + { + "epoch": 0.004728132387706856, + "grad_norm": 5.589445114135742, + "learning_rate": 5.000000000000001e-07, + "loss": 1.2318, + "step": 10 + }, + { + "epoch": 0.005200945626477541, + "grad_norm": 5.720539569854736, + "learning_rate": 5.5e-07, + "loss": 1.4367, + "step": 11 + }, + { + "epoch": 0.005673758865248227, + "grad_norm": 5.913913726806641, + "learning_rate": 6.000000000000001e-07, + "loss": 1.342, + "step": 12 + }, + { + "epoch": 0.006146572104018913, + "grad_norm": 5.899744987487793, + "learning_rate": 6.5e-07, + "loss": 1.4307, + "step": 13 + }, + { + "epoch": 0.006619385342789598, + "grad_norm": 5.571037292480469, + "learning_rate": 7.000000000000001e-07, + "loss": 1.3372, + "step": 14 + }, + { + "epoch": 0.0070921985815602835, + "grad_norm": 5.480010509490967, + "learning_rate": 7.5e-07, + "loss": 1.3923, + "step": 15 + }, + { + "epoch": 0.007565011820330969, + "grad_norm": 5.254702091217041, + "learning_rate": 8.000000000000001e-07, + "loss": 1.2928, + "step": 16 + }, + { + "epoch": 0.008037825059101654, + "grad_norm": 
6.090312480926514, + "learning_rate": 8.500000000000001e-07, + "loss": 1.4984, + "step": 17 + }, + { + "epoch": 0.00851063829787234, + "grad_norm": 5.689319610595703, + "learning_rate": 9.000000000000001e-07, + "loss": 1.4108, + "step": 18 + }, + { + "epoch": 0.008983451536643027, + "grad_norm": 5.386685848236084, + "learning_rate": 9.500000000000001e-07, + "loss": 1.425, + "step": 19 + }, + { + "epoch": 0.009456264775413711, + "grad_norm": 6.451584815979004, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5507, + "step": 20 + }, + { + "epoch": 0.009929078014184398, + "grad_norm": 5.37647008895874, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.4109, + "step": 21 + }, + { + "epoch": 0.010401891252955082, + "grad_norm": 4.716553211212158, + "learning_rate": 1.1e-06, + "loss": 1.2028, + "step": 22 + }, + { + "epoch": 0.010874704491725768, + "grad_norm": 4.950989723205566, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.3043, + "step": 23 + }, + { + "epoch": 0.011347517730496455, + "grad_norm": 4.688975811004639, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.2708, + "step": 24 + }, + { + "epoch": 0.01182033096926714, + "grad_norm": 4.905868053436279, + "learning_rate": 1.25e-06, + "loss": 1.3268, + "step": 25 + }, + { + "epoch": 0.012293144208037825, + "grad_norm": 4.503395080566406, + "learning_rate": 1.3e-06, + "loss": 1.1799, + "step": 26 + }, + { + "epoch": 0.01276595744680851, + "grad_norm": 4.77382230758667, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.3882, + "step": 27 + }, + { + "epoch": 0.013238770685579196, + "grad_norm": 4.734329700469971, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.3476, + "step": 28 + }, + { + "epoch": 0.013711583924349883, + "grad_norm": 4.775066375732422, + "learning_rate": 1.45e-06, + "loss": 1.2429, + "step": 29 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 4.978334426879883, + "learning_rate": 1.5e-06, + "loss": 1.2119, + "step": 30 + }, + { + "epoch": 0.014657210401891253, + "grad_norm": 4.506785869598389, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.3157, + "step": 31 + }, + { + "epoch": 0.015130023640661938, + "grad_norm": 4.007757186889648, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.1451, + "step": 32 + }, + { + "epoch": 0.015602836879432624, + "grad_norm": 3.6621618270874023, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.093, + "step": 33 + }, + { + "epoch": 0.01607565011820331, + "grad_norm": 3.8733766078948975, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.2289, + "step": 34 + }, + { + "epoch": 0.016548463356973995, + "grad_norm": 4.3391900062561035, + "learning_rate": 1.75e-06, + "loss": 1.1453, + "step": 35 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 3.287623643875122, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.0257, + "step": 36 + }, + { + "epoch": 0.017494089834515367, + "grad_norm": 3.591721773147583, + "learning_rate": 1.85e-06, + "loss": 0.9976, + "step": 37 + }, + { + "epoch": 0.017966903073286054, + "grad_norm": 4.028271675109863, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.0773, + "step": 38 + }, + { + "epoch": 0.018439716312056736, + "grad_norm": 3.3543951511383057, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.1677, + "step": 39 + }, + { + "epoch": 0.018912529550827423, + "grad_norm": 3.807624340057373, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1232, + "step": 40 + }, + { + "epoch": 0.01938534278959811, + "grad_norm": 4.242797374725342, + "learning_rate": 2.05e-06, + "loss": 
1.1819, + "step": 41 + }, + { + "epoch": 0.019858156028368795, + "grad_norm": 3.4574992656707764, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9878, + "step": 42 + }, + { + "epoch": 0.02033096926713948, + "grad_norm": 3.906695604324341, + "learning_rate": 2.15e-06, + "loss": 1.0592, + "step": 43 + }, + { + "epoch": 0.020803782505910164, + "grad_norm": 3.7543163299560547, + "learning_rate": 2.2e-06, + "loss": 1.0309, + "step": 44 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 3.3777148723602295, + "learning_rate": 2.25e-06, + "loss": 1.0664, + "step": 45 + }, + { + "epoch": 0.021749408983451537, + "grad_norm": 3.6003634929656982, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.0482, + "step": 46 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 3.3961377143859863, + "learning_rate": 2.35e-06, + "loss": 1.0252, + "step": 47 + }, + { + "epoch": 0.02269503546099291, + "grad_norm": 3.1601035594940186, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.0435, + "step": 48 + }, + { + "epoch": 0.023167848699763592, + "grad_norm": 3.4192967414855957, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.0935, + "step": 49 + }, + { + "epoch": 0.02364066193853428, + "grad_norm": 3.1225922107696533, + "learning_rate": 2.5e-06, + "loss": 0.8988, + "step": 50 + }, + { + "epoch": 0.024113475177304965, + "grad_norm": 3.1423380374908447, + "learning_rate": 2.55e-06, + "loss": 1.0159, + "step": 51 + }, + { + "epoch": 0.02458628841607565, + "grad_norm": 3.4782402515411377, + "learning_rate": 2.6e-06, + "loss": 1.0231, + "step": 52 + }, + { + "epoch": 0.025059101654846337, + "grad_norm": 3.8362693786621094, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.0725, + "step": 53 + }, + { + "epoch": 0.02553191489361702, + "grad_norm": 3.033294916152954, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.9377, + "step": 54 + }, + { + "epoch": 0.026004728132387706, + "grad_norm": 3.849741220474243, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.0046, + "step": 55 + }, + { + "epoch": 0.026477541371158392, + "grad_norm": 3.141876220703125, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9226, + "step": 56 + }, + { + "epoch": 0.02695035460992908, + "grad_norm": 2.773594856262207, + "learning_rate": 2.85e-06, + "loss": 0.8662, + "step": 57 + }, + { + "epoch": 0.027423167848699765, + "grad_norm": 3.1460225582122803, + "learning_rate": 2.9e-06, + "loss": 0.9304, + "step": 58 + }, + { + "epoch": 0.027895981087470448, + "grad_norm": 3.293583631515503, + "learning_rate": 2.95e-06, + "loss": 1.0374, + "step": 59 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 3.8190863132476807, + "learning_rate": 3e-06, + "loss": 0.971, + "step": 60 + }, + { + "epoch": 0.02884160756501182, + "grad_norm": 3.4566776752471924, + "learning_rate": 3.05e-06, + "loss": 0.9631, + "step": 61 + }, + { + "epoch": 0.029314420803782507, + "grad_norm": 3.355741500854492, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.0097, + "step": 62 + }, + { + "epoch": 0.029787234042553193, + "grad_norm": 3.29746675491333, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.9459, + "step": 63 + }, + { + "epoch": 0.030260047281323876, + "grad_norm": 3.3122968673706055, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.8594, + "step": 64 + }, + { + "epoch": 0.030732860520094562, + "grad_norm": 3.477701187133789, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.9197, + "step": 65 + }, + { + "epoch": 0.031205673758865248, + "grad_norm": 3.3363406658172607, + "learning_rate": 
3.3000000000000006e-06, + "loss": 0.9478, + "step": 66 + }, + { + "epoch": 0.03167848699763593, + "grad_norm": 4.143295764923096, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0534, + "step": 67 + }, + { + "epoch": 0.03215130023640662, + "grad_norm": 3.2363274097442627, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9454, + "step": 68 + }, + { + "epoch": 0.032624113475177303, + "grad_norm": 3.198746681213379, + "learning_rate": 3.45e-06, + "loss": 0.9388, + "step": 69 + }, + { + "epoch": 0.03309692671394799, + "grad_norm": 3.5751023292541504, + "learning_rate": 3.5e-06, + "loss": 0.9444, + "step": 70 + }, + { + "epoch": 0.033569739952718676, + "grad_norm": 3.1745729446411133, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8683, + "step": 71 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 3.3210883140563965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.8811, + "step": 72 + }, + { + "epoch": 0.03451536643026005, + "grad_norm": 3.2502429485321045, + "learning_rate": 3.65e-06, + "loss": 1.0012, + "step": 73 + }, + { + "epoch": 0.034988179669030735, + "grad_norm": 3.44598126411438, + "learning_rate": 3.7e-06, + "loss": 0.9217, + "step": 74 + }, + { + "epoch": 0.03546099290780142, + "grad_norm": 3.439117431640625, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8976, + "step": 75 + }, + { + "epoch": 0.03593380614657211, + "grad_norm": 3.523627758026123, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8996, + "step": 76 + }, + { + "epoch": 0.03640661938534279, + "grad_norm": 3.3716015815734863, + "learning_rate": 3.85e-06, + "loss": 0.9061, + "step": 77 + }, + { + "epoch": 0.03687943262411347, + "grad_norm": 3.33518385887146, + "learning_rate": 3.900000000000001e-06, + "loss": 0.9371, + "step": 78 + }, + { + "epoch": 0.03735224586288416, + "grad_norm": 3.833829879760742, + "learning_rate": 3.95e-06, + "loss": 0.9669, + "step": 79 + }, + { + "epoch": 0.037825059101654845, + "grad_norm": 3.260446786880493, + "learning_rate": 4.000000000000001e-06, + "loss": 0.9449, + "step": 80 + }, + { + "epoch": 0.03829787234042553, + "grad_norm": 3.532451629638672, + "learning_rate": 4.05e-06, + "loss": 0.897, + "step": 81 + }, + { + "epoch": 0.03877068557919622, + "grad_norm": 3.1156492233276367, + "learning_rate": 4.1e-06, + "loss": 0.8463, + "step": 82 + }, + { + "epoch": 0.039243498817966904, + "grad_norm": 2.8801751136779785, + "learning_rate": 4.15e-06, + "loss": 0.8616, + "step": 83 + }, + { + "epoch": 0.03971631205673759, + "grad_norm": 3.072476863861084, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.8387, + "step": 84 + }, + { + "epoch": 0.04018912529550828, + "grad_norm": 2.9601376056671143, + "learning_rate": 4.25e-06, + "loss": 0.8538, + "step": 85 + }, + { + "epoch": 0.04066193853427896, + "grad_norm": 3.521664619445801, + "learning_rate": 4.3e-06, + "loss": 0.8894, + "step": 86 + }, + { + "epoch": 0.04113475177304964, + "grad_norm": 3.2670981884002686, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8387, + "step": 87 + }, + { + "epoch": 0.04160756501182033, + "grad_norm": 3.422089099884033, + "learning_rate": 4.4e-06, + "loss": 0.7728, + "step": 88 + }, + { + "epoch": 0.042080378250591015, + "grad_norm": 3.414034128189087, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7968, + "step": 89 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 4.234285354614258, + "learning_rate": 4.5e-06, + "loss": 0.8502, + "step": 90 + }, + { + "epoch": 0.04302600472813239, + "grad_norm": 3.1446919441223145, + "learning_rate": 
4.5500000000000005e-06, + "loss": 0.8236, + "step": 91 + }, + { + "epoch": 0.043498817966903074, + "grad_norm": 3.683443307876587, + "learning_rate": 4.600000000000001e-06, + "loss": 0.9792, + "step": 92 + }, + { + "epoch": 0.04397163120567376, + "grad_norm": 3.664219617843628, + "learning_rate": 4.65e-06, + "loss": 0.8743, + "step": 93 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 3.369479179382324, + "learning_rate": 4.7e-06, + "loss": 0.8741, + "step": 94 + }, + { + "epoch": 0.04491725768321513, + "grad_norm": 3.694949150085449, + "learning_rate": 4.75e-06, + "loss": 0.7574, + "step": 95 + }, + { + "epoch": 0.04539007092198582, + "grad_norm": 3.5144498348236084, + "learning_rate": 4.800000000000001e-06, + "loss": 0.9934, + "step": 96 + }, + { + "epoch": 0.0458628841607565, + "grad_norm": 3.164451837539673, + "learning_rate": 4.85e-06, + "loss": 0.7463, + "step": 97 + }, + { + "epoch": 0.046335697399527184, + "grad_norm": 3.222785472869873, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7698, + "step": 98 + }, + { + "epoch": 0.04680851063829787, + "grad_norm": 2.9129555225372314, + "learning_rate": 4.95e-06, + "loss": 0.7856, + "step": 99 + }, + { + "epoch": 0.04728132387706856, + "grad_norm": 3.5061235427856445, + "learning_rate": 5e-06, + "loss": 0.8588, + "step": 100 + }, + { + "epoch": 0.04775413711583924, + "grad_norm": 3.2805044651031494, + "learning_rate": 4.999999922167982e-06, + "loss": 0.7643, + "step": 101 + }, + { + "epoch": 0.04822695035460993, + "grad_norm": 3.5461678504943848, + "learning_rate": 4.999999688671929e-06, + "loss": 0.8253, + "step": 102 + }, + { + "epoch": 0.048699763593380616, + "grad_norm": 3.2238264083862305, + "learning_rate": 4.99999929951186e-06, + "loss": 0.7622, + "step": 103 + }, + { + "epoch": 0.0491725768321513, + "grad_norm": 3.818955898284912, + "learning_rate": 4.999998754687795e-06, + "loss": 0.8471, + "step": 104 + }, + { + "epoch": 0.04964539007092199, + "grad_norm": 3.1252424716949463, + "learning_rate": 4.99999805419977e-06, + "loss": 0.8409, + "step": 105 + }, + { + "epoch": 0.050118203309692674, + "grad_norm": 3.604283571243286, + "learning_rate": 4.999997198047828e-06, + "loss": 0.9027, + "step": 106 + }, + { + "epoch": 0.050591016548463354, + "grad_norm": 3.6752424240112305, + "learning_rate": 4.999996186232023e-06, + "loss": 0.9336, + "step": 107 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 3.517557144165039, + "learning_rate": 4.9999950187524184e-06, + "loss": 0.8351, + "step": 108 + }, + { + "epoch": 0.051536643026004726, + "grad_norm": 3.427285671234131, + "learning_rate": 4.999993695609085e-06, + "loss": 0.8457, + "step": 109 + }, + { + "epoch": 0.05200945626477541, + "grad_norm": 3.2792510986328125, + "learning_rate": 4.999992216802107e-06, + "loss": 0.8391, + "step": 110 + }, + { + "epoch": 0.0524822695035461, + "grad_norm": 3.581094741821289, + "learning_rate": 4.999990582331576e-06, + "loss": 0.7533, + "step": 111 + }, + { + "epoch": 0.052955082742316785, + "grad_norm": 3.1667377948760986, + "learning_rate": 4.999988792197593e-06, + "loss": 0.9562, + "step": 112 + }, + { + "epoch": 0.05342789598108747, + "grad_norm": 3.3609890937805176, + "learning_rate": 4.99998684640027e-06, + "loss": 0.8181, + "step": 113 + }, + { + "epoch": 0.05390070921985816, + "grad_norm": 3.260627269744873, + "learning_rate": 4.999984744939729e-06, + "loss": 0.8012, + "step": 114 + }, + { + "epoch": 0.054373522458628844, + "grad_norm": 3.4535653591156006, + "learning_rate": 4.9999824878160985e-06, + "loss": 0.919, + "step": 
115 + }, + { + "epoch": 0.05484633569739953, + "grad_norm": 3.4880740642547607, + "learning_rate": 4.999980075029522e-06, + "loss": 0.8114, + "step": 116 + }, + { + "epoch": 0.05531914893617021, + "grad_norm": 3.2546932697296143, + "learning_rate": 4.999977506580147e-06, + "loss": 0.8274, + "step": 117 + }, + { + "epoch": 0.055791962174940896, + "grad_norm": 3.2762744426727295, + "learning_rate": 4.999974782468136e-06, + "loss": 0.9018, + "step": 118 + }, + { + "epoch": 0.05626477541371158, + "grad_norm": 3.42825984954834, + "learning_rate": 4.999971902693657e-06, + "loss": 0.8262, + "step": 119 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 3.082496404647827, + "learning_rate": 4.99996886725689e-06, + "loss": 0.8181, + "step": 120 + }, + { + "epoch": 0.057210401891252954, + "grad_norm": 3.322869300842285, + "learning_rate": 4.9999656761580225e-06, + "loss": 0.8382, + "step": 121 + }, + { + "epoch": 0.05768321513002364, + "grad_norm": 3.6365339756011963, + "learning_rate": 4.9999623293972555e-06, + "loss": 0.7489, + "step": 122 + }, + { + "epoch": 0.05815602836879433, + "grad_norm": 3.376352548599243, + "learning_rate": 4.999958826974796e-06, + "loss": 0.9012, + "step": 123 + }, + { + "epoch": 0.05862884160756501, + "grad_norm": 3.49088716506958, + "learning_rate": 4.999955168890862e-06, + "loss": 0.8999, + "step": 124 + }, + { + "epoch": 0.0591016548463357, + "grad_norm": 3.3265068531036377, + "learning_rate": 4.999951355145682e-06, + "loss": 0.8161, + "step": 125 + }, + { + "epoch": 0.059574468085106386, + "grad_norm": 3.697282314300537, + "learning_rate": 4.999947385739493e-06, + "loss": 0.9623, + "step": 126 + }, + { + "epoch": 0.06004728132387707, + "grad_norm": 2.7901928424835205, + "learning_rate": 4.999943260672542e-06, + "loss": 0.7371, + "step": 127 + }, + { + "epoch": 0.06052009456264775, + "grad_norm": 3.110319137573242, + "learning_rate": 4.999938979945086e-06, + "loss": 0.715, + "step": 128 + }, + { + "epoch": 0.06099290780141844, + "grad_norm": 3.2211520671844482, + "learning_rate": 4.999934543557392e-06, + "loss": 0.8888, + "step": 129 + }, + { + "epoch": 0.061465721040189124, + "grad_norm": 3.2466187477111816, + "learning_rate": 4.999929951509735e-06, + "loss": 0.9389, + "step": 130 + }, + { + "epoch": 0.06193853427895981, + "grad_norm": 3.3574399948120117, + "learning_rate": 4.999925203802403e-06, + "loss": 0.8263, + "step": 131 + }, + { + "epoch": 0.062411347517730496, + "grad_norm": 3.275601625442505, + "learning_rate": 4.99992030043569e-06, + "loss": 0.8338, + "step": 132 + }, + { + "epoch": 0.06288416075650118, + "grad_norm": 3.6011312007904053, + "learning_rate": 4.999915241409902e-06, + "loss": 0.8351, + "step": 133 + }, + { + "epoch": 0.06335697399527186, + "grad_norm": 2.969011068344116, + "learning_rate": 4.999910026725352e-06, + "loss": 0.79, + "step": 134 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 3.690784454345703, + "learning_rate": 4.999904656382369e-06, + "loss": 0.8209, + "step": 135 + }, + { + "epoch": 0.06430260047281323, + "grad_norm": 3.3363115787506104, + "learning_rate": 4.999899130381283e-06, + "loss": 0.858, + "step": 136 + }, + { + "epoch": 0.06477541371158392, + "grad_norm": 3.206881523132324, + "learning_rate": 4.9998934487224405e-06, + "loss": 0.834, + "step": 137 + }, + { + "epoch": 0.06524822695035461, + "grad_norm": 2.773146152496338, + "learning_rate": 4.999887611406195e-06, + "loss": 0.7576, + "step": 138 + }, + { + "epoch": 0.0657210401891253, + "grad_norm": 3.307725667953491, + "learning_rate": 
4.999881618432908e-06, + "loss": 0.7487, + "step": 139 + }, + { + "epoch": 0.06619385342789598, + "grad_norm": 4.273657321929932, + "learning_rate": 4.999875469802956e-06, + "loss": 0.8176, + "step": 140 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 3.0898005962371826, + "learning_rate": 4.999869165516719e-06, + "loss": 0.7578, + "step": 141 + }, + { + "epoch": 0.06713947990543735, + "grad_norm": 3.25150990486145, + "learning_rate": 4.9998627055745915e-06, + "loss": 0.7873, + "step": 142 + }, + { + "epoch": 0.06761229314420804, + "grad_norm": 2.9705755710601807, + "learning_rate": 4.999856089976974e-06, + "loss": 0.6473, + "step": 143 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 3.5658507347106934, + "learning_rate": 4.9998493187242804e-06, + "loss": 0.855, + "step": 144 + }, + { + "epoch": 0.06855791962174941, + "grad_norm": 3.3994076251983643, + "learning_rate": 4.99984239181693e-06, + "loss": 0.7926, + "step": 145 + }, + { + "epoch": 0.0690307328605201, + "grad_norm": 2.8266260623931885, + "learning_rate": 4.999835309255357e-06, + "loss": 0.7564, + "step": 146 + }, + { + "epoch": 0.06950354609929078, + "grad_norm": 3.1143875122070312, + "learning_rate": 4.999828071039999e-06, + "loss": 0.8398, + "step": 147 + }, + { + "epoch": 0.06997635933806147, + "grad_norm": 2.9364278316497803, + "learning_rate": 4.99982067717131e-06, + "loss": 0.7381, + "step": 148 + }, + { + "epoch": 0.07044917257683216, + "grad_norm": 3.4155616760253906, + "learning_rate": 4.999813127649748e-06, + "loss": 0.7933, + "step": 149 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 4.371236324310303, + "learning_rate": 4.999805422475784e-06, + "loss": 0.8292, + "step": 150 + }, + { + "epoch": 0.07139479905437353, + "grad_norm": 3.3967185020446777, + "learning_rate": 4.999797561649897e-06, + "loss": 0.8712, + "step": 151 + }, + { + "epoch": 0.07186761229314421, + "grad_norm": 3.343303680419922, + "learning_rate": 4.999789545172578e-06, + "loss": 0.8177, + "step": 152 + }, + { + "epoch": 0.07234042553191489, + "grad_norm": 3.040235757827759, + "learning_rate": 4.999781373044325e-06, + "loss": 0.7379, + "step": 153 + }, + { + "epoch": 0.07281323877068557, + "grad_norm": 3.4069204330444336, + "learning_rate": 4.999773045265647e-06, + "loss": 0.7939, + "step": 154 + }, + { + "epoch": 0.07328605200945626, + "grad_norm": 3.1939475536346436, + "learning_rate": 4.999764561837063e-06, + "loss": 0.8037, + "step": 155 + }, + { + "epoch": 0.07375886524822695, + "grad_norm": 4.452004909515381, + "learning_rate": 4.999755922759101e-06, + "loss": 0.8421, + "step": 156 + }, + { + "epoch": 0.07423167848699763, + "grad_norm": 3.2031240463256836, + "learning_rate": 4.999747128032298e-06, + "loss": 0.794, + "step": 157 + }, + { + "epoch": 0.07470449172576832, + "grad_norm": 3.175920009613037, + "learning_rate": 4.999738177657203e-06, + "loss": 0.759, + "step": 158 + }, + { + "epoch": 0.075177304964539, + "grad_norm": 3.7679688930511475, + "learning_rate": 4.9997290716343725e-06, + "loss": 0.8174, + "step": 159 + }, + { + "epoch": 0.07565011820330969, + "grad_norm": 3.7020037174224854, + "learning_rate": 4.999719809964373e-06, + "loss": 0.7116, + "step": 160 + }, + { + "epoch": 0.07612293144208038, + "grad_norm": 4.357471942901611, + "learning_rate": 4.999710392647783e-06, + "loss": 0.7649, + "step": 161 + }, + { + "epoch": 0.07659574468085106, + "grad_norm": 3.3439087867736816, + "learning_rate": 4.999700819685187e-06, + "loss": 0.7907, + "step": 162 + }, + { + "epoch": 0.07706855791962175, + "grad_norm": 
3.210815191268921, + "learning_rate": 4.999691091077182e-06, + "loss": 0.8446, + "step": 163 + }, + { + "epoch": 0.07754137115839244, + "grad_norm": 3.1029553413391113, + "learning_rate": 4.9996812068243735e-06, + "loss": 0.7232, + "step": 164 + }, + { + "epoch": 0.07801418439716312, + "grad_norm": 2.9389400482177734, + "learning_rate": 4.999671166927378e-06, + "loss": 0.7413, + "step": 165 + }, + { + "epoch": 0.07848699763593381, + "grad_norm": 3.7062697410583496, + "learning_rate": 4.9996609713868185e-06, + "loss": 0.8773, + "step": 166 + }, + { + "epoch": 0.0789598108747045, + "grad_norm": 3.2768924236297607, + "learning_rate": 4.999650620203332e-06, + "loss": 0.8046, + "step": 167 + }, + { + "epoch": 0.07943262411347518, + "grad_norm": 3.380373001098633, + "learning_rate": 4.999640113377561e-06, + "loss": 0.7529, + "step": 168 + }, + { + "epoch": 0.07990543735224587, + "grad_norm": 3.520022392272949, + "learning_rate": 4.999629450910162e-06, + "loss": 0.7352, + "step": 169 + }, + { + "epoch": 0.08037825059101655, + "grad_norm": 3.43269419670105, + "learning_rate": 4.999618632801796e-06, + "loss": 0.9371, + "step": 170 + }, + { + "epoch": 0.08085106382978724, + "grad_norm": 3.555877923965454, + "learning_rate": 4.99960765905314e-06, + "loss": 0.8276, + "step": 171 + }, + { + "epoch": 0.08132387706855793, + "grad_norm": 3.597050189971924, + "learning_rate": 4.999596529664874e-06, + "loss": 0.8164, + "step": 172 + }, + { + "epoch": 0.0817966903073286, + "grad_norm": 3.2002956867218018, + "learning_rate": 4.999585244637693e-06, + "loss": 0.7824, + "step": 173 + }, + { + "epoch": 0.08226950354609928, + "grad_norm": 3.527275562286377, + "learning_rate": 4.999573803972299e-06, + "loss": 0.8033, + "step": 174 + }, + { + "epoch": 0.08274231678486997, + "grad_norm": 3.5184452533721924, + "learning_rate": 4.999562207669405e-06, + "loss": 0.724, + "step": 175 + }, + { + "epoch": 0.08321513002364066, + "grad_norm": 3.6635067462921143, + "learning_rate": 4.999550455729732e-06, + "loss": 0.819, + "step": 176 + }, + { + "epoch": 0.08368794326241134, + "grad_norm": 3.192399740219116, + "learning_rate": 4.999538548154012e-06, + "loss": 0.7999, + "step": 177 + }, + { + "epoch": 0.08416075650118203, + "grad_norm": 3.0946953296661377, + "learning_rate": 4.999526484942988e-06, + "loss": 0.7367, + "step": 178 + }, + { + "epoch": 0.08463356973995272, + "grad_norm": 2.847198009490967, + "learning_rate": 4.99951426609741e-06, + "loss": 0.7536, + "step": 179 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.7674827575683594, + "learning_rate": 4.999501891618037e-06, + "loss": 0.701, + "step": 180 + }, + { + "epoch": 0.08557919621749409, + "grad_norm": 3.357933521270752, + "learning_rate": 4.999489361505643e-06, + "loss": 0.8331, + "step": 181 + }, + { + "epoch": 0.08605200945626477, + "grad_norm": 3.1464426517486572, + "learning_rate": 4.999476675761004e-06, + "loss": 0.7931, + "step": 182 + }, + { + "epoch": 0.08652482269503546, + "grad_norm": 3.310697078704834, + "learning_rate": 4.999463834384915e-06, + "loss": 0.753, + "step": 183 + }, + { + "epoch": 0.08699763593380615, + "grad_norm": 2.9794881343841553, + "learning_rate": 4.999450837378171e-06, + "loss": 0.7091, + "step": 184 + }, + { + "epoch": 0.08747044917257683, + "grad_norm": 3.0776889324188232, + "learning_rate": 4.999437684741584e-06, + "loss": 0.7226, + "step": 185 + }, + { + "epoch": 0.08794326241134752, + "grad_norm": 3.6657519340515137, + "learning_rate": 4.999424376475972e-06, + "loss": 0.845, + "step": 186 + }, + { + "epoch": 
0.0884160756501182, + "grad_norm": 3.872718572616577, + "learning_rate": 4.999410912582164e-06, + "loss": 0.812, + "step": 187 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.9184508323669434, + "learning_rate": 4.9993972930609976e-06, + "loss": 0.6823, + "step": 188 + }, + { + "epoch": 0.08936170212765958, + "grad_norm": 3.5567142963409424, + "learning_rate": 4.999383517913321e-06, + "loss": 0.7614, + "step": 189 + }, + { + "epoch": 0.08983451536643026, + "grad_norm": 3.3688533306121826, + "learning_rate": 4.999369587139992e-06, + "loss": 0.858, + "step": 190 + }, + { + "epoch": 0.09030732860520095, + "grad_norm": 2.893223524093628, + "learning_rate": 4.99935550074188e-06, + "loss": 0.6761, + "step": 191 + }, + { + "epoch": 0.09078014184397164, + "grad_norm": 3.400225877761841, + "learning_rate": 4.999341258719859e-06, + "loss": 0.7531, + "step": 192 + }, + { + "epoch": 0.09125295508274232, + "grad_norm": 3.6167714595794678, + "learning_rate": 4.999326861074817e-06, + "loss": 0.8164, + "step": 193 + }, + { + "epoch": 0.091725768321513, + "grad_norm": 4.325016498565674, + "learning_rate": 4.9993123078076506e-06, + "loss": 0.7069, + "step": 194 + }, + { + "epoch": 0.09219858156028368, + "grad_norm": 3.195317029953003, + "learning_rate": 4.999297598919266e-06, + "loss": 0.726, + "step": 195 + }, + { + "epoch": 0.09267139479905437, + "grad_norm": 3.146530866622925, + "learning_rate": 4.999282734410579e-06, + "loss": 0.7888, + "step": 196 + }, + { + "epoch": 0.09314420803782505, + "grad_norm": 3.5166752338409424, + "learning_rate": 4.999267714282515e-06, + "loss": 0.8473, + "step": 197 + }, + { + "epoch": 0.09361702127659574, + "grad_norm": 3.3140196800231934, + "learning_rate": 4.99925253853601e-06, + "loss": 0.7233, + "step": 198 + }, + { + "epoch": 0.09408983451536643, + "grad_norm": 3.0318164825439453, + "learning_rate": 4.999237207172008e-06, + "loss": 0.7543, + "step": 199 + }, + { + "epoch": 0.09456264775413711, + "grad_norm": 3.662214756011963, + "learning_rate": 4.999221720191464e-06, + "loss": 0.7783, + "step": 200 + }, + { + "epoch": 0.0950354609929078, + "grad_norm": 3.452078104019165, + "learning_rate": 4.9992060775953425e-06, + "loss": 0.7868, + "step": 201 + }, + { + "epoch": 0.09550827423167849, + "grad_norm": 3.4051287174224854, + "learning_rate": 4.999190279384617e-06, + "loss": 0.7849, + "step": 202 + }, + { + "epoch": 0.09598108747044917, + "grad_norm": 3.1377196311950684, + "learning_rate": 4.999174325560271e-06, + "loss": 0.8364, + "step": 203 + }, + { + "epoch": 0.09645390070921986, + "grad_norm": 3.129473924636841, + "learning_rate": 4.999158216123299e-06, + "loss": 0.7458, + "step": 204 + }, + { + "epoch": 0.09692671394799054, + "grad_norm": 3.169548749923706, + "learning_rate": 4.999141951074703e-06, + "loss": 0.7256, + "step": 205 + }, + { + "epoch": 0.09739952718676123, + "grad_norm": 3.186009168624878, + "learning_rate": 4.999125530415495e-06, + "loss": 0.783, + "step": 206 + }, + { + "epoch": 0.09787234042553192, + "grad_norm": 3.0995123386383057, + "learning_rate": 4.9991089541467e-06, + "loss": 0.7519, + "step": 207 + }, + { + "epoch": 0.0983451536643026, + "grad_norm": 3.1854088306427, + "learning_rate": 4.999092222269348e-06, + "loss": 0.7444, + "step": 208 + }, + { + "epoch": 0.09881796690307329, + "grad_norm": 3.1512246131896973, + "learning_rate": 4.999075334784482e-06, + "loss": 0.7882, + "step": 209 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 3.6199698448181152, + "learning_rate": 4.999058291693153e-06, + "loss": 0.8048, + "step": 
210 + }, + { + "epoch": 0.09976359338061466, + "grad_norm": 2.956907272338867, + "learning_rate": 4.999041092996422e-06, + "loss": 0.7663, + "step": 211 + }, + { + "epoch": 0.10023640661938535, + "grad_norm": 3.3493971824645996, + "learning_rate": 4.99902373869536e-06, + "loss": 0.7639, + "step": 212 + }, + { + "epoch": 0.10070921985815603, + "grad_norm": 3.144812822341919, + "learning_rate": 4.9990062287910475e-06, + "loss": 0.7953, + "step": 213 + }, + { + "epoch": 0.10118203309692671, + "grad_norm": 3.5986971855163574, + "learning_rate": 4.998988563284576e-06, + "loss": 0.8297, + "step": 214 + }, + { + "epoch": 0.1016548463356974, + "grad_norm": 3.447584867477417, + "learning_rate": 4.998970742177044e-06, + "loss": 0.808, + "step": 215 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 3.791353940963745, + "learning_rate": 4.998952765469562e-06, + "loss": 0.8005, + "step": 216 + }, + { + "epoch": 0.10260047281323877, + "grad_norm": 3.4490807056427, + "learning_rate": 4.998934633163247e-06, + "loss": 0.8135, + "step": 217 + }, + { + "epoch": 0.10307328605200945, + "grad_norm": 3.1053314208984375, + "learning_rate": 4.998916345259232e-06, + "loss": 0.7888, + "step": 218 + }, + { + "epoch": 0.10354609929078014, + "grad_norm": 3.407862663269043, + "learning_rate": 4.9988979017586514e-06, + "loss": 0.7099, + "step": 219 + }, + { + "epoch": 0.10401891252955082, + "grad_norm": 3.116656541824341, + "learning_rate": 4.998879302662658e-06, + "loss": 0.8344, + "step": 220 + }, + { + "epoch": 0.10449172576832151, + "grad_norm": 3.339264154434204, + "learning_rate": 4.998860547972406e-06, + "loss": 0.8496, + "step": 221 + }, + { + "epoch": 0.1049645390070922, + "grad_norm": 3.251892566680908, + "learning_rate": 4.998841637689066e-06, + "loss": 0.7455, + "step": 222 + }, + { + "epoch": 0.10543735224586288, + "grad_norm": 4.098135471343994, + "learning_rate": 4.998822571813814e-06, + "loss": 0.7772, + "step": 223 + }, + { + "epoch": 0.10591016548463357, + "grad_norm": 3.9871134757995605, + "learning_rate": 4.998803350347837e-06, + "loss": 0.8261, + "step": 224 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 3.2822303771972656, + "learning_rate": 4.998783973292333e-06, + "loss": 0.8623, + "step": 225 + }, + { + "epoch": 0.10685579196217494, + "grad_norm": 3.0356857776641846, + "learning_rate": 4.998764440648507e-06, + "loss": 0.7426, + "step": 226 + }, + { + "epoch": 0.10732860520094563, + "grad_norm": 2.8932785987854004, + "learning_rate": 4.998744752417576e-06, + "loss": 0.6741, + "step": 227 + }, + { + "epoch": 0.10780141843971631, + "grad_norm": 3.085820436477661, + "learning_rate": 4.998724908600767e-06, + "loss": 0.6549, + "step": 228 + }, + { + "epoch": 0.108274231678487, + "grad_norm": 3.135829210281372, + "learning_rate": 4.998704909199314e-06, + "loss": 0.6702, + "step": 229 + }, + { + "epoch": 0.10874704491725769, + "grad_norm": 5.016134262084961, + "learning_rate": 4.9986847542144625e-06, + "loss": 0.7852, + "step": 230 + }, + { + "epoch": 0.10921985815602837, + "grad_norm": 3.9056200981140137, + "learning_rate": 4.998664443647468e-06, + "loss": 0.9654, + "step": 231 + }, + { + "epoch": 0.10969267139479906, + "grad_norm": 3.0880749225616455, + "learning_rate": 4.998643977499595e-06, + "loss": 0.7579, + "step": 232 + }, + { + "epoch": 0.11016548463356975, + "grad_norm": 3.6893601417541504, + "learning_rate": 4.998623355772118e-06, + "loss": 0.713, + "step": 233 + }, + { + "epoch": 0.11063829787234042, + "grad_norm": 4.181536674499512, + "learning_rate": 4.998602578466319e-06, + 
"loss": 0.7331, + "step": 234 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 3.036386728286743, + "learning_rate": 4.998581645583496e-06, + "loss": 0.7115, + "step": 235 + }, + { + "epoch": 0.11158392434988179, + "grad_norm": 3.6333255767822266, + "learning_rate": 4.998560557124948e-06, + "loss": 0.7544, + "step": 236 + }, + { + "epoch": 0.11205673758865248, + "grad_norm": 2.926417827606201, + "learning_rate": 4.9985393130919915e-06, + "loss": 0.715, + "step": 237 + }, + { + "epoch": 0.11252955082742316, + "grad_norm": 2.969158172607422, + "learning_rate": 4.998517913485946e-06, + "loss": 0.7304, + "step": 238 + }, + { + "epoch": 0.11300236406619385, + "grad_norm": 3.5254971981048584, + "learning_rate": 4.9984963583081466e-06, + "loss": 0.7725, + "step": 239 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 3.7840335369110107, + "learning_rate": 4.998474647559936e-06, + "loss": 0.8685, + "step": 240 + }, + { + "epoch": 0.11394799054373522, + "grad_norm": 3.0333125591278076, + "learning_rate": 4.9984527812426625e-06, + "loss": 0.7793, + "step": 241 + }, + { + "epoch": 0.11442080378250591, + "grad_norm": 3.290159225463867, + "learning_rate": 4.99843075935769e-06, + "loss": 0.7158, + "step": 242 + }, + { + "epoch": 0.1148936170212766, + "grad_norm": 3.3935494422912598, + "learning_rate": 4.99840858190639e-06, + "loss": 0.7643, + "step": 243 + }, + { + "epoch": 0.11536643026004728, + "grad_norm": 3.333965539932251, + "learning_rate": 4.998386248890142e-06, + "loss": 0.7255, + "step": 244 + }, + { + "epoch": 0.11583924349881797, + "grad_norm": 2.8129613399505615, + "learning_rate": 4.998363760310339e-06, + "loss": 0.768, + "step": 245 + }, + { + "epoch": 0.11631205673758865, + "grad_norm": 2.8678107261657715, + "learning_rate": 4.998341116168378e-06, + "loss": 0.7403, + "step": 246 + }, + { + "epoch": 0.11678486997635934, + "grad_norm": 2.8898239135742188, + "learning_rate": 4.998318316465672e-06, + "loss": 0.6844, + "step": 247 + }, + { + "epoch": 0.11725768321513003, + "grad_norm": 3.139777898788452, + "learning_rate": 4.998295361203637e-06, + "loss": 0.7936, + "step": 248 + }, + { + "epoch": 0.11773049645390071, + "grad_norm": 3.393721103668213, + "learning_rate": 4.998272250383707e-06, + "loss": 0.8173, + "step": 249 + }, + { + "epoch": 0.1182033096926714, + "grad_norm": 3.240973949432373, + "learning_rate": 4.998248984007318e-06, + "loss": 0.8252, + "step": 250 + }, + { + "epoch": 0.11867612293144209, + "grad_norm": 3.384855031967163, + "learning_rate": 4.998225562075918e-06, + "loss": 0.7244, + "step": 251 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 3.1881816387176514, + "learning_rate": 4.9982019845909675e-06, + "loss": 0.6818, + "step": 252 + }, + { + "epoch": 0.11962174940898346, + "grad_norm": 2.888364553451538, + "learning_rate": 4.998178251553934e-06, + "loss": 0.6753, + "step": 253 + }, + { + "epoch": 0.12009456264775414, + "grad_norm": 3.630093812942505, + "learning_rate": 4.9981543629662944e-06, + "loss": 0.7995, + "step": 254 + }, + { + "epoch": 0.12056737588652482, + "grad_norm": 2.9820947647094727, + "learning_rate": 4.998130318829537e-06, + "loss": 0.7478, + "step": 255 + }, + { + "epoch": 0.1210401891252955, + "grad_norm": 2.7094738483428955, + "learning_rate": 4.998106119145159e-06, + "loss": 0.7237, + "step": 256 + }, + { + "epoch": 0.12151300236406619, + "grad_norm": 3.1808104515075684, + "learning_rate": 4.9980817639146665e-06, + "loss": 0.7915, + "step": 257 + }, + { + "epoch": 0.12198581560283688, + "grad_norm": 3.1661291122436523, + 
"learning_rate": 4.998057253139575e-06, + "loss": 0.8053, + "step": 258 + }, + { + "epoch": 0.12245862884160756, + "grad_norm": 3.528749942779541, + "learning_rate": 4.998032586821413e-06, + "loss": 0.7946, + "step": 259 + }, + { + "epoch": 0.12293144208037825, + "grad_norm": 3.125964879989624, + "learning_rate": 4.998007764961716e-06, + "loss": 0.7569, + "step": 260 + }, + { + "epoch": 0.12340425531914893, + "grad_norm": 3.0778942108154297, + "learning_rate": 4.997982787562029e-06, + "loss": 0.7184, + "step": 261 + }, + { + "epoch": 0.12387706855791962, + "grad_norm": 3.3531930446624756, + "learning_rate": 4.997957654623906e-06, + "loss": 0.7586, + "step": 262 + }, + { + "epoch": 0.1243498817966903, + "grad_norm": 3.229278564453125, + "learning_rate": 4.997932366148913e-06, + "loss": 0.6092, + "step": 263 + }, + { + "epoch": 0.12482269503546099, + "grad_norm": 3.7286155223846436, + "learning_rate": 4.997906922138626e-06, + "loss": 0.7965, + "step": 264 + }, + { + "epoch": 0.12529550827423167, + "grad_norm": 3.300311803817749, + "learning_rate": 4.997881322594628e-06, + "loss": 0.7665, + "step": 265 + }, + { + "epoch": 0.12576832151300235, + "grad_norm": 3.411482572555542, + "learning_rate": 4.9978555675185115e-06, + "loss": 0.7253, + "step": 266 + }, + { + "epoch": 0.12624113475177304, + "grad_norm": 3.0884511470794678, + "learning_rate": 4.9978296569118825e-06, + "loss": 0.659, + "step": 267 + }, + { + "epoch": 0.12671394799054372, + "grad_norm": 3.0652925968170166, + "learning_rate": 4.9978035907763535e-06, + "loss": 0.6739, + "step": 268 + }, + { + "epoch": 0.1271867612293144, + "grad_norm": 3.280555009841919, + "learning_rate": 4.997777369113547e-06, + "loss": 0.8003, + "step": 269 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 2.980860948562622, + "learning_rate": 4.997750991925096e-06, + "loss": 0.7097, + "step": 270 + }, + { + "epoch": 0.12813238770685578, + "grad_norm": 3.301760673522949, + "learning_rate": 4.997724459212644e-06, + "loss": 0.7894, + "step": 271 + }, + { + "epoch": 0.12860520094562647, + "grad_norm": 2.9584903717041016, + "learning_rate": 4.997697770977841e-06, + "loss": 0.733, + "step": 272 + }, + { + "epoch": 0.12907801418439716, + "grad_norm": 3.5632214546203613, + "learning_rate": 4.99767092722235e-06, + "loss": 0.7228, + "step": 273 + }, + { + "epoch": 0.12955082742316784, + "grad_norm": 3.5900983810424805, + "learning_rate": 4.997643927947843e-06, + "loss": 0.7634, + "step": 274 + }, + { + "epoch": 0.13002364066193853, + "grad_norm": 3.332650661468506, + "learning_rate": 4.997616773156e-06, + "loss": 0.797, + "step": 275 + }, + { + "epoch": 0.13049645390070921, + "grad_norm": 3.1094167232513428, + "learning_rate": 4.997589462848512e-06, + "loss": 0.7849, + "step": 276 + }, + { + "epoch": 0.1309692671394799, + "grad_norm": 3.5359463691711426, + "learning_rate": 4.99756199702708e-06, + "loss": 0.6871, + "step": 277 + }, + { + "epoch": 0.1314420803782506, + "grad_norm": 3.190441846847534, + "learning_rate": 4.997534375693414e-06, + "loss": 0.6883, + "step": 278 + }, + { + "epoch": 0.13191489361702127, + "grad_norm": 3.063518762588501, + "learning_rate": 4.997506598849234e-06, + "loss": 0.7586, + "step": 279 + }, + { + "epoch": 0.13238770685579196, + "grad_norm": 3.4112050533294678, + "learning_rate": 4.997478666496269e-06, + "loss": 0.796, + "step": 280 + }, + { + "epoch": 0.13286052009456265, + "grad_norm": 3.231886386871338, + "learning_rate": 4.997450578636259e-06, + "loss": 0.7714, + "step": 281 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 
3.279425621032715, + "learning_rate": 4.9974223352709515e-06, + "loss": 0.7793, + "step": 282 + }, + { + "epoch": 0.13380614657210402, + "grad_norm": 3.2154316902160645, + "learning_rate": 4.9973939364021075e-06, + "loss": 0.791, + "step": 283 + }, + { + "epoch": 0.1342789598108747, + "grad_norm": 3.2090768814086914, + "learning_rate": 4.9973653820314925e-06, + "loss": 0.6433, + "step": 284 + }, + { + "epoch": 0.1347517730496454, + "grad_norm": 3.1712026596069336, + "learning_rate": 4.997336672160886e-06, + "loss": 0.8128, + "step": 285 + }, + { + "epoch": 0.13522458628841608, + "grad_norm": 2.929229497909546, + "learning_rate": 4.997307806792076e-06, + "loss": 0.7594, + "step": 286 + }, + { + "epoch": 0.13569739952718676, + "grad_norm": 3.0363314151763916, + "learning_rate": 4.997278785926859e-06, + "loss": 0.7336, + "step": 287 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 3.1352357864379883, + "learning_rate": 4.997249609567042e-06, + "loss": 0.7225, + "step": 288 + }, + { + "epoch": 0.13664302600472814, + "grad_norm": 3.3171157836914062, + "learning_rate": 4.997220277714442e-06, + "loss": 0.7777, + "step": 289 + }, + { + "epoch": 0.13711583924349882, + "grad_norm": 3.050717353820801, + "learning_rate": 4.997190790370885e-06, + "loss": 0.6836, + "step": 290 + }, + { + "epoch": 0.1375886524822695, + "grad_norm": 3.0297694206237793, + "learning_rate": 4.997161147538208e-06, + "loss": 0.6883, + "step": 291 + }, + { + "epoch": 0.1380614657210402, + "grad_norm": 3.0566554069519043, + "learning_rate": 4.997131349218256e-06, + "loss": 0.6674, + "step": 292 + }, + { + "epoch": 0.13853427895981088, + "grad_norm": 3.799111843109131, + "learning_rate": 4.997101395412885e-06, + "loss": 0.8256, + "step": 293 + }, + { + "epoch": 0.13900709219858157, + "grad_norm": 3.1394248008728027, + "learning_rate": 4.9970712861239576e-06, + "loss": 0.7306, + "step": 294 + }, + { + "epoch": 0.13947990543735225, + "grad_norm": 3.0605666637420654, + "learning_rate": 4.997041021353352e-06, + "loss": 0.7212, + "step": 295 + }, + { + "epoch": 0.13995271867612294, + "grad_norm": 3.8813397884368896, + "learning_rate": 4.997010601102951e-06, + "loss": 0.769, + "step": 296 + }, + { + "epoch": 0.14042553191489363, + "grad_norm": 3.0514819622039795, + "learning_rate": 4.996980025374649e-06, + "loss": 0.7422, + "step": 297 + }, + { + "epoch": 0.1408983451536643, + "grad_norm": 2.9544146060943604, + "learning_rate": 4.99694929417035e-06, + "loss": 0.6912, + "step": 298 + }, + { + "epoch": 0.141371158392435, + "grad_norm": 3.2635602951049805, + "learning_rate": 4.996918407491966e-06, + "loss": 0.7395, + "step": 299 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 3.373882532119751, + "learning_rate": 4.996887365341423e-06, + "loss": 0.7799, + "step": 300 + }, + { + "epoch": 0.14231678486997637, + "grad_norm": 3.001128673553467, + "learning_rate": 4.996856167720652e-06, + "loss": 0.7168, + "step": 301 + }, + { + "epoch": 0.14278959810874706, + "grad_norm": 3.1026835441589355, + "learning_rate": 4.996824814631595e-06, + "loss": 0.7492, + "step": 302 + }, + { + "epoch": 0.14326241134751774, + "grad_norm": 3.41947603225708, + "learning_rate": 4.996793306076205e-06, + "loss": 0.6659, + "step": 303 + }, + { + "epoch": 0.14373522458628843, + "grad_norm": 3.2272400856018066, + "learning_rate": 4.996761642056444e-06, + "loss": 0.7184, + "step": 304 + }, + { + "epoch": 0.14420803782505912, + "grad_norm": 2.9488935470581055, + "learning_rate": 4.996729822574284e-06, + "loss": 0.7451, + "step": 305 + }, + { + "epoch": 
0.14468085106382977, + "grad_norm": 3.268231153488159, + "learning_rate": 4.9966978476317065e-06, + "loss": 0.7798, + "step": 306 + }, + { + "epoch": 0.14515366430260046, + "grad_norm": 3.9086556434631348, + "learning_rate": 4.996665717230701e-06, + "loss": 0.7871, + "step": 307 + }, + { + "epoch": 0.14562647754137115, + "grad_norm": 3.3483879566192627, + "learning_rate": 4.996633431373269e-06, + "loss": 0.7415, + "step": 308 + }, + { + "epoch": 0.14609929078014183, + "grad_norm": 2.839400053024292, + "learning_rate": 4.99660099006142e-06, + "loss": 0.7192, + "step": 309 + }, + { + "epoch": 0.14657210401891252, + "grad_norm": 3.177302598953247, + "learning_rate": 4.996568393297175e-06, + "loss": 0.755, + "step": 310 + }, + { + "epoch": 0.1470449172576832, + "grad_norm": 3.5477044582366943, + "learning_rate": 4.996535641082563e-06, + "loss": 0.7531, + "step": 311 + }, + { + "epoch": 0.1475177304964539, + "grad_norm": 3.418576717376709, + "learning_rate": 4.996502733419624e-06, + "loss": 0.8009, + "step": 312 + }, + { + "epoch": 0.14799054373522458, + "grad_norm": 3.711341619491577, + "learning_rate": 4.996469670310407e-06, + "loss": 0.7362, + "step": 313 + }, + { + "epoch": 0.14846335697399526, + "grad_norm": 3.2419373989105225, + "learning_rate": 4.99643645175697e-06, + "loss": 0.7761, + "step": 314 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 3.121858835220337, + "learning_rate": 4.996403077761381e-06, + "loss": 0.6495, + "step": 315 + }, + { + "epoch": 0.14940898345153664, + "grad_norm": 3.123054265975952, + "learning_rate": 4.996369548325719e-06, + "loss": 0.7444, + "step": 316 + }, + { + "epoch": 0.14988179669030732, + "grad_norm": 2.780880928039551, + "learning_rate": 4.996335863452072e-06, + "loss": 0.672, + "step": 317 + }, + { + "epoch": 0.150354609929078, + "grad_norm": 3.3738629817962646, + "learning_rate": 4.996302023142536e-06, + "loss": 0.7972, + "step": 318 + }, + { + "epoch": 0.1508274231678487, + "grad_norm": 3.4874777793884277, + "learning_rate": 4.99626802739922e-06, + "loss": 0.8252, + "step": 319 + }, + { + "epoch": 0.15130023640661938, + "grad_norm": 3.7074787616729736, + "learning_rate": 4.9962338762242395e-06, + "loss": 0.8216, + "step": 320 + }, + { + "epoch": 0.15177304964539007, + "grad_norm": 3.281912326812744, + "learning_rate": 4.996199569619721e-06, + "loss": 0.8175, + "step": 321 + }, + { + "epoch": 0.15224586288416075, + "grad_norm": 2.9485340118408203, + "learning_rate": 4.996165107587801e-06, + "loss": 0.707, + "step": 322 + }, + { + "epoch": 0.15271867612293144, + "grad_norm": 3.3757646083831787, + "learning_rate": 4.996130490130625e-06, + "loss": 0.7955, + "step": 323 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 2.962181568145752, + "learning_rate": 4.996095717250349e-06, + "loss": 0.7067, + "step": 324 + }, + { + "epoch": 0.1536643026004728, + "grad_norm": 3.114272356033325, + "learning_rate": 4.996060788949136e-06, + "loss": 0.7486, + "step": 325 + }, + { + "epoch": 0.1541371158392435, + "grad_norm": 3.0621590614318848, + "learning_rate": 4.996025705229165e-06, + "loss": 0.6547, + "step": 326 + }, + { + "epoch": 0.15460992907801419, + "grad_norm": 2.8745882511138916, + "learning_rate": 4.995990466092616e-06, + "loss": 0.6435, + "step": 327 + }, + { + "epoch": 0.15508274231678487, + "grad_norm": 2.90841007232666, + "learning_rate": 4.995955071541686e-06, + "loss": 0.7331, + "step": 328 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 2.694580316543579, + "learning_rate": 4.9959195215785784e-06, + "loss": 0.6731, + "step": 
329 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 3.158083438873291, + "learning_rate": 4.995883816205507e-06, + "loss": 0.7257, + "step": 330 + }, + { + "epoch": 0.15650118203309693, + "grad_norm": 3.3234715461730957, + "learning_rate": 4.995847955424694e-06, + "loss": 0.7389, + "step": 331 + }, + { + "epoch": 0.15697399527186762, + "grad_norm": 2.9406495094299316, + "learning_rate": 4.995811939238373e-06, + "loss": 0.643, + "step": 332 + }, + { + "epoch": 0.1574468085106383, + "grad_norm": 3.3191726207733154, + "learning_rate": 4.995775767648785e-06, + "loss": 0.7879, + "step": 333 + }, + { + "epoch": 0.157919621749409, + "grad_norm": 3.711925745010376, + "learning_rate": 4.995739440658185e-06, + "loss": 0.7586, + "step": 334 + }, + { + "epoch": 0.15839243498817968, + "grad_norm": 9.573421478271484, + "learning_rate": 4.995702958268833e-06, + "loss": 0.7842, + "step": 335 + }, + { + "epoch": 0.15886524822695036, + "grad_norm": 3.4154508113861084, + "learning_rate": 4.995666320483001e-06, + "loss": 0.6735, + "step": 336 + }, + { + "epoch": 0.15933806146572105, + "grad_norm": 3.4169859886169434, + "learning_rate": 4.995629527302971e-06, + "loss": 0.741, + "step": 337 + }, + { + "epoch": 0.15981087470449173, + "grad_norm": 3.287503242492676, + "learning_rate": 4.9955925787310335e-06, + "loss": 0.7139, + "step": 338 + }, + { + "epoch": 0.16028368794326242, + "grad_norm": 3.288409471511841, + "learning_rate": 4.995555474769488e-06, + "loss": 0.7636, + "step": 339 + }, + { + "epoch": 0.1607565011820331, + "grad_norm": 2.8021693229675293, + "learning_rate": 4.995518215420646e-06, + "loss": 0.5883, + "step": 340 + }, + { + "epoch": 0.1612293144208038, + "grad_norm": 2.7038564682006836, + "learning_rate": 4.995480800686827e-06, + "loss": 0.657, + "step": 341 + }, + { + "epoch": 0.16170212765957448, + "grad_norm": 3.2370235919952393, + "learning_rate": 4.9954432305703615e-06, + "loss": 0.6999, + "step": 342 + }, + { + "epoch": 0.16217494089834517, + "grad_norm": 2.8666412830352783, + "learning_rate": 4.995405505073588e-06, + "loss": 0.7199, + "step": 343 + }, + { + "epoch": 0.16264775413711585, + "grad_norm": 3.6467232704162598, + "learning_rate": 4.995367624198856e-06, + "loss": 0.7317, + "step": 344 + }, + { + "epoch": 0.16312056737588654, + "grad_norm": 2.7576327323913574, + "learning_rate": 4.9953295879485246e-06, + "loss": 0.647, + "step": 345 + }, + { + "epoch": 0.1635933806146572, + "grad_norm": 2.922232151031494, + "learning_rate": 4.995291396324959e-06, + "loss": 0.6686, + "step": 346 + }, + { + "epoch": 0.16406619385342788, + "grad_norm": 2.8693501949310303, + "learning_rate": 4.995253049330542e-06, + "loss": 0.6756, + "step": 347 + }, + { + "epoch": 0.16453900709219857, + "grad_norm": 3.671865701675415, + "learning_rate": 4.995214546967658e-06, + "loss": 0.7347, + "step": 348 + }, + { + "epoch": 0.16501182033096926, + "grad_norm": 3.024219274520874, + "learning_rate": 4.995175889238706e-06, + "loss": 0.7547, + "step": 349 + }, + { + "epoch": 0.16548463356973994, + "grad_norm": 2.8470778465270996, + "learning_rate": 4.995137076146091e-06, + "loss": 0.6764, + "step": 350 + }, + { + "epoch": 0.16595744680851063, + "grad_norm": 2.905057907104492, + "learning_rate": 4.9950981076922324e-06, + "loss": 0.6814, + "step": 351 + }, + { + "epoch": 0.16643026004728131, + "grad_norm": 3.504377841949463, + "learning_rate": 4.995058983879555e-06, + "loss": 0.7145, + "step": 352 + }, + { + "epoch": 0.166903073286052, + "grad_norm": 3.0029661655426025, + "learning_rate": 4.995019704710495e-06, 
+ "loss": 0.7114, + "step": 353 + }, + { + "epoch": 0.1673758865248227, + "grad_norm": 2.8666274547576904, + "learning_rate": 4.994980270187499e-06, + "loss": 0.7416, + "step": 354 + }, + { + "epoch": 0.16784869976359337, + "grad_norm": 3.1644718647003174, + "learning_rate": 4.994940680313021e-06, + "loss": 0.661, + "step": 355 + }, + { + "epoch": 0.16832151300236406, + "grad_norm": 3.050391674041748, + "learning_rate": 4.994900935089527e-06, + "loss": 0.7243, + "step": 356 + }, + { + "epoch": 0.16879432624113475, + "grad_norm": 2.985466480255127, + "learning_rate": 4.994861034519491e-06, + "loss": 0.6917, + "step": 357 + }, + { + "epoch": 0.16926713947990543, + "grad_norm": 2.909342050552368, + "learning_rate": 4.9948209786053995e-06, + "loss": 0.6636, + "step": 358 + }, + { + "epoch": 0.16973995271867612, + "grad_norm": 3.2214784622192383, + "learning_rate": 4.9947807673497435e-06, + "loss": 0.7903, + "step": 359 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 2.5654983520507812, + "learning_rate": 4.994740400755029e-06, + "loss": 0.6129, + "step": 360 + }, + { + "epoch": 0.1706855791962175, + "grad_norm": 3.775646448135376, + "learning_rate": 4.99469987882377e-06, + "loss": 0.7145, + "step": 361 + }, + { + "epoch": 0.17115839243498818, + "grad_norm": 2.8965413570404053, + "learning_rate": 4.994659201558487e-06, + "loss": 0.7177, + "step": 362 + }, + { + "epoch": 0.17163120567375886, + "grad_norm": 3.485597848892212, + "learning_rate": 4.9946183689617146e-06, + "loss": 0.8107, + "step": 363 + }, + { + "epoch": 0.17210401891252955, + "grad_norm": 3.277839183807373, + "learning_rate": 4.994577381035995e-06, + "loss": 0.691, + "step": 364 + }, + { + "epoch": 0.17257683215130024, + "grad_norm": 2.8807685375213623, + "learning_rate": 4.99453623778388e-06, + "loss": 0.7627, + "step": 365 + }, + { + "epoch": 0.17304964539007092, + "grad_norm": 3.0659940242767334, + "learning_rate": 4.994494939207932e-06, + "loss": 0.6858, + "step": 366 + }, + { + "epoch": 0.1735224586288416, + "grad_norm": 3.0881855487823486, + "learning_rate": 4.994453485310723e-06, + "loss": 0.8212, + "step": 367 + }, + { + "epoch": 0.1739952718676123, + "grad_norm": 2.7199201583862305, + "learning_rate": 4.994411876094832e-06, + "loss": 0.6516, + "step": 368 + }, + { + "epoch": 0.17446808510638298, + "grad_norm": 2.955889940261841, + "learning_rate": 4.994370111562851e-06, + "loss": 0.6579, + "step": 369 + }, + { + "epoch": 0.17494089834515367, + "grad_norm": 3.1321663856506348, + "learning_rate": 4.994328191717382e-06, + "loss": 0.6891, + "step": 370 + }, + { + "epoch": 0.17541371158392435, + "grad_norm": 3.0560388565063477, + "learning_rate": 4.994286116561034e-06, + "loss": 0.7243, + "step": 371 + }, + { + "epoch": 0.17588652482269504, + "grad_norm": 3.1560704708099365, + "learning_rate": 4.994243886096425e-06, + "loss": 0.7262, + "step": 372 + }, + { + "epoch": 0.17635933806146573, + "grad_norm": 2.913541316986084, + "learning_rate": 4.994201500326187e-06, + "loss": 0.7318, + "step": 373 + }, + { + "epoch": 0.1768321513002364, + "grad_norm": 3.098376512527466, + "learning_rate": 4.994158959252958e-06, + "loss": 0.6419, + "step": 374 + }, + { + "epoch": 0.1773049645390071, + "grad_norm": 2.977508544921875, + "learning_rate": 4.994116262879387e-06, + "loss": 0.6709, + "step": 375 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 3.168186902999878, + "learning_rate": 4.994073411208133e-06, + "loss": 0.6608, + "step": 376 + }, + { + "epoch": 0.17825059101654847, + "grad_norm": 3.436844825744629, + 
"learning_rate": 4.994030404241864e-06, + "loss": 0.7227, + "step": 377 + }, + { + "epoch": 0.17872340425531916, + "grad_norm": 2.8998289108276367, + "learning_rate": 4.993987241983258e-06, + "loss": 0.6512, + "step": 378 + }, + { + "epoch": 0.17919621749408984, + "grad_norm": 3.407191514968872, + "learning_rate": 4.993943924435002e-06, + "loss": 0.616, + "step": 379 + }, + { + "epoch": 0.17966903073286053, + "grad_norm": 3.744858741760254, + "learning_rate": 4.993900451599793e-06, + "loss": 0.8599, + "step": 380 + }, + { + "epoch": 0.18014184397163122, + "grad_norm": 3.486283779144287, + "learning_rate": 4.993856823480338e-06, + "loss": 0.6634, + "step": 381 + }, + { + "epoch": 0.1806146572104019, + "grad_norm": 2.895719051361084, + "learning_rate": 4.993813040079355e-06, + "loss": 0.6972, + "step": 382 + }, + { + "epoch": 0.1810874704491726, + "grad_norm": 2.814133882522583, + "learning_rate": 4.993769101399569e-06, + "loss": 0.6271, + "step": 383 + }, + { + "epoch": 0.18156028368794327, + "grad_norm": 2.8609800338745117, + "learning_rate": 4.993725007443715e-06, + "loss": 0.6481, + "step": 384 + }, + { + "epoch": 0.18203309692671396, + "grad_norm": 3.2829644680023193, + "learning_rate": 4.99368075821454e-06, + "loss": 0.7999, + "step": 385 + }, + { + "epoch": 0.18250591016548465, + "grad_norm": 3.1417458057403564, + "learning_rate": 4.993636353714798e-06, + "loss": 0.6972, + "step": 386 + }, + { + "epoch": 0.1829787234042553, + "grad_norm": 3.0679385662078857, + "learning_rate": 4.993591793947256e-06, + "loss": 0.667, + "step": 387 + }, + { + "epoch": 0.183451536643026, + "grad_norm": 3.1387410163879395, + "learning_rate": 4.993547078914686e-06, + "loss": 0.7618, + "step": 388 + }, + { + "epoch": 0.18392434988179668, + "grad_norm": 2.9181406497955322, + "learning_rate": 4.993502208619872e-06, + "loss": 0.7391, + "step": 389 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 2.8952157497406006, + "learning_rate": 4.993457183065611e-06, + "loss": 0.6988, + "step": 390 + }, + { + "epoch": 0.18486997635933805, + "grad_norm": 3.2274813652038574, + "learning_rate": 4.993412002254704e-06, + "loss": 0.688, + "step": 391 + }, + { + "epoch": 0.18534278959810874, + "grad_norm": 3.4693779945373535, + "learning_rate": 4.993366666189965e-06, + "loss": 0.6634, + "step": 392 + }, + { + "epoch": 0.18581560283687942, + "grad_norm": 3.5358526706695557, + "learning_rate": 4.993321174874217e-06, + "loss": 0.7343, + "step": 393 + }, + { + "epoch": 0.1862884160756501, + "grad_norm": 3.013338088989258, + "learning_rate": 4.993275528310292e-06, + "loss": 0.7579, + "step": 394 + }, + { + "epoch": 0.1867612293144208, + "grad_norm": 2.694772720336914, + "learning_rate": 4.993229726501033e-06, + "loss": 0.718, + "step": 395 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 3.070612907409668, + "learning_rate": 4.9931837694492915e-06, + "loss": 0.6438, + "step": 396 + }, + { + "epoch": 0.18770685579196217, + "grad_norm": 2.9193027019500732, + "learning_rate": 4.993137657157928e-06, + "loss": 0.6788, + "step": 397 + }, + { + "epoch": 0.18817966903073285, + "grad_norm": 3.047682046890259, + "learning_rate": 4.993091389629816e-06, + "loss": 0.6826, + "step": 398 + }, + { + "epoch": 0.18865248226950354, + "grad_norm": 2.9629905223846436, + "learning_rate": 4.993044966867834e-06, + "loss": 0.7196, + "step": 399 + }, + { + "epoch": 0.18912529550827423, + "grad_norm": 3.0692050457000732, + "learning_rate": 4.992998388874874e-06, + "loss": 0.7015, + "step": 400 + }, + { + "epoch": 0.1895981087470449, + 
"grad_norm": 3.5427212715148926, + "learning_rate": 4.992951655653836e-06, + "loss": 0.8292, + "step": 401 + }, + { + "epoch": 0.1900709219858156, + "grad_norm": 2.643526554107666, + "learning_rate": 4.992904767207629e-06, + "loss": 0.624, + "step": 402 + }, + { + "epoch": 0.19054373522458629, + "grad_norm": 3.1185996532440186, + "learning_rate": 4.992857723539173e-06, + "loss": 0.7354, + "step": 403 + }, + { + "epoch": 0.19101654846335697, + "grad_norm": 3.006856679916382, + "learning_rate": 4.992810524651398e-06, + "loss": 0.7752, + "step": 404 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 2.9913275241851807, + "learning_rate": 4.9927631705472425e-06, + "loss": 0.7306, + "step": 405 + }, + { + "epoch": 0.19196217494089834, + "grad_norm": 2.6794071197509766, + "learning_rate": 4.992715661229655e-06, + "loss": 0.6136, + "step": 406 + }, + { + "epoch": 0.19243498817966903, + "grad_norm": 3.5933966636657715, + "learning_rate": 4.992667996701593e-06, + "loss": 0.7024, + "step": 407 + }, + { + "epoch": 0.19290780141843972, + "grad_norm": 2.862187623977661, + "learning_rate": 4.992620176966025e-06, + "loss": 0.692, + "step": 408 + }, + { + "epoch": 0.1933806146572104, + "grad_norm": 3.076845407485962, + "learning_rate": 4.9925722020259286e-06, + "loss": 0.7475, + "step": 409 + }, + { + "epoch": 0.1938534278959811, + "grad_norm": 3.372919797897339, + "learning_rate": 4.9925240718842895e-06, + "loss": 0.6886, + "step": 410 + }, + { + "epoch": 0.19432624113475178, + "grad_norm": 2.922977924346924, + "learning_rate": 4.992475786544108e-06, + "loss": 0.7049, + "step": 411 + }, + { + "epoch": 0.19479905437352246, + "grad_norm": 2.908034324645996, + "learning_rate": 4.992427346008387e-06, + "loss": 0.6498, + "step": 412 + }, + { + "epoch": 0.19527186761229315, + "grad_norm": 3.096723794937134, + "learning_rate": 4.992378750280144e-06, + "loss": 0.7151, + "step": 413 + }, + { + "epoch": 0.19574468085106383, + "grad_norm": 2.895237684249878, + "learning_rate": 4.992329999362405e-06, + "loss": 0.7277, + "step": 414 + }, + { + "epoch": 0.19621749408983452, + "grad_norm": 2.718230724334717, + "learning_rate": 4.9922810932582065e-06, + "loss": 0.6375, + "step": 415 + }, + { + "epoch": 0.1966903073286052, + "grad_norm": 3.187743663787842, + "learning_rate": 4.992232031970592e-06, + "loss": 0.6528, + "step": 416 + }, + { + "epoch": 0.1971631205673759, + "grad_norm": 2.996406316757202, + "learning_rate": 4.992182815502616e-06, + "loss": 0.6552, + "step": 417 + }, + { + "epoch": 0.19763593380614658, + "grad_norm": 3.301084041595459, + "learning_rate": 4.992133443857345e-06, + "loss": 0.7061, + "step": 418 + }, + { + "epoch": 0.19810874704491727, + "grad_norm": 3.7874677181243896, + "learning_rate": 4.992083917037853e-06, + "loss": 0.7859, + "step": 419 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 3.124253511428833, + "learning_rate": 4.992034235047222e-06, + "loss": 0.7615, + "step": 420 + }, + { + "epoch": 0.19905437352245864, + "grad_norm": 3.0488970279693604, + "learning_rate": 4.991984397888546e-06, + "loss": 0.6916, + "step": 421 + }, + { + "epoch": 0.19952718676122932, + "grad_norm": 3.1241321563720703, + "learning_rate": 4.991934405564929e-06, + "loss": 0.7055, + "step": 422 + }, + { + "epoch": 0.2, + "grad_norm": 3.396632432937622, + "learning_rate": 4.991884258079484e-06, + "loss": 0.7675, + "step": 423 + }, + { + "epoch": 0.2004728132387707, + "grad_norm": 3.7776873111724854, + "learning_rate": 4.9918339554353316e-06, + "loss": 0.7371, + "step": 424 + }, + { + "epoch": 
0.20094562647754138, + "grad_norm": 3.3356032371520996, + "learning_rate": 4.991783497635606e-06, + "loss": 0.6778, + "step": 425 + }, + { + "epoch": 0.20141843971631207, + "grad_norm": 2.988856792449951, + "learning_rate": 4.9917328846834474e-06, + "loss": 0.6795, + "step": 426 + }, + { + "epoch": 0.20189125295508276, + "grad_norm": 3.264183282852173, + "learning_rate": 4.99168211658201e-06, + "loss": 0.7707, + "step": 427 + }, + { + "epoch": 0.20236406619385341, + "grad_norm": 3.878068208694458, + "learning_rate": 4.991631193334451e-06, + "loss": 0.857, + "step": 428 + }, + { + "epoch": 0.2028368794326241, + "grad_norm": 3.6377553939819336, + "learning_rate": 4.991580114943943e-06, + "loss": 0.8033, + "step": 429 + }, + { + "epoch": 0.2033096926713948, + "grad_norm": 2.95393967628479, + "learning_rate": 4.991528881413667e-06, + "loss": 0.6809, + "step": 430 + }, + { + "epoch": 0.20378250591016547, + "grad_norm": 3.058704376220703, + "learning_rate": 4.9914774927468125e-06, + "loss": 0.6664, + "step": 431 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 2.7783217430114746, + "learning_rate": 4.9914259489465795e-06, + "loss": 0.6478, + "step": 432 + }, + { + "epoch": 0.20472813238770685, + "grad_norm": 2.4825217723846436, + "learning_rate": 4.991374250016177e-06, + "loss": 0.6598, + "step": 433 + }, + { + "epoch": 0.20520094562647753, + "grad_norm": 2.8753600120544434, + "learning_rate": 4.991322395958824e-06, + "loss": 0.6947, + "step": 434 + }, + { + "epoch": 0.20567375886524822, + "grad_norm": 3.2339367866516113, + "learning_rate": 4.99127038677775e-06, + "loss": 0.8201, + "step": 435 + }, + { + "epoch": 0.2061465721040189, + "grad_norm": 2.9065537452697754, + "learning_rate": 4.991218222476193e-06, + "loss": 0.6679, + "step": 436 + }, + { + "epoch": 0.2066193853427896, + "grad_norm": 3.283228874206543, + "learning_rate": 4.991165903057401e-06, + "loss": 0.8039, + "step": 437 + }, + { + "epoch": 0.20709219858156028, + "grad_norm": 3.429872751235962, + "learning_rate": 4.991113428524631e-06, + "loss": 0.7392, + "step": 438 + }, + { + "epoch": 0.20756501182033096, + "grad_norm": 3.118943452835083, + "learning_rate": 4.991060798881152e-06, + "loss": 0.6794, + "step": 439 + }, + { + "epoch": 0.20803782505910165, + "grad_norm": 3.395970106124878, + "learning_rate": 4.99100801413024e-06, + "loss": 0.6862, + "step": 440 + }, + { + "epoch": 0.20851063829787234, + "grad_norm": 2.869191884994507, + "learning_rate": 4.99095507427518e-06, + "loss": 0.6076, + "step": 441 + }, + { + "epoch": 0.20898345153664302, + "grad_norm": 3.1934661865234375, + "learning_rate": 4.990901979319272e-06, + "loss": 0.6927, + "step": 442 + }, + { + "epoch": 0.2094562647754137, + "grad_norm": 2.9068603515625, + "learning_rate": 4.990848729265819e-06, + "loss": 0.6864, + "step": 443 + }, + { + "epoch": 0.2099290780141844, + "grad_norm": 3.0535948276519775, + "learning_rate": 4.9907953241181375e-06, + "loss": 0.6396, + "step": 444 + }, + { + "epoch": 0.21040189125295508, + "grad_norm": 2.871511459350586, + "learning_rate": 4.990741763879554e-06, + "loss": 0.6743, + "step": 445 + }, + { + "epoch": 0.21087470449172577, + "grad_norm": 2.9184393882751465, + "learning_rate": 4.9906880485534015e-06, + "loss": 0.6786, + "step": 446 + }, + { + "epoch": 0.21134751773049645, + "grad_norm": 3.0628271102905273, + "learning_rate": 4.990634178143026e-06, + "loss": 0.6326, + "step": 447 + }, + { + "epoch": 0.21182033096926714, + "grad_norm": 3.7878305912017822, + "learning_rate": 4.990580152651782e-06, + "loss": 0.7944, + "step": 
448 + }, + { + "epoch": 0.21229314420803783, + "grad_norm": 2.8577189445495605, + "learning_rate": 4.990525972083031e-06, + "loss": 0.71, + "step": 449 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 3.307769775390625, + "learning_rate": 4.99047163644015e-06, + "loss": 0.6893, + "step": 450 + }, + { + "epoch": 0.2132387706855792, + "grad_norm": 2.7391717433929443, + "learning_rate": 4.990417145726519e-06, + "loss": 0.712, + "step": 451 + }, + { + "epoch": 0.21371158392434988, + "grad_norm": 2.938044786453247, + "learning_rate": 4.990362499945534e-06, + "loss": 0.7516, + "step": 452 + }, + { + "epoch": 0.21418439716312057, + "grad_norm": 2.7831056118011475, + "learning_rate": 4.990307699100595e-06, + "loss": 0.6168, + "step": 453 + }, + { + "epoch": 0.21465721040189126, + "grad_norm": 2.907977342605591, + "learning_rate": 4.990252743195116e-06, + "loss": 0.6706, + "step": 454 + }, + { + "epoch": 0.21513002364066194, + "grad_norm": 3.7882161140441895, + "learning_rate": 4.990197632232517e-06, + "loss": 0.6847, + "step": 455 + }, + { + "epoch": 0.21560283687943263, + "grad_norm": 2.899716854095459, + "learning_rate": 4.990142366216232e-06, + "loss": 0.6699, + "step": 456 + }, + { + "epoch": 0.21607565011820332, + "grad_norm": 2.907003879547119, + "learning_rate": 4.990086945149701e-06, + "loss": 0.6864, + "step": 457 + }, + { + "epoch": 0.216548463356974, + "grad_norm": 3.2407333850860596, + "learning_rate": 4.9900313690363736e-06, + "loss": 0.692, + "step": 458 + }, + { + "epoch": 0.2170212765957447, + "grad_norm": 2.9055583477020264, + "learning_rate": 4.989975637879712e-06, + "loss": 0.7113, + "step": 459 + }, + { + "epoch": 0.21749408983451538, + "grad_norm": 2.9836206436157227, + "learning_rate": 4.989919751683184e-06, + "loss": 0.6673, + "step": 460 + }, + { + "epoch": 0.21796690307328606, + "grad_norm": 3.371035575866699, + "learning_rate": 4.989863710450273e-06, + "loss": 0.7181, + "step": 461 + }, + { + "epoch": 0.21843971631205675, + "grad_norm": 2.9636635780334473, + "learning_rate": 4.989807514184465e-06, + "loss": 0.6082, + "step": 462 + }, + { + "epoch": 0.21891252955082743, + "grad_norm": 2.9634664058685303, + "learning_rate": 4.9897511628892615e-06, + "loss": 0.7086, + "step": 463 + }, + { + "epoch": 0.21938534278959812, + "grad_norm": 3.154763698577881, + "learning_rate": 4.98969465656817e-06, + "loss": 0.7027, + "step": 464 + }, + { + "epoch": 0.2198581560283688, + "grad_norm": 2.9959890842437744, + "learning_rate": 4.98963799522471e-06, + "loss": 0.6498, + "step": 465 + }, + { + "epoch": 0.2203309692671395, + "grad_norm": 3.5470590591430664, + "learning_rate": 4.989581178862408e-06, + "loss": 0.7199, + "step": 466 + }, + { + "epoch": 0.22080378250591018, + "grad_norm": 7.1873369216918945, + "learning_rate": 4.989524207484802e-06, + "loss": 0.6676, + "step": 467 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 3.1099541187286377, + "learning_rate": 4.98946708109544e-06, + "loss": 0.6785, + "step": 468 + }, + { + "epoch": 0.22174940898345152, + "grad_norm": 2.830991506576538, + "learning_rate": 4.9894097996978795e-06, + "loss": 0.6456, + "step": 469 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.0212316513061523, + "learning_rate": 4.989352363295687e-06, + "loss": 0.6048, + "step": 470 + }, + { + "epoch": 0.2226950354609929, + "grad_norm": 3.18776798248291, + "learning_rate": 4.989294771892437e-06, + "loss": 0.7078, + "step": 471 + }, + { + "epoch": 0.22316784869976358, + "grad_norm": 2.9972598552703857, + "learning_rate": 4.989237025491717e-06, + 
"loss": 0.7082, + "step": 472 + }, + { + "epoch": 0.22364066193853427, + "grad_norm": 3.4935688972473145, + "learning_rate": 4.989179124097123e-06, + "loss": 0.8199, + "step": 473 + }, + { + "epoch": 0.22411347517730495, + "grad_norm": 2.6485543251037598, + "learning_rate": 4.9891210677122595e-06, + "loss": 0.6371, + "step": 474 + }, + { + "epoch": 0.22458628841607564, + "grad_norm": 2.969233512878418, + "learning_rate": 4.989062856340742e-06, + "loss": 0.6879, + "step": 475 + }, + { + "epoch": 0.22505910165484633, + "grad_norm": 2.881875514984131, + "learning_rate": 4.989004489986194e-06, + "loss": 0.7415, + "step": 476 + }, + { + "epoch": 0.225531914893617, + "grad_norm": 2.624540090560913, + "learning_rate": 4.98894596865225e-06, + "loss": 0.6522, + "step": 477 + }, + { + "epoch": 0.2260047281323877, + "grad_norm": 3.61075496673584, + "learning_rate": 4.988887292342555e-06, + "loss": 0.7109, + "step": 478 + }, + { + "epoch": 0.2264775413711584, + "grad_norm": 2.9368972778320312, + "learning_rate": 4.988828461060762e-06, + "loss": 0.6843, + "step": 479 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 3.0670197010040283, + "learning_rate": 4.988769474810533e-06, + "loss": 0.6807, + "step": 480 + }, + { + "epoch": 0.22742316784869976, + "grad_norm": 2.9662792682647705, + "learning_rate": 4.988710333595542e-06, + "loss": 0.6796, + "step": 481 + }, + { + "epoch": 0.22789598108747045, + "grad_norm": 2.971235752105713, + "learning_rate": 4.988651037419472e-06, + "loss": 0.696, + "step": 482 + }, + { + "epoch": 0.22836879432624113, + "grad_norm": 2.931884527206421, + "learning_rate": 4.988591586286013e-06, + "loss": 0.7323, + "step": 483 + }, + { + "epoch": 0.22884160756501182, + "grad_norm": 2.8114213943481445, + "learning_rate": 4.988531980198868e-06, + "loss": 0.6584, + "step": 484 + }, + { + "epoch": 0.2293144208037825, + "grad_norm": 3.2785916328430176, + "learning_rate": 4.98847221916175e-06, + "loss": 0.7514, + "step": 485 + }, + { + "epoch": 0.2297872340425532, + "grad_norm": 3.0520215034484863, + "learning_rate": 4.988412303178377e-06, + "loss": 0.7564, + "step": 486 + }, + { + "epoch": 0.23026004728132388, + "grad_norm": 3.181002616882324, + "learning_rate": 4.988352232252483e-06, + "loss": 0.6768, + "step": 487 + }, + { + "epoch": 0.23073286052009456, + "grad_norm": 3.4953625202178955, + "learning_rate": 4.988292006387805e-06, + "loss": 0.7143, + "step": 488 + }, + { + "epoch": 0.23120567375886525, + "grad_norm": 3.326571226119995, + "learning_rate": 4.988231625588096e-06, + "loss": 0.7318, + "step": 489 + }, + { + "epoch": 0.23167848699763594, + "grad_norm": 3.09614634513855, + "learning_rate": 4.988171089857113e-06, + "loss": 0.6574, + "step": 490 + }, + { + "epoch": 0.23215130023640662, + "grad_norm": 2.7439446449279785, + "learning_rate": 4.9881103991986265e-06, + "loss": 0.6637, + "step": 491 + }, + { + "epoch": 0.2326241134751773, + "grad_norm": 3.0681190490722656, + "learning_rate": 4.988049553616416e-06, + "loss": 0.6326, + "step": 492 + }, + { + "epoch": 0.233096926713948, + "grad_norm": 3.0757341384887695, + "learning_rate": 4.98798855311427e-06, + "loss": 0.695, + "step": 493 + }, + { + "epoch": 0.23356973995271868, + "grad_norm": 2.8637635707855225, + "learning_rate": 4.987927397695985e-06, + "loss": 0.6598, + "step": 494 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 3.3641068935394287, + "learning_rate": 4.9878660873653715e-06, + "loss": 0.7435, + "step": 495 + }, + { + "epoch": 0.23451536643026005, + "grad_norm": 3.5025596618652344, + "learning_rate": 
4.987804622126245e-06, + "loss": 0.735, + "step": 496 + }, + { + "epoch": 0.23498817966903074, + "grad_norm": 2.9298837184906006, + "learning_rate": 4.987743001982434e-06, + "loss": 0.7063, + "step": 497 + }, + { + "epoch": 0.23546099290780143, + "grad_norm": 2.70358943939209, + "learning_rate": 4.987681226937774e-06, + "loss": 0.6799, + "step": 498 + }, + { + "epoch": 0.2359338061465721, + "grad_norm": 3.027871608734131, + "learning_rate": 4.9876192969961125e-06, + "loss": 0.6881, + "step": 499 + }, + { + "epoch": 0.2364066193853428, + "grad_norm": 3.362306594848633, + "learning_rate": 4.987557212161304e-06, + "loss": 0.7906, + "step": 500 + }, + { + "epoch": 0.23687943262411348, + "grad_norm": 3.3136050701141357, + "learning_rate": 4.987494972437217e-06, + "loss": 0.6878, + "step": 501 + }, + { + "epoch": 0.23735224586288417, + "grad_norm": 3.017089605331421, + "learning_rate": 4.9874325778277255e-06, + "loss": 0.7279, + "step": 502 + }, + { + "epoch": 0.23782505910165486, + "grad_norm": 2.8300516605377197, + "learning_rate": 4.987370028336714e-06, + "loss": 0.6864, + "step": 503 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 3.201860189437866, + "learning_rate": 4.987307323968077e-06, + "loss": 0.7531, + "step": 504 + }, + { + "epoch": 0.23877068557919623, + "grad_norm": 2.685396194458008, + "learning_rate": 4.987244464725721e-06, + "loss": 0.5849, + "step": 505 + }, + { + "epoch": 0.23924349881796692, + "grad_norm": 2.8715312480926514, + "learning_rate": 4.987181450613557e-06, + "loss": 0.675, + "step": 506 + }, + { + "epoch": 0.2397163120567376, + "grad_norm": 2.813908815383911, + "learning_rate": 4.987118281635511e-06, + "loss": 0.6841, + "step": 507 + }, + { + "epoch": 0.2401891252955083, + "grad_norm": 3.2738473415374756, + "learning_rate": 4.987054957795514e-06, + "loss": 0.7158, + "step": 508 + }, + { + "epoch": 0.24066193853427895, + "grad_norm": 2.896134376525879, + "learning_rate": 4.986991479097511e-06, + "loss": 0.7542, + "step": 509 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 3.0390403270721436, + "learning_rate": 4.986927845545454e-06, + "loss": 0.6733, + "step": 510 + }, + { + "epoch": 0.24160756501182032, + "grad_norm": 3.0300254821777344, + "learning_rate": 4.9868640571433044e-06, + "loss": 0.722, + "step": 511 + }, + { + "epoch": 0.242080378250591, + "grad_norm": 3.3037352561950684, + "learning_rate": 4.986800113895035e-06, + "loss": 0.6811, + "step": 512 + }, + { + "epoch": 0.2425531914893617, + "grad_norm": 3.0358474254608154, + "learning_rate": 4.986736015804627e-06, + "loss": 0.7348, + "step": 513 + }, + { + "epoch": 0.24302600472813238, + "grad_norm": 3.108792304992676, + "learning_rate": 4.986671762876071e-06, + "loss": 0.6096, + "step": 514 + }, + { + "epoch": 0.24349881796690306, + "grad_norm": 3.1316237449645996, + "learning_rate": 4.986607355113367e-06, + "loss": 0.6357, + "step": 515 + }, + { + "epoch": 0.24397163120567375, + "grad_norm": 3.3095219135284424, + "learning_rate": 4.986542792520528e-06, + "loss": 0.7515, + "step": 516 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 3.4775984287261963, + "learning_rate": 4.986478075101572e-06, + "loss": 0.7104, + "step": 517 + }, + { + "epoch": 0.24491725768321512, + "grad_norm": 3.341708183288574, + "learning_rate": 4.986413202860528e-06, + "loss": 0.7339, + "step": 518 + }, + { + "epoch": 0.2453900709219858, + "grad_norm": 2.9646966457366943, + "learning_rate": 4.986348175801438e-06, + "loss": 0.6032, + "step": 519 + }, + { + "epoch": 0.2458628841607565, + "grad_norm": 
3.1853902339935303, + "learning_rate": 4.986282993928349e-06, + "loss": 0.6925, + "step": 520 + }, + { + "epoch": 0.24633569739952718, + "grad_norm": 3.286909818649292, + "learning_rate": 4.98621765724532e-06, + "loss": 0.7447, + "step": 521 + }, + { + "epoch": 0.24680851063829787, + "grad_norm": 3.2255051136016846, + "learning_rate": 4.986152165756419e-06, + "loss": 0.7747, + "step": 522 + }, + { + "epoch": 0.24728132387706855, + "grad_norm": 3.002352237701416, + "learning_rate": 4.986086519465724e-06, + "loss": 0.6472, + "step": 523 + }, + { + "epoch": 0.24775413711583924, + "grad_norm": 3.4738974571228027, + "learning_rate": 4.986020718377322e-06, + "loss": 0.7381, + "step": 524 + }, + { + "epoch": 0.24822695035460993, + "grad_norm": 3.4470200538635254, + "learning_rate": 4.985954762495312e-06, + "loss": 0.6878, + "step": 525 + }, + { + "epoch": 0.2486997635933806, + "grad_norm": 2.9219350814819336, + "learning_rate": 4.985888651823799e-06, + "loss": 0.6317, + "step": 526 + }, + { + "epoch": 0.2491725768321513, + "grad_norm": 3.061767101287842, + "learning_rate": 4.985822386366899e-06, + "loss": 0.6842, + "step": 527 + }, + { + "epoch": 0.24964539007092199, + "grad_norm": 3.0291247367858887, + "learning_rate": 4.985755966128742e-06, + "loss": 0.6852, + "step": 528 + }, + { + "epoch": 0.25011820330969264, + "grad_norm": 2.964280843734741, + "learning_rate": 4.985689391113457e-06, + "loss": 0.7738, + "step": 529 + }, + { + "epoch": 0.25059101654846333, + "grad_norm": 3.058302164077759, + "learning_rate": 4.9856226613251955e-06, + "loss": 0.6677, + "step": 530 + }, + { + "epoch": 0.251063829787234, + "grad_norm": 3.345141649246216, + "learning_rate": 4.985555776768109e-06, + "loss": 0.7837, + "step": 531 + }, + { + "epoch": 0.2515366430260047, + "grad_norm": 3.565031051635742, + "learning_rate": 4.9854887374463636e-06, + "loss": 0.7231, + "step": 532 + }, + { + "epoch": 0.2520094562647754, + "grad_norm": 2.7953789234161377, + "learning_rate": 4.985421543364132e-06, + "loss": 0.6102, + "step": 533 + }, + { + "epoch": 0.2524822695035461, + "grad_norm": 2.887606620788574, + "learning_rate": 4.9853541945256e-06, + "loss": 0.6289, + "step": 534 + }, + { + "epoch": 0.25295508274231676, + "grad_norm": 3.1480495929718018, + "learning_rate": 4.985286690934961e-06, + "loss": 0.6348, + "step": 535 + }, + { + "epoch": 0.25342789598108745, + "grad_norm": 2.8912761211395264, + "learning_rate": 4.985219032596416e-06, + "loss": 0.595, + "step": 536 + }, + { + "epoch": 0.25390070921985813, + "grad_norm": 2.947936534881592, + "learning_rate": 4.98515121951418e-06, + "loss": 0.6196, + "step": 537 + }, + { + "epoch": 0.2543735224586288, + "grad_norm": 3.1085827350616455, + "learning_rate": 4.985083251692474e-06, + "loss": 0.6387, + "step": 538 + }, + { + "epoch": 0.2548463356973995, + "grad_norm": 3.1688334941864014, + "learning_rate": 4.985015129135531e-06, + "loss": 0.7055, + "step": 539 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 3.075042963027954, + "learning_rate": 4.984946851847593e-06, + "loss": 0.7515, + "step": 540 + }, + { + "epoch": 0.2557919621749409, + "grad_norm": 3.1933093070983887, + "learning_rate": 4.98487841983291e-06, + "loss": 0.7054, + "step": 541 + }, + { + "epoch": 0.25626477541371157, + "grad_norm": 3.043473958969116, + "learning_rate": 4.984809833095744e-06, + "loss": 0.6281, + "step": 542 + }, + { + "epoch": 0.25673758865248225, + "grad_norm": 3.0532584190368652, + "learning_rate": 4.9847410916403645e-06, + "loss": 0.6155, + "step": 543 + }, + { + "epoch": 
0.25721040189125294, + "grad_norm": 3.608480215072632, + "learning_rate": 4.984672195471053e-06, + "loss": 0.7363, + "step": 544 + }, + { + "epoch": 0.2576832151300236, + "grad_norm": 2.7491862773895264, + "learning_rate": 4.9846031445921e-06, + "loss": 0.6594, + "step": 545 + }, + { + "epoch": 0.2581560283687943, + "grad_norm": 2.8602418899536133, + "learning_rate": 4.984533939007802e-06, + "loss": 0.6742, + "step": 546 + }, + { + "epoch": 0.258628841607565, + "grad_norm": 3.1782007217407227, + "learning_rate": 4.98446457872247e-06, + "loss": 0.731, + "step": 547 + }, + { + "epoch": 0.2591016548463357, + "grad_norm": 2.796147584915161, + "learning_rate": 4.984395063740423e-06, + "loss": 0.6617, + "step": 548 + }, + { + "epoch": 0.25957446808510637, + "grad_norm": 2.8392202854156494, + "learning_rate": 4.984325394065991e-06, + "loss": 0.6753, + "step": 549 + }, + { + "epoch": 0.26004728132387706, + "grad_norm": 3.134672164916992, + "learning_rate": 4.984255569703508e-06, + "loss": 0.7222, + "step": 550 + }, + { + "epoch": 0.26052009456264774, + "grad_norm": 2.734330177307129, + "learning_rate": 4.984185590657325e-06, + "loss": 0.6098, + "step": 551 + }, + { + "epoch": 0.26099290780141843, + "grad_norm": 3.739010810852051, + "learning_rate": 4.984115456931798e-06, + "loss": 0.7457, + "step": 552 + }, + { + "epoch": 0.2614657210401891, + "grad_norm": 2.8412528038024902, + "learning_rate": 4.9840451685312925e-06, + "loss": 0.6972, + "step": 553 + }, + { + "epoch": 0.2619385342789598, + "grad_norm": 3.017395496368408, + "learning_rate": 4.983974725460188e-06, + "loss": 0.6887, + "step": 554 + }, + { + "epoch": 0.2624113475177305, + "grad_norm": 3.2746949195861816, + "learning_rate": 4.98390412772287e-06, + "loss": 0.7047, + "step": 555 + }, + { + "epoch": 0.2628841607565012, + "grad_norm": 3.1561965942382812, + "learning_rate": 4.983833375323732e-06, + "loss": 0.7726, + "step": 556 + }, + { + "epoch": 0.26335697399527186, + "grad_norm": 3.2367217540740967, + "learning_rate": 4.9837624682671816e-06, + "loss": 0.6348, + "step": 557 + }, + { + "epoch": 0.26382978723404255, + "grad_norm": 2.8195858001708984, + "learning_rate": 4.983691406557633e-06, + "loss": 0.6387, + "step": 558 + }, + { + "epoch": 0.26430260047281323, + "grad_norm": 3.349820852279663, + "learning_rate": 4.983620190199511e-06, + "loss": 0.6776, + "step": 559 + }, + { + "epoch": 0.2647754137115839, + "grad_norm": 2.8025588989257812, + "learning_rate": 4.98354881919725e-06, + "loss": 0.6512, + "step": 560 + }, + { + "epoch": 0.2652482269503546, + "grad_norm": 2.9125499725341797, + "learning_rate": 4.983477293555295e-06, + "loss": 0.7024, + "step": 561 + }, + { + "epoch": 0.2657210401891253, + "grad_norm": 3.3479275703430176, + "learning_rate": 4.983405613278098e-06, + "loss": 0.688, + "step": 562 + }, + { + "epoch": 0.266193853427896, + "grad_norm": 3.123971462249756, + "learning_rate": 4.983333778370123e-06, + "loss": 0.6743, + "step": 563 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.891625165939331, + "learning_rate": 4.983261788835843e-06, + "loss": 0.5971, + "step": 564 + }, + { + "epoch": 0.26713947990543735, + "grad_norm": 3.5066864490509033, + "learning_rate": 4.98318964467974e-06, + "loss": 0.6958, + "step": 565 + }, + { + "epoch": 0.26761229314420804, + "grad_norm": 2.570547342300415, + "learning_rate": 4.983117345906306e-06, + "loss": 0.609, + "step": 566 + }, + { + "epoch": 0.2680851063829787, + "grad_norm": 3.005106210708618, + "learning_rate": 4.983044892520044e-06, + "loss": 0.6791, + "step": 567 + }, + 
{ + "epoch": 0.2685579196217494, + "grad_norm": 3.429675340652466, + "learning_rate": 4.982972284525463e-06, + "loss": 0.6625, + "step": 568 + }, + { + "epoch": 0.2690307328605201, + "grad_norm": 3.825657367706299, + "learning_rate": 4.982899521927086e-06, + "loss": 0.6368, + "step": 569 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 2.8699095249176025, + "learning_rate": 4.982826604729443e-06, + "loss": 0.6425, + "step": 570 + }, + { + "epoch": 0.26997635933806147, + "grad_norm": 3.1688714027404785, + "learning_rate": 4.982753532937074e-06, + "loss": 0.6904, + "step": 571 + }, + { + "epoch": 0.27044917257683215, + "grad_norm": 3.3889992237091064, + "learning_rate": 4.98268030655453e-06, + "loss": 0.7575, + "step": 572 + }, + { + "epoch": 0.27092198581560284, + "grad_norm": 3.108315944671631, + "learning_rate": 4.982606925586367e-06, + "loss": 0.6648, + "step": 573 + }, + { + "epoch": 0.2713947990543735, + "grad_norm": 3.209831953048706, + "learning_rate": 4.982533390037159e-06, + "loss": 0.657, + "step": 574 + }, + { + "epoch": 0.2718676122931442, + "grad_norm": 3.1740927696228027, + "learning_rate": 4.982459699911482e-06, + "loss": 0.7262, + "step": 575 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 3.0190417766571045, + "learning_rate": 4.982385855213924e-06, + "loss": 0.6368, + "step": 576 + }, + { + "epoch": 0.2728132387706856, + "grad_norm": 3.05049467086792, + "learning_rate": 4.982311855949084e-06, + "loss": 0.72, + "step": 577 + }, + { + "epoch": 0.27328605200945627, + "grad_norm": 2.984816551208496, + "learning_rate": 4.98223770212157e-06, + "loss": 0.6856, + "step": 578 + }, + { + "epoch": 0.27375886524822696, + "grad_norm": 2.744969606399536, + "learning_rate": 4.982163393735998e-06, + "loss": 0.6023, + "step": 579 + }, + { + "epoch": 0.27423167848699764, + "grad_norm": 3.170564889907837, + "learning_rate": 4.982088930796996e-06, + "loss": 0.6678, + "step": 580 + }, + { + "epoch": 0.27470449172576833, + "grad_norm": 2.8686118125915527, + "learning_rate": 4.982014313309199e-06, + "loss": 0.6157, + "step": 581 + }, + { + "epoch": 0.275177304964539, + "grad_norm": 2.8768694400787354, + "learning_rate": 4.981939541277254e-06, + "loss": 0.6566, + "step": 582 + }, + { + "epoch": 0.2756501182033097, + "grad_norm": 2.621481418609619, + "learning_rate": 4.981864614705818e-06, + "loss": 0.7372, + "step": 583 + }, + { + "epoch": 0.2761229314420804, + "grad_norm": 3.527374267578125, + "learning_rate": 4.981789533599554e-06, + "loss": 0.6485, + "step": 584 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 3.3141074180603027, + "learning_rate": 4.981714297963138e-06, + "loss": 0.6816, + "step": 585 + }, + { + "epoch": 0.27706855791962176, + "grad_norm": 2.9247069358825684, + "learning_rate": 4.981638907801255e-06, + "loss": 0.7217, + "step": 586 + }, + { + "epoch": 0.27754137115839245, + "grad_norm": 2.875236749649048, + "learning_rate": 4.981563363118599e-06, + "loss": 0.6662, + "step": 587 + }, + { + "epoch": 0.27801418439716313, + "grad_norm": 2.9540364742279053, + "learning_rate": 4.981487663919874e-06, + "loss": 0.7225, + "step": 588 + }, + { + "epoch": 0.2784869976359338, + "grad_norm": 2.90889310836792, + "learning_rate": 4.981411810209793e-06, + "loss": 0.6054, + "step": 589 + }, + { + "epoch": 0.2789598108747045, + "grad_norm": 2.8541409969329834, + "learning_rate": 4.981335801993078e-06, + "loss": 0.6539, + "step": 590 + }, + { + "epoch": 0.2794326241134752, + "grad_norm": 3.1600730419158936, + "learning_rate": 4.981259639274465e-06, + "loss": 0.6415, + "step": 
591 + }, + { + "epoch": 0.2799054373522459, + "grad_norm": 3.569376230239868, + "learning_rate": 4.981183322058693e-06, + "loss": 0.6944, + "step": 592 + }, + { + "epoch": 0.28037825059101656, + "grad_norm": 3.067667007446289, + "learning_rate": 4.981106850350515e-06, + "loss": 0.7378, + "step": 593 + }, + { + "epoch": 0.28085106382978725, + "grad_norm": 3.082073450088501, + "learning_rate": 4.981030224154693e-06, + "loss": 0.693, + "step": 594 + }, + { + "epoch": 0.28132387706855794, + "grad_norm": 2.902932643890381, + "learning_rate": 4.980953443475998e-06, + "loss": 0.6549, + "step": 595 + }, + { + "epoch": 0.2817966903073286, + "grad_norm": 2.6821181774139404, + "learning_rate": 4.980876508319211e-06, + "loss": 0.6231, + "step": 596 + }, + { + "epoch": 0.2822695035460993, + "grad_norm": 3.1747355461120605, + "learning_rate": 4.9807994186891215e-06, + "loss": 0.6826, + "step": 597 + }, + { + "epoch": 0.28274231678487, + "grad_norm": 2.6975860595703125, + "learning_rate": 4.980722174590531e-06, + "loss": 0.6669, + "step": 598 + }, + { + "epoch": 0.2832151300236407, + "grad_norm": 2.924285650253296, + "learning_rate": 4.9806447760282486e-06, + "loss": 0.689, + "step": 599 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 2.941417694091797, + "learning_rate": 4.980567223007093e-06, + "loss": 0.6672, + "step": 600 + }, + { + "epoch": 0.28416075650118205, + "grad_norm": 2.8582186698913574, + "learning_rate": 4.980489515531892e-06, + "loss": 0.6229, + "step": 601 + }, + { + "epoch": 0.28463356973995274, + "grad_norm": 2.6462013721466064, + "learning_rate": 4.9804116536074865e-06, + "loss": 0.606, + "step": 602 + }, + { + "epoch": 0.2851063829787234, + "grad_norm": 2.9029998779296875, + "learning_rate": 4.980333637238723e-06, + "loss": 0.5915, + "step": 603 + }, + { + "epoch": 0.2855791962174941, + "grad_norm": 3.9359042644500732, + "learning_rate": 4.980255466430462e-06, + "loss": 0.7035, + "step": 604 + }, + { + "epoch": 0.2860520094562648, + "grad_norm": 3.200524091720581, + "learning_rate": 4.980177141187566e-06, + "loss": 0.7156, + "step": 605 + }, + { + "epoch": 0.2865248226950355, + "grad_norm": 3.1708686351776123, + "learning_rate": 4.980098661514916e-06, + "loss": 0.746, + "step": 606 + }, + { + "epoch": 0.28699763593380617, + "grad_norm": 2.8926830291748047, + "learning_rate": 4.980020027417397e-06, + "loss": 0.6282, + "step": 607 + }, + { + "epoch": 0.28747044917257686, + "grad_norm": 3.0526294708251953, + "learning_rate": 4.979941238899906e-06, + "loss": 0.6594, + "step": 608 + }, + { + "epoch": 0.28794326241134754, + "grad_norm": 2.9869306087493896, + "learning_rate": 4.9798622959673486e-06, + "loss": 0.7771, + "step": 609 + }, + { + "epoch": 0.28841607565011823, + "grad_norm": 2.7894513607025146, + "learning_rate": 4.979783198624638e-06, + "loss": 0.6819, + "step": 610 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 2.958575963973999, + "learning_rate": 4.9797039468767025e-06, + "loss": 0.6474, + "step": 611 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 3.423748016357422, + "learning_rate": 4.979624540728475e-06, + "loss": 0.7389, + "step": 612 + }, + { + "epoch": 0.28983451536643023, + "grad_norm": 2.9641635417938232, + "learning_rate": 4.9795449801849e-06, + "loss": 0.6005, + "step": 613 + }, + { + "epoch": 0.2903073286052009, + "grad_norm": 3.02274227142334, + "learning_rate": 4.979465265250933e-06, + "loss": 0.6358, + "step": 614 + }, + { + "epoch": 0.2907801418439716, + "grad_norm": 3.0562758445739746, + "learning_rate": 4.979385395931534e-06, + 
"loss": 0.6313, + "step": 615 + }, + { + "epoch": 0.2912529550827423, + "grad_norm": 3.301816701889038, + "learning_rate": 4.97930537223168e-06, + "loss": 0.7264, + "step": 616 + }, + { + "epoch": 0.291725768321513, + "grad_norm": 2.975360870361328, + "learning_rate": 4.979225194156351e-06, + "loss": 0.613, + "step": 617 + }, + { + "epoch": 0.29219858156028367, + "grad_norm": 2.9245030879974365, + "learning_rate": 4.97914486171054e-06, + "loss": 0.6646, + "step": 618 + }, + { + "epoch": 0.29267139479905435, + "grad_norm": 3.1336188316345215, + "learning_rate": 4.979064374899249e-06, + "loss": 0.6421, + "step": 619 + }, + { + "epoch": 0.29314420803782504, + "grad_norm": 3.6298763751983643, + "learning_rate": 4.978983733727491e-06, + "loss": 0.6433, + "step": 620 + }, + { + "epoch": 0.2936170212765957, + "grad_norm": 2.919597625732422, + "learning_rate": 4.9789029382002845e-06, + "loss": 0.6288, + "step": 621 + }, + { + "epoch": 0.2940898345153664, + "grad_norm": 3.2206127643585205, + "learning_rate": 4.978821988322662e-06, + "loss": 0.7102, + "step": 622 + }, + { + "epoch": 0.2945626477541371, + "grad_norm": 3.1767101287841797, + "learning_rate": 4.978740884099664e-06, + "loss": 0.6722, + "step": 623 + }, + { + "epoch": 0.2950354609929078, + "grad_norm": 3.3425452709198, + "learning_rate": 4.97865962553634e-06, + "loss": 0.6492, + "step": 624 + }, + { + "epoch": 0.29550827423167847, + "grad_norm": 3.0408358573913574, + "learning_rate": 4.97857821263775e-06, + "loss": 0.6522, + "step": 625 + }, + { + "epoch": 0.29598108747044916, + "grad_norm": 2.8144783973693848, + "learning_rate": 4.978496645408963e-06, + "loss": 0.7237, + "step": 626 + }, + { + "epoch": 0.29645390070921984, + "grad_norm": 3.7010560035705566, + "learning_rate": 4.978414923855057e-06, + "loss": 0.7509, + "step": 627 + }, + { + "epoch": 0.29692671394799053, + "grad_norm": 2.9438371658325195, + "learning_rate": 4.978333047981122e-06, + "loss": 0.6244, + "step": 628 + }, + { + "epoch": 0.2973995271867612, + "grad_norm": 3.285982370376587, + "learning_rate": 4.978251017792255e-06, + "loss": 0.7553, + "step": 629 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 3.7021138668060303, + "learning_rate": 4.978168833293564e-06, + "loss": 0.7859, + "step": 630 + }, + { + "epoch": 0.2983451536643026, + "grad_norm": 3.481858730316162, + "learning_rate": 4.9780864944901654e-06, + "loss": 0.7146, + "step": 631 + }, + { + "epoch": 0.2988179669030733, + "grad_norm": 3.693824529647827, + "learning_rate": 4.978004001387188e-06, + "loss": 0.6608, + "step": 632 + }, + { + "epoch": 0.29929078014184396, + "grad_norm": 3.0069146156311035, + "learning_rate": 4.9779213539897665e-06, + "loss": 0.6506, + "step": 633 + }, + { + "epoch": 0.29976359338061465, + "grad_norm": 3.037644147872925, + "learning_rate": 4.977838552303048e-06, + "loss": 0.6487, + "step": 634 + }, + { + "epoch": 0.30023640661938533, + "grad_norm": 3.018554449081421, + "learning_rate": 4.977755596332188e-06, + "loss": 0.6128, + "step": 635 + }, + { + "epoch": 0.300709219858156, + "grad_norm": 3.000312089920044, + "learning_rate": 4.977672486082351e-06, + "loss": 0.6431, + "step": 636 + }, + { + "epoch": 0.3011820330969267, + "grad_norm": 2.836803913116455, + "learning_rate": 4.977589221558713e-06, + "loss": 0.5914, + "step": 637 + }, + { + "epoch": 0.3016548463356974, + "grad_norm": 3.080469846725464, + "learning_rate": 4.977505802766457e-06, + "loss": 0.7265, + "step": 638 + }, + { + "epoch": 0.3021276595744681, + "grad_norm": 3.2245471477508545, + "learning_rate": 
4.97742222971078e-06, + "loss": 0.6895, + "step": 639 + }, + { + "epoch": 0.30260047281323876, + "grad_norm": 3.559006452560425, + "learning_rate": 4.977338502396882e-06, + "loss": 0.7439, + "step": 640 + }, + { + "epoch": 0.30307328605200945, + "grad_norm": 2.9116289615631104, + "learning_rate": 4.9772546208299795e-06, + "loss": 0.6907, + "step": 641 + }, + { + "epoch": 0.30354609929078014, + "grad_norm": 3.3645524978637695, + "learning_rate": 4.977170585015295e-06, + "loss": 0.6983, + "step": 642 + }, + { + "epoch": 0.3040189125295508, + "grad_norm": 3.080148458480835, + "learning_rate": 4.977086394958058e-06, + "loss": 0.7016, + "step": 643 + }, + { + "epoch": 0.3044917257683215, + "grad_norm": 2.9276750087738037, + "learning_rate": 4.977002050663515e-06, + "loss": 0.6509, + "step": 644 + }, + { + "epoch": 0.3049645390070922, + "grad_norm": 3.183609962463379, + "learning_rate": 4.976917552136914e-06, + "loss": 0.6814, + "step": 645 + }, + { + "epoch": 0.3054373522458629, + "grad_norm": 3.0980000495910645, + "learning_rate": 4.976832899383519e-06, + "loss": 0.6319, + "step": 646 + }, + { + "epoch": 0.30591016548463357, + "grad_norm": 3.211376190185547, + "learning_rate": 4.9767480924086e-06, + "loss": 0.6365, + "step": 647 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 3.214430093765259, + "learning_rate": 4.976663131217437e-06, + "loss": 0.6006, + "step": 648 + }, + { + "epoch": 0.30685579196217494, + "grad_norm": 3.0914318561553955, + "learning_rate": 4.976578015815321e-06, + "loss": 0.7162, + "step": 649 + }, + { + "epoch": 0.3073286052009456, + "grad_norm": 2.7644500732421875, + "learning_rate": 4.976492746207551e-06, + "loss": 0.6045, + "step": 650 + }, + { + "epoch": 0.3078014184397163, + "grad_norm": 3.1913280487060547, + "learning_rate": 4.9764073223994374e-06, + "loss": 0.6796, + "step": 651 + }, + { + "epoch": 0.308274231678487, + "grad_norm": 2.8919692039489746, + "learning_rate": 4.976321744396299e-06, + "loss": 0.6683, + "step": 652 + }, + { + "epoch": 0.3087470449172577, + "grad_norm": 2.862234115600586, + "learning_rate": 4.976236012203463e-06, + "loss": 0.6631, + "step": 653 + }, + { + "epoch": 0.30921985815602837, + "grad_norm": 2.9708092212677, + "learning_rate": 4.976150125826268e-06, + "loss": 0.6326, + "step": 654 + }, + { + "epoch": 0.30969267139479906, + "grad_norm": 2.892465353012085, + "learning_rate": 4.976064085270063e-06, + "loss": 0.6574, + "step": 655 + }, + { + "epoch": 0.31016548463356974, + "grad_norm": 3.9215126037597656, + "learning_rate": 4.975977890540205e-06, + "loss": 0.7351, + "step": 656 + }, + { + "epoch": 0.31063829787234043, + "grad_norm": 2.9544081687927246, + "learning_rate": 4.975891541642059e-06, + "loss": 0.7264, + "step": 657 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 2.995035409927368, + "learning_rate": 4.975805038581005e-06, + "loss": 0.7405, + "step": 658 + }, + { + "epoch": 0.3115839243498818, + "grad_norm": 2.9653120040893555, + "learning_rate": 4.975718381362427e-06, + "loss": 0.679, + "step": 659 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 2.93976092338562, + "learning_rate": 4.9756315699917205e-06, + "loss": 0.627, + "step": 660 + }, + { + "epoch": 0.3125295508274232, + "grad_norm": 3.106522560119629, + "learning_rate": 4.9755446044742915e-06, + "loss": 0.6329, + "step": 661 + }, + { + "epoch": 0.31300236406619386, + "grad_norm": 3.0238280296325684, + "learning_rate": 4.975457484815554e-06, + "loss": 0.6643, + "step": 662 + }, + { + "epoch": 0.31347517730496455, + "grad_norm": 2.943528175354004, + 
"learning_rate": 4.9753702110209356e-06, + "loss": 0.668, + "step": 663 + }, + { + "epoch": 0.31394799054373523, + "grad_norm": 2.6840121746063232, + "learning_rate": 4.9752827830958676e-06, + "loss": 0.5482, + "step": 664 + }, + { + "epoch": 0.3144208037825059, + "grad_norm": 2.823875904083252, + "learning_rate": 4.975195201045794e-06, + "loss": 0.7017, + "step": 665 + }, + { + "epoch": 0.3148936170212766, + "grad_norm": 3.148181200027466, + "learning_rate": 4.975107464876168e-06, + "loss": 0.747, + "step": 666 + }, + { + "epoch": 0.3153664302600473, + "grad_norm": 2.630584478378296, + "learning_rate": 4.9750195745924545e-06, + "loss": 0.5987, + "step": 667 + }, + { + "epoch": 0.315839243498818, + "grad_norm": 3.075866460800171, + "learning_rate": 4.974931530200124e-06, + "loss": 0.664, + "step": 668 + }, + { + "epoch": 0.31631205673758866, + "grad_norm": 2.947197914123535, + "learning_rate": 4.974843331704659e-06, + "loss": 0.631, + "step": 669 + }, + { + "epoch": 0.31678486997635935, + "grad_norm": 3.519646644592285, + "learning_rate": 4.974754979111552e-06, + "loss": 0.7154, + "step": 670 + }, + { + "epoch": 0.31725768321513004, + "grad_norm": 2.8687186241149902, + "learning_rate": 4.974666472426305e-06, + "loss": 0.6366, + "step": 671 + }, + { + "epoch": 0.3177304964539007, + "grad_norm": 2.6966612339019775, + "learning_rate": 4.974577811654426e-06, + "loss": 0.7112, + "step": 672 + }, + { + "epoch": 0.3182033096926714, + "grad_norm": 3.1390228271484375, + "learning_rate": 4.974488996801439e-06, + "loss": 0.6882, + "step": 673 + }, + { + "epoch": 0.3186761229314421, + "grad_norm": 3.4667599201202393, + "learning_rate": 4.974400027872871e-06, + "loss": 0.7153, + "step": 674 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 2.9632184505462646, + "learning_rate": 4.974310904874265e-06, + "loss": 0.7081, + "step": 675 + }, + { + "epoch": 0.31962174940898347, + "grad_norm": 3.46150279045105, + "learning_rate": 4.9742216278111666e-06, + "loss": 0.6242, + "step": 676 + }, + { + "epoch": 0.32009456264775416, + "grad_norm": 3.380403757095337, + "learning_rate": 4.974132196689137e-06, + "loss": 0.6863, + "step": 677 + }, + { + "epoch": 0.32056737588652484, + "grad_norm": 3.4279606342315674, + "learning_rate": 4.974042611513746e-06, + "loss": 0.6388, + "step": 678 + }, + { + "epoch": 0.3210401891252955, + "grad_norm": 2.634523391723633, + "learning_rate": 4.973952872290568e-06, + "loss": 0.6038, + "step": 679 + }, + { + "epoch": 0.3215130023640662, + "grad_norm": 3.19693922996521, + "learning_rate": 4.973862979025194e-06, + "loss": 0.6383, + "step": 680 + }, + { + "epoch": 0.3219858156028369, + "grad_norm": 3.437692165374756, + "learning_rate": 4.973772931723218e-06, + "loss": 0.7288, + "step": 681 + }, + { + "epoch": 0.3224586288416076, + "grad_norm": 2.506301164627075, + "learning_rate": 4.97368273039025e-06, + "loss": 0.5707, + "step": 682 + }, + { + "epoch": 0.3229314420803783, + "grad_norm": 3.0942845344543457, + "learning_rate": 4.9735923750319044e-06, + "loss": 0.6348, + "step": 683 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 3.0889835357666016, + "learning_rate": 4.973501865653809e-06, + "loss": 0.6697, + "step": 684 + }, + { + "epoch": 0.32387706855791965, + "grad_norm": 3.0391931533813477, + "learning_rate": 4.973411202261598e-06, + "loss": 0.7091, + "step": 685 + }, + { + "epoch": 0.32434988179669033, + "grad_norm": 3.0333497524261475, + "learning_rate": 4.973320384860917e-06, + "loss": 0.6403, + "step": 686 + }, + { + "epoch": 0.324822695035461, + "grad_norm": 
2.9714622497558594, + "learning_rate": 4.973229413457421e-06, + "loss": 0.6977, + "step": 687 + }, + { + "epoch": 0.3252955082742317, + "grad_norm": 3.057558298110962, + "learning_rate": 4.973138288056774e-06, + "loss": 0.7236, + "step": 688 + }, + { + "epoch": 0.3257683215130024, + "grad_norm": 2.921093463897705, + "learning_rate": 4.97304700866465e-06, + "loss": 0.576, + "step": 689 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 3.0287256240844727, + "learning_rate": 4.972955575286732e-06, + "loss": 0.7077, + "step": 690 + }, + { + "epoch": 0.32671394799054376, + "grad_norm": 2.8621346950531006, + "learning_rate": 4.972863987928716e-06, + "loss": 0.6952, + "step": 691 + }, + { + "epoch": 0.3271867612293144, + "grad_norm": 2.631359100341797, + "learning_rate": 4.9727722465963006e-06, + "loss": 0.6931, + "step": 692 + }, + { + "epoch": 0.3276595744680851, + "grad_norm": 2.8484320640563965, + "learning_rate": 4.972680351295201e-06, + "loss": 0.6292, + "step": 693 + }, + { + "epoch": 0.32813238770685577, + "grad_norm": 2.593001365661621, + "learning_rate": 4.972588302031138e-06, + "loss": 0.5942, + "step": 694 + }, + { + "epoch": 0.32860520094562645, + "grad_norm": 2.6321065425872803, + "learning_rate": 4.972496098809844e-06, + "loss": 0.65, + "step": 695 + }, + { + "epoch": 0.32907801418439714, + "grad_norm": 3.2516732215881348, + "learning_rate": 4.972403741637059e-06, + "loss": 0.7385, + "step": 696 + }, + { + "epoch": 0.3295508274231678, + "grad_norm": 3.180854320526123, + "learning_rate": 4.972311230518535e-06, + "loss": 0.6569, + "step": 697 + }, + { + "epoch": 0.3300236406619385, + "grad_norm": 4.161016941070557, + "learning_rate": 4.972218565460031e-06, + "loss": 0.6416, + "step": 698 + }, + { + "epoch": 0.3304964539007092, + "grad_norm": 3.153897762298584, + "learning_rate": 4.972125746467317e-06, + "loss": 0.7196, + "step": 699 + }, + { + "epoch": 0.3309692671394799, + "grad_norm": 2.9595556259155273, + "learning_rate": 4.972032773546173e-06, + "loss": 0.7093, + "step": 700 + }, + { + "epoch": 0.33144208037825057, + "grad_norm": 3.1086833477020264, + "learning_rate": 4.9719396467023875e-06, + "loss": 0.6963, + "step": 701 + }, + { + "epoch": 0.33191489361702126, + "grad_norm": 2.958921432495117, + "learning_rate": 4.971846365941759e-06, + "loss": 0.6518, + "step": 702 + }, + { + "epoch": 0.33238770685579194, + "grad_norm": 2.8745479583740234, + "learning_rate": 4.971752931270096e-06, + "loss": 0.696, + "step": 703 + }, + { + "epoch": 0.33286052009456263, + "grad_norm": 3.224358558654785, + "learning_rate": 4.971659342693217e-06, + "loss": 0.6769, + "step": 704 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.696319580078125, + "learning_rate": 4.9715656002169486e-06, + "loss": 0.6833, + "step": 705 + }, + { + "epoch": 0.333806146572104, + "grad_norm": 2.9283502101898193, + "learning_rate": 4.971471703847127e-06, + "loss": 0.6784, + "step": 706 + }, + { + "epoch": 0.3342789598108747, + "grad_norm": 2.654914140701294, + "learning_rate": 4.9713776535896e-06, + "loss": 0.6337, + "step": 707 + }, + { + "epoch": 0.3347517730496454, + "grad_norm": 3.041555643081665, + "learning_rate": 4.971283449450224e-06, + "loss": 0.6227, + "step": 708 + }, + { + "epoch": 0.33522458628841606, + "grad_norm": 2.893008232116699, + "learning_rate": 4.971189091434863e-06, + "loss": 0.655, + "step": 709 + }, + { + "epoch": 0.33569739952718675, + "grad_norm": 2.8806653022766113, + "learning_rate": 4.971094579549393e-06, + "loss": 0.7077, + "step": 710 + }, + { + "epoch": 0.33617021276595743, + 
"grad_norm": 3.4830048084259033, + "learning_rate": 4.9709999137996986e-06, + "loss": 0.7461, + "step": 711 + }, + { + "epoch": 0.3366430260047281, + "grad_norm": 3.155444860458374, + "learning_rate": 4.970905094191674e-06, + "loss": 0.652, + "step": 712 + }, + { + "epoch": 0.3371158392434988, + "grad_norm": 2.7608706951141357, + "learning_rate": 4.970810120731225e-06, + "loss": 0.684, + "step": 713 + }, + { + "epoch": 0.3375886524822695, + "grad_norm": 2.8209474086761475, + "learning_rate": 4.970714993424265e-06, + "loss": 0.6009, + "step": 714 + }, + { + "epoch": 0.3380614657210402, + "grad_norm": 3.6532654762268066, + "learning_rate": 4.9706197122767145e-06, + "loss": 0.702, + "step": 715 + }, + { + "epoch": 0.33853427895981086, + "grad_norm": 2.6276566982269287, + "learning_rate": 4.970524277294508e-06, + "loss": 0.6338, + "step": 716 + }, + { + "epoch": 0.33900709219858155, + "grad_norm": 3.509871482849121, + "learning_rate": 4.970428688483589e-06, + "loss": 0.6853, + "step": 717 + }, + { + "epoch": 0.33947990543735224, + "grad_norm": 5.332682132720947, + "learning_rate": 4.970332945849906e-06, + "loss": 0.6684, + "step": 718 + }, + { + "epoch": 0.3399527186761229, + "grad_norm": 2.718801975250244, + "learning_rate": 4.970237049399424e-06, + "loss": 0.6676, + "step": 719 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 3.891003131866455, + "learning_rate": 4.970140999138112e-06, + "loss": 0.7043, + "step": 720 + }, + { + "epoch": 0.3408983451536643, + "grad_norm": 2.8863155841827393, + "learning_rate": 4.970044795071951e-06, + "loss": 0.6563, + "step": 721 + }, + { + "epoch": 0.341371158392435, + "grad_norm": 3.2527518272399902, + "learning_rate": 4.969948437206932e-06, + "loss": 0.7244, + "step": 722 + }, + { + "epoch": 0.34184397163120567, + "grad_norm": 2.9726758003234863, + "learning_rate": 4.969851925549054e-06, + "loss": 0.6548, + "step": 723 + }, + { + "epoch": 0.34231678486997635, + "grad_norm": 3.118309497833252, + "learning_rate": 4.969755260104327e-06, + "loss": 0.7293, + "step": 724 + }, + { + "epoch": 0.34278959810874704, + "grad_norm": 3.373068332672119, + "learning_rate": 4.969658440878769e-06, + "loss": 0.6444, + "step": 725 + }, + { + "epoch": 0.3432624113475177, + "grad_norm": 2.7157437801361084, + "learning_rate": 4.969561467878409e-06, + "loss": 0.642, + "step": 726 + }, + { + "epoch": 0.3437352245862884, + "grad_norm": 2.58929705619812, + "learning_rate": 4.969464341109285e-06, + "loss": 0.6165, + "step": 727 + }, + { + "epoch": 0.3442080378250591, + "grad_norm": 2.8811306953430176, + "learning_rate": 4.969367060577445e-06, + "loss": 0.7127, + "step": 728 + }, + { + "epoch": 0.3446808510638298, + "grad_norm": 3.494358539581299, + "learning_rate": 4.969269626288946e-06, + "loss": 0.7103, + "step": 729 + }, + { + "epoch": 0.34515366430260047, + "grad_norm": 2.9753928184509277, + "learning_rate": 4.969172038249855e-06, + "loss": 0.6911, + "step": 730 + }, + { + "epoch": 0.34562647754137116, + "grad_norm": 3.2885913848876953, + "learning_rate": 4.969074296466247e-06, + "loss": 0.6968, + "step": 731 + }, + { + "epoch": 0.34609929078014184, + "grad_norm": 2.7564568519592285, + "learning_rate": 4.968976400944211e-06, + "loss": 0.6843, + "step": 732 + }, + { + "epoch": 0.34657210401891253, + "grad_norm": 2.9255006313323975, + "learning_rate": 4.96887835168984e-06, + "loss": 0.6024, + "step": 733 + }, + { + "epoch": 0.3470449172576832, + "grad_norm": 3.1808290481567383, + "learning_rate": 4.968780148709239e-06, + "loss": 0.7377, + "step": 734 + }, + { + "epoch": 
0.3475177304964539, + "grad_norm": 2.956666946411133, + "learning_rate": 4.968681792008523e-06, + "loss": 0.65, + "step": 735 + }, + { + "epoch": 0.3479905437352246, + "grad_norm": 2.9631855487823486, + "learning_rate": 4.9685832815938175e-06, + "loss": 0.677, + "step": 736 + }, + { + "epoch": 0.3484633569739953, + "grad_norm": 2.501917600631714, + "learning_rate": 4.968484617471256e-06, + "loss": 0.6282, + "step": 737 + }, + { + "epoch": 0.34893617021276596, + "grad_norm": 2.750779628753662, + "learning_rate": 4.968385799646981e-06, + "loss": 0.6507, + "step": 738 + }, + { + "epoch": 0.34940898345153665, + "grad_norm": 2.872300624847412, + "learning_rate": 4.968286828127146e-06, + "loss": 0.5949, + "step": 739 + }, + { + "epoch": 0.34988179669030733, + "grad_norm": 2.6316142082214355, + "learning_rate": 4.9681877029179124e-06, + "loss": 0.6328, + "step": 740 + }, + { + "epoch": 0.350354609929078, + "grad_norm": 3.244364023208618, + "learning_rate": 4.968088424025454e-06, + "loss": 0.7393, + "step": 741 + }, + { + "epoch": 0.3508274231678487, + "grad_norm": 2.620465040206909, + "learning_rate": 4.967988991455951e-06, + "loss": 0.6797, + "step": 742 + }, + { + "epoch": 0.3513002364066194, + "grad_norm": 2.854513645172119, + "learning_rate": 4.967889405215596e-06, + "loss": 0.6368, + "step": 743 + }, + { + "epoch": 0.3517730496453901, + "grad_norm": 2.579854726791382, + "learning_rate": 4.9677896653105886e-06, + "loss": 0.6489, + "step": 744 + }, + { + "epoch": 0.35224586288416077, + "grad_norm": 3.0697381496429443, + "learning_rate": 4.96768977174714e-06, + "loss": 0.6313, + "step": 745 + }, + { + "epoch": 0.35271867612293145, + "grad_norm": 3.369338035583496, + "learning_rate": 4.96758972453147e-06, + "loss": 0.7416, + "step": 746 + }, + { + "epoch": 0.35319148936170214, + "grad_norm": 2.836221933364868, + "learning_rate": 4.967489523669807e-06, + "loss": 0.6422, + "step": 747 + }, + { + "epoch": 0.3536643026004728, + "grad_norm": 2.929579496383667, + "learning_rate": 4.967389169168392e-06, + "loss": 0.6482, + "step": 748 + }, + { + "epoch": 0.3541371158392435, + "grad_norm": 2.9243831634521484, + "learning_rate": 4.967288661033472e-06, + "loss": 0.5813, + "step": 749 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 3.7555336952209473, + "learning_rate": 4.967187999271306e-06, + "loss": 0.6501, + "step": 750 + }, + { + "epoch": 0.3550827423167849, + "grad_norm": 3.4279143810272217, + "learning_rate": 4.9670871838881615e-06, + "loss": 0.6326, + "step": 751 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 2.875066041946411, + "learning_rate": 4.9669862148903166e-06, + "loss": 0.664, + "step": 752 + }, + { + "epoch": 0.35602836879432626, + "grad_norm": 3.130394697189331, + "learning_rate": 4.966885092284057e-06, + "loss": 0.706, + "step": 753 + }, + { + "epoch": 0.35650118203309694, + "grad_norm": 2.9606287479400635, + "learning_rate": 4.96678381607568e-06, + "loss": 0.693, + "step": 754 + }, + { + "epoch": 0.35697399527186763, + "grad_norm": 3.0584909915924072, + "learning_rate": 4.966682386271491e-06, + "loss": 0.6034, + "step": 755 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 2.8215200901031494, + "learning_rate": 4.966580802877805e-06, + "loss": 0.6217, + "step": 756 + }, + { + "epoch": 0.357919621749409, + "grad_norm": 2.7348055839538574, + "learning_rate": 4.966479065900949e-06, + "loss": 0.6194, + "step": 757 + }, + { + "epoch": 0.3583924349881797, + "grad_norm": 3.2347466945648193, + "learning_rate": 4.966377175347257e-06, + "loss": 0.6377, + "step": 758 + }, + 
{ + "epoch": 0.3588652482269504, + "grad_norm": 3.311845302581787, + "learning_rate": 4.966275131223072e-06, + "loss": 0.6234, + "step": 759 + }, + { + "epoch": 0.35933806146572106, + "grad_norm": 3.0384368896484375, + "learning_rate": 4.96617293353475e-06, + "loss": 0.609, + "step": 760 + }, + { + "epoch": 0.35981087470449175, + "grad_norm": 3.516854763031006, + "learning_rate": 4.966070582288653e-06, + "loss": 0.6627, + "step": 761 + }, + { + "epoch": 0.36028368794326243, + "grad_norm": 3.2425215244293213, + "learning_rate": 4.9659680774911534e-06, + "loss": 0.7355, + "step": 762 + }, + { + "epoch": 0.3607565011820331, + "grad_norm": 3.2665750980377197, + "learning_rate": 4.965865419148636e-06, + "loss": 0.6787, + "step": 763 + }, + { + "epoch": 0.3612293144208038, + "grad_norm": 2.729428291320801, + "learning_rate": 4.96576260726749e-06, + "loss": 0.6272, + "step": 764 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 3.299969434738159, + "learning_rate": 4.965659641854119e-06, + "loss": 0.6552, + "step": 765 + }, + { + "epoch": 0.3621749408983452, + "grad_norm": 2.7090916633605957, + "learning_rate": 4.965556522914934e-06, + "loss": 0.6661, + "step": 766 + }, + { + "epoch": 0.36264775413711586, + "grad_norm": 2.488846778869629, + "learning_rate": 4.965453250456355e-06, + "loss": 0.5821, + "step": 767 + }, + { + "epoch": 0.36312056737588655, + "grad_norm": 2.5267233848571777, + "learning_rate": 4.965349824484813e-06, + "loss": 0.5593, + "step": 768 + }, + { + "epoch": 0.36359338061465724, + "grad_norm": 3.0646679401397705, + "learning_rate": 4.965246245006748e-06, + "loss": 0.6341, + "step": 769 + }, + { + "epoch": 0.3640661938534279, + "grad_norm": 2.9877712726593018, + "learning_rate": 4.965142512028609e-06, + "loss": 0.7202, + "step": 770 + }, + { + "epoch": 0.3645390070921986, + "grad_norm": 3.7494113445281982, + "learning_rate": 4.965038625556854e-06, + "loss": 0.7643, + "step": 771 + }, + { + "epoch": 0.3650118203309693, + "grad_norm": 2.8382890224456787, + "learning_rate": 4.964934585597954e-06, + "loss": 0.6522, + "step": 772 + }, + { + "epoch": 0.3654846335697399, + "grad_norm": 3.091655731201172, + "learning_rate": 4.9648303921583854e-06, + "loss": 0.7117, + "step": 773 + }, + { + "epoch": 0.3659574468085106, + "grad_norm": 3.0608325004577637, + "learning_rate": 4.964726045244635e-06, + "loss": 0.6538, + "step": 774 + }, + { + "epoch": 0.3664302600472813, + "grad_norm": 2.8492867946624756, + "learning_rate": 4.964621544863203e-06, + "loss": 0.6079, + "step": 775 + }, + { + "epoch": 0.366903073286052, + "grad_norm": 3.0669894218444824, + "learning_rate": 4.964516891020594e-06, + "loss": 0.6223, + "step": 776 + }, + { + "epoch": 0.36737588652482267, + "grad_norm": 3.089984893798828, + "learning_rate": 4.964412083723325e-06, + "loss": 0.671, + "step": 777 + }, + { + "epoch": 0.36784869976359336, + "grad_norm": 2.905242443084717, + "learning_rate": 4.964307122977921e-06, + "loss": 0.62, + "step": 778 + }, + { + "epoch": 0.36832151300236404, + "grad_norm": 3.954436779022217, + "learning_rate": 4.964202008790918e-06, + "loss": 0.6535, + "step": 779 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 2.6026058197021484, + "learning_rate": 4.9640967411688615e-06, + "loss": 0.5865, + "step": 780 + }, + { + "epoch": 0.3692671394799054, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.963991320118306e-06, + "loss": 0.6698, + "step": 781 + }, + { + "epoch": 0.3697399527186761, + "grad_norm": 2.9411263465881348, + "learning_rate": 4.963885745645815e-06, + "loss": 0.6173, + 
"step": 782 + }, + { + "epoch": 0.3702127659574468, + "grad_norm": 2.5679805278778076, + "learning_rate": 4.963780017757962e-06, + "loss": 0.6285, + "step": 783 + }, + { + "epoch": 0.3706855791962175, + "grad_norm": 3.3100640773773193, + "learning_rate": 4.963674136461332e-06, + "loss": 0.5968, + "step": 784 + }, + { + "epoch": 0.37115839243498816, + "grad_norm": 3.1293699741363525, + "learning_rate": 4.963568101762515e-06, + "loss": 0.697, + "step": 785 + }, + { + "epoch": 0.37163120567375885, + "grad_norm": 3.043853759765625, + "learning_rate": 4.963461913668115e-06, + "loss": 0.5881, + "step": 786 + }, + { + "epoch": 0.37210401891252953, + "grad_norm": 3.07351016998291, + "learning_rate": 4.963355572184744e-06, + "loss": 0.6307, + "step": 787 + }, + { + "epoch": 0.3725768321513002, + "grad_norm": 2.7381317615509033, + "learning_rate": 4.9632490773190225e-06, + "loss": 0.716, + "step": 788 + }, + { + "epoch": 0.3730496453900709, + "grad_norm": 2.892221450805664, + "learning_rate": 4.963142429077582e-06, + "loss": 0.6867, + "step": 789 + }, + { + "epoch": 0.3735224586288416, + "grad_norm": 3.133122205734253, + "learning_rate": 4.963035627467064e-06, + "loss": 0.659, + "step": 790 + }, + { + "epoch": 0.3739952718676123, + "grad_norm": 3.032599925994873, + "learning_rate": 4.962928672494116e-06, + "loss": 0.6848, + "step": 791 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 3.0076355934143066, + "learning_rate": 4.9628215641654e-06, + "loss": 0.6549, + "step": 792 + }, + { + "epoch": 0.37494089834515365, + "grad_norm": 2.8904454708099365, + "learning_rate": 4.962714302487585e-06, + "loss": 0.6484, + "step": 793 + }, + { + "epoch": 0.37541371158392434, + "grad_norm": 2.881364107131958, + "learning_rate": 4.9626068874673486e-06, + "loss": 0.721, + "step": 794 + }, + { + "epoch": 0.375886524822695, + "grad_norm": 3.11668062210083, + "learning_rate": 4.962499319111379e-06, + "loss": 0.7824, + "step": 795 + }, + { + "epoch": 0.3763593380614657, + "grad_norm": 2.9201436042785645, + "learning_rate": 4.962391597426374e-06, + "loss": 0.6911, + "step": 796 + }, + { + "epoch": 0.3768321513002364, + "grad_norm": 2.926598072052002, + "learning_rate": 4.962283722419043e-06, + "loss": 0.6715, + "step": 797 + }, + { + "epoch": 0.3773049645390071, + "grad_norm": 2.7267675399780273, + "learning_rate": 4.962175694096101e-06, + "loss": 0.6111, + "step": 798 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 3.194031000137329, + "learning_rate": 4.962067512464275e-06, + "loss": 0.6558, + "step": 799 + }, + { + "epoch": 0.37825059101654845, + "grad_norm": 2.6249136924743652, + "learning_rate": 4.9619591775303e-06, + "loss": 0.6166, + "step": 800 + }, + { + "epoch": 0.37872340425531914, + "grad_norm": 2.6356167793273926, + "learning_rate": 4.961850689300923e-06, + "loss": 0.6112, + "step": 801 + }, + { + "epoch": 0.3791962174940898, + "grad_norm": 3.030724287033081, + "learning_rate": 4.961742047782898e-06, + "loss": 0.6511, + "step": 802 + }, + { + "epoch": 0.3796690307328605, + "grad_norm": 3.4987757205963135, + "learning_rate": 4.96163325298299e-06, + "loss": 0.5888, + "step": 803 + }, + { + "epoch": 0.3801418439716312, + "grad_norm": 3.0371780395507812, + "learning_rate": 4.961524304907974e-06, + "loss": 0.6385, + "step": 804 + }, + { + "epoch": 0.3806146572104019, + "grad_norm": 3.302570104598999, + "learning_rate": 4.961415203564632e-06, + "loss": 0.6515, + "step": 805 + }, + { + "epoch": 0.38108747044917257, + "grad_norm": 2.7597038745880127, + "learning_rate": 4.961305948959759e-06, + "loss": 
0.6126, + "step": 806 + }, + { + "epoch": 0.38156028368794326, + "grad_norm": 2.789811849594116, + "learning_rate": 4.9611965411001575e-06, + "loss": 0.6601, + "step": 807 + }, + { + "epoch": 0.38203309692671394, + "grad_norm": 3.0403921604156494, + "learning_rate": 4.961086979992639e-06, + "loss": 0.6947, + "step": 808 + }, + { + "epoch": 0.38250591016548463, + "grad_norm": 3.2139980792999268, + "learning_rate": 4.960977265644026e-06, + "loss": 0.6876, + "step": 809 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.918515205383301, + "learning_rate": 4.960867398061149e-06, + "loss": 0.5997, + "step": 810 + }, + { + "epoch": 0.383451536643026, + "grad_norm": 3.197636604309082, + "learning_rate": 4.9607573772508495e-06, + "loss": 0.5754, + "step": 811 + }, + { + "epoch": 0.3839243498817967, + "grad_norm": 2.8848466873168945, + "learning_rate": 4.960647203219979e-06, + "loss": 0.6424, + "step": 812 + }, + { + "epoch": 0.3843971631205674, + "grad_norm": 3.4810187816619873, + "learning_rate": 4.960536875975397e-06, + "loss": 0.6851, + "step": 813 + }, + { + "epoch": 0.38486997635933806, + "grad_norm": 3.713934898376465, + "learning_rate": 4.960426395523972e-06, + "loss": 0.6122, + "step": 814 + }, + { + "epoch": 0.38534278959810875, + "grad_norm": 2.862600803375244, + "learning_rate": 4.960315761872585e-06, + "loss": 0.6493, + "step": 815 + }, + { + "epoch": 0.38581560283687943, + "grad_norm": 3.133882522583008, + "learning_rate": 4.960204975028123e-06, + "loss": 0.7535, + "step": 816 + }, + { + "epoch": 0.3862884160756501, + "grad_norm": 3.1526732444763184, + "learning_rate": 4.960094034997485e-06, + "loss": 0.6512, + "step": 817 + }, + { + "epoch": 0.3867612293144208, + "grad_norm": 2.7213544845581055, + "learning_rate": 4.959982941787579e-06, + "loss": 0.6121, + "step": 818 + }, + { + "epoch": 0.3872340425531915, + "grad_norm": 3.4935851097106934, + "learning_rate": 4.9598716954053214e-06, + "loss": 0.7852, + "step": 819 + }, + { + "epoch": 0.3877068557919622, + "grad_norm": 2.691016435623169, + "learning_rate": 4.9597602958576395e-06, + "loss": 0.6861, + "step": 820 + }, + { + "epoch": 0.38817966903073287, + "grad_norm": 2.8621015548706055, + "learning_rate": 4.959648743151469e-06, + "loss": 0.6262, + "step": 821 + }, + { + "epoch": 0.38865248226950355, + "grad_norm": 3.3887462615966797, + "learning_rate": 4.959537037293758e-06, + "loss": 0.7103, + "step": 822 + }, + { + "epoch": 0.38912529550827424, + "grad_norm": 2.7565438747406006, + "learning_rate": 4.95942517829146e-06, + "loss": 0.6471, + "step": 823 + }, + { + "epoch": 0.3895981087470449, + "grad_norm": 2.7920358180999756, + "learning_rate": 4.959313166151541e-06, + "loss": 0.6239, + "step": 824 + }, + { + "epoch": 0.3900709219858156, + "grad_norm": 3.18904185295105, + "learning_rate": 4.959201000880973e-06, + "loss": 0.7461, + "step": 825 + }, + { + "epoch": 0.3905437352245863, + "grad_norm": 2.727872371673584, + "learning_rate": 4.959088682486743e-06, + "loss": 0.6333, + "step": 826 + }, + { + "epoch": 0.391016548463357, + "grad_norm": 2.906378746032715, + "learning_rate": 4.958976210975844e-06, + "loss": 0.7547, + "step": 827 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 2.96482515335083, + "learning_rate": 4.958863586355278e-06, + "loss": 0.6312, + "step": 828 + }, + { + "epoch": 0.39196217494089836, + "grad_norm": 3.2890889644622803, + "learning_rate": 4.958750808632059e-06, + "loss": 0.6943, + "step": 829 + }, + { + "epoch": 0.39243498817966904, + "grad_norm": 2.7004311084747314, + "learning_rate": 
4.958637877813207e-06, + "loss": 0.5918, + "step": 830 + }, + { + "epoch": 0.39290780141843973, + "grad_norm": 2.7487950325012207, + "learning_rate": 4.9585247939057566e-06, + "loss": 0.6201, + "step": 831 + }, + { + "epoch": 0.3933806146572104, + "grad_norm": 2.7873897552490234, + "learning_rate": 4.958411556916747e-06, + "loss": 0.6268, + "step": 832 + }, + { + "epoch": 0.3938534278959811, + "grad_norm": 2.8501343727111816, + "learning_rate": 4.958298166853229e-06, + "loss": 0.7119, + "step": 833 + }, + { + "epoch": 0.3943262411347518, + "grad_norm": 3.0391547679901123, + "learning_rate": 4.958184623722265e-06, + "loss": 0.6375, + "step": 834 + }, + { + "epoch": 0.3947990543735225, + "grad_norm": 2.850520133972168, + "learning_rate": 4.958070927530922e-06, + "loss": 0.5962, + "step": 835 + }, + { + "epoch": 0.39527186761229316, + "grad_norm": 3.351914644241333, + "learning_rate": 4.957957078286281e-06, + "loss": 0.7247, + "step": 836 + }, + { + "epoch": 0.39574468085106385, + "grad_norm": 2.9559543132781982, + "learning_rate": 4.957843075995431e-06, + "loss": 0.6571, + "step": 837 + }, + { + "epoch": 0.39621749408983453, + "grad_norm": 3.225785255432129, + "learning_rate": 4.95772892066547e-06, + "loss": 0.7074, + "step": 838 + }, + { + "epoch": 0.3966903073286052, + "grad_norm": 2.7842373847961426, + "learning_rate": 4.957614612303505e-06, + "loss": 0.6469, + "step": 839 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 4.249724864959717, + "learning_rate": 4.957500150916655e-06, + "loss": 0.741, + "step": 840 + }, + { + "epoch": 0.3976359338061466, + "grad_norm": 3.138221263885498, + "learning_rate": 4.957385536512046e-06, + "loss": 0.6676, + "step": 841 + }, + { + "epoch": 0.3981087470449173, + "grad_norm": 3.456423759460449, + "learning_rate": 4.957270769096816e-06, + "loss": 0.6877, + "step": 842 + }, + { + "epoch": 0.39858156028368796, + "grad_norm": 2.8676278591156006, + "learning_rate": 4.957155848678109e-06, + "loss": 0.5986, + "step": 843 + }, + { + "epoch": 0.39905437352245865, + "grad_norm": 2.705324411392212, + "learning_rate": 4.957040775263082e-06, + "loss": 0.6356, + "step": 844 + }, + { + "epoch": 0.39952718676122934, + "grad_norm": 3.0767486095428467, + "learning_rate": 4.9569255488589e-06, + "loss": 0.6844, + "step": 845 + }, + { + "epoch": 0.4, + "grad_norm": 2.7787704467773438, + "learning_rate": 4.956810169472736e-06, + "loss": 0.6641, + "step": 846 + }, + { + "epoch": 0.4004728132387707, + "grad_norm": 2.584277868270874, + "learning_rate": 4.956694637111777e-06, + "loss": 0.6256, + "step": 847 + }, + { + "epoch": 0.4009456264775414, + "grad_norm": 2.751641273498535, + "learning_rate": 4.956578951783215e-06, + "loss": 0.5954, + "step": 848 + }, + { + "epoch": 0.4014184397163121, + "grad_norm": 3.0181658267974854, + "learning_rate": 4.956463113494253e-06, + "loss": 0.6569, + "step": 849 + }, + { + "epoch": 0.40189125295508277, + "grad_norm": 3.0933220386505127, + "learning_rate": 4.956347122252104e-06, + "loss": 0.6248, + "step": 850 + }, + { + "epoch": 0.40236406619385345, + "grad_norm": 3.3767428398132324, + "learning_rate": 4.956230978063991e-06, + "loss": 0.719, + "step": 851 + }, + { + "epoch": 0.40283687943262414, + "grad_norm": 3.7666573524475098, + "learning_rate": 4.956114680937145e-06, + "loss": 0.6467, + "step": 852 + }, + { + "epoch": 0.4033096926713948, + "grad_norm": 2.9836843013763428, + "learning_rate": 4.955998230878808e-06, + "loss": 0.6993, + "step": 853 + }, + { + "epoch": 0.4037825059101655, + "grad_norm": 2.981497049331665, + 
"learning_rate": 4.955881627896229e-06, + "loss": 0.6578, + "step": 854 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 3.1369056701660156, + "learning_rate": 4.955764871996672e-06, + "loss": 0.6763, + "step": 855 + }, + { + "epoch": 0.40472813238770683, + "grad_norm": 2.7675817012786865, + "learning_rate": 4.9556479631874036e-06, + "loss": 0.6488, + "step": 856 + }, + { + "epoch": 0.4052009456264775, + "grad_norm": 3.035334825515747, + "learning_rate": 4.9555309014757034e-06, + "loss": 0.7076, + "step": 857 + }, + { + "epoch": 0.4056737588652482, + "grad_norm": 3.493704319000244, + "learning_rate": 4.955413686868862e-06, + "loss": 0.6773, + "step": 858 + }, + { + "epoch": 0.4061465721040189, + "grad_norm": 3.245487928390503, + "learning_rate": 4.9552963193741765e-06, + "loss": 0.6915, + "step": 859 + }, + { + "epoch": 0.4066193853427896, + "grad_norm": 3.189969539642334, + "learning_rate": 4.955178798998956e-06, + "loss": 0.7318, + "step": 860 + }, + { + "epoch": 0.40709219858156026, + "grad_norm": 2.7987146377563477, + "learning_rate": 4.955061125750517e-06, + "loss": 0.6162, + "step": 861 + }, + { + "epoch": 0.40756501182033095, + "grad_norm": 3.020118474960327, + "learning_rate": 4.954943299636187e-06, + "loss": 0.6678, + "step": 862 + }, + { + "epoch": 0.40803782505910163, + "grad_norm": 2.715463876724243, + "learning_rate": 4.954825320663302e-06, + "loss": 0.668, + "step": 863 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 2.595050096511841, + "learning_rate": 4.9547071888392085e-06, + "loss": 0.6557, + "step": 864 + }, + { + "epoch": 0.408983451536643, + "grad_norm": 3.131596088409424, + "learning_rate": 4.954588904171261e-06, + "loss": 0.6548, + "step": 865 + }, + { + "epoch": 0.4094562647754137, + "grad_norm": 2.5742313861846924, + "learning_rate": 4.954470466666827e-06, + "loss": 0.6592, + "step": 866 + }, + { + "epoch": 0.4099290780141844, + "grad_norm": 2.8612802028656006, + "learning_rate": 4.9543518763332785e-06, + "loss": 0.5391, + "step": 867 + }, + { + "epoch": 0.41040189125295506, + "grad_norm": 2.8973186016082764, + "learning_rate": 4.954233133178001e-06, + "loss": 0.6649, + "step": 868 + }, + { + "epoch": 0.41087470449172575, + "grad_norm": 2.802525043487549, + "learning_rate": 4.954114237208388e-06, + "loss": 0.6212, + "step": 869 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 2.5919506549835205, + "learning_rate": 4.953995188431843e-06, + "loss": 0.6596, + "step": 870 + }, + { + "epoch": 0.4118203309692671, + "grad_norm": 3.139169454574585, + "learning_rate": 4.953875986855777e-06, + "loss": 0.6799, + "step": 871 + }, + { + "epoch": 0.4122931442080378, + "grad_norm": 3.99727725982666, + "learning_rate": 4.953756632487614e-06, + "loss": 0.6519, + "step": 872 + }, + { + "epoch": 0.4127659574468085, + "grad_norm": 3.238706350326538, + "learning_rate": 4.953637125334784e-06, + "loss": 0.7361, + "step": 873 + }, + { + "epoch": 0.4132387706855792, + "grad_norm": 2.780019998550415, + "learning_rate": 4.9535174654047295e-06, + "loss": 0.6406, + "step": 874 + }, + { + "epoch": 0.41371158392434987, + "grad_norm": 2.7629551887512207, + "learning_rate": 4.953397652704901e-06, + "loss": 0.6131, + "step": 875 + }, + { + "epoch": 0.41418439716312055, + "grad_norm": 2.8008246421813965, + "learning_rate": 4.9532776872427585e-06, + "loss": 0.6464, + "step": 876 + }, + { + "epoch": 0.41465721040189124, + "grad_norm": 3.0970115661621094, + "learning_rate": 4.953157569025772e-06, + "loss": 0.7066, + "step": 877 + }, + { + "epoch": 0.4151300236406619, + 
"grad_norm": 2.8375589847564697, + "learning_rate": 4.9530372980614195e-06, + "loss": 0.6551, + "step": 878 + }, + { + "epoch": 0.4156028368794326, + "grad_norm": 2.718843936920166, + "learning_rate": 4.952916874357191e-06, + "loss": 0.5947, + "step": 879 + }, + { + "epoch": 0.4160756501182033, + "grad_norm": 2.7104697227478027, + "learning_rate": 4.952796297920585e-06, + "loss": 0.6708, + "step": 880 + }, + { + "epoch": 0.416548463356974, + "grad_norm": 2.8223445415496826, + "learning_rate": 4.952675568759108e-06, + "loss": 0.6214, + "step": 881 + }, + { + "epoch": 0.41702127659574467, + "grad_norm": 2.6598153114318848, + "learning_rate": 4.952554686880279e-06, + "loss": 0.6116, + "step": 882 + }, + { + "epoch": 0.41749408983451536, + "grad_norm": 2.8639824390411377, + "learning_rate": 4.952433652291623e-06, + "loss": 0.5971, + "step": 883 + }, + { + "epoch": 0.41796690307328604, + "grad_norm": 2.9578304290771484, + "learning_rate": 4.952312465000677e-06, + "loss": 0.6785, + "step": 884 + }, + { + "epoch": 0.41843971631205673, + "grad_norm": 2.872144937515259, + "learning_rate": 4.952191125014987e-06, + "loss": 0.6772, + "step": 885 + }, + { + "epoch": 0.4189125295508274, + "grad_norm": 2.7513675689697266, + "learning_rate": 4.952069632342108e-06, + "loss": 0.702, + "step": 886 + }, + { + "epoch": 0.4193853427895981, + "grad_norm": 2.9275078773498535, + "learning_rate": 4.951947986989606e-06, + "loss": 0.589, + "step": 887 + }, + { + "epoch": 0.4198581560283688, + "grad_norm": 2.740549325942993, + "learning_rate": 4.951826188965053e-06, + "loss": 0.5942, + "step": 888 + }, + { + "epoch": 0.4203309692671395, + "grad_norm": 2.92452073097229, + "learning_rate": 4.951704238276035e-06, + "loss": 0.6819, + "step": 889 + }, + { + "epoch": 0.42080378250591016, + "grad_norm": 2.842491865158081, + "learning_rate": 4.951582134930144e-06, + "loss": 0.6304, + "step": 890 + }, + { + "epoch": 0.42127659574468085, + "grad_norm": 2.613478422164917, + "learning_rate": 4.951459878934983e-06, + "loss": 0.6912, + "step": 891 + }, + { + "epoch": 0.42174940898345153, + "grad_norm": 3.2408607006073, + "learning_rate": 4.951337470298165e-06, + "loss": 0.6755, + "step": 892 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 3.1022439002990723, + "learning_rate": 4.9512149090273125e-06, + "loss": 0.6138, + "step": 893 + }, + { + "epoch": 0.4226950354609929, + "grad_norm": 2.6418895721435547, + "learning_rate": 4.951092195130055e-06, + "loss": 0.639, + "step": 894 + }, + { + "epoch": 0.4231678486997636, + "grad_norm": 3.010744333267212, + "learning_rate": 4.950969328614035e-06, + "loss": 0.7102, + "step": 895 + }, + { + "epoch": 0.4236406619385343, + "grad_norm": 2.673292636871338, + "learning_rate": 4.950846309486901e-06, + "loss": 0.5676, + "step": 896 + }, + { + "epoch": 0.42411347517730497, + "grad_norm": 3.6974737644195557, + "learning_rate": 4.950723137756314e-06, + "loss": 0.5722, + "step": 897 + }, + { + "epoch": 0.42458628841607565, + "grad_norm": 3.69028902053833, + "learning_rate": 4.9505998134299435e-06, + "loss": 0.6337, + "step": 898 + }, + { + "epoch": 0.42505910165484634, + "grad_norm": 3.2136125564575195, + "learning_rate": 4.950476336515469e-06, + "loss": 0.6469, + "step": 899 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.7396016120910645, + "learning_rate": 4.950352707020577e-06, + "loss": 0.6656, + "step": 900 + }, + { + "epoch": 0.4260047281323877, + "grad_norm": 2.825416088104248, + "learning_rate": 4.950228924952967e-06, + "loss": 0.6298, + "step": 901 + }, + { + "epoch": 
0.4264775413711584, + "grad_norm": 3.401658535003662, + "learning_rate": 4.950104990320345e-06, + "loss": 0.778, + "step": 902 + }, + { + "epoch": 0.4269503546099291, + "grad_norm": 2.7002272605895996, + "learning_rate": 4.9499809031304294e-06, + "loss": 0.6536, + "step": 903 + }, + { + "epoch": 0.42742316784869977, + "grad_norm": 2.62386417388916, + "learning_rate": 4.949856663390945e-06, + "loss": 0.6629, + "step": 904 + }, + { + "epoch": 0.42789598108747046, + "grad_norm": 2.584247589111328, + "learning_rate": 4.94973227110963e-06, + "loss": 0.5813, + "step": 905 + }, + { + "epoch": 0.42836879432624114, + "grad_norm": 3.4365768432617188, + "learning_rate": 4.9496077262942265e-06, + "loss": 0.7648, + "step": 906 + }, + { + "epoch": 0.42884160756501183, + "grad_norm": 2.8993639945983887, + "learning_rate": 4.949483028952492e-06, + "loss": 0.6696, + "step": 907 + }, + { + "epoch": 0.4293144208037825, + "grad_norm": 2.922809362411499, + "learning_rate": 4.94935817909219e-06, + "loss": 0.6892, + "step": 908 + }, + { + "epoch": 0.4297872340425532, + "grad_norm": 2.85478138923645, + "learning_rate": 4.9492331767210944e-06, + "loss": 0.536, + "step": 909 + }, + { + "epoch": 0.4302600472813239, + "grad_norm": 2.8639259338378906, + "learning_rate": 4.949108021846988e-06, + "loss": 0.634, + "step": 910 + }, + { + "epoch": 0.4307328605200946, + "grad_norm": 3.0533697605133057, + "learning_rate": 4.948982714477664e-06, + "loss": 0.6318, + "step": 911 + }, + { + "epoch": 0.43120567375886526, + "grad_norm": 2.331674814224243, + "learning_rate": 4.9488572546209255e-06, + "loss": 0.6562, + "step": 912 + }, + { + "epoch": 0.43167848699763595, + "grad_norm": 3.0154623985290527, + "learning_rate": 4.9487316422845835e-06, + "loss": 0.6675, + "step": 913 + }, + { + "epoch": 0.43215130023640663, + "grad_norm": 2.7354514598846436, + "learning_rate": 4.948605877476459e-06, + "loss": 0.6012, + "step": 914 + }, + { + "epoch": 0.4326241134751773, + "grad_norm": 2.863736629486084, + "learning_rate": 4.948479960204383e-06, + "loss": 0.6062, + "step": 915 + }, + { + "epoch": 0.433096926713948, + "grad_norm": 3.01998233795166, + "learning_rate": 4.948353890476197e-06, + "loss": 0.6749, + "step": 916 + }, + { + "epoch": 0.4335697399527187, + "grad_norm": 2.7550456523895264, + "learning_rate": 4.94822766829975e-06, + "loss": 0.6507, + "step": 917 + }, + { + "epoch": 0.4340425531914894, + "grad_norm": 3.370572805404663, + "learning_rate": 4.948101293682901e-06, + "loss": 0.714, + "step": 918 + }, + { + "epoch": 0.43451536643026006, + "grad_norm": 2.9736790657043457, + "learning_rate": 4.947974766633519e-06, + "loss": 0.729, + "step": 919 + }, + { + "epoch": 0.43498817966903075, + "grad_norm": 3.1036548614501953, + "learning_rate": 4.947848087159483e-06, + "loss": 0.7547, + "step": 920 + }, + { + "epoch": 0.43546099290780144, + "grad_norm": 2.895094871520996, + "learning_rate": 4.947721255268679e-06, + "loss": 0.6089, + "step": 921 + }, + { + "epoch": 0.4359338061465721, + "grad_norm": 2.798476219177246, + "learning_rate": 4.947594270969005e-06, + "loss": 0.5432, + "step": 922 + }, + { + "epoch": 0.4364066193853428, + "grad_norm": 2.7675702571868896, + "learning_rate": 4.94746713426837e-06, + "loss": 0.5693, + "step": 923 + }, + { + "epoch": 0.4368794326241135, + "grad_norm": 2.6851553916931152, + "learning_rate": 4.947339845174687e-06, + "loss": 0.6503, + "step": 924 + }, + { + "epoch": 0.4373522458628842, + "grad_norm": 2.909635543823242, + "learning_rate": 4.947212403695883e-06, + "loss": 0.6494, + "step": 925 + }, + { 
+ "epoch": 0.43782505910165487, + "grad_norm": 2.604526996612549, + "learning_rate": 4.947084809839894e-06, + "loss": 0.6349, + "step": 926 + }, + { + "epoch": 0.43829787234042555, + "grad_norm": 3.118149518966675, + "learning_rate": 4.946957063614664e-06, + "loss": 0.6219, + "step": 927 + }, + { + "epoch": 0.43877068557919624, + "grad_norm": 2.7452616691589355, + "learning_rate": 4.9468291650281465e-06, + "loss": 0.6096, + "step": 928 + }, + { + "epoch": 0.4392434988179669, + "grad_norm": 3.30098819732666, + "learning_rate": 4.946701114088307e-06, + "loss": 0.6277, + "step": 929 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 2.789482593536377, + "learning_rate": 4.946572910803116e-06, + "loss": 0.7, + "step": 930 + }, + { + "epoch": 0.4401891252955083, + "grad_norm": 2.7283935546875, + "learning_rate": 4.946444555180559e-06, + "loss": 0.5375, + "step": 931 + }, + { + "epoch": 0.440661938534279, + "grad_norm": 3.101304054260254, + "learning_rate": 4.946316047228627e-06, + "loss": 0.6131, + "step": 932 + }, + { + "epoch": 0.44113475177304967, + "grad_norm": 3.573908805847168, + "learning_rate": 4.946187386955321e-06, + "loss": 0.7073, + "step": 933 + }, + { + "epoch": 0.44160756501182036, + "grad_norm": 3.214979648590088, + "learning_rate": 4.946058574368653e-06, + "loss": 0.6508, + "step": 934 + }, + { + "epoch": 0.44208037825059104, + "grad_norm": 3.145082712173462, + "learning_rate": 4.945929609476643e-06, + "loss": 0.64, + "step": 935 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 2.991780996322632, + "learning_rate": 4.945800492287321e-06, + "loss": 0.6315, + "step": 936 + }, + { + "epoch": 0.44302600472813236, + "grad_norm": 3.2441139221191406, + "learning_rate": 4.945671222808727e-06, + "loss": 0.7144, + "step": 937 + }, + { + "epoch": 0.44349881796690305, + "grad_norm": 2.9397029876708984, + "learning_rate": 4.94554180104891e-06, + "loss": 0.6818, + "step": 938 + }, + { + "epoch": 0.44397163120567373, + "grad_norm": 3.2471461296081543, + "learning_rate": 4.945412227015929e-06, + "loss": 0.6921, + "step": 939 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 3.0882487297058105, + "learning_rate": 4.945282500717851e-06, + "loss": 0.718, + "step": 940 + }, + { + "epoch": 0.4449172576832151, + "grad_norm": 2.6035783290863037, + "learning_rate": 4.945152622162753e-06, + "loss": 0.621, + "step": 941 + }, + { + "epoch": 0.4453900709219858, + "grad_norm": 2.83659029006958, + "learning_rate": 4.945022591358724e-06, + "loss": 0.6403, + "step": 942 + }, + { + "epoch": 0.4458628841607565, + "grad_norm": 2.824463129043579, + "learning_rate": 4.944892408313859e-06, + "loss": 0.6594, + "step": 943 + }, + { + "epoch": 0.44633569739952716, + "grad_norm": 2.753735065460205, + "learning_rate": 4.9447620730362645e-06, + "loss": 0.6116, + "step": 944 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 3.0659725666046143, + "learning_rate": 4.944631585534056e-06, + "loss": 0.5983, + "step": 945 + }, + { + "epoch": 0.44728132387706854, + "grad_norm": 2.969113349914551, + "learning_rate": 4.944500945815357e-06, + "loss": 0.6859, + "step": 946 + }, + { + "epoch": 0.4477541371158392, + "grad_norm": 2.810303211212158, + "learning_rate": 4.944370153888303e-06, + "loss": 0.7025, + "step": 947 + }, + { + "epoch": 0.4482269503546099, + "grad_norm": 3.027721643447876, + "learning_rate": 4.944239209761038e-06, + "loss": 0.7268, + "step": 948 + }, + { + "epoch": 0.4486997635933806, + "grad_norm": 2.661503314971924, + "learning_rate": 4.944108113441716e-06, + "loss": 0.6702, + "step": 949 + 
}, + { + "epoch": 0.4491725768321513, + "grad_norm": 2.738591432571411, + "learning_rate": 4.943976864938498e-06, + "loss": 0.6728, + "step": 950 + }, + { + "epoch": 0.44964539007092197, + "grad_norm": 3.447505474090576, + "learning_rate": 4.943845464259557e-06, + "loss": 0.6586, + "step": 951 + }, + { + "epoch": 0.45011820330969265, + "grad_norm": 3.0968854427337646, + "learning_rate": 4.943713911413075e-06, + "loss": 0.7666, + "step": 952 + }, + { + "epoch": 0.45059101654846334, + "grad_norm": 2.4113779067993164, + "learning_rate": 4.943582206407244e-06, + "loss": 0.6173, + "step": 953 + }, + { + "epoch": 0.451063829787234, + "grad_norm": 2.6357979774475098, + "learning_rate": 4.943450349250263e-06, + "loss": 0.5589, + "step": 954 + }, + { + "epoch": 0.4515366430260047, + "grad_norm": 2.9182233810424805, + "learning_rate": 4.9433183399503425e-06, + "loss": 0.6252, + "step": 955 + }, + { + "epoch": 0.4520094562647754, + "grad_norm": 2.832740306854248, + "learning_rate": 4.943186178515703e-06, + "loss": 0.6882, + "step": 956 + }, + { + "epoch": 0.4524822695035461, + "grad_norm": 2.9508981704711914, + "learning_rate": 4.943053864954574e-06, + "loss": 0.5722, + "step": 957 + }, + { + "epoch": 0.4529550827423168, + "grad_norm": 3.044729471206665, + "learning_rate": 4.9429213992751925e-06, + "loss": 0.6772, + "step": 958 + }, + { + "epoch": 0.45342789598108746, + "grad_norm": 2.606003522872925, + "learning_rate": 4.9427887814858075e-06, + "loss": 0.6445, + "step": 959 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 2.4634225368499756, + "learning_rate": 4.942656011594676e-06, + "loss": 0.6151, + "step": 960 + }, + { + "epoch": 0.45437352245862883, + "grad_norm": 2.8872334957122803, + "learning_rate": 4.942523089610066e-06, + "loss": 0.6255, + "step": 961 + }, + { + "epoch": 0.4548463356973995, + "grad_norm": 2.870605707168579, + "learning_rate": 4.942390015540253e-06, + "loss": 0.7481, + "step": 962 + }, + { + "epoch": 0.4553191489361702, + "grad_norm": 2.952680826187134, + "learning_rate": 4.942256789393524e-06, + "loss": 0.5556, + "step": 963 + }, + { + "epoch": 0.4557919621749409, + "grad_norm": 2.623680353164673, + "learning_rate": 4.9421234111781725e-06, + "loss": 0.6115, + "step": 964 + }, + { + "epoch": 0.4562647754137116, + "grad_norm": 2.6933600902557373, + "learning_rate": 4.941989880902505e-06, + "loss": 0.6102, + "step": 965 + }, + { + "epoch": 0.45673758865248226, + "grad_norm": 2.6047189235687256, + "learning_rate": 4.941856198574836e-06, + "loss": 0.612, + "step": 966 + }, + { + "epoch": 0.45721040189125295, + "grad_norm": 2.779186725616455, + "learning_rate": 4.9417223642034885e-06, + "loss": 0.5424, + "step": 967 + }, + { + "epoch": 0.45768321513002364, + "grad_norm": 2.6177165508270264, + "learning_rate": 4.941588377796795e-06, + "loss": 0.4661, + "step": 968 + }, + { + "epoch": 0.4581560283687943, + "grad_norm": 2.959676742553711, + "learning_rate": 4.941454239363101e-06, + "loss": 0.6966, + "step": 969 + }, + { + "epoch": 0.458628841607565, + "grad_norm": 2.9788379669189453, + "learning_rate": 4.941319948910756e-06, + "loss": 0.6181, + "step": 970 + }, + { + "epoch": 0.4591016548463357, + "grad_norm": 4.642750263214111, + "learning_rate": 4.941185506448122e-06, + "loss": 0.5602, + "step": 971 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 2.793002128601074, + "learning_rate": 4.941050911983572e-06, + "loss": 0.602, + "step": 972 + }, + { + "epoch": 0.46004728132387707, + "grad_norm": 2.6833035945892334, + "learning_rate": 4.9409161655254845e-06, + "loss": 
0.5549, + "step": 973 + }, + { + "epoch": 0.46052009456264775, + "grad_norm": 3.905032157897949, + "learning_rate": 4.94078126708225e-06, + "loss": 0.6335, + "step": 974 + }, + { + "epoch": 0.46099290780141844, + "grad_norm": 2.922609329223633, + "learning_rate": 4.94064621666227e-06, + "loss": 0.5839, + "step": 975 + }, + { + "epoch": 0.4614657210401891, + "grad_norm": 2.8277416229248047, + "learning_rate": 4.940511014273952e-06, + "loss": 0.629, + "step": 976 + }, + { + "epoch": 0.4619385342789598, + "grad_norm": 3.07511043548584, + "learning_rate": 4.940375659925714e-06, + "loss": 0.7058, + "step": 977 + }, + { + "epoch": 0.4624113475177305, + "grad_norm": 3.65043044090271, + "learning_rate": 4.940240153625984e-06, + "loss": 0.7174, + "step": 978 + }, + { + "epoch": 0.4628841607565012, + "grad_norm": 2.755167245864868, + "learning_rate": 4.9401044953832e-06, + "loss": 0.6548, + "step": 979 + }, + { + "epoch": 0.46335697399527187, + "grad_norm": 2.9881057739257812, + "learning_rate": 4.939968685205808e-06, + "loss": 0.6245, + "step": 980 + }, + { + "epoch": 0.46382978723404256, + "grad_norm": 2.9484212398529053, + "learning_rate": 4.939832723102266e-06, + "loss": 0.655, + "step": 981 + }, + { + "epoch": 0.46430260047281324, + "grad_norm": 2.898918628692627, + "learning_rate": 4.939696609081038e-06, + "loss": 0.6178, + "step": 982 + }, + { + "epoch": 0.46477541371158393, + "grad_norm": 2.7052435874938965, + "learning_rate": 4.9395603431506e-06, + "loss": 0.6393, + "step": 983 + }, + { + "epoch": 0.4652482269503546, + "grad_norm": 2.5610013008117676, + "learning_rate": 4.939423925319436e-06, + "loss": 0.4847, + "step": 984 + }, + { + "epoch": 0.4657210401891253, + "grad_norm": 3.229083299636841, + "learning_rate": 4.939287355596042e-06, + "loss": 0.6473, + "step": 985 + }, + { + "epoch": 0.466193853427896, + "grad_norm": 2.907097816467285, + "learning_rate": 4.9391506339889195e-06, + "loss": 0.652, + "step": 986 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 2.6929478645324707, + "learning_rate": 4.939013760506582e-06, + "loss": 0.6175, + "step": 987 + }, + { + "epoch": 0.46713947990543736, + "grad_norm": 3.414813280105591, + "learning_rate": 4.938876735157554e-06, + "loss": 0.7597, + "step": 988 + }, + { + "epoch": 0.46761229314420805, + "grad_norm": 3.297360420227051, + "learning_rate": 4.938739557950365e-06, + "loss": 0.6824, + "step": 989 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 3.083155393600464, + "learning_rate": 4.938602228893557e-06, + "loss": 0.6505, + "step": 990 + }, + { + "epoch": 0.4685579196217494, + "grad_norm": 2.9781153202056885, + "learning_rate": 4.938464747995681e-06, + "loss": 0.666, + "step": 991 + }, + { + "epoch": 0.4690307328605201, + "grad_norm": 3.1494534015655518, + "learning_rate": 4.9383271152652975e-06, + "loss": 0.6422, + "step": 992 + }, + { + "epoch": 0.4695035460992908, + "grad_norm": 2.547868490219116, + "learning_rate": 4.938189330710976e-06, + "loss": 0.5766, + "step": 993 + }, + { + "epoch": 0.4699763593380615, + "grad_norm": 2.684736967086792, + "learning_rate": 4.938051394341297e-06, + "loss": 0.6407, + "step": 994 + }, + { + "epoch": 0.47044917257683216, + "grad_norm": 2.9619693756103516, + "learning_rate": 4.937913306164847e-06, + "loss": 0.6936, + "step": 995 + }, + { + "epoch": 0.47092198581560285, + "grad_norm": 2.9698498249053955, + "learning_rate": 4.937775066190227e-06, + "loss": 0.6464, + "step": 996 + }, + { + "epoch": 0.47139479905437354, + "grad_norm": 3.121049642562866, + "learning_rate": 4.937636674426042e-06, + 
"loss": 0.6383, + "step": 997 + }, + { + "epoch": 0.4718676122931442, + "grad_norm": 3.113672971725464, + "learning_rate": 4.93749813088091e-06, + "loss": 0.6892, + "step": 998 + }, + { + "epoch": 0.4723404255319149, + "grad_norm": 3.126113176345825, + "learning_rate": 4.937359435563458e-06, + "loss": 0.6728, + "step": 999 + }, + { + "epoch": 0.4728132387706856, + "grad_norm": 3.353966236114502, + "learning_rate": 4.937220588482321e-06, + "loss": 0.6041, + "step": 1000 + }, + { + "epoch": 0.4732860520094563, + "grad_norm": 2.8860628604888916, + "learning_rate": 4.937081589646144e-06, + "loss": 0.6798, + "step": 1001 + }, + { + "epoch": 0.47375886524822697, + "grad_norm": 3.0510590076446533, + "learning_rate": 4.936942439063584e-06, + "loss": 0.5841, + "step": 1002 + }, + { + "epoch": 0.47423167848699765, + "grad_norm": 2.6998369693756104, + "learning_rate": 4.936803136743303e-06, + "loss": 0.6403, + "step": 1003 + }, + { + "epoch": 0.47470449172576834, + "grad_norm": 2.875347137451172, + "learning_rate": 4.9366636826939765e-06, + "loss": 0.5811, + "step": 1004 + }, + { + "epoch": 0.475177304964539, + "grad_norm": 2.9122262001037598, + "learning_rate": 4.936524076924287e-06, + "loss": 0.6852, + "step": 1005 + }, + { + "epoch": 0.4756501182033097, + "grad_norm": 2.5167057514190674, + "learning_rate": 4.9363843194429265e-06, + "loss": 0.5367, + "step": 1006 + }, + { + "epoch": 0.4761229314420804, + "grad_norm": 2.5745551586151123, + "learning_rate": 4.9362444102585985e-06, + "loss": 0.6241, + "step": 1007 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 2.5024216175079346, + "learning_rate": 4.9361043493800125e-06, + "loss": 0.6133, + "step": 1008 + }, + { + "epoch": 0.47706855791962177, + "grad_norm": 2.7281384468078613, + "learning_rate": 4.935964136815892e-06, + "loss": 0.6834, + "step": 1009 + }, + { + "epoch": 0.47754137115839246, + "grad_norm": 3.0118913650512695, + "learning_rate": 4.935823772574965e-06, + "loss": 0.6922, + "step": 1010 + }, + { + "epoch": 0.47801418439716314, + "grad_norm": 3.016216993331909, + "learning_rate": 4.935683256665973e-06, + "loss": 0.6653, + "step": 1011 + }, + { + "epoch": 0.47848699763593383, + "grad_norm": 2.9526784420013428, + "learning_rate": 4.9355425890976636e-06, + "loss": 0.6423, + "step": 1012 + }, + { + "epoch": 0.4789598108747045, + "grad_norm": 6.222797393798828, + "learning_rate": 4.9354017698787985e-06, + "loss": 0.5884, + "step": 1013 + }, + { + "epoch": 0.4794326241134752, + "grad_norm": 2.6553597450256348, + "learning_rate": 4.935260799018143e-06, + "loss": 0.6624, + "step": 1014 + }, + { + "epoch": 0.4799054373522459, + "grad_norm": 3.0942065715789795, + "learning_rate": 4.935119676524475e-06, + "loss": 0.6623, + "step": 1015 + }, + { + "epoch": 0.4803782505910166, + "grad_norm": 2.626359224319458, + "learning_rate": 4.934978402406585e-06, + "loss": 0.6195, + "step": 1016 + }, + { + "epoch": 0.4808510638297872, + "grad_norm": 2.7954699993133545, + "learning_rate": 4.934836976673265e-06, + "loss": 0.5545, + "step": 1017 + }, + { + "epoch": 0.4813238770685579, + "grad_norm": 2.913557291030884, + "learning_rate": 4.934695399333324e-06, + "loss": 0.6288, + "step": 1018 + }, + { + "epoch": 0.4817966903073286, + "grad_norm": 3.1043739318847656, + "learning_rate": 4.9345536703955746e-06, + "loss": 0.6771, + "step": 1019 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 2.789357900619507, + "learning_rate": 4.934411789868845e-06, + "loss": 0.6227, + "step": 1020 + }, + { + "epoch": 0.48274231678486995, + "grad_norm": 
2.480609655380249, + "learning_rate": 4.934269757761967e-06, + "loss": 0.5779, + "step": 1021 + }, + { + "epoch": 0.48321513002364064, + "grad_norm": 2.7946252822875977, + "learning_rate": 4.934127574083785e-06, + "loss": 0.6166, + "step": 1022 + }, + { + "epoch": 0.4836879432624113, + "grad_norm": 3.0670509338378906, + "learning_rate": 4.933985238843153e-06, + "loss": 0.7766, + "step": 1023 + }, + { + "epoch": 0.484160756501182, + "grad_norm": 2.8567559719085693, + "learning_rate": 4.933842752048932e-06, + "loss": 0.5088, + "step": 1024 + }, + { + "epoch": 0.4846335697399527, + "grad_norm": 2.5674657821655273, + "learning_rate": 4.933700113709996e-06, + "loss": 0.6036, + "step": 1025 + }, + { + "epoch": 0.4851063829787234, + "grad_norm": 2.782339096069336, + "learning_rate": 4.933557323835224e-06, + "loss": 0.5335, + "step": 1026 + }, + { + "epoch": 0.48557919621749407, + "grad_norm": 2.6334071159362793, + "learning_rate": 4.93341438243351e-06, + "loss": 0.6327, + "step": 1027 + }, + { + "epoch": 0.48605200945626476, + "grad_norm": 3.0853965282440186, + "learning_rate": 4.933271289513751e-06, + "loss": 0.7102, + "step": 1028 + }, + { + "epoch": 0.48652482269503544, + "grad_norm": 2.619997501373291, + "learning_rate": 4.933128045084859e-06, + "loss": 0.6138, + "step": 1029 + }, + { + "epoch": 0.48699763593380613, + "grad_norm": 2.8316116333007812, + "learning_rate": 4.932984649155753e-06, + "loss": 0.6346, + "step": 1030 + }, + { + "epoch": 0.4874704491725768, + "grad_norm": 3.153486490249634, + "learning_rate": 4.932841101735361e-06, + "loss": 0.7626, + "step": 1031 + }, + { + "epoch": 0.4879432624113475, + "grad_norm": 3.1831274032592773, + "learning_rate": 4.9326974028326214e-06, + "loss": 0.6607, + "step": 1032 + }, + { + "epoch": 0.4884160756501182, + "grad_norm": 2.791078567504883, + "learning_rate": 4.932553552456481e-06, + "loss": 0.6141, + "step": 1033 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 2.627263307571411, + "learning_rate": 4.932409550615898e-06, + "loss": 0.6777, + "step": 1034 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.8550007343292236, + "learning_rate": 4.932265397319838e-06, + "loss": 0.6379, + "step": 1035 + }, + { + "epoch": 0.48983451536643025, + "grad_norm": 4.505824089050293, + "learning_rate": 4.932121092577276e-06, + "loss": 0.5892, + "step": 1036 + }, + { + "epoch": 0.49030732860520093, + "grad_norm": 3.100191116333008, + "learning_rate": 4.931976636397199e-06, + "loss": 0.6443, + "step": 1037 + }, + { + "epoch": 0.4907801418439716, + "grad_norm": 2.921494245529175, + "learning_rate": 4.9318320287886e-06, + "loss": 0.6821, + "step": 1038 + }, + { + "epoch": 0.4912529550827423, + "grad_norm": 4.577807903289795, + "learning_rate": 4.931687269760485e-06, + "loss": 0.5946, + "step": 1039 + }, + { + "epoch": 0.491725768321513, + "grad_norm": 2.7347636222839355, + "learning_rate": 4.931542359321865e-06, + "loss": 0.5689, + "step": 1040 + }, + { + "epoch": 0.4921985815602837, + "grad_norm": 2.5289158821105957, + "learning_rate": 4.931397297481765e-06, + "loss": 0.5632, + "step": 1041 + }, + { + "epoch": 0.49267139479905436, + "grad_norm": 3.3518471717834473, + "learning_rate": 4.9312520842492165e-06, + "loss": 0.6349, + "step": 1042 + }, + { + "epoch": 0.49314420803782505, + "grad_norm": 3.0469748973846436, + "learning_rate": 4.931106719633261e-06, + "loss": 0.5734, + "step": 1043 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 3.104682445526123, + "learning_rate": 4.930961203642951e-06, + "loss": 0.6101, + "step": 1044 + }, + { + 
"epoch": 0.4940898345153664, + "grad_norm": 2.776705503463745, + "learning_rate": 4.930815536287346e-06, + "loss": 0.6397, + "step": 1045 + }, + { + "epoch": 0.4945626477541371, + "grad_norm": 2.760380983352661, + "learning_rate": 4.930669717575516e-06, + "loss": 0.668, + "step": 1046 + }, + { + "epoch": 0.4950354609929078, + "grad_norm": 2.70084547996521, + "learning_rate": 4.930523747516541e-06, + "loss": 0.5729, + "step": 1047 + }, + { + "epoch": 0.4955082742316785, + "grad_norm": 2.7319583892822266, + "learning_rate": 4.930377626119511e-06, + "loss": 0.6258, + "step": 1048 + }, + { + "epoch": 0.49598108747044917, + "grad_norm": 3.2515223026275635, + "learning_rate": 4.930231353393521e-06, + "loss": 0.7412, + "step": 1049 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 3.0646486282348633, + "learning_rate": 4.930084929347682e-06, + "loss": 0.5809, + "step": 1050 + }, + { + "epoch": 0.49692671394799054, + "grad_norm": 3.1621921062469482, + "learning_rate": 4.9299383539911096e-06, + "loss": 0.6282, + "step": 1051 + }, + { + "epoch": 0.4973995271867612, + "grad_norm": 2.864713191986084, + "learning_rate": 4.929791627332931e-06, + "loss": 0.6263, + "step": 1052 + }, + { + "epoch": 0.4978723404255319, + "grad_norm": 3.181016683578491, + "learning_rate": 4.929644749382283e-06, + "loss": 0.5697, + "step": 1053 + }, + { + "epoch": 0.4983451536643026, + "grad_norm": 2.9064836502075195, + "learning_rate": 4.929497720148309e-06, + "loss": 0.6161, + "step": 1054 + }, + { + "epoch": 0.4988179669030733, + "grad_norm": 3.058112859725952, + "learning_rate": 4.9293505396401655e-06, + "loss": 0.6477, + "step": 1055 + }, + { + "epoch": 0.49929078014184397, + "grad_norm": 2.5227596759796143, + "learning_rate": 4.929203207867016e-06, + "loss": 0.5819, + "step": 1056 + }, + { + "epoch": 0.49976359338061466, + "grad_norm": 3.386862277984619, + "learning_rate": 4.929055724838035e-06, + "loss": 0.7342, + "step": 1057 + }, + { + "epoch": 0.5002364066193853, + "grad_norm": 3.368346929550171, + "learning_rate": 4.928908090562404e-06, + "loss": 0.6622, + "step": 1058 + }, + { + "epoch": 0.500709219858156, + "grad_norm": 2.9108314514160156, + "learning_rate": 4.928760305049317e-06, + "loss": 0.6598, + "step": 1059 + }, + { + "epoch": 0.5011820330969267, + "grad_norm": 2.822305917739868, + "learning_rate": 4.928612368307977e-06, + "loss": 0.5841, + "step": 1060 + }, + { + "epoch": 0.5016548463356973, + "grad_norm": 2.689131259918213, + "learning_rate": 4.928464280347592e-06, + "loss": 0.6631, + "step": 1061 + }, + { + "epoch": 0.502127659574468, + "grad_norm": 3.337214946746826, + "learning_rate": 4.9283160411773864e-06, + "loss": 0.6105, + "step": 1062 + }, + { + "epoch": 0.5026004728132387, + "grad_norm": 3.035911798477173, + "learning_rate": 4.928167650806588e-06, + "loss": 0.6981, + "step": 1063 + }, + { + "epoch": 0.5030732860520094, + "grad_norm": 2.8820855617523193, + "learning_rate": 4.9280191092444375e-06, + "loss": 0.6408, + "step": 1064 + }, + { + "epoch": 0.5035460992907801, + "grad_norm": 3.080432415008545, + "learning_rate": 4.927870416500183e-06, + "loss": 0.6398, + "step": 1065 + }, + { + "epoch": 0.5040189125295508, + "grad_norm": 2.761612892150879, + "learning_rate": 4.927721572583084e-06, + "loss": 0.6126, + "step": 1066 + }, + { + "epoch": 0.5044917257683215, + "grad_norm": 2.8561882972717285, + "learning_rate": 4.927572577502408e-06, + "loss": 0.584, + "step": 1067 + }, + { + "epoch": 0.5049645390070922, + "grad_norm": 3.3386311531066895, + "learning_rate": 4.927423431267432e-06, + 
"loss": 0.6666, + "step": 1068 + }, + { + "epoch": 0.5054373522458628, + "grad_norm": 2.632906675338745, + "learning_rate": 4.927274133887443e-06, + "loss": 0.632, + "step": 1069 + }, + { + "epoch": 0.5059101654846335, + "grad_norm": 2.8737308979034424, + "learning_rate": 4.927124685371737e-06, + "loss": 0.6051, + "step": 1070 + }, + { + "epoch": 0.5063829787234042, + "grad_norm": 3.042222738265991, + "learning_rate": 4.926975085729619e-06, + "loss": 0.6954, + "step": 1071 + }, + { + "epoch": 0.5068557919621749, + "grad_norm": 3.3341481685638428, + "learning_rate": 4.926825334970404e-06, + "loss": 0.7148, + "step": 1072 + }, + { + "epoch": 0.5073286052009456, + "grad_norm": 2.7415387630462646, + "learning_rate": 4.926675433103418e-06, + "loss": 0.5456, + "step": 1073 + }, + { + "epoch": 0.5078014184397163, + "grad_norm": 2.7545325756073, + "learning_rate": 4.926525380137993e-06, + "loss": 0.6213, + "step": 1074 + }, + { + "epoch": 0.508274231678487, + "grad_norm": 2.9153690338134766, + "learning_rate": 4.926375176083472e-06, + "loss": 0.6466, + "step": 1075 + }, + { + "epoch": 0.5087470449172576, + "grad_norm": 4.210638523101807, + "learning_rate": 4.926224820949209e-06, + "loss": 0.6192, + "step": 1076 + }, + { + "epoch": 0.5092198581560283, + "grad_norm": 2.4357898235321045, + "learning_rate": 4.926074314744565e-06, + "loss": 0.594, + "step": 1077 + }, + { + "epoch": 0.509692671394799, + "grad_norm": 2.8004701137542725, + "learning_rate": 4.92592365747891e-06, + "loss": 0.6276, + "step": 1078 + }, + { + "epoch": 0.5101654846335697, + "grad_norm": 2.920675039291382, + "learning_rate": 4.925772849161628e-06, + "loss": 0.6043, + "step": 1079 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.791555404663086, + "learning_rate": 4.9256218898021055e-06, + "loss": 0.6837, + "step": 1080 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 3.1702463626861572, + "learning_rate": 4.925470779409746e-06, + "loss": 0.668, + "step": 1081 + }, + { + "epoch": 0.5115839243498818, + "grad_norm": 2.7149479389190674, + "learning_rate": 4.925319517993955e-06, + "loss": 0.5842, + "step": 1082 + }, + { + "epoch": 0.5120567375886524, + "grad_norm": 2.916311025619507, + "learning_rate": 4.925168105564153e-06, + "loss": 0.6893, + "step": 1083 + }, + { + "epoch": 0.5125295508274231, + "grad_norm": 2.917654514312744, + "learning_rate": 4.925016542129767e-06, + "loss": 0.6513, + "step": 1084 + }, + { + "epoch": 0.5130023640661938, + "grad_norm": 2.5568928718566895, + "learning_rate": 4.924864827700234e-06, + "loss": 0.6177, + "step": 1085 + }, + { + "epoch": 0.5134751773049645, + "grad_norm": 2.816720485687256, + "learning_rate": 4.924712962285001e-06, + "loss": 0.5833, + "step": 1086 + }, + { + "epoch": 0.5139479905437352, + "grad_norm": 2.6989188194274902, + "learning_rate": 4.9245609458935235e-06, + "loss": 0.6332, + "step": 1087 + }, + { + "epoch": 0.5144208037825059, + "grad_norm": 2.959599494934082, + "learning_rate": 4.924408778535268e-06, + "loss": 0.626, + "step": 1088 + }, + { + "epoch": 0.5148936170212766, + "grad_norm": 2.872814416885376, + "learning_rate": 4.924256460219708e-06, + "loss": 0.6407, + "step": 1089 + }, + { + "epoch": 0.5153664302600472, + "grad_norm": 2.6989097595214844, + "learning_rate": 4.924103990956329e-06, + "loss": 0.6391, + "step": 1090 + }, + { + "epoch": 0.5158392434988179, + "grad_norm": 2.986492156982422, + "learning_rate": 4.9239513707546235e-06, + "loss": 0.6911, + "step": 1091 + }, + { + "epoch": 0.5163120567375886, + "grad_norm": 3.069920301437378, + 
"learning_rate": 4.9237985996240954e-06, + "loss": 0.671, + "step": 1092 + }, + { + "epoch": 0.5167848699763593, + "grad_norm": 2.8214917182922363, + "learning_rate": 4.9236456775742555e-06, + "loss": 0.5885, + "step": 1093 + }, + { + "epoch": 0.51725768321513, + "grad_norm": 2.9416961669921875, + "learning_rate": 4.923492604614627e-06, + "loss": 0.6293, + "step": 1094 + }, + { + "epoch": 0.5177304964539007, + "grad_norm": 2.761780023574829, + "learning_rate": 4.923339380754741e-06, + "loss": 0.649, + "step": 1095 + }, + { + "epoch": 0.5182033096926714, + "grad_norm": 2.7648792266845703, + "learning_rate": 4.923186006004138e-06, + "loss": 0.5906, + "step": 1096 + }, + { + "epoch": 0.518676122931442, + "grad_norm": 3.5535428524017334, + "learning_rate": 4.923032480372367e-06, + "loss": 0.7138, + "step": 1097 + }, + { + "epoch": 0.5191489361702127, + "grad_norm": 2.6252479553222656, + "learning_rate": 4.922878803868988e-06, + "loss": 0.5499, + "step": 1098 + }, + { + "epoch": 0.5196217494089834, + "grad_norm": 2.901002883911133, + "learning_rate": 4.9227249765035715e-06, + "loss": 0.6991, + "step": 1099 + }, + { + "epoch": 0.5200945626477541, + "grad_norm": 2.621877431869507, + "learning_rate": 4.9225709982856925e-06, + "loss": 0.6269, + "step": 1100 + }, + { + "epoch": 0.5205673758865248, + "grad_norm": 2.872483015060425, + "learning_rate": 4.92241686922494e-06, + "loss": 0.6657, + "step": 1101 + }, + { + "epoch": 0.5210401891252955, + "grad_norm": 2.730447769165039, + "learning_rate": 4.922262589330912e-06, + "loss": 0.6061, + "step": 1102 + }, + { + "epoch": 0.5215130023640662, + "grad_norm": 2.646247386932373, + "learning_rate": 4.922108158613213e-06, + "loss": 0.5923, + "step": 1103 + }, + { + "epoch": 0.5219858156028369, + "grad_norm": 2.6488895416259766, + "learning_rate": 4.92195357708146e-06, + "loss": 0.6293, + "step": 1104 + }, + { + "epoch": 0.5224586288416075, + "grad_norm": 2.756338357925415, + "learning_rate": 4.921798844745278e-06, + "loss": 0.6374, + "step": 1105 + }, + { + "epoch": 0.5229314420803782, + "grad_norm": 3.1441280841827393, + "learning_rate": 4.921643961614301e-06, + "loss": 0.6652, + "step": 1106 + }, + { + "epoch": 0.5234042553191489, + "grad_norm": 3.050002098083496, + "learning_rate": 4.921488927698172e-06, + "loss": 0.6809, + "step": 1107 + }, + { + "epoch": 0.5238770685579196, + "grad_norm": 2.71750807762146, + "learning_rate": 4.921333743006547e-06, + "loss": 0.6266, + "step": 1108 + }, + { + "epoch": 0.5243498817966903, + "grad_norm": 2.8439245223999023, + "learning_rate": 4.921178407549086e-06, + "loss": 0.5663, + "step": 1109 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 3.0722241401672363, + "learning_rate": 4.921022921335464e-06, + "loss": 0.6791, + "step": 1110 + }, + { + "epoch": 0.5252955082742317, + "grad_norm": 3.4381656646728516, + "learning_rate": 4.920867284375358e-06, + "loss": 0.6687, + "step": 1111 + }, + { + "epoch": 0.5257683215130023, + "grad_norm": 2.819812774658203, + "learning_rate": 4.920711496678463e-06, + "loss": 0.6299, + "step": 1112 + }, + { + "epoch": 0.526241134751773, + "grad_norm": 3.6587414741516113, + "learning_rate": 4.9205555582544765e-06, + "loss": 0.7392, + "step": 1113 + }, + { + "epoch": 0.5267139479905437, + "grad_norm": 2.774296522140503, + "learning_rate": 4.920399469113109e-06, + "loss": 0.6652, + "step": 1114 + }, + { + "epoch": 0.5271867612293144, + "grad_norm": 2.7480580806732178, + "learning_rate": 4.920243229264081e-06, + "loss": 0.596, + "step": 1115 + }, + { + "epoch": 0.5276595744680851, + 
"grad_norm": 3.213057518005371, + "learning_rate": 4.920086838717119e-06, + "loss": 0.6986, + "step": 1116 + }, + { + "epoch": 0.5281323877068558, + "grad_norm": 2.940546989440918, + "learning_rate": 4.919930297481962e-06, + "loss": 0.6481, + "step": 1117 + }, + { + "epoch": 0.5286052009456265, + "grad_norm": 2.5970494747161865, + "learning_rate": 4.9197736055683555e-06, + "loss": 0.5658, + "step": 1118 + }, + { + "epoch": 0.5290780141843971, + "grad_norm": 4.49385404586792, + "learning_rate": 4.919616762986057e-06, + "loss": 0.605, + "step": 1119 + }, + { + "epoch": 0.5295508274231678, + "grad_norm": 2.971857786178589, + "learning_rate": 4.919459769744833e-06, + "loss": 0.6539, + "step": 1120 + }, + { + "epoch": 0.5300236406619385, + "grad_norm": 2.6192965507507324, + "learning_rate": 4.919302625854457e-06, + "loss": 0.6226, + "step": 1121 + }, + { + "epoch": 0.5304964539007092, + "grad_norm": 2.665088176727295, + "learning_rate": 4.919145331324716e-06, + "loss": 0.6647, + "step": 1122 + }, + { + "epoch": 0.5309692671394799, + "grad_norm": 2.612126111984253, + "learning_rate": 4.918987886165403e-06, + "loss": 0.6965, + "step": 1123 + }, + { + "epoch": 0.5314420803782506, + "grad_norm": 3.80017352104187, + "learning_rate": 4.9188302903863205e-06, + "loss": 0.7396, + "step": 1124 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 2.781752824783325, + "learning_rate": 4.918672543997282e-06, + "loss": 0.5985, + "step": 1125 + }, + { + "epoch": 0.532387706855792, + "grad_norm": 2.6067914962768555, + "learning_rate": 4.91851464700811e-06, + "loss": 0.6159, + "step": 1126 + }, + { + "epoch": 0.5328605200945626, + "grad_norm": 2.670807123184204, + "learning_rate": 4.918356599428636e-06, + "loss": 0.5958, + "step": 1127 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 2.608611822128296, + "learning_rate": 4.9181984012687e-06, + "loss": 0.5768, + "step": 1128 + }, + { + "epoch": 0.533806146572104, + "grad_norm": 2.586764097213745, + "learning_rate": 4.918040052538154e-06, + "loss": 0.661, + "step": 1129 + }, + { + "epoch": 0.5342789598108747, + "grad_norm": 3.1317451000213623, + "learning_rate": 4.917881553246856e-06, + "loss": 0.6626, + "step": 1130 + }, + { + "epoch": 0.5347517730496454, + "grad_norm": 2.7135281562805176, + "learning_rate": 4.917722903404676e-06, + "loss": 0.6572, + "step": 1131 + }, + { + "epoch": 0.5352245862884161, + "grad_norm": 3.4546358585357666, + "learning_rate": 4.917564103021493e-06, + "loss": 0.5597, + "step": 1132 + }, + { + "epoch": 0.5356973995271868, + "grad_norm": 3.0943493843078613, + "learning_rate": 4.917405152107193e-06, + "loss": 0.7258, + "step": 1133 + }, + { + "epoch": 0.5361702127659574, + "grad_norm": 2.6069352626800537, + "learning_rate": 4.917246050671674e-06, + "loss": 0.6209, + "step": 1134 + }, + { + "epoch": 0.5366430260047281, + "grad_norm": 2.584883689880371, + "learning_rate": 4.917086798724844e-06, + "loss": 0.658, + "step": 1135 + }, + { + "epoch": 0.5371158392434988, + "grad_norm": 3.001976490020752, + "learning_rate": 4.9169273962766166e-06, + "loss": 0.6306, + "step": 1136 + }, + { + "epoch": 0.5375886524822695, + "grad_norm": 2.5013928413391113, + "learning_rate": 4.916767843336918e-06, + "loss": 0.572, + "step": 1137 + }, + { + "epoch": 0.5380614657210402, + "grad_norm": 2.9114553928375244, + "learning_rate": 4.916608139915684e-06, + "loss": 0.5841, + "step": 1138 + }, + { + "epoch": 0.5385342789598109, + "grad_norm": 2.8878467082977295, + "learning_rate": 4.9164482860228564e-06, + "loss": 0.6654, + "step": 1139 + }, + { + 
"epoch": 0.5390070921985816, + "grad_norm": 2.9827866554260254, + "learning_rate": 4.91628828166839e-06, + "loss": 0.6674, + "step": 1140 + }, + { + "epoch": 0.5394799054373522, + "grad_norm": 3.8696281909942627, + "learning_rate": 4.916128126862248e-06, + "loss": 0.6241, + "step": 1141 + }, + { + "epoch": 0.5399527186761229, + "grad_norm": 2.9556291103363037, + "learning_rate": 4.915967821614402e-06, + "loss": 0.6478, + "step": 1142 + }, + { + "epoch": 0.5404255319148936, + "grad_norm": 2.392942428588867, + "learning_rate": 4.915807365934834e-06, + "loss": 0.6097, + "step": 1143 + }, + { + "epoch": 0.5408983451536643, + "grad_norm": 3.032235860824585, + "learning_rate": 4.915646759833534e-06, + "loss": 0.7193, + "step": 1144 + }, + { + "epoch": 0.541371158392435, + "grad_norm": 2.840416193008423, + "learning_rate": 4.915486003320501e-06, + "loss": 0.5506, + "step": 1145 + }, + { + "epoch": 0.5418439716312057, + "grad_norm": 2.5438895225524902, + "learning_rate": 4.915325096405747e-06, + "loss": 0.6487, + "step": 1146 + }, + { + "epoch": 0.5423167848699764, + "grad_norm": 2.544334650039673, + "learning_rate": 4.9151640390992905e-06, + "loss": 0.6168, + "step": 1147 + }, + { + "epoch": 0.542789598108747, + "grad_norm": 2.8535678386688232, + "learning_rate": 4.91500283141116e-06, + "loss": 0.678, + "step": 1148 + }, + { + "epoch": 0.5432624113475177, + "grad_norm": 2.8086955547332764, + "learning_rate": 4.9148414733513915e-06, + "loss": 0.6473, + "step": 1149 + }, + { + "epoch": 0.5437352245862884, + "grad_norm": 2.4709885120391846, + "learning_rate": 4.914679964930034e-06, + "loss": 0.6797, + "step": 1150 + }, + { + "epoch": 0.5442080378250591, + "grad_norm": 2.8546934127807617, + "learning_rate": 4.9145183061571435e-06, + "loss": 0.6247, + "step": 1151 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 2.991184711456299, + "learning_rate": 4.9143564970427844e-06, + "loss": 0.5977, + "step": 1152 + }, + { + "epoch": 0.5451536643026005, + "grad_norm": 3.011216402053833, + "learning_rate": 4.914194537597033e-06, + "loss": 0.7005, + "step": 1153 + }, + { + "epoch": 0.5456264775413712, + "grad_norm": 2.807521343231201, + "learning_rate": 4.9140324278299744e-06, + "loss": 0.5412, + "step": 1154 + }, + { + "epoch": 0.5460992907801419, + "grad_norm": 3.0401229858398438, + "learning_rate": 4.913870167751701e-06, + "loss": 0.6394, + "step": 1155 + }, + { + "epoch": 0.5465721040189125, + "grad_norm": 2.853914976119995, + "learning_rate": 4.913707757372317e-06, + "loss": 0.6745, + "step": 1156 + }, + { + "epoch": 0.5470449172576832, + "grad_norm": 4.505620956420898, + "learning_rate": 4.913545196701935e-06, + "loss": 0.6668, + "step": 1157 + }, + { + "epoch": 0.5475177304964539, + "grad_norm": 3.0505781173706055, + "learning_rate": 4.913382485750676e-06, + "loss": 0.6926, + "step": 1158 + }, + { + "epoch": 0.5479905437352246, + "grad_norm": 2.798435688018799, + "learning_rate": 4.913219624528672e-06, + "loss": 0.605, + "step": 1159 + }, + { + "epoch": 0.5484633569739953, + "grad_norm": 2.7814908027648926, + "learning_rate": 4.913056613046065e-06, + "loss": 0.6678, + "step": 1160 + }, + { + "epoch": 0.548936170212766, + "grad_norm": 3.2089321613311768, + "learning_rate": 4.9128934513130025e-06, + "loss": 0.5995, + "step": 1161 + }, + { + "epoch": 0.5494089834515367, + "grad_norm": 2.7699952125549316, + "learning_rate": 4.9127301393396455e-06, + "loss": 0.7062, + "step": 1162 + }, + { + "epoch": 0.5498817966903073, + "grad_norm": 2.859368324279785, + "learning_rate": 4.912566677136162e-06, + "loss": 
0.6063, + "step": 1163 + }, + { + "epoch": 0.550354609929078, + "grad_norm": 2.727334499359131, + "learning_rate": 4.91240306471273e-06, + "loss": 0.6848, + "step": 1164 + }, + { + "epoch": 0.5508274231678487, + "grad_norm": 2.6017510890960693, + "learning_rate": 4.912239302079537e-06, + "loss": 0.5808, + "step": 1165 + }, + { + "epoch": 0.5513002364066194, + "grad_norm": 3.539583206176758, + "learning_rate": 4.912075389246781e-06, + "loss": 0.7053, + "step": 1166 + }, + { + "epoch": 0.5517730496453901, + "grad_norm": 2.918280601501465, + "learning_rate": 4.911911326224666e-06, + "loss": 0.5904, + "step": 1167 + }, + { + "epoch": 0.5522458628841608, + "grad_norm": 3.0067362785339355, + "learning_rate": 4.9117471130234095e-06, + "loss": 0.6392, + "step": 1168 + }, + { + "epoch": 0.5527186761229315, + "grad_norm": 2.4374797344207764, + "learning_rate": 4.911582749653236e-06, + "loss": 0.5793, + "step": 1169 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 3.121182918548584, + "learning_rate": 4.911418236124378e-06, + "loss": 0.6636, + "step": 1170 + }, + { + "epoch": 0.5536643026004728, + "grad_norm": 3.1289851665496826, + "learning_rate": 4.91125357244708e-06, + "loss": 0.656, + "step": 1171 + }, + { + "epoch": 0.5541371158392435, + "grad_norm": 2.7034592628479004, + "learning_rate": 4.911088758631596e-06, + "loss": 0.6001, + "step": 1172 + }, + { + "epoch": 0.5546099290780142, + "grad_norm": 2.710146188735962, + "learning_rate": 4.910923794688187e-06, + "loss": 0.6007, + "step": 1173 + }, + { + "epoch": 0.5550827423167849, + "grad_norm": 2.5424487590789795, + "learning_rate": 4.910758680627124e-06, + "loss": 0.5193, + "step": 1174 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.615893602371216, + "learning_rate": 4.91059341645869e-06, + "loss": 0.5525, + "step": 1175 + }, + { + "epoch": 0.5560283687943263, + "grad_norm": 3.3179728984832764, + "learning_rate": 4.910428002193174e-06, + "loss": 0.7285, + "step": 1176 + }, + { + "epoch": 0.556501182033097, + "grad_norm": 2.7234175205230713, + "learning_rate": 4.910262437840875e-06, + "loss": 0.574, + "step": 1177 + }, + { + "epoch": 0.5569739952718676, + "grad_norm": 3.0416605472564697, + "learning_rate": 4.9100967234121034e-06, + "loss": 0.5623, + "step": 1178 + }, + { + "epoch": 0.5574468085106383, + "grad_norm": 3.067786455154419, + "learning_rate": 4.909930858917177e-06, + "loss": 0.6491, + "step": 1179 + }, + { + "epoch": 0.557919621749409, + "grad_norm": 3.0037379264831543, + "learning_rate": 4.909764844366422e-06, + "loss": 0.5696, + "step": 1180 + }, + { + "epoch": 0.5583924349881797, + "grad_norm": 2.966179609298706, + "learning_rate": 4.909598679770178e-06, + "loss": 0.6042, + "step": 1181 + }, + { + "epoch": 0.5588652482269504, + "grad_norm": 2.6000657081604004, + "learning_rate": 4.909432365138789e-06, + "loss": 0.5883, + "step": 1182 + }, + { + "epoch": 0.5593380614657211, + "grad_norm": 2.6794495582580566, + "learning_rate": 4.909265900482612e-06, + "loss": 0.6809, + "step": 1183 + }, + { + "epoch": 0.5598108747044918, + "grad_norm": 2.6765122413635254, + "learning_rate": 4.9090992858120115e-06, + "loss": 0.6601, + "step": 1184 + }, + { + "epoch": 0.5602836879432624, + "grad_norm": 2.6051928997039795, + "learning_rate": 4.908932521137363e-06, + "loss": 0.5946, + "step": 1185 + }, + { + "epoch": 0.5607565011820331, + "grad_norm": 3.0405542850494385, + "learning_rate": 4.908765606469048e-06, + "loss": 0.6998, + "step": 1186 + }, + { + "epoch": 0.5612293144208038, + "grad_norm": 2.7975668907165527, + "learning_rate": 
4.908598541817462e-06, + "loss": 0.6218, + "step": 1187 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 2.5367627143859863, + "learning_rate": 4.908431327193005e-06, + "loss": 0.6354, + "step": 1188 + }, + { + "epoch": 0.5621749408983452, + "grad_norm": 3.7939631938934326, + "learning_rate": 4.908263962606091e-06, + "loss": 0.6376, + "step": 1189 + }, + { + "epoch": 0.5626477541371159, + "grad_norm": 2.864079475402832, + "learning_rate": 4.908096448067139e-06, + "loss": 0.5485, + "step": 1190 + }, + { + "epoch": 0.5631205673758866, + "grad_norm": 2.7855563163757324, + "learning_rate": 4.9079287835865804e-06, + "loss": 0.6645, + "step": 1191 + }, + { + "epoch": 0.5635933806146572, + "grad_norm": 2.6156625747680664, + "learning_rate": 4.9077609691748556e-06, + "loss": 0.5751, + "step": 1192 + }, + { + "epoch": 0.5640661938534279, + "grad_norm": 3.0475659370422363, + "learning_rate": 4.907593004842412e-06, + "loss": 0.6739, + "step": 1193 + }, + { + "epoch": 0.5645390070921986, + "grad_norm": 2.9176738262176514, + "learning_rate": 4.9074248905997104e-06, + "loss": 0.6493, + "step": 1194 + }, + { + "epoch": 0.5650118203309693, + "grad_norm": 2.6168384552001953, + "learning_rate": 4.907256626457216e-06, + "loss": 0.6154, + "step": 1195 + }, + { + "epoch": 0.56548463356974, + "grad_norm": 2.893980026245117, + "learning_rate": 4.907088212425408e-06, + "loss": 0.5808, + "step": 1196 + }, + { + "epoch": 0.5659574468085107, + "grad_norm": 3.3832836151123047, + "learning_rate": 4.90691964851477e-06, + "loss": 0.7888, + "step": 1197 + }, + { + "epoch": 0.5664302600472814, + "grad_norm": 3.088932752609253, + "learning_rate": 4.906750934735801e-06, + "loss": 0.6516, + "step": 1198 + }, + { + "epoch": 0.566903073286052, + "grad_norm": 2.494471549987793, + "learning_rate": 4.906582071099004e-06, + "loss": 0.6286, + "step": 1199 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 2.716550588607788, + "learning_rate": 4.906413057614895e-06, + "loss": 0.5939, + "step": 1200 + }, + { + "epoch": 0.5678486997635934, + "grad_norm": 2.5821073055267334, + "learning_rate": 4.906243894293995e-06, + "loss": 0.6668, + "step": 1201 + }, + { + "epoch": 0.5683215130023641, + "grad_norm": 3.651787042617798, + "learning_rate": 4.90607458114684e-06, + "loss": 0.6124, + "step": 1202 + }, + { + "epoch": 0.5687943262411348, + "grad_norm": 2.7567858695983887, + "learning_rate": 4.9059051181839705e-06, + "loss": 0.6656, + "step": 1203 + }, + { + "epoch": 0.5692671394799055, + "grad_norm": 2.8067586421966553, + "learning_rate": 4.90573550541594e-06, + "loss": 0.6306, + "step": 1204 + }, + { + "epoch": 0.5697399527186762, + "grad_norm": 2.6136393547058105, + "learning_rate": 4.905565742853307e-06, + "loss": 0.5992, + "step": 1205 + }, + { + "epoch": 0.5702127659574469, + "grad_norm": 2.899049758911133, + "learning_rate": 4.905395830506644e-06, + "loss": 0.621, + "step": 1206 + }, + { + "epoch": 0.5706855791962175, + "grad_norm": 3.036583185195923, + "learning_rate": 4.9052257683865294e-06, + "loss": 0.652, + "step": 1207 + }, + { + "epoch": 0.5711583924349882, + "grad_norm": 2.7947216033935547, + "learning_rate": 4.905055556503553e-06, + "loss": 0.6636, + "step": 1208 + }, + { + "epoch": 0.5716312056737589, + "grad_norm": 3.1646955013275146, + "learning_rate": 4.9048851948683135e-06, + "loss": 0.6376, + "step": 1209 + }, + { + "epoch": 0.5721040189125296, + "grad_norm": 2.8175766468048096, + "learning_rate": 4.904714683491417e-06, + "loss": 0.5929, + "step": 1210 + }, + { + "epoch": 0.5725768321513003, + "grad_norm": 
2.923923969268799, + "learning_rate": 4.904544022383483e-06, + "loss": 0.6633, + "step": 1211 + }, + { + "epoch": 0.573049645390071, + "grad_norm": 2.7471134662628174, + "learning_rate": 4.9043732115551356e-06, + "loss": 0.6551, + "step": 1212 + }, + { + "epoch": 0.5735224586288417, + "grad_norm": 2.8660807609558105, + "learning_rate": 4.90420225101701e-06, + "loss": 0.6423, + "step": 1213 + }, + { + "epoch": 0.5739952718676123, + "grad_norm": 2.769247531890869, + "learning_rate": 4.904031140779754e-06, + "loss": 0.5982, + "step": 1214 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.9043145179748535, + "learning_rate": 4.90385988085402e-06, + "loss": 0.5843, + "step": 1215 + }, + { + "epoch": 0.5749408983451537, + "grad_norm": 2.6639609336853027, + "learning_rate": 4.903688471250471e-06, + "loss": 0.5858, + "step": 1216 + }, + { + "epoch": 0.5754137115839244, + "grad_norm": 2.6967573165893555, + "learning_rate": 4.903516911979781e-06, + "loss": 0.5755, + "step": 1217 + }, + { + "epoch": 0.5758865248226951, + "grad_norm": 2.8865857124328613, + "learning_rate": 4.903345203052633e-06, + "loss": 0.6051, + "step": 1218 + }, + { + "epoch": 0.5763593380614658, + "grad_norm": 2.381979465484619, + "learning_rate": 4.903173344479717e-06, + "loss": 0.5727, + "step": 1219 + }, + { + "epoch": 0.5768321513002365, + "grad_norm": 2.7717981338500977, + "learning_rate": 4.903001336271734e-06, + "loss": 0.6406, + "step": 1220 + }, + { + "epoch": 0.577304964539007, + "grad_norm": 2.6431570053100586, + "learning_rate": 4.902829178439395e-06, + "loss": 0.6226, + "step": 1221 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 2.8090415000915527, + "learning_rate": 4.902656870993419e-06, + "loss": 0.5761, + "step": 1222 + }, + { + "epoch": 0.5782505910165484, + "grad_norm": 2.4769368171691895, + "learning_rate": 4.902484413944535e-06, + "loss": 0.5602, + "step": 1223 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 2.693316698074341, + "learning_rate": 4.902311807303481e-06, + "loss": 0.5222, + "step": 1224 + }, + { + "epoch": 0.5791962174940898, + "grad_norm": 2.7623913288116455, + "learning_rate": 4.902139051081004e-06, + "loss": 0.6978, + "step": 1225 + }, + { + "epoch": 0.5796690307328605, + "grad_norm": 2.6133766174316406, + "learning_rate": 4.901966145287863e-06, + "loss": 0.5802, + "step": 1226 + }, + { + "epoch": 0.5801418439716312, + "grad_norm": 2.7345972061157227, + "learning_rate": 4.901793089934821e-06, + "loss": 0.6294, + "step": 1227 + }, + { + "epoch": 0.5806146572104018, + "grad_norm": 2.7545835971832275, + "learning_rate": 4.9016198850326555e-06, + "loss": 0.6085, + "step": 1228 + }, + { + "epoch": 0.5810874704491725, + "grad_norm": 2.6947758197784424, + "learning_rate": 4.90144653059215e-06, + "loss": 0.6025, + "step": 1229 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 2.692967414855957, + "learning_rate": 4.901273026624099e-06, + "loss": 0.5715, + "step": 1230 + }, + { + "epoch": 0.5820330969267139, + "grad_norm": 2.78347110748291, + "learning_rate": 4.901099373139307e-06, + "loss": 0.6063, + "step": 1231 + }, + { + "epoch": 0.5825059101654846, + "grad_norm": 2.346496343612671, + "learning_rate": 4.900925570148585e-06, + "loss": 0.5869, + "step": 1232 + }, + { + "epoch": 0.5829787234042553, + "grad_norm": 2.606639862060547, + "learning_rate": 4.900751617662755e-06, + "loss": 0.6197, + "step": 1233 + }, + { + "epoch": 0.583451536643026, + "grad_norm": 2.5825929641723633, + "learning_rate": 4.900577515692649e-06, + "loss": 0.6721, + "step": 1234 + }, + { + "epoch": 
0.5839243498817966, + "grad_norm": 2.731349468231201, + "learning_rate": 4.900403264249107e-06, + "loss": 0.6273, + "step": 1235 + }, + { + "epoch": 0.5843971631205673, + "grad_norm": 3.2133874893188477, + "learning_rate": 4.90022886334298e-06, + "loss": 0.6231, + "step": 1236 + }, + { + "epoch": 0.584869976359338, + "grad_norm": 2.9213852882385254, + "learning_rate": 4.900054312985127e-06, + "loss": 0.6677, + "step": 1237 + }, + { + "epoch": 0.5853427895981087, + "grad_norm": 2.815425157546997, + "learning_rate": 4.899879613186414e-06, + "loss": 0.6405, + "step": 1238 + }, + { + "epoch": 0.5858156028368794, + "grad_norm": 2.730782985687256, + "learning_rate": 4.899704763957721e-06, + "loss": 0.6233, + "step": 1239 + }, + { + "epoch": 0.5862884160756501, + "grad_norm": 2.6432766914367676, + "learning_rate": 4.899529765309936e-06, + "loss": 0.6267, + "step": 1240 + }, + { + "epoch": 0.5867612293144208, + "grad_norm": 2.616215229034424, + "learning_rate": 4.899354617253953e-06, + "loss": 0.6268, + "step": 1241 + }, + { + "epoch": 0.5872340425531914, + "grad_norm": 2.7630255222320557, + "learning_rate": 4.899179319800679e-06, + "loss": 0.6348, + "step": 1242 + }, + { + "epoch": 0.5877068557919621, + "grad_norm": 2.785095453262329, + "learning_rate": 4.899003872961029e-06, + "loss": 0.5839, + "step": 1243 + }, + { + "epoch": 0.5881796690307328, + "grad_norm": 2.9050328731536865, + "learning_rate": 4.898828276745927e-06, + "loss": 0.651, + "step": 1244 + }, + { + "epoch": 0.5886524822695035, + "grad_norm": 2.958092212677002, + "learning_rate": 4.8986525311663065e-06, + "loss": 0.6395, + "step": 1245 + }, + { + "epoch": 0.5891252955082742, + "grad_norm": 2.952310800552368, + "learning_rate": 4.898476636233111e-06, + "loss": 0.6731, + "step": 1246 + }, + { + "epoch": 0.5895981087470449, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.898300591957293e-06, + "loss": 0.7015, + "step": 1247 + }, + { + "epoch": 0.5900709219858156, + "grad_norm": 2.8941752910614014, + "learning_rate": 4.898124398349813e-06, + "loss": 0.6452, + "step": 1248 + }, + { + "epoch": 0.5905437352245863, + "grad_norm": 2.9809536933898926, + "learning_rate": 4.897948055421642e-06, + "loss": 0.5736, + "step": 1249 + }, + { + "epoch": 0.5910165484633569, + "grad_norm": 2.927046775817871, + "learning_rate": 4.897771563183761e-06, + "loss": 0.5918, + "step": 1250 + }, + { + "epoch": 0.5914893617021276, + "grad_norm": 2.865020275115967, + "learning_rate": 4.897594921647158e-06, + "loss": 0.6924, + "step": 1251 + }, + { + "epoch": 0.5919621749408983, + "grad_norm": 2.7406699657440186, + "learning_rate": 4.897418130822832e-06, + "loss": 0.509, + "step": 1252 + }, + { + "epoch": 0.592434988179669, + "grad_norm": 2.781606912612915, + "learning_rate": 4.897241190721791e-06, + "loss": 0.5555, + "step": 1253 + }, + { + "epoch": 0.5929078014184397, + "grad_norm": 2.79209303855896, + "learning_rate": 4.8970641013550535e-06, + "loss": 0.6722, + "step": 1254 + }, + { + "epoch": 0.5933806146572104, + "grad_norm": 3.0672268867492676, + "learning_rate": 4.896886862733645e-06, + "loss": 0.6366, + "step": 1255 + }, + { + "epoch": 0.5938534278959811, + "grad_norm": 2.7456953525543213, + "learning_rate": 4.896709474868602e-06, + "loss": 0.6246, + "step": 1256 + }, + { + "epoch": 0.5943262411347517, + "grad_norm": 3.6731202602386475, + "learning_rate": 4.896531937770968e-06, + "loss": 0.668, + "step": 1257 + }, + { + "epoch": 0.5947990543735224, + "grad_norm": 2.6056087017059326, + "learning_rate": 4.8963542514518e-06, + "loss": 0.5815, + 
"step": 1258 + }, + { + "epoch": 0.5952718676122931, + "grad_norm": 2.719698905944824, + "learning_rate": 4.89617641592216e-06, + "loss": 0.6058, + "step": 1259 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 2.625838279724121, + "learning_rate": 4.895998431193121e-06, + "loss": 0.6143, + "step": 1260 + }, + { + "epoch": 0.5962174940898345, + "grad_norm": 2.7166085243225098, + "learning_rate": 4.895820297275767e-06, + "loss": 0.5187, + "step": 1261 + }, + { + "epoch": 0.5966903073286052, + "grad_norm": 2.7544102668762207, + "learning_rate": 4.8956420141811875e-06, + "loss": 0.5928, + "step": 1262 + }, + { + "epoch": 0.5971631205673759, + "grad_norm": 2.6678333282470703, + "learning_rate": 4.895463581920484e-06, + "loss": 0.611, + "step": 1263 + }, + { + "epoch": 0.5976359338061465, + "grad_norm": 2.853384494781494, + "learning_rate": 4.895285000504768e-06, + "loss": 0.642, + "step": 1264 + }, + { + "epoch": 0.5981087470449172, + "grad_norm": 2.637852430343628, + "learning_rate": 4.895106269945158e-06, + "loss": 0.6308, + "step": 1265 + }, + { + "epoch": 0.5985815602836879, + "grad_norm": 2.9880387783050537, + "learning_rate": 4.8949273902527826e-06, + "loss": 0.5781, + "step": 1266 + }, + { + "epoch": 0.5990543735224586, + "grad_norm": 3.5984015464782715, + "learning_rate": 4.89474836143878e-06, + "loss": 0.5865, + "step": 1267 + }, + { + "epoch": 0.5995271867612293, + "grad_norm": 2.719855546951294, + "learning_rate": 4.8945691835142975e-06, + "loss": 0.6393, + "step": 1268 + }, + { + "epoch": 0.6, + "grad_norm": 2.7885141372680664, + "learning_rate": 4.894389856490492e-06, + "loss": 0.66, + "step": 1269 + }, + { + "epoch": 0.6004728132387707, + "grad_norm": 2.698819875717163, + "learning_rate": 4.894210380378529e-06, + "loss": 0.6144, + "step": 1270 + }, + { + "epoch": 0.6009456264775414, + "grad_norm": 2.278045654296875, + "learning_rate": 4.894030755189584e-06, + "loss": 0.5609, + "step": 1271 + }, + { + "epoch": 0.601418439716312, + "grad_norm": 2.8729357719421387, + "learning_rate": 4.893850980934841e-06, + "loss": 0.6715, + "step": 1272 + }, + { + "epoch": 0.6018912529550827, + "grad_norm": 2.8541221618652344, + "learning_rate": 4.893671057625495e-06, + "loss": 0.6787, + "step": 1273 + }, + { + "epoch": 0.6023640661938534, + "grad_norm": 2.4561476707458496, + "learning_rate": 4.893490985272748e-06, + "loss": 0.6331, + "step": 1274 + }, + { + "epoch": 0.6028368794326241, + "grad_norm": 2.565739154815674, + "learning_rate": 4.893310763887812e-06, + "loss": 0.587, + "step": 1275 + }, + { + "epoch": 0.6033096926713948, + "grad_norm": 2.384951591491699, + "learning_rate": 4.8931303934819095e-06, + "loss": 0.5358, + "step": 1276 + }, + { + "epoch": 0.6037825059101655, + "grad_norm": 2.380808115005493, + "learning_rate": 4.89294987406627e-06, + "loss": 0.5402, + "step": 1277 + }, + { + "epoch": 0.6042553191489362, + "grad_norm": 2.764815092086792, + "learning_rate": 4.892769205652136e-06, + "loss": 0.6103, + "step": 1278 + }, + { + "epoch": 0.6047281323877068, + "grad_norm": 2.463350296020508, + "learning_rate": 4.892588388250754e-06, + "loss": 0.5937, + "step": 1279 + }, + { + "epoch": 0.6052009456264775, + "grad_norm": 3.099689245223999, + "learning_rate": 4.8924074218733855e-06, + "loss": 0.6354, + "step": 1280 + }, + { + "epoch": 0.6056737588652482, + "grad_norm": 2.804450035095215, + "learning_rate": 4.892226306531297e-06, + "loss": 0.6595, + "step": 1281 + }, + { + "epoch": 0.6061465721040189, + "grad_norm": 3.1559767723083496, + "learning_rate": 4.892045042235765e-06, + 
"loss": 0.6664, + "step": 1282 + }, + { + "epoch": 0.6066193853427896, + "grad_norm": 2.844341993331909, + "learning_rate": 4.891863628998079e-06, + "loss": 0.7454, + "step": 1283 + }, + { + "epoch": 0.6070921985815603, + "grad_norm": 2.686602830886841, + "learning_rate": 4.891682066829532e-06, + "loss": 0.6755, + "step": 1284 + }, + { + "epoch": 0.607565011820331, + "grad_norm": 2.736457347869873, + "learning_rate": 4.8915003557414285e-06, + "loss": 0.6305, + "step": 1285 + }, + { + "epoch": 0.6080378250591016, + "grad_norm": 2.661362409591675, + "learning_rate": 4.891318495745086e-06, + "loss": 0.5958, + "step": 1286 + }, + { + "epoch": 0.6085106382978723, + "grad_norm": 2.707348108291626, + "learning_rate": 4.8911364868518255e-06, + "loss": 0.5824, + "step": 1287 + }, + { + "epoch": 0.608983451536643, + "grad_norm": 2.9798858165740967, + "learning_rate": 4.890954329072981e-06, + "loss": 0.5981, + "step": 1288 + }, + { + "epoch": 0.6094562647754137, + "grad_norm": 2.6285455226898193, + "learning_rate": 4.890772022419895e-06, + "loss": 0.6194, + "step": 1289 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 2.9254322052001953, + "learning_rate": 4.890589566903917e-06, + "loss": 0.6002, + "step": 1290 + }, + { + "epoch": 0.6104018912529551, + "grad_norm": 2.6458325386047363, + "learning_rate": 4.89040696253641e-06, + "loss": 0.5457, + "step": 1291 + }, + { + "epoch": 0.6108747044917258, + "grad_norm": 2.508242607116699, + "learning_rate": 4.890224209328743e-06, + "loss": 0.6168, + "step": 1292 + }, + { + "epoch": 0.6113475177304964, + "grad_norm": 3.034785509109497, + "learning_rate": 4.890041307292296e-06, + "loss": 0.664, + "step": 1293 + }, + { + "epoch": 0.6118203309692671, + "grad_norm": 3.52469539642334, + "learning_rate": 4.889858256438455e-06, + "loss": 0.7301, + "step": 1294 + }, + { + "epoch": 0.6122931442080378, + "grad_norm": 2.9145348072052, + "learning_rate": 4.889675056778622e-06, + "loss": 0.6494, + "step": 1295 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 2.831829071044922, + "learning_rate": 4.8894917083242e-06, + "loss": 0.6064, + "step": 1296 + }, + { + "epoch": 0.6132387706855792, + "grad_norm": 2.6883130073547363, + "learning_rate": 4.889308211086608e-06, + "loss": 0.5642, + "step": 1297 + }, + { + "epoch": 0.6137115839243499, + "grad_norm": 3.0605485439300537, + "learning_rate": 4.889124565077269e-06, + "loss": 0.6695, + "step": 1298 + }, + { + "epoch": 0.6141843971631206, + "grad_norm": 3.44062876701355, + "learning_rate": 4.88894077030762e-06, + "loss": 0.6415, + "step": 1299 + }, + { + "epoch": 0.6146572104018913, + "grad_norm": 2.5970818996429443, + "learning_rate": 4.888756826789105e-06, + "loss": 0.6518, + "step": 1300 + }, + { + "epoch": 0.6151300236406619, + "grad_norm": 4.2233567237854, + "learning_rate": 4.8885727345331755e-06, + "loss": 0.6555, + "step": 1301 + }, + { + "epoch": 0.6156028368794326, + "grad_norm": 2.645385503768921, + "learning_rate": 4.888388493551297e-06, + "loss": 0.6762, + "step": 1302 + }, + { + "epoch": 0.6160756501182033, + "grad_norm": 2.907954454421997, + "learning_rate": 4.8882041038549385e-06, + "loss": 0.6526, + "step": 1303 + }, + { + "epoch": 0.616548463356974, + "grad_norm": 2.482771873474121, + "learning_rate": 4.888019565455583e-06, + "loss": 0.628, + "step": 1304 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 2.7165915966033936, + "learning_rate": 4.88783487836472e-06, + "loss": 0.5743, + "step": 1305 + }, + { + "epoch": 0.6174940898345154, + "grad_norm": 3.095627546310425, + "learning_rate": 
4.88765004259385e-06, + "loss": 0.627, + "step": 1306 + }, + { + "epoch": 0.6179669030732861, + "grad_norm": 2.5018465518951416, + "learning_rate": 4.8874650581544805e-06, + "loss": 0.5215, + "step": 1307 + }, + { + "epoch": 0.6184397163120567, + "grad_norm": 3.094337224960327, + "learning_rate": 4.8872799250581316e-06, + "loss": 0.6979, + "step": 1308 + }, + { + "epoch": 0.6189125295508274, + "grad_norm": 3.1002209186553955, + "learning_rate": 4.887094643316329e-06, + "loss": 0.6565, + "step": 1309 + }, + { + "epoch": 0.6193853427895981, + "grad_norm": 2.551431894302368, + "learning_rate": 4.88690921294061e-06, + "loss": 0.5748, + "step": 1310 + }, + { + "epoch": 0.6198581560283688, + "grad_norm": 2.8282904624938965, + "learning_rate": 4.886723633942521e-06, + "loss": 0.676, + "step": 1311 + }, + { + "epoch": 0.6203309692671395, + "grad_norm": 2.8887810707092285, + "learning_rate": 4.886537906333617e-06, + "loss": 0.5971, + "step": 1312 + }, + { + "epoch": 0.6208037825059102, + "grad_norm": 2.9989118576049805, + "learning_rate": 4.886352030125462e-06, + "loss": 0.6341, + "step": 1313 + }, + { + "epoch": 0.6212765957446809, + "grad_norm": 2.8042776584625244, + "learning_rate": 4.886166005329629e-06, + "loss": 0.6578, + "step": 1314 + }, + { + "epoch": 0.6217494089834515, + "grad_norm": 2.4980967044830322, + "learning_rate": 4.8859798319577026e-06, + "loss": 0.6711, + "step": 1315 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 2.762369155883789, + "learning_rate": 4.885793510021274e-06, + "loss": 0.5747, + "step": 1316 + }, + { + "epoch": 0.6226950354609929, + "grad_norm": 3.136327028274536, + "learning_rate": 4.885607039531945e-06, + "loss": 0.7544, + "step": 1317 + }, + { + "epoch": 0.6231678486997636, + "grad_norm": 2.8736963272094727, + "learning_rate": 4.885420420501327e-06, + "loss": 0.6603, + "step": 1318 + }, + { + "epoch": 0.6236406619385343, + "grad_norm": 2.766237497329712, + "learning_rate": 4.885233652941039e-06, + "loss": 0.581, + "step": 1319 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 2.4740939140319824, + "learning_rate": 4.88504673686271e-06, + "loss": 0.6335, + "step": 1320 + }, + { + "epoch": 0.6245862884160757, + "grad_norm": 3.324795961380005, + "learning_rate": 4.884859672277978e-06, + "loss": 0.6019, + "step": 1321 + }, + { + "epoch": 0.6250591016548463, + "grad_norm": 3.521327257156372, + "learning_rate": 4.884672459198493e-06, + "loss": 0.6104, + "step": 1322 + }, + { + "epoch": 0.625531914893617, + "grad_norm": 2.7728071212768555, + "learning_rate": 4.884485097635909e-06, + "loss": 0.6714, + "step": 1323 + }, + { + "epoch": 0.6260047281323877, + "grad_norm": 3.0738155841827393, + "learning_rate": 4.884297587601895e-06, + "loss": 0.604, + "step": 1324 + }, + { + "epoch": 0.6264775413711584, + "grad_norm": 2.719240427017212, + "learning_rate": 4.884109929108124e-06, + "loss": 0.6795, + "step": 1325 + }, + { + "epoch": 0.6269503546099291, + "grad_norm": 2.4108200073242188, + "learning_rate": 4.883922122166282e-06, + "loss": 0.5846, + "step": 1326 + }, + { + "epoch": 0.6274231678486998, + "grad_norm": 2.393899917602539, + "learning_rate": 4.883734166788063e-06, + "loss": 0.6188, + "step": 1327 + }, + { + "epoch": 0.6278959810874705, + "grad_norm": 4.555255889892578, + "learning_rate": 4.883546062985169e-06, + "loss": 0.5962, + "step": 1328 + }, + { + "epoch": 0.6283687943262412, + "grad_norm": 2.571075439453125, + "learning_rate": 4.883357810769315e-06, + "loss": 0.6165, + "step": 1329 + }, + { + "epoch": 0.6288416075650118, + "grad_norm": 
2.553115129470825, + "learning_rate": 4.8831694101522185e-06, + "loss": 0.6787, + "step": 1330 + }, + { + "epoch": 0.6293144208037825, + "grad_norm": 3.2564642429351807, + "learning_rate": 4.882980861145614e-06, + "loss": 0.659, + "step": 1331 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 2.535216808319092, + "learning_rate": 4.882792163761241e-06, + "loss": 0.6176, + "step": 1332 + }, + { + "epoch": 0.6302600472813239, + "grad_norm": 3.097921848297119, + "learning_rate": 4.882603318010847e-06, + "loss": 0.6822, + "step": 1333 + }, + { + "epoch": 0.6307328605200946, + "grad_norm": 2.8135175704956055, + "learning_rate": 4.882414323906192e-06, + "loss": 0.6782, + "step": 1334 + }, + { + "epoch": 0.6312056737588653, + "grad_norm": 2.724634885787964, + "learning_rate": 4.882225181459044e-06, + "loss": 0.6545, + "step": 1335 + }, + { + "epoch": 0.631678486997636, + "grad_norm": 2.9585227966308594, + "learning_rate": 4.882035890681179e-06, + "loss": 0.6218, + "step": 1336 + }, + { + "epoch": 0.6321513002364066, + "grad_norm": 2.6952011585235596, + "learning_rate": 4.881846451584385e-06, + "loss": 0.6, + "step": 1337 + }, + { + "epoch": 0.6326241134751773, + "grad_norm": 3.1400704383850098, + "learning_rate": 4.881656864180455e-06, + "loss": 0.6687, + "step": 1338 + }, + { + "epoch": 0.633096926713948, + "grad_norm": 2.8382487297058105, + "learning_rate": 4.881467128481197e-06, + "loss": 0.574, + "step": 1339 + }, + { + "epoch": 0.6335697399527187, + "grad_norm": 2.8520095348358154, + "learning_rate": 4.881277244498422e-06, + "loss": 0.6582, + "step": 1340 + }, + { + "epoch": 0.6340425531914894, + "grad_norm": 2.703498363494873, + "learning_rate": 4.881087212243956e-06, + "loss": 0.7224, + "step": 1341 + }, + { + "epoch": 0.6345153664302601, + "grad_norm": 3.697205066680908, + "learning_rate": 4.880897031729629e-06, + "loss": 0.6582, + "step": 1342 + }, + { + "epoch": 0.6349881796690308, + "grad_norm": 2.7625808715820312, + "learning_rate": 4.880706702967284e-06, + "loss": 0.574, + "step": 1343 + }, + { + "epoch": 0.6354609929078014, + "grad_norm": 2.949984073638916, + "learning_rate": 4.880516225968771e-06, + "loss": 0.66, + "step": 1344 + }, + { + "epoch": 0.6359338061465721, + "grad_norm": 2.548269748687744, + "learning_rate": 4.8803256007459525e-06, + "loss": 0.642, + "step": 1345 + }, + { + "epoch": 0.6364066193853428, + "grad_norm": 2.5102174282073975, + "learning_rate": 4.8801348273106945e-06, + "loss": 0.6238, + "step": 1346 + }, + { + "epoch": 0.6368794326241135, + "grad_norm": 2.9847946166992188, + "learning_rate": 4.8799439056748786e-06, + "loss": 0.5416, + "step": 1347 + }, + { + "epoch": 0.6373522458628842, + "grad_norm": 2.8711049556732178, + "learning_rate": 4.879752835850391e-06, + "loss": 0.6427, + "step": 1348 + }, + { + "epoch": 0.6378250591016549, + "grad_norm": 2.7901716232299805, + "learning_rate": 4.879561617849129e-06, + "loss": 0.6026, + "step": 1349 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 2.659778356552124, + "learning_rate": 4.879370251682999e-06, + "loss": 0.6623, + "step": 1350 + }, + { + "epoch": 0.6387706855791963, + "grad_norm": 3.224386692047119, + "learning_rate": 4.879178737363917e-06, + "loss": 0.6485, + "step": 1351 + }, + { + "epoch": 0.6392434988179669, + "grad_norm": 2.6385605335235596, + "learning_rate": 4.8789870749038076e-06, + "loss": 0.5866, + "step": 1352 + }, + { + "epoch": 0.6397163120567376, + "grad_norm": 2.807713270187378, + "learning_rate": 4.8787952643146045e-06, + "loss": 0.6537, + "step": 1353 + }, + { + "epoch": 
0.6401891252955083, + "grad_norm": 2.5689280033111572, + "learning_rate": 4.878603305608251e-06, + "loss": 0.6216, + "step": 1354 + }, + { + "epoch": 0.640661938534279, + "grad_norm": 2.7347843647003174, + "learning_rate": 4.8784111987967e-06, + "loss": 0.6318, + "step": 1355 + }, + { + "epoch": 0.6411347517730497, + "grad_norm": 2.5210378170013428, + "learning_rate": 4.878218943891911e-06, + "loss": 0.5472, + "step": 1356 + }, + { + "epoch": 0.6416075650118204, + "grad_norm": 2.866785764694214, + "learning_rate": 4.878026540905858e-06, + "loss": 0.7108, + "step": 1357 + }, + { + "epoch": 0.642080378250591, + "grad_norm": 2.923314332962036, + "learning_rate": 4.877833989850519e-06, + "loss": 0.5557, + "step": 1358 + }, + { + "epoch": 0.6425531914893617, + "grad_norm": 2.925463914871216, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.6382, + "step": 1359 + }, + { + "epoch": 0.6430260047281324, + "grad_norm": 2.909644365310669, + "learning_rate": 4.877448443579952e-06, + "loss": 0.5603, + "step": 1360 + }, + { + "epoch": 0.6434988179669031, + "grad_norm": 3.501148223876953, + "learning_rate": 4.8772554483887306e-06, + "loss": 0.6722, + "step": 1361 + }, + { + "epoch": 0.6439716312056738, + "grad_norm": 2.823765516281128, + "learning_rate": 4.877062305176235e-06, + "loss": 0.6408, + "step": 1362 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 2.9807584285736084, + "learning_rate": 4.8768690139544935e-06, + "loss": 0.5984, + "step": 1363 + }, + { + "epoch": 0.6449172576832152, + "grad_norm": 2.8411378860473633, + "learning_rate": 4.8766755747355405e-06, + "loss": 0.6231, + "step": 1364 + }, + { + "epoch": 0.6453900709219859, + "grad_norm": 3.158952236175537, + "learning_rate": 4.8764819875314215e-06, + "loss": 0.6441, + "step": 1365 + }, + { + "epoch": 0.6458628841607565, + "grad_norm": 2.9614369869232178, + "learning_rate": 4.876288252354189e-06, + "loss": 0.6308, + "step": 1366 + }, + { + "epoch": 0.6463356973995272, + "grad_norm": 3.073805570602417, + "learning_rate": 4.876094369215907e-06, + "loss": 0.6046, + "step": 1367 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 2.719189405441284, + "learning_rate": 4.875900338128648e-06, + "loss": 0.6082, + "step": 1368 + }, + { + "epoch": 0.6472813238770686, + "grad_norm": 2.676726818084717, + "learning_rate": 4.8757061591044914e-06, + "loss": 0.6344, + "step": 1369 + }, + { + "epoch": 0.6477541371158393, + "grad_norm": 2.955256938934326, + "learning_rate": 4.87551183215553e-06, + "loss": 0.6506, + "step": 1370 + }, + { + "epoch": 0.64822695035461, + "grad_norm": 2.5672218799591064, + "learning_rate": 4.875317357293864e-06, + "loss": 0.5284, + "step": 1371 + }, + { + "epoch": 0.6486997635933807, + "grad_norm": 2.5860238075256348, + "learning_rate": 4.875122734531602e-06, + "loss": 0.667, + "step": 1372 + }, + { + "epoch": 0.6491725768321513, + "grad_norm": 3.1037003993988037, + "learning_rate": 4.8749279638808605e-06, + "loss": 0.6902, + "step": 1373 + }, + { + "epoch": 0.649645390070922, + "grad_norm": 2.7715282440185547, + "learning_rate": 4.874733045353769e-06, + "loss": 0.6291, + "step": 1374 + }, + { + "epoch": 0.6501182033096927, + "grad_norm": 2.527071475982666, + "learning_rate": 4.874537978962463e-06, + "loss": 0.5565, + "step": 1375 + }, + { + "epoch": 0.6505910165484634, + "grad_norm": 2.722092628479004, + "learning_rate": 4.874342764719091e-06, + "loss": 0.5724, + "step": 1376 + }, + { + "epoch": 0.6510638297872341, + "grad_norm": 2.6342411041259766, + "learning_rate": 4.874147402635805e-06, + "loss": 0.6308, + 
"step": 1377 + }, + { + "epoch": 0.6515366430260048, + "grad_norm": 2.3850719928741455, + "learning_rate": 4.8739518927247695e-06, + "loss": 0.5692, + "step": 1378 + }, + { + "epoch": 0.6520094562647755, + "grad_norm": 2.9787259101867676, + "learning_rate": 4.873756234998161e-06, + "loss": 0.6953, + "step": 1379 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 2.634141683578491, + "learning_rate": 4.873560429468159e-06, + "loss": 0.6077, + "step": 1380 + }, + { + "epoch": 0.6529550827423168, + "grad_norm": 2.803046941757202, + "learning_rate": 4.873364476146958e-06, + "loss": 0.6657, + "step": 1381 + }, + { + "epoch": 0.6534278959810875, + "grad_norm": 2.762827157974243, + "learning_rate": 4.8731683750467574e-06, + "loss": 0.6061, + "step": 1382 + }, + { + "epoch": 0.6539007092198581, + "grad_norm": 2.6654391288757324, + "learning_rate": 4.872972126179768e-06, + "loss": 0.6387, + "step": 1383 + }, + { + "epoch": 0.6543735224586288, + "grad_norm": 2.4363625049591064, + "learning_rate": 4.872775729558209e-06, + "loss": 0.5623, + "step": 1384 + }, + { + "epoch": 0.6548463356973995, + "grad_norm": 2.528959035873413, + "learning_rate": 4.87257918519431e-06, + "loss": 0.5609, + "step": 1385 + }, + { + "epoch": 0.6553191489361702, + "grad_norm": 2.718383312225342, + "learning_rate": 4.872382493100309e-06, + "loss": 0.5575, + "step": 1386 + }, + { + "epoch": 0.6557919621749408, + "grad_norm": 2.660841226577759, + "learning_rate": 4.872185653288453e-06, + "loss": 0.6106, + "step": 1387 + }, + { + "epoch": 0.6562647754137115, + "grad_norm": 2.508753538131714, + "learning_rate": 4.871988665770997e-06, + "loss": 0.5705, + "step": 1388 + }, + { + "epoch": 0.6567375886524822, + "grad_norm": 2.5134334564208984, + "learning_rate": 4.871791530560208e-06, + "loss": 0.5592, + "step": 1389 + }, + { + "epoch": 0.6572104018912529, + "grad_norm": 2.7475597858428955, + "learning_rate": 4.871594247668361e-06, + "loss": 0.6277, + "step": 1390 + }, + { + "epoch": 0.6576832151300236, + "grad_norm": 2.793616533279419, + "learning_rate": 4.871396817107739e-06, + "loss": 0.595, + "step": 1391 + }, + { + "epoch": 0.6581560283687943, + "grad_norm": 2.8285086154937744, + "learning_rate": 4.871199238890635e-06, + "loss": 0.6094, + "step": 1392 + }, + { + "epoch": 0.658628841607565, + "grad_norm": 2.74124813079834, + "learning_rate": 4.871001513029352e-06, + "loss": 0.6296, + "step": 1393 + }, + { + "epoch": 0.6591016548463356, + "grad_norm": 2.761237621307373, + "learning_rate": 4.870803639536202e-06, + "loss": 0.5702, + "step": 1394 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 2.761038064956665, + "learning_rate": 4.870605618423504e-06, + "loss": 0.6195, + "step": 1395 + }, + { + "epoch": 0.660047281323877, + "grad_norm": 2.8812482357025146, + "learning_rate": 4.870407449703589e-06, + "loss": 0.616, + "step": 1396 + }, + { + "epoch": 0.6605200945626477, + "grad_norm": 2.9966578483581543, + "learning_rate": 4.870209133388797e-06, + "loss": 0.6547, + "step": 1397 + }, + { + "epoch": 0.6609929078014184, + "grad_norm": 2.7969017028808594, + "learning_rate": 4.870010669491474e-06, + "loss": 0.5762, + "step": 1398 + }, + { + "epoch": 0.6614657210401891, + "grad_norm": 2.557783842086792, + "learning_rate": 4.86981205802398e-06, + "loss": 0.6184, + "step": 1399 + }, + { + "epoch": 0.6619385342789598, + "grad_norm": 2.5393927097320557, + "learning_rate": 4.86961329899868e-06, + "loss": 0.5953, + "step": 1400 + }, + { + "epoch": 0.6624113475177305, + "grad_norm": 2.7745981216430664, + "learning_rate": 
4.86941439242795e-06, + "loss": 0.5967, + "step": 1401 + }, + { + "epoch": 0.6628841607565011, + "grad_norm": 2.650381326675415, + "learning_rate": 4.869215338324176e-06, + "loss": 0.5667, + "step": 1402 + }, + { + "epoch": 0.6633569739952718, + "grad_norm": 2.583169937133789, + "learning_rate": 4.869016136699751e-06, + "loss": 0.549, + "step": 1403 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 2.984978437423706, + "learning_rate": 4.868816787567079e-06, + "loss": 0.5931, + "step": 1404 + }, + { + "epoch": 0.6643026004728132, + "grad_norm": 3.1947181224823, + "learning_rate": 4.868617290938573e-06, + "loss": 0.5473, + "step": 1405 + }, + { + "epoch": 0.6647754137115839, + "grad_norm": 2.562927007675171, + "learning_rate": 4.868417646826654e-06, + "loss": 0.6878, + "step": 1406 + }, + { + "epoch": 0.6652482269503546, + "grad_norm": 2.8741261959075928, + "learning_rate": 4.868217855243754e-06, + "loss": 0.6312, + "step": 1407 + }, + { + "epoch": 0.6657210401891253, + "grad_norm": 2.9834797382354736, + "learning_rate": 4.868017916202312e-06, + "loss": 0.5624, + "step": 1408 + }, + { + "epoch": 0.6661938534278959, + "grad_norm": 2.6935982704162598, + "learning_rate": 4.8678178297147785e-06, + "loss": 0.5857, + "step": 1409 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.8200576305389404, + "learning_rate": 4.86761759579361e-06, + "loss": 0.6153, + "step": 1410 + }, + { + "epoch": 0.6671394799054373, + "grad_norm": 2.831425189971924, + "learning_rate": 4.867417214451276e-06, + "loss": 0.6495, + "step": 1411 + }, + { + "epoch": 0.667612293144208, + "grad_norm": 2.733565092086792, + "learning_rate": 4.867216685700253e-06, + "loss": 0.6036, + "step": 1412 + }, + { + "epoch": 0.6680851063829787, + "grad_norm": 3.0609400272369385, + "learning_rate": 4.867016009553027e-06, + "loss": 0.6773, + "step": 1413 + }, + { + "epoch": 0.6685579196217494, + "grad_norm": 2.665452241897583, + "learning_rate": 4.866815186022093e-06, + "loss": 0.6256, + "step": 1414 + }, + { + "epoch": 0.6690307328605201, + "grad_norm": 2.9480721950531006, + "learning_rate": 4.866614215119956e-06, + "loss": 0.535, + "step": 1415 + }, + { + "epoch": 0.6695035460992907, + "grad_norm": 2.5514180660247803, + "learning_rate": 4.866413096859128e-06, + "loss": 0.6588, + "step": 1416 + }, + { + "epoch": 0.6699763593380614, + "grad_norm": 3.3442373275756836, + "learning_rate": 4.866211831252134e-06, + "loss": 0.5754, + "step": 1417 + }, + { + "epoch": 0.6704491725768321, + "grad_norm": 2.521467685699463, + "learning_rate": 4.866010418311504e-06, + "loss": 0.5546, + "step": 1418 + }, + { + "epoch": 0.6709219858156028, + "grad_norm": 2.930706262588501, + "learning_rate": 4.865808858049781e-06, + "loss": 0.589, + "step": 1419 + }, + { + "epoch": 0.6713947990543735, + "grad_norm": 2.6298375129699707, + "learning_rate": 4.865607150479513e-06, + "loss": 0.5915, + "step": 1420 + }, + { + "epoch": 0.6718676122931442, + "grad_norm": 2.9554293155670166, + "learning_rate": 4.8654052956132615e-06, + "loss": 0.6654, + "step": 1421 + }, + { + "epoch": 0.6723404255319149, + "grad_norm": 3.2706902027130127, + "learning_rate": 4.865203293463593e-06, + "loss": 0.7115, + "step": 1422 + }, + { + "epoch": 0.6728132387706856, + "grad_norm": 3.041539430618286, + "learning_rate": 4.865001144043088e-06, + "loss": 0.5818, + "step": 1423 + }, + { + "epoch": 0.6732860520094562, + "grad_norm": 3.1314544677734375, + "learning_rate": 4.864798847364331e-06, + "loss": 0.5822, + "step": 1424 + }, + { + "epoch": 0.6737588652482269, + "grad_norm": 
2.5301461219787598, + "learning_rate": 4.86459640343992e-06, + "loss": 0.5525, + "step": 1425 + }, + { + "epoch": 0.6742316784869976, + "grad_norm": 2.809295892715454, + "learning_rate": 4.864393812282458e-06, + "loss": 0.6768, + "step": 1426 + }, + { + "epoch": 0.6747044917257683, + "grad_norm": 2.794664144515991, + "learning_rate": 4.864191073904562e-06, + "loss": 0.5793, + "step": 1427 + }, + { + "epoch": 0.675177304964539, + "grad_norm": 2.7771105766296387, + "learning_rate": 4.863988188318854e-06, + "loss": 0.6453, + "step": 1428 + }, + { + "epoch": 0.6756501182033097, + "grad_norm": 2.6431946754455566, + "learning_rate": 4.863785155537967e-06, + "loss": 0.5877, + "step": 1429 + }, + { + "epoch": 0.6761229314420804, + "grad_norm": 2.951353073120117, + "learning_rate": 4.863581975574544e-06, + "loss": 0.6793, + "step": 1430 + }, + { + "epoch": 0.676595744680851, + "grad_norm": 3.1336071491241455, + "learning_rate": 4.863378648441235e-06, + "loss": 0.6695, + "step": 1431 + }, + { + "epoch": 0.6770685579196217, + "grad_norm": 2.735982656478882, + "learning_rate": 4.8631751741507e-06, + "loss": 0.5239, + "step": 1432 + }, + { + "epoch": 0.6775413711583924, + "grad_norm": 2.7085206508636475, + "learning_rate": 4.862971552715611e-06, + "loss": 0.6837, + "step": 1433 + }, + { + "epoch": 0.6780141843971631, + "grad_norm": 3.136528730392456, + "learning_rate": 4.8627677841486436e-06, + "loss": 0.683, + "step": 1434 + }, + { + "epoch": 0.6784869976359338, + "grad_norm": 2.7879369258880615, + "learning_rate": 4.862563868462486e-06, + "loss": 0.608, + "step": 1435 + }, + { + "epoch": 0.6789598108747045, + "grad_norm": 2.7937729358673096, + "learning_rate": 4.862359805669837e-06, + "loss": 0.6131, + "step": 1436 + }, + { + "epoch": 0.6794326241134752, + "grad_norm": 2.5988364219665527, + "learning_rate": 4.862155595783401e-06, + "loss": 0.6303, + "step": 1437 + }, + { + "epoch": 0.6799054373522458, + "grad_norm": 3.251070499420166, + "learning_rate": 4.861951238815894e-06, + "loss": 0.7246, + "step": 1438 + }, + { + "epoch": 0.6803782505910165, + "grad_norm": 2.646759271621704, + "learning_rate": 4.861746734780039e-06, + "loss": 0.6313, + "step": 1439 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 2.773866891860962, + "learning_rate": 4.861542083688573e-06, + "loss": 0.6463, + "step": 1440 + }, + { + "epoch": 0.6813238770685579, + "grad_norm": 2.759965658187866, + "learning_rate": 4.861337285554235e-06, + "loss": 0.5428, + "step": 1441 + }, + { + "epoch": 0.6817966903073286, + "grad_norm": 3.3250818252563477, + "learning_rate": 4.861132340389779e-06, + "loss": 0.6522, + "step": 1442 + }, + { + "epoch": 0.6822695035460993, + "grad_norm": 2.661797523498535, + "learning_rate": 4.860927248207965e-06, + "loss": 0.5871, + "step": 1443 + }, + { + "epoch": 0.68274231678487, + "grad_norm": 2.706289052963257, + "learning_rate": 4.860722009021563e-06, + "loss": 0.6651, + "step": 1444 + }, + { + "epoch": 0.6832151300236406, + "grad_norm": 2.8459298610687256, + "learning_rate": 4.860516622843354e-06, + "loss": 0.5827, + "step": 1445 + }, + { + "epoch": 0.6836879432624113, + "grad_norm": 3.1041831970214844, + "learning_rate": 4.860311089686125e-06, + "loss": 0.6727, + "step": 1446 + }, + { + "epoch": 0.684160756501182, + "grad_norm": 2.9382801055908203, + "learning_rate": 4.8601054095626746e-06, + "loss": 0.6002, + "step": 1447 + }, + { + "epoch": 0.6846335697399527, + "grad_norm": 2.782475471496582, + "learning_rate": 4.859899582485808e-06, + "loss": 0.6951, + "step": 1448 + }, + { + "epoch": 
0.6851063829787234, + "grad_norm": 3.313894510269165, + "learning_rate": 4.859693608468343e-06, + "loss": 0.6363, + "step": 1449 + }, + { + "epoch": 0.6855791962174941, + "grad_norm": 3.1639695167541504, + "learning_rate": 4.8594874875231045e-06, + "loss": 0.7002, + "step": 1450 + }, + { + "epoch": 0.6860520094562648, + "grad_norm": 2.6762218475341797, + "learning_rate": 4.859281219662926e-06, + "loss": 0.6246, + "step": 1451 + }, + { + "epoch": 0.6865248226950355, + "grad_norm": 2.8368663787841797, + "learning_rate": 4.85907480490065e-06, + "loss": 0.5906, + "step": 1452 + }, + { + "epoch": 0.6869976359338061, + "grad_norm": 2.887373208999634, + "learning_rate": 4.858868243249131e-06, + "loss": 0.5931, + "step": 1453 + }, + { + "epoch": 0.6874704491725768, + "grad_norm": 2.8115322589874268, + "learning_rate": 4.858661534721229e-06, + "loss": 0.6337, + "step": 1454 + }, + { + "epoch": 0.6879432624113475, + "grad_norm": 2.8470499515533447, + "learning_rate": 4.8584546793298174e-06, + "loss": 0.632, + "step": 1455 + }, + { + "epoch": 0.6884160756501182, + "grad_norm": 2.8229613304138184, + "learning_rate": 4.8582476770877725e-06, + "loss": 0.6494, + "step": 1456 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 2.4235479831695557, + "learning_rate": 4.858040528007987e-06, + "loss": 0.5709, + "step": 1457 + }, + { + "epoch": 0.6893617021276596, + "grad_norm": 2.9348199367523193, + "learning_rate": 4.857833232103356e-06, + "loss": 0.5404, + "step": 1458 + }, + { + "epoch": 0.6898345153664303, + "grad_norm": 2.8274219036102295, + "learning_rate": 4.857625789386789e-06, + "loss": 0.701, + "step": 1459 + }, + { + "epoch": 0.6903073286052009, + "grad_norm": 3.136929988861084, + "learning_rate": 4.857418199871203e-06, + "loss": 0.6971, + "step": 1460 + }, + { + "epoch": 0.6907801418439716, + "grad_norm": 2.8987185955047607, + "learning_rate": 4.8572104635695214e-06, + "loss": 0.6613, + "step": 1461 + }, + { + "epoch": 0.6912529550827423, + "grad_norm": 2.5073442459106445, + "learning_rate": 4.857002580494681e-06, + "loss": 0.6032, + "step": 1462 + }, + { + "epoch": 0.691725768321513, + "grad_norm": 2.7019522190093994, + "learning_rate": 4.856794550659625e-06, + "loss": 0.567, + "step": 1463 + }, + { + "epoch": 0.6921985815602837, + "grad_norm": 2.4795594215393066, + "learning_rate": 4.8565863740773054e-06, + "loss": 0.5777, + "step": 1464 + }, + { + "epoch": 0.6926713947990544, + "grad_norm": 3.032506227493286, + "learning_rate": 4.856378050760687e-06, + "loss": 0.607, + "step": 1465 + }, + { + "epoch": 0.6931442080378251, + "grad_norm": 3.052091121673584, + "learning_rate": 4.85616958072274e-06, + "loss": 0.591, + "step": 1466 + }, + { + "epoch": 0.6936170212765957, + "grad_norm": 2.704831838607788, + "learning_rate": 4.855960963976443e-06, + "loss": 0.6528, + "step": 1467 + }, + { + "epoch": 0.6940898345153664, + "grad_norm": 2.680995225906372, + "learning_rate": 4.855752200534788e-06, + "loss": 0.6294, + "step": 1468 + }, + { + "epoch": 0.6945626477541371, + "grad_norm": 2.3948659896850586, + "learning_rate": 4.855543290410774e-06, + "loss": 0.6091, + "step": 1469 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 2.6407411098480225, + "learning_rate": 4.855334233617407e-06, + "loss": 0.5572, + "step": 1470 + }, + { + "epoch": 0.6955082742316785, + "grad_norm": 2.5526835918426514, + "learning_rate": 4.8551250301677064e-06, + "loss": 0.5432, + "step": 1471 + }, + { + "epoch": 0.6959810874704492, + "grad_norm": 3.1237430572509766, + "learning_rate": 4.8549156800746965e-06, + "loss": 
0.5944, + "step": 1472 + }, + { + "epoch": 0.6964539007092199, + "grad_norm": 2.8112540245056152, + "learning_rate": 4.854706183351412e-06, + "loss": 0.604, + "step": 1473 + }, + { + "epoch": 0.6969267139479906, + "grad_norm": 2.664644479751587, + "learning_rate": 4.8544965400109e-06, + "loss": 0.5647, + "step": 1474 + }, + { + "epoch": 0.6973995271867612, + "grad_norm": 3.26310133934021, + "learning_rate": 4.854286750066212e-06, + "loss": 0.6999, + "step": 1475 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 2.9717442989349365, + "learning_rate": 4.8540768135304115e-06, + "loss": 0.6655, + "step": 1476 + }, + { + "epoch": 0.6983451536643026, + "grad_norm": 2.5302982330322266, + "learning_rate": 4.85386673041657e-06, + "loss": 0.6384, + "step": 1477 + }, + { + "epoch": 0.6988179669030733, + "grad_norm": 2.864877700805664, + "learning_rate": 4.853656500737769e-06, + "loss": 0.6834, + "step": 1478 + }, + { + "epoch": 0.699290780141844, + "grad_norm": 2.5522031784057617, + "learning_rate": 4.853446124507098e-06, + "loss": 0.5929, + "step": 1479 + }, + { + "epoch": 0.6997635933806147, + "grad_norm": 3.096477746963501, + "learning_rate": 4.853235601737656e-06, + "loss": 0.5737, + "step": 1480 + }, + { + "epoch": 0.7002364066193854, + "grad_norm": 2.884779214859009, + "learning_rate": 4.853024932442552e-06, + "loss": 0.6362, + "step": 1481 + }, + { + "epoch": 0.700709219858156, + "grad_norm": 3.368558406829834, + "learning_rate": 4.852814116634903e-06, + "loss": 0.6721, + "step": 1482 + }, + { + "epoch": 0.7011820330969267, + "grad_norm": 2.742414951324463, + "learning_rate": 4.852603154327837e-06, + "loss": 0.6212, + "step": 1483 + }, + { + "epoch": 0.7016548463356974, + "grad_norm": 2.53454852104187, + "learning_rate": 4.8523920455344864e-06, + "loss": 0.6675, + "step": 1484 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 2.9354238510131836, + "learning_rate": 4.852180790267999e-06, + "loss": 0.6692, + "step": 1485 + }, + { + "epoch": 0.7026004728132388, + "grad_norm": 2.585070848464966, + "learning_rate": 4.8519693885415274e-06, + "loss": 0.6215, + "step": 1486 + }, + { + "epoch": 0.7030732860520095, + "grad_norm": 2.9047999382019043, + "learning_rate": 4.851757840368235e-06, + "loss": 0.6231, + "step": 1487 + }, + { + "epoch": 0.7035460992907802, + "grad_norm": 3.0930933952331543, + "learning_rate": 4.851546145761295e-06, + "loss": 0.7267, + "step": 1488 + }, + { + "epoch": 0.7040189125295508, + "grad_norm": 3.0224719047546387, + "learning_rate": 4.8513343047338875e-06, + "loss": 0.6293, + "step": 1489 + }, + { + "epoch": 0.7044917257683215, + "grad_norm": 2.5758471488952637, + "learning_rate": 4.851122317299203e-06, + "loss": 0.5855, + "step": 1490 + }, + { + "epoch": 0.7049645390070922, + "grad_norm": 2.579272508621216, + "learning_rate": 4.850910183470441e-06, + "loss": 0.582, + "step": 1491 + }, + { + "epoch": 0.7054373522458629, + "grad_norm": 2.8148300647735596, + "learning_rate": 4.85069790326081e-06, + "loss": 0.6396, + "step": 1492 + }, + { + "epoch": 0.7059101654846336, + "grad_norm": 2.6380527019500732, + "learning_rate": 4.850485476683528e-06, + "loss": 0.6114, + "step": 1493 + }, + { + "epoch": 0.7063829787234043, + "grad_norm": 2.7736263275146484, + "learning_rate": 4.850272903751823e-06, + "loss": 0.6683, + "step": 1494 + }, + { + "epoch": 0.706855791962175, + "grad_norm": 3.1958179473876953, + "learning_rate": 4.8500601844789285e-06, + "loss": 0.6265, + "step": 1495 + }, + { + "epoch": 0.7073286052009456, + "grad_norm": 3.783212423324585, + "learning_rate": 
4.8498473188780916e-06, + "loss": 0.6078, + "step": 1496 + }, + { + "epoch": 0.7078014184397163, + "grad_norm": 2.6656646728515625, + "learning_rate": 4.849634306962566e-06, + "loss": 0.5756, + "step": 1497 + }, + { + "epoch": 0.708274231678487, + "grad_norm": 2.757141590118408, + "learning_rate": 4.849421148745615e-06, + "loss": 0.5596, + "step": 1498 + }, + { + "epoch": 0.7087470449172577, + "grad_norm": 3.0391886234283447, + "learning_rate": 4.849207844240511e-06, + "loss": 0.5293, + "step": 1499 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 2.981912851333618, + "learning_rate": 4.848994393460535e-06, + "loss": 0.598, + "step": 1500 + }, + { + "epoch": 0.7096926713947991, + "grad_norm": 2.5470798015594482, + "learning_rate": 4.848780796418978e-06, + "loss": 0.6266, + "step": 1501 + }, + { + "epoch": 0.7101654846335698, + "grad_norm": 2.8394415378570557, + "learning_rate": 4.8485670531291415e-06, + "loss": 0.6844, + "step": 1502 + }, + { + "epoch": 0.7106382978723405, + "grad_norm": 3.2023508548736572, + "learning_rate": 4.848353163604331e-06, + "loss": 0.6134, + "step": 1503 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 2.98245906829834, + "learning_rate": 4.848139127857867e-06, + "loss": 0.7084, + "step": 1504 + }, + { + "epoch": 0.7115839243498818, + "grad_norm": 2.5917441844940186, + "learning_rate": 4.847924945903076e-06, + "loss": 0.5676, + "step": 1505 + }, + { + "epoch": 0.7120567375886525, + "grad_norm": 2.8736681938171387, + "learning_rate": 4.847710617753294e-06, + "loss": 0.6304, + "step": 1506 + }, + { + "epoch": 0.7125295508274232, + "grad_norm": 2.7832682132720947, + "learning_rate": 4.847496143421866e-06, + "loss": 0.5705, + "step": 1507 + }, + { + "epoch": 0.7130023640661939, + "grad_norm": 2.480560779571533, + "learning_rate": 4.847281522922147e-06, + "loss": 0.5595, + "step": 1508 + }, + { + "epoch": 0.7134751773049646, + "grad_norm": 2.357675313949585, + "learning_rate": 4.847066756267499e-06, + "loss": 0.5065, + "step": 1509 + }, + { + "epoch": 0.7139479905437353, + "grad_norm": 2.632669448852539, + "learning_rate": 4.846851843471296e-06, + "loss": 0.6949, + "step": 1510 + }, + { + "epoch": 0.7144208037825059, + "grad_norm": 2.7691073417663574, + "learning_rate": 4.84663678454692e-06, + "loss": 0.6638, + "step": 1511 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 2.5647685527801514, + "learning_rate": 4.846421579507761e-06, + "loss": 0.6098, + "step": 1512 + }, + { + "epoch": 0.7153664302600473, + "grad_norm": 2.476701021194458, + "learning_rate": 4.846206228367218e-06, + "loss": 0.592, + "step": 1513 + }, + { + "epoch": 0.715839243498818, + "grad_norm": 2.805727958679199, + "learning_rate": 4.845990731138702e-06, + "loss": 0.5466, + "step": 1514 + }, + { + "epoch": 0.7163120567375887, + "grad_norm": 2.551392078399658, + "learning_rate": 4.84577508783563e-06, + "loss": 0.6039, + "step": 1515 + }, + { + "epoch": 0.7167848699763594, + "grad_norm": 2.6861350536346436, + "learning_rate": 4.845559298471429e-06, + "loss": 0.6427, + "step": 1516 + }, + { + "epoch": 0.7172576832151301, + "grad_norm": 3.1908371448516846, + "learning_rate": 4.845343363059535e-06, + "loss": 0.5447, + "step": 1517 + }, + { + "epoch": 0.7177304964539007, + "grad_norm": 2.9021761417388916, + "learning_rate": 4.845127281613394e-06, + "loss": 0.5836, + "step": 1518 + }, + { + "epoch": 0.7182033096926714, + "grad_norm": 2.476670742034912, + "learning_rate": 4.844911054146461e-06, + "loss": 0.5863, + "step": 1519 + }, + { + "epoch": 0.7186761229314421, + "grad_norm": 
2.662935495376587, + "learning_rate": 4.844694680672198e-06, + "loss": 0.5678, + "step": 1520 + }, + { + "epoch": 0.7191489361702128, + "grad_norm": 2.677896738052368, + "learning_rate": 4.844478161204079e-06, + "loss": 0.6195, + "step": 1521 + }, + { + "epoch": 0.7196217494089835, + "grad_norm": 2.781921863555908, + "learning_rate": 4.844261495755585e-06, + "loss": 0.643, + "step": 1522 + }, + { + "epoch": 0.7200945626477542, + "grad_norm": 3.0157392024993896, + "learning_rate": 4.844044684340206e-06, + "loss": 0.7559, + "step": 1523 + }, + { + "epoch": 0.7205673758865249, + "grad_norm": 2.8109354972839355, + "learning_rate": 4.843827726971444e-06, + "loss": 0.6264, + "step": 1524 + }, + { + "epoch": 0.7210401891252955, + "grad_norm": 3.0953569412231445, + "learning_rate": 4.8436106236628064e-06, + "loss": 0.6429, + "step": 1525 + }, + { + "epoch": 0.7215130023640662, + "grad_norm": 2.6850643157958984, + "learning_rate": 4.843393374427812e-06, + "loss": 0.6598, + "step": 1526 + }, + { + "epoch": 0.7219858156028369, + "grad_norm": 3.043480634689331, + "learning_rate": 4.8431759792799874e-06, + "loss": 0.6331, + "step": 1527 + }, + { + "epoch": 0.7224586288416076, + "grad_norm": 2.723870038986206, + "learning_rate": 4.842958438232868e-06, + "loss": 0.6259, + "step": 1528 + }, + { + "epoch": 0.7229314420803783, + "grad_norm": 2.822492837905884, + "learning_rate": 4.842740751300002e-06, + "loss": 0.6554, + "step": 1529 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.7866315841674805, + "learning_rate": 4.842522918494941e-06, + "loss": 0.6991, + "step": 1530 + }, + { + "epoch": 0.7238770685579197, + "grad_norm": 2.8881826400756836, + "learning_rate": 4.84230493983125e-06, + "loss": 0.5876, + "step": 1531 + }, + { + "epoch": 0.7243498817966904, + "grad_norm": 2.7456939220428467, + "learning_rate": 4.8420868153225e-06, + "loss": 0.6188, + "step": 1532 + }, + { + "epoch": 0.724822695035461, + "grad_norm": 3.0257532596588135, + "learning_rate": 4.841868544982274e-06, + "loss": 0.63, + "step": 1533 + }, + { + "epoch": 0.7252955082742317, + "grad_norm": 3.1581954956054688, + "learning_rate": 4.841650128824164e-06, + "loss": 0.7214, + "step": 1534 + }, + { + "epoch": 0.7257683215130024, + "grad_norm": 2.9174306392669678, + "learning_rate": 4.841431566861767e-06, + "loss": 0.704, + "step": 1535 + }, + { + "epoch": 0.7262411347517731, + "grad_norm": 2.5019054412841797, + "learning_rate": 4.8412128591086935e-06, + "loss": 0.6298, + "step": 1536 + }, + { + "epoch": 0.7267139479905438, + "grad_norm": 2.724285125732422, + "learning_rate": 4.840994005578562e-06, + "loss": 0.6289, + "step": 1537 + }, + { + "epoch": 0.7271867612293145, + "grad_norm": 2.5882341861724854, + "learning_rate": 4.840775006284998e-06, + "loss": 0.6355, + "step": 1538 + }, + { + "epoch": 0.7276595744680852, + "grad_norm": 3.1281991004943848, + "learning_rate": 4.840555861241638e-06, + "loss": 0.5551, + "step": 1539 + }, + { + "epoch": 0.7281323877068558, + "grad_norm": 2.6064817905426025, + "learning_rate": 4.840336570462127e-06, + "loss": 0.5543, + "step": 1540 + }, + { + "epoch": 0.7286052009456265, + "grad_norm": 2.67112398147583, + "learning_rate": 4.840117133960122e-06, + "loss": 0.6044, + "step": 1541 + }, + { + "epoch": 0.7290780141843972, + "grad_norm": 2.838022232055664, + "learning_rate": 4.839897551749282e-06, + "loss": 0.6814, + "step": 1542 + }, + { + "epoch": 0.7295508274231679, + "grad_norm": 2.8897151947021484, + "learning_rate": 4.839677823843283e-06, + "loss": 0.593, + "step": 1543 + }, + { + "epoch": 
0.7300236406619386, + "grad_norm": 2.9238014221191406, + "learning_rate": 4.839457950255805e-06, + "loss": 0.5544, + "step": 1544 + }, + { + "epoch": 0.7304964539007093, + "grad_norm": 3.016876459121704, + "learning_rate": 4.839237931000538e-06, + "loss": 0.6099, + "step": 1545 + }, + { + "epoch": 0.7309692671394799, + "grad_norm": 2.9415392875671387, + "learning_rate": 4.839017766091182e-06, + "loss": 0.6413, + "step": 1546 + }, + { + "epoch": 0.7314420803782505, + "grad_norm": 2.658067226409912, + "learning_rate": 4.838797455541446e-06, + "loss": 0.6534, + "step": 1547 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 2.460358142852783, + "learning_rate": 4.838576999365049e-06, + "loss": 0.5307, + "step": 1548 + }, + { + "epoch": 0.7323877068557919, + "grad_norm": 2.5818674564361572, + "learning_rate": 4.838356397575716e-06, + "loss": 0.6265, + "step": 1549 + }, + { + "epoch": 0.7328605200945626, + "grad_norm": 3.009197473526001, + "learning_rate": 4.838135650187183e-06, + "loss": 0.6957, + "step": 1550 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 2.738543748855591, + "learning_rate": 4.837914757213196e-06, + "loss": 0.646, + "step": 1551 + }, + { + "epoch": 0.733806146572104, + "grad_norm": 2.8208494186401367, + "learning_rate": 4.837693718667508e-06, + "loss": 0.5936, + "step": 1552 + }, + { + "epoch": 0.7342789598108747, + "grad_norm": 3.1574649810791016, + "learning_rate": 4.837472534563883e-06, + "loss": 0.6455, + "step": 1553 + }, + { + "epoch": 0.7347517730496453, + "grad_norm": 2.6737420558929443, + "learning_rate": 4.837251204916093e-06, + "loss": 0.5921, + "step": 1554 + }, + { + "epoch": 0.735224586288416, + "grad_norm": 2.424983024597168, + "learning_rate": 4.837029729737918e-06, + "loss": 0.6346, + "step": 1555 + }, + { + "epoch": 0.7356973995271867, + "grad_norm": 2.5163493156433105, + "learning_rate": 4.836808109043151e-06, + "loss": 0.6061, + "step": 1556 + }, + { + "epoch": 0.7361702127659574, + "grad_norm": 2.8377044200897217, + "learning_rate": 4.836586342845588e-06, + "loss": 0.611, + "step": 1557 + }, + { + "epoch": 0.7366430260047281, + "grad_norm": 2.5929181575775146, + "learning_rate": 4.83636443115904e-06, + "loss": 0.5496, + "step": 1558 + }, + { + "epoch": 0.7371158392434988, + "grad_norm": 2.5017223358154297, + "learning_rate": 4.836142373997323e-06, + "loss": 0.6235, + "step": 1559 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 2.822500228881836, + "learning_rate": 4.835920171374265e-06, + "loss": 0.6147, + "step": 1560 + }, + { + "epoch": 0.7380614657210401, + "grad_norm": 2.7234230041503906, + "learning_rate": 4.8356978233037e-06, + "loss": 0.6228, + "step": 1561 + }, + { + "epoch": 0.7385342789598108, + "grad_norm": 2.9565515518188477, + "learning_rate": 4.835475329799472e-06, + "loss": 0.5728, + "step": 1562 + }, + { + "epoch": 0.7390070921985815, + "grad_norm": 2.4356038570404053, + "learning_rate": 4.835252690875438e-06, + "loss": 0.6723, + "step": 1563 + }, + { + "epoch": 0.7394799054373522, + "grad_norm": 2.765913248062134, + "learning_rate": 4.835029906545458e-06, + "loss": 0.5805, + "step": 1564 + }, + { + "epoch": 0.7399527186761229, + "grad_norm": 2.4481914043426514, + "learning_rate": 4.834806976823405e-06, + "loss": 0.599, + "step": 1565 + }, + { + "epoch": 0.7404255319148936, + "grad_norm": 2.620779514312744, + "learning_rate": 4.834583901723158e-06, + "loss": 0.63, + "step": 1566 + }, + { + "epoch": 0.7408983451536643, + "grad_norm": 2.654426097869873, + "learning_rate": 4.83436068125861e-06, + "loss": 0.6544, + "step": 
1567 + }, + { + "epoch": 0.741371158392435, + "grad_norm": 2.589623212814331, + "learning_rate": 4.834137315443656e-06, + "loss": 0.5596, + "step": 1568 + }, + { + "epoch": 0.7418439716312056, + "grad_norm": 2.572883129119873, + "learning_rate": 4.833913804292209e-06, + "loss": 0.5974, + "step": 1569 + }, + { + "epoch": 0.7423167848699763, + "grad_norm": 2.8744914531707764, + "learning_rate": 4.833690147818181e-06, + "loss": 0.5364, + "step": 1570 + }, + { + "epoch": 0.742789598108747, + "grad_norm": 2.9800851345062256, + "learning_rate": 4.833466346035502e-06, + "loss": 0.6287, + "step": 1571 + }, + { + "epoch": 0.7432624113475177, + "grad_norm": 2.627784490585327, + "learning_rate": 4.833242398958105e-06, + "loss": 0.621, + "step": 1572 + }, + { + "epoch": 0.7437352245862884, + "grad_norm": 2.5187721252441406, + "learning_rate": 4.833018306599933e-06, + "loss": 0.5901, + "step": 1573 + }, + { + "epoch": 0.7442080378250591, + "grad_norm": 2.4843688011169434, + "learning_rate": 4.832794068974944e-06, + "loss": 0.6336, + "step": 1574 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 2.774911880493164, + "learning_rate": 4.832569686097096e-06, + "loss": 0.6091, + "step": 1575 + }, + { + "epoch": 0.7451536643026004, + "grad_norm": 3.2562527656555176, + "learning_rate": 4.8323451579803615e-06, + "loss": 0.7686, + "step": 1576 + }, + { + "epoch": 0.7456264775413711, + "grad_norm": 2.799570083618164, + "learning_rate": 4.832120484638721e-06, + "loss": 0.6233, + "step": 1577 + }, + { + "epoch": 0.7460992907801418, + "grad_norm": 2.661893367767334, + "learning_rate": 4.831895666086164e-06, + "loss": 0.5841, + "step": 1578 + }, + { + "epoch": 0.7465721040189125, + "grad_norm": 3.0382652282714844, + "learning_rate": 4.831670702336689e-06, + "loss": 0.5769, + "step": 1579 + }, + { + "epoch": 0.7470449172576832, + "grad_norm": 2.676398515701294, + "learning_rate": 4.831445593404304e-06, + "loss": 0.619, + "step": 1580 + }, + { + "epoch": 0.7475177304964539, + "grad_norm": 2.717916965484619, + "learning_rate": 4.831220339303024e-06, + "loss": 0.5787, + "step": 1581 + }, + { + "epoch": 0.7479905437352246, + "grad_norm": 2.3918066024780273, + "learning_rate": 4.830994940046876e-06, + "loss": 0.5108, + "step": 1582 + }, + { + "epoch": 0.7484633569739952, + "grad_norm": 2.709144115447998, + "learning_rate": 4.830769395649895e-06, + "loss": 0.6875, + "step": 1583 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 2.8711116313934326, + "learning_rate": 4.830543706126123e-06, + "loss": 0.6745, + "step": 1584 + }, + { + "epoch": 0.7494089834515366, + "grad_norm": 2.612339496612549, + "learning_rate": 4.830317871489614e-06, + "loss": 0.5738, + "step": 1585 + }, + { + "epoch": 0.7498817966903073, + "grad_norm": 2.4355857372283936, + "learning_rate": 4.830091891754429e-06, + "loss": 0.5907, + "step": 1586 + }, + { + "epoch": 0.750354609929078, + "grad_norm": 2.676051378250122, + "learning_rate": 4.829865766934638e-06, + "loss": 0.6628, + "step": 1587 + }, + { + "epoch": 0.7508274231678487, + "grad_norm": 2.66489839553833, + "learning_rate": 4.829639497044323e-06, + "loss": 0.5984, + "step": 1588 + }, + { + "epoch": 0.7513002364066194, + "grad_norm": 2.5358035564422607, + "learning_rate": 4.829413082097572e-06, + "loss": 0.5867, + "step": 1589 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 2.6530144214630127, + "learning_rate": 4.8291865221084815e-06, + "loss": 0.5917, + "step": 1590 + }, + { + "epoch": 0.7522458628841607, + "grad_norm": 2.5160958766937256, + "learning_rate": 4.82895981709116e-06, + 
"loss": 0.6347, + "step": 1591 + }, + { + "epoch": 0.7527186761229314, + "grad_norm": 2.61592698097229, + "learning_rate": 4.8287329670597225e-06, + "loss": 0.5472, + "step": 1592 + }, + { + "epoch": 0.7531914893617021, + "grad_norm": 2.7528622150421143, + "learning_rate": 4.828505972028296e-06, + "loss": 0.5842, + "step": 1593 + }, + { + "epoch": 0.7536643026004728, + "grad_norm": 2.8154072761535645, + "learning_rate": 4.828278832011011e-06, + "loss": 0.5757, + "step": 1594 + }, + { + "epoch": 0.7541371158392435, + "grad_norm": 3.118515729904175, + "learning_rate": 4.828051547022013e-06, + "loss": 0.6472, + "step": 1595 + }, + { + "epoch": 0.7546099290780142, + "grad_norm": 2.452033758163452, + "learning_rate": 4.827824117075453e-06, + "loss": 0.5571, + "step": 1596 + }, + { + "epoch": 0.7550827423167848, + "grad_norm": 2.984388828277588, + "learning_rate": 4.827596542185492e-06, + "loss": 0.6656, + "step": 1597 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 2.61356782913208, + "learning_rate": 4.8273688223663014e-06, + "loss": 0.6444, + "step": 1598 + }, + { + "epoch": 0.7560283687943262, + "grad_norm": 2.8967196941375732, + "learning_rate": 4.8271409576320595e-06, + "loss": 0.6457, + "step": 1599 + }, + { + "epoch": 0.7565011820330969, + "grad_norm": 2.852367639541626, + "learning_rate": 4.826912947996954e-06, + "loss": 0.5629, + "step": 1600 + }, + { + "epoch": 0.7569739952718676, + "grad_norm": 2.905280590057373, + "learning_rate": 4.826684793475182e-06, + "loss": 0.6245, + "step": 1601 + }, + { + "epoch": 0.7574468085106383, + "grad_norm": 2.6156530380249023, + "learning_rate": 4.826456494080951e-06, + "loss": 0.5869, + "step": 1602 + }, + { + "epoch": 0.757919621749409, + "grad_norm": 2.6490228176116943, + "learning_rate": 4.826228049828475e-06, + "loss": 0.5461, + "step": 1603 + }, + { + "epoch": 0.7583924349881797, + "grad_norm": 2.9626693725585938, + "learning_rate": 4.825999460731978e-06, + "loss": 0.6842, + "step": 1604 + }, + { + "epoch": 0.7588652482269503, + "grad_norm": 2.6866023540496826, + "learning_rate": 4.825770726805695e-06, + "loss": 0.5726, + "step": 1605 + }, + { + "epoch": 0.759338061465721, + "grad_norm": 2.5525858402252197, + "learning_rate": 4.825541848063866e-06, + "loss": 0.6061, + "step": 1606 + }, + { + "epoch": 0.7598108747044917, + "grad_norm": 2.703977584838867, + "learning_rate": 4.825312824520743e-06, + "loss": 0.6726, + "step": 1607 + }, + { + "epoch": 0.7602836879432624, + "grad_norm": 2.856534957885742, + "learning_rate": 4.825083656190588e-06, + "loss": 0.625, + "step": 1608 + }, + { + "epoch": 0.7607565011820331, + "grad_norm": 2.8564887046813965, + "learning_rate": 4.824854343087668e-06, + "loss": 0.7251, + "step": 1609 + }, + { + "epoch": 0.7612293144208038, + "grad_norm": 2.327650308609009, + "learning_rate": 4.824624885226262e-06, + "loss": 0.526, + "step": 1610 + }, + { + "epoch": 0.7617021276595745, + "grad_norm": 3.0025737285614014, + "learning_rate": 4.824395282620659e-06, + "loss": 0.6043, + "step": 1611 + }, + { + "epoch": 0.7621749408983451, + "grad_norm": 2.5441737174987793, + "learning_rate": 4.824165535285152e-06, + "loss": 0.6276, + "step": 1612 + }, + { + "epoch": 0.7626477541371158, + "grad_norm": 2.4177372455596924, + "learning_rate": 4.823935643234049e-06, + "loss": 0.6419, + "step": 1613 + }, + { + "epoch": 0.7631205673758865, + "grad_norm": 2.9210550785064697, + "learning_rate": 4.823705606481664e-06, + "loss": 0.5663, + "step": 1614 + }, + { + "epoch": 0.7635933806146572, + "grad_norm": 2.6353724002838135, + 
"learning_rate": 4.82347542504232e-06, + "loss": 0.5669, + "step": 1615 + }, + { + "epoch": 0.7640661938534279, + "grad_norm": 2.419081926345825, + "learning_rate": 4.823245098930349e-06, + "loss": 0.5777, + "step": 1616 + }, + { + "epoch": 0.7645390070921986, + "grad_norm": 2.5077571868896484, + "learning_rate": 4.823014628160093e-06, + "loss": 0.5924, + "step": 1617 + }, + { + "epoch": 0.7650118203309693, + "grad_norm": 2.816056251525879, + "learning_rate": 4.822784012745902e-06, + "loss": 0.7273, + "step": 1618 + }, + { + "epoch": 0.76548463356974, + "grad_norm": 2.7163147926330566, + "learning_rate": 4.8225532527021366e-06, + "loss": 0.5545, + "step": 1619 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 2.4784302711486816, + "learning_rate": 4.822322348043164e-06, + "loss": 0.556, + "step": 1620 + }, + { + "epoch": 0.7664302600472813, + "grad_norm": 2.712467670440674, + "learning_rate": 4.822091298783361e-06, + "loss": 0.6501, + "step": 1621 + }, + { + "epoch": 0.766903073286052, + "grad_norm": 2.7217724323272705, + "learning_rate": 4.821860104937115e-06, + "loss": 0.5989, + "step": 1622 + }, + { + "epoch": 0.7673758865248227, + "grad_norm": 2.5622854232788086, + "learning_rate": 4.821628766518821e-06, + "loss": 0.5263, + "step": 1623 + }, + { + "epoch": 0.7678486997635934, + "grad_norm": 3.230923891067505, + "learning_rate": 4.821397283542884e-06, + "loss": 0.6707, + "step": 1624 + }, + { + "epoch": 0.7683215130023641, + "grad_norm": 2.37929105758667, + "learning_rate": 4.821165656023718e-06, + "loss": 0.6124, + "step": 1625 + }, + { + "epoch": 0.7687943262411348, + "grad_norm": 2.9811325073242188, + "learning_rate": 4.820933883975745e-06, + "loss": 0.6435, + "step": 1626 + }, + { + "epoch": 0.7692671394799054, + "grad_norm": 2.887380838394165, + "learning_rate": 4.820701967413395e-06, + "loss": 0.621, + "step": 1627 + }, + { + "epoch": 0.7697399527186761, + "grad_norm": 2.6762876510620117, + "learning_rate": 4.820469906351109e-06, + "loss": 0.5713, + "step": 1628 + }, + { + "epoch": 0.7702127659574468, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.820237700803337e-06, + "loss": 0.6136, + "step": 1629 + }, + { + "epoch": 0.7706855791962175, + "grad_norm": 2.7244746685028076, + "learning_rate": 4.820005350784539e-06, + "loss": 0.5816, + "step": 1630 + }, + { + "epoch": 0.7711583924349882, + "grad_norm": 2.9293999671936035, + "learning_rate": 4.8197728563091795e-06, + "loss": 0.6649, + "step": 1631 + }, + { + "epoch": 0.7716312056737589, + "grad_norm": 2.4402127265930176, + "learning_rate": 4.819540217391736e-06, + "loss": 0.6481, + "step": 1632 + }, + { + "epoch": 0.7721040189125296, + "grad_norm": 3.083941698074341, + "learning_rate": 4.819307434046694e-06, + "loss": 0.6951, + "step": 1633 + }, + { + "epoch": 0.7725768321513002, + "grad_norm": 2.544952392578125, + "learning_rate": 4.819074506288548e-06, + "loss": 0.539, + "step": 1634 + }, + { + "epoch": 0.7730496453900709, + "grad_norm": 2.7791268825531006, + "learning_rate": 4.818841434131801e-06, + "loss": 0.5827, + "step": 1635 + }, + { + "epoch": 0.7735224586288416, + "grad_norm": 2.7349796295166016, + "learning_rate": 4.818608217590967e-06, + "loss": 0.5584, + "step": 1636 + }, + { + "epoch": 0.7739952718676123, + "grad_norm": 2.637652635574341, + "learning_rate": 4.818374856680565e-06, + "loss": 0.6386, + "step": 1637 + }, + { + "epoch": 0.774468085106383, + "grad_norm": 2.9821584224700928, + "learning_rate": 4.818141351415127e-06, + "loss": 0.6734, + "step": 1638 + }, + { + "epoch": 0.7749408983451537, + 
"grad_norm": 2.992938995361328, + "learning_rate": 4.817907701809192e-06, + "loss": 0.5899, + "step": 1639 + }, + { + "epoch": 0.7754137115839244, + "grad_norm": 4.35719633102417, + "learning_rate": 4.8176739078773076e-06, + "loss": 0.6281, + "step": 1640 + }, + { + "epoch": 0.775886524822695, + "grad_norm": 2.838146209716797, + "learning_rate": 4.8174399696340315e-06, + "loss": 0.5766, + "step": 1641 + }, + { + "epoch": 0.7763593380614657, + "grad_norm": 3.3116989135742188, + "learning_rate": 4.81720588709393e-06, + "loss": 0.6409, + "step": 1642 + }, + { + "epoch": 0.7768321513002364, + "grad_norm": 2.9843590259552, + "learning_rate": 4.816971660271579e-06, + "loss": 0.6108, + "step": 1643 + }, + { + "epoch": 0.7773049645390071, + "grad_norm": 2.843770742416382, + "learning_rate": 4.816737289181562e-06, + "loss": 0.6053, + "step": 1644 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 2.7608556747436523, + "learning_rate": 4.816502773838473e-06, + "loss": 0.5854, + "step": 1645 + }, + { + "epoch": 0.7782505910165485, + "grad_norm": 3.343682289123535, + "learning_rate": 4.816268114256914e-06, + "loss": 0.6329, + "step": 1646 + }, + { + "epoch": 0.7787234042553192, + "grad_norm": 2.769768476486206, + "learning_rate": 4.816033310451496e-06, + "loss": 0.6242, + "step": 1647 + }, + { + "epoch": 0.7791962174940898, + "grad_norm": 2.989851713180542, + "learning_rate": 4.815798362436838e-06, + "loss": 0.6493, + "step": 1648 + }, + { + "epoch": 0.7796690307328605, + "grad_norm": 3.170736312866211, + "learning_rate": 4.8155632702275716e-06, + "loss": 0.6341, + "step": 1649 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 2.7372522354125977, + "learning_rate": 4.815328033838334e-06, + "loss": 0.5445, + "step": 1650 + }, + { + "epoch": 0.7806146572104019, + "grad_norm": 2.6947238445281982, + "learning_rate": 4.8150926532837715e-06, + "loss": 0.6437, + "step": 1651 + }, + { + "epoch": 0.7810874704491726, + "grad_norm": 2.472323179244995, + "learning_rate": 4.81485712857854e-06, + "loss": 0.5751, + "step": 1652 + }, + { + "epoch": 0.7815602836879433, + "grad_norm": 2.791114091873169, + "learning_rate": 4.814621459737308e-06, + "loss": 0.5996, + "step": 1653 + }, + { + "epoch": 0.782033096926714, + "grad_norm": 3.1957521438598633, + "learning_rate": 4.814385646774745e-06, + "loss": 0.5803, + "step": 1654 + }, + { + "epoch": 0.7825059101654847, + "grad_norm": 2.4120798110961914, + "learning_rate": 4.8141496897055364e-06, + "loss": 0.5814, + "step": 1655 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 2.9262423515319824, + "learning_rate": 4.813913588544374e-06, + "loss": 0.6292, + "step": 1656 + }, + { + "epoch": 0.783451536643026, + "grad_norm": 2.8251047134399414, + "learning_rate": 4.813677343305959e-06, + "loss": 0.6787, + "step": 1657 + }, + { + "epoch": 0.7839243498817967, + "grad_norm": 2.931659698486328, + "learning_rate": 4.8134409540050005e-06, + "loss": 0.6163, + "step": 1658 + }, + { + "epoch": 0.7843971631205674, + "grad_norm": 2.7160706520080566, + "learning_rate": 4.813204420656219e-06, + "loss": 0.6831, + "step": 1659 + }, + { + "epoch": 0.7848699763593381, + "grad_norm": 3.2134454250335693, + "learning_rate": 4.81296774327434e-06, + "loss": 0.6002, + "step": 1660 + }, + { + "epoch": 0.7853427895981088, + "grad_norm": 2.4002513885498047, + "learning_rate": 4.812730921874103e-06, + "loss": 0.5488, + "step": 1661 + }, + { + "epoch": 0.7858156028368795, + "grad_norm": 2.5559282302856445, + "learning_rate": 4.812493956470251e-06, + "loss": 0.5802, + "step": 1662 + }, + { + 
"epoch": 0.7862884160756501, + "grad_norm": 2.57478404045105, + "learning_rate": 4.812256847077541e-06, + "loss": 0.646, + "step": 1663 + }, + { + "epoch": 0.7867612293144208, + "grad_norm": 2.811851978302002, + "learning_rate": 4.812019593710736e-06, + "loss": 0.6245, + "step": 1664 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 2.5228829383850098, + "learning_rate": 4.811782196384609e-06, + "loss": 0.5949, + "step": 1665 + }, + { + "epoch": 0.7877068557919622, + "grad_norm": 2.744096040725708, + "learning_rate": 4.8115446551139415e-06, + "loss": 0.6006, + "step": 1666 + }, + { + "epoch": 0.7881796690307329, + "grad_norm": 3.129242420196533, + "learning_rate": 4.811306969913524e-06, + "loss": 0.7251, + "step": 1667 + }, + { + "epoch": 0.7886524822695036, + "grad_norm": 2.7855660915374756, + "learning_rate": 4.811069140798156e-06, + "loss": 0.6534, + "step": 1668 + }, + { + "epoch": 0.7891252955082743, + "grad_norm": 2.836603879928589, + "learning_rate": 4.810831167782647e-06, + "loss": 0.6661, + "step": 1669 + }, + { + "epoch": 0.789598108747045, + "grad_norm": 2.5339887142181396, + "learning_rate": 4.810593050881813e-06, + "loss": 0.5354, + "step": 1670 + }, + { + "epoch": 0.7900709219858156, + "grad_norm": 2.9553709030151367, + "learning_rate": 4.810354790110482e-06, + "loss": 0.6001, + "step": 1671 + }, + { + "epoch": 0.7905437352245863, + "grad_norm": 2.6581788063049316, + "learning_rate": 4.8101163854834885e-06, + "loss": 0.6802, + "step": 1672 + }, + { + "epoch": 0.791016548463357, + "grad_norm": 3.2002551555633545, + "learning_rate": 4.809877837015677e-06, + "loss": 0.6641, + "step": 1673 + }, + { + "epoch": 0.7914893617021277, + "grad_norm": 2.918792963027954, + "learning_rate": 4.809639144721902e-06, + "loss": 0.6758, + "step": 1674 + }, + { + "epoch": 0.7919621749408984, + "grad_norm": 2.7993946075439453, + "learning_rate": 4.8094003086170245e-06, + "loss": 0.5889, + "step": 1675 + }, + { + "epoch": 0.7924349881796691, + "grad_norm": 2.3698952198028564, + "learning_rate": 4.809161328715916e-06, + "loss": 0.6244, + "step": 1676 + }, + { + "epoch": 0.7929078014184398, + "grad_norm": 2.8891594409942627, + "learning_rate": 4.808922205033458e-06, + "loss": 0.5835, + "step": 1677 + }, + { + "epoch": 0.7933806146572104, + "grad_norm": 2.838345766067505, + "learning_rate": 4.808682937584537e-06, + "loss": 0.6907, + "step": 1678 + }, + { + "epoch": 0.7938534278959811, + "grad_norm": 2.8443174362182617, + "learning_rate": 4.808443526384053e-06, + "loss": 0.6692, + "step": 1679 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 2.7355034351348877, + "learning_rate": 4.808203971446913e-06, + "loss": 0.5799, + "step": 1680 + }, + { + "epoch": 0.7947990543735225, + "grad_norm": 2.7108020782470703, + "learning_rate": 4.807964272788033e-06, + "loss": 0.652, + "step": 1681 + }, + { + "epoch": 0.7952718676122932, + "grad_norm": 2.397650957107544, + "learning_rate": 4.807724430422338e-06, + "loss": 0.5418, + "step": 1682 + }, + { + "epoch": 0.7957446808510639, + "grad_norm": 2.4981582164764404, + "learning_rate": 4.807484444364762e-06, + "loss": 0.5731, + "step": 1683 + }, + { + "epoch": 0.7962174940898346, + "grad_norm": 2.7943713665008545, + "learning_rate": 4.8072443146302475e-06, + "loss": 0.5913, + "step": 1684 + }, + { + "epoch": 0.7966903073286052, + "grad_norm": 2.5691423416137695, + "learning_rate": 4.807004041233746e-06, + "loss": 0.6475, + "step": 1685 + }, + { + "epoch": 0.7971631205673759, + "grad_norm": 3.2367498874664307, + "learning_rate": 4.8067636241902195e-06, + 
"loss": 0.675, + "step": 1686 + }, + { + "epoch": 0.7976359338061466, + "grad_norm": 3.000595808029175, + "learning_rate": 4.806523063514637e-06, + "loss": 0.5481, + "step": 1687 + }, + { + "epoch": 0.7981087470449173, + "grad_norm": 2.702014207839966, + "learning_rate": 4.806282359221976e-06, + "loss": 0.5993, + "step": 1688 + }, + { + "epoch": 0.798581560283688, + "grad_norm": 2.383671998977661, + "learning_rate": 4.806041511327226e-06, + "loss": 0.562, + "step": 1689 + }, + { + "epoch": 0.7990543735224587, + "grad_norm": 2.6965041160583496, + "learning_rate": 4.8058005198453834e-06, + "loss": 0.5955, + "step": 1690 + }, + { + "epoch": 0.7995271867612294, + "grad_norm": 2.5906765460968018, + "learning_rate": 4.805559384791453e-06, + "loss": 0.5151, + "step": 1691 + }, + { + "epoch": 0.8, + "grad_norm": 2.5454652309417725, + "learning_rate": 4.8053181061804475e-06, + "loss": 0.5843, + "step": 1692 + }, + { + "epoch": 0.8004728132387707, + "grad_norm": 2.661343812942505, + "learning_rate": 4.8050766840273935e-06, + "loss": 0.5995, + "step": 1693 + }, + { + "epoch": 0.8009456264775414, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.8048351183473215e-06, + "loss": 0.5676, + "step": 1694 + }, + { + "epoch": 0.8014184397163121, + "grad_norm": 2.5936667919158936, + "learning_rate": 4.804593409155274e-06, + "loss": 0.6291, + "step": 1695 + }, + { + "epoch": 0.8018912529550828, + "grad_norm": 2.6902432441711426, + "learning_rate": 4.804351556466299e-06, + "loss": 0.6114, + "step": 1696 + }, + { + "epoch": 0.8023640661938535, + "grad_norm": 2.7764673233032227, + "learning_rate": 4.804109560295457e-06, + "loss": 0.5768, + "step": 1697 + }, + { + "epoch": 0.8028368794326242, + "grad_norm": 2.9587221145629883, + "learning_rate": 4.803867420657816e-06, + "loss": 0.6048, + "step": 1698 + }, + { + "epoch": 0.8033096926713948, + "grad_norm": 2.9238998889923096, + "learning_rate": 4.803625137568453e-06, + "loss": 0.6329, + "step": 1699 + }, + { + "epoch": 0.8037825059101655, + "grad_norm": 2.70473313331604, + "learning_rate": 4.803382711042455e-06, + "loss": 0.5427, + "step": 1700 + }, + { + "epoch": 0.8042553191489362, + "grad_norm": 3.1604979038238525, + "learning_rate": 4.803140141094914e-06, + "loss": 0.626, + "step": 1701 + }, + { + "epoch": 0.8047281323877069, + "grad_norm": 2.9567699432373047, + "learning_rate": 4.802897427740936e-06, + "loss": 0.5319, + "step": 1702 + }, + { + "epoch": 0.8052009456264776, + "grad_norm": 2.90983247756958, + "learning_rate": 4.802654570995632e-06, + "loss": 0.586, + "step": 1703 + }, + { + "epoch": 0.8056737588652483, + "grad_norm": 2.783480167388916, + "learning_rate": 4.8024115708741255e-06, + "loss": 0.5773, + "step": 1704 + }, + { + "epoch": 0.806146572104019, + "grad_norm": 3.3307793140411377, + "learning_rate": 4.802168427391547e-06, + "loss": 0.6257, + "step": 1705 + }, + { + "epoch": 0.8066193853427897, + "grad_norm": 3.0475001335144043, + "learning_rate": 4.801925140563034e-06, + "loss": 0.6612, + "step": 1706 + }, + { + "epoch": 0.8070921985815603, + "grad_norm": 2.8278894424438477, + "learning_rate": 4.8016817104037375e-06, + "loss": 0.6449, + "step": 1707 + }, + { + "epoch": 0.807565011820331, + "grad_norm": 2.760244369506836, + "learning_rate": 4.801438136928812e-06, + "loss": 0.7007, + "step": 1708 + }, + { + "epoch": 0.8080378250591016, + "grad_norm": 2.827634572982788, + "learning_rate": 4.801194420153427e-06, + "loss": 0.6418, + "step": 1709 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.8655009269714355, + "learning_rate": 
4.800950560092754e-06, + "loss": 0.6231, + "step": 1710 + }, + { + "epoch": 0.808983451536643, + "grad_norm": 2.738112688064575, + "learning_rate": 4.800706556761981e-06, + "loss": 0.6463, + "step": 1711 + }, + { + "epoch": 0.8094562647754137, + "grad_norm": 2.4781179428100586, + "learning_rate": 4.800462410176296e-06, + "loss": 0.5365, + "step": 1712 + }, + { + "epoch": 0.8099290780141843, + "grad_norm": 2.6049838066101074, + "learning_rate": 4.800218120350906e-06, + "loss": 0.6035, + "step": 1713 + }, + { + "epoch": 0.810401891252955, + "grad_norm": 2.9089980125427246, + "learning_rate": 4.79997368730102e-06, + "loss": 0.5828, + "step": 1714 + }, + { + "epoch": 0.8108747044917257, + "grad_norm": 2.831871747970581, + "learning_rate": 4.799729111041857e-06, + "loss": 0.5953, + "step": 1715 + }, + { + "epoch": 0.8113475177304964, + "grad_norm": 2.5611300468444824, + "learning_rate": 4.799484391588647e-06, + "loss": 0.6302, + "step": 1716 + }, + { + "epoch": 0.8118203309692671, + "grad_norm": 2.744070053100586, + "learning_rate": 4.799239528956625e-06, + "loss": 0.5561, + "step": 1717 + }, + { + "epoch": 0.8122931442080378, + "grad_norm": 2.7344231605529785, + "learning_rate": 4.798994523161041e-06, + "loss": 0.6317, + "step": 1718 + }, + { + "epoch": 0.8127659574468085, + "grad_norm": 2.3420889377593994, + "learning_rate": 4.798749374217149e-06, + "loss": 0.5415, + "step": 1719 + }, + { + "epoch": 0.8132387706855791, + "grad_norm": 2.57384991645813, + "learning_rate": 4.798504082140212e-06, + "loss": 0.6383, + "step": 1720 + }, + { + "epoch": 0.8137115839243498, + "grad_norm": 2.8819844722747803, + "learning_rate": 4.798258646945505e-06, + "loss": 0.6355, + "step": 1721 + }, + { + "epoch": 0.8141843971631205, + "grad_norm": 2.908123254776001, + "learning_rate": 4.79801306864831e-06, + "loss": 0.701, + "step": 1722 + }, + { + "epoch": 0.8146572104018912, + "grad_norm": 2.6500701904296875, + "learning_rate": 4.797767347263917e-06, + "loss": 0.6152, + "step": 1723 + }, + { + "epoch": 0.8151300236406619, + "grad_norm": 2.5513017177581787, + "learning_rate": 4.797521482807628e-06, + "loss": 0.6241, + "step": 1724 + }, + { + "epoch": 0.8156028368794326, + "grad_norm": 2.6239185333251953, + "learning_rate": 4.7972754752947495e-06, + "loss": 0.6072, + "step": 1725 + }, + { + "epoch": 0.8160756501182033, + "grad_norm": 2.673436403274536, + "learning_rate": 4.797029324740601e-06, + "loss": 0.5802, + "step": 1726 + }, + { + "epoch": 0.816548463356974, + "grad_norm": 2.533831834793091, + "learning_rate": 4.796783031160508e-06, + "loss": 0.5566, + "step": 1727 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 2.9806582927703857, + "learning_rate": 4.796536594569807e-06, + "loss": 0.6945, + "step": 1728 + }, + { + "epoch": 0.8174940898345153, + "grad_norm": 2.7093560695648193, + "learning_rate": 4.796290014983842e-06, + "loss": 0.7143, + "step": 1729 + }, + { + "epoch": 0.817966903073286, + "grad_norm": 2.814507246017456, + "learning_rate": 4.796043292417967e-06, + "loss": 0.6122, + "step": 1730 + }, + { + "epoch": 0.8184397163120567, + "grad_norm": 2.537156820297241, + "learning_rate": 4.795796426887543e-06, + "loss": 0.6229, + "step": 1731 + }, + { + "epoch": 0.8189125295508274, + "grad_norm": 2.4878013134002686, + "learning_rate": 4.795549418407944e-06, + "loss": 0.5442, + "step": 1732 + }, + { + "epoch": 0.8193853427895981, + "grad_norm": 2.839383363723755, + "learning_rate": 4.795302266994548e-06, + "loss": 0.6717, + "step": 1733 + }, + { + "epoch": 0.8198581560283688, + "grad_norm": 
3.1981801986694336, + "learning_rate": 4.795054972662744e-06, + "loss": 0.6596, + "step": 1734 + }, + { + "epoch": 0.8203309692671394, + "grad_norm": 2.781730890274048, + "learning_rate": 4.79480753542793e-06, + "loss": 0.5845, + "step": 1735 + }, + { + "epoch": 0.8208037825059101, + "grad_norm": 2.689948558807373, + "learning_rate": 4.794559955305513e-06, + "loss": 0.5928, + "step": 1736 + }, + { + "epoch": 0.8212765957446808, + "grad_norm": 2.7267637252807617, + "learning_rate": 4.7943122323109105e-06, + "loss": 0.5224, + "step": 1737 + }, + { + "epoch": 0.8217494089834515, + "grad_norm": 2.4346601963043213, + "learning_rate": 4.794064366459544e-06, + "loss": 0.6431, + "step": 1738 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 2.7440176010131836, + "learning_rate": 4.793816357766849e-06, + "loss": 0.6083, + "step": 1739 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 2.6558027267456055, + "learning_rate": 4.793568206248268e-06, + "loss": 0.698, + "step": 1740 + }, + { + "epoch": 0.8231678486997636, + "grad_norm": 2.591658353805542, + "learning_rate": 4.793319911919251e-06, + "loss": 0.6601, + "step": 1741 + }, + { + "epoch": 0.8236406619385342, + "grad_norm": 2.5431172847747803, + "learning_rate": 4.79307147479526e-06, + "loss": 0.5917, + "step": 1742 + }, + { + "epoch": 0.8241134751773049, + "grad_norm": 2.7335588932037354, + "learning_rate": 4.792822894891762e-06, + "loss": 0.5925, + "step": 1743 + }, + { + "epoch": 0.8245862884160756, + "grad_norm": 2.2500839233398438, + "learning_rate": 4.792574172224237e-06, + "loss": 0.4984, + "step": 1744 + }, + { + "epoch": 0.8250591016548463, + "grad_norm": 2.691343069076538, + "learning_rate": 4.79232530680817e-06, + "loss": 0.6262, + "step": 1745 + }, + { + "epoch": 0.825531914893617, + "grad_norm": 2.612204074859619, + "learning_rate": 4.792076298659058e-06, + "loss": 0.5822, + "step": 1746 + }, + { + "epoch": 0.8260047281323877, + "grad_norm": 3.0163519382476807, + "learning_rate": 4.791827147792406e-06, + "loss": 0.6263, + "step": 1747 + }, + { + "epoch": 0.8264775413711584, + "grad_norm": 2.742183208465576, + "learning_rate": 4.791577854223727e-06, + "loss": 0.6628, + "step": 1748 + }, + { + "epoch": 0.826950354609929, + "grad_norm": 2.872213840484619, + "learning_rate": 4.791328417968542e-06, + "loss": 0.6332, + "step": 1749 + }, + { + "epoch": 0.8274231678486997, + "grad_norm": 2.725006580352783, + "learning_rate": 4.7910788390423844e-06, + "loss": 0.6266, + "step": 1750 + }, + { + "epoch": 0.8278959810874704, + "grad_norm": 3.0366697311401367, + "learning_rate": 4.790829117460793e-06, + "loss": 0.6403, + "step": 1751 + }, + { + "epoch": 0.8283687943262411, + "grad_norm": 2.594881772994995, + "learning_rate": 4.790579253239318e-06, + "loss": 0.521, + "step": 1752 + }, + { + "epoch": 0.8288416075650118, + "grad_norm": 2.4496347904205322, + "learning_rate": 4.790329246393517e-06, + "loss": 0.54, + "step": 1753 + }, + { + "epoch": 0.8293144208037825, + "grad_norm": 3.102278470993042, + "learning_rate": 4.790079096938956e-06, + "loss": 0.6142, + "step": 1754 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 2.4645912647247314, + "learning_rate": 4.789828804891212e-06, + "loss": 0.5212, + "step": 1755 + }, + { + "epoch": 0.8302600472813239, + "grad_norm": 2.7482516765594482, + "learning_rate": 4.789578370265868e-06, + "loss": 0.6712, + "step": 1756 + }, + { + "epoch": 0.8307328605200945, + "grad_norm": 2.61360502243042, + "learning_rate": 4.7893277930785195e-06, + "loss": 0.6367, + "step": 1757 + }, + { + "epoch": 
0.8312056737588652, + "grad_norm": 2.79028058052063, + "learning_rate": 4.789077073344767e-06, + "loss": 0.5099, + "step": 1758 + }, + { + "epoch": 0.8316784869976359, + "grad_norm": 2.647662401199341, + "learning_rate": 4.788826211080222e-06, + "loss": 0.6698, + "step": 1759 + }, + { + "epoch": 0.8321513002364066, + "grad_norm": 3.0214831829071045, + "learning_rate": 4.7885752063005055e-06, + "loss": 0.6121, + "step": 1760 + }, + { + "epoch": 0.8326241134751773, + "grad_norm": 2.8244032859802246, + "learning_rate": 4.788324059021247e-06, + "loss": 0.6921, + "step": 1761 + }, + { + "epoch": 0.833096926713948, + "grad_norm": 3.1501076221466064, + "learning_rate": 4.788072769258082e-06, + "loss": 0.6872, + "step": 1762 + }, + { + "epoch": 0.8335697399527187, + "grad_norm": 2.6989903450012207, + "learning_rate": 4.7878213370266594e-06, + "loss": 0.5884, + "step": 1763 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 2.6982665061950684, + "learning_rate": 4.787569762342633e-06, + "loss": 0.6112, + "step": 1764 + }, + { + "epoch": 0.83451536643026, + "grad_norm": 2.6918323040008545, + "learning_rate": 4.7873180452216685e-06, + "loss": 0.5315, + "step": 1765 + }, + { + "epoch": 0.8349881796690307, + "grad_norm": 2.5494401454925537, + "learning_rate": 4.78706618567944e-06, + "loss": 0.5909, + "step": 1766 + }, + { + "epoch": 0.8354609929078014, + "grad_norm": 2.7532095909118652, + "learning_rate": 4.786814183731627e-06, + "loss": 0.5566, + "step": 1767 + }, + { + "epoch": 0.8359338061465721, + "grad_norm": 2.550865888595581, + "learning_rate": 4.786562039393923e-06, + "loss": 0.555, + "step": 1768 + }, + { + "epoch": 0.8364066193853428, + "grad_norm": 2.4477791786193848, + "learning_rate": 4.786309752682028e-06, + "loss": 0.5844, + "step": 1769 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 2.6982262134552, + "learning_rate": 4.7860573236116485e-06, + "loss": 0.6136, + "step": 1770 + }, + { + "epoch": 0.8373522458628841, + "grad_norm": 2.456263542175293, + "learning_rate": 4.785804752198503e-06, + "loss": 0.5055, + "step": 1771 + }, + { + "epoch": 0.8378250591016548, + "grad_norm": 2.428544521331787, + "learning_rate": 4.78555203845832e-06, + "loss": 0.5859, + "step": 1772 + }, + { + "epoch": 0.8382978723404255, + "grad_norm": 2.1782307624816895, + "learning_rate": 4.785299182406833e-06, + "loss": 0.5325, + "step": 1773 + }, + { + "epoch": 0.8387706855791962, + "grad_norm": 3.137956142425537, + "learning_rate": 4.785046184059786e-06, + "loss": 0.6097, + "step": 1774 + }, + { + "epoch": 0.8392434988179669, + "grad_norm": 2.6269001960754395, + "learning_rate": 4.7847930434329336e-06, + "loss": 0.5972, + "step": 1775 + }, + { + "epoch": 0.8397163120567376, + "grad_norm": 2.732659339904785, + "learning_rate": 4.784539760542037e-06, + "loss": 0.6054, + "step": 1776 + }, + { + "epoch": 0.8401891252955083, + "grad_norm": 2.5346736907958984, + "learning_rate": 4.784286335402866e-06, + "loss": 0.5521, + "step": 1777 + }, + { + "epoch": 0.840661938534279, + "grad_norm": 3.1420228481292725, + "learning_rate": 4.784032768031202e-06, + "loss": 0.6165, + "step": 1778 + }, + { + "epoch": 0.8411347517730496, + "grad_norm": 3.073793411254883, + "learning_rate": 4.783779058442831e-06, + "loss": 0.6414, + "step": 1779 + }, + { + "epoch": 0.8416075650118203, + "grad_norm": 2.6621336936950684, + "learning_rate": 4.783525206653554e-06, + "loss": 0.5836, + "step": 1780 + }, + { + "epoch": 0.842080378250591, + "grad_norm": 2.7029049396514893, + "learning_rate": 4.7832712126791745e-06, + "loss": 0.5897, + 
"step": 1781 + }, + { + "epoch": 0.8425531914893617, + "grad_norm": 2.4733822345733643, + "learning_rate": 4.783017076535509e-06, + "loss": 0.5913, + "step": 1782 + }, + { + "epoch": 0.8430260047281324, + "grad_norm": 2.8119473457336426, + "learning_rate": 4.782762798238381e-06, + "loss": 0.6105, + "step": 1783 + }, + { + "epoch": 0.8434988179669031, + "grad_norm": 2.5290818214416504, + "learning_rate": 4.782508377803622e-06, + "loss": 0.6119, + "step": 1784 + }, + { + "epoch": 0.8439716312056738, + "grad_norm": 3.193472385406494, + "learning_rate": 4.782253815247076e-06, + "loss": 0.6665, + "step": 1785 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 3.206759452819824, + "learning_rate": 4.781999110584592e-06, + "loss": 0.6012, + "step": 1786 + }, + { + "epoch": 0.8449172576832151, + "grad_norm": 2.6227457523345947, + "learning_rate": 4.781744263832029e-06, + "loss": 0.5845, + "step": 1787 + }, + { + "epoch": 0.8453900709219858, + "grad_norm": 2.838365316390991, + "learning_rate": 4.781489275005257e-06, + "loss": 0.5695, + "step": 1788 + }, + { + "epoch": 0.8458628841607565, + "grad_norm": 2.8348326683044434, + "learning_rate": 4.78123414412015e-06, + "loss": 0.6136, + "step": 1789 + }, + { + "epoch": 0.8463356973995272, + "grad_norm": 2.5698344707489014, + "learning_rate": 4.780978871192597e-06, + "loss": 0.6576, + "step": 1790 + }, + { + "epoch": 0.8468085106382979, + "grad_norm": 2.5198330879211426, + "learning_rate": 4.780723456238492e-06, + "loss": 0.5521, + "step": 1791 + }, + { + "epoch": 0.8472813238770686, + "grad_norm": 3.001325845718384, + "learning_rate": 4.780467899273737e-06, + "loss": 0.6075, + "step": 1792 + }, + { + "epoch": 0.8477541371158392, + "grad_norm": 2.7732746601104736, + "learning_rate": 4.780212200314247e-06, + "loss": 0.6245, + "step": 1793 + }, + { + "epoch": 0.8482269503546099, + "grad_norm": 2.6950337886810303, + "learning_rate": 4.77995635937594e-06, + "loss": 0.5723, + "step": 1794 + }, + { + "epoch": 0.8486997635933806, + "grad_norm": 2.82051420211792, + "learning_rate": 4.779700376474749e-06, + "loss": 0.6184, + "step": 1795 + }, + { + "epoch": 0.8491725768321513, + "grad_norm": 2.757791757583618, + "learning_rate": 4.779444251626611e-06, + "loss": 0.608, + "step": 1796 + }, + { + "epoch": 0.849645390070922, + "grad_norm": 2.394108533859253, + "learning_rate": 4.779187984847475e-06, + "loss": 0.6174, + "step": 1797 + }, + { + "epoch": 0.8501182033096927, + "grad_norm": 2.427562713623047, + "learning_rate": 4.778931576153296e-06, + "loss": 0.5618, + "step": 1798 + }, + { + "epoch": 0.8505910165484634, + "grad_norm": 2.891268491744995, + "learning_rate": 4.778675025560042e-06, + "loss": 0.6865, + "step": 1799 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.665534257888794, + "learning_rate": 4.778418333083685e-06, + "loss": 0.5852, + "step": 1800 + }, + { + "epoch": 0.8515366430260047, + "grad_norm": 2.5492889881134033, + "learning_rate": 4.7781614987402095e-06, + "loss": 0.5161, + "step": 1801 + }, + { + "epoch": 0.8520094562647754, + "grad_norm": 2.400177001953125, + "learning_rate": 4.777904522545607e-06, + "loss": 0.5128, + "step": 1802 + }, + { + "epoch": 0.8524822695035461, + "grad_norm": 2.3949809074401855, + "learning_rate": 4.777647404515878e-06, + "loss": 0.571, + "step": 1803 + }, + { + "epoch": 0.8529550827423168, + "grad_norm": 2.3624472618103027, + "learning_rate": 4.7773901446670325e-06, + "loss": 0.5486, + "step": 1804 + }, + { + "epoch": 0.8534278959810875, + "grad_norm": 2.711366891860962, + "learning_rate": 
4.7771327430150885e-06, + "loss": 0.5667, + "step": 1805 + }, + { + "epoch": 0.8539007092198582, + "grad_norm": 2.7681493759155273, + "learning_rate": 4.776875199576073e-06, + "loss": 0.5686, + "step": 1806 + }, + { + "epoch": 0.8543735224586289, + "grad_norm": 3.0369436740875244, + "learning_rate": 4.776617514366023e-06, + "loss": 0.6635, + "step": 1807 + }, + { + "epoch": 0.8548463356973995, + "grad_norm": 2.919649600982666, + "learning_rate": 4.776359687400983e-06, + "loss": 0.5749, + "step": 1808 + }, + { + "epoch": 0.8553191489361702, + "grad_norm": 2.7986185550689697, + "learning_rate": 4.776101718697007e-06, + "loss": 0.559, + "step": 1809 + }, + { + "epoch": 0.8557919621749409, + "grad_norm": 2.5951223373413086, + "learning_rate": 4.775843608270158e-06, + "loss": 0.5654, + "step": 1810 + }, + { + "epoch": 0.8562647754137116, + "grad_norm": 2.674138069152832, + "learning_rate": 4.775585356136505e-06, + "loss": 0.5286, + "step": 1811 + }, + { + "epoch": 0.8567375886524823, + "grad_norm": 3.045437812805176, + "learning_rate": 4.775326962312131e-06, + "loss": 0.6185, + "step": 1812 + }, + { + "epoch": 0.857210401891253, + "grad_norm": 2.6145293712615967, + "learning_rate": 4.775068426813124e-06, + "loss": 0.6075, + "step": 1813 + }, + { + "epoch": 0.8576832151300237, + "grad_norm": 2.6320106983184814, + "learning_rate": 4.7748097496555824e-06, + "loss": 0.561, + "step": 1814 + }, + { + "epoch": 0.8581560283687943, + "grad_norm": 2.5038623809814453, + "learning_rate": 4.774550930855612e-06, + "loss": 0.593, + "step": 1815 + }, + { + "epoch": 0.858628841607565, + "grad_norm": 2.8168089389801025, + "learning_rate": 4.774291970429329e-06, + "loss": 0.5196, + "step": 1816 + }, + { + "epoch": 0.8591016548463357, + "grad_norm": 2.778130292892456, + "learning_rate": 4.774032868392858e-06, + "loss": 0.5984, + "step": 1817 + }, + { + "epoch": 0.8595744680851064, + "grad_norm": 2.536458730697632, + "learning_rate": 4.7737736247623305e-06, + "loss": 0.568, + "step": 1818 + }, + { + "epoch": 0.8600472813238771, + "grad_norm": 2.6669719219207764, + "learning_rate": 4.77351423955389e-06, + "loss": 0.6233, + "step": 1819 + }, + { + "epoch": 0.8605200945626478, + "grad_norm": 2.578242540359497, + "learning_rate": 4.773254712783687e-06, + "loss": 0.579, + "step": 1820 + }, + { + "epoch": 0.8609929078014185, + "grad_norm": 2.816664457321167, + "learning_rate": 4.772995044467881e-06, + "loss": 0.6635, + "step": 1821 + }, + { + "epoch": 0.8614657210401891, + "grad_norm": 3.1111979484558105, + "learning_rate": 4.77273523462264e-06, + "loss": 0.6372, + "step": 1822 + }, + { + "epoch": 0.8619385342789598, + "grad_norm": 2.764552354812622, + "learning_rate": 4.772475283264142e-06, + "loss": 0.6216, + "step": 1823 + }, + { + "epoch": 0.8624113475177305, + "grad_norm": 2.9126830101013184, + "learning_rate": 4.772215190408572e-06, + "loss": 0.6396, + "step": 1824 + }, + { + "epoch": 0.8628841607565012, + "grad_norm": 2.7502307891845703, + "learning_rate": 4.7719549560721264e-06, + "loss": 0.6186, + "step": 1825 + }, + { + "epoch": 0.8633569739952719, + "grad_norm": 2.6279006004333496, + "learning_rate": 4.771694580271007e-06, + "loss": 0.5557, + "step": 1826 + }, + { + "epoch": 0.8638297872340426, + "grad_norm": 2.996563196182251, + "learning_rate": 4.7714340630214276e-06, + "loss": 0.6259, + "step": 1827 + }, + { + "epoch": 0.8643026004728133, + "grad_norm": 3.231323480606079, + "learning_rate": 4.771173404339609e-06, + "loss": 0.5473, + "step": 1828 + }, + { + "epoch": 0.864775413711584, + "grad_norm": 
3.143519878387451, + "learning_rate": 4.770912604241781e-06, + "loss": 0.593, + "step": 1829 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 2.515484094619751, + "learning_rate": 4.770651662744184e-06, + "loss": 0.538, + "step": 1830 + }, + { + "epoch": 0.8657210401891253, + "grad_norm": 2.629058837890625, + "learning_rate": 4.770390579863064e-06, + "loss": 0.5745, + "step": 1831 + }, + { + "epoch": 0.866193853427896, + "grad_norm": 2.5826802253723145, + "learning_rate": 4.770129355614677e-06, + "loss": 0.6397, + "step": 1832 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 2.954623222351074, + "learning_rate": 4.769867990015289e-06, + "loss": 0.6106, + "step": 1833 + }, + { + "epoch": 0.8671394799054374, + "grad_norm": 2.742192268371582, + "learning_rate": 4.769606483081175e-06, + "loss": 0.6902, + "step": 1834 + }, + { + "epoch": 0.8676122931442081, + "grad_norm": 2.2619097232818604, + "learning_rate": 4.769344834828618e-06, + "loss": 0.5414, + "step": 1835 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 2.7384188175201416, + "learning_rate": 4.769083045273908e-06, + "loss": 0.5787, + "step": 1836 + }, + { + "epoch": 0.8685579196217494, + "grad_norm": 2.6734485626220703, + "learning_rate": 4.768821114433346e-06, + "loss": 0.5923, + "step": 1837 + }, + { + "epoch": 0.8690307328605201, + "grad_norm": 2.286140203475952, + "learning_rate": 4.768559042323243e-06, + "loss": 0.5822, + "step": 1838 + }, + { + "epoch": 0.8695035460992908, + "grad_norm": 3.0243725776672363, + "learning_rate": 4.768296828959915e-06, + "loss": 0.6623, + "step": 1839 + }, + { + "epoch": 0.8699763593380615, + "grad_norm": 2.4026312828063965, + "learning_rate": 4.768034474359689e-06, + "loss": 0.5554, + "step": 1840 + }, + { + "epoch": 0.8704491725768322, + "grad_norm": 2.7469029426574707, + "learning_rate": 4.767771978538903e-06, + "loss": 0.6316, + "step": 1841 + }, + { + "epoch": 0.8709219858156029, + "grad_norm": 2.729659080505371, + "learning_rate": 4.767509341513899e-06, + "loss": 0.5807, + "step": 1842 + }, + { + "epoch": 0.8713947990543736, + "grad_norm": 2.5336945056915283, + "learning_rate": 4.76724656330103e-06, + "loss": 0.6109, + "step": 1843 + }, + { + "epoch": 0.8718676122931442, + "grad_norm": 2.519880533218384, + "learning_rate": 4.76698364391666e-06, + "loss": 0.5313, + "step": 1844 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 2.698862075805664, + "learning_rate": 4.766720583377159e-06, + "loss": 0.5953, + "step": 1845 + }, + { + "epoch": 0.8728132387706856, + "grad_norm": 3.0195560455322266, + "learning_rate": 4.766457381698907e-06, + "loss": 0.5965, + "step": 1846 + }, + { + "epoch": 0.8732860520094563, + "grad_norm": 2.5972697734832764, + "learning_rate": 4.766194038898291e-06, + "loss": 0.6014, + "step": 1847 + }, + { + "epoch": 0.873758865248227, + "grad_norm": 2.7132294178009033, + "learning_rate": 4.76593055499171e-06, + "loss": 0.5638, + "step": 1848 + }, + { + "epoch": 0.8742316784869977, + "grad_norm": 2.7134575843811035, + "learning_rate": 4.765666929995568e-06, + "loss": 0.52, + "step": 1849 + }, + { + "epoch": 0.8747044917257684, + "grad_norm": 2.3804993629455566, + "learning_rate": 4.765403163926282e-06, + "loss": 0.5435, + "step": 1850 + }, + { + "epoch": 0.875177304964539, + "grad_norm": 2.8782761096954346, + "learning_rate": 4.765139256800274e-06, + "loss": 0.5843, + "step": 1851 + }, + { + "epoch": 0.8756501182033097, + "grad_norm": 2.836209774017334, + "learning_rate": 4.764875208633977e-06, + "loss": 0.6667, + "step": 1852 + }, + { + "epoch": 
0.8761229314420804, + "grad_norm": 2.608851194381714, + "learning_rate": 4.764611019443831e-06, + "loss": 0.5436, + "step": 1853 + }, + { + "epoch": 0.8765957446808511, + "grad_norm": 2.788738965988159, + "learning_rate": 4.764346689246288e-06, + "loss": 0.7331, + "step": 1854 + }, + { + "epoch": 0.8770685579196218, + "grad_norm": 2.524277687072754, + "learning_rate": 4.764082218057805e-06, + "loss": 0.5067, + "step": 1855 + }, + { + "epoch": 0.8775413711583925, + "grad_norm": 3.7559316158294678, + "learning_rate": 4.763817605894851e-06, + "loss": 0.6809, + "step": 1856 + }, + { + "epoch": 0.8780141843971632, + "grad_norm": 2.9070613384246826, + "learning_rate": 4.763552852773899e-06, + "loss": 0.5913, + "step": 1857 + }, + { + "epoch": 0.8784869976359339, + "grad_norm": 2.7050609588623047, + "learning_rate": 4.7632879587114386e-06, + "loss": 0.6074, + "step": 1858 + }, + { + "epoch": 0.8789598108747045, + "grad_norm": 2.891134262084961, + "learning_rate": 4.76302292372396e-06, + "loss": 0.5939, + "step": 1859 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 2.8581702709198, + "learning_rate": 4.762757747827968e-06, + "loss": 0.5972, + "step": 1860 + }, + { + "epoch": 0.8799054373522459, + "grad_norm": 2.8266196250915527, + "learning_rate": 4.762492431039971e-06, + "loss": 0.5993, + "step": 1861 + }, + { + "epoch": 0.8803782505910166, + "grad_norm": 2.4853954315185547, + "learning_rate": 4.762226973376493e-06, + "loss": 0.6388, + "step": 1862 + }, + { + "epoch": 0.8808510638297873, + "grad_norm": 3.2212886810302734, + "learning_rate": 4.761961374854059e-06, + "loss": 0.6698, + "step": 1863 + }, + { + "epoch": 0.881323877068558, + "grad_norm": 3.1254501342773438, + "learning_rate": 4.761695635489211e-06, + "loss": 0.5263, + "step": 1864 + }, + { + "epoch": 0.8817966903073287, + "grad_norm": 2.6891462802886963, + "learning_rate": 4.761429755298491e-06, + "loss": 0.5359, + "step": 1865 + }, + { + "epoch": 0.8822695035460993, + "grad_norm": 2.8557538986206055, + "learning_rate": 4.761163734298457e-06, + "loss": 0.5933, + "step": 1866 + }, + { + "epoch": 0.88274231678487, + "grad_norm": 2.53548264503479, + "learning_rate": 4.7608975725056724e-06, + "loss": 0.6397, + "step": 1867 + }, + { + "epoch": 0.8832151300236407, + "grad_norm": 3.0237956047058105, + "learning_rate": 4.76063126993671e-06, + "loss": 0.6845, + "step": 1868 + }, + { + "epoch": 0.8836879432624114, + "grad_norm": 3.222886800765991, + "learning_rate": 4.76036482660815e-06, + "loss": 0.6055, + "step": 1869 + }, + { + "epoch": 0.8841607565011821, + "grad_norm": 3.1867551803588867, + "learning_rate": 4.760098242536584e-06, + "loss": 0.6592, + "step": 1870 + }, + { + "epoch": 0.8846335697399527, + "grad_norm": 2.782209873199463, + "learning_rate": 4.7598315177386115e-06, + "loss": 0.5833, + "step": 1871 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 2.899871587753296, + "learning_rate": 4.759564652230838e-06, + "loss": 0.6129, + "step": 1872 + }, + { + "epoch": 0.885579196217494, + "grad_norm": 2.5690579414367676, + "learning_rate": 4.759297646029882e-06, + "loss": 0.5827, + "step": 1873 + }, + { + "epoch": 0.8860520094562647, + "grad_norm": 2.666130304336548, + "learning_rate": 4.759030499152368e-06, + "loss": 0.5272, + "step": 1874 + }, + { + "epoch": 0.8865248226950354, + "grad_norm": 2.7030911445617676, + "learning_rate": 4.758763211614932e-06, + "loss": 0.6415, + "step": 1875 + }, + { + "epoch": 0.8869976359338061, + "grad_norm": 2.717512845993042, + "learning_rate": 4.7584957834342135e-06, + "loss": 0.5827, + 
"step": 1876 + }, + { + "epoch": 0.8874704491725768, + "grad_norm": 2.665823459625244, + "learning_rate": 4.758228214626867e-06, + "loss": 0.6209, + "step": 1877 + }, + { + "epoch": 0.8879432624113475, + "grad_norm": 2.636653184890747, + "learning_rate": 4.75796050520955e-06, + "loss": 0.6413, + "step": 1878 + }, + { + "epoch": 0.8884160756501182, + "grad_norm": 2.585115671157837, + "learning_rate": 4.7576926551989345e-06, + "loss": 0.5518, + "step": 1879 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.808526039123535, + "learning_rate": 4.757424664611697e-06, + "loss": 0.5717, + "step": 1880 + }, + { + "epoch": 0.8893617021276595, + "grad_norm": 3.5957939624786377, + "learning_rate": 4.757156533464524e-06, + "loss": 0.6323, + "step": 1881 + }, + { + "epoch": 0.8898345153664302, + "grad_norm": 2.5003883838653564, + "learning_rate": 4.756888261774111e-06, + "loss": 0.5937, + "step": 1882 + }, + { + "epoch": 0.8903073286052009, + "grad_norm": 2.749061346054077, + "learning_rate": 4.756619849557161e-06, + "loss": 0.6642, + "step": 1883 + }, + { + "epoch": 0.8907801418439716, + "grad_norm": 2.6757891178131104, + "learning_rate": 4.756351296830389e-06, + "loss": 0.5887, + "step": 1884 + }, + { + "epoch": 0.8912529550827423, + "grad_norm": 2.811925172805786, + "learning_rate": 4.756082603610516e-06, + "loss": 0.6571, + "step": 1885 + }, + { + "epoch": 0.891725768321513, + "grad_norm": 2.5054616928100586, + "learning_rate": 4.755813769914271e-06, + "loss": 0.6312, + "step": 1886 + }, + { + "epoch": 0.8921985815602836, + "grad_norm": 2.7518467903137207, + "learning_rate": 4.755544795758395e-06, + "loss": 0.6685, + "step": 1887 + }, + { + "epoch": 0.8926713947990543, + "grad_norm": 2.7527287006378174, + "learning_rate": 4.755275681159634e-06, + "loss": 0.5886, + "step": 1888 + }, + { + "epoch": 0.893144208037825, + "grad_norm": 2.6162452697753906, + "learning_rate": 4.755006426134745e-06, + "loss": 0.546, + "step": 1889 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 2.4016737937927246, + "learning_rate": 4.754737030700495e-06, + "loss": 0.5726, + "step": 1890 + }, + { + "epoch": 0.8940898345153664, + "grad_norm": 2.528327703475952, + "learning_rate": 4.754467494873656e-06, + "loss": 0.5682, + "step": 1891 + }, + { + "epoch": 0.8945626477541371, + "grad_norm": 2.3139286041259766, + "learning_rate": 4.7541978186710115e-06, + "loss": 0.6108, + "step": 1892 + }, + { + "epoch": 0.8950354609929078, + "grad_norm": 2.7269136905670166, + "learning_rate": 4.753928002109354e-06, + "loss": 0.5875, + "step": 1893 + }, + { + "epoch": 0.8955082742316784, + "grad_norm": 4.425495147705078, + "learning_rate": 4.753658045205482e-06, + "loss": 0.5572, + "step": 1894 + }, + { + "epoch": 0.8959810874704491, + "grad_norm": 2.535409927368164, + "learning_rate": 4.753387947976206e-06, + "loss": 0.5868, + "step": 1895 + }, + { + "epoch": 0.8964539007092198, + "grad_norm": 2.722458600997925, + "learning_rate": 4.753117710438343e-06, + "loss": 0.5935, + "step": 1896 + }, + { + "epoch": 0.8969267139479905, + "grad_norm": 2.743861436843872, + "learning_rate": 4.75284733260872e-06, + "loss": 0.572, + "step": 1897 + }, + { + "epoch": 0.8973995271867612, + "grad_norm": 2.60640549659729, + "learning_rate": 4.752576814504173e-06, + "loss": 0.567, + "step": 1898 + }, + { + "epoch": 0.8978723404255319, + "grad_norm": 2.7486042976379395, + "learning_rate": 4.7523061561415435e-06, + "loss": 0.5768, + "step": 1899 + }, + { + "epoch": 0.8983451536643026, + "grad_norm": 3.8410251140594482, + "learning_rate": 
4.752035357537686e-06, + "loss": 0.6034, + "step": 1900 + }, + { + "epoch": 0.8988179669030733, + "grad_norm": 3.0935890674591064, + "learning_rate": 4.751764418709462e-06, + "loss": 0.5644, + "step": 1901 + }, + { + "epoch": 0.8992907801418439, + "grad_norm": 2.7989892959594727, + "learning_rate": 4.751493339673742e-06, + "loss": 0.656, + "step": 1902 + }, + { + "epoch": 0.8997635933806146, + "grad_norm": 3.6940557956695557, + "learning_rate": 4.751222120447403e-06, + "loss": 0.6632, + "step": 1903 + }, + { + "epoch": 0.9002364066193853, + "grad_norm": 2.3428797721862793, + "learning_rate": 4.750950761047335e-06, + "loss": 0.4485, + "step": 1904 + }, + { + "epoch": 0.900709219858156, + "grad_norm": 2.622544050216675, + "learning_rate": 4.750679261490432e-06, + "loss": 0.5857, + "step": 1905 + }, + { + "epoch": 0.9011820330969267, + "grad_norm": 2.4911322593688965, + "learning_rate": 4.750407621793601e-06, + "loss": 0.5618, + "step": 1906 + }, + { + "epoch": 0.9016548463356974, + "grad_norm": 2.6434662342071533, + "learning_rate": 4.750135841973755e-06, + "loss": 0.6057, + "step": 1907 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 3.115443706512451, + "learning_rate": 4.749863922047817e-06, + "loss": 0.6064, + "step": 1908 + }, + { + "epoch": 0.9026004728132387, + "grad_norm": 2.5671091079711914, + "learning_rate": 4.749591862032718e-06, + "loss": 0.5625, + "step": 1909 + }, + { + "epoch": 0.9030732860520094, + "grad_norm": 3.2008655071258545, + "learning_rate": 4.749319661945398e-06, + "loss": 0.5547, + "step": 1910 + }, + { + "epoch": 0.9035460992907801, + "grad_norm": 2.905987024307251, + "learning_rate": 4.749047321802805e-06, + "loss": 0.6033, + "step": 1911 + }, + { + "epoch": 0.9040189125295508, + "grad_norm": 3.1456053256988525, + "learning_rate": 4.748774841621897e-06, + "loss": 0.5651, + "step": 1912 + }, + { + "epoch": 0.9044917257683215, + "grad_norm": 2.8116416931152344, + "learning_rate": 4.748502221419641e-06, + "loss": 0.5853, + "step": 1913 + }, + { + "epoch": 0.9049645390070922, + "grad_norm": 3.123835325241089, + "learning_rate": 4.748229461213011e-06, + "loss": 0.5427, + "step": 1914 + }, + { + "epoch": 0.9054373522458629, + "grad_norm": 2.4750146865844727, + "learning_rate": 4.747956561018989e-06, + "loss": 0.6517, + "step": 1915 + }, + { + "epoch": 0.9059101654846335, + "grad_norm": 2.6174299716949463, + "learning_rate": 4.7476835208545705e-06, + "loss": 0.6119, + "step": 1916 + }, + { + "epoch": 0.9063829787234042, + "grad_norm": 2.7390382289886475, + "learning_rate": 4.747410340736755e-06, + "loss": 0.5664, + "step": 1917 + }, + { + "epoch": 0.9068557919621749, + "grad_norm": 2.7940444946289062, + "learning_rate": 4.747137020682552e-06, + "loss": 0.5628, + "step": 1918 + }, + { + "epoch": 0.9073286052009456, + "grad_norm": 2.477365016937256, + "learning_rate": 4.7468635607089795e-06, + "loss": 0.5261, + "step": 1919 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 2.7016685009002686, + "learning_rate": 4.746589960833066e-06, + "loss": 0.5576, + "step": 1920 + }, + { + "epoch": 0.908274231678487, + "grad_norm": 2.8806519508361816, + "learning_rate": 4.746316221071846e-06, + "loss": 0.5925, + "step": 1921 + }, + { + "epoch": 0.9087470449172577, + "grad_norm": 3.0315234661102295, + "learning_rate": 4.746042341442365e-06, + "loss": 0.6142, + "step": 1922 + }, + { + "epoch": 0.9092198581560283, + "grad_norm": 4.2446160316467285, + "learning_rate": 4.745768321961676e-06, + "loss": 0.5352, + "step": 1923 + }, + { + "epoch": 0.909692671394799, + "grad_norm": 
2.6517012119293213, + "learning_rate": 4.745494162646841e-06, + "loss": 0.6118, + "step": 1924 + }, + { + "epoch": 0.9101654846335697, + "grad_norm": 2.774900197982788, + "learning_rate": 4.7452198635149304e-06, + "loss": 0.572, + "step": 1925 + }, + { + "epoch": 0.9106382978723404, + "grad_norm": 3.0133683681488037, + "learning_rate": 4.744945424583024e-06, + "loss": 0.5897, + "step": 1926 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 2.7344839572906494, + "learning_rate": 4.744670845868211e-06, + "loss": 0.6207, + "step": 1927 + }, + { + "epoch": 0.9115839243498818, + "grad_norm": 2.636578321456909, + "learning_rate": 4.744396127387586e-06, + "loss": 0.6687, + "step": 1928 + }, + { + "epoch": 0.9120567375886525, + "grad_norm": 2.8663458824157715, + "learning_rate": 4.744121269158255e-06, + "loss": 0.5002, + "step": 1929 + }, + { + "epoch": 0.9125295508274232, + "grad_norm": 2.661079168319702, + "learning_rate": 4.743846271197333e-06, + "loss": 0.5848, + "step": 1930 + }, + { + "epoch": 0.9130023640661938, + "grad_norm": 2.881256341934204, + "learning_rate": 4.743571133521943e-06, + "loss": 0.5911, + "step": 1931 + }, + { + "epoch": 0.9134751773049645, + "grad_norm": 2.5540573596954346, + "learning_rate": 4.743295856149217e-06, + "loss": 0.5647, + "step": 1932 + }, + { + "epoch": 0.9139479905437352, + "grad_norm": 2.7060387134552, + "learning_rate": 4.743020439096293e-06, + "loss": 0.6267, + "step": 1933 + }, + { + "epoch": 0.9144208037825059, + "grad_norm": 2.694481372833252, + "learning_rate": 4.742744882380323e-06, + "loss": 0.6283, + "step": 1934 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 2.711555242538452, + "learning_rate": 4.7424691860184625e-06, + "loss": 0.5784, + "step": 1935 + }, + { + "epoch": 0.9153664302600473, + "grad_norm": 2.9077224731445312, + "learning_rate": 4.742193350027879e-06, + "loss": 0.5948, + "step": 1936 + }, + { + "epoch": 0.915839243498818, + "grad_norm": 2.9824187755584717, + "learning_rate": 4.7419173744257476e-06, + "loss": 0.6115, + "step": 1937 + }, + { + "epoch": 0.9163120567375886, + "grad_norm": 2.5127830505371094, + "learning_rate": 4.7416412592292515e-06, + "loss": 0.5803, + "step": 1938 + }, + { + "epoch": 0.9167848699763593, + "grad_norm": 3.1307175159454346, + "learning_rate": 4.741365004455583e-06, + "loss": 0.5657, + "step": 1939 + }, + { + "epoch": 0.91725768321513, + "grad_norm": 2.8205273151397705, + "learning_rate": 4.741088610121944e-06, + "loss": 0.6145, + "step": 1940 + }, + { + "epoch": 0.9177304964539007, + "grad_norm": 2.6119720935821533, + "learning_rate": 4.7408120762455444e-06, + "loss": 0.6058, + "step": 1941 + }, + { + "epoch": 0.9182033096926714, + "grad_norm": 2.421276092529297, + "learning_rate": 4.7405354028436025e-06, + "loss": 0.5973, + "step": 1942 + }, + { + "epoch": 0.9186761229314421, + "grad_norm": 2.9846808910369873, + "learning_rate": 4.740258589933346e-06, + "loss": 0.6892, + "step": 1943 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 2.6899871826171875, + "learning_rate": 4.739981637532009e-06, + "loss": 0.5705, + "step": 1944 + }, + { + "epoch": 0.9196217494089834, + "grad_norm": 2.8636131286621094, + "learning_rate": 4.739704545656839e-06, + "loss": 0.5775, + "step": 1945 + }, + { + "epoch": 0.9200945626477541, + "grad_norm": 2.7659449577331543, + "learning_rate": 4.739427314325087e-06, + "loss": 0.5823, + "step": 1946 + }, + { + "epoch": 0.9205673758865248, + "grad_norm": 4.71295166015625, + "learning_rate": 4.739149943554016e-06, + "loss": 0.5601, + "step": 1947 + }, + { + "epoch": 
0.9210401891252955, + "grad_norm": 2.642636775970459, + "learning_rate": 4.738872433360896e-06, + "loss": 0.5278, + "step": 1948 + }, + { + "epoch": 0.9215130023640662, + "grad_norm": 2.4658217430114746, + "learning_rate": 4.7385947837630065e-06, + "loss": 0.6392, + "step": 1949 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 2.851602792739868, + "learning_rate": 4.738316994777636e-06, + "loss": 0.6164, + "step": 1950 + }, + { + "epoch": 0.9224586288416076, + "grad_norm": 2.394226551055908, + "learning_rate": 4.738039066422081e-06, + "loss": 0.5556, + "step": 1951 + }, + { + "epoch": 0.9229314420803783, + "grad_norm": 2.7985100746154785, + "learning_rate": 4.737760998713647e-06, + "loss": 0.5799, + "step": 1952 + }, + { + "epoch": 0.9234042553191489, + "grad_norm": 2.5974674224853516, + "learning_rate": 4.737482791669648e-06, + "loss": 0.6984, + "step": 1953 + }, + { + "epoch": 0.9238770685579196, + "grad_norm": 2.707636594772339, + "learning_rate": 4.737204445307406e-06, + "loss": 0.5548, + "step": 1954 + }, + { + "epoch": 0.9243498817966903, + "grad_norm": 2.7882707118988037, + "learning_rate": 4.736925959644254e-06, + "loss": 0.6026, + "step": 1955 + }, + { + "epoch": 0.924822695035461, + "grad_norm": 2.474482774734497, + "learning_rate": 4.7366473346975304e-06, + "loss": 0.5832, + "step": 1956 + }, + { + "epoch": 0.9252955082742317, + "grad_norm": 2.6196324825286865, + "learning_rate": 4.736368570484585e-06, + "loss": 0.5861, + "step": 1957 + }, + { + "epoch": 0.9257683215130024, + "grad_norm": 2.826864004135132, + "learning_rate": 4.736089667022775e-06, + "loss": 0.6173, + "step": 1958 + }, + { + "epoch": 0.926241134751773, + "grad_norm": 2.414473056793213, + "learning_rate": 4.735810624329466e-06, + "loss": 0.5753, + "step": 1959 + }, + { + "epoch": 0.9267139479905437, + "grad_norm": 2.8037970066070557, + "learning_rate": 4.7355314424220335e-06, + "loss": 0.6207, + "step": 1960 + }, + { + "epoch": 0.9271867612293144, + "grad_norm": 2.645458698272705, + "learning_rate": 4.735252121317861e-06, + "loss": 0.5959, + "step": 1961 + }, + { + "epoch": 0.9276595744680851, + "grad_norm": 2.7983884811401367, + "learning_rate": 4.734972661034339e-06, + "loss": 0.5696, + "step": 1962 + }, + { + "epoch": 0.9281323877068558, + "grad_norm": 3.0568997859954834, + "learning_rate": 4.73469306158887e-06, + "loss": 0.6194, + "step": 1963 + }, + { + "epoch": 0.9286052009456265, + "grad_norm": 2.7205135822296143, + "learning_rate": 4.734413322998863e-06, + "loss": 0.5292, + "step": 1964 + }, + { + "epoch": 0.9290780141843972, + "grad_norm": 3.3168489933013916, + "learning_rate": 4.734133445281735e-06, + "loss": 0.5654, + "step": 1965 + }, + { + "epoch": 0.9295508274231679, + "grad_norm": 3.0095653533935547, + "learning_rate": 4.733853428454916e-06, + "loss": 0.6508, + "step": 1966 + }, + { + "epoch": 0.9300236406619385, + "grad_norm": 2.7726712226867676, + "learning_rate": 4.733573272535838e-06, + "loss": 0.644, + "step": 1967 + }, + { + "epoch": 0.9304964539007092, + "grad_norm": 2.474397659301758, + "learning_rate": 4.7332929775419456e-06, + "loss": 0.5479, + "step": 1968 + }, + { + "epoch": 0.9309692671394799, + "grad_norm": 2.4518635272979736, + "learning_rate": 4.733012543490693e-06, + "loss": 0.6, + "step": 1969 + }, + { + "epoch": 0.9314420803782506, + "grad_norm": 2.9292192459106445, + "learning_rate": 4.73273197039954e-06, + "loss": 0.6647, + "step": 1970 + }, + { + "epoch": 0.9319148936170213, + "grad_norm": 2.425004720687866, + "learning_rate": 4.732451258285958e-06, + "loss": 0.6338, + 
"step": 1971 + }, + { + "epoch": 0.932387706855792, + "grad_norm": 2.904479503631592, + "learning_rate": 4.7321704071674255e-06, + "loss": 0.5923, + "step": 1972 + }, + { + "epoch": 0.9328605200945627, + "grad_norm": 2.477085590362549, + "learning_rate": 4.731889417061428e-06, + "loss": 0.5984, + "step": 1973 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 2.585240364074707, + "learning_rate": 4.731608287985465e-06, + "loss": 0.558, + "step": 1974 + }, + { + "epoch": 0.933806146572104, + "grad_norm": 2.658714532852173, + "learning_rate": 4.731327019957039e-06, + "loss": 0.5567, + "step": 1975 + }, + { + "epoch": 0.9342789598108747, + "grad_norm": 2.7593026161193848, + "learning_rate": 4.731045612993662e-06, + "loss": 0.5772, + "step": 1976 + }, + { + "epoch": 0.9347517730496454, + "grad_norm": 2.4386026859283447, + "learning_rate": 4.7307640671128585e-06, + "loss": 0.6199, + "step": 1977 + }, + { + "epoch": 0.9352245862884161, + "grad_norm": 2.681910514831543, + "learning_rate": 4.730482382332158e-06, + "loss": 0.5971, + "step": 1978 + }, + { + "epoch": 0.9356973995271868, + "grad_norm": 3.7593860626220703, + "learning_rate": 4.7302005586691e-06, + "loss": 0.6346, + "step": 1979 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.5789096355438232, + "learning_rate": 4.729918596141232e-06, + "loss": 0.5684, + "step": 1980 + }, + { + "epoch": 0.9366430260047282, + "grad_norm": 3.0607335567474365, + "learning_rate": 4.729636494766111e-06, + "loss": 0.6223, + "step": 1981 + }, + { + "epoch": 0.9371158392434988, + "grad_norm": 2.906643867492676, + "learning_rate": 4.729354254561303e-06, + "loss": 0.6513, + "step": 1982 + }, + { + "epoch": 0.9375886524822695, + "grad_norm": 3.192430019378662, + "learning_rate": 4.7290718755443795e-06, + "loss": 0.5095, + "step": 1983 + }, + { + "epoch": 0.9380614657210402, + "grad_norm": 2.661536931991577, + "learning_rate": 4.7287893577329255e-06, + "loss": 0.5525, + "step": 1984 + }, + { + "epoch": 0.9385342789598109, + "grad_norm": 2.8436734676361084, + "learning_rate": 4.728506701144531e-06, + "loss": 0.6323, + "step": 1985 + }, + { + "epoch": 0.9390070921985816, + "grad_norm": 2.75544810295105, + "learning_rate": 4.728223905796796e-06, + "loss": 0.6018, + "step": 1986 + }, + { + "epoch": 0.9394799054373523, + "grad_norm": 3.0652759075164795, + "learning_rate": 4.727940971707329e-06, + "loss": 0.62, + "step": 1987 + }, + { + "epoch": 0.939952718676123, + "grad_norm": 2.802567720413208, + "learning_rate": 4.727657898893747e-06, + "loss": 0.5809, + "step": 1988 + }, + { + "epoch": 0.9404255319148936, + "grad_norm": 2.6208512783050537, + "learning_rate": 4.7273746873736745e-06, + "loss": 0.5762, + "step": 1989 + }, + { + "epoch": 0.9408983451536643, + "grad_norm": 2.5901873111724854, + "learning_rate": 4.727091337164748e-06, + "loss": 0.6111, + "step": 1990 + }, + { + "epoch": 0.941371158392435, + "grad_norm": 3.002347707748413, + "learning_rate": 4.726807848284609e-06, + "loss": 0.6419, + "step": 1991 + }, + { + "epoch": 0.9418439716312057, + "grad_norm": 2.522151470184326, + "learning_rate": 4.72652422075091e-06, + "loss": 0.642, + "step": 1992 + }, + { + "epoch": 0.9423167848699764, + "grad_norm": 2.5571532249450684, + "learning_rate": 4.726240454581311e-06, + "loss": 0.5729, + "step": 1993 + }, + { + "epoch": 0.9427895981087471, + "grad_norm": 2.7704918384552, + "learning_rate": 4.72595654979348e-06, + "loss": 0.6816, + "step": 1994 + }, + { + "epoch": 0.9432624113475178, + "grad_norm": 2.517040491104126, + "learning_rate": 
4.7256725064050955e-06, + "loss": 0.5782, + "step": 1995 + }, + { + "epoch": 0.9437352245862884, + "grad_norm": 2.613955020904541, + "learning_rate": 4.725388324433843e-06, + "loss": 0.6291, + "step": 1996 + }, + { + "epoch": 0.9442080378250591, + "grad_norm": 2.848891258239746, + "learning_rate": 4.725104003897418e-06, + "loss": 0.6544, + "step": 1997 + }, + { + "epoch": 0.9446808510638298, + "grad_norm": 3.0162429809570312, + "learning_rate": 4.724819544813523e-06, + "loss": 0.6301, + "step": 1998 + }, + { + "epoch": 0.9451536643026005, + "grad_norm": 2.613614559173584, + "learning_rate": 4.72453494719987e-06, + "loss": 0.5829, + "step": 1999 + }, + { + "epoch": 0.9456264775413712, + "grad_norm": 2.4838767051696777, + "learning_rate": 4.724250211074182e-06, + "loss": 0.6042, + "step": 2000 + }, + { + "epoch": 0.9460992907801419, + "grad_norm": 2.526470899581909, + "learning_rate": 4.723965336454185e-06, + "loss": 0.6167, + "step": 2001 + }, + { + "epoch": 0.9465721040189126, + "grad_norm": 2.504506826400757, + "learning_rate": 4.723680323357618e-06, + "loss": 0.6061, + "step": 2002 + }, + { + "epoch": 0.9470449172576832, + "grad_norm": 3.0547544956207275, + "learning_rate": 4.723395171802228e-06, + "loss": 0.6619, + "step": 2003 + }, + { + "epoch": 0.9475177304964539, + "grad_norm": 2.8692407608032227, + "learning_rate": 4.723109881805771e-06, + "loss": 0.5985, + "step": 2004 + }, + { + "epoch": 0.9479905437352246, + "grad_norm": 2.7929654121398926, + "learning_rate": 4.7228244533860094e-06, + "loss": 0.5869, + "step": 2005 + }, + { + "epoch": 0.9484633569739953, + "grad_norm": 2.764869451522827, + "learning_rate": 4.7225388865607146e-06, + "loss": 0.6288, + "step": 2006 + }, + { + "epoch": 0.948936170212766, + "grad_norm": 2.7656404972076416, + "learning_rate": 4.722253181347671e-06, + "loss": 0.5831, + "step": 2007 + }, + { + "epoch": 0.9494089834515367, + "grad_norm": 2.6698336601257324, + "learning_rate": 4.7219673377646635e-06, + "loss": 0.6087, + "step": 2008 + }, + { + "epoch": 0.9498817966903074, + "grad_norm": 2.524935722351074, + "learning_rate": 4.7216813558294946e-06, + "loss": 0.5675, + "step": 2009 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 2.5998785495758057, + "learning_rate": 4.721395235559969e-06, + "loss": 0.5667, + "step": 2010 + }, + { + "epoch": 0.9508274231678487, + "grad_norm": 2.758021354675293, + "learning_rate": 4.721108976973902e-06, + "loss": 0.4931, + "step": 2011 + }, + { + "epoch": 0.9513002364066194, + "grad_norm": 2.767695903778076, + "learning_rate": 4.72082258008912e-06, + "loss": 0.5778, + "step": 2012 + }, + { + "epoch": 0.9517730496453901, + "grad_norm": 2.982314348220825, + "learning_rate": 4.720536044923453e-06, + "loss": 0.6096, + "step": 2013 + }, + { + "epoch": 0.9522458628841608, + "grad_norm": 2.7608799934387207, + "learning_rate": 4.720249371494743e-06, + "loss": 0.6242, + "step": 2014 + }, + { + "epoch": 0.9527186761229315, + "grad_norm": 2.60054349899292, + "learning_rate": 4.71996255982084e-06, + "loss": 0.6249, + "step": 2015 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 2.654355764389038, + "learning_rate": 4.719675609919603e-06, + "loss": 0.6327, + "step": 2016 + }, + { + "epoch": 0.9536643026004729, + "grad_norm": 2.589404582977295, + "learning_rate": 4.719388521808899e-06, + "loss": 0.6357, + "step": 2017 + }, + { + "epoch": 0.9541371158392435, + "grad_norm": 2.8016581535339355, + "learning_rate": 4.719101295506603e-06, + "loss": 0.5901, + "step": 2018 + }, + { + "epoch": 0.9546099290780142, + "grad_norm": 
3.1408045291900635, + "learning_rate": 4.7188139310306e-06, + "loss": 0.598, + "step": 2019 + }, + { + "epoch": 0.9550827423167849, + "grad_norm": 2.7432665824890137, + "learning_rate": 4.718526428398783e-06, + "loss": 0.5508, + "step": 2020 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 2.947800874710083, + "learning_rate": 4.718238787629053e-06, + "loss": 0.6439, + "step": 2021 + }, + { + "epoch": 0.9560283687943263, + "grad_norm": 2.50828218460083, + "learning_rate": 4.71795100873932e-06, + "loss": 0.5441, + "step": 2022 + }, + { + "epoch": 0.956501182033097, + "grad_norm": 2.8558974266052246, + "learning_rate": 4.717663091747503e-06, + "loss": 0.5416, + "step": 2023 + }, + { + "epoch": 0.9569739952718677, + "grad_norm": 2.4803316593170166, + "learning_rate": 4.71737503667153e-06, + "loss": 0.5317, + "step": 2024 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 4.36754035949707, + "learning_rate": 4.717086843529336e-06, + "loss": 0.5808, + "step": 2025 + }, + { + "epoch": 0.957919621749409, + "grad_norm": 2.730185031890869, + "learning_rate": 4.7167985123388665e-06, + "loss": 0.5257, + "step": 2026 + }, + { + "epoch": 0.9583924349881797, + "grad_norm": 2.8136069774627686, + "learning_rate": 4.716510043118074e-06, + "loss": 0.5836, + "step": 2027 + }, + { + "epoch": 0.9588652482269504, + "grad_norm": 2.793975353240967, + "learning_rate": 4.71622143588492e-06, + "loss": 0.5706, + "step": 2028 + }, + { + "epoch": 0.9593380614657211, + "grad_norm": 2.3883821964263916, + "learning_rate": 4.7159326906573745e-06, + "loss": 0.5291, + "step": 2029 + }, + { + "epoch": 0.9598108747044918, + "grad_norm": 2.6135976314544678, + "learning_rate": 4.715643807453417e-06, + "loss": 0.6199, + "step": 2030 + }, + { + "epoch": 0.9602836879432625, + "grad_norm": 2.6245670318603516, + "learning_rate": 4.715354786291035e-06, + "loss": 0.5585, + "step": 2031 + }, + { + "epoch": 0.9607565011820332, + "grad_norm": 2.7870967388153076, + "learning_rate": 4.715065627188225e-06, + "loss": 0.6196, + "step": 2032 + }, + { + "epoch": 0.9612293144208038, + "grad_norm": 2.6983911991119385, + "learning_rate": 4.714776330162991e-06, + "loss": 0.6424, + "step": 2033 + }, + { + "epoch": 0.9617021276595744, + "grad_norm": 2.3221919536590576, + "learning_rate": 4.7144868952333465e-06, + "loss": 0.568, + "step": 2034 + }, + { + "epoch": 0.9621749408983451, + "grad_norm": 2.9408178329467773, + "learning_rate": 4.714197322417314e-06, + "loss": 0.6175, + "step": 2035 + }, + { + "epoch": 0.9626477541371158, + "grad_norm": 2.404057264328003, + "learning_rate": 4.713907611732921e-06, + "loss": 0.4943, + "step": 2036 + }, + { + "epoch": 0.9631205673758865, + "grad_norm": 3.547607660293579, + "learning_rate": 4.71361776319821e-06, + "loss": 0.5488, + "step": 2037 + }, + { + "epoch": 0.9635933806146572, + "grad_norm": 2.679614543914795, + "learning_rate": 4.713327776831227e-06, + "loss": 0.6234, + "step": 2038 + }, + { + "epoch": 0.9640661938534278, + "grad_norm": 2.526914119720459, + "learning_rate": 4.7130376526500286e-06, + "loss": 0.5891, + "step": 2039 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 2.6953470706939697, + "learning_rate": 4.71274739067268e-06, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 0.9650118203309692, + "grad_norm": 2.546660900115967, + "learning_rate": 4.712456990917254e-06, + "loss": 0.6185, + "step": 2041 + }, + { + "epoch": 0.9654846335697399, + "grad_norm": 3.3920490741729736, + "learning_rate": 4.712166453401832e-06, + "loss": 0.587, + "step": 2042 + }, + { + "epoch": 
0.9659574468085106, + "grad_norm": 2.5961573123931885, + "learning_rate": 4.711875778144504e-06, + "loss": 0.6105, + "step": 2043 + }, + { + "epoch": 0.9664302600472813, + "grad_norm": 2.5111498832702637, + "learning_rate": 4.711584965163372e-06, + "loss": 0.5533, + "step": 2044 + }, + { + "epoch": 0.966903073286052, + "grad_norm": 2.4878132343292236, + "learning_rate": 4.7112940144765405e-06, + "loss": 0.5604, + "step": 2045 + }, + { + "epoch": 0.9673758865248226, + "grad_norm": 2.5714077949523926, + "learning_rate": 4.711002926102128e-06, + "loss": 0.5794, + "step": 2046 + }, + { + "epoch": 0.9678486997635933, + "grad_norm": 2.7069091796875, + "learning_rate": 4.710711700058257e-06, + "loss": 0.594, + "step": 2047 + }, + { + "epoch": 0.968321513002364, + "grad_norm": 2.8104631900787354, + "learning_rate": 4.710420336363063e-06, + "loss": 0.6247, + "step": 2048 + }, + { + "epoch": 0.9687943262411347, + "grad_norm": 2.8464386463165283, + "learning_rate": 4.7101288350346865e-06, + "loss": 0.6162, + "step": 2049 + }, + { + "epoch": 0.9692671394799054, + "grad_norm": 2.7187976837158203, + "learning_rate": 4.709837196091279e-06, + "loss": 0.6109, + "step": 2050 + }, + { + "epoch": 0.9697399527186761, + "grad_norm": 2.556734085083008, + "learning_rate": 4.709545419550999e-06, + "loss": 0.6297, + "step": 2051 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 2.937195062637329, + "learning_rate": 4.709253505432014e-06, + "loss": 0.6862, + "step": 2052 + }, + { + "epoch": 0.9706855791962175, + "grad_norm": 2.792175531387329, + "learning_rate": 4.7089614537525015e-06, + "loss": 0.6105, + "step": 2053 + }, + { + "epoch": 0.9711583924349881, + "grad_norm": 2.625636100769043, + "learning_rate": 4.708669264530644e-06, + "loss": 0.5849, + "step": 2054 + }, + { + "epoch": 0.9716312056737588, + "grad_norm": 2.6752610206604004, + "learning_rate": 4.708376937784637e-06, + "loss": 0.5949, + "step": 2055 + }, + { + "epoch": 0.9721040189125295, + "grad_norm": 2.6072793006896973, + "learning_rate": 4.708084473532681e-06, + "loss": 0.5776, + "step": 2056 + }, + { + "epoch": 0.9725768321513002, + "grad_norm": 2.728632926940918, + "learning_rate": 4.707791871792988e-06, + "loss": 0.6352, + "step": 2057 + }, + { + "epoch": 0.9730496453900709, + "grad_norm": 2.5841758251190186, + "learning_rate": 4.707499132583775e-06, + "loss": 0.5488, + "step": 2058 + }, + { + "epoch": 0.9735224586288416, + "grad_norm": 2.8464293479919434, + "learning_rate": 4.707206255923271e-06, + "loss": 0.7051, + "step": 2059 + }, + { + "epoch": 0.9739952718676123, + "grad_norm": 2.547297239303589, + "learning_rate": 4.706913241829712e-06, + "loss": 0.5937, + "step": 2060 + }, + { + "epoch": 0.9744680851063829, + "grad_norm": 2.6572306156158447, + "learning_rate": 4.706620090321341e-06, + "loss": 0.6041, + "step": 2061 + }, + { + "epoch": 0.9749408983451536, + "grad_norm": 2.3262805938720703, + "learning_rate": 4.706326801416414e-06, + "loss": 0.5144, + "step": 2062 + }, + { + "epoch": 0.9754137115839243, + "grad_norm": 2.9693965911865234, + "learning_rate": 4.706033375133191e-06, + "loss": 0.551, + "step": 2063 + }, + { + "epoch": 0.975886524822695, + "grad_norm": 2.5993731021881104, + "learning_rate": 4.7057398114899435e-06, + "loss": 0.6143, + "step": 2064 + }, + { + "epoch": 0.9763593380614657, + "grad_norm": 2.453336477279663, + "learning_rate": 4.70544611050495e-06, + "loss": 0.6093, + "step": 2065 + }, + { + "epoch": 0.9768321513002364, + "grad_norm": 2.898629665374756, + "learning_rate": 4.705152272196497e-06, + "loss": 0.6007, + 
"step": 2066 + }, + { + "epoch": 0.9773049645390071, + "grad_norm": 2.7990612983703613, + "learning_rate": 4.7048582965828815e-06, + "loss": 0.6687, + "step": 2067 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 2.635284423828125, + "learning_rate": 4.704564183682408e-06, + "loss": 0.5564, + "step": 2068 + }, + { + "epoch": 0.9782505910165484, + "grad_norm": 3.014547109603882, + "learning_rate": 4.704269933513389e-06, + "loss": 0.6084, + "step": 2069 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 2.659357786178589, + "learning_rate": 4.703975546094147e-06, + "loss": 0.6031, + "step": 2070 + }, + { + "epoch": 0.9791962174940898, + "grad_norm": 2.326932668685913, + "learning_rate": 4.703681021443013e-06, + "loss": 0.5859, + "step": 2071 + }, + { + "epoch": 0.9796690307328605, + "grad_norm": 2.958803653717041, + "learning_rate": 4.7033863595783235e-06, + "loss": 0.5586, + "step": 2072 + }, + { + "epoch": 0.9801418439716312, + "grad_norm": 2.921386957168579, + "learning_rate": 4.703091560518427e-06, + "loss": 0.6126, + "step": 2073 + }, + { + "epoch": 0.9806146572104019, + "grad_norm": 2.6500775814056396, + "learning_rate": 4.702796624281679e-06, + "loss": 0.5678, + "step": 2074 + }, + { + "epoch": 0.9810874704491725, + "grad_norm": 2.7740228176116943, + "learning_rate": 4.702501550886445e-06, + "loss": 0.6067, + "step": 2075 + }, + { + "epoch": 0.9815602836879432, + "grad_norm": 2.3296213150024414, + "learning_rate": 4.702206340351096e-06, + "loss": 0.5247, + "step": 2076 + }, + { + "epoch": 0.9820330969267139, + "grad_norm": 2.748300790786743, + "learning_rate": 4.701910992694016e-06, + "loss": 0.5197, + "step": 2077 + }, + { + "epoch": 0.9825059101654846, + "grad_norm": 2.250985622406006, + "learning_rate": 4.7016155079335926e-06, + "loss": 0.5214, + "step": 2078 + }, + { + "epoch": 0.9829787234042553, + "grad_norm": 2.389845848083496, + "learning_rate": 4.701319886088226e-06, + "loss": 0.519, + "step": 2079 + }, + { + "epoch": 0.983451536643026, + "grad_norm": 2.818220853805542, + "learning_rate": 4.701024127176322e-06, + "loss": 0.607, + "step": 2080 + }, + { + "epoch": 0.9839243498817967, + "grad_norm": 3.4058034420013428, + "learning_rate": 4.700728231216297e-06, + "loss": 0.5711, + "step": 2081 + }, + { + "epoch": 0.9843971631205674, + "grad_norm": 2.5297787189483643, + "learning_rate": 4.700432198226575e-06, + "loss": 0.5979, + "step": 2082 + }, + { + "epoch": 0.984869976359338, + "grad_norm": 3.0548105239868164, + "learning_rate": 4.7001360282255885e-06, + "loss": 0.6041, + "step": 2083 + }, + { + "epoch": 0.9853427895981087, + "grad_norm": 2.8983733654022217, + "learning_rate": 4.699839721231779e-06, + "loss": 0.5926, + "step": 2084 + }, + { + "epoch": 0.9858156028368794, + "grad_norm": 3.2717764377593994, + "learning_rate": 4.699543277263596e-06, + "loss": 0.6477, + "step": 2085 + }, + { + "epoch": 0.9862884160756501, + "grad_norm": 3.03729248046875, + "learning_rate": 4.699246696339497e-06, + "loss": 0.6786, + "step": 2086 + }, + { + "epoch": 0.9867612293144208, + "grad_norm": 2.852301597595215, + "learning_rate": 4.698949978477951e-06, + "loss": 0.6565, + "step": 2087 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 2.843485116958618, + "learning_rate": 4.698653123697431e-06, + "loss": 0.6627, + "step": 2088 + }, + { + "epoch": 0.9877068557919622, + "grad_norm": 2.6315064430236816, + "learning_rate": 4.698356132016423e-06, + "loss": 0.6577, + "step": 2089 + }, + { + "epoch": 0.9881796690307328, + "grad_norm": 2.7482151985168457, + "learning_rate": 
4.698059003453417e-06, + "loss": 0.5514, + "step": 2090 + }, + { + "epoch": 0.9886524822695035, + "grad_norm": 2.826673746109009, + "learning_rate": 4.6977617380269145e-06, + "loss": 0.565, + "step": 2091 + }, + { + "epoch": 0.9891252955082742, + "grad_norm": 3.0273752212524414, + "learning_rate": 4.697464335755427e-06, + "loss": 0.6331, + "step": 2092 + }, + { + "epoch": 0.9895981087470449, + "grad_norm": 2.7551653385162354, + "learning_rate": 4.6971667966574695e-06, + "loss": 0.6486, + "step": 2093 + }, + { + "epoch": 0.9900709219858156, + "grad_norm": 2.656299114227295, + "learning_rate": 4.696869120751571e-06, + "loss": 0.6562, + "step": 2094 + }, + { + "epoch": 0.9905437352245863, + "grad_norm": 2.785322904586792, + "learning_rate": 4.696571308056265e-06, + "loss": 0.5892, + "step": 2095 + }, + { + "epoch": 0.991016548463357, + "grad_norm": 2.9334635734558105, + "learning_rate": 4.696273358590095e-06, + "loss": 0.6346, + "step": 2096 + }, + { + "epoch": 0.9914893617021276, + "grad_norm": 2.7944300174713135, + "learning_rate": 4.695975272371613e-06, + "loss": 0.5859, + "step": 2097 + }, + { + "epoch": 0.9919621749408983, + "grad_norm": 2.5416972637176514, + "learning_rate": 4.695677049419381e-06, + "loss": 0.5658, + "step": 2098 + }, + { + "epoch": 0.992434988179669, + "grad_norm": 2.4056856632232666, + "learning_rate": 4.695378689751966e-06, + "loss": 0.5121, + "step": 2099 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 2.614548683166504, + "learning_rate": 4.695080193387948e-06, + "loss": 0.5961, + "step": 2100 + }, + { + "epoch": 0.9933806146572104, + "grad_norm": 2.8966517448425293, + "learning_rate": 4.69478156034591e-06, + "loss": 0.5985, + "step": 2101 + }, + { + "epoch": 0.9938534278959811, + "grad_norm": 2.9514098167419434, + "learning_rate": 4.694482790644448e-06, + "loss": 0.5677, + "step": 2102 + }, + { + "epoch": 0.9943262411347518, + "grad_norm": 2.4326791763305664, + "learning_rate": 4.694183884302165e-06, + "loss": 0.5698, + "step": 2103 + }, + { + "epoch": 0.9947990543735225, + "grad_norm": 2.9242892265319824, + "learning_rate": 4.6938848413376735e-06, + "loss": 0.6245, + "step": 2104 + }, + { + "epoch": 0.9952718676122931, + "grad_norm": 2.9134104251861572, + "learning_rate": 4.693585661769593e-06, + "loss": 0.6164, + "step": 2105 + }, + { + "epoch": 0.9957446808510638, + "grad_norm": 2.472564458847046, + "learning_rate": 4.693286345616551e-06, + "loss": 0.5616, + "step": 2106 + }, + { + "epoch": 0.9962174940898345, + "grad_norm": 3.2456448078155518, + "learning_rate": 4.692986892897186e-06, + "loss": 0.6977, + "step": 2107 + }, + { + "epoch": 0.9966903073286052, + "grad_norm": 3.4032769203186035, + "learning_rate": 4.692687303630143e-06, + "loss": 0.643, + "step": 2108 + }, + { + "epoch": 0.9971631205673759, + "grad_norm": 2.722200870513916, + "learning_rate": 4.692387577834076e-06, + "loss": 0.5873, + "step": 2109 + }, + { + "epoch": 0.9976359338061466, + "grad_norm": 2.687532663345337, + "learning_rate": 4.692087715527648e-06, + "loss": 0.5423, + "step": 2110 + }, + { + "epoch": 0.9981087470449173, + "grad_norm": 2.578613042831421, + "learning_rate": 4.6917877167295305e-06, + "loss": 0.5689, + "step": 2111 + }, + { + "epoch": 0.9985815602836879, + "grad_norm": 3.1806094646453857, + "learning_rate": 4.691487581458402e-06, + "loss": 0.6133, + "step": 2112 + }, + { + "epoch": 0.9990543735224586, + "grad_norm": 2.4449520111083984, + "learning_rate": 4.691187309732952e-06, + "loss": 0.5841, + "step": 2113 + }, + { + "epoch": 0.9995271867612293, + "grad_norm": 
2.908749580383301, + "learning_rate": 4.690886901571875e-06, + "loss": 0.534, + "step": 2114 + }, + { + "epoch": 1.0, + "grad_norm": 4.019968032836914, + "learning_rate": 4.6905863569938785e-06, + "loss": 0.596, + "step": 2115 + } + ], + "logging_steps": 1, + "max_steps": 12690, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 2115, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.341936104473887e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2115/training_args.bin b/checkpoint-2115/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc --- /dev/null +++ b/checkpoint-2115/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6 +size 8056 diff --git a/checkpoint-2115/zero_to_fp32.py b/checkpoint-2115/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-2115/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + 
state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + 
frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * 
world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. 
i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-4230/README.md b/checkpoint-4230/README.md new file mode 100644 index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4 --- /dev/null +++ b/checkpoint-4230/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-4230/adapter_config.json b/checkpoint-4230/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a --- /dev/null +++ b/checkpoint-4230/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4230/adapter_model.safetensors b/checkpoint-4230/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..420d07ca284a2570953e1bd2322de57b2b701082 --- /dev/null +++ b/checkpoint-4230/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e32d7e7fb1567458c31490ab395d2f18a8f2a2690e95b59b9e604be0d81e8bc +size 3443586272 diff --git a/checkpoint-4230/global_step4230/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 
b/checkpoint-4230/global_step4230/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f48fbb730ff80865dd6aded2e67d5ce04cf2d15a --- /dev/null +++ b/checkpoint-4230/global_step4230/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a5a654f580570a7d079875df3d7a10ca68d7313b922e4c6d48890d70c339aa2 +size 20661195036 diff --git a/checkpoint-4230/global_step4230/mp_rank_00_model_states.pt b/checkpoint-4230/global_step4230/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6df6bf0bc1f5de93b6204a0cbdf465d83d2245e4 --- /dev/null +++ b/checkpoint-4230/global_step4230/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f90d26a9e90140c4f5a4396c1ef868c0e735530fe319c146a64e9e907214f2d +size 3555326777 diff --git a/checkpoint-4230/latest b/checkpoint-4230/latest new file mode 100644 index 0000000000000000000000000000000000000000..f2e9527786b2225f95d976a4280b0a1cffe4254a --- /dev/null +++ b/checkpoint-4230/latest @@ -0,0 +1 @@ +global_step4230 \ No newline at end of file diff --git a/checkpoint-4230/rng_state.pth b/checkpoint-4230/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bb90bc95800d8247bb004ebf48ee2f7d06c22c57 --- /dev/null +++ b/checkpoint-4230/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a422aae9cec4d59c271c92acda4a907bc97c3691c619213443cf5eb6a7b483f5 +size 14244 diff --git a/checkpoint-4230/scheduler.pt b/checkpoint-4230/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..92005bd02a40f30b4b2615a6f0e6bb7f48ac4ce5 --- /dev/null +++ b/checkpoint-4230/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37808eff2730899ea378e6807a984be98b958d7ff33bf2192d4b87ce8d6f1fb2 +size 1064 diff --git a/checkpoint-4230/special_tokens_map.json b/checkpoint-4230/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-4230/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-4230/tokenizer.json b/checkpoint-4230/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-4230/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-4230/tokenizer_config.json b/checkpoint-4230/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5 --- /dev/null +++ b/checkpoint-4230/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": 
"<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": 
"<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": 
"<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": 
"<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": 
"<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": 
"<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": 
"<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": 
"<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": 
"<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": 
"<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-4230/trainer_state.json b/checkpoint-4230/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..378b191d53a48520b41e4a2d095e1445ea939ca6 --- /dev/null +++ b/checkpoint-4230/trainer_state.json @@ -0,0 +1,29643 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 4230, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00047281323877068556, + "grad_norm": 5.163570880889893, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.4628, + "step": 1 + }, + { + "epoch": 0.0009456264775413711, + "grad_norm": 6.298020839691162, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.5003, + "step": 2 + }, + { + "epoch": 0.0014184397163120568, + "grad_norm": 5.853623390197754, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.4495, + "step": 3 + }, + { + "epoch": 0.0018912529550827422, + "grad_norm": 5.456025123596191, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.3798, + "step": 4 + }, + { + "epoch": 0.002364066193853428, + "grad_norm": 5.757407188415527, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.4515, + "step": 5 + }, + { + "epoch": 0.0028368794326241137, + "grad_norm": 5.872277736663818, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4424, + "step": 6 + }, + { + "epoch": 0.003309692671394799, + "grad_norm": 6.7816009521484375, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.4004, + "step": 7 + }, + { + "epoch": 0.0037825059101654845, + "grad_norm": 6.229667663574219, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.4494, + "step": 8 + }, + { + "epoch": 0.00425531914893617, + "grad_norm": 5.336202621459961, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.3916, + "step": 9 + }, + { + "epoch": 0.004728132387706856, + "grad_norm": 5.589445114135742, + "learning_rate": 5.000000000000001e-07, + "loss": 1.2318, + "step": 10 + }, + { + "epoch": 0.005200945626477541, + "grad_norm": 5.720539569854736, + "learning_rate": 5.5e-07, + "loss": 1.4367, + "step": 11 + }, + { + "epoch": 0.005673758865248227, + "grad_norm": 5.913913726806641, + "learning_rate": 6.000000000000001e-07, + "loss": 1.342, + "step": 12 + }, + { + "epoch": 0.006146572104018913, + "grad_norm": 5.899744987487793, + "learning_rate": 6.5e-07, + "loss": 1.4307, + "step": 13 + }, + { + "epoch": 0.006619385342789598, + "grad_norm": 5.571037292480469, + "learning_rate": 7.000000000000001e-07, + "loss": 1.3372, + "step": 14 + }, + { + "epoch": 0.0070921985815602835, + "grad_norm": 5.480010509490967, + "learning_rate": 7.5e-07, + "loss": 1.3923, + "step": 15 + }, + { + "epoch": 0.007565011820330969, + "grad_norm": 5.254702091217041, + "learning_rate": 8.000000000000001e-07, + "loss": 1.2928, + "step": 16 + }, + { + "epoch": 0.008037825059101654, + "grad_norm": 
6.090312480926514, + "learning_rate": 8.500000000000001e-07, + "loss": 1.4984, + "step": 17 + }, + { + "epoch": 0.00851063829787234, + "grad_norm": 5.689319610595703, + "learning_rate": 9.000000000000001e-07, + "loss": 1.4108, + "step": 18 + }, + { + "epoch": 0.008983451536643027, + "grad_norm": 5.386685848236084, + "learning_rate": 9.500000000000001e-07, + "loss": 1.425, + "step": 19 + }, + { + "epoch": 0.009456264775413711, + "grad_norm": 6.451584815979004, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5507, + "step": 20 + }, + { + "epoch": 0.009929078014184398, + "grad_norm": 5.37647008895874, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.4109, + "step": 21 + }, + { + "epoch": 0.010401891252955082, + "grad_norm": 4.716553211212158, + "learning_rate": 1.1e-06, + "loss": 1.2028, + "step": 22 + }, + { + "epoch": 0.010874704491725768, + "grad_norm": 4.950989723205566, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.3043, + "step": 23 + }, + { + "epoch": 0.011347517730496455, + "grad_norm": 4.688975811004639, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.2708, + "step": 24 + }, + { + "epoch": 0.01182033096926714, + "grad_norm": 4.905868053436279, + "learning_rate": 1.25e-06, + "loss": 1.3268, + "step": 25 + }, + { + "epoch": 0.012293144208037825, + "grad_norm": 4.503395080566406, + "learning_rate": 1.3e-06, + "loss": 1.1799, + "step": 26 + }, + { + "epoch": 0.01276595744680851, + "grad_norm": 4.77382230758667, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.3882, + "step": 27 + }, + { + "epoch": 0.013238770685579196, + "grad_norm": 4.734329700469971, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.3476, + "step": 28 + }, + { + "epoch": 0.013711583924349883, + "grad_norm": 4.775066375732422, + "learning_rate": 1.45e-06, + "loss": 1.2429, + "step": 29 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 4.978334426879883, + "learning_rate": 1.5e-06, + "loss": 1.2119, + "step": 30 + }, + { + "epoch": 0.014657210401891253, + "grad_norm": 4.506785869598389, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.3157, + "step": 31 + }, + { + "epoch": 0.015130023640661938, + "grad_norm": 4.007757186889648, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.1451, + "step": 32 + }, + { + "epoch": 0.015602836879432624, + "grad_norm": 3.6621618270874023, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.093, + "step": 33 + }, + { + "epoch": 0.01607565011820331, + "grad_norm": 3.8733766078948975, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.2289, + "step": 34 + }, + { + "epoch": 0.016548463356973995, + "grad_norm": 4.3391900062561035, + "learning_rate": 1.75e-06, + "loss": 1.1453, + "step": 35 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 3.287623643875122, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.0257, + "step": 36 + }, + { + "epoch": 0.017494089834515367, + "grad_norm": 3.591721773147583, + "learning_rate": 1.85e-06, + "loss": 0.9976, + "step": 37 + }, + { + "epoch": 0.017966903073286054, + "grad_norm": 4.028271675109863, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.0773, + "step": 38 + }, + { + "epoch": 0.018439716312056736, + "grad_norm": 3.3543951511383057, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.1677, + "step": 39 + }, + { + "epoch": 0.018912529550827423, + "grad_norm": 3.807624340057373, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1232, + "step": 40 + }, + { + "epoch": 0.01938534278959811, + "grad_norm": 4.242797374725342, + "learning_rate": 2.05e-06, + "loss": 
1.1819, + "step": 41 + }, + { + "epoch": 0.019858156028368795, + "grad_norm": 3.4574992656707764, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9878, + "step": 42 + }, + { + "epoch": 0.02033096926713948, + "grad_norm": 3.906695604324341, + "learning_rate": 2.15e-06, + "loss": 1.0592, + "step": 43 + }, + { + "epoch": 0.020803782505910164, + "grad_norm": 3.7543163299560547, + "learning_rate": 2.2e-06, + "loss": 1.0309, + "step": 44 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 3.3777148723602295, + "learning_rate": 2.25e-06, + "loss": 1.0664, + "step": 45 + }, + { + "epoch": 0.021749408983451537, + "grad_norm": 3.6003634929656982, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.0482, + "step": 46 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 3.3961377143859863, + "learning_rate": 2.35e-06, + "loss": 1.0252, + "step": 47 + }, + { + "epoch": 0.02269503546099291, + "grad_norm": 3.1601035594940186, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.0435, + "step": 48 + }, + { + "epoch": 0.023167848699763592, + "grad_norm": 3.4192967414855957, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.0935, + "step": 49 + }, + { + "epoch": 0.02364066193853428, + "grad_norm": 3.1225922107696533, + "learning_rate": 2.5e-06, + "loss": 0.8988, + "step": 50 + }, + { + "epoch": 0.024113475177304965, + "grad_norm": 3.1423380374908447, + "learning_rate": 2.55e-06, + "loss": 1.0159, + "step": 51 + }, + { + "epoch": 0.02458628841607565, + "grad_norm": 3.4782402515411377, + "learning_rate": 2.6e-06, + "loss": 1.0231, + "step": 52 + }, + { + "epoch": 0.025059101654846337, + "grad_norm": 3.8362693786621094, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.0725, + "step": 53 + }, + { + "epoch": 0.02553191489361702, + "grad_norm": 3.033294916152954, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.9377, + "step": 54 + }, + { + "epoch": 0.026004728132387706, + "grad_norm": 3.849741220474243, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.0046, + "step": 55 + }, + { + "epoch": 0.026477541371158392, + "grad_norm": 3.141876220703125, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9226, + "step": 56 + }, + { + "epoch": 0.02695035460992908, + "grad_norm": 2.773594856262207, + "learning_rate": 2.85e-06, + "loss": 0.8662, + "step": 57 + }, + { + "epoch": 0.027423167848699765, + "grad_norm": 3.1460225582122803, + "learning_rate": 2.9e-06, + "loss": 0.9304, + "step": 58 + }, + { + "epoch": 0.027895981087470448, + "grad_norm": 3.293583631515503, + "learning_rate": 2.95e-06, + "loss": 1.0374, + "step": 59 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 3.8190863132476807, + "learning_rate": 3e-06, + "loss": 0.971, + "step": 60 + }, + { + "epoch": 0.02884160756501182, + "grad_norm": 3.4566776752471924, + "learning_rate": 3.05e-06, + "loss": 0.9631, + "step": 61 + }, + { + "epoch": 0.029314420803782507, + "grad_norm": 3.355741500854492, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.0097, + "step": 62 + }, + { + "epoch": 0.029787234042553193, + "grad_norm": 3.29746675491333, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.9459, + "step": 63 + }, + { + "epoch": 0.030260047281323876, + "grad_norm": 3.3122968673706055, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.8594, + "step": 64 + }, + { + "epoch": 0.030732860520094562, + "grad_norm": 3.477701187133789, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.9197, + "step": 65 + }, + { + "epoch": 0.031205673758865248, + "grad_norm": 3.3363406658172607, + "learning_rate": 
3.3000000000000006e-06, + "loss": 0.9478, + "step": 66 + }, + { + "epoch": 0.03167848699763593, + "grad_norm": 4.143295764923096, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0534, + "step": 67 + }, + { + "epoch": 0.03215130023640662, + "grad_norm": 3.2363274097442627, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9454, + "step": 68 + }, + { + "epoch": 0.032624113475177303, + "grad_norm": 3.198746681213379, + "learning_rate": 3.45e-06, + "loss": 0.9388, + "step": 69 + }, + { + "epoch": 0.03309692671394799, + "grad_norm": 3.5751023292541504, + "learning_rate": 3.5e-06, + "loss": 0.9444, + "step": 70 + }, + { + "epoch": 0.033569739952718676, + "grad_norm": 3.1745729446411133, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8683, + "step": 71 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 3.3210883140563965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.8811, + "step": 72 + }, + { + "epoch": 0.03451536643026005, + "grad_norm": 3.2502429485321045, + "learning_rate": 3.65e-06, + "loss": 1.0012, + "step": 73 + }, + { + "epoch": 0.034988179669030735, + "grad_norm": 3.44598126411438, + "learning_rate": 3.7e-06, + "loss": 0.9217, + "step": 74 + }, + { + "epoch": 0.03546099290780142, + "grad_norm": 3.439117431640625, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8976, + "step": 75 + }, + { + "epoch": 0.03593380614657211, + "grad_norm": 3.523627758026123, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8996, + "step": 76 + }, + { + "epoch": 0.03640661938534279, + "grad_norm": 3.3716015815734863, + "learning_rate": 3.85e-06, + "loss": 0.9061, + "step": 77 + }, + { + "epoch": 0.03687943262411347, + "grad_norm": 3.33518385887146, + "learning_rate": 3.900000000000001e-06, + "loss": 0.9371, + "step": 78 + }, + { + "epoch": 0.03735224586288416, + "grad_norm": 3.833829879760742, + "learning_rate": 3.95e-06, + "loss": 0.9669, + "step": 79 + }, + { + "epoch": 0.037825059101654845, + "grad_norm": 3.260446786880493, + "learning_rate": 4.000000000000001e-06, + "loss": 0.9449, + "step": 80 + }, + { + "epoch": 0.03829787234042553, + "grad_norm": 3.532451629638672, + "learning_rate": 4.05e-06, + "loss": 0.897, + "step": 81 + }, + { + "epoch": 0.03877068557919622, + "grad_norm": 3.1156492233276367, + "learning_rate": 4.1e-06, + "loss": 0.8463, + "step": 82 + }, + { + "epoch": 0.039243498817966904, + "grad_norm": 2.8801751136779785, + "learning_rate": 4.15e-06, + "loss": 0.8616, + "step": 83 + }, + { + "epoch": 0.03971631205673759, + "grad_norm": 3.072476863861084, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.8387, + "step": 84 + }, + { + "epoch": 0.04018912529550828, + "grad_norm": 2.9601376056671143, + "learning_rate": 4.25e-06, + "loss": 0.8538, + "step": 85 + }, + { + "epoch": 0.04066193853427896, + "grad_norm": 3.521664619445801, + "learning_rate": 4.3e-06, + "loss": 0.8894, + "step": 86 + }, + { + "epoch": 0.04113475177304964, + "grad_norm": 3.2670981884002686, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8387, + "step": 87 + }, + { + "epoch": 0.04160756501182033, + "grad_norm": 3.422089099884033, + "learning_rate": 4.4e-06, + "loss": 0.7728, + "step": 88 + }, + { + "epoch": 0.042080378250591015, + "grad_norm": 3.414034128189087, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7968, + "step": 89 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 4.234285354614258, + "learning_rate": 4.5e-06, + "loss": 0.8502, + "step": 90 + }, + { + "epoch": 0.04302600472813239, + "grad_norm": 3.1446919441223145, + "learning_rate": 
4.5500000000000005e-06, + "loss": 0.8236, + "step": 91 + }, + { + "epoch": 0.043498817966903074, + "grad_norm": 3.683443307876587, + "learning_rate": 4.600000000000001e-06, + "loss": 0.9792, + "step": 92 + }, + { + "epoch": 0.04397163120567376, + "grad_norm": 3.664219617843628, + "learning_rate": 4.65e-06, + "loss": 0.8743, + "step": 93 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 3.369479179382324, + "learning_rate": 4.7e-06, + "loss": 0.8741, + "step": 94 + }, + { + "epoch": 0.04491725768321513, + "grad_norm": 3.694949150085449, + "learning_rate": 4.75e-06, + "loss": 0.7574, + "step": 95 + }, + { + "epoch": 0.04539007092198582, + "grad_norm": 3.5144498348236084, + "learning_rate": 4.800000000000001e-06, + "loss": 0.9934, + "step": 96 + }, + { + "epoch": 0.0458628841607565, + "grad_norm": 3.164451837539673, + "learning_rate": 4.85e-06, + "loss": 0.7463, + "step": 97 + }, + { + "epoch": 0.046335697399527184, + "grad_norm": 3.222785472869873, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7698, + "step": 98 + }, + { + "epoch": 0.04680851063829787, + "grad_norm": 2.9129555225372314, + "learning_rate": 4.95e-06, + "loss": 0.7856, + "step": 99 + }, + { + "epoch": 0.04728132387706856, + "grad_norm": 3.5061235427856445, + "learning_rate": 5e-06, + "loss": 0.8588, + "step": 100 + }, + { + "epoch": 0.04775413711583924, + "grad_norm": 3.2805044651031494, + "learning_rate": 4.999999922167982e-06, + "loss": 0.7643, + "step": 101 + }, + { + "epoch": 0.04822695035460993, + "grad_norm": 3.5461678504943848, + "learning_rate": 4.999999688671929e-06, + "loss": 0.8253, + "step": 102 + }, + { + "epoch": 0.048699763593380616, + "grad_norm": 3.2238264083862305, + "learning_rate": 4.99999929951186e-06, + "loss": 0.7622, + "step": 103 + }, + { + "epoch": 0.0491725768321513, + "grad_norm": 3.818955898284912, + "learning_rate": 4.999998754687795e-06, + "loss": 0.8471, + "step": 104 + }, + { + "epoch": 0.04964539007092199, + "grad_norm": 3.1252424716949463, + "learning_rate": 4.99999805419977e-06, + "loss": 0.8409, + "step": 105 + }, + { + "epoch": 0.050118203309692674, + "grad_norm": 3.604283571243286, + "learning_rate": 4.999997198047828e-06, + "loss": 0.9027, + "step": 106 + }, + { + "epoch": 0.050591016548463354, + "grad_norm": 3.6752424240112305, + "learning_rate": 4.999996186232023e-06, + "loss": 0.9336, + "step": 107 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 3.517557144165039, + "learning_rate": 4.9999950187524184e-06, + "loss": 0.8351, + "step": 108 + }, + { + "epoch": 0.051536643026004726, + "grad_norm": 3.427285671234131, + "learning_rate": 4.999993695609085e-06, + "loss": 0.8457, + "step": 109 + }, + { + "epoch": 0.05200945626477541, + "grad_norm": 3.2792510986328125, + "learning_rate": 4.999992216802107e-06, + "loss": 0.8391, + "step": 110 + }, + { + "epoch": 0.0524822695035461, + "grad_norm": 3.581094741821289, + "learning_rate": 4.999990582331576e-06, + "loss": 0.7533, + "step": 111 + }, + { + "epoch": 0.052955082742316785, + "grad_norm": 3.1667377948760986, + "learning_rate": 4.999988792197593e-06, + "loss": 0.9562, + "step": 112 + }, + { + "epoch": 0.05342789598108747, + "grad_norm": 3.3609890937805176, + "learning_rate": 4.99998684640027e-06, + "loss": 0.8181, + "step": 113 + }, + { + "epoch": 0.05390070921985816, + "grad_norm": 3.260627269744873, + "learning_rate": 4.999984744939729e-06, + "loss": 0.8012, + "step": 114 + }, + { + "epoch": 0.054373522458628844, + "grad_norm": 3.4535653591156006, + "learning_rate": 4.9999824878160985e-06, + "loss": 0.919, + "step": 
115 + }, + { + "epoch": 0.05484633569739953, + "grad_norm": 3.4880740642547607, + "learning_rate": 4.999980075029522e-06, + "loss": 0.8114, + "step": 116 + }, + { + "epoch": 0.05531914893617021, + "grad_norm": 3.2546932697296143, + "learning_rate": 4.999977506580147e-06, + "loss": 0.8274, + "step": 117 + }, + { + "epoch": 0.055791962174940896, + "grad_norm": 3.2762744426727295, + "learning_rate": 4.999974782468136e-06, + "loss": 0.9018, + "step": 118 + }, + { + "epoch": 0.05626477541371158, + "grad_norm": 3.42825984954834, + "learning_rate": 4.999971902693657e-06, + "loss": 0.8262, + "step": 119 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 3.082496404647827, + "learning_rate": 4.99996886725689e-06, + "loss": 0.8181, + "step": 120 + }, + { + "epoch": 0.057210401891252954, + "grad_norm": 3.322869300842285, + "learning_rate": 4.9999656761580225e-06, + "loss": 0.8382, + "step": 121 + }, + { + "epoch": 0.05768321513002364, + "grad_norm": 3.6365339756011963, + "learning_rate": 4.9999623293972555e-06, + "loss": 0.7489, + "step": 122 + }, + { + "epoch": 0.05815602836879433, + "grad_norm": 3.376352548599243, + "learning_rate": 4.999958826974796e-06, + "loss": 0.9012, + "step": 123 + }, + { + "epoch": 0.05862884160756501, + "grad_norm": 3.49088716506958, + "learning_rate": 4.999955168890862e-06, + "loss": 0.8999, + "step": 124 + }, + { + "epoch": 0.0591016548463357, + "grad_norm": 3.3265068531036377, + "learning_rate": 4.999951355145682e-06, + "loss": 0.8161, + "step": 125 + }, + { + "epoch": 0.059574468085106386, + "grad_norm": 3.697282314300537, + "learning_rate": 4.999947385739493e-06, + "loss": 0.9623, + "step": 126 + }, + { + "epoch": 0.06004728132387707, + "grad_norm": 2.7901928424835205, + "learning_rate": 4.999943260672542e-06, + "loss": 0.7371, + "step": 127 + }, + { + "epoch": 0.06052009456264775, + "grad_norm": 3.110319137573242, + "learning_rate": 4.999938979945086e-06, + "loss": 0.715, + "step": 128 + }, + { + "epoch": 0.06099290780141844, + "grad_norm": 3.2211520671844482, + "learning_rate": 4.999934543557392e-06, + "loss": 0.8888, + "step": 129 + }, + { + "epoch": 0.061465721040189124, + "grad_norm": 3.2466187477111816, + "learning_rate": 4.999929951509735e-06, + "loss": 0.9389, + "step": 130 + }, + { + "epoch": 0.06193853427895981, + "grad_norm": 3.3574399948120117, + "learning_rate": 4.999925203802403e-06, + "loss": 0.8263, + "step": 131 + }, + { + "epoch": 0.062411347517730496, + "grad_norm": 3.275601625442505, + "learning_rate": 4.99992030043569e-06, + "loss": 0.8338, + "step": 132 + }, + { + "epoch": 0.06288416075650118, + "grad_norm": 3.6011312007904053, + "learning_rate": 4.999915241409902e-06, + "loss": 0.8351, + "step": 133 + }, + { + "epoch": 0.06335697399527186, + "grad_norm": 2.969011068344116, + "learning_rate": 4.999910026725352e-06, + "loss": 0.79, + "step": 134 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 3.690784454345703, + "learning_rate": 4.999904656382369e-06, + "loss": 0.8209, + "step": 135 + }, + { + "epoch": 0.06430260047281323, + "grad_norm": 3.3363115787506104, + "learning_rate": 4.999899130381283e-06, + "loss": 0.858, + "step": 136 + }, + { + "epoch": 0.06477541371158392, + "grad_norm": 3.206881523132324, + "learning_rate": 4.9998934487224405e-06, + "loss": 0.834, + "step": 137 + }, + { + "epoch": 0.06524822695035461, + "grad_norm": 2.773146152496338, + "learning_rate": 4.999887611406195e-06, + "loss": 0.7576, + "step": 138 + }, + { + "epoch": 0.0657210401891253, + "grad_norm": 3.307725667953491, + "learning_rate": 
4.999881618432908e-06, + "loss": 0.7487, + "step": 139 + }, + { + "epoch": 0.06619385342789598, + "grad_norm": 4.273657321929932, + "learning_rate": 4.999875469802956e-06, + "loss": 0.8176, + "step": 140 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 3.0898005962371826, + "learning_rate": 4.999869165516719e-06, + "loss": 0.7578, + "step": 141 + }, + { + "epoch": 0.06713947990543735, + "grad_norm": 3.25150990486145, + "learning_rate": 4.9998627055745915e-06, + "loss": 0.7873, + "step": 142 + }, + { + "epoch": 0.06761229314420804, + "grad_norm": 2.9705755710601807, + "learning_rate": 4.999856089976974e-06, + "loss": 0.6473, + "step": 143 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 3.5658507347106934, + "learning_rate": 4.9998493187242804e-06, + "loss": 0.855, + "step": 144 + }, + { + "epoch": 0.06855791962174941, + "grad_norm": 3.3994076251983643, + "learning_rate": 4.99984239181693e-06, + "loss": 0.7926, + "step": 145 + }, + { + "epoch": 0.0690307328605201, + "grad_norm": 2.8266260623931885, + "learning_rate": 4.999835309255357e-06, + "loss": 0.7564, + "step": 146 + }, + { + "epoch": 0.06950354609929078, + "grad_norm": 3.1143875122070312, + "learning_rate": 4.999828071039999e-06, + "loss": 0.8398, + "step": 147 + }, + { + "epoch": 0.06997635933806147, + "grad_norm": 2.9364278316497803, + "learning_rate": 4.99982067717131e-06, + "loss": 0.7381, + "step": 148 + }, + { + "epoch": 0.07044917257683216, + "grad_norm": 3.4155616760253906, + "learning_rate": 4.999813127649748e-06, + "loss": 0.7933, + "step": 149 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 4.371236324310303, + "learning_rate": 4.999805422475784e-06, + "loss": 0.8292, + "step": 150 + }, + { + "epoch": 0.07139479905437353, + "grad_norm": 3.3967185020446777, + "learning_rate": 4.999797561649897e-06, + "loss": 0.8712, + "step": 151 + }, + { + "epoch": 0.07186761229314421, + "grad_norm": 3.343303680419922, + "learning_rate": 4.999789545172578e-06, + "loss": 0.8177, + "step": 152 + }, + { + "epoch": 0.07234042553191489, + "grad_norm": 3.040235757827759, + "learning_rate": 4.999781373044325e-06, + "loss": 0.7379, + "step": 153 + }, + { + "epoch": 0.07281323877068557, + "grad_norm": 3.4069204330444336, + "learning_rate": 4.999773045265647e-06, + "loss": 0.7939, + "step": 154 + }, + { + "epoch": 0.07328605200945626, + "grad_norm": 3.1939475536346436, + "learning_rate": 4.999764561837063e-06, + "loss": 0.8037, + "step": 155 + }, + { + "epoch": 0.07375886524822695, + "grad_norm": 4.452004909515381, + "learning_rate": 4.999755922759101e-06, + "loss": 0.8421, + "step": 156 + }, + { + "epoch": 0.07423167848699763, + "grad_norm": 3.2031240463256836, + "learning_rate": 4.999747128032298e-06, + "loss": 0.794, + "step": 157 + }, + { + "epoch": 0.07470449172576832, + "grad_norm": 3.175920009613037, + "learning_rate": 4.999738177657203e-06, + "loss": 0.759, + "step": 158 + }, + { + "epoch": 0.075177304964539, + "grad_norm": 3.7679688930511475, + "learning_rate": 4.9997290716343725e-06, + "loss": 0.8174, + "step": 159 + }, + { + "epoch": 0.07565011820330969, + "grad_norm": 3.7020037174224854, + "learning_rate": 4.999719809964373e-06, + "loss": 0.7116, + "step": 160 + }, + { + "epoch": 0.07612293144208038, + "grad_norm": 4.357471942901611, + "learning_rate": 4.999710392647783e-06, + "loss": 0.7649, + "step": 161 + }, + { + "epoch": 0.07659574468085106, + "grad_norm": 3.3439087867736816, + "learning_rate": 4.999700819685187e-06, + "loss": 0.7907, + "step": 162 + }, + { + "epoch": 0.07706855791962175, + "grad_norm": 
3.210815191268921, + "learning_rate": 4.999691091077182e-06, + "loss": 0.8446, + "step": 163 + }, + { + "epoch": 0.07754137115839244, + "grad_norm": 3.1029553413391113, + "learning_rate": 4.9996812068243735e-06, + "loss": 0.7232, + "step": 164 + }, + { + "epoch": 0.07801418439716312, + "grad_norm": 2.9389400482177734, + "learning_rate": 4.999671166927378e-06, + "loss": 0.7413, + "step": 165 + }, + { + "epoch": 0.07848699763593381, + "grad_norm": 3.7062697410583496, + "learning_rate": 4.9996609713868185e-06, + "loss": 0.8773, + "step": 166 + }, + { + "epoch": 0.0789598108747045, + "grad_norm": 3.2768924236297607, + "learning_rate": 4.999650620203332e-06, + "loss": 0.8046, + "step": 167 + }, + { + "epoch": 0.07943262411347518, + "grad_norm": 3.380373001098633, + "learning_rate": 4.999640113377561e-06, + "loss": 0.7529, + "step": 168 + }, + { + "epoch": 0.07990543735224587, + "grad_norm": 3.520022392272949, + "learning_rate": 4.999629450910162e-06, + "loss": 0.7352, + "step": 169 + }, + { + "epoch": 0.08037825059101655, + "grad_norm": 3.43269419670105, + "learning_rate": 4.999618632801796e-06, + "loss": 0.9371, + "step": 170 + }, + { + "epoch": 0.08085106382978724, + "grad_norm": 3.555877923965454, + "learning_rate": 4.99960765905314e-06, + "loss": 0.8276, + "step": 171 + }, + { + "epoch": 0.08132387706855793, + "grad_norm": 3.597050189971924, + "learning_rate": 4.999596529664874e-06, + "loss": 0.8164, + "step": 172 + }, + { + "epoch": 0.0817966903073286, + "grad_norm": 3.2002956867218018, + "learning_rate": 4.999585244637693e-06, + "loss": 0.7824, + "step": 173 + }, + { + "epoch": 0.08226950354609928, + "grad_norm": 3.527275562286377, + "learning_rate": 4.999573803972299e-06, + "loss": 0.8033, + "step": 174 + }, + { + "epoch": 0.08274231678486997, + "grad_norm": 3.5184452533721924, + "learning_rate": 4.999562207669405e-06, + "loss": 0.724, + "step": 175 + }, + { + "epoch": 0.08321513002364066, + "grad_norm": 3.6635067462921143, + "learning_rate": 4.999550455729732e-06, + "loss": 0.819, + "step": 176 + }, + { + "epoch": 0.08368794326241134, + "grad_norm": 3.192399740219116, + "learning_rate": 4.999538548154012e-06, + "loss": 0.7999, + "step": 177 + }, + { + "epoch": 0.08416075650118203, + "grad_norm": 3.0946953296661377, + "learning_rate": 4.999526484942988e-06, + "loss": 0.7367, + "step": 178 + }, + { + "epoch": 0.08463356973995272, + "grad_norm": 2.847198009490967, + "learning_rate": 4.99951426609741e-06, + "loss": 0.7536, + "step": 179 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.7674827575683594, + "learning_rate": 4.999501891618037e-06, + "loss": 0.701, + "step": 180 + }, + { + "epoch": 0.08557919621749409, + "grad_norm": 3.357933521270752, + "learning_rate": 4.999489361505643e-06, + "loss": 0.8331, + "step": 181 + }, + { + "epoch": 0.08605200945626477, + "grad_norm": 3.1464426517486572, + "learning_rate": 4.999476675761004e-06, + "loss": 0.7931, + "step": 182 + }, + { + "epoch": 0.08652482269503546, + "grad_norm": 3.310697078704834, + "learning_rate": 4.999463834384915e-06, + "loss": 0.753, + "step": 183 + }, + { + "epoch": 0.08699763593380615, + "grad_norm": 2.9794881343841553, + "learning_rate": 4.999450837378171e-06, + "loss": 0.7091, + "step": 184 + }, + { + "epoch": 0.08747044917257683, + "grad_norm": 3.0776889324188232, + "learning_rate": 4.999437684741584e-06, + "loss": 0.7226, + "step": 185 + }, + { + "epoch": 0.08794326241134752, + "grad_norm": 3.6657519340515137, + "learning_rate": 4.999424376475972e-06, + "loss": 0.845, + "step": 186 + }, + { + "epoch": 
0.0884160756501182, + "grad_norm": 3.872718572616577, + "learning_rate": 4.999410912582164e-06, + "loss": 0.812, + "step": 187 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.9184508323669434, + "learning_rate": 4.9993972930609976e-06, + "loss": 0.6823, + "step": 188 + }, + { + "epoch": 0.08936170212765958, + "grad_norm": 3.5567142963409424, + "learning_rate": 4.999383517913321e-06, + "loss": 0.7614, + "step": 189 + }, + { + "epoch": 0.08983451536643026, + "grad_norm": 3.3688533306121826, + "learning_rate": 4.999369587139992e-06, + "loss": 0.858, + "step": 190 + }, + { + "epoch": 0.09030732860520095, + "grad_norm": 2.893223524093628, + "learning_rate": 4.99935550074188e-06, + "loss": 0.6761, + "step": 191 + }, + { + "epoch": 0.09078014184397164, + "grad_norm": 3.400225877761841, + "learning_rate": 4.999341258719859e-06, + "loss": 0.7531, + "step": 192 + }, + { + "epoch": 0.09125295508274232, + "grad_norm": 3.6167714595794678, + "learning_rate": 4.999326861074817e-06, + "loss": 0.8164, + "step": 193 + }, + { + "epoch": 0.091725768321513, + "grad_norm": 4.325016498565674, + "learning_rate": 4.9993123078076506e-06, + "loss": 0.7069, + "step": 194 + }, + { + "epoch": 0.09219858156028368, + "grad_norm": 3.195317029953003, + "learning_rate": 4.999297598919266e-06, + "loss": 0.726, + "step": 195 + }, + { + "epoch": 0.09267139479905437, + "grad_norm": 3.146530866622925, + "learning_rate": 4.999282734410579e-06, + "loss": 0.7888, + "step": 196 + }, + { + "epoch": 0.09314420803782505, + "grad_norm": 3.5166752338409424, + "learning_rate": 4.999267714282515e-06, + "loss": 0.8473, + "step": 197 + }, + { + "epoch": 0.09361702127659574, + "grad_norm": 3.3140196800231934, + "learning_rate": 4.99925253853601e-06, + "loss": 0.7233, + "step": 198 + }, + { + "epoch": 0.09408983451536643, + "grad_norm": 3.0318164825439453, + "learning_rate": 4.999237207172008e-06, + "loss": 0.7543, + "step": 199 + }, + { + "epoch": 0.09456264775413711, + "grad_norm": 3.662214756011963, + "learning_rate": 4.999221720191464e-06, + "loss": 0.7783, + "step": 200 + }, + { + "epoch": 0.0950354609929078, + "grad_norm": 3.452078104019165, + "learning_rate": 4.9992060775953425e-06, + "loss": 0.7868, + "step": 201 + }, + { + "epoch": 0.09550827423167849, + "grad_norm": 3.4051287174224854, + "learning_rate": 4.999190279384617e-06, + "loss": 0.7849, + "step": 202 + }, + { + "epoch": 0.09598108747044917, + "grad_norm": 3.1377196311950684, + "learning_rate": 4.999174325560271e-06, + "loss": 0.8364, + "step": 203 + }, + { + "epoch": 0.09645390070921986, + "grad_norm": 3.129473924636841, + "learning_rate": 4.999158216123299e-06, + "loss": 0.7458, + "step": 204 + }, + { + "epoch": 0.09692671394799054, + "grad_norm": 3.169548749923706, + "learning_rate": 4.999141951074703e-06, + "loss": 0.7256, + "step": 205 + }, + { + "epoch": 0.09739952718676123, + "grad_norm": 3.186009168624878, + "learning_rate": 4.999125530415495e-06, + "loss": 0.783, + "step": 206 + }, + { + "epoch": 0.09787234042553192, + "grad_norm": 3.0995123386383057, + "learning_rate": 4.9991089541467e-06, + "loss": 0.7519, + "step": 207 + }, + { + "epoch": 0.0983451536643026, + "grad_norm": 3.1854088306427, + "learning_rate": 4.999092222269348e-06, + "loss": 0.7444, + "step": 208 + }, + { + "epoch": 0.09881796690307329, + "grad_norm": 3.1512246131896973, + "learning_rate": 4.999075334784482e-06, + "loss": 0.7882, + "step": 209 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 3.6199698448181152, + "learning_rate": 4.999058291693153e-06, + "loss": 0.8048, + "step": 
210 + }, + { + "epoch": 0.09976359338061466, + "grad_norm": 2.956907272338867, + "learning_rate": 4.999041092996422e-06, + "loss": 0.7663, + "step": 211 + }, + { + "epoch": 0.10023640661938535, + "grad_norm": 3.3493971824645996, + "learning_rate": 4.99902373869536e-06, + "loss": 0.7639, + "step": 212 + }, + { + "epoch": 0.10070921985815603, + "grad_norm": 3.144812822341919, + "learning_rate": 4.9990062287910475e-06, + "loss": 0.7953, + "step": 213 + }, + { + "epoch": 0.10118203309692671, + "grad_norm": 3.5986971855163574, + "learning_rate": 4.998988563284576e-06, + "loss": 0.8297, + "step": 214 + }, + { + "epoch": 0.1016548463356974, + "grad_norm": 3.447584867477417, + "learning_rate": 4.998970742177044e-06, + "loss": 0.808, + "step": 215 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 3.791353940963745, + "learning_rate": 4.998952765469562e-06, + "loss": 0.8005, + "step": 216 + }, + { + "epoch": 0.10260047281323877, + "grad_norm": 3.4490807056427, + "learning_rate": 4.998934633163247e-06, + "loss": 0.8135, + "step": 217 + }, + { + "epoch": 0.10307328605200945, + "grad_norm": 3.1053314208984375, + "learning_rate": 4.998916345259232e-06, + "loss": 0.7888, + "step": 218 + }, + { + "epoch": 0.10354609929078014, + "grad_norm": 3.407862663269043, + "learning_rate": 4.9988979017586514e-06, + "loss": 0.7099, + "step": 219 + }, + { + "epoch": 0.10401891252955082, + "grad_norm": 3.116656541824341, + "learning_rate": 4.998879302662658e-06, + "loss": 0.8344, + "step": 220 + }, + { + "epoch": 0.10449172576832151, + "grad_norm": 3.339264154434204, + "learning_rate": 4.998860547972406e-06, + "loss": 0.8496, + "step": 221 + }, + { + "epoch": 0.1049645390070922, + "grad_norm": 3.251892566680908, + "learning_rate": 4.998841637689066e-06, + "loss": 0.7455, + "step": 222 + }, + { + "epoch": 0.10543735224586288, + "grad_norm": 4.098135471343994, + "learning_rate": 4.998822571813814e-06, + "loss": 0.7772, + "step": 223 + }, + { + "epoch": 0.10591016548463357, + "grad_norm": 3.9871134757995605, + "learning_rate": 4.998803350347837e-06, + "loss": 0.8261, + "step": 224 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 3.2822303771972656, + "learning_rate": 4.998783973292333e-06, + "loss": 0.8623, + "step": 225 + }, + { + "epoch": 0.10685579196217494, + "grad_norm": 3.0356857776641846, + "learning_rate": 4.998764440648507e-06, + "loss": 0.7426, + "step": 226 + }, + { + "epoch": 0.10732860520094563, + "grad_norm": 2.8932785987854004, + "learning_rate": 4.998744752417576e-06, + "loss": 0.6741, + "step": 227 + }, + { + "epoch": 0.10780141843971631, + "grad_norm": 3.085820436477661, + "learning_rate": 4.998724908600767e-06, + "loss": 0.6549, + "step": 228 + }, + { + "epoch": 0.108274231678487, + "grad_norm": 3.135829210281372, + "learning_rate": 4.998704909199314e-06, + "loss": 0.6702, + "step": 229 + }, + { + "epoch": 0.10874704491725769, + "grad_norm": 5.016134262084961, + "learning_rate": 4.9986847542144625e-06, + "loss": 0.7852, + "step": 230 + }, + { + "epoch": 0.10921985815602837, + "grad_norm": 3.9056200981140137, + "learning_rate": 4.998664443647468e-06, + "loss": 0.9654, + "step": 231 + }, + { + "epoch": 0.10969267139479906, + "grad_norm": 3.0880749225616455, + "learning_rate": 4.998643977499595e-06, + "loss": 0.7579, + "step": 232 + }, + { + "epoch": 0.11016548463356975, + "grad_norm": 3.6893601417541504, + "learning_rate": 4.998623355772118e-06, + "loss": 0.713, + "step": 233 + }, + { + "epoch": 0.11063829787234042, + "grad_norm": 4.181536674499512, + "learning_rate": 4.998602578466319e-06, + 
"loss": 0.7331, + "step": 234 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 3.036386728286743, + "learning_rate": 4.998581645583496e-06, + "loss": 0.7115, + "step": 235 + }, + { + "epoch": 0.11158392434988179, + "grad_norm": 3.6333255767822266, + "learning_rate": 4.998560557124948e-06, + "loss": 0.7544, + "step": 236 + }, + { + "epoch": 0.11205673758865248, + "grad_norm": 2.926417827606201, + "learning_rate": 4.9985393130919915e-06, + "loss": 0.715, + "step": 237 + }, + { + "epoch": 0.11252955082742316, + "grad_norm": 2.969158172607422, + "learning_rate": 4.998517913485946e-06, + "loss": 0.7304, + "step": 238 + }, + { + "epoch": 0.11300236406619385, + "grad_norm": 3.5254971981048584, + "learning_rate": 4.9984963583081466e-06, + "loss": 0.7725, + "step": 239 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 3.7840335369110107, + "learning_rate": 4.998474647559936e-06, + "loss": 0.8685, + "step": 240 + }, + { + "epoch": 0.11394799054373522, + "grad_norm": 3.0333125591278076, + "learning_rate": 4.9984527812426625e-06, + "loss": 0.7793, + "step": 241 + }, + { + "epoch": 0.11442080378250591, + "grad_norm": 3.290159225463867, + "learning_rate": 4.99843075935769e-06, + "loss": 0.7158, + "step": 242 + }, + { + "epoch": 0.1148936170212766, + "grad_norm": 3.3935494422912598, + "learning_rate": 4.99840858190639e-06, + "loss": 0.7643, + "step": 243 + }, + { + "epoch": 0.11536643026004728, + "grad_norm": 3.333965539932251, + "learning_rate": 4.998386248890142e-06, + "loss": 0.7255, + "step": 244 + }, + { + "epoch": 0.11583924349881797, + "grad_norm": 2.8129613399505615, + "learning_rate": 4.998363760310339e-06, + "loss": 0.768, + "step": 245 + }, + { + "epoch": 0.11631205673758865, + "grad_norm": 2.8678107261657715, + "learning_rate": 4.998341116168378e-06, + "loss": 0.7403, + "step": 246 + }, + { + "epoch": 0.11678486997635934, + "grad_norm": 2.8898239135742188, + "learning_rate": 4.998318316465672e-06, + "loss": 0.6844, + "step": 247 + }, + { + "epoch": 0.11725768321513003, + "grad_norm": 3.139777898788452, + "learning_rate": 4.998295361203637e-06, + "loss": 0.7936, + "step": 248 + }, + { + "epoch": 0.11773049645390071, + "grad_norm": 3.393721103668213, + "learning_rate": 4.998272250383707e-06, + "loss": 0.8173, + "step": 249 + }, + { + "epoch": 0.1182033096926714, + "grad_norm": 3.240973949432373, + "learning_rate": 4.998248984007318e-06, + "loss": 0.8252, + "step": 250 + }, + { + "epoch": 0.11867612293144209, + "grad_norm": 3.384855031967163, + "learning_rate": 4.998225562075918e-06, + "loss": 0.7244, + "step": 251 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 3.1881816387176514, + "learning_rate": 4.9982019845909675e-06, + "loss": 0.6818, + "step": 252 + }, + { + "epoch": 0.11962174940898346, + "grad_norm": 2.888364553451538, + "learning_rate": 4.998178251553934e-06, + "loss": 0.6753, + "step": 253 + }, + { + "epoch": 0.12009456264775414, + "grad_norm": 3.630093812942505, + "learning_rate": 4.9981543629662944e-06, + "loss": 0.7995, + "step": 254 + }, + { + "epoch": 0.12056737588652482, + "grad_norm": 2.9820947647094727, + "learning_rate": 4.998130318829537e-06, + "loss": 0.7478, + "step": 255 + }, + { + "epoch": 0.1210401891252955, + "grad_norm": 2.7094738483428955, + "learning_rate": 4.998106119145159e-06, + "loss": 0.7237, + "step": 256 + }, + { + "epoch": 0.12151300236406619, + "grad_norm": 3.1808104515075684, + "learning_rate": 4.9980817639146665e-06, + "loss": 0.7915, + "step": 257 + }, + { + "epoch": 0.12198581560283688, + "grad_norm": 3.1661291122436523, + 
"learning_rate": 4.998057253139575e-06, + "loss": 0.8053, + "step": 258 + }, + { + "epoch": 0.12245862884160756, + "grad_norm": 3.528749942779541, + "learning_rate": 4.998032586821413e-06, + "loss": 0.7946, + "step": 259 + }, + { + "epoch": 0.12293144208037825, + "grad_norm": 3.125964879989624, + "learning_rate": 4.998007764961716e-06, + "loss": 0.7569, + "step": 260 + }, + { + "epoch": 0.12340425531914893, + "grad_norm": 3.0778942108154297, + "learning_rate": 4.997982787562029e-06, + "loss": 0.7184, + "step": 261 + }, + { + "epoch": 0.12387706855791962, + "grad_norm": 3.3531930446624756, + "learning_rate": 4.997957654623906e-06, + "loss": 0.7586, + "step": 262 + }, + { + "epoch": 0.1243498817966903, + "grad_norm": 3.229278564453125, + "learning_rate": 4.997932366148913e-06, + "loss": 0.6092, + "step": 263 + }, + { + "epoch": 0.12482269503546099, + "grad_norm": 3.7286155223846436, + "learning_rate": 4.997906922138626e-06, + "loss": 0.7965, + "step": 264 + }, + { + "epoch": 0.12529550827423167, + "grad_norm": 3.300311803817749, + "learning_rate": 4.997881322594628e-06, + "loss": 0.7665, + "step": 265 + }, + { + "epoch": 0.12576832151300235, + "grad_norm": 3.411482572555542, + "learning_rate": 4.9978555675185115e-06, + "loss": 0.7253, + "step": 266 + }, + { + "epoch": 0.12624113475177304, + "grad_norm": 3.0884511470794678, + "learning_rate": 4.9978296569118825e-06, + "loss": 0.659, + "step": 267 + }, + { + "epoch": 0.12671394799054372, + "grad_norm": 3.0652925968170166, + "learning_rate": 4.9978035907763535e-06, + "loss": 0.6739, + "step": 268 + }, + { + "epoch": 0.1271867612293144, + "grad_norm": 3.280555009841919, + "learning_rate": 4.997777369113547e-06, + "loss": 0.8003, + "step": 269 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 2.980860948562622, + "learning_rate": 4.997750991925096e-06, + "loss": 0.7097, + "step": 270 + }, + { + "epoch": 0.12813238770685578, + "grad_norm": 3.301760673522949, + "learning_rate": 4.997724459212644e-06, + "loss": 0.7894, + "step": 271 + }, + { + "epoch": 0.12860520094562647, + "grad_norm": 2.9584903717041016, + "learning_rate": 4.997697770977841e-06, + "loss": 0.733, + "step": 272 + }, + { + "epoch": 0.12907801418439716, + "grad_norm": 3.5632214546203613, + "learning_rate": 4.99767092722235e-06, + "loss": 0.7228, + "step": 273 + }, + { + "epoch": 0.12955082742316784, + "grad_norm": 3.5900983810424805, + "learning_rate": 4.997643927947843e-06, + "loss": 0.7634, + "step": 274 + }, + { + "epoch": 0.13002364066193853, + "grad_norm": 3.332650661468506, + "learning_rate": 4.997616773156e-06, + "loss": 0.797, + "step": 275 + }, + { + "epoch": 0.13049645390070921, + "grad_norm": 3.1094167232513428, + "learning_rate": 4.997589462848512e-06, + "loss": 0.7849, + "step": 276 + }, + { + "epoch": 0.1309692671394799, + "grad_norm": 3.5359463691711426, + "learning_rate": 4.99756199702708e-06, + "loss": 0.6871, + "step": 277 + }, + { + "epoch": 0.1314420803782506, + "grad_norm": 3.190441846847534, + "learning_rate": 4.997534375693414e-06, + "loss": 0.6883, + "step": 278 + }, + { + "epoch": 0.13191489361702127, + "grad_norm": 3.063518762588501, + "learning_rate": 4.997506598849234e-06, + "loss": 0.7586, + "step": 279 + }, + { + "epoch": 0.13238770685579196, + "grad_norm": 3.4112050533294678, + "learning_rate": 4.997478666496269e-06, + "loss": 0.796, + "step": 280 + }, + { + "epoch": 0.13286052009456265, + "grad_norm": 3.231886386871338, + "learning_rate": 4.997450578636259e-06, + "loss": 0.7714, + "step": 281 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 
3.279425621032715, + "learning_rate": 4.9974223352709515e-06, + "loss": 0.7793, + "step": 282 + }, + { + "epoch": 0.13380614657210402, + "grad_norm": 3.2154316902160645, + "learning_rate": 4.9973939364021075e-06, + "loss": 0.791, + "step": 283 + }, + { + "epoch": 0.1342789598108747, + "grad_norm": 3.2090768814086914, + "learning_rate": 4.9973653820314925e-06, + "loss": 0.6433, + "step": 284 + }, + { + "epoch": 0.1347517730496454, + "grad_norm": 3.1712026596069336, + "learning_rate": 4.997336672160886e-06, + "loss": 0.8128, + "step": 285 + }, + { + "epoch": 0.13522458628841608, + "grad_norm": 2.929229497909546, + "learning_rate": 4.997307806792076e-06, + "loss": 0.7594, + "step": 286 + }, + { + "epoch": 0.13569739952718676, + "grad_norm": 3.0363314151763916, + "learning_rate": 4.997278785926859e-06, + "loss": 0.7336, + "step": 287 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 3.1352357864379883, + "learning_rate": 4.997249609567042e-06, + "loss": 0.7225, + "step": 288 + }, + { + "epoch": 0.13664302600472814, + "grad_norm": 3.3171157836914062, + "learning_rate": 4.997220277714442e-06, + "loss": 0.7777, + "step": 289 + }, + { + "epoch": 0.13711583924349882, + "grad_norm": 3.050717353820801, + "learning_rate": 4.997190790370885e-06, + "loss": 0.6836, + "step": 290 + }, + { + "epoch": 0.1375886524822695, + "grad_norm": 3.0297694206237793, + "learning_rate": 4.997161147538208e-06, + "loss": 0.6883, + "step": 291 + }, + { + "epoch": 0.1380614657210402, + "grad_norm": 3.0566554069519043, + "learning_rate": 4.997131349218256e-06, + "loss": 0.6674, + "step": 292 + }, + { + "epoch": 0.13853427895981088, + "grad_norm": 3.799111843109131, + "learning_rate": 4.997101395412885e-06, + "loss": 0.8256, + "step": 293 + }, + { + "epoch": 0.13900709219858157, + "grad_norm": 3.1394248008728027, + "learning_rate": 4.9970712861239576e-06, + "loss": 0.7306, + "step": 294 + }, + { + "epoch": 0.13947990543735225, + "grad_norm": 3.0605666637420654, + "learning_rate": 4.997041021353352e-06, + "loss": 0.7212, + "step": 295 + }, + { + "epoch": 0.13995271867612294, + "grad_norm": 3.8813397884368896, + "learning_rate": 4.997010601102951e-06, + "loss": 0.769, + "step": 296 + }, + { + "epoch": 0.14042553191489363, + "grad_norm": 3.0514819622039795, + "learning_rate": 4.996980025374649e-06, + "loss": 0.7422, + "step": 297 + }, + { + "epoch": 0.1408983451536643, + "grad_norm": 2.9544146060943604, + "learning_rate": 4.99694929417035e-06, + "loss": 0.6912, + "step": 298 + }, + { + "epoch": 0.141371158392435, + "grad_norm": 3.2635602951049805, + "learning_rate": 4.996918407491966e-06, + "loss": 0.7395, + "step": 299 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 3.373882532119751, + "learning_rate": 4.996887365341423e-06, + "loss": 0.7799, + "step": 300 + }, + { + "epoch": 0.14231678486997637, + "grad_norm": 3.001128673553467, + "learning_rate": 4.996856167720652e-06, + "loss": 0.7168, + "step": 301 + }, + { + "epoch": 0.14278959810874706, + "grad_norm": 3.1026835441589355, + "learning_rate": 4.996824814631595e-06, + "loss": 0.7492, + "step": 302 + }, + { + "epoch": 0.14326241134751774, + "grad_norm": 3.41947603225708, + "learning_rate": 4.996793306076205e-06, + "loss": 0.6659, + "step": 303 + }, + { + "epoch": 0.14373522458628843, + "grad_norm": 3.2272400856018066, + "learning_rate": 4.996761642056444e-06, + "loss": 0.7184, + "step": 304 + }, + { + "epoch": 0.14420803782505912, + "grad_norm": 2.9488935470581055, + "learning_rate": 4.996729822574284e-06, + "loss": 0.7451, + "step": 305 + }, + { + "epoch": 
0.14468085106382977, + "grad_norm": 3.268231153488159, + "learning_rate": 4.9966978476317065e-06, + "loss": 0.7798, + "step": 306 + }, + { + "epoch": 0.14515366430260046, + "grad_norm": 3.9086556434631348, + "learning_rate": 4.996665717230701e-06, + "loss": 0.7871, + "step": 307 + }, + { + "epoch": 0.14562647754137115, + "grad_norm": 3.3483879566192627, + "learning_rate": 4.996633431373269e-06, + "loss": 0.7415, + "step": 308 + }, + { + "epoch": 0.14609929078014183, + "grad_norm": 2.839400053024292, + "learning_rate": 4.99660099006142e-06, + "loss": 0.7192, + "step": 309 + }, + { + "epoch": 0.14657210401891252, + "grad_norm": 3.177302598953247, + "learning_rate": 4.996568393297175e-06, + "loss": 0.755, + "step": 310 + }, + { + "epoch": 0.1470449172576832, + "grad_norm": 3.5477044582366943, + "learning_rate": 4.996535641082563e-06, + "loss": 0.7531, + "step": 311 + }, + { + "epoch": 0.1475177304964539, + "grad_norm": 3.418576717376709, + "learning_rate": 4.996502733419624e-06, + "loss": 0.8009, + "step": 312 + }, + { + "epoch": 0.14799054373522458, + "grad_norm": 3.711341619491577, + "learning_rate": 4.996469670310407e-06, + "loss": 0.7362, + "step": 313 + }, + { + "epoch": 0.14846335697399526, + "grad_norm": 3.2419373989105225, + "learning_rate": 4.99643645175697e-06, + "loss": 0.7761, + "step": 314 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 3.121858835220337, + "learning_rate": 4.996403077761381e-06, + "loss": 0.6495, + "step": 315 + }, + { + "epoch": 0.14940898345153664, + "grad_norm": 3.123054265975952, + "learning_rate": 4.996369548325719e-06, + "loss": 0.7444, + "step": 316 + }, + { + "epoch": 0.14988179669030732, + "grad_norm": 2.780880928039551, + "learning_rate": 4.996335863452072e-06, + "loss": 0.672, + "step": 317 + }, + { + "epoch": 0.150354609929078, + "grad_norm": 3.3738629817962646, + "learning_rate": 4.996302023142536e-06, + "loss": 0.7972, + "step": 318 + }, + { + "epoch": 0.1508274231678487, + "grad_norm": 3.4874777793884277, + "learning_rate": 4.99626802739922e-06, + "loss": 0.8252, + "step": 319 + }, + { + "epoch": 0.15130023640661938, + "grad_norm": 3.7074787616729736, + "learning_rate": 4.9962338762242395e-06, + "loss": 0.8216, + "step": 320 + }, + { + "epoch": 0.15177304964539007, + "grad_norm": 3.281912326812744, + "learning_rate": 4.996199569619721e-06, + "loss": 0.8175, + "step": 321 + }, + { + "epoch": 0.15224586288416075, + "grad_norm": 2.9485340118408203, + "learning_rate": 4.996165107587801e-06, + "loss": 0.707, + "step": 322 + }, + { + "epoch": 0.15271867612293144, + "grad_norm": 3.3757646083831787, + "learning_rate": 4.996130490130625e-06, + "loss": 0.7955, + "step": 323 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 2.962181568145752, + "learning_rate": 4.996095717250349e-06, + "loss": 0.7067, + "step": 324 + }, + { + "epoch": 0.1536643026004728, + "grad_norm": 3.114272356033325, + "learning_rate": 4.996060788949136e-06, + "loss": 0.7486, + "step": 325 + }, + { + "epoch": 0.1541371158392435, + "grad_norm": 3.0621590614318848, + "learning_rate": 4.996025705229165e-06, + "loss": 0.6547, + "step": 326 + }, + { + "epoch": 0.15460992907801419, + "grad_norm": 2.8745882511138916, + "learning_rate": 4.995990466092616e-06, + "loss": 0.6435, + "step": 327 + }, + { + "epoch": 0.15508274231678487, + "grad_norm": 2.90841007232666, + "learning_rate": 4.995955071541686e-06, + "loss": 0.7331, + "step": 328 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 2.694580316543579, + "learning_rate": 4.9959195215785784e-06, + "loss": 0.6731, + "step": 
329 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 3.158083438873291, + "learning_rate": 4.995883816205507e-06, + "loss": 0.7257, + "step": 330 + }, + { + "epoch": 0.15650118203309693, + "grad_norm": 3.3234715461730957, + "learning_rate": 4.995847955424694e-06, + "loss": 0.7389, + "step": 331 + }, + { + "epoch": 0.15697399527186762, + "grad_norm": 2.9406495094299316, + "learning_rate": 4.995811939238373e-06, + "loss": 0.643, + "step": 332 + }, + { + "epoch": 0.1574468085106383, + "grad_norm": 3.3191726207733154, + "learning_rate": 4.995775767648785e-06, + "loss": 0.7879, + "step": 333 + }, + { + "epoch": 0.157919621749409, + "grad_norm": 3.711925745010376, + "learning_rate": 4.995739440658185e-06, + "loss": 0.7586, + "step": 334 + }, + { + "epoch": 0.15839243498817968, + "grad_norm": 9.573421478271484, + "learning_rate": 4.995702958268833e-06, + "loss": 0.7842, + "step": 335 + }, + { + "epoch": 0.15886524822695036, + "grad_norm": 3.4154508113861084, + "learning_rate": 4.995666320483001e-06, + "loss": 0.6735, + "step": 336 + }, + { + "epoch": 0.15933806146572105, + "grad_norm": 3.4169859886169434, + "learning_rate": 4.995629527302971e-06, + "loss": 0.741, + "step": 337 + }, + { + "epoch": 0.15981087470449173, + "grad_norm": 3.287503242492676, + "learning_rate": 4.9955925787310335e-06, + "loss": 0.7139, + "step": 338 + }, + { + "epoch": 0.16028368794326242, + "grad_norm": 3.288409471511841, + "learning_rate": 4.995555474769488e-06, + "loss": 0.7636, + "step": 339 + }, + { + "epoch": 0.1607565011820331, + "grad_norm": 2.8021693229675293, + "learning_rate": 4.995518215420646e-06, + "loss": 0.5883, + "step": 340 + }, + { + "epoch": 0.1612293144208038, + "grad_norm": 2.7038564682006836, + "learning_rate": 4.995480800686827e-06, + "loss": 0.657, + "step": 341 + }, + { + "epoch": 0.16170212765957448, + "grad_norm": 3.2370235919952393, + "learning_rate": 4.9954432305703615e-06, + "loss": 0.6999, + "step": 342 + }, + { + "epoch": 0.16217494089834517, + "grad_norm": 2.8666412830352783, + "learning_rate": 4.995405505073588e-06, + "loss": 0.7199, + "step": 343 + }, + { + "epoch": 0.16264775413711585, + "grad_norm": 3.6467232704162598, + "learning_rate": 4.995367624198856e-06, + "loss": 0.7317, + "step": 344 + }, + { + "epoch": 0.16312056737588654, + "grad_norm": 2.7576327323913574, + "learning_rate": 4.9953295879485246e-06, + "loss": 0.647, + "step": 345 + }, + { + "epoch": 0.1635933806146572, + "grad_norm": 2.922232151031494, + "learning_rate": 4.995291396324959e-06, + "loss": 0.6686, + "step": 346 + }, + { + "epoch": 0.16406619385342788, + "grad_norm": 2.8693501949310303, + "learning_rate": 4.995253049330542e-06, + "loss": 0.6756, + "step": 347 + }, + { + "epoch": 0.16453900709219857, + "grad_norm": 3.671865701675415, + "learning_rate": 4.995214546967658e-06, + "loss": 0.7347, + "step": 348 + }, + { + "epoch": 0.16501182033096926, + "grad_norm": 3.024219274520874, + "learning_rate": 4.995175889238706e-06, + "loss": 0.7547, + "step": 349 + }, + { + "epoch": 0.16548463356973994, + "grad_norm": 2.8470778465270996, + "learning_rate": 4.995137076146091e-06, + "loss": 0.6764, + "step": 350 + }, + { + "epoch": 0.16595744680851063, + "grad_norm": 2.905057907104492, + "learning_rate": 4.9950981076922324e-06, + "loss": 0.6814, + "step": 351 + }, + { + "epoch": 0.16643026004728131, + "grad_norm": 3.504377841949463, + "learning_rate": 4.995058983879555e-06, + "loss": 0.7145, + "step": 352 + }, + { + "epoch": 0.166903073286052, + "grad_norm": 3.0029661655426025, + "learning_rate": 4.995019704710495e-06, 
+ "loss": 0.7114, + "step": 353 + }, + { + "epoch": 0.1673758865248227, + "grad_norm": 2.8666274547576904, + "learning_rate": 4.994980270187499e-06, + "loss": 0.7416, + "step": 354 + }, + { + "epoch": 0.16784869976359337, + "grad_norm": 3.1644718647003174, + "learning_rate": 4.994940680313021e-06, + "loss": 0.661, + "step": 355 + }, + { + "epoch": 0.16832151300236406, + "grad_norm": 3.050391674041748, + "learning_rate": 4.994900935089527e-06, + "loss": 0.7243, + "step": 356 + }, + { + "epoch": 0.16879432624113475, + "grad_norm": 2.985466480255127, + "learning_rate": 4.994861034519491e-06, + "loss": 0.6917, + "step": 357 + }, + { + "epoch": 0.16926713947990543, + "grad_norm": 2.909342050552368, + "learning_rate": 4.9948209786053995e-06, + "loss": 0.6636, + "step": 358 + }, + { + "epoch": 0.16973995271867612, + "grad_norm": 3.2214784622192383, + "learning_rate": 4.9947807673497435e-06, + "loss": 0.7903, + "step": 359 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 2.5654983520507812, + "learning_rate": 4.994740400755029e-06, + "loss": 0.6129, + "step": 360 + }, + { + "epoch": 0.1706855791962175, + "grad_norm": 3.775646448135376, + "learning_rate": 4.99469987882377e-06, + "loss": 0.7145, + "step": 361 + }, + { + "epoch": 0.17115839243498818, + "grad_norm": 2.8965413570404053, + "learning_rate": 4.994659201558487e-06, + "loss": 0.7177, + "step": 362 + }, + { + "epoch": 0.17163120567375886, + "grad_norm": 3.485597848892212, + "learning_rate": 4.9946183689617146e-06, + "loss": 0.8107, + "step": 363 + }, + { + "epoch": 0.17210401891252955, + "grad_norm": 3.277839183807373, + "learning_rate": 4.994577381035995e-06, + "loss": 0.691, + "step": 364 + }, + { + "epoch": 0.17257683215130024, + "grad_norm": 2.8807685375213623, + "learning_rate": 4.99453623778388e-06, + "loss": 0.7627, + "step": 365 + }, + { + "epoch": 0.17304964539007092, + "grad_norm": 3.0659940242767334, + "learning_rate": 4.994494939207932e-06, + "loss": 0.6858, + "step": 366 + }, + { + "epoch": 0.1735224586288416, + "grad_norm": 3.0881855487823486, + "learning_rate": 4.994453485310723e-06, + "loss": 0.8212, + "step": 367 + }, + { + "epoch": 0.1739952718676123, + "grad_norm": 2.7199201583862305, + "learning_rate": 4.994411876094832e-06, + "loss": 0.6516, + "step": 368 + }, + { + "epoch": 0.17446808510638298, + "grad_norm": 2.955889940261841, + "learning_rate": 4.994370111562851e-06, + "loss": 0.6579, + "step": 369 + }, + { + "epoch": 0.17494089834515367, + "grad_norm": 3.1321663856506348, + "learning_rate": 4.994328191717382e-06, + "loss": 0.6891, + "step": 370 + }, + { + "epoch": 0.17541371158392435, + "grad_norm": 3.0560388565063477, + "learning_rate": 4.994286116561034e-06, + "loss": 0.7243, + "step": 371 + }, + { + "epoch": 0.17588652482269504, + "grad_norm": 3.1560704708099365, + "learning_rate": 4.994243886096425e-06, + "loss": 0.7262, + "step": 372 + }, + { + "epoch": 0.17635933806146573, + "grad_norm": 2.913541316986084, + "learning_rate": 4.994201500326187e-06, + "loss": 0.7318, + "step": 373 + }, + { + "epoch": 0.1768321513002364, + "grad_norm": 3.098376512527466, + "learning_rate": 4.994158959252958e-06, + "loss": 0.6419, + "step": 374 + }, + { + "epoch": 0.1773049645390071, + "grad_norm": 2.977508544921875, + "learning_rate": 4.994116262879387e-06, + "loss": 0.6709, + "step": 375 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 3.168186902999878, + "learning_rate": 4.994073411208133e-06, + "loss": 0.6608, + "step": 376 + }, + { + "epoch": 0.17825059101654847, + "grad_norm": 3.436844825744629, + 
"learning_rate": 4.994030404241864e-06, + "loss": 0.7227, + "step": 377 + }, + { + "epoch": 0.17872340425531916, + "grad_norm": 2.8998289108276367, + "learning_rate": 4.993987241983258e-06, + "loss": 0.6512, + "step": 378 + }, + { + "epoch": 0.17919621749408984, + "grad_norm": 3.407191514968872, + "learning_rate": 4.993943924435002e-06, + "loss": 0.616, + "step": 379 + }, + { + "epoch": 0.17966903073286053, + "grad_norm": 3.744858741760254, + "learning_rate": 4.993900451599793e-06, + "loss": 0.8599, + "step": 380 + }, + { + "epoch": 0.18014184397163122, + "grad_norm": 3.486283779144287, + "learning_rate": 4.993856823480338e-06, + "loss": 0.6634, + "step": 381 + }, + { + "epoch": 0.1806146572104019, + "grad_norm": 2.895719051361084, + "learning_rate": 4.993813040079355e-06, + "loss": 0.6972, + "step": 382 + }, + { + "epoch": 0.1810874704491726, + "grad_norm": 2.814133882522583, + "learning_rate": 4.993769101399569e-06, + "loss": 0.6271, + "step": 383 + }, + { + "epoch": 0.18156028368794327, + "grad_norm": 2.8609800338745117, + "learning_rate": 4.993725007443715e-06, + "loss": 0.6481, + "step": 384 + }, + { + "epoch": 0.18203309692671396, + "grad_norm": 3.2829644680023193, + "learning_rate": 4.99368075821454e-06, + "loss": 0.7999, + "step": 385 + }, + { + "epoch": 0.18250591016548465, + "grad_norm": 3.1417458057403564, + "learning_rate": 4.993636353714798e-06, + "loss": 0.6972, + "step": 386 + }, + { + "epoch": 0.1829787234042553, + "grad_norm": 3.0679385662078857, + "learning_rate": 4.993591793947256e-06, + "loss": 0.667, + "step": 387 + }, + { + "epoch": 0.183451536643026, + "grad_norm": 3.1387410163879395, + "learning_rate": 4.993547078914686e-06, + "loss": 0.7618, + "step": 388 + }, + { + "epoch": 0.18392434988179668, + "grad_norm": 2.9181406497955322, + "learning_rate": 4.993502208619872e-06, + "loss": 0.7391, + "step": 389 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 2.8952157497406006, + "learning_rate": 4.993457183065611e-06, + "loss": 0.6988, + "step": 390 + }, + { + "epoch": 0.18486997635933805, + "grad_norm": 3.2274813652038574, + "learning_rate": 4.993412002254704e-06, + "loss": 0.688, + "step": 391 + }, + { + "epoch": 0.18534278959810874, + "grad_norm": 3.4693779945373535, + "learning_rate": 4.993366666189965e-06, + "loss": 0.6634, + "step": 392 + }, + { + "epoch": 0.18581560283687942, + "grad_norm": 3.5358526706695557, + "learning_rate": 4.993321174874217e-06, + "loss": 0.7343, + "step": 393 + }, + { + "epoch": 0.1862884160756501, + "grad_norm": 3.013338088989258, + "learning_rate": 4.993275528310292e-06, + "loss": 0.7579, + "step": 394 + }, + { + "epoch": 0.1867612293144208, + "grad_norm": 2.694772720336914, + "learning_rate": 4.993229726501033e-06, + "loss": 0.718, + "step": 395 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 3.070612907409668, + "learning_rate": 4.9931837694492915e-06, + "loss": 0.6438, + "step": 396 + }, + { + "epoch": 0.18770685579196217, + "grad_norm": 2.9193027019500732, + "learning_rate": 4.993137657157928e-06, + "loss": 0.6788, + "step": 397 + }, + { + "epoch": 0.18817966903073285, + "grad_norm": 3.047682046890259, + "learning_rate": 4.993091389629816e-06, + "loss": 0.6826, + "step": 398 + }, + { + "epoch": 0.18865248226950354, + "grad_norm": 2.9629905223846436, + "learning_rate": 4.993044966867834e-06, + "loss": 0.7196, + "step": 399 + }, + { + "epoch": 0.18912529550827423, + "grad_norm": 3.0692050457000732, + "learning_rate": 4.992998388874874e-06, + "loss": 0.7015, + "step": 400 + }, + { + "epoch": 0.1895981087470449, + 
"grad_norm": 3.5427212715148926, + "learning_rate": 4.992951655653836e-06, + "loss": 0.8292, + "step": 401 + }, + { + "epoch": 0.1900709219858156, + "grad_norm": 2.643526554107666, + "learning_rate": 4.992904767207629e-06, + "loss": 0.624, + "step": 402 + }, + { + "epoch": 0.19054373522458629, + "grad_norm": 3.1185996532440186, + "learning_rate": 4.992857723539173e-06, + "loss": 0.7354, + "step": 403 + }, + { + "epoch": 0.19101654846335697, + "grad_norm": 3.006856679916382, + "learning_rate": 4.992810524651398e-06, + "loss": 0.7752, + "step": 404 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 2.9913275241851807, + "learning_rate": 4.9927631705472425e-06, + "loss": 0.7306, + "step": 405 + }, + { + "epoch": 0.19196217494089834, + "grad_norm": 2.6794071197509766, + "learning_rate": 4.992715661229655e-06, + "loss": 0.6136, + "step": 406 + }, + { + "epoch": 0.19243498817966903, + "grad_norm": 3.5933966636657715, + "learning_rate": 4.992667996701593e-06, + "loss": 0.7024, + "step": 407 + }, + { + "epoch": 0.19290780141843972, + "grad_norm": 2.862187623977661, + "learning_rate": 4.992620176966025e-06, + "loss": 0.692, + "step": 408 + }, + { + "epoch": 0.1933806146572104, + "grad_norm": 3.076845407485962, + "learning_rate": 4.9925722020259286e-06, + "loss": 0.7475, + "step": 409 + }, + { + "epoch": 0.1938534278959811, + "grad_norm": 3.372919797897339, + "learning_rate": 4.9925240718842895e-06, + "loss": 0.6886, + "step": 410 + }, + { + "epoch": 0.19432624113475178, + "grad_norm": 2.922977924346924, + "learning_rate": 4.992475786544108e-06, + "loss": 0.7049, + "step": 411 + }, + { + "epoch": 0.19479905437352246, + "grad_norm": 2.908034324645996, + "learning_rate": 4.992427346008387e-06, + "loss": 0.6498, + "step": 412 + }, + { + "epoch": 0.19527186761229315, + "grad_norm": 3.096723794937134, + "learning_rate": 4.992378750280144e-06, + "loss": 0.7151, + "step": 413 + }, + { + "epoch": 0.19574468085106383, + "grad_norm": 2.895237684249878, + "learning_rate": 4.992329999362405e-06, + "loss": 0.7277, + "step": 414 + }, + { + "epoch": 0.19621749408983452, + "grad_norm": 2.718230724334717, + "learning_rate": 4.9922810932582065e-06, + "loss": 0.6375, + "step": 415 + }, + { + "epoch": 0.1966903073286052, + "grad_norm": 3.187743663787842, + "learning_rate": 4.992232031970592e-06, + "loss": 0.6528, + "step": 416 + }, + { + "epoch": 0.1971631205673759, + "grad_norm": 2.996406316757202, + "learning_rate": 4.992182815502616e-06, + "loss": 0.6552, + "step": 417 + }, + { + "epoch": 0.19763593380614658, + "grad_norm": 3.301084041595459, + "learning_rate": 4.992133443857345e-06, + "loss": 0.7061, + "step": 418 + }, + { + "epoch": 0.19810874704491727, + "grad_norm": 3.7874677181243896, + "learning_rate": 4.992083917037853e-06, + "loss": 0.7859, + "step": 419 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 3.124253511428833, + "learning_rate": 4.992034235047222e-06, + "loss": 0.7615, + "step": 420 + }, + { + "epoch": 0.19905437352245864, + "grad_norm": 3.0488970279693604, + "learning_rate": 4.991984397888546e-06, + "loss": 0.6916, + "step": 421 + }, + { + "epoch": 0.19952718676122932, + "grad_norm": 3.1241321563720703, + "learning_rate": 4.991934405564929e-06, + "loss": 0.7055, + "step": 422 + }, + { + "epoch": 0.2, + "grad_norm": 3.396632432937622, + "learning_rate": 4.991884258079484e-06, + "loss": 0.7675, + "step": 423 + }, + { + "epoch": 0.2004728132387707, + "grad_norm": 3.7776873111724854, + "learning_rate": 4.9918339554353316e-06, + "loss": 0.7371, + "step": 424 + }, + { + "epoch": 
0.20094562647754138, + "grad_norm": 3.3356032371520996, + "learning_rate": 4.991783497635606e-06, + "loss": 0.6778, + "step": 425 + }, + { + "epoch": 0.20141843971631207, + "grad_norm": 2.988856792449951, + "learning_rate": 4.9917328846834474e-06, + "loss": 0.6795, + "step": 426 + }, + { + "epoch": 0.20189125295508276, + "grad_norm": 3.264183282852173, + "learning_rate": 4.99168211658201e-06, + "loss": 0.7707, + "step": 427 + }, + { + "epoch": 0.20236406619385341, + "grad_norm": 3.878068208694458, + "learning_rate": 4.991631193334451e-06, + "loss": 0.857, + "step": 428 + }, + { + "epoch": 0.2028368794326241, + "grad_norm": 3.6377553939819336, + "learning_rate": 4.991580114943943e-06, + "loss": 0.8033, + "step": 429 + }, + { + "epoch": 0.2033096926713948, + "grad_norm": 2.95393967628479, + "learning_rate": 4.991528881413667e-06, + "loss": 0.6809, + "step": 430 + }, + { + "epoch": 0.20378250591016547, + "grad_norm": 3.058704376220703, + "learning_rate": 4.9914774927468125e-06, + "loss": 0.6664, + "step": 431 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 2.7783217430114746, + "learning_rate": 4.9914259489465795e-06, + "loss": 0.6478, + "step": 432 + }, + { + "epoch": 0.20472813238770685, + "grad_norm": 2.4825217723846436, + "learning_rate": 4.991374250016177e-06, + "loss": 0.6598, + "step": 433 + }, + { + "epoch": 0.20520094562647753, + "grad_norm": 2.8753600120544434, + "learning_rate": 4.991322395958824e-06, + "loss": 0.6947, + "step": 434 + }, + { + "epoch": 0.20567375886524822, + "grad_norm": 3.2339367866516113, + "learning_rate": 4.99127038677775e-06, + "loss": 0.8201, + "step": 435 + }, + { + "epoch": 0.2061465721040189, + "grad_norm": 2.9065537452697754, + "learning_rate": 4.991218222476193e-06, + "loss": 0.6679, + "step": 436 + }, + { + "epoch": 0.2066193853427896, + "grad_norm": 3.283228874206543, + "learning_rate": 4.991165903057401e-06, + "loss": 0.8039, + "step": 437 + }, + { + "epoch": 0.20709219858156028, + "grad_norm": 3.429872751235962, + "learning_rate": 4.991113428524631e-06, + "loss": 0.7392, + "step": 438 + }, + { + "epoch": 0.20756501182033096, + "grad_norm": 3.118943452835083, + "learning_rate": 4.991060798881152e-06, + "loss": 0.6794, + "step": 439 + }, + { + "epoch": 0.20803782505910165, + "grad_norm": 3.395970106124878, + "learning_rate": 4.99100801413024e-06, + "loss": 0.6862, + "step": 440 + }, + { + "epoch": 0.20851063829787234, + "grad_norm": 2.869191884994507, + "learning_rate": 4.99095507427518e-06, + "loss": 0.6076, + "step": 441 + }, + { + "epoch": 0.20898345153664302, + "grad_norm": 3.1934661865234375, + "learning_rate": 4.990901979319272e-06, + "loss": 0.6927, + "step": 442 + }, + { + "epoch": 0.2094562647754137, + "grad_norm": 2.9068603515625, + "learning_rate": 4.990848729265819e-06, + "loss": 0.6864, + "step": 443 + }, + { + "epoch": 0.2099290780141844, + "grad_norm": 3.0535948276519775, + "learning_rate": 4.9907953241181375e-06, + "loss": 0.6396, + "step": 444 + }, + { + "epoch": 0.21040189125295508, + "grad_norm": 2.871511459350586, + "learning_rate": 4.990741763879554e-06, + "loss": 0.6743, + "step": 445 + }, + { + "epoch": 0.21087470449172577, + "grad_norm": 2.9184393882751465, + "learning_rate": 4.9906880485534015e-06, + "loss": 0.6786, + "step": 446 + }, + { + "epoch": 0.21134751773049645, + "grad_norm": 3.0628271102905273, + "learning_rate": 4.990634178143026e-06, + "loss": 0.6326, + "step": 447 + }, + { + "epoch": 0.21182033096926714, + "grad_norm": 3.7878305912017822, + "learning_rate": 4.990580152651782e-06, + "loss": 0.7944, + "step": 
448 + }, + { + "epoch": 0.21229314420803783, + "grad_norm": 2.8577189445495605, + "learning_rate": 4.990525972083031e-06, + "loss": 0.71, + "step": 449 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 3.307769775390625, + "learning_rate": 4.99047163644015e-06, + "loss": 0.6893, + "step": 450 + }, + { + "epoch": 0.2132387706855792, + "grad_norm": 2.7391717433929443, + "learning_rate": 4.990417145726519e-06, + "loss": 0.712, + "step": 451 + }, + { + "epoch": 0.21371158392434988, + "grad_norm": 2.938044786453247, + "learning_rate": 4.990362499945534e-06, + "loss": 0.7516, + "step": 452 + }, + { + "epoch": 0.21418439716312057, + "grad_norm": 2.7831056118011475, + "learning_rate": 4.990307699100595e-06, + "loss": 0.6168, + "step": 453 + }, + { + "epoch": 0.21465721040189126, + "grad_norm": 2.907977342605591, + "learning_rate": 4.990252743195116e-06, + "loss": 0.6706, + "step": 454 + }, + { + "epoch": 0.21513002364066194, + "grad_norm": 3.7882161140441895, + "learning_rate": 4.990197632232517e-06, + "loss": 0.6847, + "step": 455 + }, + { + "epoch": 0.21560283687943263, + "grad_norm": 2.899716854095459, + "learning_rate": 4.990142366216232e-06, + "loss": 0.6699, + "step": 456 + }, + { + "epoch": 0.21607565011820332, + "grad_norm": 2.907003879547119, + "learning_rate": 4.990086945149701e-06, + "loss": 0.6864, + "step": 457 + }, + { + "epoch": 0.216548463356974, + "grad_norm": 3.2407333850860596, + "learning_rate": 4.9900313690363736e-06, + "loss": 0.692, + "step": 458 + }, + { + "epoch": 0.2170212765957447, + "grad_norm": 2.9055583477020264, + "learning_rate": 4.989975637879712e-06, + "loss": 0.7113, + "step": 459 + }, + { + "epoch": 0.21749408983451538, + "grad_norm": 2.9836206436157227, + "learning_rate": 4.989919751683184e-06, + "loss": 0.6673, + "step": 460 + }, + { + "epoch": 0.21796690307328606, + "grad_norm": 3.371035575866699, + "learning_rate": 4.989863710450273e-06, + "loss": 0.7181, + "step": 461 + }, + { + "epoch": 0.21843971631205675, + "grad_norm": 2.9636635780334473, + "learning_rate": 4.989807514184465e-06, + "loss": 0.6082, + "step": 462 + }, + { + "epoch": 0.21891252955082743, + "grad_norm": 2.9634664058685303, + "learning_rate": 4.9897511628892615e-06, + "loss": 0.7086, + "step": 463 + }, + { + "epoch": 0.21938534278959812, + "grad_norm": 3.154763698577881, + "learning_rate": 4.98969465656817e-06, + "loss": 0.7027, + "step": 464 + }, + { + "epoch": 0.2198581560283688, + "grad_norm": 2.9959890842437744, + "learning_rate": 4.98963799522471e-06, + "loss": 0.6498, + "step": 465 + }, + { + "epoch": 0.2203309692671395, + "grad_norm": 3.5470590591430664, + "learning_rate": 4.989581178862408e-06, + "loss": 0.7199, + "step": 466 + }, + { + "epoch": 0.22080378250591018, + "grad_norm": 7.1873369216918945, + "learning_rate": 4.989524207484802e-06, + "loss": 0.6676, + "step": 467 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 3.1099541187286377, + "learning_rate": 4.98946708109544e-06, + "loss": 0.6785, + "step": 468 + }, + { + "epoch": 0.22174940898345152, + "grad_norm": 2.830991506576538, + "learning_rate": 4.9894097996978795e-06, + "loss": 0.6456, + "step": 469 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.0212316513061523, + "learning_rate": 4.989352363295687e-06, + "loss": 0.6048, + "step": 470 + }, + { + "epoch": 0.2226950354609929, + "grad_norm": 3.18776798248291, + "learning_rate": 4.989294771892437e-06, + "loss": 0.7078, + "step": 471 + }, + { + "epoch": 0.22316784869976358, + "grad_norm": 2.9972598552703857, + "learning_rate": 4.989237025491717e-06, + 
"loss": 0.7082, + "step": 472 + }, + { + "epoch": 0.22364066193853427, + "grad_norm": 3.4935688972473145, + "learning_rate": 4.989179124097123e-06, + "loss": 0.8199, + "step": 473 + }, + { + "epoch": 0.22411347517730495, + "grad_norm": 2.6485543251037598, + "learning_rate": 4.9891210677122595e-06, + "loss": 0.6371, + "step": 474 + }, + { + "epoch": 0.22458628841607564, + "grad_norm": 2.969233512878418, + "learning_rate": 4.989062856340742e-06, + "loss": 0.6879, + "step": 475 + }, + { + "epoch": 0.22505910165484633, + "grad_norm": 2.881875514984131, + "learning_rate": 4.989004489986194e-06, + "loss": 0.7415, + "step": 476 + }, + { + "epoch": 0.225531914893617, + "grad_norm": 2.624540090560913, + "learning_rate": 4.98894596865225e-06, + "loss": 0.6522, + "step": 477 + }, + { + "epoch": 0.2260047281323877, + "grad_norm": 3.61075496673584, + "learning_rate": 4.988887292342555e-06, + "loss": 0.7109, + "step": 478 + }, + { + "epoch": 0.2264775413711584, + "grad_norm": 2.9368972778320312, + "learning_rate": 4.988828461060762e-06, + "loss": 0.6843, + "step": 479 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 3.0670197010040283, + "learning_rate": 4.988769474810533e-06, + "loss": 0.6807, + "step": 480 + }, + { + "epoch": 0.22742316784869976, + "grad_norm": 2.9662792682647705, + "learning_rate": 4.988710333595542e-06, + "loss": 0.6796, + "step": 481 + }, + { + "epoch": 0.22789598108747045, + "grad_norm": 2.971235752105713, + "learning_rate": 4.988651037419472e-06, + "loss": 0.696, + "step": 482 + }, + { + "epoch": 0.22836879432624113, + "grad_norm": 2.931884527206421, + "learning_rate": 4.988591586286013e-06, + "loss": 0.7323, + "step": 483 + }, + { + "epoch": 0.22884160756501182, + "grad_norm": 2.8114213943481445, + "learning_rate": 4.988531980198868e-06, + "loss": 0.6584, + "step": 484 + }, + { + "epoch": 0.2293144208037825, + "grad_norm": 3.2785916328430176, + "learning_rate": 4.98847221916175e-06, + "loss": 0.7514, + "step": 485 + }, + { + "epoch": 0.2297872340425532, + "grad_norm": 3.0520215034484863, + "learning_rate": 4.988412303178377e-06, + "loss": 0.7564, + "step": 486 + }, + { + "epoch": 0.23026004728132388, + "grad_norm": 3.181002616882324, + "learning_rate": 4.988352232252483e-06, + "loss": 0.6768, + "step": 487 + }, + { + "epoch": 0.23073286052009456, + "grad_norm": 3.4953625202178955, + "learning_rate": 4.988292006387805e-06, + "loss": 0.7143, + "step": 488 + }, + { + "epoch": 0.23120567375886525, + "grad_norm": 3.326571226119995, + "learning_rate": 4.988231625588096e-06, + "loss": 0.7318, + "step": 489 + }, + { + "epoch": 0.23167848699763594, + "grad_norm": 3.09614634513855, + "learning_rate": 4.988171089857113e-06, + "loss": 0.6574, + "step": 490 + }, + { + "epoch": 0.23215130023640662, + "grad_norm": 2.7439446449279785, + "learning_rate": 4.9881103991986265e-06, + "loss": 0.6637, + "step": 491 + }, + { + "epoch": 0.2326241134751773, + "grad_norm": 3.0681190490722656, + "learning_rate": 4.988049553616416e-06, + "loss": 0.6326, + "step": 492 + }, + { + "epoch": 0.233096926713948, + "grad_norm": 3.0757341384887695, + "learning_rate": 4.98798855311427e-06, + "loss": 0.695, + "step": 493 + }, + { + "epoch": 0.23356973995271868, + "grad_norm": 2.8637635707855225, + "learning_rate": 4.987927397695985e-06, + "loss": 0.6598, + "step": 494 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 3.3641068935394287, + "learning_rate": 4.9878660873653715e-06, + "loss": 0.7435, + "step": 495 + }, + { + "epoch": 0.23451536643026005, + "grad_norm": 3.5025596618652344, + "learning_rate": 
4.987804622126245e-06, + "loss": 0.735, + "step": 496 + }, + { + "epoch": 0.23498817966903074, + "grad_norm": 2.9298837184906006, + "learning_rate": 4.987743001982434e-06, + "loss": 0.7063, + "step": 497 + }, + { + "epoch": 0.23546099290780143, + "grad_norm": 2.70358943939209, + "learning_rate": 4.987681226937774e-06, + "loss": 0.6799, + "step": 498 + }, + { + "epoch": 0.2359338061465721, + "grad_norm": 3.027871608734131, + "learning_rate": 4.9876192969961125e-06, + "loss": 0.6881, + "step": 499 + }, + { + "epoch": 0.2364066193853428, + "grad_norm": 3.362306594848633, + "learning_rate": 4.987557212161304e-06, + "loss": 0.7906, + "step": 500 + }, + { + "epoch": 0.23687943262411348, + "grad_norm": 3.3136050701141357, + "learning_rate": 4.987494972437217e-06, + "loss": 0.6878, + "step": 501 + }, + { + "epoch": 0.23735224586288417, + "grad_norm": 3.017089605331421, + "learning_rate": 4.9874325778277255e-06, + "loss": 0.7279, + "step": 502 + }, + { + "epoch": 0.23782505910165486, + "grad_norm": 2.8300516605377197, + "learning_rate": 4.987370028336714e-06, + "loss": 0.6864, + "step": 503 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 3.201860189437866, + "learning_rate": 4.987307323968077e-06, + "loss": 0.7531, + "step": 504 + }, + { + "epoch": 0.23877068557919623, + "grad_norm": 2.685396194458008, + "learning_rate": 4.987244464725721e-06, + "loss": 0.5849, + "step": 505 + }, + { + "epoch": 0.23924349881796692, + "grad_norm": 2.8715312480926514, + "learning_rate": 4.987181450613557e-06, + "loss": 0.675, + "step": 506 + }, + { + "epoch": 0.2397163120567376, + "grad_norm": 2.813908815383911, + "learning_rate": 4.987118281635511e-06, + "loss": 0.6841, + "step": 507 + }, + { + "epoch": 0.2401891252955083, + "grad_norm": 3.2738473415374756, + "learning_rate": 4.987054957795514e-06, + "loss": 0.7158, + "step": 508 + }, + { + "epoch": 0.24066193853427895, + "grad_norm": 2.896134376525879, + "learning_rate": 4.986991479097511e-06, + "loss": 0.7542, + "step": 509 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 3.0390403270721436, + "learning_rate": 4.986927845545454e-06, + "loss": 0.6733, + "step": 510 + }, + { + "epoch": 0.24160756501182032, + "grad_norm": 3.0300254821777344, + "learning_rate": 4.9868640571433044e-06, + "loss": 0.722, + "step": 511 + }, + { + "epoch": 0.242080378250591, + "grad_norm": 3.3037352561950684, + "learning_rate": 4.986800113895035e-06, + "loss": 0.6811, + "step": 512 + }, + { + "epoch": 0.2425531914893617, + "grad_norm": 3.0358474254608154, + "learning_rate": 4.986736015804627e-06, + "loss": 0.7348, + "step": 513 + }, + { + "epoch": 0.24302600472813238, + "grad_norm": 3.108792304992676, + "learning_rate": 4.986671762876071e-06, + "loss": 0.6096, + "step": 514 + }, + { + "epoch": 0.24349881796690306, + "grad_norm": 3.1316237449645996, + "learning_rate": 4.986607355113367e-06, + "loss": 0.6357, + "step": 515 + }, + { + "epoch": 0.24397163120567375, + "grad_norm": 3.3095219135284424, + "learning_rate": 4.986542792520528e-06, + "loss": 0.7515, + "step": 516 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 3.4775984287261963, + "learning_rate": 4.986478075101572e-06, + "loss": 0.7104, + "step": 517 + }, + { + "epoch": 0.24491725768321512, + "grad_norm": 3.341708183288574, + "learning_rate": 4.986413202860528e-06, + "loss": 0.7339, + "step": 518 + }, + { + "epoch": 0.2453900709219858, + "grad_norm": 2.9646966457366943, + "learning_rate": 4.986348175801438e-06, + "loss": 0.6032, + "step": 519 + }, + { + "epoch": 0.2458628841607565, + "grad_norm": 
3.1853902339935303, + "learning_rate": 4.986282993928349e-06, + "loss": 0.6925, + "step": 520 + }, + { + "epoch": 0.24633569739952718, + "grad_norm": 3.286909818649292, + "learning_rate": 4.98621765724532e-06, + "loss": 0.7447, + "step": 521 + }, + { + "epoch": 0.24680851063829787, + "grad_norm": 3.2255051136016846, + "learning_rate": 4.986152165756419e-06, + "loss": 0.7747, + "step": 522 + }, + { + "epoch": 0.24728132387706855, + "grad_norm": 3.002352237701416, + "learning_rate": 4.986086519465724e-06, + "loss": 0.6472, + "step": 523 + }, + { + "epoch": 0.24775413711583924, + "grad_norm": 3.4738974571228027, + "learning_rate": 4.986020718377322e-06, + "loss": 0.7381, + "step": 524 + }, + { + "epoch": 0.24822695035460993, + "grad_norm": 3.4470200538635254, + "learning_rate": 4.985954762495312e-06, + "loss": 0.6878, + "step": 525 + }, + { + "epoch": 0.2486997635933806, + "grad_norm": 2.9219350814819336, + "learning_rate": 4.985888651823799e-06, + "loss": 0.6317, + "step": 526 + }, + { + "epoch": 0.2491725768321513, + "grad_norm": 3.061767101287842, + "learning_rate": 4.985822386366899e-06, + "loss": 0.6842, + "step": 527 + }, + { + "epoch": 0.24964539007092199, + "grad_norm": 3.0291247367858887, + "learning_rate": 4.985755966128742e-06, + "loss": 0.6852, + "step": 528 + }, + { + "epoch": 0.25011820330969264, + "grad_norm": 2.964280843734741, + "learning_rate": 4.985689391113457e-06, + "loss": 0.7738, + "step": 529 + }, + { + "epoch": 0.25059101654846333, + "grad_norm": 3.058302164077759, + "learning_rate": 4.9856226613251955e-06, + "loss": 0.6677, + "step": 530 + }, + { + "epoch": 0.251063829787234, + "grad_norm": 3.345141649246216, + "learning_rate": 4.985555776768109e-06, + "loss": 0.7837, + "step": 531 + }, + { + "epoch": 0.2515366430260047, + "grad_norm": 3.565031051635742, + "learning_rate": 4.9854887374463636e-06, + "loss": 0.7231, + "step": 532 + }, + { + "epoch": 0.2520094562647754, + "grad_norm": 2.7953789234161377, + "learning_rate": 4.985421543364132e-06, + "loss": 0.6102, + "step": 533 + }, + { + "epoch": 0.2524822695035461, + "grad_norm": 2.887606620788574, + "learning_rate": 4.9853541945256e-06, + "loss": 0.6289, + "step": 534 + }, + { + "epoch": 0.25295508274231676, + "grad_norm": 3.1480495929718018, + "learning_rate": 4.985286690934961e-06, + "loss": 0.6348, + "step": 535 + }, + { + "epoch": 0.25342789598108745, + "grad_norm": 2.8912761211395264, + "learning_rate": 4.985219032596416e-06, + "loss": 0.595, + "step": 536 + }, + { + "epoch": 0.25390070921985813, + "grad_norm": 2.947936534881592, + "learning_rate": 4.98515121951418e-06, + "loss": 0.6196, + "step": 537 + }, + { + "epoch": 0.2543735224586288, + "grad_norm": 3.1085827350616455, + "learning_rate": 4.985083251692474e-06, + "loss": 0.6387, + "step": 538 + }, + { + "epoch": 0.2548463356973995, + "grad_norm": 3.1688334941864014, + "learning_rate": 4.985015129135531e-06, + "loss": 0.7055, + "step": 539 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 3.075042963027954, + "learning_rate": 4.984946851847593e-06, + "loss": 0.7515, + "step": 540 + }, + { + "epoch": 0.2557919621749409, + "grad_norm": 3.1933093070983887, + "learning_rate": 4.98487841983291e-06, + "loss": 0.7054, + "step": 541 + }, + { + "epoch": 0.25626477541371157, + "grad_norm": 3.043473958969116, + "learning_rate": 4.984809833095744e-06, + "loss": 0.6281, + "step": 542 + }, + { + "epoch": 0.25673758865248225, + "grad_norm": 3.0532584190368652, + "learning_rate": 4.9847410916403645e-06, + "loss": 0.6155, + "step": 543 + }, + { + "epoch": 
0.25721040189125294, + "grad_norm": 3.608480215072632, + "learning_rate": 4.984672195471053e-06, + "loss": 0.7363, + "step": 544 + }, + { + "epoch": 0.2576832151300236, + "grad_norm": 2.7491862773895264, + "learning_rate": 4.9846031445921e-06, + "loss": 0.6594, + "step": 545 + }, + { + "epoch": 0.2581560283687943, + "grad_norm": 2.8602418899536133, + "learning_rate": 4.984533939007802e-06, + "loss": 0.6742, + "step": 546 + }, + { + "epoch": 0.258628841607565, + "grad_norm": 3.1782007217407227, + "learning_rate": 4.98446457872247e-06, + "loss": 0.731, + "step": 547 + }, + { + "epoch": 0.2591016548463357, + "grad_norm": 2.796147584915161, + "learning_rate": 4.984395063740423e-06, + "loss": 0.6617, + "step": 548 + }, + { + "epoch": 0.25957446808510637, + "grad_norm": 2.8392202854156494, + "learning_rate": 4.984325394065991e-06, + "loss": 0.6753, + "step": 549 + }, + { + "epoch": 0.26004728132387706, + "grad_norm": 3.134672164916992, + "learning_rate": 4.984255569703508e-06, + "loss": 0.7222, + "step": 550 + }, + { + "epoch": 0.26052009456264774, + "grad_norm": 2.734330177307129, + "learning_rate": 4.984185590657325e-06, + "loss": 0.6098, + "step": 551 + }, + { + "epoch": 0.26099290780141843, + "grad_norm": 3.739010810852051, + "learning_rate": 4.984115456931798e-06, + "loss": 0.7457, + "step": 552 + }, + { + "epoch": 0.2614657210401891, + "grad_norm": 2.8412528038024902, + "learning_rate": 4.9840451685312925e-06, + "loss": 0.6972, + "step": 553 + }, + { + "epoch": 0.2619385342789598, + "grad_norm": 3.017395496368408, + "learning_rate": 4.983974725460188e-06, + "loss": 0.6887, + "step": 554 + }, + { + "epoch": 0.2624113475177305, + "grad_norm": 3.2746949195861816, + "learning_rate": 4.98390412772287e-06, + "loss": 0.7047, + "step": 555 + }, + { + "epoch": 0.2628841607565012, + "grad_norm": 3.1561965942382812, + "learning_rate": 4.983833375323732e-06, + "loss": 0.7726, + "step": 556 + }, + { + "epoch": 0.26335697399527186, + "grad_norm": 3.2367217540740967, + "learning_rate": 4.9837624682671816e-06, + "loss": 0.6348, + "step": 557 + }, + { + "epoch": 0.26382978723404255, + "grad_norm": 2.8195858001708984, + "learning_rate": 4.983691406557633e-06, + "loss": 0.6387, + "step": 558 + }, + { + "epoch": 0.26430260047281323, + "grad_norm": 3.349820852279663, + "learning_rate": 4.983620190199511e-06, + "loss": 0.6776, + "step": 559 + }, + { + "epoch": 0.2647754137115839, + "grad_norm": 2.8025588989257812, + "learning_rate": 4.98354881919725e-06, + "loss": 0.6512, + "step": 560 + }, + { + "epoch": 0.2652482269503546, + "grad_norm": 2.9125499725341797, + "learning_rate": 4.983477293555295e-06, + "loss": 0.7024, + "step": 561 + }, + { + "epoch": 0.2657210401891253, + "grad_norm": 3.3479275703430176, + "learning_rate": 4.983405613278098e-06, + "loss": 0.688, + "step": 562 + }, + { + "epoch": 0.266193853427896, + "grad_norm": 3.123971462249756, + "learning_rate": 4.983333778370123e-06, + "loss": 0.6743, + "step": 563 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.891625165939331, + "learning_rate": 4.983261788835843e-06, + "loss": 0.5971, + "step": 564 + }, + { + "epoch": 0.26713947990543735, + "grad_norm": 3.5066864490509033, + "learning_rate": 4.98318964467974e-06, + "loss": 0.6958, + "step": 565 + }, + { + "epoch": 0.26761229314420804, + "grad_norm": 2.570547342300415, + "learning_rate": 4.983117345906306e-06, + "loss": 0.609, + "step": 566 + }, + { + "epoch": 0.2680851063829787, + "grad_norm": 3.005106210708618, + "learning_rate": 4.983044892520044e-06, + "loss": 0.6791, + "step": 567 + }, + 
{ + "epoch": 0.2685579196217494, + "grad_norm": 3.429675340652466, + "learning_rate": 4.982972284525463e-06, + "loss": 0.6625, + "step": 568 + }, + { + "epoch": 0.2690307328605201, + "grad_norm": 3.825657367706299, + "learning_rate": 4.982899521927086e-06, + "loss": 0.6368, + "step": 569 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 2.8699095249176025, + "learning_rate": 4.982826604729443e-06, + "loss": 0.6425, + "step": 570 + }, + { + "epoch": 0.26997635933806147, + "grad_norm": 3.1688714027404785, + "learning_rate": 4.982753532937074e-06, + "loss": 0.6904, + "step": 571 + }, + { + "epoch": 0.27044917257683215, + "grad_norm": 3.3889992237091064, + "learning_rate": 4.98268030655453e-06, + "loss": 0.7575, + "step": 572 + }, + { + "epoch": 0.27092198581560284, + "grad_norm": 3.108315944671631, + "learning_rate": 4.982606925586367e-06, + "loss": 0.6648, + "step": 573 + }, + { + "epoch": 0.2713947990543735, + "grad_norm": 3.209831953048706, + "learning_rate": 4.982533390037159e-06, + "loss": 0.657, + "step": 574 + }, + { + "epoch": 0.2718676122931442, + "grad_norm": 3.1740927696228027, + "learning_rate": 4.982459699911482e-06, + "loss": 0.7262, + "step": 575 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 3.0190417766571045, + "learning_rate": 4.982385855213924e-06, + "loss": 0.6368, + "step": 576 + }, + { + "epoch": 0.2728132387706856, + "grad_norm": 3.05049467086792, + "learning_rate": 4.982311855949084e-06, + "loss": 0.72, + "step": 577 + }, + { + "epoch": 0.27328605200945627, + "grad_norm": 2.984816551208496, + "learning_rate": 4.98223770212157e-06, + "loss": 0.6856, + "step": 578 + }, + { + "epoch": 0.27375886524822696, + "grad_norm": 2.744969606399536, + "learning_rate": 4.982163393735998e-06, + "loss": 0.6023, + "step": 579 + }, + { + "epoch": 0.27423167848699764, + "grad_norm": 3.170564889907837, + "learning_rate": 4.982088930796996e-06, + "loss": 0.6678, + "step": 580 + }, + { + "epoch": 0.27470449172576833, + "grad_norm": 2.8686118125915527, + "learning_rate": 4.982014313309199e-06, + "loss": 0.6157, + "step": 581 + }, + { + "epoch": 0.275177304964539, + "grad_norm": 2.8768694400787354, + "learning_rate": 4.981939541277254e-06, + "loss": 0.6566, + "step": 582 + }, + { + "epoch": 0.2756501182033097, + "grad_norm": 2.621481418609619, + "learning_rate": 4.981864614705818e-06, + "loss": 0.7372, + "step": 583 + }, + { + "epoch": 0.2761229314420804, + "grad_norm": 3.527374267578125, + "learning_rate": 4.981789533599554e-06, + "loss": 0.6485, + "step": 584 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 3.3141074180603027, + "learning_rate": 4.981714297963138e-06, + "loss": 0.6816, + "step": 585 + }, + { + "epoch": 0.27706855791962176, + "grad_norm": 2.9247069358825684, + "learning_rate": 4.981638907801255e-06, + "loss": 0.7217, + "step": 586 + }, + { + "epoch": 0.27754137115839245, + "grad_norm": 2.875236749649048, + "learning_rate": 4.981563363118599e-06, + "loss": 0.6662, + "step": 587 + }, + { + "epoch": 0.27801418439716313, + "grad_norm": 2.9540364742279053, + "learning_rate": 4.981487663919874e-06, + "loss": 0.7225, + "step": 588 + }, + { + "epoch": 0.2784869976359338, + "grad_norm": 2.90889310836792, + "learning_rate": 4.981411810209793e-06, + "loss": 0.6054, + "step": 589 + }, + { + "epoch": 0.2789598108747045, + "grad_norm": 2.8541409969329834, + "learning_rate": 4.981335801993078e-06, + "loss": 0.6539, + "step": 590 + }, + { + "epoch": 0.2794326241134752, + "grad_norm": 3.1600730419158936, + "learning_rate": 4.981259639274465e-06, + "loss": 0.6415, + "step": 
591 + }, + { + "epoch": 0.2799054373522459, + "grad_norm": 3.569376230239868, + "learning_rate": 4.981183322058693e-06, + "loss": 0.6944, + "step": 592 + }, + { + "epoch": 0.28037825059101656, + "grad_norm": 3.067667007446289, + "learning_rate": 4.981106850350515e-06, + "loss": 0.7378, + "step": 593 + }, + { + "epoch": 0.28085106382978725, + "grad_norm": 3.082073450088501, + "learning_rate": 4.981030224154693e-06, + "loss": 0.693, + "step": 594 + }, + { + "epoch": 0.28132387706855794, + "grad_norm": 2.902932643890381, + "learning_rate": 4.980953443475998e-06, + "loss": 0.6549, + "step": 595 + }, + { + "epoch": 0.2817966903073286, + "grad_norm": 2.6821181774139404, + "learning_rate": 4.980876508319211e-06, + "loss": 0.6231, + "step": 596 + }, + { + "epoch": 0.2822695035460993, + "grad_norm": 3.1747355461120605, + "learning_rate": 4.9807994186891215e-06, + "loss": 0.6826, + "step": 597 + }, + { + "epoch": 0.28274231678487, + "grad_norm": 2.6975860595703125, + "learning_rate": 4.980722174590531e-06, + "loss": 0.6669, + "step": 598 + }, + { + "epoch": 0.2832151300236407, + "grad_norm": 2.924285650253296, + "learning_rate": 4.9806447760282486e-06, + "loss": 0.689, + "step": 599 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 2.941417694091797, + "learning_rate": 4.980567223007093e-06, + "loss": 0.6672, + "step": 600 + }, + { + "epoch": 0.28416075650118205, + "grad_norm": 2.8582186698913574, + "learning_rate": 4.980489515531892e-06, + "loss": 0.6229, + "step": 601 + }, + { + "epoch": 0.28463356973995274, + "grad_norm": 2.6462013721466064, + "learning_rate": 4.9804116536074865e-06, + "loss": 0.606, + "step": 602 + }, + { + "epoch": 0.2851063829787234, + "grad_norm": 2.9029998779296875, + "learning_rate": 4.980333637238723e-06, + "loss": 0.5915, + "step": 603 + }, + { + "epoch": 0.2855791962174941, + "grad_norm": 3.9359042644500732, + "learning_rate": 4.980255466430462e-06, + "loss": 0.7035, + "step": 604 + }, + { + "epoch": 0.2860520094562648, + "grad_norm": 3.200524091720581, + "learning_rate": 4.980177141187566e-06, + "loss": 0.7156, + "step": 605 + }, + { + "epoch": 0.2865248226950355, + "grad_norm": 3.1708686351776123, + "learning_rate": 4.980098661514916e-06, + "loss": 0.746, + "step": 606 + }, + { + "epoch": 0.28699763593380617, + "grad_norm": 2.8926830291748047, + "learning_rate": 4.980020027417397e-06, + "loss": 0.6282, + "step": 607 + }, + { + "epoch": 0.28747044917257686, + "grad_norm": 3.0526294708251953, + "learning_rate": 4.979941238899906e-06, + "loss": 0.6594, + "step": 608 + }, + { + "epoch": 0.28794326241134754, + "grad_norm": 2.9869306087493896, + "learning_rate": 4.9798622959673486e-06, + "loss": 0.7771, + "step": 609 + }, + { + "epoch": 0.28841607565011823, + "grad_norm": 2.7894513607025146, + "learning_rate": 4.979783198624638e-06, + "loss": 0.6819, + "step": 610 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 2.958575963973999, + "learning_rate": 4.9797039468767025e-06, + "loss": 0.6474, + "step": 611 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 3.423748016357422, + "learning_rate": 4.979624540728475e-06, + "loss": 0.7389, + "step": 612 + }, + { + "epoch": 0.28983451536643023, + "grad_norm": 2.9641635417938232, + "learning_rate": 4.9795449801849e-06, + "loss": 0.6005, + "step": 613 + }, + { + "epoch": 0.2903073286052009, + "grad_norm": 3.02274227142334, + "learning_rate": 4.979465265250933e-06, + "loss": 0.6358, + "step": 614 + }, + { + "epoch": 0.2907801418439716, + "grad_norm": 3.0562758445739746, + "learning_rate": 4.979385395931534e-06, + 
"loss": 0.6313, + "step": 615 + }, + { + "epoch": 0.2912529550827423, + "grad_norm": 3.301816701889038, + "learning_rate": 4.97930537223168e-06, + "loss": 0.7264, + "step": 616 + }, + { + "epoch": 0.291725768321513, + "grad_norm": 2.975360870361328, + "learning_rate": 4.979225194156351e-06, + "loss": 0.613, + "step": 617 + }, + { + "epoch": 0.29219858156028367, + "grad_norm": 2.9245030879974365, + "learning_rate": 4.97914486171054e-06, + "loss": 0.6646, + "step": 618 + }, + { + "epoch": 0.29267139479905435, + "grad_norm": 3.1336188316345215, + "learning_rate": 4.979064374899249e-06, + "loss": 0.6421, + "step": 619 + }, + { + "epoch": 0.29314420803782504, + "grad_norm": 3.6298763751983643, + "learning_rate": 4.978983733727491e-06, + "loss": 0.6433, + "step": 620 + }, + { + "epoch": 0.2936170212765957, + "grad_norm": 2.919597625732422, + "learning_rate": 4.9789029382002845e-06, + "loss": 0.6288, + "step": 621 + }, + { + "epoch": 0.2940898345153664, + "grad_norm": 3.2206127643585205, + "learning_rate": 4.978821988322662e-06, + "loss": 0.7102, + "step": 622 + }, + { + "epoch": 0.2945626477541371, + "grad_norm": 3.1767101287841797, + "learning_rate": 4.978740884099664e-06, + "loss": 0.6722, + "step": 623 + }, + { + "epoch": 0.2950354609929078, + "grad_norm": 3.3425452709198, + "learning_rate": 4.97865962553634e-06, + "loss": 0.6492, + "step": 624 + }, + { + "epoch": 0.29550827423167847, + "grad_norm": 3.0408358573913574, + "learning_rate": 4.97857821263775e-06, + "loss": 0.6522, + "step": 625 + }, + { + "epoch": 0.29598108747044916, + "grad_norm": 2.8144783973693848, + "learning_rate": 4.978496645408963e-06, + "loss": 0.7237, + "step": 626 + }, + { + "epoch": 0.29645390070921984, + "grad_norm": 3.7010560035705566, + "learning_rate": 4.978414923855057e-06, + "loss": 0.7509, + "step": 627 + }, + { + "epoch": 0.29692671394799053, + "grad_norm": 2.9438371658325195, + "learning_rate": 4.978333047981122e-06, + "loss": 0.6244, + "step": 628 + }, + { + "epoch": 0.2973995271867612, + "grad_norm": 3.285982370376587, + "learning_rate": 4.978251017792255e-06, + "loss": 0.7553, + "step": 629 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 3.7021138668060303, + "learning_rate": 4.978168833293564e-06, + "loss": 0.7859, + "step": 630 + }, + { + "epoch": 0.2983451536643026, + "grad_norm": 3.481858730316162, + "learning_rate": 4.9780864944901654e-06, + "loss": 0.7146, + "step": 631 + }, + { + "epoch": 0.2988179669030733, + "grad_norm": 3.693824529647827, + "learning_rate": 4.978004001387188e-06, + "loss": 0.6608, + "step": 632 + }, + { + "epoch": 0.29929078014184396, + "grad_norm": 3.0069146156311035, + "learning_rate": 4.9779213539897665e-06, + "loss": 0.6506, + "step": 633 + }, + { + "epoch": 0.29976359338061465, + "grad_norm": 3.037644147872925, + "learning_rate": 4.977838552303048e-06, + "loss": 0.6487, + "step": 634 + }, + { + "epoch": 0.30023640661938533, + "grad_norm": 3.018554449081421, + "learning_rate": 4.977755596332188e-06, + "loss": 0.6128, + "step": 635 + }, + { + "epoch": 0.300709219858156, + "grad_norm": 3.000312089920044, + "learning_rate": 4.977672486082351e-06, + "loss": 0.6431, + "step": 636 + }, + { + "epoch": 0.3011820330969267, + "grad_norm": 2.836803913116455, + "learning_rate": 4.977589221558713e-06, + "loss": 0.5914, + "step": 637 + }, + { + "epoch": 0.3016548463356974, + "grad_norm": 3.080469846725464, + "learning_rate": 4.977505802766457e-06, + "loss": 0.7265, + "step": 638 + }, + { + "epoch": 0.3021276595744681, + "grad_norm": 3.2245471477508545, + "learning_rate": 
4.97742222971078e-06, + "loss": 0.6895, + "step": 639 + }, + { + "epoch": 0.30260047281323876, + "grad_norm": 3.559006452560425, + "learning_rate": 4.977338502396882e-06, + "loss": 0.7439, + "step": 640 + }, + { + "epoch": 0.30307328605200945, + "grad_norm": 2.9116289615631104, + "learning_rate": 4.9772546208299795e-06, + "loss": 0.6907, + "step": 641 + }, + { + "epoch": 0.30354609929078014, + "grad_norm": 3.3645524978637695, + "learning_rate": 4.977170585015295e-06, + "loss": 0.6983, + "step": 642 + }, + { + "epoch": 0.3040189125295508, + "grad_norm": 3.080148458480835, + "learning_rate": 4.977086394958058e-06, + "loss": 0.7016, + "step": 643 + }, + { + "epoch": 0.3044917257683215, + "grad_norm": 2.9276750087738037, + "learning_rate": 4.977002050663515e-06, + "loss": 0.6509, + "step": 644 + }, + { + "epoch": 0.3049645390070922, + "grad_norm": 3.183609962463379, + "learning_rate": 4.976917552136914e-06, + "loss": 0.6814, + "step": 645 + }, + { + "epoch": 0.3054373522458629, + "grad_norm": 3.0980000495910645, + "learning_rate": 4.976832899383519e-06, + "loss": 0.6319, + "step": 646 + }, + { + "epoch": 0.30591016548463357, + "grad_norm": 3.211376190185547, + "learning_rate": 4.9767480924086e-06, + "loss": 0.6365, + "step": 647 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 3.214430093765259, + "learning_rate": 4.976663131217437e-06, + "loss": 0.6006, + "step": 648 + }, + { + "epoch": 0.30685579196217494, + "grad_norm": 3.0914318561553955, + "learning_rate": 4.976578015815321e-06, + "loss": 0.7162, + "step": 649 + }, + { + "epoch": 0.3073286052009456, + "grad_norm": 2.7644500732421875, + "learning_rate": 4.976492746207551e-06, + "loss": 0.6045, + "step": 650 + }, + { + "epoch": 0.3078014184397163, + "grad_norm": 3.1913280487060547, + "learning_rate": 4.9764073223994374e-06, + "loss": 0.6796, + "step": 651 + }, + { + "epoch": 0.308274231678487, + "grad_norm": 2.8919692039489746, + "learning_rate": 4.976321744396299e-06, + "loss": 0.6683, + "step": 652 + }, + { + "epoch": 0.3087470449172577, + "grad_norm": 2.862234115600586, + "learning_rate": 4.976236012203463e-06, + "loss": 0.6631, + "step": 653 + }, + { + "epoch": 0.30921985815602837, + "grad_norm": 2.9708092212677, + "learning_rate": 4.976150125826268e-06, + "loss": 0.6326, + "step": 654 + }, + { + "epoch": 0.30969267139479906, + "grad_norm": 2.892465353012085, + "learning_rate": 4.976064085270063e-06, + "loss": 0.6574, + "step": 655 + }, + { + "epoch": 0.31016548463356974, + "grad_norm": 3.9215126037597656, + "learning_rate": 4.975977890540205e-06, + "loss": 0.7351, + "step": 656 + }, + { + "epoch": 0.31063829787234043, + "grad_norm": 2.9544081687927246, + "learning_rate": 4.975891541642059e-06, + "loss": 0.7264, + "step": 657 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 2.995035409927368, + "learning_rate": 4.975805038581005e-06, + "loss": 0.7405, + "step": 658 + }, + { + "epoch": 0.3115839243498818, + "grad_norm": 2.9653120040893555, + "learning_rate": 4.975718381362427e-06, + "loss": 0.679, + "step": 659 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 2.93976092338562, + "learning_rate": 4.9756315699917205e-06, + "loss": 0.627, + "step": 660 + }, + { + "epoch": 0.3125295508274232, + "grad_norm": 3.106522560119629, + "learning_rate": 4.9755446044742915e-06, + "loss": 0.6329, + "step": 661 + }, + { + "epoch": 0.31300236406619386, + "grad_norm": 3.0238280296325684, + "learning_rate": 4.975457484815554e-06, + "loss": 0.6643, + "step": 662 + }, + { + "epoch": 0.31347517730496455, + "grad_norm": 2.943528175354004, + 
"learning_rate": 4.9753702110209356e-06, + "loss": 0.668, + "step": 663 + }, + { + "epoch": 0.31394799054373523, + "grad_norm": 2.6840121746063232, + "learning_rate": 4.9752827830958676e-06, + "loss": 0.5482, + "step": 664 + }, + { + "epoch": 0.3144208037825059, + "grad_norm": 2.823875904083252, + "learning_rate": 4.975195201045794e-06, + "loss": 0.7017, + "step": 665 + }, + { + "epoch": 0.3148936170212766, + "grad_norm": 3.148181200027466, + "learning_rate": 4.975107464876168e-06, + "loss": 0.747, + "step": 666 + }, + { + "epoch": 0.3153664302600473, + "grad_norm": 2.630584478378296, + "learning_rate": 4.9750195745924545e-06, + "loss": 0.5987, + "step": 667 + }, + { + "epoch": 0.315839243498818, + "grad_norm": 3.075866460800171, + "learning_rate": 4.974931530200124e-06, + "loss": 0.664, + "step": 668 + }, + { + "epoch": 0.31631205673758866, + "grad_norm": 2.947197914123535, + "learning_rate": 4.974843331704659e-06, + "loss": 0.631, + "step": 669 + }, + { + "epoch": 0.31678486997635935, + "grad_norm": 3.519646644592285, + "learning_rate": 4.974754979111552e-06, + "loss": 0.7154, + "step": 670 + }, + { + "epoch": 0.31725768321513004, + "grad_norm": 2.8687186241149902, + "learning_rate": 4.974666472426305e-06, + "loss": 0.6366, + "step": 671 + }, + { + "epoch": 0.3177304964539007, + "grad_norm": 2.6966612339019775, + "learning_rate": 4.974577811654426e-06, + "loss": 0.7112, + "step": 672 + }, + { + "epoch": 0.3182033096926714, + "grad_norm": 3.1390228271484375, + "learning_rate": 4.974488996801439e-06, + "loss": 0.6882, + "step": 673 + }, + { + "epoch": 0.3186761229314421, + "grad_norm": 3.4667599201202393, + "learning_rate": 4.974400027872871e-06, + "loss": 0.7153, + "step": 674 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 2.9632184505462646, + "learning_rate": 4.974310904874265e-06, + "loss": 0.7081, + "step": 675 + }, + { + "epoch": 0.31962174940898347, + "grad_norm": 3.46150279045105, + "learning_rate": 4.9742216278111666e-06, + "loss": 0.6242, + "step": 676 + }, + { + "epoch": 0.32009456264775416, + "grad_norm": 3.380403757095337, + "learning_rate": 4.974132196689137e-06, + "loss": 0.6863, + "step": 677 + }, + { + "epoch": 0.32056737588652484, + "grad_norm": 3.4279606342315674, + "learning_rate": 4.974042611513746e-06, + "loss": 0.6388, + "step": 678 + }, + { + "epoch": 0.3210401891252955, + "grad_norm": 2.634523391723633, + "learning_rate": 4.973952872290568e-06, + "loss": 0.6038, + "step": 679 + }, + { + "epoch": 0.3215130023640662, + "grad_norm": 3.19693922996521, + "learning_rate": 4.973862979025194e-06, + "loss": 0.6383, + "step": 680 + }, + { + "epoch": 0.3219858156028369, + "grad_norm": 3.437692165374756, + "learning_rate": 4.973772931723218e-06, + "loss": 0.7288, + "step": 681 + }, + { + "epoch": 0.3224586288416076, + "grad_norm": 2.506301164627075, + "learning_rate": 4.97368273039025e-06, + "loss": 0.5707, + "step": 682 + }, + { + "epoch": 0.3229314420803783, + "grad_norm": 3.0942845344543457, + "learning_rate": 4.9735923750319044e-06, + "loss": 0.6348, + "step": 683 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 3.0889835357666016, + "learning_rate": 4.973501865653809e-06, + "loss": 0.6697, + "step": 684 + }, + { + "epoch": 0.32387706855791965, + "grad_norm": 3.0391931533813477, + "learning_rate": 4.973411202261598e-06, + "loss": 0.7091, + "step": 685 + }, + { + "epoch": 0.32434988179669033, + "grad_norm": 3.0333497524261475, + "learning_rate": 4.973320384860917e-06, + "loss": 0.6403, + "step": 686 + }, + { + "epoch": 0.324822695035461, + "grad_norm": 
2.9714622497558594, + "learning_rate": 4.973229413457421e-06, + "loss": 0.6977, + "step": 687 + }, + { + "epoch": 0.3252955082742317, + "grad_norm": 3.057558298110962, + "learning_rate": 4.973138288056774e-06, + "loss": 0.7236, + "step": 688 + }, + { + "epoch": 0.3257683215130024, + "grad_norm": 2.921093463897705, + "learning_rate": 4.97304700866465e-06, + "loss": 0.576, + "step": 689 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 3.0287256240844727, + "learning_rate": 4.972955575286732e-06, + "loss": 0.7077, + "step": 690 + }, + { + "epoch": 0.32671394799054376, + "grad_norm": 2.8621346950531006, + "learning_rate": 4.972863987928716e-06, + "loss": 0.6952, + "step": 691 + }, + { + "epoch": 0.3271867612293144, + "grad_norm": 2.631359100341797, + "learning_rate": 4.9727722465963006e-06, + "loss": 0.6931, + "step": 692 + }, + { + "epoch": 0.3276595744680851, + "grad_norm": 2.8484320640563965, + "learning_rate": 4.972680351295201e-06, + "loss": 0.6292, + "step": 693 + }, + { + "epoch": 0.32813238770685577, + "grad_norm": 2.593001365661621, + "learning_rate": 4.972588302031138e-06, + "loss": 0.5942, + "step": 694 + }, + { + "epoch": 0.32860520094562645, + "grad_norm": 2.6321065425872803, + "learning_rate": 4.972496098809844e-06, + "loss": 0.65, + "step": 695 + }, + { + "epoch": 0.32907801418439714, + "grad_norm": 3.2516732215881348, + "learning_rate": 4.972403741637059e-06, + "loss": 0.7385, + "step": 696 + }, + { + "epoch": 0.3295508274231678, + "grad_norm": 3.180854320526123, + "learning_rate": 4.972311230518535e-06, + "loss": 0.6569, + "step": 697 + }, + { + "epoch": 0.3300236406619385, + "grad_norm": 4.161016941070557, + "learning_rate": 4.972218565460031e-06, + "loss": 0.6416, + "step": 698 + }, + { + "epoch": 0.3304964539007092, + "grad_norm": 3.153897762298584, + "learning_rate": 4.972125746467317e-06, + "loss": 0.7196, + "step": 699 + }, + { + "epoch": 0.3309692671394799, + "grad_norm": 2.9595556259155273, + "learning_rate": 4.972032773546173e-06, + "loss": 0.7093, + "step": 700 + }, + { + "epoch": 0.33144208037825057, + "grad_norm": 3.1086833477020264, + "learning_rate": 4.9719396467023875e-06, + "loss": 0.6963, + "step": 701 + }, + { + "epoch": 0.33191489361702126, + "grad_norm": 2.958921432495117, + "learning_rate": 4.971846365941759e-06, + "loss": 0.6518, + "step": 702 + }, + { + "epoch": 0.33238770685579194, + "grad_norm": 2.8745479583740234, + "learning_rate": 4.971752931270096e-06, + "loss": 0.696, + "step": 703 + }, + { + "epoch": 0.33286052009456263, + "grad_norm": 3.224358558654785, + "learning_rate": 4.971659342693217e-06, + "loss": 0.6769, + "step": 704 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.696319580078125, + "learning_rate": 4.9715656002169486e-06, + "loss": 0.6833, + "step": 705 + }, + { + "epoch": 0.333806146572104, + "grad_norm": 2.9283502101898193, + "learning_rate": 4.971471703847127e-06, + "loss": 0.6784, + "step": 706 + }, + { + "epoch": 0.3342789598108747, + "grad_norm": 2.654914140701294, + "learning_rate": 4.9713776535896e-06, + "loss": 0.6337, + "step": 707 + }, + { + "epoch": 0.3347517730496454, + "grad_norm": 3.041555643081665, + "learning_rate": 4.971283449450224e-06, + "loss": 0.6227, + "step": 708 + }, + { + "epoch": 0.33522458628841606, + "grad_norm": 2.893008232116699, + "learning_rate": 4.971189091434863e-06, + "loss": 0.655, + "step": 709 + }, + { + "epoch": 0.33569739952718675, + "grad_norm": 2.8806653022766113, + "learning_rate": 4.971094579549393e-06, + "loss": 0.7077, + "step": 710 + }, + { + "epoch": 0.33617021276595743, + 
"grad_norm": 3.4830048084259033, + "learning_rate": 4.9709999137996986e-06, + "loss": 0.7461, + "step": 711 + }, + { + "epoch": 0.3366430260047281, + "grad_norm": 3.155444860458374, + "learning_rate": 4.970905094191674e-06, + "loss": 0.652, + "step": 712 + }, + { + "epoch": 0.3371158392434988, + "grad_norm": 2.7608706951141357, + "learning_rate": 4.970810120731225e-06, + "loss": 0.684, + "step": 713 + }, + { + "epoch": 0.3375886524822695, + "grad_norm": 2.8209474086761475, + "learning_rate": 4.970714993424265e-06, + "loss": 0.6009, + "step": 714 + }, + { + "epoch": 0.3380614657210402, + "grad_norm": 3.6532654762268066, + "learning_rate": 4.9706197122767145e-06, + "loss": 0.702, + "step": 715 + }, + { + "epoch": 0.33853427895981086, + "grad_norm": 2.6276566982269287, + "learning_rate": 4.970524277294508e-06, + "loss": 0.6338, + "step": 716 + }, + { + "epoch": 0.33900709219858155, + "grad_norm": 3.509871482849121, + "learning_rate": 4.970428688483589e-06, + "loss": 0.6853, + "step": 717 + }, + { + "epoch": 0.33947990543735224, + "grad_norm": 5.332682132720947, + "learning_rate": 4.970332945849906e-06, + "loss": 0.6684, + "step": 718 + }, + { + "epoch": 0.3399527186761229, + "grad_norm": 2.718801975250244, + "learning_rate": 4.970237049399424e-06, + "loss": 0.6676, + "step": 719 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 3.891003131866455, + "learning_rate": 4.970140999138112e-06, + "loss": 0.7043, + "step": 720 + }, + { + "epoch": 0.3408983451536643, + "grad_norm": 2.8863155841827393, + "learning_rate": 4.970044795071951e-06, + "loss": 0.6563, + "step": 721 + }, + { + "epoch": 0.341371158392435, + "grad_norm": 3.2527518272399902, + "learning_rate": 4.969948437206932e-06, + "loss": 0.7244, + "step": 722 + }, + { + "epoch": 0.34184397163120567, + "grad_norm": 2.9726758003234863, + "learning_rate": 4.969851925549054e-06, + "loss": 0.6548, + "step": 723 + }, + { + "epoch": 0.34231678486997635, + "grad_norm": 3.118309497833252, + "learning_rate": 4.969755260104327e-06, + "loss": 0.7293, + "step": 724 + }, + { + "epoch": 0.34278959810874704, + "grad_norm": 3.373068332672119, + "learning_rate": 4.969658440878769e-06, + "loss": 0.6444, + "step": 725 + }, + { + "epoch": 0.3432624113475177, + "grad_norm": 2.7157437801361084, + "learning_rate": 4.969561467878409e-06, + "loss": 0.642, + "step": 726 + }, + { + "epoch": 0.3437352245862884, + "grad_norm": 2.58929705619812, + "learning_rate": 4.969464341109285e-06, + "loss": 0.6165, + "step": 727 + }, + { + "epoch": 0.3442080378250591, + "grad_norm": 2.8811306953430176, + "learning_rate": 4.969367060577445e-06, + "loss": 0.7127, + "step": 728 + }, + { + "epoch": 0.3446808510638298, + "grad_norm": 3.494358539581299, + "learning_rate": 4.969269626288946e-06, + "loss": 0.7103, + "step": 729 + }, + { + "epoch": 0.34515366430260047, + "grad_norm": 2.9753928184509277, + "learning_rate": 4.969172038249855e-06, + "loss": 0.6911, + "step": 730 + }, + { + "epoch": 0.34562647754137116, + "grad_norm": 3.2885913848876953, + "learning_rate": 4.969074296466247e-06, + "loss": 0.6968, + "step": 731 + }, + { + "epoch": 0.34609929078014184, + "grad_norm": 2.7564568519592285, + "learning_rate": 4.968976400944211e-06, + "loss": 0.6843, + "step": 732 + }, + { + "epoch": 0.34657210401891253, + "grad_norm": 2.9255006313323975, + "learning_rate": 4.96887835168984e-06, + "loss": 0.6024, + "step": 733 + }, + { + "epoch": 0.3470449172576832, + "grad_norm": 3.1808290481567383, + "learning_rate": 4.968780148709239e-06, + "loss": 0.7377, + "step": 734 + }, + { + "epoch": 
0.3475177304964539, + "grad_norm": 2.956666946411133, + "learning_rate": 4.968681792008523e-06, + "loss": 0.65, + "step": 735 + }, + { + "epoch": 0.3479905437352246, + "grad_norm": 2.9631855487823486, + "learning_rate": 4.9685832815938175e-06, + "loss": 0.677, + "step": 736 + }, + { + "epoch": 0.3484633569739953, + "grad_norm": 2.501917600631714, + "learning_rate": 4.968484617471256e-06, + "loss": 0.6282, + "step": 737 + }, + { + "epoch": 0.34893617021276596, + "grad_norm": 2.750779628753662, + "learning_rate": 4.968385799646981e-06, + "loss": 0.6507, + "step": 738 + }, + { + "epoch": 0.34940898345153665, + "grad_norm": 2.872300624847412, + "learning_rate": 4.968286828127146e-06, + "loss": 0.5949, + "step": 739 + }, + { + "epoch": 0.34988179669030733, + "grad_norm": 2.6316142082214355, + "learning_rate": 4.9681877029179124e-06, + "loss": 0.6328, + "step": 740 + }, + { + "epoch": 0.350354609929078, + "grad_norm": 3.244364023208618, + "learning_rate": 4.968088424025454e-06, + "loss": 0.7393, + "step": 741 + }, + { + "epoch": 0.3508274231678487, + "grad_norm": 2.620465040206909, + "learning_rate": 4.967988991455951e-06, + "loss": 0.6797, + "step": 742 + }, + { + "epoch": 0.3513002364066194, + "grad_norm": 2.854513645172119, + "learning_rate": 4.967889405215596e-06, + "loss": 0.6368, + "step": 743 + }, + { + "epoch": 0.3517730496453901, + "grad_norm": 2.579854726791382, + "learning_rate": 4.9677896653105886e-06, + "loss": 0.6489, + "step": 744 + }, + { + "epoch": 0.35224586288416077, + "grad_norm": 3.0697381496429443, + "learning_rate": 4.96768977174714e-06, + "loss": 0.6313, + "step": 745 + }, + { + "epoch": 0.35271867612293145, + "grad_norm": 3.369338035583496, + "learning_rate": 4.96758972453147e-06, + "loss": 0.7416, + "step": 746 + }, + { + "epoch": 0.35319148936170214, + "grad_norm": 2.836221933364868, + "learning_rate": 4.967489523669807e-06, + "loss": 0.6422, + "step": 747 + }, + { + "epoch": 0.3536643026004728, + "grad_norm": 2.929579496383667, + "learning_rate": 4.967389169168392e-06, + "loss": 0.6482, + "step": 748 + }, + { + "epoch": 0.3541371158392435, + "grad_norm": 2.9243831634521484, + "learning_rate": 4.967288661033472e-06, + "loss": 0.5813, + "step": 749 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 3.7555336952209473, + "learning_rate": 4.967187999271306e-06, + "loss": 0.6501, + "step": 750 + }, + { + "epoch": 0.3550827423167849, + "grad_norm": 3.4279143810272217, + "learning_rate": 4.9670871838881615e-06, + "loss": 0.6326, + "step": 751 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 2.875066041946411, + "learning_rate": 4.9669862148903166e-06, + "loss": 0.664, + "step": 752 + }, + { + "epoch": 0.35602836879432626, + "grad_norm": 3.130394697189331, + "learning_rate": 4.966885092284057e-06, + "loss": 0.706, + "step": 753 + }, + { + "epoch": 0.35650118203309694, + "grad_norm": 2.9606287479400635, + "learning_rate": 4.96678381607568e-06, + "loss": 0.693, + "step": 754 + }, + { + "epoch": 0.35697399527186763, + "grad_norm": 3.0584909915924072, + "learning_rate": 4.966682386271491e-06, + "loss": 0.6034, + "step": 755 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 2.8215200901031494, + "learning_rate": 4.966580802877805e-06, + "loss": 0.6217, + "step": 756 + }, + { + "epoch": 0.357919621749409, + "grad_norm": 2.7348055839538574, + "learning_rate": 4.966479065900949e-06, + "loss": 0.6194, + "step": 757 + }, + { + "epoch": 0.3583924349881797, + "grad_norm": 3.2347466945648193, + "learning_rate": 4.966377175347257e-06, + "loss": 0.6377, + "step": 758 + }, + 
{ + "epoch": 0.3588652482269504, + "grad_norm": 3.311845302581787, + "learning_rate": 4.966275131223072e-06, + "loss": 0.6234, + "step": 759 + }, + { + "epoch": 0.35933806146572106, + "grad_norm": 3.0384368896484375, + "learning_rate": 4.96617293353475e-06, + "loss": 0.609, + "step": 760 + }, + { + "epoch": 0.35981087470449175, + "grad_norm": 3.516854763031006, + "learning_rate": 4.966070582288653e-06, + "loss": 0.6627, + "step": 761 + }, + { + "epoch": 0.36028368794326243, + "grad_norm": 3.2425215244293213, + "learning_rate": 4.9659680774911534e-06, + "loss": 0.7355, + "step": 762 + }, + { + "epoch": 0.3607565011820331, + "grad_norm": 3.2665750980377197, + "learning_rate": 4.965865419148636e-06, + "loss": 0.6787, + "step": 763 + }, + { + "epoch": 0.3612293144208038, + "grad_norm": 2.729428291320801, + "learning_rate": 4.96576260726749e-06, + "loss": 0.6272, + "step": 764 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 3.299969434738159, + "learning_rate": 4.965659641854119e-06, + "loss": 0.6552, + "step": 765 + }, + { + "epoch": 0.3621749408983452, + "grad_norm": 2.7090916633605957, + "learning_rate": 4.965556522914934e-06, + "loss": 0.6661, + "step": 766 + }, + { + "epoch": 0.36264775413711586, + "grad_norm": 2.488846778869629, + "learning_rate": 4.965453250456355e-06, + "loss": 0.5821, + "step": 767 + }, + { + "epoch": 0.36312056737588655, + "grad_norm": 2.5267233848571777, + "learning_rate": 4.965349824484813e-06, + "loss": 0.5593, + "step": 768 + }, + { + "epoch": 0.36359338061465724, + "grad_norm": 3.0646679401397705, + "learning_rate": 4.965246245006748e-06, + "loss": 0.6341, + "step": 769 + }, + { + "epoch": 0.3640661938534279, + "grad_norm": 2.9877712726593018, + "learning_rate": 4.965142512028609e-06, + "loss": 0.7202, + "step": 770 + }, + { + "epoch": 0.3645390070921986, + "grad_norm": 3.7494113445281982, + "learning_rate": 4.965038625556854e-06, + "loss": 0.7643, + "step": 771 + }, + { + "epoch": 0.3650118203309693, + "grad_norm": 2.8382890224456787, + "learning_rate": 4.964934585597954e-06, + "loss": 0.6522, + "step": 772 + }, + { + "epoch": 0.3654846335697399, + "grad_norm": 3.091655731201172, + "learning_rate": 4.9648303921583854e-06, + "loss": 0.7117, + "step": 773 + }, + { + "epoch": 0.3659574468085106, + "grad_norm": 3.0608325004577637, + "learning_rate": 4.964726045244635e-06, + "loss": 0.6538, + "step": 774 + }, + { + "epoch": 0.3664302600472813, + "grad_norm": 2.8492867946624756, + "learning_rate": 4.964621544863203e-06, + "loss": 0.6079, + "step": 775 + }, + { + "epoch": 0.366903073286052, + "grad_norm": 3.0669894218444824, + "learning_rate": 4.964516891020594e-06, + "loss": 0.6223, + "step": 776 + }, + { + "epoch": 0.36737588652482267, + "grad_norm": 3.089984893798828, + "learning_rate": 4.964412083723325e-06, + "loss": 0.671, + "step": 777 + }, + { + "epoch": 0.36784869976359336, + "grad_norm": 2.905242443084717, + "learning_rate": 4.964307122977921e-06, + "loss": 0.62, + "step": 778 + }, + { + "epoch": 0.36832151300236404, + "grad_norm": 3.954436779022217, + "learning_rate": 4.964202008790918e-06, + "loss": 0.6535, + "step": 779 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 2.6026058197021484, + "learning_rate": 4.9640967411688615e-06, + "loss": 0.5865, + "step": 780 + }, + { + "epoch": 0.3692671394799054, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.963991320118306e-06, + "loss": 0.6698, + "step": 781 + }, + { + "epoch": 0.3697399527186761, + "grad_norm": 2.9411263465881348, + "learning_rate": 4.963885745645815e-06, + "loss": 0.6173, + 
"step": 782 + }, + { + "epoch": 0.3702127659574468, + "grad_norm": 2.5679805278778076, + "learning_rate": 4.963780017757962e-06, + "loss": 0.6285, + "step": 783 + }, + { + "epoch": 0.3706855791962175, + "grad_norm": 3.3100640773773193, + "learning_rate": 4.963674136461332e-06, + "loss": 0.5968, + "step": 784 + }, + { + "epoch": 0.37115839243498816, + "grad_norm": 3.1293699741363525, + "learning_rate": 4.963568101762515e-06, + "loss": 0.697, + "step": 785 + }, + { + "epoch": 0.37163120567375885, + "grad_norm": 3.043853759765625, + "learning_rate": 4.963461913668115e-06, + "loss": 0.5881, + "step": 786 + }, + { + "epoch": 0.37210401891252953, + "grad_norm": 3.07351016998291, + "learning_rate": 4.963355572184744e-06, + "loss": 0.6307, + "step": 787 + }, + { + "epoch": 0.3725768321513002, + "grad_norm": 2.7381317615509033, + "learning_rate": 4.9632490773190225e-06, + "loss": 0.716, + "step": 788 + }, + { + "epoch": 0.3730496453900709, + "grad_norm": 2.892221450805664, + "learning_rate": 4.963142429077582e-06, + "loss": 0.6867, + "step": 789 + }, + { + "epoch": 0.3735224586288416, + "grad_norm": 3.133122205734253, + "learning_rate": 4.963035627467064e-06, + "loss": 0.659, + "step": 790 + }, + { + "epoch": 0.3739952718676123, + "grad_norm": 3.032599925994873, + "learning_rate": 4.962928672494116e-06, + "loss": 0.6848, + "step": 791 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 3.0076355934143066, + "learning_rate": 4.9628215641654e-06, + "loss": 0.6549, + "step": 792 + }, + { + "epoch": 0.37494089834515365, + "grad_norm": 2.8904454708099365, + "learning_rate": 4.962714302487585e-06, + "loss": 0.6484, + "step": 793 + }, + { + "epoch": 0.37541371158392434, + "grad_norm": 2.881364107131958, + "learning_rate": 4.9626068874673486e-06, + "loss": 0.721, + "step": 794 + }, + { + "epoch": 0.375886524822695, + "grad_norm": 3.11668062210083, + "learning_rate": 4.962499319111379e-06, + "loss": 0.7824, + "step": 795 + }, + { + "epoch": 0.3763593380614657, + "grad_norm": 2.9201436042785645, + "learning_rate": 4.962391597426374e-06, + "loss": 0.6911, + "step": 796 + }, + { + "epoch": 0.3768321513002364, + "grad_norm": 2.926598072052002, + "learning_rate": 4.962283722419043e-06, + "loss": 0.6715, + "step": 797 + }, + { + "epoch": 0.3773049645390071, + "grad_norm": 2.7267675399780273, + "learning_rate": 4.962175694096101e-06, + "loss": 0.6111, + "step": 798 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 3.194031000137329, + "learning_rate": 4.962067512464275e-06, + "loss": 0.6558, + "step": 799 + }, + { + "epoch": 0.37825059101654845, + "grad_norm": 2.6249136924743652, + "learning_rate": 4.9619591775303e-06, + "loss": 0.6166, + "step": 800 + }, + { + "epoch": 0.37872340425531914, + "grad_norm": 2.6356167793273926, + "learning_rate": 4.961850689300923e-06, + "loss": 0.6112, + "step": 801 + }, + { + "epoch": 0.3791962174940898, + "grad_norm": 3.030724287033081, + "learning_rate": 4.961742047782898e-06, + "loss": 0.6511, + "step": 802 + }, + { + "epoch": 0.3796690307328605, + "grad_norm": 3.4987757205963135, + "learning_rate": 4.96163325298299e-06, + "loss": 0.5888, + "step": 803 + }, + { + "epoch": 0.3801418439716312, + "grad_norm": 3.0371780395507812, + "learning_rate": 4.961524304907974e-06, + "loss": 0.6385, + "step": 804 + }, + { + "epoch": 0.3806146572104019, + "grad_norm": 3.302570104598999, + "learning_rate": 4.961415203564632e-06, + "loss": 0.6515, + "step": 805 + }, + { + "epoch": 0.38108747044917257, + "grad_norm": 2.7597038745880127, + "learning_rate": 4.961305948959759e-06, + "loss": 
0.6126, + "step": 806 + }, + { + "epoch": 0.38156028368794326, + "grad_norm": 2.789811849594116, + "learning_rate": 4.9611965411001575e-06, + "loss": 0.6601, + "step": 807 + }, + { + "epoch": 0.38203309692671394, + "grad_norm": 3.0403921604156494, + "learning_rate": 4.961086979992639e-06, + "loss": 0.6947, + "step": 808 + }, + { + "epoch": 0.38250591016548463, + "grad_norm": 3.2139980792999268, + "learning_rate": 4.960977265644026e-06, + "loss": 0.6876, + "step": 809 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.918515205383301, + "learning_rate": 4.960867398061149e-06, + "loss": 0.5997, + "step": 810 + }, + { + "epoch": 0.383451536643026, + "grad_norm": 3.197636604309082, + "learning_rate": 4.9607573772508495e-06, + "loss": 0.5754, + "step": 811 + }, + { + "epoch": 0.3839243498817967, + "grad_norm": 2.8848466873168945, + "learning_rate": 4.960647203219979e-06, + "loss": 0.6424, + "step": 812 + }, + { + "epoch": 0.3843971631205674, + "grad_norm": 3.4810187816619873, + "learning_rate": 4.960536875975397e-06, + "loss": 0.6851, + "step": 813 + }, + { + "epoch": 0.38486997635933806, + "grad_norm": 3.713934898376465, + "learning_rate": 4.960426395523972e-06, + "loss": 0.6122, + "step": 814 + }, + { + "epoch": 0.38534278959810875, + "grad_norm": 2.862600803375244, + "learning_rate": 4.960315761872585e-06, + "loss": 0.6493, + "step": 815 + }, + { + "epoch": 0.38581560283687943, + "grad_norm": 3.133882522583008, + "learning_rate": 4.960204975028123e-06, + "loss": 0.7535, + "step": 816 + }, + { + "epoch": 0.3862884160756501, + "grad_norm": 3.1526732444763184, + "learning_rate": 4.960094034997485e-06, + "loss": 0.6512, + "step": 817 + }, + { + "epoch": 0.3867612293144208, + "grad_norm": 2.7213544845581055, + "learning_rate": 4.959982941787579e-06, + "loss": 0.6121, + "step": 818 + }, + { + "epoch": 0.3872340425531915, + "grad_norm": 3.4935851097106934, + "learning_rate": 4.9598716954053214e-06, + "loss": 0.7852, + "step": 819 + }, + { + "epoch": 0.3877068557919622, + "grad_norm": 2.691016435623169, + "learning_rate": 4.9597602958576395e-06, + "loss": 0.6861, + "step": 820 + }, + { + "epoch": 0.38817966903073287, + "grad_norm": 2.8621015548706055, + "learning_rate": 4.959648743151469e-06, + "loss": 0.6262, + "step": 821 + }, + { + "epoch": 0.38865248226950355, + "grad_norm": 3.3887462615966797, + "learning_rate": 4.959537037293758e-06, + "loss": 0.7103, + "step": 822 + }, + { + "epoch": 0.38912529550827424, + "grad_norm": 2.7565438747406006, + "learning_rate": 4.95942517829146e-06, + "loss": 0.6471, + "step": 823 + }, + { + "epoch": 0.3895981087470449, + "grad_norm": 2.7920358180999756, + "learning_rate": 4.959313166151541e-06, + "loss": 0.6239, + "step": 824 + }, + { + "epoch": 0.3900709219858156, + "grad_norm": 3.18904185295105, + "learning_rate": 4.959201000880973e-06, + "loss": 0.7461, + "step": 825 + }, + { + "epoch": 0.3905437352245863, + "grad_norm": 2.727872371673584, + "learning_rate": 4.959088682486743e-06, + "loss": 0.6333, + "step": 826 + }, + { + "epoch": 0.391016548463357, + "grad_norm": 2.906378746032715, + "learning_rate": 4.958976210975844e-06, + "loss": 0.7547, + "step": 827 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 2.96482515335083, + "learning_rate": 4.958863586355278e-06, + "loss": 0.6312, + "step": 828 + }, + { + "epoch": 0.39196217494089836, + "grad_norm": 3.2890889644622803, + "learning_rate": 4.958750808632059e-06, + "loss": 0.6943, + "step": 829 + }, + { + "epoch": 0.39243498817966904, + "grad_norm": 2.7004311084747314, + "learning_rate": 
4.958637877813207e-06, + "loss": 0.5918, + "step": 830 + }, + { + "epoch": 0.39290780141843973, + "grad_norm": 2.7487950325012207, + "learning_rate": 4.9585247939057566e-06, + "loss": 0.6201, + "step": 831 + }, + { + "epoch": 0.3933806146572104, + "grad_norm": 2.7873897552490234, + "learning_rate": 4.958411556916747e-06, + "loss": 0.6268, + "step": 832 + }, + { + "epoch": 0.3938534278959811, + "grad_norm": 2.8501343727111816, + "learning_rate": 4.958298166853229e-06, + "loss": 0.7119, + "step": 833 + }, + { + "epoch": 0.3943262411347518, + "grad_norm": 3.0391547679901123, + "learning_rate": 4.958184623722265e-06, + "loss": 0.6375, + "step": 834 + }, + { + "epoch": 0.3947990543735225, + "grad_norm": 2.850520133972168, + "learning_rate": 4.958070927530922e-06, + "loss": 0.5962, + "step": 835 + }, + { + "epoch": 0.39527186761229316, + "grad_norm": 3.351914644241333, + "learning_rate": 4.957957078286281e-06, + "loss": 0.7247, + "step": 836 + }, + { + "epoch": 0.39574468085106385, + "grad_norm": 2.9559543132781982, + "learning_rate": 4.957843075995431e-06, + "loss": 0.6571, + "step": 837 + }, + { + "epoch": 0.39621749408983453, + "grad_norm": 3.225785255432129, + "learning_rate": 4.95772892066547e-06, + "loss": 0.7074, + "step": 838 + }, + { + "epoch": 0.3966903073286052, + "grad_norm": 2.7842373847961426, + "learning_rate": 4.957614612303505e-06, + "loss": 0.6469, + "step": 839 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 4.249724864959717, + "learning_rate": 4.957500150916655e-06, + "loss": 0.741, + "step": 840 + }, + { + "epoch": 0.3976359338061466, + "grad_norm": 3.138221263885498, + "learning_rate": 4.957385536512046e-06, + "loss": 0.6676, + "step": 841 + }, + { + "epoch": 0.3981087470449173, + "grad_norm": 3.456423759460449, + "learning_rate": 4.957270769096816e-06, + "loss": 0.6877, + "step": 842 + }, + { + "epoch": 0.39858156028368796, + "grad_norm": 2.8676278591156006, + "learning_rate": 4.957155848678109e-06, + "loss": 0.5986, + "step": 843 + }, + { + "epoch": 0.39905437352245865, + "grad_norm": 2.705324411392212, + "learning_rate": 4.957040775263082e-06, + "loss": 0.6356, + "step": 844 + }, + { + "epoch": 0.39952718676122934, + "grad_norm": 3.0767486095428467, + "learning_rate": 4.9569255488589e-06, + "loss": 0.6844, + "step": 845 + }, + { + "epoch": 0.4, + "grad_norm": 2.7787704467773438, + "learning_rate": 4.956810169472736e-06, + "loss": 0.6641, + "step": 846 + }, + { + "epoch": 0.4004728132387707, + "grad_norm": 2.584277868270874, + "learning_rate": 4.956694637111777e-06, + "loss": 0.6256, + "step": 847 + }, + { + "epoch": 0.4009456264775414, + "grad_norm": 2.751641273498535, + "learning_rate": 4.956578951783215e-06, + "loss": 0.5954, + "step": 848 + }, + { + "epoch": 0.4014184397163121, + "grad_norm": 3.0181658267974854, + "learning_rate": 4.956463113494253e-06, + "loss": 0.6569, + "step": 849 + }, + { + "epoch": 0.40189125295508277, + "grad_norm": 3.0933220386505127, + "learning_rate": 4.956347122252104e-06, + "loss": 0.6248, + "step": 850 + }, + { + "epoch": 0.40236406619385345, + "grad_norm": 3.3767428398132324, + "learning_rate": 4.956230978063991e-06, + "loss": 0.719, + "step": 851 + }, + { + "epoch": 0.40283687943262414, + "grad_norm": 3.7666573524475098, + "learning_rate": 4.956114680937145e-06, + "loss": 0.6467, + "step": 852 + }, + { + "epoch": 0.4033096926713948, + "grad_norm": 2.9836843013763428, + "learning_rate": 4.955998230878808e-06, + "loss": 0.6993, + "step": 853 + }, + { + "epoch": 0.4037825059101655, + "grad_norm": 2.981497049331665, + 
"learning_rate": 4.955881627896229e-06, + "loss": 0.6578, + "step": 854 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 3.1369056701660156, + "learning_rate": 4.955764871996672e-06, + "loss": 0.6763, + "step": 855 + }, + { + "epoch": 0.40472813238770683, + "grad_norm": 2.7675817012786865, + "learning_rate": 4.9556479631874036e-06, + "loss": 0.6488, + "step": 856 + }, + { + "epoch": 0.4052009456264775, + "grad_norm": 3.035334825515747, + "learning_rate": 4.9555309014757034e-06, + "loss": 0.7076, + "step": 857 + }, + { + "epoch": 0.4056737588652482, + "grad_norm": 3.493704319000244, + "learning_rate": 4.955413686868862e-06, + "loss": 0.6773, + "step": 858 + }, + { + "epoch": 0.4061465721040189, + "grad_norm": 3.245487928390503, + "learning_rate": 4.9552963193741765e-06, + "loss": 0.6915, + "step": 859 + }, + { + "epoch": 0.4066193853427896, + "grad_norm": 3.189969539642334, + "learning_rate": 4.955178798998956e-06, + "loss": 0.7318, + "step": 860 + }, + { + "epoch": 0.40709219858156026, + "grad_norm": 2.7987146377563477, + "learning_rate": 4.955061125750517e-06, + "loss": 0.6162, + "step": 861 + }, + { + "epoch": 0.40756501182033095, + "grad_norm": 3.020118474960327, + "learning_rate": 4.954943299636187e-06, + "loss": 0.6678, + "step": 862 + }, + { + "epoch": 0.40803782505910163, + "grad_norm": 2.715463876724243, + "learning_rate": 4.954825320663302e-06, + "loss": 0.668, + "step": 863 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 2.595050096511841, + "learning_rate": 4.9547071888392085e-06, + "loss": 0.6557, + "step": 864 + }, + { + "epoch": 0.408983451536643, + "grad_norm": 3.131596088409424, + "learning_rate": 4.954588904171261e-06, + "loss": 0.6548, + "step": 865 + }, + { + "epoch": 0.4094562647754137, + "grad_norm": 2.5742313861846924, + "learning_rate": 4.954470466666827e-06, + "loss": 0.6592, + "step": 866 + }, + { + "epoch": 0.4099290780141844, + "grad_norm": 2.8612802028656006, + "learning_rate": 4.9543518763332785e-06, + "loss": 0.5391, + "step": 867 + }, + { + "epoch": 0.41040189125295506, + "grad_norm": 2.8973186016082764, + "learning_rate": 4.954233133178001e-06, + "loss": 0.6649, + "step": 868 + }, + { + "epoch": 0.41087470449172575, + "grad_norm": 2.802525043487549, + "learning_rate": 4.954114237208388e-06, + "loss": 0.6212, + "step": 869 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 2.5919506549835205, + "learning_rate": 4.953995188431843e-06, + "loss": 0.6596, + "step": 870 + }, + { + "epoch": 0.4118203309692671, + "grad_norm": 3.139169454574585, + "learning_rate": 4.953875986855777e-06, + "loss": 0.6799, + "step": 871 + }, + { + "epoch": 0.4122931442080378, + "grad_norm": 3.99727725982666, + "learning_rate": 4.953756632487614e-06, + "loss": 0.6519, + "step": 872 + }, + { + "epoch": 0.4127659574468085, + "grad_norm": 3.238706350326538, + "learning_rate": 4.953637125334784e-06, + "loss": 0.7361, + "step": 873 + }, + { + "epoch": 0.4132387706855792, + "grad_norm": 2.780019998550415, + "learning_rate": 4.9535174654047295e-06, + "loss": 0.6406, + "step": 874 + }, + { + "epoch": 0.41371158392434987, + "grad_norm": 2.7629551887512207, + "learning_rate": 4.953397652704901e-06, + "loss": 0.6131, + "step": 875 + }, + { + "epoch": 0.41418439716312055, + "grad_norm": 2.8008246421813965, + "learning_rate": 4.9532776872427585e-06, + "loss": 0.6464, + "step": 876 + }, + { + "epoch": 0.41465721040189124, + "grad_norm": 3.0970115661621094, + "learning_rate": 4.953157569025772e-06, + "loss": 0.7066, + "step": 877 + }, + { + "epoch": 0.4151300236406619, + 
"grad_norm": 2.8375589847564697, + "learning_rate": 4.9530372980614195e-06, + "loss": 0.6551, + "step": 878 + }, + { + "epoch": 0.4156028368794326, + "grad_norm": 2.718843936920166, + "learning_rate": 4.952916874357191e-06, + "loss": 0.5947, + "step": 879 + }, + { + "epoch": 0.4160756501182033, + "grad_norm": 2.7104697227478027, + "learning_rate": 4.952796297920585e-06, + "loss": 0.6708, + "step": 880 + }, + { + "epoch": 0.416548463356974, + "grad_norm": 2.8223445415496826, + "learning_rate": 4.952675568759108e-06, + "loss": 0.6214, + "step": 881 + }, + { + "epoch": 0.41702127659574467, + "grad_norm": 2.6598153114318848, + "learning_rate": 4.952554686880279e-06, + "loss": 0.6116, + "step": 882 + }, + { + "epoch": 0.41749408983451536, + "grad_norm": 2.8639824390411377, + "learning_rate": 4.952433652291623e-06, + "loss": 0.5971, + "step": 883 + }, + { + "epoch": 0.41796690307328604, + "grad_norm": 2.9578304290771484, + "learning_rate": 4.952312465000677e-06, + "loss": 0.6785, + "step": 884 + }, + { + "epoch": 0.41843971631205673, + "grad_norm": 2.872144937515259, + "learning_rate": 4.952191125014987e-06, + "loss": 0.6772, + "step": 885 + }, + { + "epoch": 0.4189125295508274, + "grad_norm": 2.7513675689697266, + "learning_rate": 4.952069632342108e-06, + "loss": 0.702, + "step": 886 + }, + { + "epoch": 0.4193853427895981, + "grad_norm": 2.9275078773498535, + "learning_rate": 4.951947986989606e-06, + "loss": 0.589, + "step": 887 + }, + { + "epoch": 0.4198581560283688, + "grad_norm": 2.740549325942993, + "learning_rate": 4.951826188965053e-06, + "loss": 0.5942, + "step": 888 + }, + { + "epoch": 0.4203309692671395, + "grad_norm": 2.92452073097229, + "learning_rate": 4.951704238276035e-06, + "loss": 0.6819, + "step": 889 + }, + { + "epoch": 0.42080378250591016, + "grad_norm": 2.842491865158081, + "learning_rate": 4.951582134930144e-06, + "loss": 0.6304, + "step": 890 + }, + { + "epoch": 0.42127659574468085, + "grad_norm": 2.613478422164917, + "learning_rate": 4.951459878934983e-06, + "loss": 0.6912, + "step": 891 + }, + { + "epoch": 0.42174940898345153, + "grad_norm": 3.2408607006073, + "learning_rate": 4.951337470298165e-06, + "loss": 0.6755, + "step": 892 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 3.1022439002990723, + "learning_rate": 4.9512149090273125e-06, + "loss": 0.6138, + "step": 893 + }, + { + "epoch": 0.4226950354609929, + "grad_norm": 2.6418895721435547, + "learning_rate": 4.951092195130055e-06, + "loss": 0.639, + "step": 894 + }, + { + "epoch": 0.4231678486997636, + "grad_norm": 3.010744333267212, + "learning_rate": 4.950969328614035e-06, + "loss": 0.7102, + "step": 895 + }, + { + "epoch": 0.4236406619385343, + "grad_norm": 2.673292636871338, + "learning_rate": 4.950846309486901e-06, + "loss": 0.5676, + "step": 896 + }, + { + "epoch": 0.42411347517730497, + "grad_norm": 3.6974737644195557, + "learning_rate": 4.950723137756314e-06, + "loss": 0.5722, + "step": 897 + }, + { + "epoch": 0.42458628841607565, + "grad_norm": 3.69028902053833, + "learning_rate": 4.9505998134299435e-06, + "loss": 0.6337, + "step": 898 + }, + { + "epoch": 0.42505910165484634, + "grad_norm": 3.2136125564575195, + "learning_rate": 4.950476336515469e-06, + "loss": 0.6469, + "step": 899 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.7396016120910645, + "learning_rate": 4.950352707020577e-06, + "loss": 0.6656, + "step": 900 + }, + { + "epoch": 0.4260047281323877, + "grad_norm": 2.825416088104248, + "learning_rate": 4.950228924952967e-06, + "loss": 0.6298, + "step": 901 + }, + { + "epoch": 
0.4264775413711584, + "grad_norm": 3.401658535003662, + "learning_rate": 4.950104990320345e-06, + "loss": 0.778, + "step": 902 + }, + { + "epoch": 0.4269503546099291, + "grad_norm": 2.7002272605895996, + "learning_rate": 4.9499809031304294e-06, + "loss": 0.6536, + "step": 903 + }, + { + "epoch": 0.42742316784869977, + "grad_norm": 2.62386417388916, + "learning_rate": 4.949856663390945e-06, + "loss": 0.6629, + "step": 904 + }, + { + "epoch": 0.42789598108747046, + "grad_norm": 2.584247589111328, + "learning_rate": 4.94973227110963e-06, + "loss": 0.5813, + "step": 905 + }, + { + "epoch": 0.42836879432624114, + "grad_norm": 3.4365768432617188, + "learning_rate": 4.9496077262942265e-06, + "loss": 0.7648, + "step": 906 + }, + { + "epoch": 0.42884160756501183, + "grad_norm": 2.8993639945983887, + "learning_rate": 4.949483028952492e-06, + "loss": 0.6696, + "step": 907 + }, + { + "epoch": 0.4293144208037825, + "grad_norm": 2.922809362411499, + "learning_rate": 4.94935817909219e-06, + "loss": 0.6892, + "step": 908 + }, + { + "epoch": 0.4297872340425532, + "grad_norm": 2.85478138923645, + "learning_rate": 4.9492331767210944e-06, + "loss": 0.536, + "step": 909 + }, + { + "epoch": 0.4302600472813239, + "grad_norm": 2.8639259338378906, + "learning_rate": 4.949108021846988e-06, + "loss": 0.634, + "step": 910 + }, + { + "epoch": 0.4307328605200946, + "grad_norm": 3.0533697605133057, + "learning_rate": 4.948982714477664e-06, + "loss": 0.6318, + "step": 911 + }, + { + "epoch": 0.43120567375886526, + "grad_norm": 2.331674814224243, + "learning_rate": 4.9488572546209255e-06, + "loss": 0.6562, + "step": 912 + }, + { + "epoch": 0.43167848699763595, + "grad_norm": 3.0154623985290527, + "learning_rate": 4.9487316422845835e-06, + "loss": 0.6675, + "step": 913 + }, + { + "epoch": 0.43215130023640663, + "grad_norm": 2.7354514598846436, + "learning_rate": 4.948605877476459e-06, + "loss": 0.6012, + "step": 914 + }, + { + "epoch": 0.4326241134751773, + "grad_norm": 2.863736629486084, + "learning_rate": 4.948479960204383e-06, + "loss": 0.6062, + "step": 915 + }, + { + "epoch": 0.433096926713948, + "grad_norm": 3.01998233795166, + "learning_rate": 4.948353890476197e-06, + "loss": 0.6749, + "step": 916 + }, + { + "epoch": 0.4335697399527187, + "grad_norm": 2.7550456523895264, + "learning_rate": 4.94822766829975e-06, + "loss": 0.6507, + "step": 917 + }, + { + "epoch": 0.4340425531914894, + "grad_norm": 3.370572805404663, + "learning_rate": 4.948101293682901e-06, + "loss": 0.714, + "step": 918 + }, + { + "epoch": 0.43451536643026006, + "grad_norm": 2.9736790657043457, + "learning_rate": 4.947974766633519e-06, + "loss": 0.729, + "step": 919 + }, + { + "epoch": 0.43498817966903075, + "grad_norm": 3.1036548614501953, + "learning_rate": 4.947848087159483e-06, + "loss": 0.7547, + "step": 920 + }, + { + "epoch": 0.43546099290780144, + "grad_norm": 2.895094871520996, + "learning_rate": 4.947721255268679e-06, + "loss": 0.6089, + "step": 921 + }, + { + "epoch": 0.4359338061465721, + "grad_norm": 2.798476219177246, + "learning_rate": 4.947594270969005e-06, + "loss": 0.5432, + "step": 922 + }, + { + "epoch": 0.4364066193853428, + "grad_norm": 2.7675702571868896, + "learning_rate": 4.94746713426837e-06, + "loss": 0.5693, + "step": 923 + }, + { + "epoch": 0.4368794326241135, + "grad_norm": 2.6851553916931152, + "learning_rate": 4.947339845174687e-06, + "loss": 0.6503, + "step": 924 + }, + { + "epoch": 0.4373522458628842, + "grad_norm": 2.909635543823242, + "learning_rate": 4.947212403695883e-06, + "loss": 0.6494, + "step": 925 + }, + { 
+ "epoch": 0.43782505910165487, + "grad_norm": 2.604526996612549, + "learning_rate": 4.947084809839894e-06, + "loss": 0.6349, + "step": 926 + }, + { + "epoch": 0.43829787234042555, + "grad_norm": 3.118149518966675, + "learning_rate": 4.946957063614664e-06, + "loss": 0.6219, + "step": 927 + }, + { + "epoch": 0.43877068557919624, + "grad_norm": 2.7452616691589355, + "learning_rate": 4.9468291650281465e-06, + "loss": 0.6096, + "step": 928 + }, + { + "epoch": 0.4392434988179669, + "grad_norm": 3.30098819732666, + "learning_rate": 4.946701114088307e-06, + "loss": 0.6277, + "step": 929 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 2.789482593536377, + "learning_rate": 4.946572910803116e-06, + "loss": 0.7, + "step": 930 + }, + { + "epoch": 0.4401891252955083, + "grad_norm": 2.7283935546875, + "learning_rate": 4.946444555180559e-06, + "loss": 0.5375, + "step": 931 + }, + { + "epoch": 0.440661938534279, + "grad_norm": 3.101304054260254, + "learning_rate": 4.946316047228627e-06, + "loss": 0.6131, + "step": 932 + }, + { + "epoch": 0.44113475177304967, + "grad_norm": 3.573908805847168, + "learning_rate": 4.946187386955321e-06, + "loss": 0.7073, + "step": 933 + }, + { + "epoch": 0.44160756501182036, + "grad_norm": 3.214979648590088, + "learning_rate": 4.946058574368653e-06, + "loss": 0.6508, + "step": 934 + }, + { + "epoch": 0.44208037825059104, + "grad_norm": 3.145082712173462, + "learning_rate": 4.945929609476643e-06, + "loss": 0.64, + "step": 935 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 2.991780996322632, + "learning_rate": 4.945800492287321e-06, + "loss": 0.6315, + "step": 936 + }, + { + "epoch": 0.44302600472813236, + "grad_norm": 3.2441139221191406, + "learning_rate": 4.945671222808727e-06, + "loss": 0.7144, + "step": 937 + }, + { + "epoch": 0.44349881796690305, + "grad_norm": 2.9397029876708984, + "learning_rate": 4.94554180104891e-06, + "loss": 0.6818, + "step": 938 + }, + { + "epoch": 0.44397163120567373, + "grad_norm": 3.2471461296081543, + "learning_rate": 4.945412227015929e-06, + "loss": 0.6921, + "step": 939 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 3.0882487297058105, + "learning_rate": 4.945282500717851e-06, + "loss": 0.718, + "step": 940 + }, + { + "epoch": 0.4449172576832151, + "grad_norm": 2.6035783290863037, + "learning_rate": 4.945152622162753e-06, + "loss": 0.621, + "step": 941 + }, + { + "epoch": 0.4453900709219858, + "grad_norm": 2.83659029006958, + "learning_rate": 4.945022591358724e-06, + "loss": 0.6403, + "step": 942 + }, + { + "epoch": 0.4458628841607565, + "grad_norm": 2.824463129043579, + "learning_rate": 4.944892408313859e-06, + "loss": 0.6594, + "step": 943 + }, + { + "epoch": 0.44633569739952716, + "grad_norm": 2.753735065460205, + "learning_rate": 4.9447620730362645e-06, + "loss": 0.6116, + "step": 944 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 3.0659725666046143, + "learning_rate": 4.944631585534056e-06, + "loss": 0.5983, + "step": 945 + }, + { + "epoch": 0.44728132387706854, + "grad_norm": 2.969113349914551, + "learning_rate": 4.944500945815357e-06, + "loss": 0.6859, + "step": 946 + }, + { + "epoch": 0.4477541371158392, + "grad_norm": 2.810303211212158, + "learning_rate": 4.944370153888303e-06, + "loss": 0.7025, + "step": 947 + }, + { + "epoch": 0.4482269503546099, + "grad_norm": 3.027721643447876, + "learning_rate": 4.944239209761038e-06, + "loss": 0.7268, + "step": 948 + }, + { + "epoch": 0.4486997635933806, + "grad_norm": 2.661503314971924, + "learning_rate": 4.944108113441716e-06, + "loss": 0.6702, + "step": 949 + 
}, + { + "epoch": 0.4491725768321513, + "grad_norm": 2.738591432571411, + "learning_rate": 4.943976864938498e-06, + "loss": 0.6728, + "step": 950 + }, + { + "epoch": 0.44964539007092197, + "grad_norm": 3.447505474090576, + "learning_rate": 4.943845464259557e-06, + "loss": 0.6586, + "step": 951 + }, + { + "epoch": 0.45011820330969265, + "grad_norm": 3.0968854427337646, + "learning_rate": 4.943713911413075e-06, + "loss": 0.7666, + "step": 952 + }, + { + "epoch": 0.45059101654846334, + "grad_norm": 2.4113779067993164, + "learning_rate": 4.943582206407244e-06, + "loss": 0.6173, + "step": 953 + }, + { + "epoch": 0.451063829787234, + "grad_norm": 2.6357979774475098, + "learning_rate": 4.943450349250263e-06, + "loss": 0.5589, + "step": 954 + }, + { + "epoch": 0.4515366430260047, + "grad_norm": 2.9182233810424805, + "learning_rate": 4.9433183399503425e-06, + "loss": 0.6252, + "step": 955 + }, + { + "epoch": 0.4520094562647754, + "grad_norm": 2.832740306854248, + "learning_rate": 4.943186178515703e-06, + "loss": 0.6882, + "step": 956 + }, + { + "epoch": 0.4524822695035461, + "grad_norm": 2.9508981704711914, + "learning_rate": 4.943053864954574e-06, + "loss": 0.5722, + "step": 957 + }, + { + "epoch": 0.4529550827423168, + "grad_norm": 3.044729471206665, + "learning_rate": 4.9429213992751925e-06, + "loss": 0.6772, + "step": 958 + }, + { + "epoch": 0.45342789598108746, + "grad_norm": 2.606003522872925, + "learning_rate": 4.9427887814858075e-06, + "loss": 0.6445, + "step": 959 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 2.4634225368499756, + "learning_rate": 4.942656011594676e-06, + "loss": 0.6151, + "step": 960 + }, + { + "epoch": 0.45437352245862883, + "grad_norm": 2.8872334957122803, + "learning_rate": 4.942523089610066e-06, + "loss": 0.6255, + "step": 961 + }, + { + "epoch": 0.4548463356973995, + "grad_norm": 2.870605707168579, + "learning_rate": 4.942390015540253e-06, + "loss": 0.7481, + "step": 962 + }, + { + "epoch": 0.4553191489361702, + "grad_norm": 2.952680826187134, + "learning_rate": 4.942256789393524e-06, + "loss": 0.5556, + "step": 963 + }, + { + "epoch": 0.4557919621749409, + "grad_norm": 2.623680353164673, + "learning_rate": 4.9421234111781725e-06, + "loss": 0.6115, + "step": 964 + }, + { + "epoch": 0.4562647754137116, + "grad_norm": 2.6933600902557373, + "learning_rate": 4.941989880902505e-06, + "loss": 0.6102, + "step": 965 + }, + { + "epoch": 0.45673758865248226, + "grad_norm": 2.6047189235687256, + "learning_rate": 4.941856198574836e-06, + "loss": 0.612, + "step": 966 + }, + { + "epoch": 0.45721040189125295, + "grad_norm": 2.779186725616455, + "learning_rate": 4.9417223642034885e-06, + "loss": 0.5424, + "step": 967 + }, + { + "epoch": 0.45768321513002364, + "grad_norm": 2.6177165508270264, + "learning_rate": 4.941588377796795e-06, + "loss": 0.4661, + "step": 968 + }, + { + "epoch": 0.4581560283687943, + "grad_norm": 2.959676742553711, + "learning_rate": 4.941454239363101e-06, + "loss": 0.6966, + "step": 969 + }, + { + "epoch": 0.458628841607565, + "grad_norm": 2.9788379669189453, + "learning_rate": 4.941319948910756e-06, + "loss": 0.6181, + "step": 970 + }, + { + "epoch": 0.4591016548463357, + "grad_norm": 4.642750263214111, + "learning_rate": 4.941185506448122e-06, + "loss": 0.5602, + "step": 971 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 2.793002128601074, + "learning_rate": 4.941050911983572e-06, + "loss": 0.602, + "step": 972 + }, + { + "epoch": 0.46004728132387707, + "grad_norm": 2.6833035945892334, + "learning_rate": 4.9409161655254845e-06, + "loss": 
0.5549, + "step": 973 + }, + { + "epoch": 0.46052009456264775, + "grad_norm": 3.905032157897949, + "learning_rate": 4.94078126708225e-06, + "loss": 0.6335, + "step": 974 + }, + { + "epoch": 0.46099290780141844, + "grad_norm": 2.922609329223633, + "learning_rate": 4.94064621666227e-06, + "loss": 0.5839, + "step": 975 + }, + { + "epoch": 0.4614657210401891, + "grad_norm": 2.8277416229248047, + "learning_rate": 4.940511014273952e-06, + "loss": 0.629, + "step": 976 + }, + { + "epoch": 0.4619385342789598, + "grad_norm": 3.07511043548584, + "learning_rate": 4.940375659925714e-06, + "loss": 0.7058, + "step": 977 + }, + { + "epoch": 0.4624113475177305, + "grad_norm": 3.65043044090271, + "learning_rate": 4.940240153625984e-06, + "loss": 0.7174, + "step": 978 + }, + { + "epoch": 0.4628841607565012, + "grad_norm": 2.755167245864868, + "learning_rate": 4.9401044953832e-06, + "loss": 0.6548, + "step": 979 + }, + { + "epoch": 0.46335697399527187, + "grad_norm": 2.9881057739257812, + "learning_rate": 4.939968685205808e-06, + "loss": 0.6245, + "step": 980 + }, + { + "epoch": 0.46382978723404256, + "grad_norm": 2.9484212398529053, + "learning_rate": 4.939832723102266e-06, + "loss": 0.655, + "step": 981 + }, + { + "epoch": 0.46430260047281324, + "grad_norm": 2.898918628692627, + "learning_rate": 4.939696609081038e-06, + "loss": 0.6178, + "step": 982 + }, + { + "epoch": 0.46477541371158393, + "grad_norm": 2.7052435874938965, + "learning_rate": 4.9395603431506e-06, + "loss": 0.6393, + "step": 983 + }, + { + "epoch": 0.4652482269503546, + "grad_norm": 2.5610013008117676, + "learning_rate": 4.939423925319436e-06, + "loss": 0.4847, + "step": 984 + }, + { + "epoch": 0.4657210401891253, + "grad_norm": 3.229083299636841, + "learning_rate": 4.939287355596042e-06, + "loss": 0.6473, + "step": 985 + }, + { + "epoch": 0.466193853427896, + "grad_norm": 2.907097816467285, + "learning_rate": 4.9391506339889195e-06, + "loss": 0.652, + "step": 986 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 2.6929478645324707, + "learning_rate": 4.939013760506582e-06, + "loss": 0.6175, + "step": 987 + }, + { + "epoch": 0.46713947990543736, + "grad_norm": 3.414813280105591, + "learning_rate": 4.938876735157554e-06, + "loss": 0.7597, + "step": 988 + }, + { + "epoch": 0.46761229314420805, + "grad_norm": 3.297360420227051, + "learning_rate": 4.938739557950365e-06, + "loss": 0.6824, + "step": 989 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 3.083155393600464, + "learning_rate": 4.938602228893557e-06, + "loss": 0.6505, + "step": 990 + }, + { + "epoch": 0.4685579196217494, + "grad_norm": 2.9781153202056885, + "learning_rate": 4.938464747995681e-06, + "loss": 0.666, + "step": 991 + }, + { + "epoch": 0.4690307328605201, + "grad_norm": 3.1494534015655518, + "learning_rate": 4.9383271152652975e-06, + "loss": 0.6422, + "step": 992 + }, + { + "epoch": 0.4695035460992908, + "grad_norm": 2.547868490219116, + "learning_rate": 4.938189330710976e-06, + "loss": 0.5766, + "step": 993 + }, + { + "epoch": 0.4699763593380615, + "grad_norm": 2.684736967086792, + "learning_rate": 4.938051394341297e-06, + "loss": 0.6407, + "step": 994 + }, + { + "epoch": 0.47044917257683216, + "grad_norm": 2.9619693756103516, + "learning_rate": 4.937913306164847e-06, + "loss": 0.6936, + "step": 995 + }, + { + "epoch": 0.47092198581560285, + "grad_norm": 2.9698498249053955, + "learning_rate": 4.937775066190227e-06, + "loss": 0.6464, + "step": 996 + }, + { + "epoch": 0.47139479905437354, + "grad_norm": 3.121049642562866, + "learning_rate": 4.937636674426042e-06, + 
"loss": 0.6383, + "step": 997 + }, + { + "epoch": 0.4718676122931442, + "grad_norm": 3.113672971725464, + "learning_rate": 4.93749813088091e-06, + "loss": 0.6892, + "step": 998 + }, + { + "epoch": 0.4723404255319149, + "grad_norm": 3.126113176345825, + "learning_rate": 4.937359435563458e-06, + "loss": 0.6728, + "step": 999 + }, + { + "epoch": 0.4728132387706856, + "grad_norm": 3.353966236114502, + "learning_rate": 4.937220588482321e-06, + "loss": 0.6041, + "step": 1000 + }, + { + "epoch": 0.4732860520094563, + "grad_norm": 2.8860628604888916, + "learning_rate": 4.937081589646144e-06, + "loss": 0.6798, + "step": 1001 + }, + { + "epoch": 0.47375886524822697, + "grad_norm": 3.0510590076446533, + "learning_rate": 4.936942439063584e-06, + "loss": 0.5841, + "step": 1002 + }, + { + "epoch": 0.47423167848699765, + "grad_norm": 2.6998369693756104, + "learning_rate": 4.936803136743303e-06, + "loss": 0.6403, + "step": 1003 + }, + { + "epoch": 0.47470449172576834, + "grad_norm": 2.875347137451172, + "learning_rate": 4.9366636826939765e-06, + "loss": 0.5811, + "step": 1004 + }, + { + "epoch": 0.475177304964539, + "grad_norm": 2.9122262001037598, + "learning_rate": 4.936524076924287e-06, + "loss": 0.6852, + "step": 1005 + }, + { + "epoch": 0.4756501182033097, + "grad_norm": 2.5167057514190674, + "learning_rate": 4.9363843194429265e-06, + "loss": 0.5367, + "step": 1006 + }, + { + "epoch": 0.4761229314420804, + "grad_norm": 2.5745551586151123, + "learning_rate": 4.9362444102585985e-06, + "loss": 0.6241, + "step": 1007 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 2.5024216175079346, + "learning_rate": 4.9361043493800125e-06, + "loss": 0.6133, + "step": 1008 + }, + { + "epoch": 0.47706855791962177, + "grad_norm": 2.7281384468078613, + "learning_rate": 4.935964136815892e-06, + "loss": 0.6834, + "step": 1009 + }, + { + "epoch": 0.47754137115839246, + "grad_norm": 3.0118913650512695, + "learning_rate": 4.935823772574965e-06, + "loss": 0.6922, + "step": 1010 + }, + { + "epoch": 0.47801418439716314, + "grad_norm": 3.016216993331909, + "learning_rate": 4.935683256665973e-06, + "loss": 0.6653, + "step": 1011 + }, + { + "epoch": 0.47848699763593383, + "grad_norm": 2.9526784420013428, + "learning_rate": 4.9355425890976636e-06, + "loss": 0.6423, + "step": 1012 + }, + { + "epoch": 0.4789598108747045, + "grad_norm": 6.222797393798828, + "learning_rate": 4.9354017698787985e-06, + "loss": 0.5884, + "step": 1013 + }, + { + "epoch": 0.4794326241134752, + "grad_norm": 2.6553597450256348, + "learning_rate": 4.935260799018143e-06, + "loss": 0.6624, + "step": 1014 + }, + { + "epoch": 0.4799054373522459, + "grad_norm": 3.0942065715789795, + "learning_rate": 4.935119676524475e-06, + "loss": 0.6623, + "step": 1015 + }, + { + "epoch": 0.4803782505910166, + "grad_norm": 2.626359224319458, + "learning_rate": 4.934978402406585e-06, + "loss": 0.6195, + "step": 1016 + }, + { + "epoch": 0.4808510638297872, + "grad_norm": 2.7954699993133545, + "learning_rate": 4.934836976673265e-06, + "loss": 0.5545, + "step": 1017 + }, + { + "epoch": 0.4813238770685579, + "grad_norm": 2.913557291030884, + "learning_rate": 4.934695399333324e-06, + "loss": 0.6288, + "step": 1018 + }, + { + "epoch": 0.4817966903073286, + "grad_norm": 3.1043739318847656, + "learning_rate": 4.9345536703955746e-06, + "loss": 0.6771, + "step": 1019 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 2.789357900619507, + "learning_rate": 4.934411789868845e-06, + "loss": 0.6227, + "step": 1020 + }, + { + "epoch": 0.48274231678486995, + "grad_norm": 
2.480609655380249, + "learning_rate": 4.934269757761967e-06, + "loss": 0.5779, + "step": 1021 + }, + { + "epoch": 0.48321513002364064, + "grad_norm": 2.7946252822875977, + "learning_rate": 4.934127574083785e-06, + "loss": 0.6166, + "step": 1022 + }, + { + "epoch": 0.4836879432624113, + "grad_norm": 3.0670509338378906, + "learning_rate": 4.933985238843153e-06, + "loss": 0.7766, + "step": 1023 + }, + { + "epoch": 0.484160756501182, + "grad_norm": 2.8567559719085693, + "learning_rate": 4.933842752048932e-06, + "loss": 0.5088, + "step": 1024 + }, + { + "epoch": 0.4846335697399527, + "grad_norm": 2.5674657821655273, + "learning_rate": 4.933700113709996e-06, + "loss": 0.6036, + "step": 1025 + }, + { + "epoch": 0.4851063829787234, + "grad_norm": 2.782339096069336, + "learning_rate": 4.933557323835224e-06, + "loss": 0.5335, + "step": 1026 + }, + { + "epoch": 0.48557919621749407, + "grad_norm": 2.6334071159362793, + "learning_rate": 4.93341438243351e-06, + "loss": 0.6327, + "step": 1027 + }, + { + "epoch": 0.48605200945626476, + "grad_norm": 3.0853965282440186, + "learning_rate": 4.933271289513751e-06, + "loss": 0.7102, + "step": 1028 + }, + { + "epoch": 0.48652482269503544, + "grad_norm": 2.619997501373291, + "learning_rate": 4.933128045084859e-06, + "loss": 0.6138, + "step": 1029 + }, + { + "epoch": 0.48699763593380613, + "grad_norm": 2.8316116333007812, + "learning_rate": 4.932984649155753e-06, + "loss": 0.6346, + "step": 1030 + }, + { + "epoch": 0.4874704491725768, + "grad_norm": 3.153486490249634, + "learning_rate": 4.932841101735361e-06, + "loss": 0.7626, + "step": 1031 + }, + { + "epoch": 0.4879432624113475, + "grad_norm": 3.1831274032592773, + "learning_rate": 4.9326974028326214e-06, + "loss": 0.6607, + "step": 1032 + }, + { + "epoch": 0.4884160756501182, + "grad_norm": 2.791078567504883, + "learning_rate": 4.932553552456481e-06, + "loss": 0.6141, + "step": 1033 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 2.627263307571411, + "learning_rate": 4.932409550615898e-06, + "loss": 0.6777, + "step": 1034 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.8550007343292236, + "learning_rate": 4.932265397319838e-06, + "loss": 0.6379, + "step": 1035 + }, + { + "epoch": 0.48983451536643025, + "grad_norm": 4.505824089050293, + "learning_rate": 4.932121092577276e-06, + "loss": 0.5892, + "step": 1036 + }, + { + "epoch": 0.49030732860520093, + "grad_norm": 3.100191116333008, + "learning_rate": 4.931976636397199e-06, + "loss": 0.6443, + "step": 1037 + }, + { + "epoch": 0.4907801418439716, + "grad_norm": 2.921494245529175, + "learning_rate": 4.9318320287886e-06, + "loss": 0.6821, + "step": 1038 + }, + { + "epoch": 0.4912529550827423, + "grad_norm": 4.577807903289795, + "learning_rate": 4.931687269760485e-06, + "loss": 0.5946, + "step": 1039 + }, + { + "epoch": 0.491725768321513, + "grad_norm": 2.7347636222839355, + "learning_rate": 4.931542359321865e-06, + "loss": 0.5689, + "step": 1040 + }, + { + "epoch": 0.4921985815602837, + "grad_norm": 2.5289158821105957, + "learning_rate": 4.931397297481765e-06, + "loss": 0.5632, + "step": 1041 + }, + { + "epoch": 0.49267139479905436, + "grad_norm": 3.3518471717834473, + "learning_rate": 4.9312520842492165e-06, + "loss": 0.6349, + "step": 1042 + }, + { + "epoch": 0.49314420803782505, + "grad_norm": 3.0469748973846436, + "learning_rate": 4.931106719633261e-06, + "loss": 0.5734, + "step": 1043 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 3.104682445526123, + "learning_rate": 4.930961203642951e-06, + "loss": 0.6101, + "step": 1044 + }, + { + 
"epoch": 0.4940898345153664, + "grad_norm": 2.776705503463745, + "learning_rate": 4.930815536287346e-06, + "loss": 0.6397, + "step": 1045 + }, + { + "epoch": 0.4945626477541371, + "grad_norm": 2.760380983352661, + "learning_rate": 4.930669717575516e-06, + "loss": 0.668, + "step": 1046 + }, + { + "epoch": 0.4950354609929078, + "grad_norm": 2.70084547996521, + "learning_rate": 4.930523747516541e-06, + "loss": 0.5729, + "step": 1047 + }, + { + "epoch": 0.4955082742316785, + "grad_norm": 2.7319583892822266, + "learning_rate": 4.930377626119511e-06, + "loss": 0.6258, + "step": 1048 + }, + { + "epoch": 0.49598108747044917, + "grad_norm": 3.2515223026275635, + "learning_rate": 4.930231353393521e-06, + "loss": 0.7412, + "step": 1049 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 3.0646486282348633, + "learning_rate": 4.930084929347682e-06, + "loss": 0.5809, + "step": 1050 + }, + { + "epoch": 0.49692671394799054, + "grad_norm": 3.1621921062469482, + "learning_rate": 4.9299383539911096e-06, + "loss": 0.6282, + "step": 1051 + }, + { + "epoch": 0.4973995271867612, + "grad_norm": 2.864713191986084, + "learning_rate": 4.929791627332931e-06, + "loss": 0.6263, + "step": 1052 + }, + { + "epoch": 0.4978723404255319, + "grad_norm": 3.181016683578491, + "learning_rate": 4.929644749382283e-06, + "loss": 0.5697, + "step": 1053 + }, + { + "epoch": 0.4983451536643026, + "grad_norm": 2.9064836502075195, + "learning_rate": 4.929497720148309e-06, + "loss": 0.6161, + "step": 1054 + }, + { + "epoch": 0.4988179669030733, + "grad_norm": 3.058112859725952, + "learning_rate": 4.9293505396401655e-06, + "loss": 0.6477, + "step": 1055 + }, + { + "epoch": 0.49929078014184397, + "grad_norm": 2.5227596759796143, + "learning_rate": 4.929203207867016e-06, + "loss": 0.5819, + "step": 1056 + }, + { + "epoch": 0.49976359338061466, + "grad_norm": 3.386862277984619, + "learning_rate": 4.929055724838035e-06, + "loss": 0.7342, + "step": 1057 + }, + { + "epoch": 0.5002364066193853, + "grad_norm": 3.368346929550171, + "learning_rate": 4.928908090562404e-06, + "loss": 0.6622, + "step": 1058 + }, + { + "epoch": 0.500709219858156, + "grad_norm": 2.9108314514160156, + "learning_rate": 4.928760305049317e-06, + "loss": 0.6598, + "step": 1059 + }, + { + "epoch": 0.5011820330969267, + "grad_norm": 2.822305917739868, + "learning_rate": 4.928612368307977e-06, + "loss": 0.5841, + "step": 1060 + }, + { + "epoch": 0.5016548463356973, + "grad_norm": 2.689131259918213, + "learning_rate": 4.928464280347592e-06, + "loss": 0.6631, + "step": 1061 + }, + { + "epoch": 0.502127659574468, + "grad_norm": 3.337214946746826, + "learning_rate": 4.9283160411773864e-06, + "loss": 0.6105, + "step": 1062 + }, + { + "epoch": 0.5026004728132387, + "grad_norm": 3.035911798477173, + "learning_rate": 4.928167650806588e-06, + "loss": 0.6981, + "step": 1063 + }, + { + "epoch": 0.5030732860520094, + "grad_norm": 2.8820855617523193, + "learning_rate": 4.9280191092444375e-06, + "loss": 0.6408, + "step": 1064 + }, + { + "epoch": 0.5035460992907801, + "grad_norm": 3.080432415008545, + "learning_rate": 4.927870416500183e-06, + "loss": 0.6398, + "step": 1065 + }, + { + "epoch": 0.5040189125295508, + "grad_norm": 2.761612892150879, + "learning_rate": 4.927721572583084e-06, + "loss": 0.6126, + "step": 1066 + }, + { + "epoch": 0.5044917257683215, + "grad_norm": 2.8561882972717285, + "learning_rate": 4.927572577502408e-06, + "loss": 0.584, + "step": 1067 + }, + { + "epoch": 0.5049645390070922, + "grad_norm": 3.3386311531066895, + "learning_rate": 4.927423431267432e-06, + 
"loss": 0.6666, + "step": 1068 + }, + { + "epoch": 0.5054373522458628, + "grad_norm": 2.632906675338745, + "learning_rate": 4.927274133887443e-06, + "loss": 0.632, + "step": 1069 + }, + { + "epoch": 0.5059101654846335, + "grad_norm": 2.8737308979034424, + "learning_rate": 4.927124685371737e-06, + "loss": 0.6051, + "step": 1070 + }, + { + "epoch": 0.5063829787234042, + "grad_norm": 3.042222738265991, + "learning_rate": 4.926975085729619e-06, + "loss": 0.6954, + "step": 1071 + }, + { + "epoch": 0.5068557919621749, + "grad_norm": 3.3341481685638428, + "learning_rate": 4.926825334970404e-06, + "loss": 0.7148, + "step": 1072 + }, + { + "epoch": 0.5073286052009456, + "grad_norm": 2.7415387630462646, + "learning_rate": 4.926675433103418e-06, + "loss": 0.5456, + "step": 1073 + }, + { + "epoch": 0.5078014184397163, + "grad_norm": 2.7545325756073, + "learning_rate": 4.926525380137993e-06, + "loss": 0.6213, + "step": 1074 + }, + { + "epoch": 0.508274231678487, + "grad_norm": 2.9153690338134766, + "learning_rate": 4.926375176083472e-06, + "loss": 0.6466, + "step": 1075 + }, + { + "epoch": 0.5087470449172576, + "grad_norm": 4.210638523101807, + "learning_rate": 4.926224820949209e-06, + "loss": 0.6192, + "step": 1076 + }, + { + "epoch": 0.5092198581560283, + "grad_norm": 2.4357898235321045, + "learning_rate": 4.926074314744565e-06, + "loss": 0.594, + "step": 1077 + }, + { + "epoch": 0.509692671394799, + "grad_norm": 2.8004701137542725, + "learning_rate": 4.92592365747891e-06, + "loss": 0.6276, + "step": 1078 + }, + { + "epoch": 0.5101654846335697, + "grad_norm": 2.920675039291382, + "learning_rate": 4.925772849161628e-06, + "loss": 0.6043, + "step": 1079 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.791555404663086, + "learning_rate": 4.9256218898021055e-06, + "loss": 0.6837, + "step": 1080 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 3.1702463626861572, + "learning_rate": 4.925470779409746e-06, + "loss": 0.668, + "step": 1081 + }, + { + "epoch": 0.5115839243498818, + "grad_norm": 2.7149479389190674, + "learning_rate": 4.925319517993955e-06, + "loss": 0.5842, + "step": 1082 + }, + { + "epoch": 0.5120567375886524, + "grad_norm": 2.916311025619507, + "learning_rate": 4.925168105564153e-06, + "loss": 0.6893, + "step": 1083 + }, + { + "epoch": 0.5125295508274231, + "grad_norm": 2.917654514312744, + "learning_rate": 4.925016542129767e-06, + "loss": 0.6513, + "step": 1084 + }, + { + "epoch": 0.5130023640661938, + "grad_norm": 2.5568928718566895, + "learning_rate": 4.924864827700234e-06, + "loss": 0.6177, + "step": 1085 + }, + { + "epoch": 0.5134751773049645, + "grad_norm": 2.816720485687256, + "learning_rate": 4.924712962285001e-06, + "loss": 0.5833, + "step": 1086 + }, + { + "epoch": 0.5139479905437352, + "grad_norm": 2.6989188194274902, + "learning_rate": 4.9245609458935235e-06, + "loss": 0.6332, + "step": 1087 + }, + { + "epoch": 0.5144208037825059, + "grad_norm": 2.959599494934082, + "learning_rate": 4.924408778535268e-06, + "loss": 0.626, + "step": 1088 + }, + { + "epoch": 0.5148936170212766, + "grad_norm": 2.872814416885376, + "learning_rate": 4.924256460219708e-06, + "loss": 0.6407, + "step": 1089 + }, + { + "epoch": 0.5153664302600472, + "grad_norm": 2.6989097595214844, + "learning_rate": 4.924103990956329e-06, + "loss": 0.6391, + "step": 1090 + }, + { + "epoch": 0.5158392434988179, + "grad_norm": 2.986492156982422, + "learning_rate": 4.9239513707546235e-06, + "loss": 0.6911, + "step": 1091 + }, + { + "epoch": 0.5163120567375886, + "grad_norm": 3.069920301437378, + 
"learning_rate": 4.9237985996240954e-06, + "loss": 0.671, + "step": 1092 + }, + { + "epoch": 0.5167848699763593, + "grad_norm": 2.8214917182922363, + "learning_rate": 4.9236456775742555e-06, + "loss": 0.5885, + "step": 1093 + }, + { + "epoch": 0.51725768321513, + "grad_norm": 2.9416961669921875, + "learning_rate": 4.923492604614627e-06, + "loss": 0.6293, + "step": 1094 + }, + { + "epoch": 0.5177304964539007, + "grad_norm": 2.761780023574829, + "learning_rate": 4.923339380754741e-06, + "loss": 0.649, + "step": 1095 + }, + { + "epoch": 0.5182033096926714, + "grad_norm": 2.7648792266845703, + "learning_rate": 4.923186006004138e-06, + "loss": 0.5906, + "step": 1096 + }, + { + "epoch": 0.518676122931442, + "grad_norm": 3.5535428524017334, + "learning_rate": 4.923032480372367e-06, + "loss": 0.7138, + "step": 1097 + }, + { + "epoch": 0.5191489361702127, + "grad_norm": 2.6252479553222656, + "learning_rate": 4.922878803868988e-06, + "loss": 0.5499, + "step": 1098 + }, + { + "epoch": 0.5196217494089834, + "grad_norm": 2.901002883911133, + "learning_rate": 4.9227249765035715e-06, + "loss": 0.6991, + "step": 1099 + }, + { + "epoch": 0.5200945626477541, + "grad_norm": 2.621877431869507, + "learning_rate": 4.9225709982856925e-06, + "loss": 0.6269, + "step": 1100 + }, + { + "epoch": 0.5205673758865248, + "grad_norm": 2.872483015060425, + "learning_rate": 4.92241686922494e-06, + "loss": 0.6657, + "step": 1101 + }, + { + "epoch": 0.5210401891252955, + "grad_norm": 2.730447769165039, + "learning_rate": 4.922262589330912e-06, + "loss": 0.6061, + "step": 1102 + }, + { + "epoch": 0.5215130023640662, + "grad_norm": 2.646247386932373, + "learning_rate": 4.922108158613213e-06, + "loss": 0.5923, + "step": 1103 + }, + { + "epoch": 0.5219858156028369, + "grad_norm": 2.6488895416259766, + "learning_rate": 4.92195357708146e-06, + "loss": 0.6293, + "step": 1104 + }, + { + "epoch": 0.5224586288416075, + "grad_norm": 2.756338357925415, + "learning_rate": 4.921798844745278e-06, + "loss": 0.6374, + "step": 1105 + }, + { + "epoch": 0.5229314420803782, + "grad_norm": 3.1441280841827393, + "learning_rate": 4.921643961614301e-06, + "loss": 0.6652, + "step": 1106 + }, + { + "epoch": 0.5234042553191489, + "grad_norm": 3.050002098083496, + "learning_rate": 4.921488927698172e-06, + "loss": 0.6809, + "step": 1107 + }, + { + "epoch": 0.5238770685579196, + "grad_norm": 2.71750807762146, + "learning_rate": 4.921333743006547e-06, + "loss": 0.6266, + "step": 1108 + }, + { + "epoch": 0.5243498817966903, + "grad_norm": 2.8439245223999023, + "learning_rate": 4.921178407549086e-06, + "loss": 0.5663, + "step": 1109 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 3.0722241401672363, + "learning_rate": 4.921022921335464e-06, + "loss": 0.6791, + "step": 1110 + }, + { + "epoch": 0.5252955082742317, + "grad_norm": 3.4381656646728516, + "learning_rate": 4.920867284375358e-06, + "loss": 0.6687, + "step": 1111 + }, + { + "epoch": 0.5257683215130023, + "grad_norm": 2.819812774658203, + "learning_rate": 4.920711496678463e-06, + "loss": 0.6299, + "step": 1112 + }, + { + "epoch": 0.526241134751773, + "grad_norm": 3.6587414741516113, + "learning_rate": 4.9205555582544765e-06, + "loss": 0.7392, + "step": 1113 + }, + { + "epoch": 0.5267139479905437, + "grad_norm": 2.774296522140503, + "learning_rate": 4.920399469113109e-06, + "loss": 0.6652, + "step": 1114 + }, + { + "epoch": 0.5271867612293144, + "grad_norm": 2.7480580806732178, + "learning_rate": 4.920243229264081e-06, + "loss": 0.596, + "step": 1115 + }, + { + "epoch": 0.5276595744680851, + 
"grad_norm": 3.213057518005371, + "learning_rate": 4.920086838717119e-06, + "loss": 0.6986, + "step": 1116 + }, + { + "epoch": 0.5281323877068558, + "grad_norm": 2.940546989440918, + "learning_rate": 4.919930297481962e-06, + "loss": 0.6481, + "step": 1117 + }, + { + "epoch": 0.5286052009456265, + "grad_norm": 2.5970494747161865, + "learning_rate": 4.9197736055683555e-06, + "loss": 0.5658, + "step": 1118 + }, + { + "epoch": 0.5290780141843971, + "grad_norm": 4.49385404586792, + "learning_rate": 4.919616762986057e-06, + "loss": 0.605, + "step": 1119 + }, + { + "epoch": 0.5295508274231678, + "grad_norm": 2.971857786178589, + "learning_rate": 4.919459769744833e-06, + "loss": 0.6539, + "step": 1120 + }, + { + "epoch": 0.5300236406619385, + "grad_norm": 2.6192965507507324, + "learning_rate": 4.919302625854457e-06, + "loss": 0.6226, + "step": 1121 + }, + { + "epoch": 0.5304964539007092, + "grad_norm": 2.665088176727295, + "learning_rate": 4.919145331324716e-06, + "loss": 0.6647, + "step": 1122 + }, + { + "epoch": 0.5309692671394799, + "grad_norm": 2.612126111984253, + "learning_rate": 4.918987886165403e-06, + "loss": 0.6965, + "step": 1123 + }, + { + "epoch": 0.5314420803782506, + "grad_norm": 3.80017352104187, + "learning_rate": 4.9188302903863205e-06, + "loss": 0.7396, + "step": 1124 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 2.781752824783325, + "learning_rate": 4.918672543997282e-06, + "loss": 0.5985, + "step": 1125 + }, + { + "epoch": 0.532387706855792, + "grad_norm": 2.6067914962768555, + "learning_rate": 4.91851464700811e-06, + "loss": 0.6159, + "step": 1126 + }, + { + "epoch": 0.5328605200945626, + "grad_norm": 2.670807123184204, + "learning_rate": 4.918356599428636e-06, + "loss": 0.5958, + "step": 1127 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 2.608611822128296, + "learning_rate": 4.9181984012687e-06, + "loss": 0.5768, + "step": 1128 + }, + { + "epoch": 0.533806146572104, + "grad_norm": 2.586764097213745, + "learning_rate": 4.918040052538154e-06, + "loss": 0.661, + "step": 1129 + }, + { + "epoch": 0.5342789598108747, + "grad_norm": 3.1317451000213623, + "learning_rate": 4.917881553246856e-06, + "loss": 0.6626, + "step": 1130 + }, + { + "epoch": 0.5347517730496454, + "grad_norm": 2.7135281562805176, + "learning_rate": 4.917722903404676e-06, + "loss": 0.6572, + "step": 1131 + }, + { + "epoch": 0.5352245862884161, + "grad_norm": 3.4546358585357666, + "learning_rate": 4.917564103021493e-06, + "loss": 0.5597, + "step": 1132 + }, + { + "epoch": 0.5356973995271868, + "grad_norm": 3.0943493843078613, + "learning_rate": 4.917405152107193e-06, + "loss": 0.7258, + "step": 1133 + }, + { + "epoch": 0.5361702127659574, + "grad_norm": 2.6069352626800537, + "learning_rate": 4.917246050671674e-06, + "loss": 0.6209, + "step": 1134 + }, + { + "epoch": 0.5366430260047281, + "grad_norm": 2.584883689880371, + "learning_rate": 4.917086798724844e-06, + "loss": 0.658, + "step": 1135 + }, + { + "epoch": 0.5371158392434988, + "grad_norm": 3.001976490020752, + "learning_rate": 4.9169273962766166e-06, + "loss": 0.6306, + "step": 1136 + }, + { + "epoch": 0.5375886524822695, + "grad_norm": 2.5013928413391113, + "learning_rate": 4.916767843336918e-06, + "loss": 0.572, + "step": 1137 + }, + { + "epoch": 0.5380614657210402, + "grad_norm": 2.9114553928375244, + "learning_rate": 4.916608139915684e-06, + "loss": 0.5841, + "step": 1138 + }, + { + "epoch": 0.5385342789598109, + "grad_norm": 2.8878467082977295, + "learning_rate": 4.9164482860228564e-06, + "loss": 0.6654, + "step": 1139 + }, + { + 
"epoch": 0.5390070921985816, + "grad_norm": 2.9827866554260254, + "learning_rate": 4.91628828166839e-06, + "loss": 0.6674, + "step": 1140 + }, + { + "epoch": 0.5394799054373522, + "grad_norm": 3.8696281909942627, + "learning_rate": 4.916128126862248e-06, + "loss": 0.6241, + "step": 1141 + }, + { + "epoch": 0.5399527186761229, + "grad_norm": 2.9556291103363037, + "learning_rate": 4.915967821614402e-06, + "loss": 0.6478, + "step": 1142 + }, + { + "epoch": 0.5404255319148936, + "grad_norm": 2.392942428588867, + "learning_rate": 4.915807365934834e-06, + "loss": 0.6097, + "step": 1143 + }, + { + "epoch": 0.5408983451536643, + "grad_norm": 3.032235860824585, + "learning_rate": 4.915646759833534e-06, + "loss": 0.7193, + "step": 1144 + }, + { + "epoch": 0.541371158392435, + "grad_norm": 2.840416193008423, + "learning_rate": 4.915486003320501e-06, + "loss": 0.5506, + "step": 1145 + }, + { + "epoch": 0.5418439716312057, + "grad_norm": 2.5438895225524902, + "learning_rate": 4.915325096405747e-06, + "loss": 0.6487, + "step": 1146 + }, + { + "epoch": 0.5423167848699764, + "grad_norm": 2.544334650039673, + "learning_rate": 4.9151640390992905e-06, + "loss": 0.6168, + "step": 1147 + }, + { + "epoch": 0.542789598108747, + "grad_norm": 2.8535678386688232, + "learning_rate": 4.91500283141116e-06, + "loss": 0.678, + "step": 1148 + }, + { + "epoch": 0.5432624113475177, + "grad_norm": 2.8086955547332764, + "learning_rate": 4.9148414733513915e-06, + "loss": 0.6473, + "step": 1149 + }, + { + "epoch": 0.5437352245862884, + "grad_norm": 2.4709885120391846, + "learning_rate": 4.914679964930034e-06, + "loss": 0.6797, + "step": 1150 + }, + { + "epoch": 0.5442080378250591, + "grad_norm": 2.8546934127807617, + "learning_rate": 4.9145183061571435e-06, + "loss": 0.6247, + "step": 1151 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 2.991184711456299, + "learning_rate": 4.9143564970427844e-06, + "loss": 0.5977, + "step": 1152 + }, + { + "epoch": 0.5451536643026005, + "grad_norm": 3.011216402053833, + "learning_rate": 4.914194537597033e-06, + "loss": 0.7005, + "step": 1153 + }, + { + "epoch": 0.5456264775413712, + "grad_norm": 2.807521343231201, + "learning_rate": 4.9140324278299744e-06, + "loss": 0.5412, + "step": 1154 + }, + { + "epoch": 0.5460992907801419, + "grad_norm": 3.0401229858398438, + "learning_rate": 4.913870167751701e-06, + "loss": 0.6394, + "step": 1155 + }, + { + "epoch": 0.5465721040189125, + "grad_norm": 2.853914976119995, + "learning_rate": 4.913707757372317e-06, + "loss": 0.6745, + "step": 1156 + }, + { + "epoch": 0.5470449172576832, + "grad_norm": 4.505620956420898, + "learning_rate": 4.913545196701935e-06, + "loss": 0.6668, + "step": 1157 + }, + { + "epoch": 0.5475177304964539, + "grad_norm": 3.0505781173706055, + "learning_rate": 4.913382485750676e-06, + "loss": 0.6926, + "step": 1158 + }, + { + "epoch": 0.5479905437352246, + "grad_norm": 2.798435688018799, + "learning_rate": 4.913219624528672e-06, + "loss": 0.605, + "step": 1159 + }, + { + "epoch": 0.5484633569739953, + "grad_norm": 2.7814908027648926, + "learning_rate": 4.913056613046065e-06, + "loss": 0.6678, + "step": 1160 + }, + { + "epoch": 0.548936170212766, + "grad_norm": 3.2089321613311768, + "learning_rate": 4.9128934513130025e-06, + "loss": 0.5995, + "step": 1161 + }, + { + "epoch": 0.5494089834515367, + "grad_norm": 2.7699952125549316, + "learning_rate": 4.9127301393396455e-06, + "loss": 0.7062, + "step": 1162 + }, + { + "epoch": 0.5498817966903073, + "grad_norm": 2.859368324279785, + "learning_rate": 4.912566677136162e-06, + "loss": 
0.6063, + "step": 1163 + }, + { + "epoch": 0.550354609929078, + "grad_norm": 2.727334499359131, + "learning_rate": 4.91240306471273e-06, + "loss": 0.6848, + "step": 1164 + }, + { + "epoch": 0.5508274231678487, + "grad_norm": 2.6017510890960693, + "learning_rate": 4.912239302079537e-06, + "loss": 0.5808, + "step": 1165 + }, + { + "epoch": 0.5513002364066194, + "grad_norm": 3.539583206176758, + "learning_rate": 4.912075389246781e-06, + "loss": 0.7053, + "step": 1166 + }, + { + "epoch": 0.5517730496453901, + "grad_norm": 2.918280601501465, + "learning_rate": 4.911911326224666e-06, + "loss": 0.5904, + "step": 1167 + }, + { + "epoch": 0.5522458628841608, + "grad_norm": 3.0067362785339355, + "learning_rate": 4.9117471130234095e-06, + "loss": 0.6392, + "step": 1168 + }, + { + "epoch": 0.5527186761229315, + "grad_norm": 2.4374797344207764, + "learning_rate": 4.911582749653236e-06, + "loss": 0.5793, + "step": 1169 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 3.121182918548584, + "learning_rate": 4.911418236124378e-06, + "loss": 0.6636, + "step": 1170 + }, + { + "epoch": 0.5536643026004728, + "grad_norm": 3.1289851665496826, + "learning_rate": 4.91125357244708e-06, + "loss": 0.656, + "step": 1171 + }, + { + "epoch": 0.5541371158392435, + "grad_norm": 2.7034592628479004, + "learning_rate": 4.911088758631596e-06, + "loss": 0.6001, + "step": 1172 + }, + { + "epoch": 0.5546099290780142, + "grad_norm": 2.710146188735962, + "learning_rate": 4.910923794688187e-06, + "loss": 0.6007, + "step": 1173 + }, + { + "epoch": 0.5550827423167849, + "grad_norm": 2.5424487590789795, + "learning_rate": 4.910758680627124e-06, + "loss": 0.5193, + "step": 1174 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.615893602371216, + "learning_rate": 4.91059341645869e-06, + "loss": 0.5525, + "step": 1175 + }, + { + "epoch": 0.5560283687943263, + "grad_norm": 3.3179728984832764, + "learning_rate": 4.910428002193174e-06, + "loss": 0.7285, + "step": 1176 + }, + { + "epoch": 0.556501182033097, + "grad_norm": 2.7234175205230713, + "learning_rate": 4.910262437840875e-06, + "loss": 0.574, + "step": 1177 + }, + { + "epoch": 0.5569739952718676, + "grad_norm": 3.0416605472564697, + "learning_rate": 4.9100967234121034e-06, + "loss": 0.5623, + "step": 1178 + }, + { + "epoch": 0.5574468085106383, + "grad_norm": 3.067786455154419, + "learning_rate": 4.909930858917177e-06, + "loss": 0.6491, + "step": 1179 + }, + { + "epoch": 0.557919621749409, + "grad_norm": 3.0037379264831543, + "learning_rate": 4.909764844366422e-06, + "loss": 0.5696, + "step": 1180 + }, + { + "epoch": 0.5583924349881797, + "grad_norm": 2.966179609298706, + "learning_rate": 4.909598679770178e-06, + "loss": 0.6042, + "step": 1181 + }, + { + "epoch": 0.5588652482269504, + "grad_norm": 2.6000657081604004, + "learning_rate": 4.909432365138789e-06, + "loss": 0.5883, + "step": 1182 + }, + { + "epoch": 0.5593380614657211, + "grad_norm": 2.6794495582580566, + "learning_rate": 4.909265900482612e-06, + "loss": 0.6809, + "step": 1183 + }, + { + "epoch": 0.5598108747044918, + "grad_norm": 2.6765122413635254, + "learning_rate": 4.9090992858120115e-06, + "loss": 0.6601, + "step": 1184 + }, + { + "epoch": 0.5602836879432624, + "grad_norm": 2.6051928997039795, + "learning_rate": 4.908932521137363e-06, + "loss": 0.5946, + "step": 1185 + }, + { + "epoch": 0.5607565011820331, + "grad_norm": 3.0405542850494385, + "learning_rate": 4.908765606469048e-06, + "loss": 0.6998, + "step": 1186 + }, + { + "epoch": 0.5612293144208038, + "grad_norm": 2.7975668907165527, + "learning_rate": 
4.908598541817462e-06, + "loss": 0.6218, + "step": 1187 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 2.5367627143859863, + "learning_rate": 4.908431327193005e-06, + "loss": 0.6354, + "step": 1188 + }, + { + "epoch": 0.5621749408983452, + "grad_norm": 3.7939631938934326, + "learning_rate": 4.908263962606091e-06, + "loss": 0.6376, + "step": 1189 + }, + { + "epoch": 0.5626477541371159, + "grad_norm": 2.864079475402832, + "learning_rate": 4.908096448067139e-06, + "loss": 0.5485, + "step": 1190 + }, + { + "epoch": 0.5631205673758866, + "grad_norm": 2.7855563163757324, + "learning_rate": 4.9079287835865804e-06, + "loss": 0.6645, + "step": 1191 + }, + { + "epoch": 0.5635933806146572, + "grad_norm": 2.6156625747680664, + "learning_rate": 4.9077609691748556e-06, + "loss": 0.5751, + "step": 1192 + }, + { + "epoch": 0.5640661938534279, + "grad_norm": 3.0475659370422363, + "learning_rate": 4.907593004842412e-06, + "loss": 0.6739, + "step": 1193 + }, + { + "epoch": 0.5645390070921986, + "grad_norm": 2.9176738262176514, + "learning_rate": 4.9074248905997104e-06, + "loss": 0.6493, + "step": 1194 + }, + { + "epoch": 0.5650118203309693, + "grad_norm": 2.6168384552001953, + "learning_rate": 4.907256626457216e-06, + "loss": 0.6154, + "step": 1195 + }, + { + "epoch": 0.56548463356974, + "grad_norm": 2.893980026245117, + "learning_rate": 4.907088212425408e-06, + "loss": 0.5808, + "step": 1196 + }, + { + "epoch": 0.5659574468085107, + "grad_norm": 3.3832836151123047, + "learning_rate": 4.90691964851477e-06, + "loss": 0.7888, + "step": 1197 + }, + { + "epoch": 0.5664302600472814, + "grad_norm": 3.088932752609253, + "learning_rate": 4.906750934735801e-06, + "loss": 0.6516, + "step": 1198 + }, + { + "epoch": 0.566903073286052, + "grad_norm": 2.494471549987793, + "learning_rate": 4.906582071099004e-06, + "loss": 0.6286, + "step": 1199 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 2.716550588607788, + "learning_rate": 4.906413057614895e-06, + "loss": 0.5939, + "step": 1200 + }, + { + "epoch": 0.5678486997635934, + "grad_norm": 2.5821073055267334, + "learning_rate": 4.906243894293995e-06, + "loss": 0.6668, + "step": 1201 + }, + { + "epoch": 0.5683215130023641, + "grad_norm": 3.651787042617798, + "learning_rate": 4.90607458114684e-06, + "loss": 0.6124, + "step": 1202 + }, + { + "epoch": 0.5687943262411348, + "grad_norm": 2.7567858695983887, + "learning_rate": 4.9059051181839705e-06, + "loss": 0.6656, + "step": 1203 + }, + { + "epoch": 0.5692671394799055, + "grad_norm": 2.8067586421966553, + "learning_rate": 4.90573550541594e-06, + "loss": 0.6306, + "step": 1204 + }, + { + "epoch": 0.5697399527186762, + "grad_norm": 2.6136393547058105, + "learning_rate": 4.905565742853307e-06, + "loss": 0.5992, + "step": 1205 + }, + { + "epoch": 0.5702127659574469, + "grad_norm": 2.899049758911133, + "learning_rate": 4.905395830506644e-06, + "loss": 0.621, + "step": 1206 + }, + { + "epoch": 0.5706855791962175, + "grad_norm": 3.036583185195923, + "learning_rate": 4.9052257683865294e-06, + "loss": 0.652, + "step": 1207 + }, + { + "epoch": 0.5711583924349882, + "grad_norm": 2.7947216033935547, + "learning_rate": 4.905055556503553e-06, + "loss": 0.6636, + "step": 1208 + }, + { + "epoch": 0.5716312056737589, + "grad_norm": 3.1646955013275146, + "learning_rate": 4.9048851948683135e-06, + "loss": 0.6376, + "step": 1209 + }, + { + "epoch": 0.5721040189125296, + "grad_norm": 2.8175766468048096, + "learning_rate": 4.904714683491417e-06, + "loss": 0.5929, + "step": 1210 + }, + { + "epoch": 0.5725768321513003, + "grad_norm": 
2.923923969268799, + "learning_rate": 4.904544022383483e-06, + "loss": 0.6633, + "step": 1211 + }, + { + "epoch": 0.573049645390071, + "grad_norm": 2.7471134662628174, + "learning_rate": 4.9043732115551356e-06, + "loss": 0.6551, + "step": 1212 + }, + { + "epoch": 0.5735224586288417, + "grad_norm": 2.8660807609558105, + "learning_rate": 4.90420225101701e-06, + "loss": 0.6423, + "step": 1213 + }, + { + "epoch": 0.5739952718676123, + "grad_norm": 2.769247531890869, + "learning_rate": 4.904031140779754e-06, + "loss": 0.5982, + "step": 1214 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.9043145179748535, + "learning_rate": 4.90385988085402e-06, + "loss": 0.5843, + "step": 1215 + }, + { + "epoch": 0.5749408983451537, + "grad_norm": 2.6639609336853027, + "learning_rate": 4.903688471250471e-06, + "loss": 0.5858, + "step": 1216 + }, + { + "epoch": 0.5754137115839244, + "grad_norm": 2.6967573165893555, + "learning_rate": 4.903516911979781e-06, + "loss": 0.5755, + "step": 1217 + }, + { + "epoch": 0.5758865248226951, + "grad_norm": 2.8865857124328613, + "learning_rate": 4.903345203052633e-06, + "loss": 0.6051, + "step": 1218 + }, + { + "epoch": 0.5763593380614658, + "grad_norm": 2.381979465484619, + "learning_rate": 4.903173344479717e-06, + "loss": 0.5727, + "step": 1219 + }, + { + "epoch": 0.5768321513002365, + "grad_norm": 2.7717981338500977, + "learning_rate": 4.903001336271734e-06, + "loss": 0.6406, + "step": 1220 + }, + { + "epoch": 0.577304964539007, + "grad_norm": 2.6431570053100586, + "learning_rate": 4.902829178439395e-06, + "loss": 0.6226, + "step": 1221 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 2.8090415000915527, + "learning_rate": 4.902656870993419e-06, + "loss": 0.5761, + "step": 1222 + }, + { + "epoch": 0.5782505910165484, + "grad_norm": 2.4769368171691895, + "learning_rate": 4.902484413944535e-06, + "loss": 0.5602, + "step": 1223 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 2.693316698074341, + "learning_rate": 4.902311807303481e-06, + "loss": 0.5222, + "step": 1224 + }, + { + "epoch": 0.5791962174940898, + "grad_norm": 2.7623913288116455, + "learning_rate": 4.902139051081004e-06, + "loss": 0.6978, + "step": 1225 + }, + { + "epoch": 0.5796690307328605, + "grad_norm": 2.6133766174316406, + "learning_rate": 4.901966145287863e-06, + "loss": 0.5802, + "step": 1226 + }, + { + "epoch": 0.5801418439716312, + "grad_norm": 2.7345972061157227, + "learning_rate": 4.901793089934821e-06, + "loss": 0.6294, + "step": 1227 + }, + { + "epoch": 0.5806146572104018, + "grad_norm": 2.7545835971832275, + "learning_rate": 4.9016198850326555e-06, + "loss": 0.6085, + "step": 1228 + }, + { + "epoch": 0.5810874704491725, + "grad_norm": 2.6947758197784424, + "learning_rate": 4.90144653059215e-06, + "loss": 0.6025, + "step": 1229 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 2.692967414855957, + "learning_rate": 4.901273026624099e-06, + "loss": 0.5715, + "step": 1230 + }, + { + "epoch": 0.5820330969267139, + "grad_norm": 2.78347110748291, + "learning_rate": 4.901099373139307e-06, + "loss": 0.6063, + "step": 1231 + }, + { + "epoch": 0.5825059101654846, + "grad_norm": 2.346496343612671, + "learning_rate": 4.900925570148585e-06, + "loss": 0.5869, + "step": 1232 + }, + { + "epoch": 0.5829787234042553, + "grad_norm": 2.606639862060547, + "learning_rate": 4.900751617662755e-06, + "loss": 0.6197, + "step": 1233 + }, + { + "epoch": 0.583451536643026, + "grad_norm": 2.5825929641723633, + "learning_rate": 4.900577515692649e-06, + "loss": 0.6721, + "step": 1234 + }, + { + "epoch": 
0.5839243498817966, + "grad_norm": 2.731349468231201, + "learning_rate": 4.900403264249107e-06, + "loss": 0.6273, + "step": 1235 + }, + { + "epoch": 0.5843971631205673, + "grad_norm": 3.2133874893188477, + "learning_rate": 4.90022886334298e-06, + "loss": 0.6231, + "step": 1236 + }, + { + "epoch": 0.584869976359338, + "grad_norm": 2.9213852882385254, + "learning_rate": 4.900054312985127e-06, + "loss": 0.6677, + "step": 1237 + }, + { + "epoch": 0.5853427895981087, + "grad_norm": 2.815425157546997, + "learning_rate": 4.899879613186414e-06, + "loss": 0.6405, + "step": 1238 + }, + { + "epoch": 0.5858156028368794, + "grad_norm": 2.730782985687256, + "learning_rate": 4.899704763957721e-06, + "loss": 0.6233, + "step": 1239 + }, + { + "epoch": 0.5862884160756501, + "grad_norm": 2.6432766914367676, + "learning_rate": 4.899529765309936e-06, + "loss": 0.6267, + "step": 1240 + }, + { + "epoch": 0.5867612293144208, + "grad_norm": 2.616215229034424, + "learning_rate": 4.899354617253953e-06, + "loss": 0.6268, + "step": 1241 + }, + { + "epoch": 0.5872340425531914, + "grad_norm": 2.7630255222320557, + "learning_rate": 4.899179319800679e-06, + "loss": 0.6348, + "step": 1242 + }, + { + "epoch": 0.5877068557919621, + "grad_norm": 2.785095453262329, + "learning_rate": 4.899003872961029e-06, + "loss": 0.5839, + "step": 1243 + }, + { + "epoch": 0.5881796690307328, + "grad_norm": 2.9050328731536865, + "learning_rate": 4.898828276745927e-06, + "loss": 0.651, + "step": 1244 + }, + { + "epoch": 0.5886524822695035, + "grad_norm": 2.958092212677002, + "learning_rate": 4.8986525311663065e-06, + "loss": 0.6395, + "step": 1245 + }, + { + "epoch": 0.5891252955082742, + "grad_norm": 2.952310800552368, + "learning_rate": 4.898476636233111e-06, + "loss": 0.6731, + "step": 1246 + }, + { + "epoch": 0.5895981087470449, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.898300591957293e-06, + "loss": 0.7015, + "step": 1247 + }, + { + "epoch": 0.5900709219858156, + "grad_norm": 2.8941752910614014, + "learning_rate": 4.898124398349813e-06, + "loss": 0.6452, + "step": 1248 + }, + { + "epoch": 0.5905437352245863, + "grad_norm": 2.9809536933898926, + "learning_rate": 4.897948055421642e-06, + "loss": 0.5736, + "step": 1249 + }, + { + "epoch": 0.5910165484633569, + "grad_norm": 2.927046775817871, + "learning_rate": 4.897771563183761e-06, + "loss": 0.5918, + "step": 1250 + }, + { + "epoch": 0.5914893617021276, + "grad_norm": 2.865020275115967, + "learning_rate": 4.897594921647158e-06, + "loss": 0.6924, + "step": 1251 + }, + { + "epoch": 0.5919621749408983, + "grad_norm": 2.7406699657440186, + "learning_rate": 4.897418130822832e-06, + "loss": 0.509, + "step": 1252 + }, + { + "epoch": 0.592434988179669, + "grad_norm": 2.781606912612915, + "learning_rate": 4.897241190721791e-06, + "loss": 0.5555, + "step": 1253 + }, + { + "epoch": 0.5929078014184397, + "grad_norm": 2.79209303855896, + "learning_rate": 4.8970641013550535e-06, + "loss": 0.6722, + "step": 1254 + }, + { + "epoch": 0.5933806146572104, + "grad_norm": 3.0672268867492676, + "learning_rate": 4.896886862733645e-06, + "loss": 0.6366, + "step": 1255 + }, + { + "epoch": 0.5938534278959811, + "grad_norm": 2.7456953525543213, + "learning_rate": 4.896709474868602e-06, + "loss": 0.6246, + "step": 1256 + }, + { + "epoch": 0.5943262411347517, + "grad_norm": 3.6731202602386475, + "learning_rate": 4.896531937770968e-06, + "loss": 0.668, + "step": 1257 + }, + { + "epoch": 0.5947990543735224, + "grad_norm": 2.6056087017059326, + "learning_rate": 4.8963542514518e-06, + "loss": 0.5815, + 
"step": 1258 + }, + { + "epoch": 0.5952718676122931, + "grad_norm": 2.719698905944824, + "learning_rate": 4.89617641592216e-06, + "loss": 0.6058, + "step": 1259 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 2.625838279724121, + "learning_rate": 4.895998431193121e-06, + "loss": 0.6143, + "step": 1260 + }, + { + "epoch": 0.5962174940898345, + "grad_norm": 2.7166085243225098, + "learning_rate": 4.895820297275767e-06, + "loss": 0.5187, + "step": 1261 + }, + { + "epoch": 0.5966903073286052, + "grad_norm": 2.7544102668762207, + "learning_rate": 4.8956420141811875e-06, + "loss": 0.5928, + "step": 1262 + }, + { + "epoch": 0.5971631205673759, + "grad_norm": 2.6678333282470703, + "learning_rate": 4.895463581920484e-06, + "loss": 0.611, + "step": 1263 + }, + { + "epoch": 0.5976359338061465, + "grad_norm": 2.853384494781494, + "learning_rate": 4.895285000504768e-06, + "loss": 0.642, + "step": 1264 + }, + { + "epoch": 0.5981087470449172, + "grad_norm": 2.637852430343628, + "learning_rate": 4.895106269945158e-06, + "loss": 0.6308, + "step": 1265 + }, + { + "epoch": 0.5985815602836879, + "grad_norm": 2.9880387783050537, + "learning_rate": 4.8949273902527826e-06, + "loss": 0.5781, + "step": 1266 + }, + { + "epoch": 0.5990543735224586, + "grad_norm": 3.5984015464782715, + "learning_rate": 4.89474836143878e-06, + "loss": 0.5865, + "step": 1267 + }, + { + "epoch": 0.5995271867612293, + "grad_norm": 2.719855546951294, + "learning_rate": 4.8945691835142975e-06, + "loss": 0.6393, + "step": 1268 + }, + { + "epoch": 0.6, + "grad_norm": 2.7885141372680664, + "learning_rate": 4.894389856490492e-06, + "loss": 0.66, + "step": 1269 + }, + { + "epoch": 0.6004728132387707, + "grad_norm": 2.698819875717163, + "learning_rate": 4.894210380378529e-06, + "loss": 0.6144, + "step": 1270 + }, + { + "epoch": 0.6009456264775414, + "grad_norm": 2.278045654296875, + "learning_rate": 4.894030755189584e-06, + "loss": 0.5609, + "step": 1271 + }, + { + "epoch": 0.601418439716312, + "grad_norm": 2.8729357719421387, + "learning_rate": 4.893850980934841e-06, + "loss": 0.6715, + "step": 1272 + }, + { + "epoch": 0.6018912529550827, + "grad_norm": 2.8541221618652344, + "learning_rate": 4.893671057625495e-06, + "loss": 0.6787, + "step": 1273 + }, + { + "epoch": 0.6023640661938534, + "grad_norm": 2.4561476707458496, + "learning_rate": 4.893490985272748e-06, + "loss": 0.6331, + "step": 1274 + }, + { + "epoch": 0.6028368794326241, + "grad_norm": 2.565739154815674, + "learning_rate": 4.893310763887812e-06, + "loss": 0.587, + "step": 1275 + }, + { + "epoch": 0.6033096926713948, + "grad_norm": 2.384951591491699, + "learning_rate": 4.8931303934819095e-06, + "loss": 0.5358, + "step": 1276 + }, + { + "epoch": 0.6037825059101655, + "grad_norm": 2.380808115005493, + "learning_rate": 4.89294987406627e-06, + "loss": 0.5402, + "step": 1277 + }, + { + "epoch": 0.6042553191489362, + "grad_norm": 2.764815092086792, + "learning_rate": 4.892769205652136e-06, + "loss": 0.6103, + "step": 1278 + }, + { + "epoch": 0.6047281323877068, + "grad_norm": 2.463350296020508, + "learning_rate": 4.892588388250754e-06, + "loss": 0.5937, + "step": 1279 + }, + { + "epoch": 0.6052009456264775, + "grad_norm": 3.099689245223999, + "learning_rate": 4.8924074218733855e-06, + "loss": 0.6354, + "step": 1280 + }, + { + "epoch": 0.6056737588652482, + "grad_norm": 2.804450035095215, + "learning_rate": 4.892226306531297e-06, + "loss": 0.6595, + "step": 1281 + }, + { + "epoch": 0.6061465721040189, + "grad_norm": 3.1559767723083496, + "learning_rate": 4.892045042235765e-06, + 
"loss": 0.6664, + "step": 1282 + }, + { + "epoch": 0.6066193853427896, + "grad_norm": 2.844341993331909, + "learning_rate": 4.891863628998079e-06, + "loss": 0.7454, + "step": 1283 + }, + { + "epoch": 0.6070921985815603, + "grad_norm": 2.686602830886841, + "learning_rate": 4.891682066829532e-06, + "loss": 0.6755, + "step": 1284 + }, + { + "epoch": 0.607565011820331, + "grad_norm": 2.736457347869873, + "learning_rate": 4.8915003557414285e-06, + "loss": 0.6305, + "step": 1285 + }, + { + "epoch": 0.6080378250591016, + "grad_norm": 2.661362409591675, + "learning_rate": 4.891318495745086e-06, + "loss": 0.5958, + "step": 1286 + }, + { + "epoch": 0.6085106382978723, + "grad_norm": 2.707348108291626, + "learning_rate": 4.8911364868518255e-06, + "loss": 0.5824, + "step": 1287 + }, + { + "epoch": 0.608983451536643, + "grad_norm": 2.9798858165740967, + "learning_rate": 4.890954329072981e-06, + "loss": 0.5981, + "step": 1288 + }, + { + "epoch": 0.6094562647754137, + "grad_norm": 2.6285455226898193, + "learning_rate": 4.890772022419895e-06, + "loss": 0.6194, + "step": 1289 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 2.9254322052001953, + "learning_rate": 4.890589566903917e-06, + "loss": 0.6002, + "step": 1290 + }, + { + "epoch": 0.6104018912529551, + "grad_norm": 2.6458325386047363, + "learning_rate": 4.89040696253641e-06, + "loss": 0.5457, + "step": 1291 + }, + { + "epoch": 0.6108747044917258, + "grad_norm": 2.508242607116699, + "learning_rate": 4.890224209328743e-06, + "loss": 0.6168, + "step": 1292 + }, + { + "epoch": 0.6113475177304964, + "grad_norm": 3.034785509109497, + "learning_rate": 4.890041307292296e-06, + "loss": 0.664, + "step": 1293 + }, + { + "epoch": 0.6118203309692671, + "grad_norm": 3.52469539642334, + "learning_rate": 4.889858256438455e-06, + "loss": 0.7301, + "step": 1294 + }, + { + "epoch": 0.6122931442080378, + "grad_norm": 2.9145348072052, + "learning_rate": 4.889675056778622e-06, + "loss": 0.6494, + "step": 1295 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 2.831829071044922, + "learning_rate": 4.8894917083242e-06, + "loss": 0.6064, + "step": 1296 + }, + { + "epoch": 0.6132387706855792, + "grad_norm": 2.6883130073547363, + "learning_rate": 4.889308211086608e-06, + "loss": 0.5642, + "step": 1297 + }, + { + "epoch": 0.6137115839243499, + "grad_norm": 3.0605485439300537, + "learning_rate": 4.889124565077269e-06, + "loss": 0.6695, + "step": 1298 + }, + { + "epoch": 0.6141843971631206, + "grad_norm": 3.44062876701355, + "learning_rate": 4.88894077030762e-06, + "loss": 0.6415, + "step": 1299 + }, + { + "epoch": 0.6146572104018913, + "grad_norm": 2.5970818996429443, + "learning_rate": 4.888756826789105e-06, + "loss": 0.6518, + "step": 1300 + }, + { + "epoch": 0.6151300236406619, + "grad_norm": 4.2233567237854, + "learning_rate": 4.8885727345331755e-06, + "loss": 0.6555, + "step": 1301 + }, + { + "epoch": 0.6156028368794326, + "grad_norm": 2.645385503768921, + "learning_rate": 4.888388493551297e-06, + "loss": 0.6762, + "step": 1302 + }, + { + "epoch": 0.6160756501182033, + "grad_norm": 2.907954454421997, + "learning_rate": 4.8882041038549385e-06, + "loss": 0.6526, + "step": 1303 + }, + { + "epoch": 0.616548463356974, + "grad_norm": 2.482771873474121, + "learning_rate": 4.888019565455583e-06, + "loss": 0.628, + "step": 1304 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 2.7165915966033936, + "learning_rate": 4.88783487836472e-06, + "loss": 0.5743, + "step": 1305 + }, + { + "epoch": 0.6174940898345154, + "grad_norm": 3.095627546310425, + "learning_rate": 
4.88765004259385e-06, + "loss": 0.627, + "step": 1306 + }, + { + "epoch": 0.6179669030732861, + "grad_norm": 2.5018465518951416, + "learning_rate": 4.8874650581544805e-06, + "loss": 0.5215, + "step": 1307 + }, + { + "epoch": 0.6184397163120567, + "grad_norm": 3.094337224960327, + "learning_rate": 4.8872799250581316e-06, + "loss": 0.6979, + "step": 1308 + }, + { + "epoch": 0.6189125295508274, + "grad_norm": 3.1002209186553955, + "learning_rate": 4.887094643316329e-06, + "loss": 0.6565, + "step": 1309 + }, + { + "epoch": 0.6193853427895981, + "grad_norm": 2.551431894302368, + "learning_rate": 4.88690921294061e-06, + "loss": 0.5748, + "step": 1310 + }, + { + "epoch": 0.6198581560283688, + "grad_norm": 2.8282904624938965, + "learning_rate": 4.886723633942521e-06, + "loss": 0.676, + "step": 1311 + }, + { + "epoch": 0.6203309692671395, + "grad_norm": 2.8887810707092285, + "learning_rate": 4.886537906333617e-06, + "loss": 0.5971, + "step": 1312 + }, + { + "epoch": 0.6208037825059102, + "grad_norm": 2.9989118576049805, + "learning_rate": 4.886352030125462e-06, + "loss": 0.6341, + "step": 1313 + }, + { + "epoch": 0.6212765957446809, + "grad_norm": 2.8042776584625244, + "learning_rate": 4.886166005329629e-06, + "loss": 0.6578, + "step": 1314 + }, + { + "epoch": 0.6217494089834515, + "grad_norm": 2.4980967044830322, + "learning_rate": 4.8859798319577026e-06, + "loss": 0.6711, + "step": 1315 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 2.762369155883789, + "learning_rate": 4.885793510021274e-06, + "loss": 0.5747, + "step": 1316 + }, + { + "epoch": 0.6226950354609929, + "grad_norm": 3.136327028274536, + "learning_rate": 4.885607039531945e-06, + "loss": 0.7544, + "step": 1317 + }, + { + "epoch": 0.6231678486997636, + "grad_norm": 2.8736963272094727, + "learning_rate": 4.885420420501327e-06, + "loss": 0.6603, + "step": 1318 + }, + { + "epoch": 0.6236406619385343, + "grad_norm": 2.766237497329712, + "learning_rate": 4.885233652941039e-06, + "loss": 0.581, + "step": 1319 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 2.4740939140319824, + "learning_rate": 4.88504673686271e-06, + "loss": 0.6335, + "step": 1320 + }, + { + "epoch": 0.6245862884160757, + "grad_norm": 3.324795961380005, + "learning_rate": 4.884859672277978e-06, + "loss": 0.6019, + "step": 1321 + }, + { + "epoch": 0.6250591016548463, + "grad_norm": 3.521327257156372, + "learning_rate": 4.884672459198493e-06, + "loss": 0.6104, + "step": 1322 + }, + { + "epoch": 0.625531914893617, + "grad_norm": 2.7728071212768555, + "learning_rate": 4.884485097635909e-06, + "loss": 0.6714, + "step": 1323 + }, + { + "epoch": 0.6260047281323877, + "grad_norm": 3.0738155841827393, + "learning_rate": 4.884297587601895e-06, + "loss": 0.604, + "step": 1324 + }, + { + "epoch": 0.6264775413711584, + "grad_norm": 2.719240427017212, + "learning_rate": 4.884109929108124e-06, + "loss": 0.6795, + "step": 1325 + }, + { + "epoch": 0.6269503546099291, + "grad_norm": 2.4108200073242188, + "learning_rate": 4.883922122166282e-06, + "loss": 0.5846, + "step": 1326 + }, + { + "epoch": 0.6274231678486998, + "grad_norm": 2.393899917602539, + "learning_rate": 4.883734166788063e-06, + "loss": 0.6188, + "step": 1327 + }, + { + "epoch": 0.6278959810874705, + "grad_norm": 4.555255889892578, + "learning_rate": 4.883546062985169e-06, + "loss": 0.5962, + "step": 1328 + }, + { + "epoch": 0.6283687943262412, + "grad_norm": 2.571075439453125, + "learning_rate": 4.883357810769315e-06, + "loss": 0.6165, + "step": 1329 + }, + { + "epoch": 0.6288416075650118, + "grad_norm": 
2.553115129470825, + "learning_rate": 4.8831694101522185e-06, + "loss": 0.6787, + "step": 1330 + }, + { + "epoch": 0.6293144208037825, + "grad_norm": 3.2564642429351807, + "learning_rate": 4.882980861145614e-06, + "loss": 0.659, + "step": 1331 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 2.535216808319092, + "learning_rate": 4.882792163761241e-06, + "loss": 0.6176, + "step": 1332 + }, + { + "epoch": 0.6302600472813239, + "grad_norm": 3.097921848297119, + "learning_rate": 4.882603318010847e-06, + "loss": 0.6822, + "step": 1333 + }, + { + "epoch": 0.6307328605200946, + "grad_norm": 2.8135175704956055, + "learning_rate": 4.882414323906192e-06, + "loss": 0.6782, + "step": 1334 + }, + { + "epoch": 0.6312056737588653, + "grad_norm": 2.724634885787964, + "learning_rate": 4.882225181459044e-06, + "loss": 0.6545, + "step": 1335 + }, + { + "epoch": 0.631678486997636, + "grad_norm": 2.9585227966308594, + "learning_rate": 4.882035890681179e-06, + "loss": 0.6218, + "step": 1336 + }, + { + "epoch": 0.6321513002364066, + "grad_norm": 2.6952011585235596, + "learning_rate": 4.881846451584385e-06, + "loss": 0.6, + "step": 1337 + }, + { + "epoch": 0.6326241134751773, + "grad_norm": 3.1400704383850098, + "learning_rate": 4.881656864180455e-06, + "loss": 0.6687, + "step": 1338 + }, + { + "epoch": 0.633096926713948, + "grad_norm": 2.8382487297058105, + "learning_rate": 4.881467128481197e-06, + "loss": 0.574, + "step": 1339 + }, + { + "epoch": 0.6335697399527187, + "grad_norm": 2.8520095348358154, + "learning_rate": 4.881277244498422e-06, + "loss": 0.6582, + "step": 1340 + }, + { + "epoch": 0.6340425531914894, + "grad_norm": 2.703498363494873, + "learning_rate": 4.881087212243956e-06, + "loss": 0.7224, + "step": 1341 + }, + { + "epoch": 0.6345153664302601, + "grad_norm": 3.697205066680908, + "learning_rate": 4.880897031729629e-06, + "loss": 0.6582, + "step": 1342 + }, + { + "epoch": 0.6349881796690308, + "grad_norm": 2.7625808715820312, + "learning_rate": 4.880706702967284e-06, + "loss": 0.574, + "step": 1343 + }, + { + "epoch": 0.6354609929078014, + "grad_norm": 2.949984073638916, + "learning_rate": 4.880516225968771e-06, + "loss": 0.66, + "step": 1344 + }, + { + "epoch": 0.6359338061465721, + "grad_norm": 2.548269748687744, + "learning_rate": 4.8803256007459525e-06, + "loss": 0.642, + "step": 1345 + }, + { + "epoch": 0.6364066193853428, + "grad_norm": 2.5102174282073975, + "learning_rate": 4.8801348273106945e-06, + "loss": 0.6238, + "step": 1346 + }, + { + "epoch": 0.6368794326241135, + "grad_norm": 2.9847946166992188, + "learning_rate": 4.8799439056748786e-06, + "loss": 0.5416, + "step": 1347 + }, + { + "epoch": 0.6373522458628842, + "grad_norm": 2.8711049556732178, + "learning_rate": 4.879752835850391e-06, + "loss": 0.6427, + "step": 1348 + }, + { + "epoch": 0.6378250591016549, + "grad_norm": 2.7901716232299805, + "learning_rate": 4.879561617849129e-06, + "loss": 0.6026, + "step": 1349 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 2.659778356552124, + "learning_rate": 4.879370251682999e-06, + "loss": 0.6623, + "step": 1350 + }, + { + "epoch": 0.6387706855791963, + "grad_norm": 3.224386692047119, + "learning_rate": 4.879178737363917e-06, + "loss": 0.6485, + "step": 1351 + }, + { + "epoch": 0.6392434988179669, + "grad_norm": 2.6385605335235596, + "learning_rate": 4.8789870749038076e-06, + "loss": 0.5866, + "step": 1352 + }, + { + "epoch": 0.6397163120567376, + "grad_norm": 2.807713270187378, + "learning_rate": 4.8787952643146045e-06, + "loss": 0.6537, + "step": 1353 + }, + { + "epoch": 
0.6401891252955083, + "grad_norm": 2.5689280033111572, + "learning_rate": 4.878603305608251e-06, + "loss": 0.6216, + "step": 1354 + }, + { + "epoch": 0.640661938534279, + "grad_norm": 2.7347843647003174, + "learning_rate": 4.8784111987967e-06, + "loss": 0.6318, + "step": 1355 + }, + { + "epoch": 0.6411347517730497, + "grad_norm": 2.5210378170013428, + "learning_rate": 4.878218943891911e-06, + "loss": 0.5472, + "step": 1356 + }, + { + "epoch": 0.6416075650118204, + "grad_norm": 2.866785764694214, + "learning_rate": 4.878026540905858e-06, + "loss": 0.7108, + "step": 1357 + }, + { + "epoch": 0.642080378250591, + "grad_norm": 2.923314332962036, + "learning_rate": 4.877833989850519e-06, + "loss": 0.5557, + "step": 1358 + }, + { + "epoch": 0.6425531914893617, + "grad_norm": 2.925463914871216, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.6382, + "step": 1359 + }, + { + "epoch": 0.6430260047281324, + "grad_norm": 2.909644365310669, + "learning_rate": 4.877448443579952e-06, + "loss": 0.5603, + "step": 1360 + }, + { + "epoch": 0.6434988179669031, + "grad_norm": 3.501148223876953, + "learning_rate": 4.8772554483887306e-06, + "loss": 0.6722, + "step": 1361 + }, + { + "epoch": 0.6439716312056738, + "grad_norm": 2.823765516281128, + "learning_rate": 4.877062305176235e-06, + "loss": 0.6408, + "step": 1362 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 2.9807584285736084, + "learning_rate": 4.8768690139544935e-06, + "loss": 0.5984, + "step": 1363 + }, + { + "epoch": 0.6449172576832152, + "grad_norm": 2.8411378860473633, + "learning_rate": 4.8766755747355405e-06, + "loss": 0.6231, + "step": 1364 + }, + { + "epoch": 0.6453900709219859, + "grad_norm": 3.158952236175537, + "learning_rate": 4.8764819875314215e-06, + "loss": 0.6441, + "step": 1365 + }, + { + "epoch": 0.6458628841607565, + "grad_norm": 2.9614369869232178, + "learning_rate": 4.876288252354189e-06, + "loss": 0.6308, + "step": 1366 + }, + { + "epoch": 0.6463356973995272, + "grad_norm": 3.073805570602417, + "learning_rate": 4.876094369215907e-06, + "loss": 0.6046, + "step": 1367 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 2.719189405441284, + "learning_rate": 4.875900338128648e-06, + "loss": 0.6082, + "step": 1368 + }, + { + "epoch": 0.6472813238770686, + "grad_norm": 2.676726818084717, + "learning_rate": 4.8757061591044914e-06, + "loss": 0.6344, + "step": 1369 + }, + { + "epoch": 0.6477541371158393, + "grad_norm": 2.955256938934326, + "learning_rate": 4.87551183215553e-06, + "loss": 0.6506, + "step": 1370 + }, + { + "epoch": 0.64822695035461, + "grad_norm": 2.5672218799591064, + "learning_rate": 4.875317357293864e-06, + "loss": 0.5284, + "step": 1371 + }, + { + "epoch": 0.6486997635933807, + "grad_norm": 2.5860238075256348, + "learning_rate": 4.875122734531602e-06, + "loss": 0.667, + "step": 1372 + }, + { + "epoch": 0.6491725768321513, + "grad_norm": 3.1037003993988037, + "learning_rate": 4.8749279638808605e-06, + "loss": 0.6902, + "step": 1373 + }, + { + "epoch": 0.649645390070922, + "grad_norm": 2.7715282440185547, + "learning_rate": 4.874733045353769e-06, + "loss": 0.6291, + "step": 1374 + }, + { + "epoch": 0.6501182033096927, + "grad_norm": 2.527071475982666, + "learning_rate": 4.874537978962463e-06, + "loss": 0.5565, + "step": 1375 + }, + { + "epoch": 0.6505910165484634, + "grad_norm": 2.722092628479004, + "learning_rate": 4.874342764719091e-06, + "loss": 0.5724, + "step": 1376 + }, + { + "epoch": 0.6510638297872341, + "grad_norm": 2.6342411041259766, + "learning_rate": 4.874147402635805e-06, + "loss": 0.6308, + 
"step": 1377 + }, + { + "epoch": 0.6515366430260048, + "grad_norm": 2.3850719928741455, + "learning_rate": 4.8739518927247695e-06, + "loss": 0.5692, + "step": 1378 + }, + { + "epoch": 0.6520094562647755, + "grad_norm": 2.9787259101867676, + "learning_rate": 4.873756234998161e-06, + "loss": 0.6953, + "step": 1379 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 2.634141683578491, + "learning_rate": 4.873560429468159e-06, + "loss": 0.6077, + "step": 1380 + }, + { + "epoch": 0.6529550827423168, + "grad_norm": 2.803046941757202, + "learning_rate": 4.873364476146958e-06, + "loss": 0.6657, + "step": 1381 + }, + { + "epoch": 0.6534278959810875, + "grad_norm": 2.762827157974243, + "learning_rate": 4.8731683750467574e-06, + "loss": 0.6061, + "step": 1382 + }, + { + "epoch": 0.6539007092198581, + "grad_norm": 2.6654391288757324, + "learning_rate": 4.872972126179768e-06, + "loss": 0.6387, + "step": 1383 + }, + { + "epoch": 0.6543735224586288, + "grad_norm": 2.4363625049591064, + "learning_rate": 4.872775729558209e-06, + "loss": 0.5623, + "step": 1384 + }, + { + "epoch": 0.6548463356973995, + "grad_norm": 2.528959035873413, + "learning_rate": 4.87257918519431e-06, + "loss": 0.5609, + "step": 1385 + }, + { + "epoch": 0.6553191489361702, + "grad_norm": 2.718383312225342, + "learning_rate": 4.872382493100309e-06, + "loss": 0.5575, + "step": 1386 + }, + { + "epoch": 0.6557919621749408, + "grad_norm": 2.660841226577759, + "learning_rate": 4.872185653288453e-06, + "loss": 0.6106, + "step": 1387 + }, + { + "epoch": 0.6562647754137115, + "grad_norm": 2.508753538131714, + "learning_rate": 4.871988665770997e-06, + "loss": 0.5705, + "step": 1388 + }, + { + "epoch": 0.6567375886524822, + "grad_norm": 2.5134334564208984, + "learning_rate": 4.871791530560208e-06, + "loss": 0.5592, + "step": 1389 + }, + { + "epoch": 0.6572104018912529, + "grad_norm": 2.7475597858428955, + "learning_rate": 4.871594247668361e-06, + "loss": 0.6277, + "step": 1390 + }, + { + "epoch": 0.6576832151300236, + "grad_norm": 2.793616533279419, + "learning_rate": 4.871396817107739e-06, + "loss": 0.595, + "step": 1391 + }, + { + "epoch": 0.6581560283687943, + "grad_norm": 2.8285086154937744, + "learning_rate": 4.871199238890635e-06, + "loss": 0.6094, + "step": 1392 + }, + { + "epoch": 0.658628841607565, + "grad_norm": 2.74124813079834, + "learning_rate": 4.871001513029352e-06, + "loss": 0.6296, + "step": 1393 + }, + { + "epoch": 0.6591016548463356, + "grad_norm": 2.761237621307373, + "learning_rate": 4.870803639536202e-06, + "loss": 0.5702, + "step": 1394 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 2.761038064956665, + "learning_rate": 4.870605618423504e-06, + "loss": 0.6195, + "step": 1395 + }, + { + "epoch": 0.660047281323877, + "grad_norm": 2.8812482357025146, + "learning_rate": 4.870407449703589e-06, + "loss": 0.616, + "step": 1396 + }, + { + "epoch": 0.6605200945626477, + "grad_norm": 2.9966578483581543, + "learning_rate": 4.870209133388797e-06, + "loss": 0.6547, + "step": 1397 + }, + { + "epoch": 0.6609929078014184, + "grad_norm": 2.7969017028808594, + "learning_rate": 4.870010669491474e-06, + "loss": 0.5762, + "step": 1398 + }, + { + "epoch": 0.6614657210401891, + "grad_norm": 2.557783842086792, + "learning_rate": 4.86981205802398e-06, + "loss": 0.6184, + "step": 1399 + }, + { + "epoch": 0.6619385342789598, + "grad_norm": 2.5393927097320557, + "learning_rate": 4.86961329899868e-06, + "loss": 0.5953, + "step": 1400 + }, + { + "epoch": 0.6624113475177305, + "grad_norm": 2.7745981216430664, + "learning_rate": 
4.86941439242795e-06, + "loss": 0.5967, + "step": 1401 + }, + { + "epoch": 0.6628841607565011, + "grad_norm": 2.650381326675415, + "learning_rate": 4.869215338324176e-06, + "loss": 0.5667, + "step": 1402 + }, + { + "epoch": 0.6633569739952718, + "grad_norm": 2.583169937133789, + "learning_rate": 4.869016136699751e-06, + "loss": 0.549, + "step": 1403 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 2.984978437423706, + "learning_rate": 4.868816787567079e-06, + "loss": 0.5931, + "step": 1404 + }, + { + "epoch": 0.6643026004728132, + "grad_norm": 3.1947181224823, + "learning_rate": 4.868617290938573e-06, + "loss": 0.5473, + "step": 1405 + }, + { + "epoch": 0.6647754137115839, + "grad_norm": 2.562927007675171, + "learning_rate": 4.868417646826654e-06, + "loss": 0.6878, + "step": 1406 + }, + { + "epoch": 0.6652482269503546, + "grad_norm": 2.8741261959075928, + "learning_rate": 4.868217855243754e-06, + "loss": 0.6312, + "step": 1407 + }, + { + "epoch": 0.6657210401891253, + "grad_norm": 2.9834797382354736, + "learning_rate": 4.868017916202312e-06, + "loss": 0.5624, + "step": 1408 + }, + { + "epoch": 0.6661938534278959, + "grad_norm": 2.6935982704162598, + "learning_rate": 4.8678178297147785e-06, + "loss": 0.5857, + "step": 1409 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.8200576305389404, + "learning_rate": 4.86761759579361e-06, + "loss": 0.6153, + "step": 1410 + }, + { + "epoch": 0.6671394799054373, + "grad_norm": 2.831425189971924, + "learning_rate": 4.867417214451276e-06, + "loss": 0.6495, + "step": 1411 + }, + { + "epoch": 0.667612293144208, + "grad_norm": 2.733565092086792, + "learning_rate": 4.867216685700253e-06, + "loss": 0.6036, + "step": 1412 + }, + { + "epoch": 0.6680851063829787, + "grad_norm": 3.0609400272369385, + "learning_rate": 4.867016009553027e-06, + "loss": 0.6773, + "step": 1413 + }, + { + "epoch": 0.6685579196217494, + "grad_norm": 2.665452241897583, + "learning_rate": 4.866815186022093e-06, + "loss": 0.6256, + "step": 1414 + }, + { + "epoch": 0.6690307328605201, + "grad_norm": 2.9480721950531006, + "learning_rate": 4.866614215119956e-06, + "loss": 0.535, + "step": 1415 + }, + { + "epoch": 0.6695035460992907, + "grad_norm": 2.5514180660247803, + "learning_rate": 4.866413096859128e-06, + "loss": 0.6588, + "step": 1416 + }, + { + "epoch": 0.6699763593380614, + "grad_norm": 3.3442373275756836, + "learning_rate": 4.866211831252134e-06, + "loss": 0.5754, + "step": 1417 + }, + { + "epoch": 0.6704491725768321, + "grad_norm": 2.521467685699463, + "learning_rate": 4.866010418311504e-06, + "loss": 0.5546, + "step": 1418 + }, + { + "epoch": 0.6709219858156028, + "grad_norm": 2.930706262588501, + "learning_rate": 4.865808858049781e-06, + "loss": 0.589, + "step": 1419 + }, + { + "epoch": 0.6713947990543735, + "grad_norm": 2.6298375129699707, + "learning_rate": 4.865607150479513e-06, + "loss": 0.5915, + "step": 1420 + }, + { + "epoch": 0.6718676122931442, + "grad_norm": 2.9554293155670166, + "learning_rate": 4.8654052956132615e-06, + "loss": 0.6654, + "step": 1421 + }, + { + "epoch": 0.6723404255319149, + "grad_norm": 3.2706902027130127, + "learning_rate": 4.865203293463593e-06, + "loss": 0.7115, + "step": 1422 + }, + { + "epoch": 0.6728132387706856, + "grad_norm": 3.041539430618286, + "learning_rate": 4.865001144043088e-06, + "loss": 0.5818, + "step": 1423 + }, + { + "epoch": 0.6732860520094562, + "grad_norm": 3.1314544677734375, + "learning_rate": 4.864798847364331e-06, + "loss": 0.5822, + "step": 1424 + }, + { + "epoch": 0.6737588652482269, + "grad_norm": 
2.5301461219787598, + "learning_rate": 4.86459640343992e-06, + "loss": 0.5525, + "step": 1425 + }, + { + "epoch": 0.6742316784869976, + "grad_norm": 2.809295892715454, + "learning_rate": 4.864393812282458e-06, + "loss": 0.6768, + "step": 1426 + }, + { + "epoch": 0.6747044917257683, + "grad_norm": 2.794664144515991, + "learning_rate": 4.864191073904562e-06, + "loss": 0.5793, + "step": 1427 + }, + { + "epoch": 0.675177304964539, + "grad_norm": 2.7771105766296387, + "learning_rate": 4.863988188318854e-06, + "loss": 0.6453, + "step": 1428 + }, + { + "epoch": 0.6756501182033097, + "grad_norm": 2.6431946754455566, + "learning_rate": 4.863785155537967e-06, + "loss": 0.5877, + "step": 1429 + }, + { + "epoch": 0.6761229314420804, + "grad_norm": 2.951353073120117, + "learning_rate": 4.863581975574544e-06, + "loss": 0.6793, + "step": 1430 + }, + { + "epoch": 0.676595744680851, + "grad_norm": 3.1336071491241455, + "learning_rate": 4.863378648441235e-06, + "loss": 0.6695, + "step": 1431 + }, + { + "epoch": 0.6770685579196217, + "grad_norm": 2.735982656478882, + "learning_rate": 4.8631751741507e-06, + "loss": 0.5239, + "step": 1432 + }, + { + "epoch": 0.6775413711583924, + "grad_norm": 2.7085206508636475, + "learning_rate": 4.862971552715611e-06, + "loss": 0.6837, + "step": 1433 + }, + { + "epoch": 0.6780141843971631, + "grad_norm": 3.136528730392456, + "learning_rate": 4.8627677841486436e-06, + "loss": 0.683, + "step": 1434 + }, + { + "epoch": 0.6784869976359338, + "grad_norm": 2.7879369258880615, + "learning_rate": 4.862563868462486e-06, + "loss": 0.608, + "step": 1435 + }, + { + "epoch": 0.6789598108747045, + "grad_norm": 2.7937729358673096, + "learning_rate": 4.862359805669837e-06, + "loss": 0.6131, + "step": 1436 + }, + { + "epoch": 0.6794326241134752, + "grad_norm": 2.5988364219665527, + "learning_rate": 4.862155595783401e-06, + "loss": 0.6303, + "step": 1437 + }, + { + "epoch": 0.6799054373522458, + "grad_norm": 3.251070499420166, + "learning_rate": 4.861951238815894e-06, + "loss": 0.7246, + "step": 1438 + }, + { + "epoch": 0.6803782505910165, + "grad_norm": 2.646759271621704, + "learning_rate": 4.861746734780039e-06, + "loss": 0.6313, + "step": 1439 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 2.773866891860962, + "learning_rate": 4.861542083688573e-06, + "loss": 0.6463, + "step": 1440 + }, + { + "epoch": 0.6813238770685579, + "grad_norm": 2.759965658187866, + "learning_rate": 4.861337285554235e-06, + "loss": 0.5428, + "step": 1441 + }, + { + "epoch": 0.6817966903073286, + "grad_norm": 3.3250818252563477, + "learning_rate": 4.861132340389779e-06, + "loss": 0.6522, + "step": 1442 + }, + { + "epoch": 0.6822695035460993, + "grad_norm": 2.661797523498535, + "learning_rate": 4.860927248207965e-06, + "loss": 0.5871, + "step": 1443 + }, + { + "epoch": 0.68274231678487, + "grad_norm": 2.706289052963257, + "learning_rate": 4.860722009021563e-06, + "loss": 0.6651, + "step": 1444 + }, + { + "epoch": 0.6832151300236406, + "grad_norm": 2.8459298610687256, + "learning_rate": 4.860516622843354e-06, + "loss": 0.5827, + "step": 1445 + }, + { + "epoch": 0.6836879432624113, + "grad_norm": 3.1041831970214844, + "learning_rate": 4.860311089686125e-06, + "loss": 0.6727, + "step": 1446 + }, + { + "epoch": 0.684160756501182, + "grad_norm": 2.9382801055908203, + "learning_rate": 4.8601054095626746e-06, + "loss": 0.6002, + "step": 1447 + }, + { + "epoch": 0.6846335697399527, + "grad_norm": 2.782475471496582, + "learning_rate": 4.859899582485808e-06, + "loss": 0.6951, + "step": 1448 + }, + { + "epoch": 
0.6851063829787234, + "grad_norm": 3.313894510269165, + "learning_rate": 4.859693608468343e-06, + "loss": 0.6363, + "step": 1449 + }, + { + "epoch": 0.6855791962174941, + "grad_norm": 3.1639695167541504, + "learning_rate": 4.8594874875231045e-06, + "loss": 0.7002, + "step": 1450 + }, + { + "epoch": 0.6860520094562648, + "grad_norm": 2.6762218475341797, + "learning_rate": 4.859281219662926e-06, + "loss": 0.6246, + "step": 1451 + }, + { + "epoch": 0.6865248226950355, + "grad_norm": 2.8368663787841797, + "learning_rate": 4.85907480490065e-06, + "loss": 0.5906, + "step": 1452 + }, + { + "epoch": 0.6869976359338061, + "grad_norm": 2.887373208999634, + "learning_rate": 4.858868243249131e-06, + "loss": 0.5931, + "step": 1453 + }, + { + "epoch": 0.6874704491725768, + "grad_norm": 2.8115322589874268, + "learning_rate": 4.858661534721229e-06, + "loss": 0.6337, + "step": 1454 + }, + { + "epoch": 0.6879432624113475, + "grad_norm": 2.8470499515533447, + "learning_rate": 4.8584546793298174e-06, + "loss": 0.632, + "step": 1455 + }, + { + "epoch": 0.6884160756501182, + "grad_norm": 2.8229613304138184, + "learning_rate": 4.8582476770877725e-06, + "loss": 0.6494, + "step": 1456 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 2.4235479831695557, + "learning_rate": 4.858040528007987e-06, + "loss": 0.5709, + "step": 1457 + }, + { + "epoch": 0.6893617021276596, + "grad_norm": 2.9348199367523193, + "learning_rate": 4.857833232103356e-06, + "loss": 0.5404, + "step": 1458 + }, + { + "epoch": 0.6898345153664303, + "grad_norm": 2.8274219036102295, + "learning_rate": 4.857625789386789e-06, + "loss": 0.701, + "step": 1459 + }, + { + "epoch": 0.6903073286052009, + "grad_norm": 3.136929988861084, + "learning_rate": 4.857418199871203e-06, + "loss": 0.6971, + "step": 1460 + }, + { + "epoch": 0.6907801418439716, + "grad_norm": 2.8987185955047607, + "learning_rate": 4.8572104635695214e-06, + "loss": 0.6613, + "step": 1461 + }, + { + "epoch": 0.6912529550827423, + "grad_norm": 2.5073442459106445, + "learning_rate": 4.857002580494681e-06, + "loss": 0.6032, + "step": 1462 + }, + { + "epoch": 0.691725768321513, + "grad_norm": 2.7019522190093994, + "learning_rate": 4.856794550659625e-06, + "loss": 0.567, + "step": 1463 + }, + { + "epoch": 0.6921985815602837, + "grad_norm": 2.4795594215393066, + "learning_rate": 4.8565863740773054e-06, + "loss": 0.5777, + "step": 1464 + }, + { + "epoch": 0.6926713947990544, + "grad_norm": 3.032506227493286, + "learning_rate": 4.856378050760687e-06, + "loss": 0.607, + "step": 1465 + }, + { + "epoch": 0.6931442080378251, + "grad_norm": 3.052091121673584, + "learning_rate": 4.85616958072274e-06, + "loss": 0.591, + "step": 1466 + }, + { + "epoch": 0.6936170212765957, + "grad_norm": 2.704831838607788, + "learning_rate": 4.855960963976443e-06, + "loss": 0.6528, + "step": 1467 + }, + { + "epoch": 0.6940898345153664, + "grad_norm": 2.680995225906372, + "learning_rate": 4.855752200534788e-06, + "loss": 0.6294, + "step": 1468 + }, + { + "epoch": 0.6945626477541371, + "grad_norm": 2.3948659896850586, + "learning_rate": 4.855543290410774e-06, + "loss": 0.6091, + "step": 1469 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 2.6407411098480225, + "learning_rate": 4.855334233617407e-06, + "loss": 0.5572, + "step": 1470 + }, + { + "epoch": 0.6955082742316785, + "grad_norm": 2.5526835918426514, + "learning_rate": 4.8551250301677064e-06, + "loss": 0.5432, + "step": 1471 + }, + { + "epoch": 0.6959810874704492, + "grad_norm": 3.1237430572509766, + "learning_rate": 4.8549156800746965e-06, + "loss": 
0.5944, + "step": 1472 + }, + { + "epoch": 0.6964539007092199, + "grad_norm": 2.8112540245056152, + "learning_rate": 4.854706183351412e-06, + "loss": 0.604, + "step": 1473 + }, + { + "epoch": 0.6969267139479906, + "grad_norm": 2.664644479751587, + "learning_rate": 4.8544965400109e-06, + "loss": 0.5647, + "step": 1474 + }, + { + "epoch": 0.6973995271867612, + "grad_norm": 3.26310133934021, + "learning_rate": 4.854286750066212e-06, + "loss": 0.6999, + "step": 1475 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 2.9717442989349365, + "learning_rate": 4.8540768135304115e-06, + "loss": 0.6655, + "step": 1476 + }, + { + "epoch": 0.6983451536643026, + "grad_norm": 2.5302982330322266, + "learning_rate": 4.85386673041657e-06, + "loss": 0.6384, + "step": 1477 + }, + { + "epoch": 0.6988179669030733, + "grad_norm": 2.864877700805664, + "learning_rate": 4.853656500737769e-06, + "loss": 0.6834, + "step": 1478 + }, + { + "epoch": 0.699290780141844, + "grad_norm": 2.5522031784057617, + "learning_rate": 4.853446124507098e-06, + "loss": 0.5929, + "step": 1479 + }, + { + "epoch": 0.6997635933806147, + "grad_norm": 3.096477746963501, + "learning_rate": 4.853235601737656e-06, + "loss": 0.5737, + "step": 1480 + }, + { + "epoch": 0.7002364066193854, + "grad_norm": 2.884779214859009, + "learning_rate": 4.853024932442552e-06, + "loss": 0.6362, + "step": 1481 + }, + { + "epoch": 0.700709219858156, + "grad_norm": 3.368558406829834, + "learning_rate": 4.852814116634903e-06, + "loss": 0.6721, + "step": 1482 + }, + { + "epoch": 0.7011820330969267, + "grad_norm": 2.742414951324463, + "learning_rate": 4.852603154327837e-06, + "loss": 0.6212, + "step": 1483 + }, + { + "epoch": 0.7016548463356974, + "grad_norm": 2.53454852104187, + "learning_rate": 4.8523920455344864e-06, + "loss": 0.6675, + "step": 1484 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 2.9354238510131836, + "learning_rate": 4.852180790267999e-06, + "loss": 0.6692, + "step": 1485 + }, + { + "epoch": 0.7026004728132388, + "grad_norm": 2.585070848464966, + "learning_rate": 4.8519693885415274e-06, + "loss": 0.6215, + "step": 1486 + }, + { + "epoch": 0.7030732860520095, + "grad_norm": 2.9047999382019043, + "learning_rate": 4.851757840368235e-06, + "loss": 0.6231, + "step": 1487 + }, + { + "epoch": 0.7035460992907802, + "grad_norm": 3.0930933952331543, + "learning_rate": 4.851546145761295e-06, + "loss": 0.7267, + "step": 1488 + }, + { + "epoch": 0.7040189125295508, + "grad_norm": 3.0224719047546387, + "learning_rate": 4.8513343047338875e-06, + "loss": 0.6293, + "step": 1489 + }, + { + "epoch": 0.7044917257683215, + "grad_norm": 2.5758471488952637, + "learning_rate": 4.851122317299203e-06, + "loss": 0.5855, + "step": 1490 + }, + { + "epoch": 0.7049645390070922, + "grad_norm": 2.579272508621216, + "learning_rate": 4.850910183470441e-06, + "loss": 0.582, + "step": 1491 + }, + { + "epoch": 0.7054373522458629, + "grad_norm": 2.8148300647735596, + "learning_rate": 4.85069790326081e-06, + "loss": 0.6396, + "step": 1492 + }, + { + "epoch": 0.7059101654846336, + "grad_norm": 2.6380527019500732, + "learning_rate": 4.850485476683528e-06, + "loss": 0.6114, + "step": 1493 + }, + { + "epoch": 0.7063829787234043, + "grad_norm": 2.7736263275146484, + "learning_rate": 4.850272903751823e-06, + "loss": 0.6683, + "step": 1494 + }, + { + "epoch": 0.706855791962175, + "grad_norm": 3.1958179473876953, + "learning_rate": 4.8500601844789285e-06, + "loss": 0.6265, + "step": 1495 + }, + { + "epoch": 0.7073286052009456, + "grad_norm": 3.783212423324585, + "learning_rate": 
4.8498473188780916e-06, + "loss": 0.6078, + "step": 1496 + }, + { + "epoch": 0.7078014184397163, + "grad_norm": 2.6656646728515625, + "learning_rate": 4.849634306962566e-06, + "loss": 0.5756, + "step": 1497 + }, + { + "epoch": 0.708274231678487, + "grad_norm": 2.757141590118408, + "learning_rate": 4.849421148745615e-06, + "loss": 0.5596, + "step": 1498 + }, + { + "epoch": 0.7087470449172577, + "grad_norm": 3.0391886234283447, + "learning_rate": 4.849207844240511e-06, + "loss": 0.5293, + "step": 1499 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 2.981912851333618, + "learning_rate": 4.848994393460535e-06, + "loss": 0.598, + "step": 1500 + }, + { + "epoch": 0.7096926713947991, + "grad_norm": 2.5470798015594482, + "learning_rate": 4.848780796418978e-06, + "loss": 0.6266, + "step": 1501 + }, + { + "epoch": 0.7101654846335698, + "grad_norm": 2.8394415378570557, + "learning_rate": 4.8485670531291415e-06, + "loss": 0.6844, + "step": 1502 + }, + { + "epoch": 0.7106382978723405, + "grad_norm": 3.2023508548736572, + "learning_rate": 4.848353163604331e-06, + "loss": 0.6134, + "step": 1503 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 2.98245906829834, + "learning_rate": 4.848139127857867e-06, + "loss": 0.7084, + "step": 1504 + }, + { + "epoch": 0.7115839243498818, + "grad_norm": 2.5917441844940186, + "learning_rate": 4.847924945903076e-06, + "loss": 0.5676, + "step": 1505 + }, + { + "epoch": 0.7120567375886525, + "grad_norm": 2.8736681938171387, + "learning_rate": 4.847710617753294e-06, + "loss": 0.6304, + "step": 1506 + }, + { + "epoch": 0.7125295508274232, + "grad_norm": 2.7832682132720947, + "learning_rate": 4.847496143421866e-06, + "loss": 0.5705, + "step": 1507 + }, + { + "epoch": 0.7130023640661939, + "grad_norm": 2.480560779571533, + "learning_rate": 4.847281522922147e-06, + "loss": 0.5595, + "step": 1508 + }, + { + "epoch": 0.7134751773049646, + "grad_norm": 2.357675313949585, + "learning_rate": 4.847066756267499e-06, + "loss": 0.5065, + "step": 1509 + }, + { + "epoch": 0.7139479905437353, + "grad_norm": 2.632669448852539, + "learning_rate": 4.846851843471296e-06, + "loss": 0.6949, + "step": 1510 + }, + { + "epoch": 0.7144208037825059, + "grad_norm": 2.7691073417663574, + "learning_rate": 4.84663678454692e-06, + "loss": 0.6638, + "step": 1511 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 2.5647685527801514, + "learning_rate": 4.846421579507761e-06, + "loss": 0.6098, + "step": 1512 + }, + { + "epoch": 0.7153664302600473, + "grad_norm": 2.476701021194458, + "learning_rate": 4.846206228367218e-06, + "loss": 0.592, + "step": 1513 + }, + { + "epoch": 0.715839243498818, + "grad_norm": 2.805727958679199, + "learning_rate": 4.845990731138702e-06, + "loss": 0.5466, + "step": 1514 + }, + { + "epoch": 0.7163120567375887, + "grad_norm": 2.551392078399658, + "learning_rate": 4.84577508783563e-06, + "loss": 0.6039, + "step": 1515 + }, + { + "epoch": 0.7167848699763594, + "grad_norm": 2.6861350536346436, + "learning_rate": 4.845559298471429e-06, + "loss": 0.6427, + "step": 1516 + }, + { + "epoch": 0.7172576832151301, + "grad_norm": 3.1908371448516846, + "learning_rate": 4.845343363059535e-06, + "loss": 0.5447, + "step": 1517 + }, + { + "epoch": 0.7177304964539007, + "grad_norm": 2.9021761417388916, + "learning_rate": 4.845127281613394e-06, + "loss": 0.5836, + "step": 1518 + }, + { + "epoch": 0.7182033096926714, + "grad_norm": 2.476670742034912, + "learning_rate": 4.844911054146461e-06, + "loss": 0.5863, + "step": 1519 + }, + { + "epoch": 0.7186761229314421, + "grad_norm": 
2.662935495376587, + "learning_rate": 4.844694680672198e-06, + "loss": 0.5678, + "step": 1520 + }, + { + "epoch": 0.7191489361702128, + "grad_norm": 2.677896738052368, + "learning_rate": 4.844478161204079e-06, + "loss": 0.6195, + "step": 1521 + }, + { + "epoch": 0.7196217494089835, + "grad_norm": 2.781921863555908, + "learning_rate": 4.844261495755585e-06, + "loss": 0.643, + "step": 1522 + }, + { + "epoch": 0.7200945626477542, + "grad_norm": 3.0157392024993896, + "learning_rate": 4.844044684340206e-06, + "loss": 0.7559, + "step": 1523 + }, + { + "epoch": 0.7205673758865249, + "grad_norm": 2.8109354972839355, + "learning_rate": 4.843827726971444e-06, + "loss": 0.6264, + "step": 1524 + }, + { + "epoch": 0.7210401891252955, + "grad_norm": 3.0953569412231445, + "learning_rate": 4.8436106236628064e-06, + "loss": 0.6429, + "step": 1525 + }, + { + "epoch": 0.7215130023640662, + "grad_norm": 2.6850643157958984, + "learning_rate": 4.843393374427812e-06, + "loss": 0.6598, + "step": 1526 + }, + { + "epoch": 0.7219858156028369, + "grad_norm": 3.043480634689331, + "learning_rate": 4.8431759792799874e-06, + "loss": 0.6331, + "step": 1527 + }, + { + "epoch": 0.7224586288416076, + "grad_norm": 2.723870038986206, + "learning_rate": 4.842958438232868e-06, + "loss": 0.6259, + "step": 1528 + }, + { + "epoch": 0.7229314420803783, + "grad_norm": 2.822492837905884, + "learning_rate": 4.842740751300002e-06, + "loss": 0.6554, + "step": 1529 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.7866315841674805, + "learning_rate": 4.842522918494941e-06, + "loss": 0.6991, + "step": 1530 + }, + { + "epoch": 0.7238770685579197, + "grad_norm": 2.8881826400756836, + "learning_rate": 4.84230493983125e-06, + "loss": 0.5876, + "step": 1531 + }, + { + "epoch": 0.7243498817966904, + "grad_norm": 2.7456939220428467, + "learning_rate": 4.8420868153225e-06, + "loss": 0.6188, + "step": 1532 + }, + { + "epoch": 0.724822695035461, + "grad_norm": 3.0257532596588135, + "learning_rate": 4.841868544982274e-06, + "loss": 0.63, + "step": 1533 + }, + { + "epoch": 0.7252955082742317, + "grad_norm": 3.1581954956054688, + "learning_rate": 4.841650128824164e-06, + "loss": 0.7214, + "step": 1534 + }, + { + "epoch": 0.7257683215130024, + "grad_norm": 2.9174306392669678, + "learning_rate": 4.841431566861767e-06, + "loss": 0.704, + "step": 1535 + }, + { + "epoch": 0.7262411347517731, + "grad_norm": 2.5019054412841797, + "learning_rate": 4.8412128591086935e-06, + "loss": 0.6298, + "step": 1536 + }, + { + "epoch": 0.7267139479905438, + "grad_norm": 2.724285125732422, + "learning_rate": 4.840994005578562e-06, + "loss": 0.6289, + "step": 1537 + }, + { + "epoch": 0.7271867612293145, + "grad_norm": 2.5882341861724854, + "learning_rate": 4.840775006284998e-06, + "loss": 0.6355, + "step": 1538 + }, + { + "epoch": 0.7276595744680852, + "grad_norm": 3.1281991004943848, + "learning_rate": 4.840555861241638e-06, + "loss": 0.5551, + "step": 1539 + }, + { + "epoch": 0.7281323877068558, + "grad_norm": 2.6064817905426025, + "learning_rate": 4.840336570462127e-06, + "loss": 0.5543, + "step": 1540 + }, + { + "epoch": 0.7286052009456265, + "grad_norm": 2.67112398147583, + "learning_rate": 4.840117133960122e-06, + "loss": 0.6044, + "step": 1541 + }, + { + "epoch": 0.7290780141843972, + "grad_norm": 2.838022232055664, + "learning_rate": 4.839897551749282e-06, + "loss": 0.6814, + "step": 1542 + }, + { + "epoch": 0.7295508274231679, + "grad_norm": 2.8897151947021484, + "learning_rate": 4.839677823843283e-06, + "loss": 0.593, + "step": 1543 + }, + { + "epoch": 
0.7300236406619386, + "grad_norm": 2.9238014221191406, + "learning_rate": 4.839457950255805e-06, + "loss": 0.5544, + "step": 1544 + }, + { + "epoch": 0.7304964539007093, + "grad_norm": 3.016876459121704, + "learning_rate": 4.839237931000538e-06, + "loss": 0.6099, + "step": 1545 + }, + { + "epoch": 0.7309692671394799, + "grad_norm": 2.9415392875671387, + "learning_rate": 4.839017766091182e-06, + "loss": 0.6413, + "step": 1546 + }, + { + "epoch": 0.7314420803782505, + "grad_norm": 2.658067226409912, + "learning_rate": 4.838797455541446e-06, + "loss": 0.6534, + "step": 1547 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 2.460358142852783, + "learning_rate": 4.838576999365049e-06, + "loss": 0.5307, + "step": 1548 + }, + { + "epoch": 0.7323877068557919, + "grad_norm": 2.5818674564361572, + "learning_rate": 4.838356397575716e-06, + "loss": 0.6265, + "step": 1549 + }, + { + "epoch": 0.7328605200945626, + "grad_norm": 3.009197473526001, + "learning_rate": 4.838135650187183e-06, + "loss": 0.6957, + "step": 1550 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 2.738543748855591, + "learning_rate": 4.837914757213196e-06, + "loss": 0.646, + "step": 1551 + }, + { + "epoch": 0.733806146572104, + "grad_norm": 2.8208494186401367, + "learning_rate": 4.837693718667508e-06, + "loss": 0.5936, + "step": 1552 + }, + { + "epoch": 0.7342789598108747, + "grad_norm": 3.1574649810791016, + "learning_rate": 4.837472534563883e-06, + "loss": 0.6455, + "step": 1553 + }, + { + "epoch": 0.7347517730496453, + "grad_norm": 2.6737420558929443, + "learning_rate": 4.837251204916093e-06, + "loss": 0.5921, + "step": 1554 + }, + { + "epoch": 0.735224586288416, + "grad_norm": 2.424983024597168, + "learning_rate": 4.837029729737918e-06, + "loss": 0.6346, + "step": 1555 + }, + { + "epoch": 0.7356973995271867, + "grad_norm": 2.5163493156433105, + "learning_rate": 4.836808109043151e-06, + "loss": 0.6061, + "step": 1556 + }, + { + "epoch": 0.7361702127659574, + "grad_norm": 2.8377044200897217, + "learning_rate": 4.836586342845588e-06, + "loss": 0.611, + "step": 1557 + }, + { + "epoch": 0.7366430260047281, + "grad_norm": 2.5929181575775146, + "learning_rate": 4.83636443115904e-06, + "loss": 0.5496, + "step": 1558 + }, + { + "epoch": 0.7371158392434988, + "grad_norm": 2.5017223358154297, + "learning_rate": 4.836142373997323e-06, + "loss": 0.6235, + "step": 1559 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 2.822500228881836, + "learning_rate": 4.835920171374265e-06, + "loss": 0.6147, + "step": 1560 + }, + { + "epoch": 0.7380614657210401, + "grad_norm": 2.7234230041503906, + "learning_rate": 4.8356978233037e-06, + "loss": 0.6228, + "step": 1561 + }, + { + "epoch": 0.7385342789598108, + "grad_norm": 2.9565515518188477, + "learning_rate": 4.835475329799472e-06, + "loss": 0.5728, + "step": 1562 + }, + { + "epoch": 0.7390070921985815, + "grad_norm": 2.4356038570404053, + "learning_rate": 4.835252690875438e-06, + "loss": 0.6723, + "step": 1563 + }, + { + "epoch": 0.7394799054373522, + "grad_norm": 2.765913248062134, + "learning_rate": 4.835029906545458e-06, + "loss": 0.5805, + "step": 1564 + }, + { + "epoch": 0.7399527186761229, + "grad_norm": 2.4481914043426514, + "learning_rate": 4.834806976823405e-06, + "loss": 0.599, + "step": 1565 + }, + { + "epoch": 0.7404255319148936, + "grad_norm": 2.620779514312744, + "learning_rate": 4.834583901723158e-06, + "loss": 0.63, + "step": 1566 + }, + { + "epoch": 0.7408983451536643, + "grad_norm": 2.654426097869873, + "learning_rate": 4.83436068125861e-06, + "loss": 0.6544, + "step": 
1567 + }, + { + "epoch": 0.741371158392435, + "grad_norm": 2.589623212814331, + "learning_rate": 4.834137315443656e-06, + "loss": 0.5596, + "step": 1568 + }, + { + "epoch": 0.7418439716312056, + "grad_norm": 2.572883129119873, + "learning_rate": 4.833913804292209e-06, + "loss": 0.5974, + "step": 1569 + }, + { + "epoch": 0.7423167848699763, + "grad_norm": 2.8744914531707764, + "learning_rate": 4.833690147818181e-06, + "loss": 0.5364, + "step": 1570 + }, + { + "epoch": 0.742789598108747, + "grad_norm": 2.9800851345062256, + "learning_rate": 4.833466346035502e-06, + "loss": 0.6287, + "step": 1571 + }, + { + "epoch": 0.7432624113475177, + "grad_norm": 2.627784490585327, + "learning_rate": 4.833242398958105e-06, + "loss": 0.621, + "step": 1572 + }, + { + "epoch": 0.7437352245862884, + "grad_norm": 2.5187721252441406, + "learning_rate": 4.833018306599933e-06, + "loss": 0.5901, + "step": 1573 + }, + { + "epoch": 0.7442080378250591, + "grad_norm": 2.4843688011169434, + "learning_rate": 4.832794068974944e-06, + "loss": 0.6336, + "step": 1574 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 2.774911880493164, + "learning_rate": 4.832569686097096e-06, + "loss": 0.6091, + "step": 1575 + }, + { + "epoch": 0.7451536643026004, + "grad_norm": 3.2562527656555176, + "learning_rate": 4.8323451579803615e-06, + "loss": 0.7686, + "step": 1576 + }, + { + "epoch": 0.7456264775413711, + "grad_norm": 2.799570083618164, + "learning_rate": 4.832120484638721e-06, + "loss": 0.6233, + "step": 1577 + }, + { + "epoch": 0.7460992907801418, + "grad_norm": 2.661893367767334, + "learning_rate": 4.831895666086164e-06, + "loss": 0.5841, + "step": 1578 + }, + { + "epoch": 0.7465721040189125, + "grad_norm": 3.0382652282714844, + "learning_rate": 4.831670702336689e-06, + "loss": 0.5769, + "step": 1579 + }, + { + "epoch": 0.7470449172576832, + "grad_norm": 2.676398515701294, + "learning_rate": 4.831445593404304e-06, + "loss": 0.619, + "step": 1580 + }, + { + "epoch": 0.7475177304964539, + "grad_norm": 2.717916965484619, + "learning_rate": 4.831220339303024e-06, + "loss": 0.5787, + "step": 1581 + }, + { + "epoch": 0.7479905437352246, + "grad_norm": 2.3918066024780273, + "learning_rate": 4.830994940046876e-06, + "loss": 0.5108, + "step": 1582 + }, + { + "epoch": 0.7484633569739952, + "grad_norm": 2.709144115447998, + "learning_rate": 4.830769395649895e-06, + "loss": 0.6875, + "step": 1583 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 2.8711116313934326, + "learning_rate": 4.830543706126123e-06, + "loss": 0.6745, + "step": 1584 + }, + { + "epoch": 0.7494089834515366, + "grad_norm": 2.612339496612549, + "learning_rate": 4.830317871489614e-06, + "loss": 0.5738, + "step": 1585 + }, + { + "epoch": 0.7498817966903073, + "grad_norm": 2.4355857372283936, + "learning_rate": 4.830091891754429e-06, + "loss": 0.5907, + "step": 1586 + }, + { + "epoch": 0.750354609929078, + "grad_norm": 2.676051378250122, + "learning_rate": 4.829865766934638e-06, + "loss": 0.6628, + "step": 1587 + }, + { + "epoch": 0.7508274231678487, + "grad_norm": 2.66489839553833, + "learning_rate": 4.829639497044323e-06, + "loss": 0.5984, + "step": 1588 + }, + { + "epoch": 0.7513002364066194, + "grad_norm": 2.5358035564422607, + "learning_rate": 4.829413082097572e-06, + "loss": 0.5867, + "step": 1589 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 2.6530144214630127, + "learning_rate": 4.8291865221084815e-06, + "loss": 0.5917, + "step": 1590 + }, + { + "epoch": 0.7522458628841607, + "grad_norm": 2.5160958766937256, + "learning_rate": 4.82895981709116e-06, + 
"loss": 0.6347, + "step": 1591 + }, + { + "epoch": 0.7527186761229314, + "grad_norm": 2.61592698097229, + "learning_rate": 4.8287329670597225e-06, + "loss": 0.5472, + "step": 1592 + }, + { + "epoch": 0.7531914893617021, + "grad_norm": 2.7528622150421143, + "learning_rate": 4.828505972028296e-06, + "loss": 0.5842, + "step": 1593 + }, + { + "epoch": 0.7536643026004728, + "grad_norm": 2.8154072761535645, + "learning_rate": 4.828278832011011e-06, + "loss": 0.5757, + "step": 1594 + }, + { + "epoch": 0.7541371158392435, + "grad_norm": 3.118515729904175, + "learning_rate": 4.828051547022013e-06, + "loss": 0.6472, + "step": 1595 + }, + { + "epoch": 0.7546099290780142, + "grad_norm": 2.452033758163452, + "learning_rate": 4.827824117075453e-06, + "loss": 0.5571, + "step": 1596 + }, + { + "epoch": 0.7550827423167848, + "grad_norm": 2.984388828277588, + "learning_rate": 4.827596542185492e-06, + "loss": 0.6656, + "step": 1597 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 2.61356782913208, + "learning_rate": 4.8273688223663014e-06, + "loss": 0.6444, + "step": 1598 + }, + { + "epoch": 0.7560283687943262, + "grad_norm": 2.8967196941375732, + "learning_rate": 4.8271409576320595e-06, + "loss": 0.6457, + "step": 1599 + }, + { + "epoch": 0.7565011820330969, + "grad_norm": 2.852367639541626, + "learning_rate": 4.826912947996954e-06, + "loss": 0.5629, + "step": 1600 + }, + { + "epoch": 0.7569739952718676, + "grad_norm": 2.905280590057373, + "learning_rate": 4.826684793475182e-06, + "loss": 0.6245, + "step": 1601 + }, + { + "epoch": 0.7574468085106383, + "grad_norm": 2.6156530380249023, + "learning_rate": 4.826456494080951e-06, + "loss": 0.5869, + "step": 1602 + }, + { + "epoch": 0.757919621749409, + "grad_norm": 2.6490228176116943, + "learning_rate": 4.826228049828475e-06, + "loss": 0.5461, + "step": 1603 + }, + { + "epoch": 0.7583924349881797, + "grad_norm": 2.9626693725585938, + "learning_rate": 4.825999460731978e-06, + "loss": 0.6842, + "step": 1604 + }, + { + "epoch": 0.7588652482269503, + "grad_norm": 2.6866023540496826, + "learning_rate": 4.825770726805695e-06, + "loss": 0.5726, + "step": 1605 + }, + { + "epoch": 0.759338061465721, + "grad_norm": 2.5525858402252197, + "learning_rate": 4.825541848063866e-06, + "loss": 0.6061, + "step": 1606 + }, + { + "epoch": 0.7598108747044917, + "grad_norm": 2.703977584838867, + "learning_rate": 4.825312824520743e-06, + "loss": 0.6726, + "step": 1607 + }, + { + "epoch": 0.7602836879432624, + "grad_norm": 2.856534957885742, + "learning_rate": 4.825083656190588e-06, + "loss": 0.625, + "step": 1608 + }, + { + "epoch": 0.7607565011820331, + "grad_norm": 2.8564887046813965, + "learning_rate": 4.824854343087668e-06, + "loss": 0.7251, + "step": 1609 + }, + { + "epoch": 0.7612293144208038, + "grad_norm": 2.327650308609009, + "learning_rate": 4.824624885226262e-06, + "loss": 0.526, + "step": 1610 + }, + { + "epoch": 0.7617021276595745, + "grad_norm": 3.0025737285614014, + "learning_rate": 4.824395282620659e-06, + "loss": 0.6043, + "step": 1611 + }, + { + "epoch": 0.7621749408983451, + "grad_norm": 2.5441737174987793, + "learning_rate": 4.824165535285152e-06, + "loss": 0.6276, + "step": 1612 + }, + { + "epoch": 0.7626477541371158, + "grad_norm": 2.4177372455596924, + "learning_rate": 4.823935643234049e-06, + "loss": 0.6419, + "step": 1613 + }, + { + "epoch": 0.7631205673758865, + "grad_norm": 2.9210550785064697, + "learning_rate": 4.823705606481664e-06, + "loss": 0.5663, + "step": 1614 + }, + { + "epoch": 0.7635933806146572, + "grad_norm": 2.6353724002838135, + 
"learning_rate": 4.82347542504232e-06, + "loss": 0.5669, + "step": 1615 + }, + { + "epoch": 0.7640661938534279, + "grad_norm": 2.419081926345825, + "learning_rate": 4.823245098930349e-06, + "loss": 0.5777, + "step": 1616 + }, + { + "epoch": 0.7645390070921986, + "grad_norm": 2.5077571868896484, + "learning_rate": 4.823014628160093e-06, + "loss": 0.5924, + "step": 1617 + }, + { + "epoch": 0.7650118203309693, + "grad_norm": 2.816056251525879, + "learning_rate": 4.822784012745902e-06, + "loss": 0.7273, + "step": 1618 + }, + { + "epoch": 0.76548463356974, + "grad_norm": 2.7163147926330566, + "learning_rate": 4.8225532527021366e-06, + "loss": 0.5545, + "step": 1619 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 2.4784302711486816, + "learning_rate": 4.822322348043164e-06, + "loss": 0.556, + "step": 1620 + }, + { + "epoch": 0.7664302600472813, + "grad_norm": 2.712467670440674, + "learning_rate": 4.822091298783361e-06, + "loss": 0.6501, + "step": 1621 + }, + { + "epoch": 0.766903073286052, + "grad_norm": 2.7217724323272705, + "learning_rate": 4.821860104937115e-06, + "loss": 0.5989, + "step": 1622 + }, + { + "epoch": 0.7673758865248227, + "grad_norm": 2.5622854232788086, + "learning_rate": 4.821628766518821e-06, + "loss": 0.5263, + "step": 1623 + }, + { + "epoch": 0.7678486997635934, + "grad_norm": 3.230923891067505, + "learning_rate": 4.821397283542884e-06, + "loss": 0.6707, + "step": 1624 + }, + { + "epoch": 0.7683215130023641, + "grad_norm": 2.37929105758667, + "learning_rate": 4.821165656023718e-06, + "loss": 0.6124, + "step": 1625 + }, + { + "epoch": 0.7687943262411348, + "grad_norm": 2.9811325073242188, + "learning_rate": 4.820933883975745e-06, + "loss": 0.6435, + "step": 1626 + }, + { + "epoch": 0.7692671394799054, + "grad_norm": 2.887380838394165, + "learning_rate": 4.820701967413395e-06, + "loss": 0.621, + "step": 1627 + }, + { + "epoch": 0.7697399527186761, + "grad_norm": 2.6762876510620117, + "learning_rate": 4.820469906351109e-06, + "loss": 0.5713, + "step": 1628 + }, + { + "epoch": 0.7702127659574468, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.820237700803337e-06, + "loss": 0.6136, + "step": 1629 + }, + { + "epoch": 0.7706855791962175, + "grad_norm": 2.7244746685028076, + "learning_rate": 4.820005350784539e-06, + "loss": 0.5816, + "step": 1630 + }, + { + "epoch": 0.7711583924349882, + "grad_norm": 2.9293999671936035, + "learning_rate": 4.8197728563091795e-06, + "loss": 0.6649, + "step": 1631 + }, + { + "epoch": 0.7716312056737589, + "grad_norm": 2.4402127265930176, + "learning_rate": 4.819540217391736e-06, + "loss": 0.6481, + "step": 1632 + }, + { + "epoch": 0.7721040189125296, + "grad_norm": 3.083941698074341, + "learning_rate": 4.819307434046694e-06, + "loss": 0.6951, + "step": 1633 + }, + { + "epoch": 0.7725768321513002, + "grad_norm": 2.544952392578125, + "learning_rate": 4.819074506288548e-06, + "loss": 0.539, + "step": 1634 + }, + { + "epoch": 0.7730496453900709, + "grad_norm": 2.7791268825531006, + "learning_rate": 4.818841434131801e-06, + "loss": 0.5827, + "step": 1635 + }, + { + "epoch": 0.7735224586288416, + "grad_norm": 2.7349796295166016, + "learning_rate": 4.818608217590967e-06, + "loss": 0.5584, + "step": 1636 + }, + { + "epoch": 0.7739952718676123, + "grad_norm": 2.637652635574341, + "learning_rate": 4.818374856680565e-06, + "loss": 0.6386, + "step": 1637 + }, + { + "epoch": 0.774468085106383, + "grad_norm": 2.9821584224700928, + "learning_rate": 4.818141351415127e-06, + "loss": 0.6734, + "step": 1638 + }, + { + "epoch": 0.7749408983451537, + 
"grad_norm": 2.992938995361328, + "learning_rate": 4.817907701809192e-06, + "loss": 0.5899, + "step": 1639 + }, + { + "epoch": 0.7754137115839244, + "grad_norm": 4.35719633102417, + "learning_rate": 4.8176739078773076e-06, + "loss": 0.6281, + "step": 1640 + }, + { + "epoch": 0.775886524822695, + "grad_norm": 2.838146209716797, + "learning_rate": 4.8174399696340315e-06, + "loss": 0.5766, + "step": 1641 + }, + { + "epoch": 0.7763593380614657, + "grad_norm": 3.3116989135742188, + "learning_rate": 4.81720588709393e-06, + "loss": 0.6409, + "step": 1642 + }, + { + "epoch": 0.7768321513002364, + "grad_norm": 2.9843590259552, + "learning_rate": 4.816971660271579e-06, + "loss": 0.6108, + "step": 1643 + }, + { + "epoch": 0.7773049645390071, + "grad_norm": 2.843770742416382, + "learning_rate": 4.816737289181562e-06, + "loss": 0.6053, + "step": 1644 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 2.7608556747436523, + "learning_rate": 4.816502773838473e-06, + "loss": 0.5854, + "step": 1645 + }, + { + "epoch": 0.7782505910165485, + "grad_norm": 3.343682289123535, + "learning_rate": 4.816268114256914e-06, + "loss": 0.6329, + "step": 1646 + }, + { + "epoch": 0.7787234042553192, + "grad_norm": 2.769768476486206, + "learning_rate": 4.816033310451496e-06, + "loss": 0.6242, + "step": 1647 + }, + { + "epoch": 0.7791962174940898, + "grad_norm": 2.989851713180542, + "learning_rate": 4.815798362436838e-06, + "loss": 0.6493, + "step": 1648 + }, + { + "epoch": 0.7796690307328605, + "grad_norm": 3.170736312866211, + "learning_rate": 4.8155632702275716e-06, + "loss": 0.6341, + "step": 1649 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 2.7372522354125977, + "learning_rate": 4.815328033838334e-06, + "loss": 0.5445, + "step": 1650 + }, + { + "epoch": 0.7806146572104019, + "grad_norm": 2.6947238445281982, + "learning_rate": 4.8150926532837715e-06, + "loss": 0.6437, + "step": 1651 + }, + { + "epoch": 0.7810874704491726, + "grad_norm": 2.472323179244995, + "learning_rate": 4.81485712857854e-06, + "loss": 0.5751, + "step": 1652 + }, + { + "epoch": 0.7815602836879433, + "grad_norm": 2.791114091873169, + "learning_rate": 4.814621459737308e-06, + "loss": 0.5996, + "step": 1653 + }, + { + "epoch": 0.782033096926714, + "grad_norm": 3.1957521438598633, + "learning_rate": 4.814385646774745e-06, + "loss": 0.5803, + "step": 1654 + }, + { + "epoch": 0.7825059101654847, + "grad_norm": 2.4120798110961914, + "learning_rate": 4.8141496897055364e-06, + "loss": 0.5814, + "step": 1655 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 2.9262423515319824, + "learning_rate": 4.813913588544374e-06, + "loss": 0.6292, + "step": 1656 + }, + { + "epoch": 0.783451536643026, + "grad_norm": 2.8251047134399414, + "learning_rate": 4.813677343305959e-06, + "loss": 0.6787, + "step": 1657 + }, + { + "epoch": 0.7839243498817967, + "grad_norm": 2.931659698486328, + "learning_rate": 4.8134409540050005e-06, + "loss": 0.6163, + "step": 1658 + }, + { + "epoch": 0.7843971631205674, + "grad_norm": 2.7160706520080566, + "learning_rate": 4.813204420656219e-06, + "loss": 0.6831, + "step": 1659 + }, + { + "epoch": 0.7848699763593381, + "grad_norm": 3.2134454250335693, + "learning_rate": 4.81296774327434e-06, + "loss": 0.6002, + "step": 1660 + }, + { + "epoch": 0.7853427895981088, + "grad_norm": 2.4002513885498047, + "learning_rate": 4.812730921874103e-06, + "loss": 0.5488, + "step": 1661 + }, + { + "epoch": 0.7858156028368795, + "grad_norm": 2.5559282302856445, + "learning_rate": 4.812493956470251e-06, + "loss": 0.5802, + "step": 1662 + }, + { + 
"epoch": 0.7862884160756501, + "grad_norm": 2.57478404045105, + "learning_rate": 4.812256847077541e-06, + "loss": 0.646, + "step": 1663 + }, + { + "epoch": 0.7867612293144208, + "grad_norm": 2.811851978302002, + "learning_rate": 4.812019593710736e-06, + "loss": 0.6245, + "step": 1664 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 2.5228829383850098, + "learning_rate": 4.811782196384609e-06, + "loss": 0.5949, + "step": 1665 + }, + { + "epoch": 0.7877068557919622, + "grad_norm": 2.744096040725708, + "learning_rate": 4.8115446551139415e-06, + "loss": 0.6006, + "step": 1666 + }, + { + "epoch": 0.7881796690307329, + "grad_norm": 3.129242420196533, + "learning_rate": 4.811306969913524e-06, + "loss": 0.7251, + "step": 1667 + }, + { + "epoch": 0.7886524822695036, + "grad_norm": 2.7855660915374756, + "learning_rate": 4.811069140798156e-06, + "loss": 0.6534, + "step": 1668 + }, + { + "epoch": 0.7891252955082743, + "grad_norm": 2.836603879928589, + "learning_rate": 4.810831167782647e-06, + "loss": 0.6661, + "step": 1669 + }, + { + "epoch": 0.789598108747045, + "grad_norm": 2.5339887142181396, + "learning_rate": 4.810593050881813e-06, + "loss": 0.5354, + "step": 1670 + }, + { + "epoch": 0.7900709219858156, + "grad_norm": 2.9553709030151367, + "learning_rate": 4.810354790110482e-06, + "loss": 0.6001, + "step": 1671 + }, + { + "epoch": 0.7905437352245863, + "grad_norm": 2.6581788063049316, + "learning_rate": 4.8101163854834885e-06, + "loss": 0.6802, + "step": 1672 + }, + { + "epoch": 0.791016548463357, + "grad_norm": 3.2002551555633545, + "learning_rate": 4.809877837015677e-06, + "loss": 0.6641, + "step": 1673 + }, + { + "epoch": 0.7914893617021277, + "grad_norm": 2.918792963027954, + "learning_rate": 4.809639144721902e-06, + "loss": 0.6758, + "step": 1674 + }, + { + "epoch": 0.7919621749408984, + "grad_norm": 2.7993946075439453, + "learning_rate": 4.8094003086170245e-06, + "loss": 0.5889, + "step": 1675 + }, + { + "epoch": 0.7924349881796691, + "grad_norm": 2.3698952198028564, + "learning_rate": 4.809161328715916e-06, + "loss": 0.6244, + "step": 1676 + }, + { + "epoch": 0.7929078014184398, + "grad_norm": 2.8891594409942627, + "learning_rate": 4.808922205033458e-06, + "loss": 0.5835, + "step": 1677 + }, + { + "epoch": 0.7933806146572104, + "grad_norm": 2.838345766067505, + "learning_rate": 4.808682937584537e-06, + "loss": 0.6907, + "step": 1678 + }, + { + "epoch": 0.7938534278959811, + "grad_norm": 2.8443174362182617, + "learning_rate": 4.808443526384053e-06, + "loss": 0.6692, + "step": 1679 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 2.7355034351348877, + "learning_rate": 4.808203971446913e-06, + "loss": 0.5799, + "step": 1680 + }, + { + "epoch": 0.7947990543735225, + "grad_norm": 2.7108020782470703, + "learning_rate": 4.807964272788033e-06, + "loss": 0.652, + "step": 1681 + }, + { + "epoch": 0.7952718676122932, + "grad_norm": 2.397650957107544, + "learning_rate": 4.807724430422338e-06, + "loss": 0.5418, + "step": 1682 + }, + { + "epoch": 0.7957446808510639, + "grad_norm": 2.4981582164764404, + "learning_rate": 4.807484444364762e-06, + "loss": 0.5731, + "step": 1683 + }, + { + "epoch": 0.7962174940898346, + "grad_norm": 2.7943713665008545, + "learning_rate": 4.8072443146302475e-06, + "loss": 0.5913, + "step": 1684 + }, + { + "epoch": 0.7966903073286052, + "grad_norm": 2.5691423416137695, + "learning_rate": 4.807004041233746e-06, + "loss": 0.6475, + "step": 1685 + }, + { + "epoch": 0.7971631205673759, + "grad_norm": 3.2367498874664307, + "learning_rate": 4.8067636241902195e-06, + 
"loss": 0.675, + "step": 1686 + }, + { + "epoch": 0.7976359338061466, + "grad_norm": 3.000595808029175, + "learning_rate": 4.806523063514637e-06, + "loss": 0.5481, + "step": 1687 + }, + { + "epoch": 0.7981087470449173, + "grad_norm": 2.702014207839966, + "learning_rate": 4.806282359221976e-06, + "loss": 0.5993, + "step": 1688 + }, + { + "epoch": 0.798581560283688, + "grad_norm": 2.383671998977661, + "learning_rate": 4.806041511327226e-06, + "loss": 0.562, + "step": 1689 + }, + { + "epoch": 0.7990543735224587, + "grad_norm": 2.6965041160583496, + "learning_rate": 4.8058005198453834e-06, + "loss": 0.5955, + "step": 1690 + }, + { + "epoch": 0.7995271867612294, + "grad_norm": 2.5906765460968018, + "learning_rate": 4.805559384791453e-06, + "loss": 0.5151, + "step": 1691 + }, + { + "epoch": 0.8, + "grad_norm": 2.5454652309417725, + "learning_rate": 4.8053181061804475e-06, + "loss": 0.5843, + "step": 1692 + }, + { + "epoch": 0.8004728132387707, + "grad_norm": 2.661343812942505, + "learning_rate": 4.8050766840273935e-06, + "loss": 0.5995, + "step": 1693 + }, + { + "epoch": 0.8009456264775414, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.8048351183473215e-06, + "loss": 0.5676, + "step": 1694 + }, + { + "epoch": 0.8014184397163121, + "grad_norm": 2.5936667919158936, + "learning_rate": 4.804593409155274e-06, + "loss": 0.6291, + "step": 1695 + }, + { + "epoch": 0.8018912529550828, + "grad_norm": 2.6902432441711426, + "learning_rate": 4.804351556466299e-06, + "loss": 0.6114, + "step": 1696 + }, + { + "epoch": 0.8023640661938535, + "grad_norm": 2.7764673233032227, + "learning_rate": 4.804109560295457e-06, + "loss": 0.5768, + "step": 1697 + }, + { + "epoch": 0.8028368794326242, + "grad_norm": 2.9587221145629883, + "learning_rate": 4.803867420657816e-06, + "loss": 0.6048, + "step": 1698 + }, + { + "epoch": 0.8033096926713948, + "grad_norm": 2.9238998889923096, + "learning_rate": 4.803625137568453e-06, + "loss": 0.6329, + "step": 1699 + }, + { + "epoch": 0.8037825059101655, + "grad_norm": 2.70473313331604, + "learning_rate": 4.803382711042455e-06, + "loss": 0.5427, + "step": 1700 + }, + { + "epoch": 0.8042553191489362, + "grad_norm": 3.1604979038238525, + "learning_rate": 4.803140141094914e-06, + "loss": 0.626, + "step": 1701 + }, + { + "epoch": 0.8047281323877069, + "grad_norm": 2.9567699432373047, + "learning_rate": 4.802897427740936e-06, + "loss": 0.5319, + "step": 1702 + }, + { + "epoch": 0.8052009456264776, + "grad_norm": 2.90983247756958, + "learning_rate": 4.802654570995632e-06, + "loss": 0.586, + "step": 1703 + }, + { + "epoch": 0.8056737588652483, + "grad_norm": 2.783480167388916, + "learning_rate": 4.8024115708741255e-06, + "loss": 0.5773, + "step": 1704 + }, + { + "epoch": 0.806146572104019, + "grad_norm": 3.3307793140411377, + "learning_rate": 4.802168427391547e-06, + "loss": 0.6257, + "step": 1705 + }, + { + "epoch": 0.8066193853427897, + "grad_norm": 3.0475001335144043, + "learning_rate": 4.801925140563034e-06, + "loss": 0.6612, + "step": 1706 + }, + { + "epoch": 0.8070921985815603, + "grad_norm": 2.8278894424438477, + "learning_rate": 4.8016817104037375e-06, + "loss": 0.6449, + "step": 1707 + }, + { + "epoch": 0.807565011820331, + "grad_norm": 2.760244369506836, + "learning_rate": 4.801438136928812e-06, + "loss": 0.7007, + "step": 1708 + }, + { + "epoch": 0.8080378250591016, + "grad_norm": 2.827634572982788, + "learning_rate": 4.801194420153427e-06, + "loss": 0.6418, + "step": 1709 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.8655009269714355, + "learning_rate": 
4.800950560092754e-06, + "loss": 0.6231, + "step": 1710 + }, + { + "epoch": 0.808983451536643, + "grad_norm": 2.738112688064575, + "learning_rate": 4.800706556761981e-06, + "loss": 0.6463, + "step": 1711 + }, + { + "epoch": 0.8094562647754137, + "grad_norm": 2.4781179428100586, + "learning_rate": 4.800462410176296e-06, + "loss": 0.5365, + "step": 1712 + }, + { + "epoch": 0.8099290780141843, + "grad_norm": 2.6049838066101074, + "learning_rate": 4.800218120350906e-06, + "loss": 0.6035, + "step": 1713 + }, + { + "epoch": 0.810401891252955, + "grad_norm": 2.9089980125427246, + "learning_rate": 4.79997368730102e-06, + "loss": 0.5828, + "step": 1714 + }, + { + "epoch": 0.8108747044917257, + "grad_norm": 2.831871747970581, + "learning_rate": 4.799729111041857e-06, + "loss": 0.5953, + "step": 1715 + }, + { + "epoch": 0.8113475177304964, + "grad_norm": 2.5611300468444824, + "learning_rate": 4.799484391588647e-06, + "loss": 0.6302, + "step": 1716 + }, + { + "epoch": 0.8118203309692671, + "grad_norm": 2.744070053100586, + "learning_rate": 4.799239528956625e-06, + "loss": 0.5561, + "step": 1717 + }, + { + "epoch": 0.8122931442080378, + "grad_norm": 2.7344231605529785, + "learning_rate": 4.798994523161041e-06, + "loss": 0.6317, + "step": 1718 + }, + { + "epoch": 0.8127659574468085, + "grad_norm": 2.3420889377593994, + "learning_rate": 4.798749374217149e-06, + "loss": 0.5415, + "step": 1719 + }, + { + "epoch": 0.8132387706855791, + "grad_norm": 2.57384991645813, + "learning_rate": 4.798504082140212e-06, + "loss": 0.6383, + "step": 1720 + }, + { + "epoch": 0.8137115839243498, + "grad_norm": 2.8819844722747803, + "learning_rate": 4.798258646945505e-06, + "loss": 0.6355, + "step": 1721 + }, + { + "epoch": 0.8141843971631205, + "grad_norm": 2.908123254776001, + "learning_rate": 4.79801306864831e-06, + "loss": 0.701, + "step": 1722 + }, + { + "epoch": 0.8146572104018912, + "grad_norm": 2.6500701904296875, + "learning_rate": 4.797767347263917e-06, + "loss": 0.6152, + "step": 1723 + }, + { + "epoch": 0.8151300236406619, + "grad_norm": 2.5513017177581787, + "learning_rate": 4.797521482807628e-06, + "loss": 0.6241, + "step": 1724 + }, + { + "epoch": 0.8156028368794326, + "grad_norm": 2.6239185333251953, + "learning_rate": 4.7972754752947495e-06, + "loss": 0.6072, + "step": 1725 + }, + { + "epoch": 0.8160756501182033, + "grad_norm": 2.673436403274536, + "learning_rate": 4.797029324740601e-06, + "loss": 0.5802, + "step": 1726 + }, + { + "epoch": 0.816548463356974, + "grad_norm": 2.533831834793091, + "learning_rate": 4.796783031160508e-06, + "loss": 0.5566, + "step": 1727 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 2.9806582927703857, + "learning_rate": 4.796536594569807e-06, + "loss": 0.6945, + "step": 1728 + }, + { + "epoch": 0.8174940898345153, + "grad_norm": 2.7093560695648193, + "learning_rate": 4.796290014983842e-06, + "loss": 0.7143, + "step": 1729 + }, + { + "epoch": 0.817966903073286, + "grad_norm": 2.814507246017456, + "learning_rate": 4.796043292417967e-06, + "loss": 0.6122, + "step": 1730 + }, + { + "epoch": 0.8184397163120567, + "grad_norm": 2.537156820297241, + "learning_rate": 4.795796426887543e-06, + "loss": 0.6229, + "step": 1731 + }, + { + "epoch": 0.8189125295508274, + "grad_norm": 2.4878013134002686, + "learning_rate": 4.795549418407944e-06, + "loss": 0.5442, + "step": 1732 + }, + { + "epoch": 0.8193853427895981, + "grad_norm": 2.839383363723755, + "learning_rate": 4.795302266994548e-06, + "loss": 0.6717, + "step": 1733 + }, + { + "epoch": 0.8198581560283688, + "grad_norm": 
3.1981801986694336, + "learning_rate": 4.795054972662744e-06, + "loss": 0.6596, + "step": 1734 + }, + { + "epoch": 0.8203309692671394, + "grad_norm": 2.781730890274048, + "learning_rate": 4.79480753542793e-06, + "loss": 0.5845, + "step": 1735 + }, + { + "epoch": 0.8208037825059101, + "grad_norm": 2.689948558807373, + "learning_rate": 4.794559955305513e-06, + "loss": 0.5928, + "step": 1736 + }, + { + "epoch": 0.8212765957446808, + "grad_norm": 2.7267637252807617, + "learning_rate": 4.7943122323109105e-06, + "loss": 0.5224, + "step": 1737 + }, + { + "epoch": 0.8217494089834515, + "grad_norm": 2.4346601963043213, + "learning_rate": 4.794064366459544e-06, + "loss": 0.6431, + "step": 1738 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 2.7440176010131836, + "learning_rate": 4.793816357766849e-06, + "loss": 0.6083, + "step": 1739 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 2.6558027267456055, + "learning_rate": 4.793568206248268e-06, + "loss": 0.698, + "step": 1740 + }, + { + "epoch": 0.8231678486997636, + "grad_norm": 2.591658353805542, + "learning_rate": 4.793319911919251e-06, + "loss": 0.6601, + "step": 1741 + }, + { + "epoch": 0.8236406619385342, + "grad_norm": 2.5431172847747803, + "learning_rate": 4.79307147479526e-06, + "loss": 0.5917, + "step": 1742 + }, + { + "epoch": 0.8241134751773049, + "grad_norm": 2.7335588932037354, + "learning_rate": 4.792822894891762e-06, + "loss": 0.5925, + "step": 1743 + }, + { + "epoch": 0.8245862884160756, + "grad_norm": 2.2500839233398438, + "learning_rate": 4.792574172224237e-06, + "loss": 0.4984, + "step": 1744 + }, + { + "epoch": 0.8250591016548463, + "grad_norm": 2.691343069076538, + "learning_rate": 4.79232530680817e-06, + "loss": 0.6262, + "step": 1745 + }, + { + "epoch": 0.825531914893617, + "grad_norm": 2.612204074859619, + "learning_rate": 4.792076298659058e-06, + "loss": 0.5822, + "step": 1746 + }, + { + "epoch": 0.8260047281323877, + "grad_norm": 3.0163519382476807, + "learning_rate": 4.791827147792406e-06, + "loss": 0.6263, + "step": 1747 + }, + { + "epoch": 0.8264775413711584, + "grad_norm": 2.742183208465576, + "learning_rate": 4.791577854223727e-06, + "loss": 0.6628, + "step": 1748 + }, + { + "epoch": 0.826950354609929, + "grad_norm": 2.872213840484619, + "learning_rate": 4.791328417968542e-06, + "loss": 0.6332, + "step": 1749 + }, + { + "epoch": 0.8274231678486997, + "grad_norm": 2.725006580352783, + "learning_rate": 4.7910788390423844e-06, + "loss": 0.6266, + "step": 1750 + }, + { + "epoch": 0.8278959810874704, + "grad_norm": 3.0366697311401367, + "learning_rate": 4.790829117460793e-06, + "loss": 0.6403, + "step": 1751 + }, + { + "epoch": 0.8283687943262411, + "grad_norm": 2.594881772994995, + "learning_rate": 4.790579253239318e-06, + "loss": 0.521, + "step": 1752 + }, + { + "epoch": 0.8288416075650118, + "grad_norm": 2.4496347904205322, + "learning_rate": 4.790329246393517e-06, + "loss": 0.54, + "step": 1753 + }, + { + "epoch": 0.8293144208037825, + "grad_norm": 3.102278470993042, + "learning_rate": 4.790079096938956e-06, + "loss": 0.6142, + "step": 1754 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 2.4645912647247314, + "learning_rate": 4.789828804891212e-06, + "loss": 0.5212, + "step": 1755 + }, + { + "epoch": 0.8302600472813239, + "grad_norm": 2.7482516765594482, + "learning_rate": 4.789578370265868e-06, + "loss": 0.6712, + "step": 1756 + }, + { + "epoch": 0.8307328605200945, + "grad_norm": 2.61360502243042, + "learning_rate": 4.7893277930785195e-06, + "loss": 0.6367, + "step": 1757 + }, + { + "epoch": 
0.8312056737588652, + "grad_norm": 2.79028058052063, + "learning_rate": 4.789077073344767e-06, + "loss": 0.5099, + "step": 1758 + }, + { + "epoch": 0.8316784869976359, + "grad_norm": 2.647662401199341, + "learning_rate": 4.788826211080222e-06, + "loss": 0.6698, + "step": 1759 + }, + { + "epoch": 0.8321513002364066, + "grad_norm": 3.0214831829071045, + "learning_rate": 4.7885752063005055e-06, + "loss": 0.6121, + "step": 1760 + }, + { + "epoch": 0.8326241134751773, + "grad_norm": 2.8244032859802246, + "learning_rate": 4.788324059021247e-06, + "loss": 0.6921, + "step": 1761 + }, + { + "epoch": 0.833096926713948, + "grad_norm": 3.1501076221466064, + "learning_rate": 4.788072769258082e-06, + "loss": 0.6872, + "step": 1762 + }, + { + "epoch": 0.8335697399527187, + "grad_norm": 2.6989903450012207, + "learning_rate": 4.7878213370266594e-06, + "loss": 0.5884, + "step": 1763 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 2.6982665061950684, + "learning_rate": 4.787569762342633e-06, + "loss": 0.6112, + "step": 1764 + }, + { + "epoch": 0.83451536643026, + "grad_norm": 2.6918323040008545, + "learning_rate": 4.7873180452216685e-06, + "loss": 0.5315, + "step": 1765 + }, + { + "epoch": 0.8349881796690307, + "grad_norm": 2.5494401454925537, + "learning_rate": 4.78706618567944e-06, + "loss": 0.5909, + "step": 1766 + }, + { + "epoch": 0.8354609929078014, + "grad_norm": 2.7532095909118652, + "learning_rate": 4.786814183731627e-06, + "loss": 0.5566, + "step": 1767 + }, + { + "epoch": 0.8359338061465721, + "grad_norm": 2.550865888595581, + "learning_rate": 4.786562039393923e-06, + "loss": 0.555, + "step": 1768 + }, + { + "epoch": 0.8364066193853428, + "grad_norm": 2.4477791786193848, + "learning_rate": 4.786309752682028e-06, + "loss": 0.5844, + "step": 1769 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 2.6982262134552, + "learning_rate": 4.7860573236116485e-06, + "loss": 0.6136, + "step": 1770 + }, + { + "epoch": 0.8373522458628841, + "grad_norm": 2.456263542175293, + "learning_rate": 4.785804752198503e-06, + "loss": 0.5055, + "step": 1771 + }, + { + "epoch": 0.8378250591016548, + "grad_norm": 2.428544521331787, + "learning_rate": 4.78555203845832e-06, + "loss": 0.5859, + "step": 1772 + }, + { + "epoch": 0.8382978723404255, + "grad_norm": 2.1782307624816895, + "learning_rate": 4.785299182406833e-06, + "loss": 0.5325, + "step": 1773 + }, + { + "epoch": 0.8387706855791962, + "grad_norm": 3.137956142425537, + "learning_rate": 4.785046184059786e-06, + "loss": 0.6097, + "step": 1774 + }, + { + "epoch": 0.8392434988179669, + "grad_norm": 2.6269001960754395, + "learning_rate": 4.7847930434329336e-06, + "loss": 0.5972, + "step": 1775 + }, + { + "epoch": 0.8397163120567376, + "grad_norm": 2.732659339904785, + "learning_rate": 4.784539760542037e-06, + "loss": 0.6054, + "step": 1776 + }, + { + "epoch": 0.8401891252955083, + "grad_norm": 2.5346736907958984, + "learning_rate": 4.784286335402866e-06, + "loss": 0.5521, + "step": 1777 + }, + { + "epoch": 0.840661938534279, + "grad_norm": 3.1420228481292725, + "learning_rate": 4.784032768031202e-06, + "loss": 0.6165, + "step": 1778 + }, + { + "epoch": 0.8411347517730496, + "grad_norm": 3.073793411254883, + "learning_rate": 4.783779058442831e-06, + "loss": 0.6414, + "step": 1779 + }, + { + "epoch": 0.8416075650118203, + "grad_norm": 2.6621336936950684, + "learning_rate": 4.783525206653554e-06, + "loss": 0.5836, + "step": 1780 + }, + { + "epoch": 0.842080378250591, + "grad_norm": 2.7029049396514893, + "learning_rate": 4.7832712126791745e-06, + "loss": 0.5897, + 
"step": 1781 + }, + { + "epoch": 0.8425531914893617, + "grad_norm": 2.4733822345733643, + "learning_rate": 4.783017076535509e-06, + "loss": 0.5913, + "step": 1782 + }, + { + "epoch": 0.8430260047281324, + "grad_norm": 2.8119473457336426, + "learning_rate": 4.782762798238381e-06, + "loss": 0.6105, + "step": 1783 + }, + { + "epoch": 0.8434988179669031, + "grad_norm": 2.5290818214416504, + "learning_rate": 4.782508377803622e-06, + "loss": 0.6119, + "step": 1784 + }, + { + "epoch": 0.8439716312056738, + "grad_norm": 3.193472385406494, + "learning_rate": 4.782253815247076e-06, + "loss": 0.6665, + "step": 1785 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 3.206759452819824, + "learning_rate": 4.781999110584592e-06, + "loss": 0.6012, + "step": 1786 + }, + { + "epoch": 0.8449172576832151, + "grad_norm": 2.6227457523345947, + "learning_rate": 4.781744263832029e-06, + "loss": 0.5845, + "step": 1787 + }, + { + "epoch": 0.8453900709219858, + "grad_norm": 2.838365316390991, + "learning_rate": 4.781489275005257e-06, + "loss": 0.5695, + "step": 1788 + }, + { + "epoch": 0.8458628841607565, + "grad_norm": 2.8348326683044434, + "learning_rate": 4.78123414412015e-06, + "loss": 0.6136, + "step": 1789 + }, + { + "epoch": 0.8463356973995272, + "grad_norm": 2.5698344707489014, + "learning_rate": 4.780978871192597e-06, + "loss": 0.6576, + "step": 1790 + }, + { + "epoch": 0.8468085106382979, + "grad_norm": 2.5198330879211426, + "learning_rate": 4.780723456238492e-06, + "loss": 0.5521, + "step": 1791 + }, + { + "epoch": 0.8472813238770686, + "grad_norm": 3.001325845718384, + "learning_rate": 4.780467899273737e-06, + "loss": 0.6075, + "step": 1792 + }, + { + "epoch": 0.8477541371158392, + "grad_norm": 2.7732746601104736, + "learning_rate": 4.780212200314247e-06, + "loss": 0.6245, + "step": 1793 + }, + { + "epoch": 0.8482269503546099, + "grad_norm": 2.6950337886810303, + "learning_rate": 4.77995635937594e-06, + "loss": 0.5723, + "step": 1794 + }, + { + "epoch": 0.8486997635933806, + "grad_norm": 2.82051420211792, + "learning_rate": 4.779700376474749e-06, + "loss": 0.6184, + "step": 1795 + }, + { + "epoch": 0.8491725768321513, + "grad_norm": 2.757791757583618, + "learning_rate": 4.779444251626611e-06, + "loss": 0.608, + "step": 1796 + }, + { + "epoch": 0.849645390070922, + "grad_norm": 2.394108533859253, + "learning_rate": 4.779187984847475e-06, + "loss": 0.6174, + "step": 1797 + }, + { + "epoch": 0.8501182033096927, + "grad_norm": 2.427562713623047, + "learning_rate": 4.778931576153296e-06, + "loss": 0.5618, + "step": 1798 + }, + { + "epoch": 0.8505910165484634, + "grad_norm": 2.891268491744995, + "learning_rate": 4.778675025560042e-06, + "loss": 0.6865, + "step": 1799 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.665534257888794, + "learning_rate": 4.778418333083685e-06, + "loss": 0.5852, + "step": 1800 + }, + { + "epoch": 0.8515366430260047, + "grad_norm": 2.5492889881134033, + "learning_rate": 4.7781614987402095e-06, + "loss": 0.5161, + "step": 1801 + }, + { + "epoch": 0.8520094562647754, + "grad_norm": 2.400177001953125, + "learning_rate": 4.777904522545607e-06, + "loss": 0.5128, + "step": 1802 + }, + { + "epoch": 0.8524822695035461, + "grad_norm": 2.3949809074401855, + "learning_rate": 4.777647404515878e-06, + "loss": 0.571, + "step": 1803 + }, + { + "epoch": 0.8529550827423168, + "grad_norm": 2.3624472618103027, + "learning_rate": 4.7773901446670325e-06, + "loss": 0.5486, + "step": 1804 + }, + { + "epoch": 0.8534278959810875, + "grad_norm": 2.711366891860962, + "learning_rate": 
4.7771327430150885e-06, + "loss": 0.5667, + "step": 1805 + }, + { + "epoch": 0.8539007092198582, + "grad_norm": 2.7681493759155273, + "learning_rate": 4.776875199576073e-06, + "loss": 0.5686, + "step": 1806 + }, + { + "epoch": 0.8543735224586289, + "grad_norm": 3.0369436740875244, + "learning_rate": 4.776617514366023e-06, + "loss": 0.6635, + "step": 1807 + }, + { + "epoch": 0.8548463356973995, + "grad_norm": 2.919649600982666, + "learning_rate": 4.776359687400983e-06, + "loss": 0.5749, + "step": 1808 + }, + { + "epoch": 0.8553191489361702, + "grad_norm": 2.7986185550689697, + "learning_rate": 4.776101718697007e-06, + "loss": 0.559, + "step": 1809 + }, + { + "epoch": 0.8557919621749409, + "grad_norm": 2.5951223373413086, + "learning_rate": 4.775843608270158e-06, + "loss": 0.5654, + "step": 1810 + }, + { + "epoch": 0.8562647754137116, + "grad_norm": 2.674138069152832, + "learning_rate": 4.775585356136505e-06, + "loss": 0.5286, + "step": 1811 + }, + { + "epoch": 0.8567375886524823, + "grad_norm": 3.045437812805176, + "learning_rate": 4.775326962312131e-06, + "loss": 0.6185, + "step": 1812 + }, + { + "epoch": 0.857210401891253, + "grad_norm": 2.6145293712615967, + "learning_rate": 4.775068426813124e-06, + "loss": 0.6075, + "step": 1813 + }, + { + "epoch": 0.8576832151300237, + "grad_norm": 2.6320106983184814, + "learning_rate": 4.7748097496555824e-06, + "loss": 0.561, + "step": 1814 + }, + { + "epoch": 0.8581560283687943, + "grad_norm": 2.5038623809814453, + "learning_rate": 4.774550930855612e-06, + "loss": 0.593, + "step": 1815 + }, + { + "epoch": 0.858628841607565, + "grad_norm": 2.8168089389801025, + "learning_rate": 4.774291970429329e-06, + "loss": 0.5196, + "step": 1816 + }, + { + "epoch": 0.8591016548463357, + "grad_norm": 2.778130292892456, + "learning_rate": 4.774032868392858e-06, + "loss": 0.5984, + "step": 1817 + }, + { + "epoch": 0.8595744680851064, + "grad_norm": 2.536458730697632, + "learning_rate": 4.7737736247623305e-06, + "loss": 0.568, + "step": 1818 + }, + { + "epoch": 0.8600472813238771, + "grad_norm": 2.6669719219207764, + "learning_rate": 4.77351423955389e-06, + "loss": 0.6233, + "step": 1819 + }, + { + "epoch": 0.8605200945626478, + "grad_norm": 2.578242540359497, + "learning_rate": 4.773254712783687e-06, + "loss": 0.579, + "step": 1820 + }, + { + "epoch": 0.8609929078014185, + "grad_norm": 2.816664457321167, + "learning_rate": 4.772995044467881e-06, + "loss": 0.6635, + "step": 1821 + }, + { + "epoch": 0.8614657210401891, + "grad_norm": 3.1111979484558105, + "learning_rate": 4.77273523462264e-06, + "loss": 0.6372, + "step": 1822 + }, + { + "epoch": 0.8619385342789598, + "grad_norm": 2.764552354812622, + "learning_rate": 4.772475283264142e-06, + "loss": 0.6216, + "step": 1823 + }, + { + "epoch": 0.8624113475177305, + "grad_norm": 2.9126830101013184, + "learning_rate": 4.772215190408572e-06, + "loss": 0.6396, + "step": 1824 + }, + { + "epoch": 0.8628841607565012, + "grad_norm": 2.7502307891845703, + "learning_rate": 4.7719549560721264e-06, + "loss": 0.6186, + "step": 1825 + }, + { + "epoch": 0.8633569739952719, + "grad_norm": 2.6279006004333496, + "learning_rate": 4.771694580271007e-06, + "loss": 0.5557, + "step": 1826 + }, + { + "epoch": 0.8638297872340426, + "grad_norm": 2.996563196182251, + "learning_rate": 4.7714340630214276e-06, + "loss": 0.6259, + "step": 1827 + }, + { + "epoch": 0.8643026004728133, + "grad_norm": 3.231323480606079, + "learning_rate": 4.771173404339609e-06, + "loss": 0.5473, + "step": 1828 + }, + { + "epoch": 0.864775413711584, + "grad_norm": 
3.143519878387451, + "learning_rate": 4.770912604241781e-06, + "loss": 0.593, + "step": 1829 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 2.515484094619751, + "learning_rate": 4.770651662744184e-06, + "loss": 0.538, + "step": 1830 + }, + { + "epoch": 0.8657210401891253, + "grad_norm": 2.629058837890625, + "learning_rate": 4.770390579863064e-06, + "loss": 0.5745, + "step": 1831 + }, + { + "epoch": 0.866193853427896, + "grad_norm": 2.5826802253723145, + "learning_rate": 4.770129355614677e-06, + "loss": 0.6397, + "step": 1832 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 2.954623222351074, + "learning_rate": 4.769867990015289e-06, + "loss": 0.6106, + "step": 1833 + }, + { + "epoch": 0.8671394799054374, + "grad_norm": 2.742192268371582, + "learning_rate": 4.769606483081175e-06, + "loss": 0.6902, + "step": 1834 + }, + { + "epoch": 0.8676122931442081, + "grad_norm": 2.2619097232818604, + "learning_rate": 4.769344834828618e-06, + "loss": 0.5414, + "step": 1835 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 2.7384188175201416, + "learning_rate": 4.769083045273908e-06, + "loss": 0.5787, + "step": 1836 + }, + { + "epoch": 0.8685579196217494, + "grad_norm": 2.6734485626220703, + "learning_rate": 4.768821114433346e-06, + "loss": 0.5923, + "step": 1837 + }, + { + "epoch": 0.8690307328605201, + "grad_norm": 2.286140203475952, + "learning_rate": 4.768559042323243e-06, + "loss": 0.5822, + "step": 1838 + }, + { + "epoch": 0.8695035460992908, + "grad_norm": 3.0243725776672363, + "learning_rate": 4.768296828959915e-06, + "loss": 0.6623, + "step": 1839 + }, + { + "epoch": 0.8699763593380615, + "grad_norm": 2.4026312828063965, + "learning_rate": 4.768034474359689e-06, + "loss": 0.5554, + "step": 1840 + }, + { + "epoch": 0.8704491725768322, + "grad_norm": 2.7469029426574707, + "learning_rate": 4.767771978538903e-06, + "loss": 0.6316, + "step": 1841 + }, + { + "epoch": 0.8709219858156029, + "grad_norm": 2.729659080505371, + "learning_rate": 4.767509341513899e-06, + "loss": 0.5807, + "step": 1842 + }, + { + "epoch": 0.8713947990543736, + "grad_norm": 2.5336945056915283, + "learning_rate": 4.76724656330103e-06, + "loss": 0.6109, + "step": 1843 + }, + { + "epoch": 0.8718676122931442, + "grad_norm": 2.519880533218384, + "learning_rate": 4.76698364391666e-06, + "loss": 0.5313, + "step": 1844 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 2.698862075805664, + "learning_rate": 4.766720583377159e-06, + "loss": 0.5953, + "step": 1845 + }, + { + "epoch": 0.8728132387706856, + "grad_norm": 3.0195560455322266, + "learning_rate": 4.766457381698907e-06, + "loss": 0.5965, + "step": 1846 + }, + { + "epoch": 0.8732860520094563, + "grad_norm": 2.5972697734832764, + "learning_rate": 4.766194038898291e-06, + "loss": 0.6014, + "step": 1847 + }, + { + "epoch": 0.873758865248227, + "grad_norm": 2.7132294178009033, + "learning_rate": 4.76593055499171e-06, + "loss": 0.5638, + "step": 1848 + }, + { + "epoch": 0.8742316784869977, + "grad_norm": 2.7134575843811035, + "learning_rate": 4.765666929995568e-06, + "loss": 0.52, + "step": 1849 + }, + { + "epoch": 0.8747044917257684, + "grad_norm": 2.3804993629455566, + "learning_rate": 4.765403163926282e-06, + "loss": 0.5435, + "step": 1850 + }, + { + "epoch": 0.875177304964539, + "grad_norm": 2.8782761096954346, + "learning_rate": 4.765139256800274e-06, + "loss": 0.5843, + "step": 1851 + }, + { + "epoch": 0.8756501182033097, + "grad_norm": 2.836209774017334, + "learning_rate": 4.764875208633977e-06, + "loss": 0.6667, + "step": 1852 + }, + { + "epoch": 
0.8761229314420804, + "grad_norm": 2.608851194381714, + "learning_rate": 4.764611019443831e-06, + "loss": 0.5436, + "step": 1853 + }, + { + "epoch": 0.8765957446808511, + "grad_norm": 2.788738965988159, + "learning_rate": 4.764346689246288e-06, + "loss": 0.7331, + "step": 1854 + }, + { + "epoch": 0.8770685579196218, + "grad_norm": 2.524277687072754, + "learning_rate": 4.764082218057805e-06, + "loss": 0.5067, + "step": 1855 + }, + { + "epoch": 0.8775413711583925, + "grad_norm": 3.7559316158294678, + "learning_rate": 4.763817605894851e-06, + "loss": 0.6809, + "step": 1856 + }, + { + "epoch": 0.8780141843971632, + "grad_norm": 2.9070613384246826, + "learning_rate": 4.763552852773899e-06, + "loss": 0.5913, + "step": 1857 + }, + { + "epoch": 0.8784869976359339, + "grad_norm": 2.7050609588623047, + "learning_rate": 4.7632879587114386e-06, + "loss": 0.6074, + "step": 1858 + }, + { + "epoch": 0.8789598108747045, + "grad_norm": 2.891134262084961, + "learning_rate": 4.76302292372396e-06, + "loss": 0.5939, + "step": 1859 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 2.8581702709198, + "learning_rate": 4.762757747827968e-06, + "loss": 0.5972, + "step": 1860 + }, + { + "epoch": 0.8799054373522459, + "grad_norm": 2.8266196250915527, + "learning_rate": 4.762492431039971e-06, + "loss": 0.5993, + "step": 1861 + }, + { + "epoch": 0.8803782505910166, + "grad_norm": 2.4853954315185547, + "learning_rate": 4.762226973376493e-06, + "loss": 0.6388, + "step": 1862 + }, + { + "epoch": 0.8808510638297873, + "grad_norm": 3.2212886810302734, + "learning_rate": 4.761961374854059e-06, + "loss": 0.6698, + "step": 1863 + }, + { + "epoch": 0.881323877068558, + "grad_norm": 3.1254501342773438, + "learning_rate": 4.761695635489211e-06, + "loss": 0.5263, + "step": 1864 + }, + { + "epoch": 0.8817966903073287, + "grad_norm": 2.6891462802886963, + "learning_rate": 4.761429755298491e-06, + "loss": 0.5359, + "step": 1865 + }, + { + "epoch": 0.8822695035460993, + "grad_norm": 2.8557538986206055, + "learning_rate": 4.761163734298457e-06, + "loss": 0.5933, + "step": 1866 + }, + { + "epoch": 0.88274231678487, + "grad_norm": 2.53548264503479, + "learning_rate": 4.7608975725056724e-06, + "loss": 0.6397, + "step": 1867 + }, + { + "epoch": 0.8832151300236407, + "grad_norm": 3.0237956047058105, + "learning_rate": 4.76063126993671e-06, + "loss": 0.6845, + "step": 1868 + }, + { + "epoch": 0.8836879432624114, + "grad_norm": 3.222886800765991, + "learning_rate": 4.76036482660815e-06, + "loss": 0.6055, + "step": 1869 + }, + { + "epoch": 0.8841607565011821, + "grad_norm": 3.1867551803588867, + "learning_rate": 4.760098242536584e-06, + "loss": 0.6592, + "step": 1870 + }, + { + "epoch": 0.8846335697399527, + "grad_norm": 2.782209873199463, + "learning_rate": 4.7598315177386115e-06, + "loss": 0.5833, + "step": 1871 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 2.899871587753296, + "learning_rate": 4.759564652230838e-06, + "loss": 0.6129, + "step": 1872 + }, + { + "epoch": 0.885579196217494, + "grad_norm": 2.5690579414367676, + "learning_rate": 4.759297646029882e-06, + "loss": 0.5827, + "step": 1873 + }, + { + "epoch": 0.8860520094562647, + "grad_norm": 2.666130304336548, + "learning_rate": 4.759030499152368e-06, + "loss": 0.5272, + "step": 1874 + }, + { + "epoch": 0.8865248226950354, + "grad_norm": 2.7030911445617676, + "learning_rate": 4.758763211614932e-06, + "loss": 0.6415, + "step": 1875 + }, + { + "epoch": 0.8869976359338061, + "grad_norm": 2.717512845993042, + "learning_rate": 4.7584957834342135e-06, + "loss": 0.5827, + 
"step": 1876 + }, + { + "epoch": 0.8874704491725768, + "grad_norm": 2.665823459625244, + "learning_rate": 4.758228214626867e-06, + "loss": 0.6209, + "step": 1877 + }, + { + "epoch": 0.8879432624113475, + "grad_norm": 2.636653184890747, + "learning_rate": 4.75796050520955e-06, + "loss": 0.6413, + "step": 1878 + }, + { + "epoch": 0.8884160756501182, + "grad_norm": 2.585115671157837, + "learning_rate": 4.7576926551989345e-06, + "loss": 0.5518, + "step": 1879 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.808526039123535, + "learning_rate": 4.757424664611697e-06, + "loss": 0.5717, + "step": 1880 + }, + { + "epoch": 0.8893617021276595, + "grad_norm": 3.5957939624786377, + "learning_rate": 4.757156533464524e-06, + "loss": 0.6323, + "step": 1881 + }, + { + "epoch": 0.8898345153664302, + "grad_norm": 2.5003883838653564, + "learning_rate": 4.756888261774111e-06, + "loss": 0.5937, + "step": 1882 + }, + { + "epoch": 0.8903073286052009, + "grad_norm": 2.749061346054077, + "learning_rate": 4.756619849557161e-06, + "loss": 0.6642, + "step": 1883 + }, + { + "epoch": 0.8907801418439716, + "grad_norm": 2.6757891178131104, + "learning_rate": 4.756351296830389e-06, + "loss": 0.5887, + "step": 1884 + }, + { + "epoch": 0.8912529550827423, + "grad_norm": 2.811925172805786, + "learning_rate": 4.756082603610516e-06, + "loss": 0.6571, + "step": 1885 + }, + { + "epoch": 0.891725768321513, + "grad_norm": 2.5054616928100586, + "learning_rate": 4.755813769914271e-06, + "loss": 0.6312, + "step": 1886 + }, + { + "epoch": 0.8921985815602836, + "grad_norm": 2.7518467903137207, + "learning_rate": 4.755544795758395e-06, + "loss": 0.6685, + "step": 1887 + }, + { + "epoch": 0.8926713947990543, + "grad_norm": 2.7527287006378174, + "learning_rate": 4.755275681159634e-06, + "loss": 0.5886, + "step": 1888 + }, + { + "epoch": 0.893144208037825, + "grad_norm": 2.6162452697753906, + "learning_rate": 4.755006426134745e-06, + "loss": 0.546, + "step": 1889 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 2.4016737937927246, + "learning_rate": 4.754737030700495e-06, + "loss": 0.5726, + "step": 1890 + }, + { + "epoch": 0.8940898345153664, + "grad_norm": 2.528327703475952, + "learning_rate": 4.754467494873656e-06, + "loss": 0.5682, + "step": 1891 + }, + { + "epoch": 0.8945626477541371, + "grad_norm": 2.3139286041259766, + "learning_rate": 4.7541978186710115e-06, + "loss": 0.6108, + "step": 1892 + }, + { + "epoch": 0.8950354609929078, + "grad_norm": 2.7269136905670166, + "learning_rate": 4.753928002109354e-06, + "loss": 0.5875, + "step": 1893 + }, + { + "epoch": 0.8955082742316784, + "grad_norm": 4.425495147705078, + "learning_rate": 4.753658045205482e-06, + "loss": 0.5572, + "step": 1894 + }, + { + "epoch": 0.8959810874704491, + "grad_norm": 2.535409927368164, + "learning_rate": 4.753387947976206e-06, + "loss": 0.5868, + "step": 1895 + }, + { + "epoch": 0.8964539007092198, + "grad_norm": 2.722458600997925, + "learning_rate": 4.753117710438343e-06, + "loss": 0.5935, + "step": 1896 + }, + { + "epoch": 0.8969267139479905, + "grad_norm": 2.743861436843872, + "learning_rate": 4.75284733260872e-06, + "loss": 0.572, + "step": 1897 + }, + { + "epoch": 0.8973995271867612, + "grad_norm": 2.60640549659729, + "learning_rate": 4.752576814504173e-06, + "loss": 0.567, + "step": 1898 + }, + { + "epoch": 0.8978723404255319, + "grad_norm": 2.7486042976379395, + "learning_rate": 4.7523061561415435e-06, + "loss": 0.5768, + "step": 1899 + }, + { + "epoch": 0.8983451536643026, + "grad_norm": 3.8410251140594482, + "learning_rate": 
4.752035357537686e-06, + "loss": 0.6034, + "step": 1900 + }, + { + "epoch": 0.8988179669030733, + "grad_norm": 3.0935890674591064, + "learning_rate": 4.751764418709462e-06, + "loss": 0.5644, + "step": 1901 + }, + { + "epoch": 0.8992907801418439, + "grad_norm": 2.7989892959594727, + "learning_rate": 4.751493339673742e-06, + "loss": 0.656, + "step": 1902 + }, + { + "epoch": 0.8997635933806146, + "grad_norm": 3.6940557956695557, + "learning_rate": 4.751222120447403e-06, + "loss": 0.6632, + "step": 1903 + }, + { + "epoch": 0.9002364066193853, + "grad_norm": 2.3428797721862793, + "learning_rate": 4.750950761047335e-06, + "loss": 0.4485, + "step": 1904 + }, + { + "epoch": 0.900709219858156, + "grad_norm": 2.622544050216675, + "learning_rate": 4.750679261490432e-06, + "loss": 0.5857, + "step": 1905 + }, + { + "epoch": 0.9011820330969267, + "grad_norm": 2.4911322593688965, + "learning_rate": 4.750407621793601e-06, + "loss": 0.5618, + "step": 1906 + }, + { + "epoch": 0.9016548463356974, + "grad_norm": 2.6434662342071533, + "learning_rate": 4.750135841973755e-06, + "loss": 0.6057, + "step": 1907 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 3.115443706512451, + "learning_rate": 4.749863922047817e-06, + "loss": 0.6064, + "step": 1908 + }, + { + "epoch": 0.9026004728132387, + "grad_norm": 2.5671091079711914, + "learning_rate": 4.749591862032718e-06, + "loss": 0.5625, + "step": 1909 + }, + { + "epoch": 0.9030732860520094, + "grad_norm": 3.2008655071258545, + "learning_rate": 4.749319661945398e-06, + "loss": 0.5547, + "step": 1910 + }, + { + "epoch": 0.9035460992907801, + "grad_norm": 2.905987024307251, + "learning_rate": 4.749047321802805e-06, + "loss": 0.6033, + "step": 1911 + }, + { + "epoch": 0.9040189125295508, + "grad_norm": 3.1456053256988525, + "learning_rate": 4.748774841621897e-06, + "loss": 0.5651, + "step": 1912 + }, + { + "epoch": 0.9044917257683215, + "grad_norm": 2.8116416931152344, + "learning_rate": 4.748502221419641e-06, + "loss": 0.5853, + "step": 1913 + }, + { + "epoch": 0.9049645390070922, + "grad_norm": 3.123835325241089, + "learning_rate": 4.748229461213011e-06, + "loss": 0.5427, + "step": 1914 + }, + { + "epoch": 0.9054373522458629, + "grad_norm": 2.4750146865844727, + "learning_rate": 4.747956561018989e-06, + "loss": 0.6517, + "step": 1915 + }, + { + "epoch": 0.9059101654846335, + "grad_norm": 2.6174299716949463, + "learning_rate": 4.7476835208545705e-06, + "loss": 0.6119, + "step": 1916 + }, + { + "epoch": 0.9063829787234042, + "grad_norm": 2.7390382289886475, + "learning_rate": 4.747410340736755e-06, + "loss": 0.5664, + "step": 1917 + }, + { + "epoch": 0.9068557919621749, + "grad_norm": 2.7940444946289062, + "learning_rate": 4.747137020682552e-06, + "loss": 0.5628, + "step": 1918 + }, + { + "epoch": 0.9073286052009456, + "grad_norm": 2.477365016937256, + "learning_rate": 4.7468635607089795e-06, + "loss": 0.5261, + "step": 1919 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 2.7016685009002686, + "learning_rate": 4.746589960833066e-06, + "loss": 0.5576, + "step": 1920 + }, + { + "epoch": 0.908274231678487, + "grad_norm": 2.8806519508361816, + "learning_rate": 4.746316221071846e-06, + "loss": 0.5925, + "step": 1921 + }, + { + "epoch": 0.9087470449172577, + "grad_norm": 3.0315234661102295, + "learning_rate": 4.746042341442365e-06, + "loss": 0.6142, + "step": 1922 + }, + { + "epoch": 0.9092198581560283, + "grad_norm": 4.2446160316467285, + "learning_rate": 4.745768321961676e-06, + "loss": 0.5352, + "step": 1923 + }, + { + "epoch": 0.909692671394799, + "grad_norm": 
2.6517012119293213, + "learning_rate": 4.745494162646841e-06, + "loss": 0.6118, + "step": 1924 + }, + { + "epoch": 0.9101654846335697, + "grad_norm": 2.774900197982788, + "learning_rate": 4.7452198635149304e-06, + "loss": 0.572, + "step": 1925 + }, + { + "epoch": 0.9106382978723404, + "grad_norm": 3.0133683681488037, + "learning_rate": 4.744945424583024e-06, + "loss": 0.5897, + "step": 1926 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 2.7344839572906494, + "learning_rate": 4.744670845868211e-06, + "loss": 0.6207, + "step": 1927 + }, + { + "epoch": 0.9115839243498818, + "grad_norm": 2.636578321456909, + "learning_rate": 4.744396127387586e-06, + "loss": 0.6687, + "step": 1928 + }, + { + "epoch": 0.9120567375886525, + "grad_norm": 2.8663458824157715, + "learning_rate": 4.744121269158255e-06, + "loss": 0.5002, + "step": 1929 + }, + { + "epoch": 0.9125295508274232, + "grad_norm": 2.661079168319702, + "learning_rate": 4.743846271197333e-06, + "loss": 0.5848, + "step": 1930 + }, + { + "epoch": 0.9130023640661938, + "grad_norm": 2.881256341934204, + "learning_rate": 4.743571133521943e-06, + "loss": 0.5911, + "step": 1931 + }, + { + "epoch": 0.9134751773049645, + "grad_norm": 2.5540573596954346, + "learning_rate": 4.743295856149217e-06, + "loss": 0.5647, + "step": 1932 + }, + { + "epoch": 0.9139479905437352, + "grad_norm": 2.7060387134552, + "learning_rate": 4.743020439096293e-06, + "loss": 0.6267, + "step": 1933 + }, + { + "epoch": 0.9144208037825059, + "grad_norm": 2.694481372833252, + "learning_rate": 4.742744882380323e-06, + "loss": 0.6283, + "step": 1934 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 2.711555242538452, + "learning_rate": 4.7424691860184625e-06, + "loss": 0.5784, + "step": 1935 + }, + { + "epoch": 0.9153664302600473, + "grad_norm": 2.9077224731445312, + "learning_rate": 4.742193350027879e-06, + "loss": 0.5948, + "step": 1936 + }, + { + "epoch": 0.915839243498818, + "grad_norm": 2.9824187755584717, + "learning_rate": 4.7419173744257476e-06, + "loss": 0.6115, + "step": 1937 + }, + { + "epoch": 0.9163120567375886, + "grad_norm": 2.5127830505371094, + "learning_rate": 4.7416412592292515e-06, + "loss": 0.5803, + "step": 1938 + }, + { + "epoch": 0.9167848699763593, + "grad_norm": 3.1307175159454346, + "learning_rate": 4.741365004455583e-06, + "loss": 0.5657, + "step": 1939 + }, + { + "epoch": 0.91725768321513, + "grad_norm": 2.8205273151397705, + "learning_rate": 4.741088610121944e-06, + "loss": 0.6145, + "step": 1940 + }, + { + "epoch": 0.9177304964539007, + "grad_norm": 2.6119720935821533, + "learning_rate": 4.7408120762455444e-06, + "loss": 0.6058, + "step": 1941 + }, + { + "epoch": 0.9182033096926714, + "grad_norm": 2.421276092529297, + "learning_rate": 4.7405354028436025e-06, + "loss": 0.5973, + "step": 1942 + }, + { + "epoch": 0.9186761229314421, + "grad_norm": 2.9846808910369873, + "learning_rate": 4.740258589933346e-06, + "loss": 0.6892, + "step": 1943 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 2.6899871826171875, + "learning_rate": 4.739981637532009e-06, + "loss": 0.5705, + "step": 1944 + }, + { + "epoch": 0.9196217494089834, + "grad_norm": 2.8636131286621094, + "learning_rate": 4.739704545656839e-06, + "loss": 0.5775, + "step": 1945 + }, + { + "epoch": 0.9200945626477541, + "grad_norm": 2.7659449577331543, + "learning_rate": 4.739427314325087e-06, + "loss": 0.5823, + "step": 1946 + }, + { + "epoch": 0.9205673758865248, + "grad_norm": 4.71295166015625, + "learning_rate": 4.739149943554016e-06, + "loss": 0.5601, + "step": 1947 + }, + { + "epoch": 
0.9210401891252955, + "grad_norm": 2.642636775970459, + "learning_rate": 4.738872433360896e-06, + "loss": 0.5278, + "step": 1948 + }, + { + "epoch": 0.9215130023640662, + "grad_norm": 2.4658217430114746, + "learning_rate": 4.7385947837630065e-06, + "loss": 0.6392, + "step": 1949 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 2.851602792739868, + "learning_rate": 4.738316994777636e-06, + "loss": 0.6164, + "step": 1950 + }, + { + "epoch": 0.9224586288416076, + "grad_norm": 2.394226551055908, + "learning_rate": 4.738039066422081e-06, + "loss": 0.5556, + "step": 1951 + }, + { + "epoch": 0.9229314420803783, + "grad_norm": 2.7985100746154785, + "learning_rate": 4.737760998713647e-06, + "loss": 0.5799, + "step": 1952 + }, + { + "epoch": 0.9234042553191489, + "grad_norm": 2.5974674224853516, + "learning_rate": 4.737482791669648e-06, + "loss": 0.6984, + "step": 1953 + }, + { + "epoch": 0.9238770685579196, + "grad_norm": 2.707636594772339, + "learning_rate": 4.737204445307406e-06, + "loss": 0.5548, + "step": 1954 + }, + { + "epoch": 0.9243498817966903, + "grad_norm": 2.7882707118988037, + "learning_rate": 4.736925959644254e-06, + "loss": 0.6026, + "step": 1955 + }, + { + "epoch": 0.924822695035461, + "grad_norm": 2.474482774734497, + "learning_rate": 4.7366473346975304e-06, + "loss": 0.5832, + "step": 1956 + }, + { + "epoch": 0.9252955082742317, + "grad_norm": 2.6196324825286865, + "learning_rate": 4.736368570484585e-06, + "loss": 0.5861, + "step": 1957 + }, + { + "epoch": 0.9257683215130024, + "grad_norm": 2.826864004135132, + "learning_rate": 4.736089667022775e-06, + "loss": 0.6173, + "step": 1958 + }, + { + "epoch": 0.926241134751773, + "grad_norm": 2.414473056793213, + "learning_rate": 4.735810624329466e-06, + "loss": 0.5753, + "step": 1959 + }, + { + "epoch": 0.9267139479905437, + "grad_norm": 2.8037970066070557, + "learning_rate": 4.7355314424220335e-06, + "loss": 0.6207, + "step": 1960 + }, + { + "epoch": 0.9271867612293144, + "grad_norm": 2.645458698272705, + "learning_rate": 4.735252121317861e-06, + "loss": 0.5959, + "step": 1961 + }, + { + "epoch": 0.9276595744680851, + "grad_norm": 2.7983884811401367, + "learning_rate": 4.734972661034339e-06, + "loss": 0.5696, + "step": 1962 + }, + { + "epoch": 0.9281323877068558, + "grad_norm": 3.0568997859954834, + "learning_rate": 4.73469306158887e-06, + "loss": 0.6194, + "step": 1963 + }, + { + "epoch": 0.9286052009456265, + "grad_norm": 2.7205135822296143, + "learning_rate": 4.734413322998863e-06, + "loss": 0.5292, + "step": 1964 + }, + { + "epoch": 0.9290780141843972, + "grad_norm": 3.3168489933013916, + "learning_rate": 4.734133445281735e-06, + "loss": 0.5654, + "step": 1965 + }, + { + "epoch": 0.9295508274231679, + "grad_norm": 3.0095653533935547, + "learning_rate": 4.733853428454916e-06, + "loss": 0.6508, + "step": 1966 + }, + { + "epoch": 0.9300236406619385, + "grad_norm": 2.7726712226867676, + "learning_rate": 4.733573272535838e-06, + "loss": 0.644, + "step": 1967 + }, + { + "epoch": 0.9304964539007092, + "grad_norm": 2.474397659301758, + "learning_rate": 4.7332929775419456e-06, + "loss": 0.5479, + "step": 1968 + }, + { + "epoch": 0.9309692671394799, + "grad_norm": 2.4518635272979736, + "learning_rate": 4.733012543490693e-06, + "loss": 0.6, + "step": 1969 + }, + { + "epoch": 0.9314420803782506, + "grad_norm": 2.9292192459106445, + "learning_rate": 4.73273197039954e-06, + "loss": 0.6647, + "step": 1970 + }, + { + "epoch": 0.9319148936170213, + "grad_norm": 2.425004720687866, + "learning_rate": 4.732451258285958e-06, + "loss": 0.6338, + 
"step": 1971 + }, + { + "epoch": 0.932387706855792, + "grad_norm": 2.904479503631592, + "learning_rate": 4.7321704071674255e-06, + "loss": 0.5923, + "step": 1972 + }, + { + "epoch": 0.9328605200945627, + "grad_norm": 2.477085590362549, + "learning_rate": 4.731889417061428e-06, + "loss": 0.5984, + "step": 1973 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 2.585240364074707, + "learning_rate": 4.731608287985465e-06, + "loss": 0.558, + "step": 1974 + }, + { + "epoch": 0.933806146572104, + "grad_norm": 2.658714532852173, + "learning_rate": 4.731327019957039e-06, + "loss": 0.5567, + "step": 1975 + }, + { + "epoch": 0.9342789598108747, + "grad_norm": 2.7593026161193848, + "learning_rate": 4.731045612993662e-06, + "loss": 0.5772, + "step": 1976 + }, + { + "epoch": 0.9347517730496454, + "grad_norm": 2.4386026859283447, + "learning_rate": 4.7307640671128585e-06, + "loss": 0.6199, + "step": 1977 + }, + { + "epoch": 0.9352245862884161, + "grad_norm": 2.681910514831543, + "learning_rate": 4.730482382332158e-06, + "loss": 0.5971, + "step": 1978 + }, + { + "epoch": 0.9356973995271868, + "grad_norm": 3.7593860626220703, + "learning_rate": 4.7302005586691e-06, + "loss": 0.6346, + "step": 1979 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.5789096355438232, + "learning_rate": 4.729918596141232e-06, + "loss": 0.5684, + "step": 1980 + }, + { + "epoch": 0.9366430260047282, + "grad_norm": 3.0607335567474365, + "learning_rate": 4.729636494766111e-06, + "loss": 0.6223, + "step": 1981 + }, + { + "epoch": 0.9371158392434988, + "grad_norm": 2.906643867492676, + "learning_rate": 4.729354254561303e-06, + "loss": 0.6513, + "step": 1982 + }, + { + "epoch": 0.9375886524822695, + "grad_norm": 3.192430019378662, + "learning_rate": 4.7290718755443795e-06, + "loss": 0.5095, + "step": 1983 + }, + { + "epoch": 0.9380614657210402, + "grad_norm": 2.661536931991577, + "learning_rate": 4.7287893577329255e-06, + "loss": 0.5525, + "step": 1984 + }, + { + "epoch": 0.9385342789598109, + "grad_norm": 2.8436734676361084, + "learning_rate": 4.728506701144531e-06, + "loss": 0.6323, + "step": 1985 + }, + { + "epoch": 0.9390070921985816, + "grad_norm": 2.75544810295105, + "learning_rate": 4.728223905796796e-06, + "loss": 0.6018, + "step": 1986 + }, + { + "epoch": 0.9394799054373523, + "grad_norm": 3.0652759075164795, + "learning_rate": 4.727940971707329e-06, + "loss": 0.62, + "step": 1987 + }, + { + "epoch": 0.939952718676123, + "grad_norm": 2.802567720413208, + "learning_rate": 4.727657898893747e-06, + "loss": 0.5809, + "step": 1988 + }, + { + "epoch": 0.9404255319148936, + "grad_norm": 2.6208512783050537, + "learning_rate": 4.7273746873736745e-06, + "loss": 0.5762, + "step": 1989 + }, + { + "epoch": 0.9408983451536643, + "grad_norm": 2.5901873111724854, + "learning_rate": 4.727091337164748e-06, + "loss": 0.6111, + "step": 1990 + }, + { + "epoch": 0.941371158392435, + "grad_norm": 3.002347707748413, + "learning_rate": 4.726807848284609e-06, + "loss": 0.6419, + "step": 1991 + }, + { + "epoch": 0.9418439716312057, + "grad_norm": 2.522151470184326, + "learning_rate": 4.72652422075091e-06, + "loss": 0.642, + "step": 1992 + }, + { + "epoch": 0.9423167848699764, + "grad_norm": 2.5571532249450684, + "learning_rate": 4.726240454581311e-06, + "loss": 0.5729, + "step": 1993 + }, + { + "epoch": 0.9427895981087471, + "grad_norm": 2.7704918384552, + "learning_rate": 4.72595654979348e-06, + "loss": 0.6816, + "step": 1994 + }, + { + "epoch": 0.9432624113475178, + "grad_norm": 2.517040491104126, + "learning_rate": 
4.7256725064050955e-06, + "loss": 0.5782, + "step": 1995 + }, + { + "epoch": 0.9437352245862884, + "grad_norm": 2.613955020904541, + "learning_rate": 4.725388324433843e-06, + "loss": 0.6291, + "step": 1996 + }, + { + "epoch": 0.9442080378250591, + "grad_norm": 2.848891258239746, + "learning_rate": 4.725104003897418e-06, + "loss": 0.6544, + "step": 1997 + }, + { + "epoch": 0.9446808510638298, + "grad_norm": 3.0162429809570312, + "learning_rate": 4.724819544813523e-06, + "loss": 0.6301, + "step": 1998 + }, + { + "epoch": 0.9451536643026005, + "grad_norm": 2.613614559173584, + "learning_rate": 4.72453494719987e-06, + "loss": 0.5829, + "step": 1999 + }, + { + "epoch": 0.9456264775413712, + "grad_norm": 2.4838767051696777, + "learning_rate": 4.724250211074182e-06, + "loss": 0.6042, + "step": 2000 + }, + { + "epoch": 0.9460992907801419, + "grad_norm": 2.526470899581909, + "learning_rate": 4.723965336454185e-06, + "loss": 0.6167, + "step": 2001 + }, + { + "epoch": 0.9465721040189126, + "grad_norm": 2.504506826400757, + "learning_rate": 4.723680323357618e-06, + "loss": 0.6061, + "step": 2002 + }, + { + "epoch": 0.9470449172576832, + "grad_norm": 3.0547544956207275, + "learning_rate": 4.723395171802228e-06, + "loss": 0.6619, + "step": 2003 + }, + { + "epoch": 0.9475177304964539, + "grad_norm": 2.8692407608032227, + "learning_rate": 4.723109881805771e-06, + "loss": 0.5985, + "step": 2004 + }, + { + "epoch": 0.9479905437352246, + "grad_norm": 2.7929654121398926, + "learning_rate": 4.7228244533860094e-06, + "loss": 0.5869, + "step": 2005 + }, + { + "epoch": 0.9484633569739953, + "grad_norm": 2.764869451522827, + "learning_rate": 4.7225388865607146e-06, + "loss": 0.6288, + "step": 2006 + }, + { + "epoch": 0.948936170212766, + "grad_norm": 2.7656404972076416, + "learning_rate": 4.722253181347671e-06, + "loss": 0.5831, + "step": 2007 + }, + { + "epoch": 0.9494089834515367, + "grad_norm": 2.6698336601257324, + "learning_rate": 4.7219673377646635e-06, + "loss": 0.6087, + "step": 2008 + }, + { + "epoch": 0.9498817966903074, + "grad_norm": 2.524935722351074, + "learning_rate": 4.7216813558294946e-06, + "loss": 0.5675, + "step": 2009 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 2.5998785495758057, + "learning_rate": 4.721395235559969e-06, + "loss": 0.5667, + "step": 2010 + }, + { + "epoch": 0.9508274231678487, + "grad_norm": 2.758021354675293, + "learning_rate": 4.721108976973902e-06, + "loss": 0.4931, + "step": 2011 + }, + { + "epoch": 0.9513002364066194, + "grad_norm": 2.767695903778076, + "learning_rate": 4.72082258008912e-06, + "loss": 0.5778, + "step": 2012 + }, + { + "epoch": 0.9517730496453901, + "grad_norm": 2.982314348220825, + "learning_rate": 4.720536044923453e-06, + "loss": 0.6096, + "step": 2013 + }, + { + "epoch": 0.9522458628841608, + "grad_norm": 2.7608799934387207, + "learning_rate": 4.720249371494743e-06, + "loss": 0.6242, + "step": 2014 + }, + { + "epoch": 0.9527186761229315, + "grad_norm": 2.60054349899292, + "learning_rate": 4.71996255982084e-06, + "loss": 0.6249, + "step": 2015 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 2.654355764389038, + "learning_rate": 4.719675609919603e-06, + "loss": 0.6327, + "step": 2016 + }, + { + "epoch": 0.9536643026004729, + "grad_norm": 2.589404582977295, + "learning_rate": 4.719388521808899e-06, + "loss": 0.6357, + "step": 2017 + }, + { + "epoch": 0.9541371158392435, + "grad_norm": 2.8016581535339355, + "learning_rate": 4.719101295506603e-06, + "loss": 0.5901, + "step": 2018 + }, + { + "epoch": 0.9546099290780142, + "grad_norm": 
3.1408045291900635, + "learning_rate": 4.7188139310306e-06, + "loss": 0.598, + "step": 2019 + }, + { + "epoch": 0.9550827423167849, + "grad_norm": 2.7432665824890137, + "learning_rate": 4.718526428398783e-06, + "loss": 0.5508, + "step": 2020 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 2.947800874710083, + "learning_rate": 4.718238787629053e-06, + "loss": 0.6439, + "step": 2021 + }, + { + "epoch": 0.9560283687943263, + "grad_norm": 2.50828218460083, + "learning_rate": 4.71795100873932e-06, + "loss": 0.5441, + "step": 2022 + }, + { + "epoch": 0.956501182033097, + "grad_norm": 2.8558974266052246, + "learning_rate": 4.717663091747503e-06, + "loss": 0.5416, + "step": 2023 + }, + { + "epoch": 0.9569739952718677, + "grad_norm": 2.4803316593170166, + "learning_rate": 4.71737503667153e-06, + "loss": 0.5317, + "step": 2024 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 4.36754035949707, + "learning_rate": 4.717086843529336e-06, + "loss": 0.5808, + "step": 2025 + }, + { + "epoch": 0.957919621749409, + "grad_norm": 2.730185031890869, + "learning_rate": 4.7167985123388665e-06, + "loss": 0.5257, + "step": 2026 + }, + { + "epoch": 0.9583924349881797, + "grad_norm": 2.8136069774627686, + "learning_rate": 4.716510043118074e-06, + "loss": 0.5836, + "step": 2027 + }, + { + "epoch": 0.9588652482269504, + "grad_norm": 2.793975353240967, + "learning_rate": 4.71622143588492e-06, + "loss": 0.5706, + "step": 2028 + }, + { + "epoch": 0.9593380614657211, + "grad_norm": 2.3883821964263916, + "learning_rate": 4.7159326906573745e-06, + "loss": 0.5291, + "step": 2029 + }, + { + "epoch": 0.9598108747044918, + "grad_norm": 2.6135976314544678, + "learning_rate": 4.715643807453417e-06, + "loss": 0.6199, + "step": 2030 + }, + { + "epoch": 0.9602836879432625, + "grad_norm": 2.6245670318603516, + "learning_rate": 4.715354786291035e-06, + "loss": 0.5585, + "step": 2031 + }, + { + "epoch": 0.9607565011820332, + "grad_norm": 2.7870967388153076, + "learning_rate": 4.715065627188225e-06, + "loss": 0.6196, + "step": 2032 + }, + { + "epoch": 0.9612293144208038, + "grad_norm": 2.6983911991119385, + "learning_rate": 4.714776330162991e-06, + "loss": 0.6424, + "step": 2033 + }, + { + "epoch": 0.9617021276595744, + "grad_norm": 2.3221919536590576, + "learning_rate": 4.7144868952333465e-06, + "loss": 0.568, + "step": 2034 + }, + { + "epoch": 0.9621749408983451, + "grad_norm": 2.9408178329467773, + "learning_rate": 4.714197322417314e-06, + "loss": 0.6175, + "step": 2035 + }, + { + "epoch": 0.9626477541371158, + "grad_norm": 2.404057264328003, + "learning_rate": 4.713907611732921e-06, + "loss": 0.4943, + "step": 2036 + }, + { + "epoch": 0.9631205673758865, + "grad_norm": 3.547607660293579, + "learning_rate": 4.71361776319821e-06, + "loss": 0.5488, + "step": 2037 + }, + { + "epoch": 0.9635933806146572, + "grad_norm": 2.679614543914795, + "learning_rate": 4.713327776831227e-06, + "loss": 0.6234, + "step": 2038 + }, + { + "epoch": 0.9640661938534278, + "grad_norm": 2.526914119720459, + "learning_rate": 4.7130376526500286e-06, + "loss": 0.5891, + "step": 2039 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 2.6953470706939697, + "learning_rate": 4.71274739067268e-06, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 0.9650118203309692, + "grad_norm": 2.546660900115967, + "learning_rate": 4.712456990917254e-06, + "loss": 0.6185, + "step": 2041 + }, + { + "epoch": 0.9654846335697399, + "grad_norm": 3.3920490741729736, + "learning_rate": 4.712166453401832e-06, + "loss": 0.587, + "step": 2042 + }, + { + "epoch": 
0.9659574468085106, + "grad_norm": 2.5961573123931885, + "learning_rate": 4.711875778144504e-06, + "loss": 0.6105, + "step": 2043 + }, + { + "epoch": 0.9664302600472813, + "grad_norm": 2.5111498832702637, + "learning_rate": 4.711584965163372e-06, + "loss": 0.5533, + "step": 2044 + }, + { + "epoch": 0.966903073286052, + "grad_norm": 2.4878132343292236, + "learning_rate": 4.7112940144765405e-06, + "loss": 0.5604, + "step": 2045 + }, + { + "epoch": 0.9673758865248226, + "grad_norm": 2.5714077949523926, + "learning_rate": 4.711002926102128e-06, + "loss": 0.5794, + "step": 2046 + }, + { + "epoch": 0.9678486997635933, + "grad_norm": 2.7069091796875, + "learning_rate": 4.710711700058257e-06, + "loss": 0.594, + "step": 2047 + }, + { + "epoch": 0.968321513002364, + "grad_norm": 2.8104631900787354, + "learning_rate": 4.710420336363063e-06, + "loss": 0.6247, + "step": 2048 + }, + { + "epoch": 0.9687943262411347, + "grad_norm": 2.8464386463165283, + "learning_rate": 4.7101288350346865e-06, + "loss": 0.6162, + "step": 2049 + }, + { + "epoch": 0.9692671394799054, + "grad_norm": 2.7187976837158203, + "learning_rate": 4.709837196091279e-06, + "loss": 0.6109, + "step": 2050 + }, + { + "epoch": 0.9697399527186761, + "grad_norm": 2.556734085083008, + "learning_rate": 4.709545419550999e-06, + "loss": 0.6297, + "step": 2051 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 2.937195062637329, + "learning_rate": 4.709253505432014e-06, + "loss": 0.6862, + "step": 2052 + }, + { + "epoch": 0.9706855791962175, + "grad_norm": 2.792175531387329, + "learning_rate": 4.7089614537525015e-06, + "loss": 0.6105, + "step": 2053 + }, + { + "epoch": 0.9711583924349881, + "grad_norm": 2.625636100769043, + "learning_rate": 4.708669264530644e-06, + "loss": 0.5849, + "step": 2054 + }, + { + "epoch": 0.9716312056737588, + "grad_norm": 2.6752610206604004, + "learning_rate": 4.708376937784637e-06, + "loss": 0.5949, + "step": 2055 + }, + { + "epoch": 0.9721040189125295, + "grad_norm": 2.6072793006896973, + "learning_rate": 4.708084473532681e-06, + "loss": 0.5776, + "step": 2056 + }, + { + "epoch": 0.9725768321513002, + "grad_norm": 2.728632926940918, + "learning_rate": 4.707791871792988e-06, + "loss": 0.6352, + "step": 2057 + }, + { + "epoch": 0.9730496453900709, + "grad_norm": 2.5841758251190186, + "learning_rate": 4.707499132583775e-06, + "loss": 0.5488, + "step": 2058 + }, + { + "epoch": 0.9735224586288416, + "grad_norm": 2.8464293479919434, + "learning_rate": 4.707206255923271e-06, + "loss": 0.7051, + "step": 2059 + }, + { + "epoch": 0.9739952718676123, + "grad_norm": 2.547297239303589, + "learning_rate": 4.706913241829712e-06, + "loss": 0.5937, + "step": 2060 + }, + { + "epoch": 0.9744680851063829, + "grad_norm": 2.6572306156158447, + "learning_rate": 4.706620090321341e-06, + "loss": 0.6041, + "step": 2061 + }, + { + "epoch": 0.9749408983451536, + "grad_norm": 2.3262805938720703, + "learning_rate": 4.706326801416414e-06, + "loss": 0.5144, + "step": 2062 + }, + { + "epoch": 0.9754137115839243, + "grad_norm": 2.9693965911865234, + "learning_rate": 4.706033375133191e-06, + "loss": 0.551, + "step": 2063 + }, + { + "epoch": 0.975886524822695, + "grad_norm": 2.5993731021881104, + "learning_rate": 4.7057398114899435e-06, + "loss": 0.6143, + "step": 2064 + }, + { + "epoch": 0.9763593380614657, + "grad_norm": 2.453336477279663, + "learning_rate": 4.70544611050495e-06, + "loss": 0.6093, + "step": 2065 + }, + { + "epoch": 0.9768321513002364, + "grad_norm": 2.898629665374756, + "learning_rate": 4.705152272196497e-06, + "loss": 0.6007, + 
"step": 2066 + }, + { + "epoch": 0.9773049645390071, + "grad_norm": 2.7990612983703613, + "learning_rate": 4.7048582965828815e-06, + "loss": 0.6687, + "step": 2067 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 2.635284423828125, + "learning_rate": 4.704564183682408e-06, + "loss": 0.5564, + "step": 2068 + }, + { + "epoch": 0.9782505910165484, + "grad_norm": 3.014547109603882, + "learning_rate": 4.704269933513389e-06, + "loss": 0.6084, + "step": 2069 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 2.659357786178589, + "learning_rate": 4.703975546094147e-06, + "loss": 0.6031, + "step": 2070 + }, + { + "epoch": 0.9791962174940898, + "grad_norm": 2.326932668685913, + "learning_rate": 4.703681021443013e-06, + "loss": 0.5859, + "step": 2071 + }, + { + "epoch": 0.9796690307328605, + "grad_norm": 2.958803653717041, + "learning_rate": 4.7033863595783235e-06, + "loss": 0.5586, + "step": 2072 + }, + { + "epoch": 0.9801418439716312, + "grad_norm": 2.921386957168579, + "learning_rate": 4.703091560518427e-06, + "loss": 0.6126, + "step": 2073 + }, + { + "epoch": 0.9806146572104019, + "grad_norm": 2.6500775814056396, + "learning_rate": 4.702796624281679e-06, + "loss": 0.5678, + "step": 2074 + }, + { + "epoch": 0.9810874704491725, + "grad_norm": 2.7740228176116943, + "learning_rate": 4.702501550886445e-06, + "loss": 0.6067, + "step": 2075 + }, + { + "epoch": 0.9815602836879432, + "grad_norm": 2.3296213150024414, + "learning_rate": 4.702206340351096e-06, + "loss": 0.5247, + "step": 2076 + }, + { + "epoch": 0.9820330969267139, + "grad_norm": 2.748300790786743, + "learning_rate": 4.701910992694016e-06, + "loss": 0.5197, + "step": 2077 + }, + { + "epoch": 0.9825059101654846, + "grad_norm": 2.250985622406006, + "learning_rate": 4.7016155079335926e-06, + "loss": 0.5214, + "step": 2078 + }, + { + "epoch": 0.9829787234042553, + "grad_norm": 2.389845848083496, + "learning_rate": 4.701319886088226e-06, + "loss": 0.519, + "step": 2079 + }, + { + "epoch": 0.983451536643026, + "grad_norm": 2.818220853805542, + "learning_rate": 4.701024127176322e-06, + "loss": 0.607, + "step": 2080 + }, + { + "epoch": 0.9839243498817967, + "grad_norm": 3.4058034420013428, + "learning_rate": 4.700728231216297e-06, + "loss": 0.5711, + "step": 2081 + }, + { + "epoch": 0.9843971631205674, + "grad_norm": 2.5297787189483643, + "learning_rate": 4.700432198226575e-06, + "loss": 0.5979, + "step": 2082 + }, + { + "epoch": 0.984869976359338, + "grad_norm": 3.0548105239868164, + "learning_rate": 4.7001360282255885e-06, + "loss": 0.6041, + "step": 2083 + }, + { + "epoch": 0.9853427895981087, + "grad_norm": 2.8983733654022217, + "learning_rate": 4.699839721231779e-06, + "loss": 0.5926, + "step": 2084 + }, + { + "epoch": 0.9858156028368794, + "grad_norm": 3.2717764377593994, + "learning_rate": 4.699543277263596e-06, + "loss": 0.6477, + "step": 2085 + }, + { + "epoch": 0.9862884160756501, + "grad_norm": 3.03729248046875, + "learning_rate": 4.699246696339497e-06, + "loss": 0.6786, + "step": 2086 + }, + { + "epoch": 0.9867612293144208, + "grad_norm": 2.852301597595215, + "learning_rate": 4.698949978477951e-06, + "loss": 0.6565, + "step": 2087 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 2.843485116958618, + "learning_rate": 4.698653123697431e-06, + "loss": 0.6627, + "step": 2088 + }, + { + "epoch": 0.9877068557919622, + "grad_norm": 2.6315064430236816, + "learning_rate": 4.698356132016423e-06, + "loss": 0.6577, + "step": 2089 + }, + { + "epoch": 0.9881796690307328, + "grad_norm": 2.7482151985168457, + "learning_rate": 
4.698059003453417e-06, + "loss": 0.5514, + "step": 2090 + }, + { + "epoch": 0.9886524822695035, + "grad_norm": 2.826673746109009, + "learning_rate": 4.6977617380269145e-06, + "loss": 0.565, + "step": 2091 + }, + { + "epoch": 0.9891252955082742, + "grad_norm": 3.0273752212524414, + "learning_rate": 4.697464335755427e-06, + "loss": 0.6331, + "step": 2092 + }, + { + "epoch": 0.9895981087470449, + "grad_norm": 2.7551653385162354, + "learning_rate": 4.6971667966574695e-06, + "loss": 0.6486, + "step": 2093 + }, + { + "epoch": 0.9900709219858156, + "grad_norm": 2.656299114227295, + "learning_rate": 4.696869120751571e-06, + "loss": 0.6562, + "step": 2094 + }, + { + "epoch": 0.9905437352245863, + "grad_norm": 2.785322904586792, + "learning_rate": 4.696571308056265e-06, + "loss": 0.5892, + "step": 2095 + }, + { + "epoch": 0.991016548463357, + "grad_norm": 2.9334635734558105, + "learning_rate": 4.696273358590095e-06, + "loss": 0.6346, + "step": 2096 + }, + { + "epoch": 0.9914893617021276, + "grad_norm": 2.7944300174713135, + "learning_rate": 4.695975272371613e-06, + "loss": 0.5859, + "step": 2097 + }, + { + "epoch": 0.9919621749408983, + "grad_norm": 2.5416972637176514, + "learning_rate": 4.695677049419381e-06, + "loss": 0.5658, + "step": 2098 + }, + { + "epoch": 0.992434988179669, + "grad_norm": 2.4056856632232666, + "learning_rate": 4.695378689751966e-06, + "loss": 0.5121, + "step": 2099 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 2.614548683166504, + "learning_rate": 4.695080193387948e-06, + "loss": 0.5961, + "step": 2100 + }, + { + "epoch": 0.9933806146572104, + "grad_norm": 2.8966517448425293, + "learning_rate": 4.69478156034591e-06, + "loss": 0.5985, + "step": 2101 + }, + { + "epoch": 0.9938534278959811, + "grad_norm": 2.9514098167419434, + "learning_rate": 4.694482790644448e-06, + "loss": 0.5677, + "step": 2102 + }, + { + "epoch": 0.9943262411347518, + "grad_norm": 2.4326791763305664, + "learning_rate": 4.694183884302165e-06, + "loss": 0.5698, + "step": 2103 + }, + { + "epoch": 0.9947990543735225, + "grad_norm": 2.9242892265319824, + "learning_rate": 4.6938848413376735e-06, + "loss": 0.6245, + "step": 2104 + }, + { + "epoch": 0.9952718676122931, + "grad_norm": 2.9134104251861572, + "learning_rate": 4.693585661769593e-06, + "loss": 0.6164, + "step": 2105 + }, + { + "epoch": 0.9957446808510638, + "grad_norm": 2.472564458847046, + "learning_rate": 4.693286345616551e-06, + "loss": 0.5616, + "step": 2106 + }, + { + "epoch": 0.9962174940898345, + "grad_norm": 3.2456448078155518, + "learning_rate": 4.692986892897186e-06, + "loss": 0.6977, + "step": 2107 + }, + { + "epoch": 0.9966903073286052, + "grad_norm": 3.4032769203186035, + "learning_rate": 4.692687303630143e-06, + "loss": 0.643, + "step": 2108 + }, + { + "epoch": 0.9971631205673759, + "grad_norm": 2.722200870513916, + "learning_rate": 4.692387577834076e-06, + "loss": 0.5873, + "step": 2109 + }, + { + "epoch": 0.9976359338061466, + "grad_norm": 2.687532663345337, + "learning_rate": 4.692087715527648e-06, + "loss": 0.5423, + "step": 2110 + }, + { + "epoch": 0.9981087470449173, + "grad_norm": 2.578613042831421, + "learning_rate": 4.6917877167295305e-06, + "loss": 0.5689, + "step": 2111 + }, + { + "epoch": 0.9985815602836879, + "grad_norm": 3.1806094646453857, + "learning_rate": 4.691487581458402e-06, + "loss": 0.6133, + "step": 2112 + }, + { + "epoch": 0.9990543735224586, + "grad_norm": 2.4449520111083984, + "learning_rate": 4.691187309732952e-06, + "loss": 0.5841, + "step": 2113 + }, + { + "epoch": 0.9995271867612293, + "grad_norm": 
2.908749580383301, + "learning_rate": 4.690886901571875e-06, + "loss": 0.534, + "step": 2114 + }, + { + "epoch": 1.0, + "grad_norm": 4.019968032836914, + "learning_rate": 4.6905863569938785e-06, + "loss": 0.596, + "step": 2115 + }, + { + "epoch": 1.0004728132387706, + "grad_norm": 2.4319307804107666, + "learning_rate": 4.690285676017675e-06, + "loss": 0.4973, + "step": 2116 + }, + { + "epoch": 1.0009456264775414, + "grad_norm": 2.6366477012634277, + "learning_rate": 4.689984858661986e-06, + "loss": 0.5682, + "step": 2117 + }, + { + "epoch": 1.001418439716312, + "grad_norm": 2.815114974975586, + "learning_rate": 4.689683904945542e-06, + "loss": 0.5616, + "step": 2118 + }, + { + "epoch": 1.0018912529550827, + "grad_norm": 2.6680490970611572, + "learning_rate": 4.689382814887084e-06, + "loss": 0.5161, + "step": 2119 + }, + { + "epoch": 1.0023640661938533, + "grad_norm": 2.7406351566314697, + "learning_rate": 4.689081588505358e-06, + "loss": 0.4937, + "step": 2120 + }, + { + "epoch": 1.0028368794326241, + "grad_norm": 2.2832298278808594, + "learning_rate": 4.68878022581912e-06, + "loss": 0.4986, + "step": 2121 + }, + { + "epoch": 1.0033096926713947, + "grad_norm": 2.5525307655334473, + "learning_rate": 4.688478726847136e-06, + "loss": 0.4909, + "step": 2122 + }, + { + "epoch": 1.0037825059101655, + "grad_norm": 2.9843199253082275, + "learning_rate": 4.688177091608176e-06, + "loss": 0.6046, + "step": 2123 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 2.5231106281280518, + "learning_rate": 4.687875320121024e-06, + "loss": 0.5423, + "step": 2124 + }, + { + "epoch": 1.0047281323877069, + "grad_norm": 2.567599058151245, + "learning_rate": 4.68757341240447e-06, + "loss": 0.5092, + "step": 2125 + }, + { + "epoch": 1.0052009456264774, + "grad_norm": 2.768111228942871, + "learning_rate": 4.687271368477311e-06, + "loss": 0.5175, + "step": 2126 + }, + { + "epoch": 1.0056737588652482, + "grad_norm": 2.7223286628723145, + "learning_rate": 4.686969188358355e-06, + "loss": 0.5412, + "step": 2127 + }, + { + "epoch": 1.0061465721040188, + "grad_norm": 2.488299608230591, + "learning_rate": 4.686666872066418e-06, + "loss": 0.5288, + "step": 2128 + }, + { + "epoch": 1.0066193853427896, + "grad_norm": 2.882981777191162, + "learning_rate": 4.6863644196203215e-06, + "loss": 0.6117, + "step": 2129 + }, + { + "epoch": 1.0070921985815602, + "grad_norm": 3.0019447803497314, + "learning_rate": 4.686061831038901e-06, + "loss": 0.5308, + "step": 2130 + }, + { + "epoch": 1.007565011820331, + "grad_norm": 3.0056138038635254, + "learning_rate": 4.685759106340996e-06, + "loss": 0.5833, + "step": 2131 + }, + { + "epoch": 1.0080378250591016, + "grad_norm": 2.5709075927734375, + "learning_rate": 4.685456245545454e-06, + "loss": 0.5071, + "step": 2132 + }, + { + "epoch": 1.0085106382978724, + "grad_norm": 2.4641504287719727, + "learning_rate": 4.685153248671136e-06, + "loss": 0.4813, + "step": 2133 + }, + { + "epoch": 1.008983451536643, + "grad_norm": 2.374413013458252, + "learning_rate": 4.684850115736906e-06, + "loss": 0.5179, + "step": 2134 + }, + { + "epoch": 1.0094562647754137, + "grad_norm": 2.6504571437835693, + "learning_rate": 4.684546846761641e-06, + "loss": 0.437, + "step": 2135 + }, + { + "epoch": 1.0099290780141843, + "grad_norm": 2.5977871417999268, + "learning_rate": 4.684243441764221e-06, + "loss": 0.497, + "step": 2136 + }, + { + "epoch": 1.010401891252955, + "grad_norm": 2.4950785636901855, + "learning_rate": 4.683939900763541e-06, + "loss": 0.5624, + "step": 2137 + }, + { + "epoch": 1.0108747044917257, + 
"grad_norm": 3.065718412399292, + "learning_rate": 4.6836362237785e-06, + "loss": 0.512, + "step": 2138 + }, + { + "epoch": 1.0113475177304965, + "grad_norm": 2.7419207096099854, + "learning_rate": 4.6833324108280045e-06, + "loss": 0.5585, + "step": 2139 + }, + { + "epoch": 1.011820330969267, + "grad_norm": 2.623610496520996, + "learning_rate": 4.6830284619309744e-06, + "loss": 0.5163, + "step": 2140 + }, + { + "epoch": 1.0122931442080378, + "grad_norm": 2.774322986602783, + "learning_rate": 4.682724377106334e-06, + "loss": 0.527, + "step": 2141 + }, + { + "epoch": 1.0127659574468084, + "grad_norm": 2.959935188293457, + "learning_rate": 4.682420156373017e-06, + "loss": 0.6166, + "step": 2142 + }, + { + "epoch": 1.0132387706855792, + "grad_norm": 2.584026336669922, + "learning_rate": 4.682115799749968e-06, + "loss": 0.5086, + "step": 2143 + }, + { + "epoch": 1.0137115839243498, + "grad_norm": 2.6039700508117676, + "learning_rate": 4.6818113072561346e-06, + "loss": 0.49, + "step": 2144 + }, + { + "epoch": 1.0141843971631206, + "grad_norm": 2.466381072998047, + "learning_rate": 4.681506678910479e-06, + "loss": 0.4959, + "step": 2145 + }, + { + "epoch": 1.0146572104018912, + "grad_norm": 2.432636260986328, + "learning_rate": 4.681201914731969e-06, + "loss": 0.5057, + "step": 2146 + }, + { + "epoch": 1.015130023640662, + "grad_norm": 2.6134090423583984, + "learning_rate": 4.680897014739579e-06, + "loss": 0.4874, + "step": 2147 + }, + { + "epoch": 1.0156028368794325, + "grad_norm": 2.774481773376465, + "learning_rate": 4.680591978952295e-06, + "loss": 0.4967, + "step": 2148 + }, + { + "epoch": 1.0160756501182033, + "grad_norm": 2.66050124168396, + "learning_rate": 4.68028680738911e-06, + "loss": 0.4932, + "step": 2149 + }, + { + "epoch": 1.016548463356974, + "grad_norm": 3.020594835281372, + "learning_rate": 4.679981500069026e-06, + "loss": 0.5788, + "step": 2150 + }, + { + "epoch": 1.0170212765957447, + "grad_norm": 2.697758436203003, + "learning_rate": 4.679676057011053e-06, + "loss": 0.5441, + "step": 2151 + }, + { + "epoch": 1.0174940898345153, + "grad_norm": 6.986445903778076, + "learning_rate": 4.679370478234209e-06, + "loss": 0.6483, + "step": 2152 + }, + { + "epoch": 1.017966903073286, + "grad_norm": 2.6637115478515625, + "learning_rate": 4.679064763757522e-06, + "loss": 0.5859, + "step": 2153 + }, + { + "epoch": 1.0184397163120567, + "grad_norm": 2.7501862049102783, + "learning_rate": 4.678758913600027e-06, + "loss": 0.5745, + "step": 2154 + }, + { + "epoch": 1.0189125295508275, + "grad_norm": 2.7959372997283936, + "learning_rate": 4.678452927780768e-06, + "loss": 0.5076, + "step": 2155 + }, + { + "epoch": 1.019385342789598, + "grad_norm": 2.4377388954162598, + "learning_rate": 4.678146806318798e-06, + "loss": 0.5061, + "step": 2156 + }, + { + "epoch": 1.0198581560283688, + "grad_norm": 2.5478947162628174, + "learning_rate": 4.677840549233176e-06, + "loss": 0.4941, + "step": 2157 + }, + { + "epoch": 1.0203309692671394, + "grad_norm": 3.0956528186798096, + "learning_rate": 4.677534156542973e-06, + "loss": 0.5879, + "step": 2158 + }, + { + "epoch": 1.0208037825059102, + "grad_norm": 2.5247607231140137, + "learning_rate": 4.6772276282672666e-06, + "loss": 0.5532, + "step": 2159 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 3.1972787380218506, + "learning_rate": 4.676920964425143e-06, + "loss": 0.6081, + "step": 2160 + }, + { + "epoch": 1.0217494089834516, + "grad_norm": 2.6173388957977295, + "learning_rate": 4.6766141650356955e-06, + "loss": 0.5001, + "step": 2161 + }, + { + 
"epoch": 1.0222222222222221, + "grad_norm": 2.9914398193359375, + "learning_rate": 4.676307230118029e-06, + "loss": 0.5566, + "step": 2162 + }, + { + "epoch": 1.022695035460993, + "grad_norm": 2.8011834621429443, + "learning_rate": 4.676000159691254e-06, + "loss": 0.4909, + "step": 2163 + }, + { + "epoch": 1.0231678486997635, + "grad_norm": 2.6049559116363525, + "learning_rate": 4.67569295377449e-06, + "loss": 0.5018, + "step": 2164 + }, + { + "epoch": 1.0236406619385343, + "grad_norm": 2.8175013065338135, + "learning_rate": 4.675385612386866e-06, + "loss": 0.5309, + "step": 2165 + }, + { + "epoch": 1.0241134751773049, + "grad_norm": 2.854696750640869, + "learning_rate": 4.675078135547519e-06, + "loss": 0.5627, + "step": 2166 + }, + { + "epoch": 1.0245862884160757, + "grad_norm": 3.1856436729431152, + "learning_rate": 4.674770523275594e-06, + "loss": 0.5475, + "step": 2167 + }, + { + "epoch": 1.0250591016548463, + "grad_norm": 2.8289129734039307, + "learning_rate": 4.674462775590244e-06, + "loss": 0.5878, + "step": 2168 + }, + { + "epoch": 1.025531914893617, + "grad_norm": 2.8824517726898193, + "learning_rate": 4.6741548925106325e-06, + "loss": 0.4392, + "step": 2169 + }, + { + "epoch": 1.0260047281323876, + "grad_norm": 2.7044589519500732, + "learning_rate": 4.673846874055928e-06, + "loss": 0.5264, + "step": 2170 + }, + { + "epoch": 1.0264775413711584, + "grad_norm": 2.575035810470581, + "learning_rate": 4.673538720245312e-06, + "loss": 0.4615, + "step": 2171 + }, + { + "epoch": 1.026950354609929, + "grad_norm": 2.48168683052063, + "learning_rate": 4.67323043109797e-06, + "loss": 0.4404, + "step": 2172 + }, + { + "epoch": 1.0274231678486998, + "grad_norm": 2.926593065261841, + "learning_rate": 4.672922006633098e-06, + "loss": 0.54, + "step": 2173 + }, + { + "epoch": 1.0278959810874704, + "grad_norm": 2.4610698223114014, + "learning_rate": 4.672613446869901e-06, + "loss": 0.5555, + "step": 2174 + }, + { + "epoch": 1.0283687943262412, + "grad_norm": 3.026901960372925, + "learning_rate": 4.672304751827592e-06, + "loss": 0.62, + "step": 2175 + }, + { + "epoch": 1.0288416075650118, + "grad_norm": 2.3946213722229004, + "learning_rate": 4.671995921525391e-06, + "loss": 0.5228, + "step": 2176 + }, + { + "epoch": 1.0293144208037825, + "grad_norm": 2.985020399093628, + "learning_rate": 4.671686955982528e-06, + "loss": 0.6256, + "step": 2177 + }, + { + "epoch": 1.0297872340425531, + "grad_norm": 3.0910139083862305, + "learning_rate": 4.671377855218239e-06, + "loss": 0.5893, + "step": 2178 + }, + { + "epoch": 1.030260047281324, + "grad_norm": 2.507805109024048, + "learning_rate": 4.6710686192517744e-06, + "loss": 0.5329, + "step": 2179 + }, + { + "epoch": 1.0307328605200945, + "grad_norm": 2.4514641761779785, + "learning_rate": 4.670759248102386e-06, + "loss": 0.4585, + "step": 2180 + }, + { + "epoch": 1.0312056737588653, + "grad_norm": 2.742838144302368, + "learning_rate": 4.670449741789337e-06, + "loss": 0.6255, + "step": 2181 + }, + { + "epoch": 1.0316784869976359, + "grad_norm": 2.374349594116211, + "learning_rate": 4.670140100331901e-06, + "loss": 0.5049, + "step": 2182 + }, + { + "epoch": 1.0321513002364067, + "grad_norm": 2.78894305229187, + "learning_rate": 4.669830323749356e-06, + "loss": 0.6061, + "step": 2183 + }, + { + "epoch": 1.0326241134751772, + "grad_norm": 2.7195091247558594, + "learning_rate": 4.6695204120609905e-06, + "loss": 0.592, + "step": 2184 + }, + { + "epoch": 1.033096926713948, + "grad_norm": 2.824411630630493, + "learning_rate": 4.6692103652861035e-06, + "loss": 0.5666, + 
"step": 2185 + }, + { + "epoch": 1.0335697399527186, + "grad_norm": 2.4981014728546143, + "learning_rate": 4.6689001834439975e-06, + "loss": 0.5045, + "step": 2186 + }, + { + "epoch": 1.0340425531914894, + "grad_norm": 2.7375214099884033, + "learning_rate": 4.668589866553988e-06, + "loss": 0.5305, + "step": 2187 + }, + { + "epoch": 1.03451536643026, + "grad_norm": 2.625345468521118, + "learning_rate": 4.668279414635396e-06, + "loss": 0.4819, + "step": 2188 + }, + { + "epoch": 1.0349881796690308, + "grad_norm": 2.60479736328125, + "learning_rate": 4.667968827707553e-06, + "loss": 0.55, + "step": 2189 + }, + { + "epoch": 1.0354609929078014, + "grad_norm": 2.642014741897583, + "learning_rate": 4.667658105789797e-06, + "loss": 0.5264, + "step": 2190 + }, + { + "epoch": 1.0359338061465722, + "grad_norm": 2.5439083576202393, + "learning_rate": 4.667347248901476e-06, + "loss": 0.4657, + "step": 2191 + }, + { + "epoch": 1.0364066193853427, + "grad_norm": 2.5537586212158203, + "learning_rate": 4.667036257061945e-06, + "loss": 0.527, + "step": 2192 + }, + { + "epoch": 1.0368794326241135, + "grad_norm": 2.595466375350952, + "learning_rate": 4.666725130290569e-06, + "loss": 0.5336, + "step": 2193 + }, + { + "epoch": 1.037352245862884, + "grad_norm": 3.5106313228607178, + "learning_rate": 4.666413868606719e-06, + "loss": 0.5176, + "step": 2194 + }, + { + "epoch": 1.037825059101655, + "grad_norm": 2.931553363800049, + "learning_rate": 4.666102472029778e-06, + "loss": 0.549, + "step": 2195 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 2.4325125217437744, + "learning_rate": 4.665790940579133e-06, + "loss": 0.5095, + "step": 2196 + }, + { + "epoch": 1.0387706855791963, + "grad_norm": 2.708477258682251, + "learning_rate": 4.665479274274184e-06, + "loss": 0.5264, + "step": 2197 + }, + { + "epoch": 1.0392434988179668, + "grad_norm": 2.905977487564087, + "learning_rate": 4.665167473134335e-06, + "loss": 0.5575, + "step": 2198 + }, + { + "epoch": 1.0397163120567376, + "grad_norm": 2.428938865661621, + "learning_rate": 4.664855537179003e-06, + "loss": 0.5099, + "step": 2199 + }, + { + "epoch": 1.0401891252955082, + "grad_norm": 2.8432137966156006, + "learning_rate": 4.6645434664276075e-06, + "loss": 0.5331, + "step": 2200 + }, + { + "epoch": 1.040661938534279, + "grad_norm": 2.5185136795043945, + "learning_rate": 4.6642312608995825e-06, + "loss": 0.5217, + "step": 2201 + }, + { + "epoch": 1.0411347517730496, + "grad_norm": 2.556607723236084, + "learning_rate": 4.663918920614366e-06, + "loss": 0.4431, + "step": 2202 + }, + { + "epoch": 1.0416075650118204, + "grad_norm": 3.1271166801452637, + "learning_rate": 4.663606445591407e-06, + "loss": 0.5398, + "step": 2203 + }, + { + "epoch": 1.042080378250591, + "grad_norm": 2.573680877685547, + "learning_rate": 4.663293835850162e-06, + "loss": 0.4713, + "step": 2204 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 2.5230324268341064, + "learning_rate": 4.662981091410096e-06, + "loss": 0.5571, + "step": 2205 + }, + { + "epoch": 1.0430260047281323, + "grad_norm": 2.552182912826538, + "learning_rate": 4.662668212290681e-06, + "loss": 0.5173, + "step": 2206 + }, + { + "epoch": 1.0434988179669031, + "grad_norm": 2.832345724105835, + "learning_rate": 4.6623551985113995e-06, + "loss": 0.525, + "step": 2207 + }, + { + "epoch": 1.0439716312056737, + "grad_norm": 2.9729080200195312, + "learning_rate": 4.6620420500917416e-06, + "loss": 0.6308, + "step": 2208 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 2.618187665939331, + "learning_rate": 
4.661728767051206e-06, + "loss": 0.4942, + "step": 2209 + }, + { + "epoch": 1.044917257683215, + "grad_norm": 2.515566349029541, + "learning_rate": 4.661415349409299e-06, + "loss": 0.5229, + "step": 2210 + }, + { + "epoch": 1.0453900709219859, + "grad_norm": 2.8651459217071533, + "learning_rate": 4.6611017971855356e-06, + "loss": 0.5029, + "step": 2211 + }, + { + "epoch": 1.0458628841607565, + "grad_norm": 2.502405881881714, + "learning_rate": 4.660788110399439e-06, + "loss": 0.4732, + "step": 2212 + }, + { + "epoch": 1.0463356973995273, + "grad_norm": 2.540668249130249, + "learning_rate": 4.660474289070541e-06, + "loss": 0.547, + "step": 2213 + }, + { + "epoch": 1.0468085106382978, + "grad_norm": 2.803469181060791, + "learning_rate": 4.660160333218384e-06, + "loss": 0.5441, + "step": 2214 + }, + { + "epoch": 1.0472813238770686, + "grad_norm": 3.233325481414795, + "learning_rate": 4.659846242862514e-06, + "loss": 0.4457, + "step": 2215 + }, + { + "epoch": 1.0477541371158392, + "grad_norm": 2.549548387527466, + "learning_rate": 4.659532018022489e-06, + "loss": 0.5684, + "step": 2216 + }, + { + "epoch": 1.04822695035461, + "grad_norm": 2.6112852096557617, + "learning_rate": 4.659217658717875e-06, + "loss": 0.5323, + "step": 2217 + }, + { + "epoch": 1.0486997635933806, + "grad_norm": 2.347418785095215, + "learning_rate": 4.658903164968245e-06, + "loss": 0.5349, + "step": 2218 + }, + { + "epoch": 1.0491725768321514, + "grad_norm": 2.695502281188965, + "learning_rate": 4.658588536793182e-06, + "loss": 0.4883, + "step": 2219 + }, + { + "epoch": 1.049645390070922, + "grad_norm": 2.7575674057006836, + "learning_rate": 4.658273774212275e-06, + "loss": 0.5517, + "step": 2220 + }, + { + "epoch": 1.0501182033096927, + "grad_norm": 2.787855386734009, + "learning_rate": 4.6579588772451245e-06, + "loss": 0.5744, + "step": 2221 + }, + { + "epoch": 1.0505910165484633, + "grad_norm": 3.0699398517608643, + "learning_rate": 4.657643845911337e-06, + "loss": 0.5258, + "step": 2222 + }, + { + "epoch": 1.0510638297872341, + "grad_norm": 2.652040719985962, + "learning_rate": 4.657328680230527e-06, + "loss": 0.5141, + "step": 2223 + }, + { + "epoch": 1.0515366430260047, + "grad_norm": 2.6896369457244873, + "learning_rate": 4.657013380222322e-06, + "loss": 0.5139, + "step": 2224 + }, + { + "epoch": 1.0520094562647755, + "grad_norm": 2.551839590072632, + "learning_rate": 4.65669794590635e-06, + "loss": 0.5099, + "step": 2225 + }, + { + "epoch": 1.052482269503546, + "grad_norm": 2.8543262481689453, + "learning_rate": 4.656382377302255e-06, + "loss": 0.6085, + "step": 2226 + }, + { + "epoch": 1.0529550827423169, + "grad_norm": 2.871469259262085, + "learning_rate": 4.656066674429685e-06, + "loss": 0.6108, + "step": 2227 + }, + { + "epoch": 1.0534278959810874, + "grad_norm": 2.4840824604034424, + "learning_rate": 4.655750837308296e-06, + "loss": 0.4994, + "step": 2228 + }, + { + "epoch": 1.0539007092198582, + "grad_norm": 2.5203280448913574, + "learning_rate": 4.6554348659577555e-06, + "loss": 0.4928, + "step": 2229 + }, + { + "epoch": 1.0543735224586288, + "grad_norm": 2.9327683448791504, + "learning_rate": 4.655118760397737e-06, + "loss": 0.6324, + "step": 2230 + }, + { + "epoch": 1.0548463356973996, + "grad_norm": 2.6766855716705322, + "learning_rate": 4.654802520647924e-06, + "loss": 0.5178, + "step": 2231 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 2.8438873291015625, + "learning_rate": 4.654486146728006e-06, + "loss": 0.509, + "step": 2232 + }, + { + "epoch": 1.055791962174941, + "grad_norm": 
2.538661241531372, + "learning_rate": 4.6541696386576826e-06, + "loss": 0.5463, + "step": 2233 + }, + { + "epoch": 1.0562647754137116, + "grad_norm": 2.829030990600586, + "learning_rate": 4.653852996456662e-06, + "loss": 0.5404, + "step": 2234 + }, + { + "epoch": 1.0567375886524824, + "grad_norm": 2.5657269954681396, + "learning_rate": 4.653536220144659e-06, + "loss": 0.5479, + "step": 2235 + }, + { + "epoch": 1.057210401891253, + "grad_norm": 2.6641297340393066, + "learning_rate": 4.653219309741399e-06, + "loss": 0.5503, + "step": 2236 + }, + { + "epoch": 1.0576832151300237, + "grad_norm": 2.966350555419922, + "learning_rate": 4.652902265266615e-06, + "loss": 0.6404, + "step": 2237 + }, + { + "epoch": 1.0581560283687943, + "grad_norm": 2.462430000305176, + "learning_rate": 4.6525850867400455e-06, + "loss": 0.4885, + "step": 2238 + }, + { + "epoch": 1.058628841607565, + "grad_norm": 2.1791880130767822, + "learning_rate": 4.652267774181443e-06, + "loss": 0.4405, + "step": 2239 + }, + { + "epoch": 1.0591016548463357, + "grad_norm": 2.5473732948303223, + "learning_rate": 4.651950327610563e-06, + "loss": 0.5295, + "step": 2240 + }, + { + "epoch": 1.0595744680851065, + "grad_norm": 2.70904803276062, + "learning_rate": 4.651632747047172e-06, + "loss": 0.5169, + "step": 2241 + }, + { + "epoch": 1.060047281323877, + "grad_norm": 3.8442928791046143, + "learning_rate": 4.651315032511045e-06, + "loss": 0.5473, + "step": 2242 + }, + { + "epoch": 1.0605200945626478, + "grad_norm": 2.8613383769989014, + "learning_rate": 4.650997184021963e-06, + "loss": 0.5445, + "step": 2243 + }, + { + "epoch": 1.0609929078014184, + "grad_norm": 2.5995829105377197, + "learning_rate": 4.6506792015997184e-06, + "loss": 0.5525, + "step": 2244 + }, + { + "epoch": 1.0614657210401892, + "grad_norm": 2.5465996265411377, + "learning_rate": 4.650361085264111e-06, + "loss": 0.5093, + "step": 2245 + }, + { + "epoch": 1.0619385342789598, + "grad_norm": 2.46553111076355, + "learning_rate": 4.650042835034948e-06, + "loss": 0.5375, + "step": 2246 + }, + { + "epoch": 1.0624113475177306, + "grad_norm": 2.6907830238342285, + "learning_rate": 4.649724450932045e-06, + "loss": 0.572, + "step": 2247 + }, + { + "epoch": 1.0628841607565012, + "grad_norm": 3.0671346187591553, + "learning_rate": 4.649405932975226e-06, + "loss": 0.4974, + "step": 2248 + }, + { + "epoch": 1.063356973995272, + "grad_norm": 2.5392491817474365, + "learning_rate": 4.649087281184325e-06, + "loss": 0.524, + "step": 2249 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 2.7498562335968018, + "learning_rate": 4.648768495579183e-06, + "loss": 0.5801, + "step": 2250 + }, + { + "epoch": 1.0643026004728133, + "grad_norm": 2.8536248207092285, + "learning_rate": 4.648449576179649e-06, + "loss": 0.5384, + "step": 2251 + }, + { + "epoch": 1.064775413711584, + "grad_norm": 2.7062792778015137, + "learning_rate": 4.64813052300558e-06, + "loss": 0.5262, + "step": 2252 + }, + { + "epoch": 1.0652482269503547, + "grad_norm": 2.798650026321411, + "learning_rate": 4.647811336076841e-06, + "loss": 0.5719, + "step": 2253 + }, + { + "epoch": 1.0657210401891253, + "grad_norm": 2.9793951511383057, + "learning_rate": 4.647492015413311e-06, + "loss": 0.5377, + "step": 2254 + }, + { + "epoch": 1.066193853427896, + "grad_norm": 2.572129011154175, + "learning_rate": 4.647172561034868e-06, + "loss": 0.4791, + "step": 2255 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 3.7490930557250977, + "learning_rate": 4.646852972961405e-06, + "loss": 0.5423, + "step": 2256 + }, + { + "epoch": 
1.0671394799054374, + "grad_norm": 2.626255750656128, + "learning_rate": 4.646533251212821e-06, + "loss": 0.5558, + "step": 2257 + }, + { + "epoch": 1.067612293144208, + "grad_norm": 2.8408126831054688, + "learning_rate": 4.646213395809023e-06, + "loss": 0.55, + "step": 2258 + }, + { + "epoch": 1.0680851063829788, + "grad_norm": 3.255606174468994, + "learning_rate": 4.645893406769929e-06, + "loss": 0.547, + "step": 2259 + }, + { + "epoch": 1.0685579196217494, + "grad_norm": 2.4352102279663086, + "learning_rate": 4.645573284115461e-06, + "loss": 0.4898, + "step": 2260 + }, + { + "epoch": 1.0690307328605202, + "grad_norm": 2.408634662628174, + "learning_rate": 4.6452530278655535e-06, + "loss": 0.5264, + "step": 2261 + }, + { + "epoch": 1.0695035460992908, + "grad_norm": 2.4220449924468994, + "learning_rate": 4.644932638040146e-06, + "loss": 0.5166, + "step": 2262 + }, + { + "epoch": 1.0699763593380616, + "grad_norm": 2.9188082218170166, + "learning_rate": 4.644612114659188e-06, + "loss": 0.5611, + "step": 2263 + }, + { + "epoch": 1.0704491725768321, + "grad_norm": 2.906557083129883, + "learning_rate": 4.644291457742638e-06, + "loss": 0.5515, + "step": 2264 + }, + { + "epoch": 1.070921985815603, + "grad_norm": 2.9039015769958496, + "learning_rate": 4.643970667310462e-06, + "loss": 0.5732, + "step": 2265 + }, + { + "epoch": 1.0713947990543735, + "grad_norm": 2.9985480308532715, + "learning_rate": 4.643649743382632e-06, + "loss": 0.563, + "step": 2266 + }, + { + "epoch": 1.0718676122931443, + "grad_norm": 2.5780906677246094, + "learning_rate": 4.6433286859791335e-06, + "loss": 0.502, + "step": 2267 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 2.590209722518921, + "learning_rate": 4.643007495119955e-06, + "loss": 0.4995, + "step": 2268 + }, + { + "epoch": 1.0728132387706855, + "grad_norm": 2.378894805908203, + "learning_rate": 4.642686170825097e-06, + "loss": 0.4886, + "step": 2269 + }, + { + "epoch": 1.0732860520094563, + "grad_norm": 2.6826229095458984, + "learning_rate": 4.642364713114567e-06, + "loss": 0.465, + "step": 2270 + }, + { + "epoch": 1.073758865248227, + "grad_norm": 2.627819538116455, + "learning_rate": 4.64204312200838e-06, + "loss": 0.4954, + "step": 2271 + }, + { + "epoch": 1.0742316784869976, + "grad_norm": 2.993021249771118, + "learning_rate": 4.641721397526561e-06, + "loss": 0.5073, + "step": 2272 + }, + { + "epoch": 1.0747044917257682, + "grad_norm": 2.719052791595459, + "learning_rate": 4.64139953968914e-06, + "loss": 0.538, + "step": 2273 + }, + { + "epoch": 1.075177304964539, + "grad_norm": 2.729252576828003, + "learning_rate": 4.6410775485161605e-06, + "loss": 0.552, + "step": 2274 + }, + { + "epoch": 1.0756501182033098, + "grad_norm": 2.924142599105835, + "learning_rate": 4.640755424027671e-06, + "loss": 0.522, + "step": 2275 + }, + { + "epoch": 1.0761229314420804, + "grad_norm": 3.329162120819092, + "learning_rate": 4.640433166243728e-06, + "loss": 0.5965, + "step": 2276 + }, + { + "epoch": 1.076595744680851, + "grad_norm": 2.9810245037078857, + "learning_rate": 4.640110775184396e-06, + "loss": 0.5653, + "step": 2277 + }, + { + "epoch": 1.0770685579196217, + "grad_norm": 2.61772084236145, + "learning_rate": 4.639788250869751e-06, + "loss": 0.5382, + "step": 2278 + }, + { + "epoch": 1.0775413711583925, + "grad_norm": 2.741225004196167, + "learning_rate": 4.639465593319874e-06, + "loss": 0.4866, + "step": 2279 + }, + { + "epoch": 1.0780141843971631, + "grad_norm": 2.7945218086242676, + "learning_rate": 4.639142802554856e-06, + "loss": 0.4711, + "step": 2280 + 
}, + { + "epoch": 1.0784869976359337, + "grad_norm": 2.4282329082489014, + "learning_rate": 4.638819878594795e-06, + "loss": 0.4911, + "step": 2281 + }, + { + "epoch": 1.0789598108747045, + "grad_norm": 2.551741361618042, + "learning_rate": 4.638496821459799e-06, + "loss": 0.453, + "step": 2282 + }, + { + "epoch": 1.0794326241134753, + "grad_norm": 2.5622754096984863, + "learning_rate": 4.638173631169983e-06, + "loss": 0.5983, + "step": 2283 + }, + { + "epoch": 1.0799054373522459, + "grad_norm": 2.7748284339904785, + "learning_rate": 4.6378503077454715e-06, + "loss": 0.5143, + "step": 2284 + }, + { + "epoch": 1.0803782505910164, + "grad_norm": 2.7693238258361816, + "learning_rate": 4.637526851206394e-06, + "loss": 0.5929, + "step": 2285 + }, + { + "epoch": 1.0808510638297872, + "grad_norm": 2.705548048019409, + "learning_rate": 4.637203261572893e-06, + "loss": 0.5577, + "step": 2286 + }, + { + "epoch": 1.081323877068558, + "grad_norm": 2.739307165145874, + "learning_rate": 4.636879538865117e-06, + "loss": 0.5676, + "step": 2287 + }, + { + "epoch": 1.0817966903073286, + "grad_norm": 2.514059543609619, + "learning_rate": 4.636555683103221e-06, + "loss": 0.5001, + "step": 2288 + }, + { + "epoch": 1.0822695035460992, + "grad_norm": 2.7166874408721924, + "learning_rate": 4.636231694307372e-06, + "loss": 0.5411, + "step": 2289 + }, + { + "epoch": 1.08274231678487, + "grad_norm": 2.7661683559417725, + "learning_rate": 4.635907572497741e-06, + "loss": 0.6353, + "step": 2290 + }, + { + "epoch": 1.0832151300236406, + "grad_norm": 2.598381996154785, + "learning_rate": 4.635583317694512e-06, + "loss": 0.5213, + "step": 2291 + }, + { + "epoch": 1.0836879432624114, + "grad_norm": 2.821491003036499, + "learning_rate": 4.6352589299178744e-06, + "loss": 0.6172, + "step": 2292 + }, + { + "epoch": 1.084160756501182, + "grad_norm": 2.5422823429107666, + "learning_rate": 4.634934409188025e-06, + "loss": 0.5245, + "step": 2293 + }, + { + "epoch": 1.0846335697399527, + "grad_norm": 2.8264620304107666, + "learning_rate": 4.634609755525173e-06, + "loss": 0.5004, + "step": 2294 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 2.3286643028259277, + "learning_rate": 4.63428496894953e-06, + "loss": 0.4561, + "step": 2295 + }, + { + "epoch": 1.085579196217494, + "grad_norm": 2.462005376815796, + "learning_rate": 4.633960049481321e-06, + "loss": 0.4948, + "step": 2296 + }, + { + "epoch": 1.0860520094562647, + "grad_norm": 2.760258913040161, + "learning_rate": 4.633634997140777e-06, + "loss": 0.5407, + "step": 2297 + }, + { + "epoch": 1.0865248226950355, + "grad_norm": 3.0234217643737793, + "learning_rate": 4.633309811948138e-06, + "loss": 0.4914, + "step": 2298 + }, + { + "epoch": 1.086997635933806, + "grad_norm": 2.8380849361419678, + "learning_rate": 4.63298449392365e-06, + "loss": 0.5562, + "step": 2299 + }, + { + "epoch": 1.0874704491725768, + "grad_norm": 2.6201648712158203, + "learning_rate": 4.632659043087572e-06, + "loss": 0.5882, + "step": 2300 + }, + { + "epoch": 1.0879432624113474, + "grad_norm": 2.586339235305786, + "learning_rate": 4.632333459460165e-06, + "loss": 0.4991, + "step": 2301 + }, + { + "epoch": 1.0884160756501182, + "grad_norm": 2.500115394592285, + "learning_rate": 4.632007743061705e-06, + "loss": 0.552, + "step": 2302 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 2.816390037536621, + "learning_rate": 4.63168189391247e-06, + "loss": 0.5301, + "step": 2303 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 2.975400924682617, + "learning_rate": 4.631355912032753e-06, + "loss": 
0.6056, + "step": 2304 + }, + { + "epoch": 1.0898345153664302, + "grad_norm": 2.747985363006592, + "learning_rate": 4.631029797442846e-06, + "loss": 0.5335, + "step": 2305 + }, + { + "epoch": 1.090307328605201, + "grad_norm": 2.609281539916992, + "learning_rate": 4.630703550163059e-06, + "loss": 0.5189, + "step": 2306 + }, + { + "epoch": 1.0907801418439715, + "grad_norm": 2.624131202697754, + "learning_rate": 4.630377170213705e-06, + "loss": 0.5646, + "step": 2307 + }, + { + "epoch": 1.0912529550827423, + "grad_norm": 2.6186959743499756, + "learning_rate": 4.630050657615107e-06, + "loss": 0.5187, + "step": 2308 + }, + { + "epoch": 1.091725768321513, + "grad_norm": 2.9961764812469482, + "learning_rate": 4.629724012387594e-06, + "loss": 0.6207, + "step": 2309 + }, + { + "epoch": 1.0921985815602837, + "grad_norm": 2.665799140930176, + "learning_rate": 4.629397234551505e-06, + "loss": 0.5046, + "step": 2310 + }, + { + "epoch": 1.0926713947990543, + "grad_norm": 2.6154725551605225, + "learning_rate": 4.629070324127187e-06, + "loss": 0.5553, + "step": 2311 + }, + { + "epoch": 1.093144208037825, + "grad_norm": 2.702967643737793, + "learning_rate": 4.628743281134996e-06, + "loss": 0.5159, + "step": 2312 + }, + { + "epoch": 1.0936170212765957, + "grad_norm": 2.578080177307129, + "learning_rate": 4.628416105595295e-06, + "loss": 0.4934, + "step": 2313 + }, + { + "epoch": 1.0940898345153665, + "grad_norm": 2.8763060569763184, + "learning_rate": 4.628088797528456e-06, + "loss": 0.5404, + "step": 2314 + }, + { + "epoch": 1.094562647754137, + "grad_norm": 2.5301198959350586, + "learning_rate": 4.6277613569548585e-06, + "loss": 0.524, + "step": 2315 + }, + { + "epoch": 1.0950354609929078, + "grad_norm": 2.559903144836426, + "learning_rate": 4.627433783894892e-06, + "loss": 0.5177, + "step": 2316 + }, + { + "epoch": 1.0955082742316784, + "grad_norm": 2.430863380432129, + "learning_rate": 4.627106078368952e-06, + "loss": 0.5368, + "step": 2317 + }, + { + "epoch": 1.0959810874704492, + "grad_norm": 2.687567949295044, + "learning_rate": 4.626778240397444e-06, + "loss": 0.5385, + "step": 2318 + }, + { + "epoch": 1.0964539007092198, + "grad_norm": 3.053466558456421, + "learning_rate": 4.62645027000078e-06, + "loss": 0.5814, + "step": 2319 + }, + { + "epoch": 1.0969267139479906, + "grad_norm": 2.4612979888916016, + "learning_rate": 4.6261221671993815e-06, + "loss": 0.5069, + "step": 2320 + }, + { + "epoch": 1.0973995271867611, + "grad_norm": 2.6153628826141357, + "learning_rate": 4.625793932013679e-06, + "loss": 0.5422, + "step": 2321 + }, + { + "epoch": 1.097872340425532, + "grad_norm": 2.8918874263763428, + "learning_rate": 4.62546556446411e-06, + "loss": 0.5326, + "step": 2322 + }, + { + "epoch": 1.0983451536643025, + "grad_norm": 3.62565279006958, + "learning_rate": 4.625137064571119e-06, + "loss": 0.5164, + "step": 2323 + }, + { + "epoch": 1.0988179669030733, + "grad_norm": 2.4285085201263428, + "learning_rate": 4.624808432355164e-06, + "loss": 0.5084, + "step": 2324 + }, + { + "epoch": 1.099290780141844, + "grad_norm": 2.593979835510254, + "learning_rate": 4.624479667836702e-06, + "loss": 0.4986, + "step": 2325 + }, + { + "epoch": 1.0997635933806147, + "grad_norm": 2.490752935409546, + "learning_rate": 4.624150771036208e-06, + "loss": 0.5296, + "step": 2326 + }, + { + "epoch": 1.1002364066193853, + "grad_norm": 2.67694091796875, + "learning_rate": 4.6238217419741595e-06, + "loss": 0.5229, + "step": 2327 + }, + { + "epoch": 1.100709219858156, + "grad_norm": 2.594147205352783, + "learning_rate": 
4.623492580671044e-06, + "loss": 0.4916, + "step": 2328 + }, + { + "epoch": 1.1011820330969266, + "grad_norm": 2.943472385406494, + "learning_rate": 4.623163287147356e-06, + "loss": 0.5591, + "step": 2329 + }, + { + "epoch": 1.1016548463356974, + "grad_norm": 2.569410562515259, + "learning_rate": 4.622833861423601e-06, + "loss": 0.4648, + "step": 2330 + }, + { + "epoch": 1.102127659574468, + "grad_norm": 2.5490405559539795, + "learning_rate": 4.6225043035202886e-06, + "loss": 0.5493, + "step": 2331 + }, + { + "epoch": 1.1026004728132388, + "grad_norm": 2.5964598655700684, + "learning_rate": 4.622174613457941e-06, + "loss": 0.5358, + "step": 2332 + }, + { + "epoch": 1.1030732860520094, + "grad_norm": 2.6456820964813232, + "learning_rate": 4.621844791257085e-06, + "loss": 0.5864, + "step": 2333 + }, + { + "epoch": 1.1035460992907802, + "grad_norm": 2.861180067062378, + "learning_rate": 4.621514836938259e-06, + "loss": 0.6064, + "step": 2334 + }, + { + "epoch": 1.1040189125295508, + "grad_norm": 2.8199548721313477, + "learning_rate": 4.621184750522005e-06, + "loss": 0.5244, + "step": 2335 + }, + { + "epoch": 1.1044917257683216, + "grad_norm": 2.7398853302001953, + "learning_rate": 4.6208545320288795e-06, + "loss": 0.5496, + "step": 2336 + }, + { + "epoch": 1.1049645390070921, + "grad_norm": 2.7941031455993652, + "learning_rate": 4.620524181479441e-06, + "loss": 0.5496, + "step": 2337 + }, + { + "epoch": 1.105437352245863, + "grad_norm": 2.973785161972046, + "learning_rate": 4.620193698894259e-06, + "loss": 0.5492, + "step": 2338 + }, + { + "epoch": 1.1059101654846335, + "grad_norm": 2.650355815887451, + "learning_rate": 4.6198630842939144e-06, + "loss": 0.5392, + "step": 2339 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 2.9092214107513428, + "learning_rate": 4.61953233769899e-06, + "loss": 0.5305, + "step": 2340 + }, + { + "epoch": 1.1068557919621749, + "grad_norm": 2.6329731941223145, + "learning_rate": 4.61920145913008e-06, + "loss": 0.5031, + "step": 2341 + }, + { + "epoch": 1.1073286052009457, + "grad_norm": 2.7214207649230957, + "learning_rate": 4.618870448607788e-06, + "loss": 0.5536, + "step": 2342 + }, + { + "epoch": 1.1078014184397162, + "grad_norm": 2.873119592666626, + "learning_rate": 4.618539306152724e-06, + "loss": 0.4531, + "step": 2343 + }, + { + "epoch": 1.108274231678487, + "grad_norm": 2.701042413711548, + "learning_rate": 4.618208031785507e-06, + "loss": 0.5217, + "step": 2344 + }, + { + "epoch": 1.1087470449172576, + "grad_norm": 2.7189881801605225, + "learning_rate": 4.6178766255267635e-06, + "loss": 0.6205, + "step": 2345 + }, + { + "epoch": 1.1092198581560284, + "grad_norm": 2.546382188796997, + "learning_rate": 4.61754508739713e-06, + "loss": 0.5475, + "step": 2346 + }, + { + "epoch": 1.109692671394799, + "grad_norm": 2.8429276943206787, + "learning_rate": 4.617213417417249e-06, + "loss": 0.4809, + "step": 2347 + }, + { + "epoch": 1.1101654846335698, + "grad_norm": 2.9515812397003174, + "learning_rate": 4.616881615607772e-06, + "loss": 0.5067, + "step": 2348 + }, + { + "epoch": 1.1106382978723404, + "grad_norm": 2.5910723209381104, + "learning_rate": 4.616549681989358e-06, + "loss": 0.5368, + "step": 2349 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 2.80855655670166, + "learning_rate": 4.616217616582678e-06, + "loss": 0.5827, + "step": 2350 + }, + { + "epoch": 1.1115839243498817, + "grad_norm": 2.604383945465088, + "learning_rate": 4.6158854194084044e-06, + "loss": 0.5716, + "step": 2351 + }, + { + "epoch": 1.1120567375886525, + "grad_norm": 
3.0585904121398926, + "learning_rate": 4.6155530904872246e-06, + "loss": 0.4998, + "step": 2352 + }, + { + "epoch": 1.112529550827423, + "grad_norm": 2.660961627960205, + "learning_rate": 4.61522062983983e-06, + "loss": 0.4533, + "step": 2353 + }, + { + "epoch": 1.113002364066194, + "grad_norm": 2.8042070865631104, + "learning_rate": 4.614888037486923e-06, + "loss": 0.5592, + "step": 2354 + }, + { + "epoch": 1.1134751773049645, + "grad_norm": 2.681664228439331, + "learning_rate": 4.61455531344921e-06, + "loss": 0.5439, + "step": 2355 + }, + { + "epoch": 1.1139479905437353, + "grad_norm": 2.905054807662964, + "learning_rate": 4.61422245774741e-06, + "loss": 0.5497, + "step": 2356 + }, + { + "epoch": 1.1144208037825059, + "grad_norm": 2.7979753017425537, + "learning_rate": 4.6138894704022484e-06, + "loss": 0.5374, + "step": 2357 + }, + { + "epoch": 1.1148936170212767, + "grad_norm": 2.965611696243286, + "learning_rate": 4.613556351434458e-06, + "loss": 0.5145, + "step": 2358 + }, + { + "epoch": 1.1153664302600472, + "grad_norm": 2.583134889602661, + "learning_rate": 4.613223100864782e-06, + "loss": 0.535, + "step": 2359 + }, + { + "epoch": 1.115839243498818, + "grad_norm": 2.5979621410369873, + "learning_rate": 4.61288971871397e-06, + "loss": 0.5514, + "step": 2360 + }, + { + "epoch": 1.1163120567375886, + "grad_norm": 3.0117669105529785, + "learning_rate": 4.612556205002779e-06, + "loss": 0.5266, + "step": 2361 + }, + { + "epoch": 1.1167848699763594, + "grad_norm": 2.425133466720581, + "learning_rate": 4.612222559751976e-06, + "loss": 0.4838, + "step": 2362 + }, + { + "epoch": 1.11725768321513, + "grad_norm": 2.5102691650390625, + "learning_rate": 4.611888782982337e-06, + "loss": 0.3947, + "step": 2363 + }, + { + "epoch": 1.1177304964539008, + "grad_norm": 3.0327367782592773, + "learning_rate": 4.611554874714645e-06, + "loss": 0.5753, + "step": 2364 + }, + { + "epoch": 1.1182033096926713, + "grad_norm": 2.4561009407043457, + "learning_rate": 4.6112208349696875e-06, + "loss": 0.5054, + "step": 2365 + }, + { + "epoch": 1.1186761229314421, + "grad_norm": 3.3898050785064697, + "learning_rate": 4.610886663768267e-06, + "loss": 0.5946, + "step": 2366 + }, + { + "epoch": 1.1191489361702127, + "grad_norm": 2.8112242221832275, + "learning_rate": 4.61055236113119e-06, + "loss": 0.5475, + "step": 2367 + }, + { + "epoch": 1.1196217494089835, + "grad_norm": 3.152946710586548, + "learning_rate": 4.610217927079272e-06, + "loss": 0.5165, + "step": 2368 + }, + { + "epoch": 1.120094562647754, + "grad_norm": 2.7847867012023926, + "learning_rate": 4.609883361633336e-06, + "loss": 0.5533, + "step": 2369 + }, + { + "epoch": 1.1205673758865249, + "grad_norm": 2.6376686096191406, + "learning_rate": 4.6095486648142155e-06, + "loss": 0.4942, + "step": 2370 + }, + { + "epoch": 1.1210401891252955, + "grad_norm": 3.123072862625122, + "learning_rate": 4.609213836642749e-06, + "loss": 0.616, + "step": 2371 + }, + { + "epoch": 1.1215130023640663, + "grad_norm": 2.802694320678711, + "learning_rate": 4.608878877139786e-06, + "loss": 0.5323, + "step": 2372 + }, + { + "epoch": 1.1219858156028368, + "grad_norm": 2.3567938804626465, + "learning_rate": 4.6085437863261825e-06, + "loss": 0.4822, + "step": 2373 + }, + { + "epoch": 1.1224586288416076, + "grad_norm": 2.553112030029297, + "learning_rate": 4.608208564222804e-06, + "loss": 0.5447, + "step": 2374 + }, + { + "epoch": 1.1229314420803782, + "grad_norm": 3.0020132064819336, + "learning_rate": 4.607873210850521e-06, + "loss": 0.6486, + "step": 2375 + }, + { + "epoch": 
1.123404255319149, + "grad_norm": 2.832442045211792, + "learning_rate": 4.607537726230216e-06, + "loss": 0.5257, + "step": 2376 + }, + { + "epoch": 1.1238770685579196, + "grad_norm": 2.471527099609375, + "learning_rate": 4.607202110382778e-06, + "loss": 0.4816, + "step": 2377 + }, + { + "epoch": 1.1243498817966904, + "grad_norm": 2.4232118129730225, + "learning_rate": 4.606866363329105e-06, + "loss": 0.5533, + "step": 2378 + }, + { + "epoch": 1.124822695035461, + "grad_norm": 2.477506637573242, + "learning_rate": 4.6065304850901025e-06, + "loss": 0.5223, + "step": 2379 + }, + { + "epoch": 1.1252955082742317, + "grad_norm": 3.54127836227417, + "learning_rate": 4.6061944756866824e-06, + "loss": 0.6514, + "step": 2380 + }, + { + "epoch": 1.1257683215130023, + "grad_norm": 2.5148677825927734, + "learning_rate": 4.605858335139768e-06, + "loss": 0.4864, + "step": 2381 + }, + { + "epoch": 1.1262411347517731, + "grad_norm": 2.8363659381866455, + "learning_rate": 4.605522063470289e-06, + "loss": 0.5034, + "step": 2382 + }, + { + "epoch": 1.1267139479905437, + "grad_norm": 2.4996654987335205, + "learning_rate": 4.605185660699184e-06, + "loss": 0.4126, + "step": 2383 + }, + { + "epoch": 1.1271867612293145, + "grad_norm": 2.352543830871582, + "learning_rate": 4.604849126847398e-06, + "loss": 0.5224, + "step": 2384 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 2.60101056098938, + "learning_rate": 4.6045124619358875e-06, + "loss": 0.4867, + "step": 2385 + }, + { + "epoch": 1.1281323877068559, + "grad_norm": 2.9471068382263184, + "learning_rate": 4.604175665985613e-06, + "loss": 0.6474, + "step": 2386 + }, + { + "epoch": 1.1286052009456264, + "grad_norm": 2.5933351516723633, + "learning_rate": 4.603838739017546e-06, + "loss": 0.5081, + "step": 2387 + }, + { + "epoch": 1.1290780141843972, + "grad_norm": 2.3740346431732178, + "learning_rate": 4.6035016810526665e-06, + "loss": 0.4438, + "step": 2388 + }, + { + "epoch": 1.1295508274231678, + "grad_norm": 2.675020217895508, + "learning_rate": 4.6031644921119614e-06, + "loss": 0.4968, + "step": 2389 + }, + { + "epoch": 1.1300236406619386, + "grad_norm": 2.599472999572754, + "learning_rate": 4.602827172216424e-06, + "loss": 0.5131, + "step": 2390 + }, + { + "epoch": 1.1304964539007092, + "grad_norm": 2.8176097869873047, + "learning_rate": 4.602489721387061e-06, + "loss": 0.5549, + "step": 2391 + }, + { + "epoch": 1.13096926713948, + "grad_norm": 2.466914176940918, + "learning_rate": 4.602152139644881e-06, + "loss": 0.5052, + "step": 2392 + }, + { + "epoch": 1.1314420803782506, + "grad_norm": 2.8938796520233154, + "learning_rate": 4.601814427010905e-06, + "loss": 0.6181, + "step": 2393 + }, + { + "epoch": 1.1319148936170214, + "grad_norm": 2.7390825748443604, + "learning_rate": 4.601476583506161e-06, + "loss": 0.5178, + "step": 2394 + }, + { + "epoch": 1.132387706855792, + "grad_norm": 3.180112838745117, + "learning_rate": 4.601138609151685e-06, + "loss": 0.6071, + "step": 2395 + }, + { + "epoch": 1.1328605200945627, + "grad_norm": 2.9282350540161133, + "learning_rate": 4.600800503968521e-06, + "loss": 0.5557, + "step": 2396 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 2.6689717769622803, + "learning_rate": 4.6004622679777215e-06, + "loss": 0.4679, + "step": 2397 + }, + { + "epoch": 1.133806146572104, + "grad_norm": 2.651582956314087, + "learning_rate": 4.600123901200347e-06, + "loss": 0.4907, + "step": 2398 + }, + { + "epoch": 1.1342789598108747, + "grad_norm": 2.5702924728393555, + "learning_rate": 4.599785403657464e-06, + "loss": 0.4919, + 
"step": 2399 + }, + { + "epoch": 1.1347517730496455, + "grad_norm": 2.636812448501587, + "learning_rate": 4.599446775370153e-06, + "loss": 0.5091, + "step": 2400 + }, + { + "epoch": 1.135224586288416, + "grad_norm": 2.5965442657470703, + "learning_rate": 4.599108016359497e-06, + "loss": 0.5035, + "step": 2401 + }, + { + "epoch": 1.1356973995271868, + "grad_norm": 2.689732313156128, + "learning_rate": 4.5987691266465885e-06, + "loss": 0.5307, + "step": 2402 + }, + { + "epoch": 1.1361702127659574, + "grad_norm": 2.7256956100463867, + "learning_rate": 4.59843010625253e-06, + "loss": 0.5066, + "step": 2403 + }, + { + "epoch": 1.1366430260047282, + "grad_norm": 2.726020574569702, + "learning_rate": 4.59809095519843e-06, + "loss": 0.4805, + "step": 2404 + }, + { + "epoch": 1.1371158392434988, + "grad_norm": 2.703339099884033, + "learning_rate": 4.597751673505406e-06, + "loss": 0.4992, + "step": 2405 + }, + { + "epoch": 1.1375886524822696, + "grad_norm": 2.54455304145813, + "learning_rate": 4.5974122611945835e-06, + "loss": 0.5251, + "step": 2406 + }, + { + "epoch": 1.1380614657210402, + "grad_norm": 2.623507022857666, + "learning_rate": 4.597072718287096e-06, + "loss": 0.4831, + "step": 2407 + }, + { + "epoch": 1.138534278959811, + "grad_norm": 2.653590202331543, + "learning_rate": 4.596733044804086e-06, + "loss": 0.5646, + "step": 2408 + }, + { + "epoch": 1.1390070921985815, + "grad_norm": 2.8230600357055664, + "learning_rate": 4.5963932407667035e-06, + "loss": 0.514, + "step": 2409 + }, + { + "epoch": 1.1394799054373523, + "grad_norm": 2.6077451705932617, + "learning_rate": 4.5960533061961065e-06, + "loss": 0.4713, + "step": 2410 + }, + { + "epoch": 1.139952718676123, + "grad_norm": 2.3945798873901367, + "learning_rate": 4.595713241113461e-06, + "loss": 0.466, + "step": 2411 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 2.8100006580352783, + "learning_rate": 4.595373045539941e-06, + "loss": 0.5365, + "step": 2412 + }, + { + "epoch": 1.1408983451536643, + "grad_norm": 2.6825881004333496, + "learning_rate": 4.59503271949673e-06, + "loss": 0.4457, + "step": 2413 + }, + { + "epoch": 1.141371158392435, + "grad_norm": 2.969435691833496, + "learning_rate": 4.594692263005016e-06, + "loss": 0.5459, + "step": 2414 + }, + { + "epoch": 1.1418439716312057, + "grad_norm": 2.4103164672851562, + "learning_rate": 4.594351676086002e-06, + "loss": 0.4573, + "step": 2415 + }, + { + "epoch": 1.1423167848699765, + "grad_norm": 2.9450128078460693, + "learning_rate": 4.594010958760892e-06, + "loss": 0.5529, + "step": 2416 + }, + { + "epoch": 1.142789598108747, + "grad_norm": 2.6416335105895996, + "learning_rate": 4.593670111050901e-06, + "loss": 0.5153, + "step": 2417 + }, + { + "epoch": 1.1432624113475178, + "grad_norm": 2.473177194595337, + "learning_rate": 4.593329132977253e-06, + "loss": 0.4962, + "step": 2418 + }, + { + "epoch": 1.1437352245862884, + "grad_norm": 2.4494502544403076, + "learning_rate": 4.592988024561179e-06, + "loss": 0.5182, + "step": 2419 + }, + { + "epoch": 1.1442080378250592, + "grad_norm": 2.773930311203003, + "learning_rate": 4.592646785823918e-06, + "loss": 0.4442, + "step": 2420 + }, + { + "epoch": 1.1446808510638298, + "grad_norm": 2.4733314514160156, + "learning_rate": 4.592305416786718e-06, + "loss": 0.5106, + "step": 2421 + }, + { + "epoch": 1.1451536643026006, + "grad_norm": 2.6870038509368896, + "learning_rate": 4.591963917470834e-06, + "loss": 0.5316, + "step": 2422 + }, + { + "epoch": 1.1456264775413711, + "grad_norm": 2.8989531993865967, + "learning_rate": 
4.591622287897529e-06, + "loss": 0.5906, + "step": 2423 + }, + { + "epoch": 1.1460992907801417, + "grad_norm": 2.6349124908447266, + "learning_rate": 4.591280528088077e-06, + "loss": 0.6225, + "step": 2424 + }, + { + "epoch": 1.1465721040189125, + "grad_norm": 3.19022274017334, + "learning_rate": 4.5909386380637555e-06, + "loss": 0.555, + "step": 2425 + }, + { + "epoch": 1.1470449172576833, + "grad_norm": 3.1473541259765625, + "learning_rate": 4.5905966178458535e-06, + "loss": 0.537, + "step": 2426 + }, + { + "epoch": 1.147517730496454, + "grad_norm": 2.6996145248413086, + "learning_rate": 4.590254467455667e-06, + "loss": 0.565, + "step": 2427 + }, + { + "epoch": 1.1479905437352245, + "grad_norm": 2.830188274383545, + "learning_rate": 4.5899121869145015e-06, + "loss": 0.6773, + "step": 2428 + }, + { + "epoch": 1.1484633569739953, + "grad_norm": 2.4937260150909424, + "learning_rate": 4.589569776243667e-06, + "loss": 0.5484, + "step": 2429 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 2.54011869430542, + "learning_rate": 4.589227235464486e-06, + "loss": 0.5307, + "step": 2430 + }, + { + "epoch": 1.1494089834515366, + "grad_norm": 2.8764214515686035, + "learning_rate": 4.5888845645982845e-06, + "loss": 0.5296, + "step": 2431 + }, + { + "epoch": 1.1498817966903072, + "grad_norm": 2.637033462524414, + "learning_rate": 4.588541763666402e-06, + "loss": 0.5975, + "step": 2432 + }, + { + "epoch": 1.150354609929078, + "grad_norm": 2.8534255027770996, + "learning_rate": 4.5881988326901815e-06, + "loss": 0.5431, + "step": 2433 + }, + { + "epoch": 1.1508274231678488, + "grad_norm": 2.8546559810638428, + "learning_rate": 4.587855771690976e-06, + "loss": 0.469, + "step": 2434 + }, + { + "epoch": 1.1513002364066194, + "grad_norm": 2.9084973335266113, + "learning_rate": 4.587512580690146e-06, + "loss": 0.5566, + "step": 2435 + }, + { + "epoch": 1.15177304964539, + "grad_norm": 3.0993130207061768, + "learning_rate": 4.587169259709063e-06, + "loss": 0.5612, + "step": 2436 + }, + { + "epoch": 1.1522458628841608, + "grad_norm": 10.847400665283203, + "learning_rate": 4.5868258087691e-06, + "loss": 0.4678, + "step": 2437 + }, + { + "epoch": 1.1527186761229316, + "grad_norm": 2.6648571491241455, + "learning_rate": 4.586482227891645e-06, + "loss": 0.5951, + "step": 2438 + }, + { + "epoch": 1.1531914893617021, + "grad_norm": 2.529043197631836, + "learning_rate": 4.586138517098091e-06, + "loss": 0.5048, + "step": 2439 + }, + { + "epoch": 1.1536643026004727, + "grad_norm": 2.833904504776001, + "learning_rate": 4.585794676409839e-06, + "loss": 0.536, + "step": 2440 + }, + { + "epoch": 1.1541371158392435, + "grad_norm": 3.507657766342163, + "learning_rate": 4.585450705848298e-06, + "loss": 0.5954, + "step": 2441 + }, + { + "epoch": 1.1546099290780143, + "grad_norm": 2.6108388900756836, + "learning_rate": 4.585106605434887e-06, + "loss": 0.5684, + "step": 2442 + }, + { + "epoch": 1.1550827423167849, + "grad_norm": 2.490708589553833, + "learning_rate": 4.58476237519103e-06, + "loss": 0.4678, + "step": 2443 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 2.8192343711853027, + "learning_rate": 4.584418015138161e-06, + "loss": 0.5291, + "step": 2444 + }, + { + "epoch": 1.1560283687943262, + "grad_norm": 3.0878679752349854, + "learning_rate": 4.584073525297722e-06, + "loss": 0.5691, + "step": 2445 + }, + { + "epoch": 1.156501182033097, + "grad_norm": 3.1444318294525146, + "learning_rate": 4.583728905691163e-06, + "loss": 0.5643, + "step": 2446 + }, + { + "epoch": 1.1569739952718676, + "grad_norm": 
3.02382230758667, + "learning_rate": 4.583384156339942e-06, + "loss": 0.6008, + "step": 2447 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 2.5942490100860596, + "learning_rate": 4.583039277265525e-06, + "loss": 0.5105, + "step": 2448 + }, + { + "epoch": 1.157919621749409, + "grad_norm": 2.938608407974243, + "learning_rate": 4.582694268489386e-06, + "loss": 0.5123, + "step": 2449 + }, + { + "epoch": 1.1583924349881798, + "grad_norm": 2.4622268676757812, + "learning_rate": 4.5823491300330075e-06, + "loss": 0.4538, + "step": 2450 + }, + { + "epoch": 1.1588652482269504, + "grad_norm": 2.4380505084991455, + "learning_rate": 4.5820038619178795e-06, + "loss": 0.4682, + "step": 2451 + }, + { + "epoch": 1.159338061465721, + "grad_norm": 2.479896068572998, + "learning_rate": 4.581658464165501e-06, + "loss": 0.4877, + "step": 2452 + }, + { + "epoch": 1.1598108747044917, + "grad_norm": 2.3373546600341797, + "learning_rate": 4.5813129367973765e-06, + "loss": 0.445, + "step": 2453 + }, + { + "epoch": 1.1602836879432625, + "grad_norm": 2.8586013317108154, + "learning_rate": 4.5809672798350214e-06, + "loss": 0.5232, + "step": 2454 + }, + { + "epoch": 1.160756501182033, + "grad_norm": 3.2302439212799072, + "learning_rate": 4.5806214932999595e-06, + "loss": 0.5336, + "step": 2455 + }, + { + "epoch": 1.1612293144208037, + "grad_norm": 3.1005783081054688, + "learning_rate": 4.580275577213721e-06, + "loss": 0.5123, + "step": 2456 + }, + { + "epoch": 1.1617021276595745, + "grad_norm": 2.7131073474884033, + "learning_rate": 4.579929531597842e-06, + "loss": 0.5648, + "step": 2457 + }, + { + "epoch": 1.1621749408983453, + "grad_norm": 2.5067050457000732, + "learning_rate": 4.579583356473874e-06, + "loss": 0.5324, + "step": 2458 + }, + { + "epoch": 1.1626477541371159, + "grad_norm": 2.7870543003082275, + "learning_rate": 4.579237051863366e-06, + "loss": 0.5094, + "step": 2459 + }, + { + "epoch": 1.1631205673758864, + "grad_norm": 2.739196300506592, + "learning_rate": 4.578890617787887e-06, + "loss": 0.5103, + "step": 2460 + }, + { + "epoch": 1.1635933806146572, + "grad_norm": 2.7108185291290283, + "learning_rate": 4.578544054269003e-06, + "loss": 0.533, + "step": 2461 + }, + { + "epoch": 1.1640661938534278, + "grad_norm": 3.028005361557007, + "learning_rate": 4.578197361328295e-06, + "loss": 0.636, + "step": 2462 + }, + { + "epoch": 1.1645390070921986, + "grad_norm": 2.4855129718780518, + "learning_rate": 4.5778505389873505e-06, + "loss": 0.501, + "step": 2463 + }, + { + "epoch": 1.1650118203309692, + "grad_norm": 2.6314198970794678, + "learning_rate": 4.577503587267764e-06, + "loss": 0.5812, + "step": 2464 + }, + { + "epoch": 1.16548463356974, + "grad_norm": 2.4209671020507812, + "learning_rate": 4.5771565061911385e-06, + "loss": 0.5168, + "step": 2465 + }, + { + "epoch": 1.1659574468085105, + "grad_norm": 2.526388645172119, + "learning_rate": 4.576809295779085e-06, + "loss": 0.5047, + "step": 2466 + }, + { + "epoch": 1.1664302600472813, + "grad_norm": 2.8278191089630127, + "learning_rate": 4.576461956053224e-06, + "loss": 0.4759, + "step": 2467 + }, + { + "epoch": 1.166903073286052, + "grad_norm": 2.7862167358398438, + "learning_rate": 4.576114487035182e-06, + "loss": 0.5492, + "step": 2468 + }, + { + "epoch": 1.1673758865248227, + "grad_norm": 2.6303019523620605, + "learning_rate": 4.575766888746594e-06, + "loss": 0.5538, + "step": 2469 + }, + { + "epoch": 1.1678486997635933, + "grad_norm": 2.613104820251465, + "learning_rate": 4.5754191612091034e-06, + "loss": 0.5114, + "step": 2470 + }, + { + "epoch": 
1.168321513002364, + "grad_norm": 2.653958320617676, + "learning_rate": 4.5750713044443625e-06, + "loss": 0.5858, + "step": 2471 + }, + { + "epoch": 1.1687943262411347, + "grad_norm": 3.1143975257873535, + "learning_rate": 4.574723318474031e-06, + "loss": 0.5193, + "step": 2472 + }, + { + "epoch": 1.1692671394799055, + "grad_norm": 3.05454421043396, + "learning_rate": 4.574375203319775e-06, + "loss": 0.464, + "step": 2473 + }, + { + "epoch": 1.169739952718676, + "grad_norm": 2.66626238822937, + "learning_rate": 4.574026959003272e-06, + "loss": 0.4988, + "step": 2474 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 2.8871963024139404, + "learning_rate": 4.573678585546203e-06, + "loss": 0.5557, + "step": 2475 + }, + { + "epoch": 1.1706855791962174, + "grad_norm": 2.592949628829956, + "learning_rate": 4.573330082970262e-06, + "loss": 0.5178, + "step": 2476 + }, + { + "epoch": 1.1711583924349882, + "grad_norm": 2.9111456871032715, + "learning_rate": 4.572981451297148e-06, + "loss": 0.5712, + "step": 2477 + }, + { + "epoch": 1.1716312056737588, + "grad_norm": 2.8152248859405518, + "learning_rate": 4.57263269054857e-06, + "loss": 0.5548, + "step": 2478 + }, + { + "epoch": 1.1721040189125296, + "grad_norm": 3.0292418003082275, + "learning_rate": 4.572283800746241e-06, + "loss": 0.5937, + "step": 2479 + }, + { + "epoch": 1.1725768321513002, + "grad_norm": 3.454618215560913, + "learning_rate": 4.571934781911886e-06, + "loss": 0.5537, + "step": 2480 + }, + { + "epoch": 1.173049645390071, + "grad_norm": 2.7817866802215576, + "learning_rate": 4.571585634067239e-06, + "loss": 0.5649, + "step": 2481 + }, + { + "epoch": 1.1735224586288415, + "grad_norm": 2.7989349365234375, + "learning_rate": 4.571236357234037e-06, + "loss": 0.5448, + "step": 2482 + }, + { + "epoch": 1.1739952718676123, + "grad_norm": 2.8863933086395264, + "learning_rate": 4.57088695143403e-06, + "loss": 0.63, + "step": 2483 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 2.5738039016723633, + "learning_rate": 4.570537416688972e-06, + "loss": 0.4702, + "step": 2484 + }, + { + "epoch": 1.1749408983451537, + "grad_norm": 3.003643274307251, + "learning_rate": 4.570187753020629e-06, + "loss": 0.5918, + "step": 2485 + }, + { + "epoch": 1.1754137115839243, + "grad_norm": 2.8619167804718018, + "learning_rate": 4.569837960450772e-06, + "loss": 0.5268, + "step": 2486 + }, + { + "epoch": 1.175886524822695, + "grad_norm": 2.876077175140381, + "learning_rate": 4.569488039001181e-06, + "loss": 0.4915, + "step": 2487 + }, + { + "epoch": 1.1763593380614656, + "grad_norm": 3.407115936279297, + "learning_rate": 4.569137988693644e-06, + "loss": 0.5761, + "step": 2488 + }, + { + "epoch": 1.1768321513002364, + "grad_norm": 2.7292826175689697, + "learning_rate": 4.568787809549958e-06, + "loss": 0.541, + "step": 2489 + }, + { + "epoch": 1.177304964539007, + "grad_norm": 2.8805999755859375, + "learning_rate": 4.568437501591926e-06, + "loss": 0.6223, + "step": 2490 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 2.9264373779296875, + "learning_rate": 4.56808706484136e-06, + "loss": 0.6081, + "step": 2491 + }, + { + "epoch": 1.1782505910165484, + "grad_norm": 2.5167033672332764, + "learning_rate": 4.567736499320082e-06, + "loss": 0.5393, + "step": 2492 + }, + { + "epoch": 1.1787234042553192, + "grad_norm": 3.4647862911224365, + "learning_rate": 4.567385805049918e-06, + "loss": 0.4826, + "step": 2493 + }, + { + "epoch": 1.1791962174940898, + "grad_norm": 2.9824202060699463, + "learning_rate": 4.5670349820527055e-06, + "loss": 0.541, + "step": 
2494 + }, + { + "epoch": 1.1796690307328606, + "grad_norm": 2.997105836868286, + "learning_rate": 4.5666840303502885e-06, + "loss": 0.5771, + "step": 2495 + }, + { + "epoch": 1.1801418439716311, + "grad_norm": 2.8728017807006836, + "learning_rate": 4.56633294996452e-06, + "loss": 0.4877, + "step": 2496 + }, + { + "epoch": 1.180614657210402, + "grad_norm": 2.626498222351074, + "learning_rate": 4.5659817409172565e-06, + "loss": 0.5296, + "step": 2497 + }, + { + "epoch": 1.1810874704491725, + "grad_norm": 2.87037992477417, + "learning_rate": 4.565630403230371e-06, + "loss": 0.539, + "step": 2498 + }, + { + "epoch": 1.1815602836879433, + "grad_norm": 2.5719685554504395, + "learning_rate": 4.5652789369257375e-06, + "loss": 0.5653, + "step": 2499 + }, + { + "epoch": 1.1820330969267139, + "grad_norm": 2.4842135906219482, + "learning_rate": 4.56492734202524e-06, + "loss": 0.515, + "step": 2500 + }, + { + "epoch": 1.1825059101654847, + "grad_norm": 2.640951156616211, + "learning_rate": 4.564575618550773e-06, + "loss": 0.5601, + "step": 2501 + }, + { + "epoch": 1.1829787234042553, + "grad_norm": 2.624394655227661, + "learning_rate": 4.564223766524234e-06, + "loss": 0.5551, + "step": 2502 + }, + { + "epoch": 1.183451536643026, + "grad_norm": 3.014537811279297, + "learning_rate": 4.563871785967533e-06, + "loss": 0.5212, + "step": 2503 + }, + { + "epoch": 1.1839243498817966, + "grad_norm": 2.8756890296936035, + "learning_rate": 4.563519676902585e-06, + "loss": 0.5132, + "step": 2504 + }, + { + "epoch": 1.1843971631205674, + "grad_norm": 2.636781692504883, + "learning_rate": 4.5631674393513145e-06, + "loss": 0.5323, + "step": 2505 + }, + { + "epoch": 1.184869976359338, + "grad_norm": 2.7233786582946777, + "learning_rate": 4.562815073335655e-06, + "loss": 0.5608, + "step": 2506 + }, + { + "epoch": 1.1853427895981088, + "grad_norm": 2.7158713340759277, + "learning_rate": 4.562462578877546e-06, + "loss": 0.5373, + "step": 2507 + }, + { + "epoch": 1.1858156028368794, + "grad_norm": 2.9754762649536133, + "learning_rate": 4.562109955998936e-06, + "loss": 0.5712, + "step": 2508 + }, + { + "epoch": 1.1862884160756502, + "grad_norm": 2.8815054893493652, + "learning_rate": 4.561757204721781e-06, + "loss": 0.6126, + "step": 2509 + }, + { + "epoch": 1.1867612293144207, + "grad_norm": 2.866319417953491, + "learning_rate": 4.561404325068045e-06, + "loss": 0.506, + "step": 2510 + }, + { + "epoch": 1.1872340425531915, + "grad_norm": 2.6187376976013184, + "learning_rate": 4.561051317059701e-06, + "loss": 0.4674, + "step": 2511 + }, + { + "epoch": 1.1877068557919621, + "grad_norm": 2.642552137374878, + "learning_rate": 4.560698180718729e-06, + "loss": 0.4793, + "step": 2512 + }, + { + "epoch": 1.188179669030733, + "grad_norm": 2.7815041542053223, + "learning_rate": 4.560344916067117e-06, + "loss": 0.5034, + "step": 2513 + }, + { + "epoch": 1.1886524822695035, + "grad_norm": 2.70853590965271, + "learning_rate": 4.559991523126862e-06, + "loss": 0.4811, + "step": 2514 + }, + { + "epoch": 1.1891252955082743, + "grad_norm": 2.7049436569213867, + "learning_rate": 4.559638001919967e-06, + "loss": 0.547, + "step": 2515 + }, + { + "epoch": 1.1895981087470449, + "grad_norm": 2.766773223876953, + "learning_rate": 4.559284352468445e-06, + "loss": 0.5362, + "step": 2516 + }, + { + "epoch": 1.1900709219858157, + "grad_norm": 3.0064334869384766, + "learning_rate": 4.558930574794316e-06, + "loss": 0.5915, + "step": 2517 + }, + { + "epoch": 1.1905437352245862, + "grad_norm": 2.4899885654449463, + "learning_rate": 4.558576668919609e-06, + 
"loss": 0.4379, + "step": 2518 + }, + { + "epoch": 1.191016548463357, + "grad_norm": 2.925963878631592, + "learning_rate": 4.558222634866358e-06, + "loss": 0.5389, + "step": 2519 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 6.087667465209961, + "learning_rate": 4.55786847265661e-06, + "loss": 0.4777, + "step": 2520 + }, + { + "epoch": 1.1919621749408984, + "grad_norm": 2.4560582637786865, + "learning_rate": 4.5575141823124145e-06, + "loss": 0.5576, + "step": 2521 + }, + { + "epoch": 1.192434988179669, + "grad_norm": 3.184252977371216, + "learning_rate": 4.557159763855834e-06, + "loss": 0.5151, + "step": 2522 + }, + { + "epoch": 1.1929078014184398, + "grad_norm": 2.359722137451172, + "learning_rate": 4.556805217308935e-06, + "loss": 0.478, + "step": 2523 + }, + { + "epoch": 1.1933806146572103, + "grad_norm": 3.0821568965911865, + "learning_rate": 4.5564505426937935e-06, + "loss": 0.5784, + "step": 2524 + }, + { + "epoch": 1.1938534278959811, + "grad_norm": 2.9905128479003906, + "learning_rate": 4.5560957400324936e-06, + "loss": 0.6087, + "step": 2525 + }, + { + "epoch": 1.1943262411347517, + "grad_norm": 2.462102174758911, + "learning_rate": 4.555740809347128e-06, + "loss": 0.4739, + "step": 2526 + }, + { + "epoch": 1.1947990543735225, + "grad_norm": 2.7931067943573, + "learning_rate": 4.555385750659796e-06, + "loss": 0.4961, + "step": 2527 + }, + { + "epoch": 1.195271867612293, + "grad_norm": 2.660320997238159, + "learning_rate": 4.555030563992607e-06, + "loss": 0.487, + "step": 2528 + }, + { + "epoch": 1.195744680851064, + "grad_norm": 2.8135557174682617, + "learning_rate": 4.554675249367675e-06, + "loss": 0.5269, + "step": 2529 + }, + { + "epoch": 1.1962174940898345, + "grad_norm": 2.661933422088623, + "learning_rate": 4.554319806807126e-06, + "loss": 0.4723, + "step": 2530 + }, + { + "epoch": 1.1966903073286053, + "grad_norm": 2.568176507949829, + "learning_rate": 4.553964236333089e-06, + "loss": 0.5258, + "step": 2531 + }, + { + "epoch": 1.1971631205673758, + "grad_norm": 2.6890947818756104, + "learning_rate": 4.553608537967705e-06, + "loss": 0.4965, + "step": 2532 + }, + { + "epoch": 1.1976359338061466, + "grad_norm": 3.133470058441162, + "learning_rate": 4.553252711733124e-06, + "loss": 0.5423, + "step": 2533 + }, + { + "epoch": 1.1981087470449172, + "grad_norm": 2.7086687088012695, + "learning_rate": 4.552896757651498e-06, + "loss": 0.5326, + "step": 2534 + }, + { + "epoch": 1.198581560283688, + "grad_norm": 2.8411715030670166, + "learning_rate": 4.552540675744994e-06, + "loss": 0.5793, + "step": 2535 + }, + { + "epoch": 1.1990543735224586, + "grad_norm": 3.041077136993408, + "learning_rate": 4.552184466035782e-06, + "loss": 0.5068, + "step": 2536 + }, + { + "epoch": 1.1995271867612294, + "grad_norm": 2.5921192169189453, + "learning_rate": 4.551828128546041e-06, + "loss": 0.5189, + "step": 2537 + }, + { + "epoch": 1.2, + "grad_norm": 2.923305034637451, + "learning_rate": 4.5514716632979605e-06, + "loss": 0.516, + "step": 2538 + }, + { + "epoch": 1.2004728132387708, + "grad_norm": 2.7083024978637695, + "learning_rate": 4.551115070313734e-06, + "loss": 0.4825, + "step": 2539 + }, + { + "epoch": 1.2009456264775413, + "grad_norm": 2.746842384338379, + "learning_rate": 4.550758349615567e-06, + "loss": 0.5691, + "step": 2540 + }, + { + "epoch": 1.2014184397163121, + "grad_norm": 2.6596429347991943, + "learning_rate": 4.550401501225669e-06, + "loss": 0.5983, + "step": 2541 + }, + { + "epoch": 1.2018912529550827, + "grad_norm": 2.9057931900024414, + "learning_rate": 
4.550044525166261e-06, + "loss": 0.5069, + "step": 2542 + }, + { + "epoch": 1.2023640661938535, + "grad_norm": 2.6139039993286133, + "learning_rate": 4.5496874214595686e-06, + "loss": 0.5102, + "step": 2543 + }, + { + "epoch": 1.202836879432624, + "grad_norm": 2.630286455154419, + "learning_rate": 4.5493301901278285e-06, + "loss": 0.4902, + "step": 2544 + }, + { + "epoch": 1.2033096926713949, + "grad_norm": 2.639174222946167, + "learning_rate": 4.548972831193284e-06, + "loss": 0.4566, + "step": 2545 + }, + { + "epoch": 1.2037825059101654, + "grad_norm": 2.9569664001464844, + "learning_rate": 4.548615344678186e-06, + "loss": 0.5636, + "step": 2546 + }, + { + "epoch": 1.2042553191489362, + "grad_norm": 2.981734037399292, + "learning_rate": 4.5482577306047924e-06, + "loss": 0.4884, + "step": 2547 + }, + { + "epoch": 1.2047281323877068, + "grad_norm": 2.6760342121124268, + "learning_rate": 4.547899988995371e-06, + "loss": 0.5426, + "step": 2548 + }, + { + "epoch": 1.2052009456264776, + "grad_norm": 2.825805902481079, + "learning_rate": 4.547542119872198e-06, + "loss": 0.4989, + "step": 2549 + }, + { + "epoch": 1.2056737588652482, + "grad_norm": 2.856426954269409, + "learning_rate": 4.547184123257555e-06, + "loss": 0.5734, + "step": 2550 + }, + { + "epoch": 1.206146572104019, + "grad_norm": 2.555682420730591, + "learning_rate": 4.5468259991737334e-06, + "loss": 0.5299, + "step": 2551 + }, + { + "epoch": 1.2066193853427896, + "grad_norm": 2.6324024200439453, + "learning_rate": 4.546467747643032e-06, + "loss": 0.5906, + "step": 2552 + }, + { + "epoch": 1.2070921985815604, + "grad_norm": 3.4145350456237793, + "learning_rate": 4.546109368687757e-06, + "loss": 0.5153, + "step": 2553 + }, + { + "epoch": 1.207565011820331, + "grad_norm": 2.658691644668579, + "learning_rate": 4.545750862330225e-06, + "loss": 0.5759, + "step": 2554 + }, + { + "epoch": 1.2080378250591017, + "grad_norm": 3.162605047225952, + "learning_rate": 4.545392228592755e-06, + "loss": 0.5379, + "step": 2555 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 2.8631198406219482, + "learning_rate": 4.545033467497681e-06, + "loss": 0.5959, + "step": 2556 + }, + { + "epoch": 1.208983451536643, + "grad_norm": 2.457109212875366, + "learning_rate": 4.54467457906734e-06, + "loss": 0.4864, + "step": 2557 + }, + { + "epoch": 1.2094562647754137, + "grad_norm": 2.5307061672210693, + "learning_rate": 4.544315563324078e-06, + "loss": 0.5308, + "step": 2558 + }, + { + "epoch": 1.2099290780141845, + "grad_norm": 2.8482773303985596, + "learning_rate": 4.543956420290251e-06, + "loss": 0.5126, + "step": 2559 + }, + { + "epoch": 1.210401891252955, + "grad_norm": 2.4990832805633545, + "learning_rate": 4.5435971499882195e-06, + "loss": 0.4534, + "step": 2560 + }, + { + "epoch": 1.2108747044917259, + "grad_norm": 2.6292665004730225, + "learning_rate": 4.543237752440354e-06, + "loss": 0.4434, + "step": 2561 + }, + { + "epoch": 1.2113475177304964, + "grad_norm": 2.865983247756958, + "learning_rate": 4.542878227669033e-06, + "loss": 0.5667, + "step": 2562 + }, + { + "epoch": 1.2118203309692672, + "grad_norm": 2.745614528656006, + "learning_rate": 4.542518575696644e-06, + "loss": 0.4724, + "step": 2563 + }, + { + "epoch": 1.2122931442080378, + "grad_norm": 2.8562581539154053, + "learning_rate": 4.5421587965455785e-06, + "loss": 0.5405, + "step": 2564 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 2.6670095920562744, + "learning_rate": 4.5417988902382385e-06, + "loss": 0.5432, + "step": 2565 + }, + { + "epoch": 1.2132387706855792, + "grad_norm": 
2.9320743083953857, + "learning_rate": 4.541438856797036e-06, + "loss": 0.5862, + "step": 2566 + }, + { + "epoch": 1.21371158392435, + "grad_norm": 2.577505588531494, + "learning_rate": 4.541078696244386e-06, + "loss": 0.4742, + "step": 2567 + }, + { + "epoch": 1.2141843971631205, + "grad_norm": 3.4476120471954346, + "learning_rate": 4.540718408602717e-06, + "loss": 0.5903, + "step": 2568 + }, + { + "epoch": 1.2146572104018913, + "grad_norm": 2.816210985183716, + "learning_rate": 4.540357993894459e-06, + "loss": 0.5033, + "step": 2569 + }, + { + "epoch": 1.215130023640662, + "grad_norm": 3.0806639194488525, + "learning_rate": 4.539997452142058e-06, + "loss": 0.6064, + "step": 2570 + }, + { + "epoch": 1.2156028368794327, + "grad_norm": 2.563060760498047, + "learning_rate": 4.5396367833679586e-06, + "loss": 0.5597, + "step": 2571 + }, + { + "epoch": 1.2160756501182033, + "grad_norm": 3.1014397144317627, + "learning_rate": 4.5392759875946215e-06, + "loss": 0.54, + "step": 2572 + }, + { + "epoch": 1.216548463356974, + "grad_norm": 3.124190330505371, + "learning_rate": 4.53891506484451e-06, + "loss": 0.5122, + "step": 2573 + }, + { + "epoch": 1.2170212765957447, + "grad_norm": 2.6688716411590576, + "learning_rate": 4.538554015140097e-06, + "loss": 0.5615, + "step": 2574 + }, + { + "epoch": 1.2174940898345155, + "grad_norm": 2.775543689727783, + "learning_rate": 4.538192838503866e-06, + "loss": 0.496, + "step": 2575 + }, + { + "epoch": 1.217966903073286, + "grad_norm": 2.7877283096313477, + "learning_rate": 4.537831534958303e-06, + "loss": 0.4995, + "step": 2576 + }, + { + "epoch": 1.2184397163120568, + "grad_norm": 2.824810028076172, + "learning_rate": 4.537470104525906e-06, + "loss": 0.5481, + "step": 2577 + }, + { + "epoch": 1.2189125295508274, + "grad_norm": 2.801269292831421, + "learning_rate": 4.53710854722918e-06, + "loss": 0.5628, + "step": 2578 + }, + { + "epoch": 1.2193853427895982, + "grad_norm": 2.7780683040618896, + "learning_rate": 4.536746863090637e-06, + "loss": 0.4845, + "step": 2579 + }, + { + "epoch": 1.2198581560283688, + "grad_norm": 2.536010265350342, + "learning_rate": 4.536385052132798e-06, + "loss": 0.4771, + "step": 2580 + }, + { + "epoch": 1.2203309692671396, + "grad_norm": 2.768775701522827, + "learning_rate": 4.536023114378191e-06, + "loss": 0.5366, + "step": 2581 + }, + { + "epoch": 1.2208037825059102, + "grad_norm": 2.658125877380371, + "learning_rate": 4.535661049849352e-06, + "loss": 0.524, + "step": 2582 + }, + { + "epoch": 1.2212765957446807, + "grad_norm": 2.558696746826172, + "learning_rate": 4.535298858568825e-06, + "loss": 0.5482, + "step": 2583 + }, + { + "epoch": 1.2217494089834515, + "grad_norm": 2.5284535884857178, + "learning_rate": 4.534936540559164e-06, + "loss": 0.4454, + "step": 2584 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 7.617330074310303, + "learning_rate": 4.534574095842927e-06, + "loss": 0.5615, + "step": 2585 + }, + { + "epoch": 1.222695035460993, + "grad_norm": 2.9120311737060547, + "learning_rate": 4.534211524442682e-06, + "loss": 0.5624, + "step": 2586 + }, + { + "epoch": 1.2231678486997635, + "grad_norm": 2.5004289150238037, + "learning_rate": 4.533848826381005e-06, + "loss": 0.4743, + "step": 2587 + }, + { + "epoch": 1.2236406619385343, + "grad_norm": 2.8395533561706543, + "learning_rate": 4.53348600168048e-06, + "loss": 0.4457, + "step": 2588 + }, + { + "epoch": 1.224113475177305, + "grad_norm": 2.832211494445801, + "learning_rate": 4.533123050363699e-06, + "loss": 0.5559, + "step": 2589 + }, + { + "epoch": 
1.2245862884160756, + "grad_norm": 2.6318583488464355, + "learning_rate": 4.53275997245326e-06, + "loss": 0.5281, + "step": 2590 + }, + { + "epoch": 1.2250591016548462, + "grad_norm": 3.0509233474731445, + "learning_rate": 4.532396767971771e-06, + "loss": 0.6003, + "step": 2591 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 2.6863620281219482, + "learning_rate": 4.532033436941847e-06, + "loss": 0.5219, + "step": 2592 + }, + { + "epoch": 1.2260047281323878, + "grad_norm": 2.401463747024536, + "learning_rate": 4.5316699793861104e-06, + "loss": 0.5994, + "step": 2593 + }, + { + "epoch": 1.2264775413711584, + "grad_norm": 2.613517999649048, + "learning_rate": 4.531306395327194e-06, + "loss": 0.5785, + "step": 2594 + }, + { + "epoch": 1.226950354609929, + "grad_norm": 2.5016374588012695, + "learning_rate": 4.530942684787735e-06, + "loss": 0.5695, + "step": 2595 + }, + { + "epoch": 1.2274231678486998, + "grad_norm": 2.576464891433716, + "learning_rate": 4.53057884779038e-06, + "loss": 0.4427, + "step": 2596 + }, + { + "epoch": 1.2278959810874706, + "grad_norm": 2.5688700675964355, + "learning_rate": 4.530214884357785e-06, + "loss": 0.4966, + "step": 2597 + }, + { + "epoch": 1.2283687943262411, + "grad_norm": 3.179013729095459, + "learning_rate": 4.52985079451261e-06, + "loss": 0.5239, + "step": 2598 + }, + { + "epoch": 1.2288416075650117, + "grad_norm": 2.6015284061431885, + "learning_rate": 4.529486578277527e-06, + "loss": 0.5135, + "step": 2599 + }, + { + "epoch": 1.2293144208037825, + "grad_norm": 2.3029589653015137, + "learning_rate": 4.529122235675214e-06, + "loss": 0.4044, + "step": 2600 + }, + { + "epoch": 1.2297872340425533, + "grad_norm": 2.994093656539917, + "learning_rate": 4.528757766728357e-06, + "loss": 0.5419, + "step": 2601 + }, + { + "epoch": 1.2302600472813239, + "grad_norm": 2.6297390460968018, + "learning_rate": 4.52839317145965e-06, + "loss": 0.488, + "step": 2602 + }, + { + "epoch": 1.2307328605200945, + "grad_norm": 2.4814043045043945, + "learning_rate": 4.528028449891793e-06, + "loss": 0.4917, + "step": 2603 + }, + { + "epoch": 1.2312056737588652, + "grad_norm": 3.6052863597869873, + "learning_rate": 4.527663602047499e-06, + "loss": 0.5301, + "step": 2604 + }, + { + "epoch": 1.231678486997636, + "grad_norm": 2.6984751224517822, + "learning_rate": 4.5272986279494825e-06, + "loss": 0.5253, + "step": 2605 + }, + { + "epoch": 1.2321513002364066, + "grad_norm": 2.514000415802002, + "learning_rate": 4.526933527620469e-06, + "loss": 0.5661, + "step": 2606 + }, + { + "epoch": 1.2326241134751772, + "grad_norm": 2.890921115875244, + "learning_rate": 4.526568301083195e-06, + "loss": 0.5585, + "step": 2607 + }, + { + "epoch": 1.233096926713948, + "grad_norm": 2.6390011310577393, + "learning_rate": 4.526202948360397e-06, + "loss": 0.5168, + "step": 2608 + }, + { + "epoch": 1.2335697399527188, + "grad_norm": 2.7370636463165283, + "learning_rate": 4.5258374694748266e-06, + "loss": 0.5453, + "step": 2609 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 2.8203976154327393, + "learning_rate": 4.52547186444924e-06, + "loss": 0.5763, + "step": 2610 + }, + { + "epoch": 1.23451536643026, + "grad_norm": 2.7567849159240723, + "learning_rate": 4.5251061333064025e-06, + "loss": 0.5194, + "step": 2611 + }, + { + "epoch": 1.2349881796690307, + "grad_norm": 2.767519474029541, + "learning_rate": 4.524740276069085e-06, + "loss": 0.5355, + "step": 2612 + }, + { + "epoch": 1.2354609929078015, + "grad_norm": 3.072035312652588, + "learning_rate": 4.5243742927600695e-06, + "loss": 0.5391, + 
"step": 2613 + }, + { + "epoch": 1.2359338061465721, + "grad_norm": 2.5957462787628174, + "learning_rate": 4.524008183402143e-06, + "loss": 0.5645, + "step": 2614 + }, + { + "epoch": 1.2364066193853427, + "grad_norm": 2.774897575378418, + "learning_rate": 4.523641948018101e-06, + "loss": 0.5576, + "step": 2615 + }, + { + "epoch": 1.2368794326241135, + "grad_norm": 2.635887622833252, + "learning_rate": 4.5232755866307496e-06, + "loss": 0.5254, + "step": 2616 + }, + { + "epoch": 1.2373522458628843, + "grad_norm": 2.4860997200012207, + "learning_rate": 4.522909099262899e-06, + "loss": 0.4692, + "step": 2617 + }, + { + "epoch": 1.2378250591016549, + "grad_norm": 2.595513105392456, + "learning_rate": 4.522542485937369e-06, + "loss": 0.5166, + "step": 2618 + }, + { + "epoch": 1.2382978723404254, + "grad_norm": 2.961474895477295, + "learning_rate": 4.522175746676986e-06, + "loss": 0.5455, + "step": 2619 + }, + { + "epoch": 1.2387706855791962, + "grad_norm": 2.813889741897583, + "learning_rate": 4.521808881504588e-06, + "loss": 0.5249, + "step": 2620 + }, + { + "epoch": 1.239243498817967, + "grad_norm": 2.8434813022613525, + "learning_rate": 4.521441890443015e-06, + "loss": 0.472, + "step": 2621 + }, + { + "epoch": 1.2397163120567376, + "grad_norm": 2.4264845848083496, + "learning_rate": 4.521074773515119e-06, + "loss": 0.4783, + "step": 2622 + }, + { + "epoch": 1.2401891252955082, + "grad_norm": 2.615169048309326, + "learning_rate": 4.520707530743761e-06, + "loss": 0.5324, + "step": 2623 + }, + { + "epoch": 1.240661938534279, + "grad_norm": 2.6772537231445312, + "learning_rate": 4.520340162151803e-06, + "loss": 0.5224, + "step": 2624 + }, + { + "epoch": 1.2411347517730495, + "grad_norm": 2.683393955230713, + "learning_rate": 4.519972667762124e-06, + "loss": 0.4863, + "step": 2625 + }, + { + "epoch": 1.2416075650118203, + "grad_norm": 3.0335750579833984, + "learning_rate": 4.519605047597603e-06, + "loss": 0.544, + "step": 2626 + }, + { + "epoch": 1.242080378250591, + "grad_norm": 2.8694353103637695, + "learning_rate": 4.519237301681132e-06, + "loss": 0.5576, + "step": 2627 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 3.217808246612549, + "learning_rate": 4.518869430035609e-06, + "loss": 0.5459, + "step": 2628 + }, + { + "epoch": 1.2430260047281323, + "grad_norm": 2.7700083255767822, + "learning_rate": 4.518501432683937e-06, + "loss": 0.5579, + "step": 2629 + }, + { + "epoch": 1.243498817966903, + "grad_norm": 2.4759175777435303, + "learning_rate": 4.5181333096490335e-06, + "loss": 0.5049, + "step": 2630 + }, + { + "epoch": 1.2439716312056737, + "grad_norm": 2.8652584552764893, + "learning_rate": 4.517765060953818e-06, + "loss": 0.5366, + "step": 2631 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 2.776334524154663, + "learning_rate": 4.517396686621218e-06, + "loss": 0.5677, + "step": 2632 + }, + { + "epoch": 1.244917257683215, + "grad_norm": 2.676708221435547, + "learning_rate": 4.517028186674174e-06, + "loss": 0.5055, + "step": 2633 + }, + { + "epoch": 1.2453900709219858, + "grad_norm": 2.6851537227630615, + "learning_rate": 4.516659561135629e-06, + "loss": 0.5537, + "step": 2634 + }, + { + "epoch": 1.2458628841607564, + "grad_norm": 2.619971513748169, + "learning_rate": 4.516290810028536e-06, + "loss": 0.5765, + "step": 2635 + }, + { + "epoch": 1.2463356973995272, + "grad_norm": 2.7302334308624268, + "learning_rate": 4.515921933375855e-06, + "loss": 0.5611, + "step": 2636 + }, + { + "epoch": 1.2468085106382978, + "grad_norm": 2.5005829334259033, + "learning_rate": 
4.5155529312005554e-06, + "loss": 0.442, + "step": 2637 + }, + { + "epoch": 1.2472813238770686, + "grad_norm": 2.713587522506714, + "learning_rate": 4.515183803525612e-06, + "loss": 0.5023, + "step": 2638 + }, + { + "epoch": 1.2477541371158392, + "grad_norm": 2.5146236419677734, + "learning_rate": 4.514814550374009e-06, + "loss": 0.5195, + "step": 2639 + }, + { + "epoch": 1.24822695035461, + "grad_norm": 2.761060953140259, + "learning_rate": 4.51444517176874e-06, + "loss": 0.5138, + "step": 2640 + }, + { + "epoch": 1.2486997635933805, + "grad_norm": 3.082329273223877, + "learning_rate": 4.5140756677328026e-06, + "loss": 0.6105, + "step": 2641 + }, + { + "epoch": 1.2491725768321513, + "grad_norm": 2.6933493614196777, + "learning_rate": 4.513706038289205e-06, + "loss": 0.5185, + "step": 2642 + }, + { + "epoch": 1.249645390070922, + "grad_norm": 2.515856981277466, + "learning_rate": 4.513336283460962e-06, + "loss": 0.5375, + "step": 2643 + }, + { + "epoch": 1.2501182033096927, + "grad_norm": 2.8553731441497803, + "learning_rate": 4.512966403271096e-06, + "loss": 0.5582, + "step": 2644 + }, + { + "epoch": 1.2505910165484633, + "grad_norm": 2.640880823135376, + "learning_rate": 4.5125963977426405e-06, + "loss": 0.5125, + "step": 2645 + }, + { + "epoch": 1.251063829787234, + "grad_norm": 2.9845943450927734, + "learning_rate": 4.512226266898631e-06, + "loss": 0.4749, + "step": 2646 + }, + { + "epoch": 1.2515366430260046, + "grad_norm": 2.5131032466888428, + "learning_rate": 4.511856010762116e-06, + "loss": 0.4764, + "step": 2647 + }, + { + "epoch": 1.2520094562647754, + "grad_norm": 2.370638370513916, + "learning_rate": 4.511485629356148e-06, + "loss": 0.5153, + "step": 2648 + }, + { + "epoch": 1.252482269503546, + "grad_norm": 2.912461996078491, + "learning_rate": 4.511115122703791e-06, + "loss": 0.6117, + "step": 2649 + }, + { + "epoch": 1.2529550827423168, + "grad_norm": 2.7308082580566406, + "learning_rate": 4.510744490828113e-06, + "loss": 0.5076, + "step": 2650 + }, + { + "epoch": 1.2534278959810874, + "grad_norm": 2.8524296283721924, + "learning_rate": 4.510373733752193e-06, + "loss": 0.542, + "step": 2651 + }, + { + "epoch": 1.2539007092198582, + "grad_norm": 2.799377202987671, + "learning_rate": 4.5100028514991145e-06, + "loss": 0.486, + "step": 2652 + }, + { + "epoch": 1.2543735224586288, + "grad_norm": 2.7248027324676514, + "learning_rate": 4.509631844091973e-06, + "loss": 0.4972, + "step": 2653 + }, + { + "epoch": 1.2548463356973996, + "grad_norm": 2.8041458129882812, + "learning_rate": 4.5092607115538686e-06, + "loss": 0.588, + "step": 2654 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 2.679417133331299, + "learning_rate": 4.50888945390791e-06, + "loss": 0.4639, + "step": 2655 + }, + { + "epoch": 1.255791962174941, + "grad_norm": 3.1049270629882812, + "learning_rate": 4.508518071177214e-06, + "loss": 0.5857, + "step": 2656 + }, + { + "epoch": 1.2562647754137115, + "grad_norm": 2.8590362071990967, + "learning_rate": 4.508146563384904e-06, + "loss": 0.5451, + "step": 2657 + }, + { + "epoch": 1.2567375886524823, + "grad_norm": 2.9774081707000732, + "learning_rate": 4.507774930554114e-06, + "loss": 0.5493, + "step": 2658 + }, + { + "epoch": 1.2572104018912529, + "grad_norm": 2.617643356323242, + "learning_rate": 4.507403172707983e-06, + "loss": 0.5472, + "step": 2659 + }, + { + "epoch": 1.2576832151300237, + "grad_norm": 2.9195587635040283, + "learning_rate": 4.507031289869658e-06, + "loss": 0.5403, + "step": 2660 + }, + { + "epoch": 1.2581560283687943, + "grad_norm": 
2.706089496612549, + "learning_rate": 4.506659282062295e-06, + "loss": 0.4899, + "step": 2661 + }, + { + "epoch": 1.258628841607565, + "grad_norm": 2.8229358196258545, + "learning_rate": 4.506287149309057e-06, + "loss": 0.5336, + "step": 2662 + }, + { + "epoch": 1.2591016548463356, + "grad_norm": 2.5295674800872803, + "learning_rate": 4.505914891633117e-06, + "loss": 0.4806, + "step": 2663 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 3.098208427429199, + "learning_rate": 4.505542509057651e-06, + "loss": 0.6039, + "step": 2664 + }, + { + "epoch": 1.260047281323877, + "grad_norm": 2.5118041038513184, + "learning_rate": 4.5051700016058475e-06, + "loss": 0.5279, + "step": 2665 + }, + { + "epoch": 1.2605200945626478, + "grad_norm": 2.6901369094848633, + "learning_rate": 4.5047973693009005e-06, + "loss": 0.5515, + "step": 2666 + }, + { + "epoch": 1.2609929078014184, + "grad_norm": 2.5622377395629883, + "learning_rate": 4.504424612166012e-06, + "loss": 0.5405, + "step": 2667 + }, + { + "epoch": 1.2614657210401892, + "grad_norm": 2.685751438140869, + "learning_rate": 4.5040517302243915e-06, + "loss": 0.5797, + "step": 2668 + }, + { + "epoch": 1.2619385342789597, + "grad_norm": 2.8525350093841553, + "learning_rate": 4.503678723499259e-06, + "loss": 0.5561, + "step": 2669 + }, + { + "epoch": 1.2624113475177305, + "grad_norm": 2.803386926651001, + "learning_rate": 4.503305592013836e-06, + "loss": 0.5376, + "step": 2670 + }, + { + "epoch": 1.2628841607565011, + "grad_norm": 2.78633189201355, + "learning_rate": 4.502932335791359e-06, + "loss": 0.4739, + "step": 2671 + }, + { + "epoch": 1.263356973995272, + "grad_norm": 2.8337297439575195, + "learning_rate": 4.502558954855069e-06, + "loss": 0.5406, + "step": 2672 + }, + { + "epoch": 1.2638297872340425, + "grad_norm": 2.610275983810425, + "learning_rate": 4.502185449228213e-06, + "loss": 0.5343, + "step": 2673 + }, + { + "epoch": 1.2643026004728133, + "grad_norm": 2.7842252254486084, + "learning_rate": 4.501811818934048e-06, + "loss": 0.532, + "step": 2674 + }, + { + "epoch": 1.2647754137115839, + "grad_norm": 2.4472389221191406, + "learning_rate": 4.501438063995839e-06, + "loss": 0.4976, + "step": 2675 + }, + { + "epoch": 1.2652482269503547, + "grad_norm": 3.076580762863159, + "learning_rate": 4.501064184436858e-06, + "loss": 0.507, + "step": 2676 + }, + { + "epoch": 1.2657210401891252, + "grad_norm": 2.5952908992767334, + "learning_rate": 4.500690180280384e-06, + "loss": 0.5498, + "step": 2677 + }, + { + "epoch": 1.266193853427896, + "grad_norm": 2.476943016052246, + "learning_rate": 4.500316051549706e-06, + "loss": 0.557, + "step": 2678 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 2.730579376220703, + "learning_rate": 4.499941798268118e-06, + "loss": 0.4975, + "step": 2679 + }, + { + "epoch": 1.2671394799054374, + "grad_norm": 2.7916698455810547, + "learning_rate": 4.499567420458924e-06, + "loss": 0.5673, + "step": 2680 + }, + { + "epoch": 1.267612293144208, + "grad_norm": 2.4249091148376465, + "learning_rate": 4.4991929181454355e-06, + "loss": 0.4836, + "step": 2681 + }, + { + "epoch": 1.2680851063829788, + "grad_norm": 2.661911725997925, + "learning_rate": 4.498818291350969e-06, + "loss": 0.5332, + "step": 2682 + }, + { + "epoch": 1.2685579196217494, + "grad_norm": 2.693657875061035, + "learning_rate": 4.498443540098852e-06, + "loss": 0.5257, + "step": 2683 + }, + { + "epoch": 1.2690307328605201, + "grad_norm": 2.609386682510376, + "learning_rate": 4.4980686644124195e-06, + "loss": 0.4918, + "step": 2684 + }, + { + "epoch": 
1.2695035460992907, + "grad_norm": 3.2104930877685547, + "learning_rate": 4.4976936643150124e-06, + "loss": 0.6097, + "step": 2685 + }, + { + "epoch": 1.2699763593380615, + "grad_norm": 2.707860231399536, + "learning_rate": 4.49731853982998e-06, + "loss": 0.5109, + "step": 2686 + }, + { + "epoch": 1.270449172576832, + "grad_norm": 3.5046379566192627, + "learning_rate": 4.49694329098068e-06, + "loss": 0.5883, + "step": 2687 + }, + { + "epoch": 1.270921985815603, + "grad_norm": 2.5362324714660645, + "learning_rate": 4.496567917790477e-06, + "loss": 0.5301, + "step": 2688 + }, + { + "epoch": 1.2713947990543735, + "grad_norm": 2.7095518112182617, + "learning_rate": 4.496192420282746e-06, + "loss": 0.4772, + "step": 2689 + }, + { + "epoch": 1.2718676122931443, + "grad_norm": 2.416433095932007, + "learning_rate": 4.495816798480865e-06, + "loss": 0.5012, + "step": 2690 + }, + { + "epoch": 1.2723404255319148, + "grad_norm": 2.5362391471862793, + "learning_rate": 4.495441052408224e-06, + "loss": 0.5197, + "step": 2691 + }, + { + "epoch": 1.2728132387706856, + "grad_norm": 2.9093947410583496, + "learning_rate": 4.495065182088218e-06, + "loss": 0.4893, + "step": 2692 + }, + { + "epoch": 1.2732860520094562, + "grad_norm": 2.520470142364502, + "learning_rate": 4.494689187544251e-06, + "loss": 0.5072, + "step": 2693 + }, + { + "epoch": 1.273758865248227, + "grad_norm": 2.4385125637054443, + "learning_rate": 4.494313068799735e-06, + "loss": 0.4923, + "step": 2694 + }, + { + "epoch": 1.2742316784869976, + "grad_norm": 2.636852502822876, + "learning_rate": 4.493936825878089e-06, + "loss": 0.5409, + "step": 2695 + }, + { + "epoch": 1.2747044917257684, + "grad_norm": 2.7027053833007812, + "learning_rate": 4.493560458802741e-06, + "loss": 0.5906, + "step": 2696 + }, + { + "epoch": 1.275177304964539, + "grad_norm": 2.58752179145813, + "learning_rate": 4.493183967597123e-06, + "loss": 0.5292, + "step": 2697 + }, + { + "epoch": 1.2756501182033098, + "grad_norm": 2.7658379077911377, + "learning_rate": 4.49280735228468e-06, + "loss": 0.5613, + "step": 2698 + }, + { + "epoch": 1.2761229314420803, + "grad_norm": 3.272688388824463, + "learning_rate": 4.492430612888861e-06, + "loss": 0.5654, + "step": 2699 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 2.806819438934326, + "learning_rate": 4.492053749433125e-06, + "loss": 0.5388, + "step": 2700 + }, + { + "epoch": 1.2770685579196217, + "grad_norm": 2.879727602005005, + "learning_rate": 4.491676761940936e-06, + "loss": 0.5033, + "step": 2701 + }, + { + "epoch": 1.2775413711583925, + "grad_norm": 2.733347177505493, + "learning_rate": 4.4912996504357695e-06, + "loss": 0.5113, + "step": 2702 + }, + { + "epoch": 1.278014184397163, + "grad_norm": 2.7431252002716064, + "learning_rate": 4.490922414941104e-06, + "loss": 0.5417, + "step": 2703 + }, + { + "epoch": 1.2784869976359339, + "grad_norm": 2.9287240505218506, + "learning_rate": 4.490545055480431e-06, + "loss": 0.5875, + "step": 2704 + }, + { + "epoch": 1.2789598108747045, + "grad_norm": 2.576775550842285, + "learning_rate": 4.490167572077244e-06, + "loss": 0.5176, + "step": 2705 + }, + { + "epoch": 1.2794326241134752, + "grad_norm": 2.4335594177246094, + "learning_rate": 4.4897899647550505e-06, + "loss": 0.4749, + "step": 2706 + }, + { + "epoch": 1.2799054373522458, + "grad_norm": 2.6798062324523926, + "learning_rate": 4.489412233537361e-06, + "loss": 0.5439, + "step": 2707 + }, + { + "epoch": 1.2803782505910166, + "grad_norm": 2.8440675735473633, + "learning_rate": 4.489034378447693e-06, + "loss": 0.552, + 
"step": 2708 + }, + { + "epoch": 1.2808510638297872, + "grad_norm": 2.9059503078460693, + "learning_rate": 4.488656399509577e-06, + "loss": 0.5667, + "step": 2709 + }, + { + "epoch": 1.281323877068558, + "grad_norm": 2.7415006160736084, + "learning_rate": 4.488278296746548e-06, + "loss": 0.5676, + "step": 2710 + }, + { + "epoch": 1.2817966903073286, + "grad_norm": 2.4584875106811523, + "learning_rate": 4.487900070182147e-06, + "loss": 0.4787, + "step": 2711 + }, + { + "epoch": 1.2822695035460994, + "grad_norm": 2.990940809249878, + "learning_rate": 4.487521719839924e-06, + "loss": 0.5239, + "step": 2712 + }, + { + "epoch": 1.28274231678487, + "grad_norm": 3.075201988220215, + "learning_rate": 4.487143245743441e-06, + "loss": 0.5103, + "step": 2713 + }, + { + "epoch": 1.2832151300236407, + "grad_norm": 2.543341875076294, + "learning_rate": 4.486764647916259e-06, + "loss": 0.5475, + "step": 2714 + }, + { + "epoch": 1.2836879432624113, + "grad_norm": 2.9927213191986084, + "learning_rate": 4.486385926381957e-06, + "loss": 0.4923, + "step": 2715 + }, + { + "epoch": 1.284160756501182, + "grad_norm": 2.4220657348632812, + "learning_rate": 4.486007081164111e-06, + "loss": 0.543, + "step": 2716 + }, + { + "epoch": 1.2846335697399527, + "grad_norm": 2.468214988708496, + "learning_rate": 4.4856281122863134e-06, + "loss": 0.5248, + "step": 2717 + }, + { + "epoch": 1.2851063829787235, + "grad_norm": 2.633711099624634, + "learning_rate": 4.48524901977216e-06, + "loss": 0.4764, + "step": 2718 + }, + { + "epoch": 1.285579196217494, + "grad_norm": 2.8399546146392822, + "learning_rate": 4.484869803645254e-06, + "loss": 0.5503, + "step": 2719 + }, + { + "epoch": 1.2860520094562649, + "grad_norm": 2.769063949584961, + "learning_rate": 4.484490463929209e-06, + "loss": 0.5468, + "step": 2720 + }, + { + "epoch": 1.2865248226950354, + "grad_norm": 2.617863893508911, + "learning_rate": 4.4841110006476465e-06, + "loss": 0.5906, + "step": 2721 + }, + { + "epoch": 1.2869976359338062, + "grad_norm": 2.7639541625976562, + "learning_rate": 4.4837314138241905e-06, + "loss": 0.552, + "step": 2722 + }, + { + "epoch": 1.2874704491725768, + "grad_norm": 2.7711129188537598, + "learning_rate": 4.483351703482478e-06, + "loss": 0.5229, + "step": 2723 + }, + { + "epoch": 1.2879432624113476, + "grad_norm": 2.611205577850342, + "learning_rate": 4.482971869646152e-06, + "loss": 0.5055, + "step": 2724 + }, + { + "epoch": 1.2884160756501182, + "grad_norm": 2.8602211475372314, + "learning_rate": 4.482591912338862e-06, + "loss": 0.5561, + "step": 2725 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 2.5882298946380615, + "learning_rate": 4.4822118315842675e-06, + "loss": 0.5555, + "step": 2726 + }, + { + "epoch": 1.2893617021276595, + "grad_norm": 2.7533531188964844, + "learning_rate": 4.481831627406033e-06, + "loss": 0.5346, + "step": 2727 + }, + { + "epoch": 1.2898345153664303, + "grad_norm": 2.4296958446502686, + "learning_rate": 4.481451299827835e-06, + "loss": 0.4915, + "step": 2728 + }, + { + "epoch": 1.290307328605201, + "grad_norm": 2.4403445720672607, + "learning_rate": 4.481070848873352e-06, + "loss": 0.5648, + "step": 2729 + }, + { + "epoch": 1.2907801418439715, + "grad_norm": 2.473224401473999, + "learning_rate": 4.480690274566274e-06, + "loss": 0.4849, + "step": 2730 + }, + { + "epoch": 1.2912529550827423, + "grad_norm": 2.637899875640869, + "learning_rate": 4.480309576930297e-06, + "loss": 0.4968, + "step": 2731 + }, + { + "epoch": 1.291725768321513, + "grad_norm": 2.7156927585601807, + "learning_rate": 
4.479928755989127e-06, + "loss": 0.4759, + "step": 2732 + }, + { + "epoch": 1.2921985815602837, + "grad_norm": 2.632786989212036, + "learning_rate": 4.479547811766475e-06, + "loss": 0.5468, + "step": 2733 + }, + { + "epoch": 1.2926713947990542, + "grad_norm": 2.529218912124634, + "learning_rate": 4.479166744286061e-06, + "loss": 0.4852, + "step": 2734 + }, + { + "epoch": 1.293144208037825, + "grad_norm": 2.561978340148926, + "learning_rate": 4.4787855535716115e-06, + "loss": 0.546, + "step": 2735 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 2.3684909343719482, + "learning_rate": 4.478404239646862e-06, + "loss": 0.5369, + "step": 2736 + }, + { + "epoch": 1.2940898345153664, + "grad_norm": 2.8940367698669434, + "learning_rate": 4.4780228025355566e-06, + "loss": 0.568, + "step": 2737 + }, + { + "epoch": 1.294562647754137, + "grad_norm": 2.6950316429138184, + "learning_rate": 4.477641242261445e-06, + "loss": 0.4576, + "step": 2738 + }, + { + "epoch": 1.2950354609929078, + "grad_norm": 2.4211716651916504, + "learning_rate": 4.4772595588482835e-06, + "loss": 0.4341, + "step": 2739 + }, + { + "epoch": 1.2955082742316786, + "grad_norm": 3.141097068786621, + "learning_rate": 4.47687775231984e-06, + "loss": 0.5944, + "step": 2740 + }, + { + "epoch": 1.2959810874704492, + "grad_norm": 3.077522039413452, + "learning_rate": 4.476495822699887e-06, + "loss": 0.5786, + "step": 2741 + }, + { + "epoch": 1.2964539007092197, + "grad_norm": 2.708139419555664, + "learning_rate": 4.476113770012206e-06, + "loss": 0.5014, + "step": 2742 + }, + { + "epoch": 1.2969267139479905, + "grad_norm": 2.7572035789489746, + "learning_rate": 4.475731594280586e-06, + "loss": 0.594, + "step": 2743 + }, + { + "epoch": 1.2973995271867613, + "grad_norm": 2.673126459121704, + "learning_rate": 4.475349295528822e-06, + "loss": 0.5317, + "step": 2744 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 2.6757819652557373, + "learning_rate": 4.4749668737807195e-06, + "loss": 0.5614, + "step": 2745 + }, + { + "epoch": 1.2983451536643025, + "grad_norm": 2.7077620029449463, + "learning_rate": 4.47458432906009e-06, + "loss": 0.4916, + "step": 2746 + }, + { + "epoch": 1.2988179669030733, + "grad_norm": 2.446570873260498, + "learning_rate": 4.474201661390752e-06, + "loss": 0.5005, + "step": 2747 + }, + { + "epoch": 1.299290780141844, + "grad_norm": 2.642695665359497, + "learning_rate": 4.473818870796533e-06, + "loss": 0.5048, + "step": 2748 + }, + { + "epoch": 1.2997635933806146, + "grad_norm": 2.519824743270874, + "learning_rate": 4.4734359573012686e-06, + "loss": 0.5131, + "step": 2749 + }, + { + "epoch": 1.3002364066193852, + "grad_norm": 2.5901925563812256, + "learning_rate": 4.4730529209287995e-06, + "loss": 0.4582, + "step": 2750 + }, + { + "epoch": 1.300709219858156, + "grad_norm": 2.6789121627807617, + "learning_rate": 4.472669761702978e-06, + "loss": 0.5685, + "step": 2751 + }, + { + "epoch": 1.3011820330969268, + "grad_norm": 2.408003807067871, + "learning_rate": 4.472286479647659e-06, + "loss": 0.4329, + "step": 2752 + }, + { + "epoch": 1.3016548463356974, + "grad_norm": 2.681403398513794, + "learning_rate": 4.47190307478671e-06, + "loss": 0.4853, + "step": 2753 + }, + { + "epoch": 1.302127659574468, + "grad_norm": 2.9923183917999268, + "learning_rate": 4.4715195471440025e-06, + "loss": 0.5184, + "step": 2754 + }, + { + "epoch": 1.3026004728132388, + "grad_norm": 2.5100321769714355, + "learning_rate": 4.471135896743418e-06, + "loss": 0.5148, + "step": 2755 + }, + { + "epoch": 1.3030732860520096, + "grad_norm": 
2.267881393432617, + "learning_rate": 4.4707521236088444e-06, + "loss": 0.5028, + "step": 2756 + }, + { + "epoch": 1.3035460992907801, + "grad_norm": 2.7779829502105713, + "learning_rate": 4.4703682277641775e-06, + "loss": 0.5724, + "step": 2757 + }, + { + "epoch": 1.3040189125295507, + "grad_norm": 2.4262194633483887, + "learning_rate": 4.4699842092333205e-06, + "loss": 0.5341, + "step": 2758 + }, + { + "epoch": 1.3044917257683215, + "grad_norm": 2.8682050704956055, + "learning_rate": 4.469600068040185e-06, + "loss": 0.6114, + "step": 2759 + }, + { + "epoch": 1.3049645390070923, + "grad_norm": 2.647853374481201, + "learning_rate": 4.46921580420869e-06, + "loss": 0.5107, + "step": 2760 + }, + { + "epoch": 1.3054373522458629, + "grad_norm": 2.561998128890991, + "learning_rate": 4.468831417762762e-06, + "loss": 0.6019, + "step": 2761 + }, + { + "epoch": 1.3059101654846335, + "grad_norm": 2.763425350189209, + "learning_rate": 4.468446908726334e-06, + "loss": 0.572, + "step": 2762 + }, + { + "epoch": 1.3063829787234043, + "grad_norm": 2.7052934169769287, + "learning_rate": 4.468062277123348e-06, + "loss": 0.4876, + "step": 2763 + }, + { + "epoch": 1.306855791962175, + "grad_norm": 2.997845411300659, + "learning_rate": 4.467677522977755e-06, + "loss": 0.5683, + "step": 2764 + }, + { + "epoch": 1.3073286052009456, + "grad_norm": 2.503129005432129, + "learning_rate": 4.46729264631351e-06, + "loss": 0.4951, + "step": 2765 + }, + { + "epoch": 1.3078014184397162, + "grad_norm": 2.617492437362671, + "learning_rate": 4.466907647154578e-06, + "loss": 0.5054, + "step": 2766 + }, + { + "epoch": 1.308274231678487, + "grad_norm": 2.934967279434204, + "learning_rate": 4.4665225255249315e-06, + "loss": 0.5299, + "step": 2767 + }, + { + "epoch": 1.3087470449172578, + "grad_norm": 2.787252187728882, + "learning_rate": 4.46613728144855e-06, + "loss": 0.4652, + "step": 2768 + }, + { + "epoch": 1.3092198581560284, + "grad_norm": 2.567439556121826, + "learning_rate": 4.465751914949422e-06, + "loss": 0.538, + "step": 2769 + }, + { + "epoch": 1.309692671394799, + "grad_norm": 2.6386024951934814, + "learning_rate": 4.4653664260515416e-06, + "loss": 0.464, + "step": 2770 + }, + { + "epoch": 1.3101654846335697, + "grad_norm": 2.966848134994507, + "learning_rate": 4.464980814778912e-06, + "loss": 0.4889, + "step": 2771 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 2.571256637573242, + "learning_rate": 4.464595081155542e-06, + "loss": 0.4979, + "step": 2772 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 2.774203062057495, + "learning_rate": 4.4642092252054515e-06, + "loss": 0.5366, + "step": 2773 + }, + { + "epoch": 1.3115839243498817, + "grad_norm": 2.682969331741333, + "learning_rate": 4.463823246952666e-06, + "loss": 0.5118, + "step": 2774 + }, + { + "epoch": 1.3120567375886525, + "grad_norm": 2.4873905181884766, + "learning_rate": 4.463437146421217e-06, + "loss": 0.5548, + "step": 2775 + }, + { + "epoch": 1.3125295508274233, + "grad_norm": 2.6769661903381348, + "learning_rate": 4.463050923635147e-06, + "loss": 0.5023, + "step": 2776 + }, + { + "epoch": 1.3130023640661939, + "grad_norm": 2.7190892696380615, + "learning_rate": 4.462664578618503e-06, + "loss": 0.5546, + "step": 2777 + }, + { + "epoch": 1.3134751773049644, + "grad_norm": 2.8193624019622803, + "learning_rate": 4.462278111395343e-06, + "loss": 0.5265, + "step": 2778 + }, + { + "epoch": 1.3139479905437352, + "grad_norm": 2.7324538230895996, + "learning_rate": 4.461891521989728e-06, + "loss": 0.5449, + "step": 2779 + }, + { + "epoch": 
1.314420803782506, + "grad_norm": 2.87320876121521, + "learning_rate": 4.4615048104257305e-06, + "loss": 0.5367, + "step": 2780 + }, + { + "epoch": 1.3148936170212766, + "grad_norm": 2.6777031421661377, + "learning_rate": 4.4611179767274306e-06, + "loss": 0.5026, + "step": 2781 + }, + { + "epoch": 1.3153664302600472, + "grad_norm": 3.714524269104004, + "learning_rate": 4.460731020918913e-06, + "loss": 0.569, + "step": 2782 + }, + { + "epoch": 1.315839243498818, + "grad_norm": 2.7493600845336914, + "learning_rate": 4.460343943024273e-06, + "loss": 0.5826, + "step": 2783 + }, + { + "epoch": 1.3163120567375888, + "grad_norm": 2.6544079780578613, + "learning_rate": 4.459956743067609e-06, + "loss": 0.5399, + "step": 2784 + }, + { + "epoch": 1.3167848699763594, + "grad_norm": 2.4338037967681885, + "learning_rate": 4.459569421073036e-06, + "loss": 0.5186, + "step": 2785 + }, + { + "epoch": 1.31725768321513, + "grad_norm": 2.9312374591827393, + "learning_rate": 4.459181977064665e-06, + "loss": 0.5571, + "step": 2786 + }, + { + "epoch": 1.3177304964539007, + "grad_norm": 2.5988922119140625, + "learning_rate": 4.458794411066624e-06, + "loss": 0.5926, + "step": 2787 + }, + { + "epoch": 1.3182033096926715, + "grad_norm": 2.5193772315979004, + "learning_rate": 4.458406723103044e-06, + "loss": 0.5243, + "step": 2788 + }, + { + "epoch": 1.318676122931442, + "grad_norm": 2.8653743267059326, + "learning_rate": 4.458018913198066e-06, + "loss": 0.5421, + "step": 2789 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 2.486245632171631, + "learning_rate": 4.457630981375834e-06, + "loss": 0.4862, + "step": 2790 + }, + { + "epoch": 1.3196217494089835, + "grad_norm": 3.155435800552368, + "learning_rate": 4.457242927660506e-06, + "loss": 0.5386, + "step": 2791 + }, + { + "epoch": 1.3200945626477543, + "grad_norm": 3.102023124694824, + "learning_rate": 4.456854752076242e-06, + "loss": 0.5527, + "step": 2792 + }, + { + "epoch": 1.3205673758865248, + "grad_norm": 2.7995986938476562, + "learning_rate": 4.456466454647215e-06, + "loss": 0.4364, + "step": 2793 + }, + { + "epoch": 1.3210401891252954, + "grad_norm": 2.8328311443328857, + "learning_rate": 4.456078035397599e-06, + "loss": 0.5516, + "step": 2794 + }, + { + "epoch": 1.3215130023640662, + "grad_norm": 2.606161594390869, + "learning_rate": 4.455689494351581e-06, + "loss": 0.5042, + "step": 2795 + }, + { + "epoch": 1.321985815602837, + "grad_norm": 2.6344757080078125, + "learning_rate": 4.455300831533354e-06, + "loss": 0.4807, + "step": 2796 + }, + { + "epoch": 1.3224586288416076, + "grad_norm": 2.8539786338806152, + "learning_rate": 4.454912046967118e-06, + "loss": 0.4694, + "step": 2797 + }, + { + "epoch": 1.3229314420803782, + "grad_norm": 2.849066734313965, + "learning_rate": 4.454523140677081e-06, + "loss": 0.5037, + "step": 2798 + }, + { + "epoch": 1.323404255319149, + "grad_norm": 2.6803371906280518, + "learning_rate": 4.454134112687458e-06, + "loss": 0.4959, + "step": 2799 + }, + { + "epoch": 1.3238770685579198, + "grad_norm": 3.0546066761016846, + "learning_rate": 4.453744963022473e-06, + "loss": 0.5935, + "step": 2800 + }, + { + "epoch": 1.3243498817966903, + "grad_norm": 2.625602960586548, + "learning_rate": 4.453355691706356e-06, + "loss": 0.5349, + "step": 2801 + }, + { + "epoch": 1.324822695035461, + "grad_norm": 2.7568554878234863, + "learning_rate": 4.452966298763345e-06, + "loss": 0.5012, + "step": 2802 + }, + { + "epoch": 1.3252955082742317, + "grad_norm": 2.940427303314209, + "learning_rate": 4.452576784217686e-06, + "loss": 0.5246, + 
"step": 2803 + }, + { + "epoch": 1.3257683215130025, + "grad_norm": 2.5485289096832275, + "learning_rate": 4.452187148093633e-06, + "loss": 0.5282, + "step": 2804 + }, + { + "epoch": 1.326241134751773, + "grad_norm": 2.8152987957000732, + "learning_rate": 4.4517973904154455e-06, + "loss": 0.5468, + "step": 2805 + }, + { + "epoch": 1.3267139479905437, + "grad_norm": 2.9399688243865967, + "learning_rate": 4.451407511207393e-06, + "loss": 0.5586, + "step": 2806 + }, + { + "epoch": 1.3271867612293144, + "grad_norm": 2.3870036602020264, + "learning_rate": 4.451017510493751e-06, + "loss": 0.4807, + "step": 2807 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 3.4667887687683105, + "learning_rate": 4.450627388298805e-06, + "loss": 0.5571, + "step": 2808 + }, + { + "epoch": 1.3281323877068558, + "grad_norm": 2.685986042022705, + "learning_rate": 4.450237144646844e-06, + "loss": 0.5525, + "step": 2809 + }, + { + "epoch": 1.3286052009456264, + "grad_norm": 2.8529131412506104, + "learning_rate": 4.449846779562168e-06, + "loss": 0.491, + "step": 2810 + }, + { + "epoch": 1.3290780141843972, + "grad_norm": 2.7360332012176514, + "learning_rate": 4.449456293069082e-06, + "loss": 0.5574, + "step": 2811 + }, + { + "epoch": 1.3295508274231678, + "grad_norm": 2.4656026363372803, + "learning_rate": 4.4490656851919015e-06, + "loss": 0.4678, + "step": 2812 + }, + { + "epoch": 1.3300236406619386, + "grad_norm": 2.602651357650757, + "learning_rate": 4.448674955954947e-06, + "loss": 0.5118, + "step": 2813 + }, + { + "epoch": 1.3304964539007091, + "grad_norm": 3.0129756927490234, + "learning_rate": 4.448284105382548e-06, + "loss": 0.6136, + "step": 2814 + }, + { + "epoch": 1.33096926713948, + "grad_norm": 2.8499927520751953, + "learning_rate": 4.447893133499039e-06, + "loss": 0.5286, + "step": 2815 + }, + { + "epoch": 1.3314420803782505, + "grad_norm": 2.8320744037628174, + "learning_rate": 4.447502040328767e-06, + "loss": 0.5186, + "step": 2816 + }, + { + "epoch": 1.3319148936170213, + "grad_norm": 2.499950885772705, + "learning_rate": 4.447110825896084e-06, + "loss": 0.5338, + "step": 2817 + }, + { + "epoch": 1.3323877068557919, + "grad_norm": 2.530895233154297, + "learning_rate": 4.446719490225346e-06, + "loss": 0.5151, + "step": 2818 + }, + { + "epoch": 1.3328605200945627, + "grad_norm": 2.5276098251342773, + "learning_rate": 4.446328033340921e-06, + "loss": 0.5424, + "step": 2819 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.90218186378479, + "learning_rate": 4.4459364552671845e-06, + "loss": 0.5747, + "step": 2820 + }, + { + "epoch": 1.333806146572104, + "grad_norm": 2.500943183898926, + "learning_rate": 4.445544756028518e-06, + "loss": 0.5459, + "step": 2821 + }, + { + "epoch": 1.3342789598108746, + "grad_norm": 2.960374355316162, + "learning_rate": 4.44515293564931e-06, + "loss": 0.6092, + "step": 2822 + }, + { + "epoch": 1.3347517730496454, + "grad_norm": 2.813671827316284, + "learning_rate": 4.444760994153958e-06, + "loss": 0.5536, + "step": 2823 + }, + { + "epoch": 1.335224586288416, + "grad_norm": 2.7147483825683594, + "learning_rate": 4.444368931566867e-06, + "loss": 0.5291, + "step": 2824 + }, + { + "epoch": 1.3356973995271868, + "grad_norm": 2.710101842880249, + "learning_rate": 4.443976747912447e-06, + "loss": 0.5138, + "step": 2825 + }, + { + "epoch": 1.3361702127659574, + "grad_norm": 2.711419105529785, + "learning_rate": 4.443584443215121e-06, + "loss": 0.5223, + "step": 2826 + }, + { + "epoch": 1.3366430260047282, + "grad_norm": 2.887472152709961, + "learning_rate": 
4.443192017499313e-06, + "loss": 0.5464, + "step": 2827 + }, + { + "epoch": 1.3371158392434987, + "grad_norm": 2.8867223262786865, + "learning_rate": 4.4427994707894585e-06, + "loss": 0.5748, + "step": 2828 + }, + { + "epoch": 1.3375886524822695, + "grad_norm": 2.407247543334961, + "learning_rate": 4.44240680311e-06, + "loss": 0.4727, + "step": 2829 + }, + { + "epoch": 1.3380614657210401, + "grad_norm": 2.578420877456665, + "learning_rate": 4.4420140144853865e-06, + "loss": 0.5129, + "step": 2830 + }, + { + "epoch": 1.338534278959811, + "grad_norm": 2.884373426437378, + "learning_rate": 4.441621104940077e-06, + "loss": 0.5366, + "step": 2831 + }, + { + "epoch": 1.3390070921985815, + "grad_norm": 2.8652374744415283, + "learning_rate": 4.441228074498534e-06, + "loss": 0.5045, + "step": 2832 + }, + { + "epoch": 1.3394799054373523, + "grad_norm": 2.5380210876464844, + "learning_rate": 4.440834923185231e-06, + "loss": 0.509, + "step": 2833 + }, + { + "epoch": 1.3399527186761229, + "grad_norm": 2.415734052658081, + "learning_rate": 4.440441651024648e-06, + "loss": 0.5066, + "step": 2834 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 2.503051996231079, + "learning_rate": 4.440048258041272e-06, + "loss": 0.5118, + "step": 2835 + }, + { + "epoch": 1.3408983451536642, + "grad_norm": 3.351001024246216, + "learning_rate": 4.439654744259598e-06, + "loss": 0.5758, + "step": 2836 + }, + { + "epoch": 1.341371158392435, + "grad_norm": 2.7368781566619873, + "learning_rate": 4.439261109704129e-06, + "loss": 0.5674, + "step": 2837 + }, + { + "epoch": 1.3418439716312056, + "grad_norm": 3.008199453353882, + "learning_rate": 4.438867354399372e-06, + "loss": 0.5891, + "step": 2838 + }, + { + "epoch": 1.3423167848699764, + "grad_norm": 2.538907766342163, + "learning_rate": 4.438473478369847e-06, + "loss": 0.5102, + "step": 2839 + }, + { + "epoch": 1.342789598108747, + "grad_norm": 2.7169063091278076, + "learning_rate": 4.438079481640079e-06, + "loss": 0.6131, + "step": 2840 + }, + { + "epoch": 1.3432624113475178, + "grad_norm": 2.7411608695983887, + "learning_rate": 4.437685364234601e-06, + "loss": 0.5337, + "step": 2841 + }, + { + "epoch": 1.3437352245862884, + "grad_norm": 3.2374939918518066, + "learning_rate": 4.43729112617795e-06, + "loss": 0.5401, + "step": 2842 + }, + { + "epoch": 1.3442080378250592, + "grad_norm": 2.4712226390838623, + "learning_rate": 4.436896767494676e-06, + "loss": 0.5365, + "step": 2843 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 2.661619186401367, + "learning_rate": 4.436502288209334e-06, + "loss": 0.4919, + "step": 2844 + }, + { + "epoch": 1.3451536643026005, + "grad_norm": 2.5943779945373535, + "learning_rate": 4.4361076883464845e-06, + "loss": 0.5253, + "step": 2845 + }, + { + "epoch": 1.345626477541371, + "grad_norm": 2.672297477722168, + "learning_rate": 4.4357129679307e-06, + "loss": 0.541, + "step": 2846 + }, + { + "epoch": 1.346099290780142, + "grad_norm": 2.6830925941467285, + "learning_rate": 4.435318126986557e-06, + "loss": 0.5641, + "step": 2847 + }, + { + "epoch": 1.3465721040189125, + "grad_norm": 2.7394626140594482, + "learning_rate": 4.434923165538639e-06, + "loss": 0.5591, + "step": 2848 + }, + { + "epoch": 1.3470449172576833, + "grad_norm": 2.9656317234039307, + "learning_rate": 4.434528083611541e-06, + "loss": 0.515, + "step": 2849 + }, + { + "epoch": 1.3475177304964538, + "grad_norm": 3.30155086517334, + "learning_rate": 4.434132881229861e-06, + "loss": 0.5871, + "step": 2850 + }, + { + "epoch": 1.3479905437352246, + "grad_norm": 
2.6222476959228516, + "learning_rate": 4.433737558418209e-06, + "loss": 0.5143, + "step": 2851 + }, + { + "epoch": 1.3484633569739952, + "grad_norm": 2.903158187866211, + "learning_rate": 4.4333421152011965e-06, + "loss": 0.4484, + "step": 2852 + }, + { + "epoch": 1.348936170212766, + "grad_norm": 2.863116979598999, + "learning_rate": 4.432946551603449e-06, + "loss": 0.5213, + "step": 2853 + }, + { + "epoch": 1.3494089834515366, + "grad_norm": 2.8253962993621826, + "learning_rate": 4.432550867649596e-06, + "loss": 0.5713, + "step": 2854 + }, + { + "epoch": 1.3498817966903074, + "grad_norm": 2.652493953704834, + "learning_rate": 4.432155063364273e-06, + "loss": 0.5559, + "step": 2855 + }, + { + "epoch": 1.350354609929078, + "grad_norm": 2.4289376735687256, + "learning_rate": 4.431759138772127e-06, + "loss": 0.5122, + "step": 2856 + }, + { + "epoch": 1.3508274231678488, + "grad_norm": 2.6329853534698486, + "learning_rate": 4.43136309389781e-06, + "loss": 0.5332, + "step": 2857 + }, + { + "epoch": 1.3513002364066193, + "grad_norm": 2.431103229522705, + "learning_rate": 4.430966928765982e-06, + "loss": 0.4863, + "step": 2858 + }, + { + "epoch": 1.3517730496453901, + "grad_norm": 2.7529025077819824, + "learning_rate": 4.4305706434013106e-06, + "loss": 0.5263, + "step": 2859 + }, + { + "epoch": 1.3522458628841607, + "grad_norm": 2.884605646133423, + "learning_rate": 4.43017423782847e-06, + "loss": 0.564, + "step": 2860 + }, + { + "epoch": 1.3527186761229315, + "grad_norm": 3.027771234512329, + "learning_rate": 4.4297777120721435e-06, + "loss": 0.5846, + "step": 2861 + }, + { + "epoch": 1.353191489361702, + "grad_norm": 3.0140626430511475, + "learning_rate": 4.4293810661570205e-06, + "loss": 0.6621, + "step": 2862 + }, + { + "epoch": 1.3536643026004729, + "grad_norm": 2.721799612045288, + "learning_rate": 4.428984300107799e-06, + "loss": 0.5566, + "step": 2863 + }, + { + "epoch": 1.3541371158392435, + "grad_norm": 3.0016496181488037, + "learning_rate": 4.428587413949183e-06, + "loss": 0.5525, + "step": 2864 + }, + { + "epoch": 1.3546099290780143, + "grad_norm": 2.77138614654541, + "learning_rate": 4.428190407705886e-06, + "loss": 0.6016, + "step": 2865 + }, + { + "epoch": 1.3550827423167848, + "grad_norm": 2.9783477783203125, + "learning_rate": 4.427793281402627e-06, + "loss": 0.5556, + "step": 2866 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 2.2490382194519043, + "learning_rate": 4.427396035064132e-06, + "loss": 0.5138, + "step": 2867 + }, + { + "epoch": 1.3560283687943262, + "grad_norm": 2.442225217819214, + "learning_rate": 4.426998668715139e-06, + "loss": 0.4843, + "step": 2868 + }, + { + "epoch": 1.356501182033097, + "grad_norm": 2.74040150642395, + "learning_rate": 4.426601182380388e-06, + "loss": 0.54, + "step": 2869 + }, + { + "epoch": 1.3569739952718676, + "grad_norm": 2.4434332847595215, + "learning_rate": 4.426203576084629e-06, + "loss": 0.5199, + "step": 2870 + }, + { + "epoch": 1.3574468085106384, + "grad_norm": 2.6380388736724854, + "learning_rate": 4.42580584985262e-06, + "loss": 0.5049, + "step": 2871 + }, + { + "epoch": 1.357919621749409, + "grad_norm": 2.7324254512786865, + "learning_rate": 4.425408003709125e-06, + "loss": 0.5036, + "step": 2872 + }, + { + "epoch": 1.3583924349881797, + "grad_norm": 2.661012649536133, + "learning_rate": 4.425010037678916e-06, + "loss": 0.4965, + "step": 2873 + }, + { + "epoch": 1.3588652482269503, + "grad_norm": 2.5380208492279053, + "learning_rate": 4.424611951786773e-06, + "loss": 0.4293, + "step": 2874 + }, + { + "epoch": 
1.3593380614657211, + "grad_norm": 2.6060714721679688, + "learning_rate": 4.424213746057483e-06, + "loss": 0.5335, + "step": 2875 + }, + { + "epoch": 1.3598108747044917, + "grad_norm": 2.98282527923584, + "learning_rate": 4.423815420515841e-06, + "loss": 0.5626, + "step": 2876 + }, + { + "epoch": 1.3602836879432625, + "grad_norm": 2.779371500015259, + "learning_rate": 4.423416975186647e-06, + "loss": 0.5353, + "step": 2877 + }, + { + "epoch": 1.360756501182033, + "grad_norm": 2.8033530712127686, + "learning_rate": 4.423018410094713e-06, + "loss": 0.538, + "step": 2878 + }, + { + "epoch": 1.3612293144208039, + "grad_norm": 3.225177764892578, + "learning_rate": 4.422619725264855e-06, + "loss": 0.5441, + "step": 2879 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 2.959135055541992, + "learning_rate": 4.422220920721896e-06, + "loss": 0.5293, + "step": 2880 + }, + { + "epoch": 1.3621749408983452, + "grad_norm": 2.5558884143829346, + "learning_rate": 4.4218219964906704e-06, + "loss": 0.442, + "step": 2881 + }, + { + "epoch": 1.3626477541371158, + "grad_norm": 2.694899797439575, + "learning_rate": 4.421422952596015e-06, + "loss": 0.5318, + "step": 2882 + }, + { + "epoch": 1.3631205673758866, + "grad_norm": 2.7909531593322754, + "learning_rate": 4.421023789062777e-06, + "loss": 0.6648, + "step": 2883 + }, + { + "epoch": 1.3635933806146572, + "grad_norm": 2.421995162963867, + "learning_rate": 4.420624505915813e-06, + "loss": 0.4644, + "step": 2884 + }, + { + "epoch": 1.364066193853428, + "grad_norm": 2.5876688957214355, + "learning_rate": 4.420225103179981e-06, + "loss": 0.5743, + "step": 2885 + }, + { + "epoch": 1.3645390070921986, + "grad_norm": 2.89341139793396, + "learning_rate": 4.419825580880152e-06, + "loss": 0.5454, + "step": 2886 + }, + { + "epoch": 1.3650118203309693, + "grad_norm": 2.534708261489868, + "learning_rate": 4.419425939041203e-06, + "loss": 0.5572, + "step": 2887 + }, + { + "epoch": 1.36548463356974, + "grad_norm": 2.6052141189575195, + "learning_rate": 4.419026177688017e-06, + "loss": 0.4763, + "step": 2888 + }, + { + "epoch": 1.3659574468085105, + "grad_norm": 2.723720073699951, + "learning_rate": 4.4186262968454854e-06, + "loss": 0.5659, + "step": 2889 + }, + { + "epoch": 1.3664302600472813, + "grad_norm": 2.8909599781036377, + "learning_rate": 4.418226296538507e-06, + "loss": 0.4996, + "step": 2890 + }, + { + "epoch": 1.366903073286052, + "grad_norm": 2.551375389099121, + "learning_rate": 4.417826176791988e-06, + "loss": 0.5259, + "step": 2891 + }, + { + "epoch": 1.3673758865248227, + "grad_norm": 3.360267162322998, + "learning_rate": 4.417425937630843e-06, + "loss": 0.5381, + "step": 2892 + }, + { + "epoch": 1.3678486997635932, + "grad_norm": 2.7611942291259766, + "learning_rate": 4.417025579079992e-06, + "loss": 0.6022, + "step": 2893 + }, + { + "epoch": 1.368321513002364, + "grad_norm": 2.5931224822998047, + "learning_rate": 4.416625101164365e-06, + "loss": 0.5102, + "step": 2894 + }, + { + "epoch": 1.3687943262411348, + "grad_norm": 2.5888102054595947, + "learning_rate": 4.416224503908897e-06, + "loss": 0.4955, + "step": 2895 + }, + { + "epoch": 1.3692671394799054, + "grad_norm": 2.6262896060943604, + "learning_rate": 4.41582378733853e-06, + "loss": 0.5101, + "step": 2896 + }, + { + "epoch": 1.369739952718676, + "grad_norm": 3.339170217514038, + "learning_rate": 4.415422951478218e-06, + "loss": 0.4939, + "step": 2897 + }, + { + "epoch": 1.3702127659574468, + "grad_norm": 2.940866708755493, + "learning_rate": 4.415021996352917e-06, + "loss": 0.5157, + "step": 
2898 + }, + { + "epoch": 1.3706855791962176, + "grad_norm": 2.7423818111419678, + "learning_rate": 4.414620921987594e-06, + "loss": 0.5308, + "step": 2899 + }, + { + "epoch": 1.3711583924349882, + "grad_norm": 2.7177040576934814, + "learning_rate": 4.414219728407221e-06, + "loss": 0.5429, + "step": 2900 + }, + { + "epoch": 1.3716312056737587, + "grad_norm": 2.560774087905884, + "learning_rate": 4.4138184156367794e-06, + "loss": 0.5266, + "step": 2901 + }, + { + "epoch": 1.3721040189125295, + "grad_norm": 2.5649116039276123, + "learning_rate": 4.413416983701256e-06, + "loss": 0.4718, + "step": 2902 + }, + { + "epoch": 1.3725768321513003, + "grad_norm": 2.8547167778015137, + "learning_rate": 4.413015432625648e-06, + "loss": 0.5129, + "step": 2903 + }, + { + "epoch": 1.373049645390071, + "grad_norm": 2.5413618087768555, + "learning_rate": 4.412613762434958e-06, + "loss": 0.5738, + "step": 2904 + }, + { + "epoch": 1.3735224586288415, + "grad_norm": 3.3252241611480713, + "learning_rate": 4.412211973154195e-06, + "loss": 0.5639, + "step": 2905 + }, + { + "epoch": 1.3739952718676123, + "grad_norm": 2.869102954864502, + "learning_rate": 4.411810064808376e-06, + "loss": 0.5384, + "step": 2906 + }, + { + "epoch": 1.374468085106383, + "grad_norm": 2.703199863433838, + "learning_rate": 4.411408037422529e-06, + "loss": 0.5742, + "step": 2907 + }, + { + "epoch": 1.3749408983451537, + "grad_norm": 2.685450792312622, + "learning_rate": 4.411005891021684e-06, + "loss": 0.5121, + "step": 2908 + }, + { + "epoch": 1.3754137115839242, + "grad_norm": 2.9572203159332275, + "learning_rate": 4.410603625630882e-06, + "loss": 0.5444, + "step": 2909 + }, + { + "epoch": 1.375886524822695, + "grad_norm": 2.707002878189087, + "learning_rate": 4.410201241275169e-06, + "loss": 0.5125, + "step": 2910 + }, + { + "epoch": 1.3763593380614658, + "grad_norm": 3.0158939361572266, + "learning_rate": 4.409798737979602e-06, + "loss": 0.5299, + "step": 2911 + }, + { + "epoch": 1.3768321513002364, + "grad_norm": 2.7932698726654053, + "learning_rate": 4.4093961157692415e-06, + "loss": 0.5437, + "step": 2912 + }, + { + "epoch": 1.377304964539007, + "grad_norm": 2.459510326385498, + "learning_rate": 4.408993374669156e-06, + "loss": 0.5548, + "step": 2913 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 2.7500696182250977, + "learning_rate": 4.408590514704425e-06, + "loss": 0.5186, + "step": 2914 + }, + { + "epoch": 1.3782505910165486, + "grad_norm": 2.7824268341064453, + "learning_rate": 4.4081875359001315e-06, + "loss": 0.4762, + "step": 2915 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 2.4202158451080322, + "learning_rate": 4.4077844382813675e-06, + "loss": 0.5005, + "step": 2916 + }, + { + "epoch": 1.3791962174940897, + "grad_norm": 2.5566670894622803, + "learning_rate": 4.4073812218732316e-06, + "loss": 0.5377, + "step": 2917 + }, + { + "epoch": 1.3796690307328605, + "grad_norm": 3.400874376296997, + "learning_rate": 4.406977886700831e-06, + "loss": 0.6637, + "step": 2918 + }, + { + "epoch": 1.3801418439716313, + "grad_norm": 2.8187878131866455, + "learning_rate": 4.406574432789278e-06, + "loss": 0.5033, + "step": 2919 + }, + { + "epoch": 1.3806146572104019, + "grad_norm": 2.5578041076660156, + "learning_rate": 4.406170860163697e-06, + "loss": 0.5293, + "step": 2920 + }, + { + "epoch": 1.3810874704491725, + "grad_norm": 2.6709718704223633, + "learning_rate": 4.405767168849213e-06, + "loss": 0.5144, + "step": 2921 + }, + { + "epoch": 1.3815602836879433, + "grad_norm": 3.049365997314453, + "learning_rate": 
4.405363358870965e-06, + "loss": 0.4894, + "step": 2922 + }, + { + "epoch": 1.382033096926714, + "grad_norm": 2.5569891929626465, + "learning_rate": 4.404959430254095e-06, + "loss": 0.4929, + "step": 2923 + }, + { + "epoch": 1.3825059101654846, + "grad_norm": 2.8288230895996094, + "learning_rate": 4.404555383023754e-06, + "loss": 0.5438, + "step": 2924 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 2.8363358974456787, + "learning_rate": 4.404151217205102e-06, + "loss": 0.545, + "step": 2925 + }, + { + "epoch": 1.383451536643026, + "grad_norm": 2.720972776412964, + "learning_rate": 4.403746932823302e-06, + "loss": 0.5732, + "step": 2926 + }, + { + "epoch": 1.3839243498817968, + "grad_norm": 2.728043794631958, + "learning_rate": 4.403342529903528e-06, + "loss": 0.4944, + "step": 2927 + }, + { + "epoch": 1.3843971631205674, + "grad_norm": 2.4366135597229004, + "learning_rate": 4.402938008470961e-06, + "loss": 0.4441, + "step": 2928 + }, + { + "epoch": 1.384869976359338, + "grad_norm": 2.858454704284668, + "learning_rate": 4.402533368550788e-06, + "loss": 0.5359, + "step": 2929 + }, + { + "epoch": 1.3853427895981087, + "grad_norm": 2.805795907974243, + "learning_rate": 4.402128610168205e-06, + "loss": 0.4954, + "step": 2930 + }, + { + "epoch": 1.3858156028368795, + "grad_norm": 3.3514177799224854, + "learning_rate": 4.401723733348413e-06, + "loss": 0.579, + "step": 2931 + }, + { + "epoch": 1.3862884160756501, + "grad_norm": 2.6255125999450684, + "learning_rate": 4.401318738116624e-06, + "loss": 0.5002, + "step": 2932 + }, + { + "epoch": 1.3867612293144207, + "grad_norm": 2.3480796813964844, + "learning_rate": 4.400913624498054e-06, + "loss": 0.4688, + "step": 2933 + }, + { + "epoch": 1.3872340425531915, + "grad_norm": 2.710165023803711, + "learning_rate": 4.400508392517927e-06, + "loss": 0.5099, + "step": 2934 + }, + { + "epoch": 1.3877068557919623, + "grad_norm": 2.5820295810699463, + "learning_rate": 4.400103042201477e-06, + "loss": 0.512, + "step": 2935 + }, + { + "epoch": 1.3881796690307329, + "grad_norm": 2.750596523284912, + "learning_rate": 4.399697573573942e-06, + "loss": 0.463, + "step": 2936 + }, + { + "epoch": 1.3886524822695034, + "grad_norm": 3.497537612915039, + "learning_rate": 4.399291986660569e-06, + "loss": 0.5676, + "step": 2937 + }, + { + "epoch": 1.3891252955082742, + "grad_norm": 2.4046003818511963, + "learning_rate": 4.398886281486612e-06, + "loss": 0.5408, + "step": 2938 + }, + { + "epoch": 1.389598108747045, + "grad_norm": 2.941606283187866, + "learning_rate": 4.398480458077332e-06, + "loss": 0.5734, + "step": 2939 + }, + { + "epoch": 1.3900709219858156, + "grad_norm": 3.030214309692383, + "learning_rate": 4.398074516458e-06, + "loss": 0.5353, + "step": 2940 + }, + { + "epoch": 1.3905437352245862, + "grad_norm": 2.9991626739501953, + "learning_rate": 4.397668456653889e-06, + "loss": 0.5989, + "step": 2941 + }, + { + "epoch": 1.391016548463357, + "grad_norm": 4.163141250610352, + "learning_rate": 4.397262278690285e-06, + "loss": 0.5436, + "step": 2942 + }, + { + "epoch": 1.3914893617021278, + "grad_norm": 2.6576037406921387, + "learning_rate": 4.396855982592478e-06, + "loss": 0.5206, + "step": 2943 + }, + { + "epoch": 1.3919621749408984, + "grad_norm": 2.7729203701019287, + "learning_rate": 4.396449568385768e-06, + "loss": 0.5403, + "step": 2944 + }, + { + "epoch": 1.392434988179669, + "grad_norm": 2.4560446739196777, + "learning_rate": 4.396043036095457e-06, + "loss": 0.4924, + "step": 2945 + }, + { + "epoch": 1.3929078014184397, + "grad_norm": 
2.6370556354522705, + "learning_rate": 4.39563638574686e-06, + "loss": 0.5543, + "step": 2946 + }, + { + "epoch": 1.3933806146572105, + "grad_norm": 2.593914270401001, + "learning_rate": 4.395229617365298e-06, + "loss": 0.5133, + "step": 2947 + }, + { + "epoch": 1.393853427895981, + "grad_norm": 2.3583998680114746, + "learning_rate": 4.394822730976099e-06, + "loss": 0.4436, + "step": 2948 + }, + { + "epoch": 1.3943262411347517, + "grad_norm": 3.2768537998199463, + "learning_rate": 4.394415726604596e-06, + "loss": 0.5489, + "step": 2949 + }, + { + "epoch": 1.3947990543735225, + "grad_norm": 2.88662052154541, + "learning_rate": 4.394008604276133e-06, + "loss": 0.5194, + "step": 2950 + }, + { + "epoch": 1.3952718676122933, + "grad_norm": 2.46610426902771, + "learning_rate": 4.393601364016059e-06, + "loss": 0.5255, + "step": 2951 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 3.122509241104126, + "learning_rate": 4.393194005849731e-06, + "loss": 0.6046, + "step": 2952 + }, + { + "epoch": 1.3962174940898344, + "grad_norm": 2.724926471710205, + "learning_rate": 4.392786529802513e-06, + "loss": 0.4958, + "step": 2953 + }, + { + "epoch": 1.3966903073286052, + "grad_norm": 2.491485595703125, + "learning_rate": 4.3923789358997785e-06, + "loss": 0.5209, + "step": 2954 + }, + { + "epoch": 1.397163120567376, + "grad_norm": 2.61110520362854, + "learning_rate": 4.3919712241669056e-06, + "loss": 0.5202, + "step": 2955 + }, + { + "epoch": 1.3976359338061466, + "grad_norm": 2.3814501762390137, + "learning_rate": 4.39156339462928e-06, + "loss": 0.4966, + "step": 2956 + }, + { + "epoch": 1.3981087470449172, + "grad_norm": 2.762498617172241, + "learning_rate": 4.391155447312296e-06, + "loss": 0.6025, + "step": 2957 + }, + { + "epoch": 1.398581560283688, + "grad_norm": 2.964975595474243, + "learning_rate": 4.390747382241355e-06, + "loss": 0.4845, + "step": 2958 + }, + { + "epoch": 1.3990543735224588, + "grad_norm": 3.0117249488830566, + "learning_rate": 4.3903391994418655e-06, + "loss": 0.5326, + "step": 2959 + }, + { + "epoch": 1.3995271867612293, + "grad_norm": 2.578626871109009, + "learning_rate": 4.389930898939243e-06, + "loss": 0.5271, + "step": 2960 + }, + { + "epoch": 1.4, + "grad_norm": 2.747441053390503, + "learning_rate": 4.38952248075891e-06, + "loss": 0.5553, + "step": 2961 + }, + { + "epoch": 1.4004728132387707, + "grad_norm": 2.8273086547851562, + "learning_rate": 4.389113944926297e-06, + "loss": 0.5475, + "step": 2962 + }, + { + "epoch": 1.4009456264775415, + "grad_norm": 2.55238676071167, + "learning_rate": 4.388705291466843e-06, + "loss": 0.4864, + "step": 2963 + }, + { + "epoch": 1.401418439716312, + "grad_norm": 2.597214460372925, + "learning_rate": 4.388296520405992e-06, + "loss": 0.4845, + "step": 2964 + }, + { + "epoch": 1.4018912529550827, + "grad_norm": 2.608962297439575, + "learning_rate": 4.387887631769196e-06, + "loss": 0.5544, + "step": 2965 + }, + { + "epoch": 1.4023640661938535, + "grad_norm": 2.2754876613616943, + "learning_rate": 4.3874786255819165e-06, + "loss": 0.5045, + "step": 2966 + }, + { + "epoch": 1.4028368794326243, + "grad_norm": 2.9900264739990234, + "learning_rate": 4.387069501869618e-06, + "loss": 0.562, + "step": 2967 + }, + { + "epoch": 1.4033096926713948, + "grad_norm": 2.8069417476654053, + "learning_rate": 4.386660260657778e-06, + "loss": 0.5284, + "step": 2968 + }, + { + "epoch": 1.4037825059101654, + "grad_norm": 2.68894624710083, + "learning_rate": 4.386250901971875e-06, + "loss": 0.5879, + "step": 2969 + }, + { + "epoch": 1.4042553191489362, + 
"grad_norm": 2.614485025405884, + "learning_rate": 4.385841425837399e-06, + "loss": 0.4771, + "step": 2970 + }, + { + "epoch": 1.4047281323877068, + "grad_norm": 2.487950325012207, + "learning_rate": 4.385431832279848e-06, + "loss": 0.5552, + "step": 2971 + }, + { + "epoch": 1.4052009456264776, + "grad_norm": 2.5098392963409424, + "learning_rate": 4.385022121324723e-06, + "loss": 0.5267, + "step": 2972 + }, + { + "epoch": 1.4056737588652481, + "grad_norm": 2.825838565826416, + "learning_rate": 4.384612292997537e-06, + "loss": 0.5336, + "step": 2973 + }, + { + "epoch": 1.406146572104019, + "grad_norm": 2.898188829421997, + "learning_rate": 4.384202347323806e-06, + "loss": 0.5685, + "step": 2974 + }, + { + "epoch": 1.4066193853427895, + "grad_norm": 2.8722569942474365, + "learning_rate": 4.383792284329057e-06, + "loss": 0.5977, + "step": 2975 + }, + { + "epoch": 1.4070921985815603, + "grad_norm": 2.832951307296753, + "learning_rate": 4.3833821040388235e-06, + "loss": 0.5766, + "step": 2976 + }, + { + "epoch": 1.407565011820331, + "grad_norm": 2.7353670597076416, + "learning_rate": 4.3829718064786446e-06, + "loss": 0.5461, + "step": 2977 + }, + { + "epoch": 1.4080378250591017, + "grad_norm": 2.6050429344177246, + "learning_rate": 4.3825613916740675e-06, + "loss": 0.5501, + "step": 2978 + }, + { + "epoch": 1.4085106382978723, + "grad_norm": 2.79719877243042, + "learning_rate": 4.382150859650647e-06, + "loss": 0.502, + "step": 2979 + }, + { + "epoch": 1.408983451536643, + "grad_norm": 2.5538079738616943, + "learning_rate": 4.381740210433946e-06, + "loss": 0.4762, + "step": 2980 + }, + { + "epoch": 1.4094562647754136, + "grad_norm": 2.7256062030792236, + "learning_rate": 4.381329444049533e-06, + "loss": 0.4692, + "step": 2981 + }, + { + "epoch": 1.4099290780141844, + "grad_norm": 2.7778146266937256, + "learning_rate": 4.3809185605229855e-06, + "loss": 0.5366, + "step": 2982 + }, + { + "epoch": 1.410401891252955, + "grad_norm": 2.6289451122283936, + "learning_rate": 4.380507559879887e-06, + "loss": 0.5412, + "step": 2983 + }, + { + "epoch": 1.4108747044917258, + "grad_norm": 2.697204828262329, + "learning_rate": 4.380096442145827e-06, + "loss": 0.5065, + "step": 2984 + }, + { + "epoch": 1.4113475177304964, + "grad_norm": 2.4709219932556152, + "learning_rate": 4.379685207346407e-06, + "loss": 0.568, + "step": 2985 + }, + { + "epoch": 1.4118203309692672, + "grad_norm": 2.9740655422210693, + "learning_rate": 4.379273855507231e-06, + "loss": 0.5512, + "step": 2986 + }, + { + "epoch": 1.4122931442080378, + "grad_norm": 3.0090627670288086, + "learning_rate": 4.378862386653911e-06, + "loss": 0.5459, + "step": 2987 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 2.8835368156433105, + "learning_rate": 4.378450800812071e-06, + "loss": 0.5357, + "step": 2988 + }, + { + "epoch": 1.4132387706855791, + "grad_norm": 2.558824062347412, + "learning_rate": 4.378039098007335e-06, + "loss": 0.536, + "step": 2989 + }, + { + "epoch": 1.41371158392435, + "grad_norm": 2.5572092533111572, + "learning_rate": 4.377627278265339e-06, + "loss": 0.5183, + "step": 2990 + }, + { + "epoch": 1.4141843971631205, + "grad_norm": 2.7356579303741455, + "learning_rate": 4.377215341611727e-06, + "loss": 0.5087, + "step": 2991 + }, + { + "epoch": 1.4146572104018913, + "grad_norm": 2.7541024684906006, + "learning_rate": 4.376803288072146e-06, + "loss": 0.4509, + "step": 2992 + }, + { + "epoch": 1.4151300236406619, + "grad_norm": 2.7548446655273438, + "learning_rate": 4.376391117672254e-06, + "loss": 0.5532, + "step": 2993 + }, + { + 
"epoch": 1.4156028368794327, + "grad_norm": 2.9107465744018555, + "learning_rate": 4.375978830437715e-06, + "loss": 0.5719, + "step": 2994 + }, + { + "epoch": 1.4160756501182032, + "grad_norm": 2.7077393531799316, + "learning_rate": 4.3755664263942e-06, + "loss": 0.5084, + "step": 2995 + }, + { + "epoch": 1.416548463356974, + "grad_norm": 2.764209270477295, + "learning_rate": 4.375153905567388e-06, + "loss": 0.5976, + "step": 2996 + }, + { + "epoch": 1.4170212765957446, + "grad_norm": 2.7792932987213135, + "learning_rate": 4.374741267982964e-06, + "loss": 0.5358, + "step": 2997 + }, + { + "epoch": 1.4174940898345154, + "grad_norm": 2.459212064743042, + "learning_rate": 4.374328513666622e-06, + "loss": 0.5181, + "step": 2998 + }, + { + "epoch": 1.417966903073286, + "grad_norm": 2.548546552658081, + "learning_rate": 4.373915642644062e-06, + "loss": 0.528, + "step": 2999 + }, + { + "epoch": 1.4184397163120568, + "grad_norm": 2.998138189315796, + "learning_rate": 4.373502654940992e-06, + "loss": 0.5233, + "step": 3000 + }, + { + "epoch": 1.4189125295508274, + "grad_norm": 2.604341983795166, + "learning_rate": 4.373089550583126e-06, + "loss": 0.5274, + "step": 3001 + }, + { + "epoch": 1.4193853427895982, + "grad_norm": 2.6792588233947754, + "learning_rate": 4.372676329596188e-06, + "loss": 0.5061, + "step": 3002 + }, + { + "epoch": 1.4198581560283687, + "grad_norm": 2.5182368755340576, + "learning_rate": 4.372262992005906e-06, + "loss": 0.541, + "step": 3003 + }, + { + "epoch": 1.4203309692671395, + "grad_norm": 2.690718173980713, + "learning_rate": 4.371849537838018e-06, + "loss": 0.5308, + "step": 3004 + }, + { + "epoch": 1.42080378250591, + "grad_norm": 2.6797590255737305, + "learning_rate": 4.371435967118266e-06, + "loss": 0.5728, + "step": 3005 + }, + { + "epoch": 1.421276595744681, + "grad_norm": 2.847900152206421, + "learning_rate": 4.371022279872403e-06, + "loss": 0.5053, + "step": 3006 + }, + { + "epoch": 1.4217494089834515, + "grad_norm": 2.497810125350952, + "learning_rate": 4.370608476126186e-06, + "loss": 0.5057, + "step": 3007 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 2.5259225368499756, + "learning_rate": 4.370194555905382e-06, + "loss": 0.5508, + "step": 3008 + }, + { + "epoch": 1.4226950354609929, + "grad_norm": 2.774118423461914, + "learning_rate": 4.369780519235763e-06, + "loss": 0.5419, + "step": 3009 + }, + { + "epoch": 1.4231678486997636, + "grad_norm": 2.2764663696289062, + "learning_rate": 4.369366366143111e-06, + "loss": 0.5032, + "step": 3010 + }, + { + "epoch": 1.4236406619385342, + "grad_norm": 2.736347198486328, + "learning_rate": 4.368952096653211e-06, + "loss": 0.5184, + "step": 3011 + }, + { + "epoch": 1.424113475177305, + "grad_norm": 2.476762056350708, + "learning_rate": 4.36853771079186e-06, + "loss": 0.5331, + "step": 3012 + }, + { + "epoch": 1.4245862884160756, + "grad_norm": 2.8006162643432617, + "learning_rate": 4.3681232085848585e-06, + "loss": 0.5331, + "step": 3013 + }, + { + "epoch": 1.4250591016548464, + "grad_norm": 2.509143590927124, + "learning_rate": 4.367708590058016e-06, + "loss": 0.5127, + "step": 3014 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 3.030137538909912, + "learning_rate": 4.3672938552371505e-06, + "loss": 0.5555, + "step": 3015 + }, + { + "epoch": 1.4260047281323878, + "grad_norm": 3.0536904335021973, + "learning_rate": 4.3668790041480835e-06, + "loss": 0.5241, + "step": 3016 + }, + { + "epoch": 1.4264775413711583, + "grad_norm": 2.6400439739227295, + "learning_rate": 4.366464036816647e-06, + "loss": 0.4946, + 
"step": 3017 + }, + { + "epoch": 1.4269503546099291, + "grad_norm": 2.7302589416503906, + "learning_rate": 4.366048953268679e-06, + "loss": 0.5105, + "step": 3018 + }, + { + "epoch": 1.4274231678486997, + "grad_norm": 2.504549264907837, + "learning_rate": 4.365633753530026e-06, + "loss": 0.4844, + "step": 3019 + }, + { + "epoch": 1.4278959810874705, + "grad_norm": 2.3872320652008057, + "learning_rate": 4.365218437626539e-06, + "loss": 0.4402, + "step": 3020 + }, + { + "epoch": 1.428368794326241, + "grad_norm": 2.531649351119995, + "learning_rate": 4.364803005584078e-06, + "loss": 0.4913, + "step": 3021 + }, + { + "epoch": 1.4288416075650119, + "grad_norm": 2.4683783054351807, + "learning_rate": 4.364387457428512e-06, + "loss": 0.515, + "step": 3022 + }, + { + "epoch": 1.4293144208037825, + "grad_norm": 2.632336378097534, + "learning_rate": 4.363971793185713e-06, + "loss": 0.5398, + "step": 3023 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 2.7456719875335693, + "learning_rate": 4.363556012881565e-06, + "loss": 0.5254, + "step": 3024 + }, + { + "epoch": 1.4302600472813238, + "grad_norm": 2.607177972793579, + "learning_rate": 4.363140116541955e-06, + "loss": 0.5266, + "step": 3025 + }, + { + "epoch": 1.4307328605200946, + "grad_norm": 2.640127420425415, + "learning_rate": 4.3627241041927796e-06, + "loss": 0.5157, + "step": 3026 + }, + { + "epoch": 1.4312056737588652, + "grad_norm": 2.4210736751556396, + "learning_rate": 4.362307975859941e-06, + "loss": 0.4599, + "step": 3027 + }, + { + "epoch": 1.431678486997636, + "grad_norm": 2.6007790565490723, + "learning_rate": 4.361891731569352e-06, + "loss": 0.5298, + "step": 3028 + }, + { + "epoch": 1.4321513002364066, + "grad_norm": 2.5352046489715576, + "learning_rate": 4.361475371346928e-06, + "loss": 0.5128, + "step": 3029 + }, + { + "epoch": 1.4326241134751774, + "grad_norm": 2.4204049110412598, + "learning_rate": 4.361058895218596e-06, + "loss": 0.4669, + "step": 3030 + }, + { + "epoch": 1.433096926713948, + "grad_norm": 2.525240182876587, + "learning_rate": 4.360642303210286e-06, + "loss": 0.4925, + "step": 3031 + }, + { + "epoch": 1.4335697399527187, + "grad_norm": 2.839646339416504, + "learning_rate": 4.360225595347939e-06, + "loss": 0.5868, + "step": 3032 + }, + { + "epoch": 1.4340425531914893, + "grad_norm": 2.5043296813964844, + "learning_rate": 4.359808771657501e-06, + "loss": 0.4951, + "step": 3033 + }, + { + "epoch": 1.4345153664302601, + "grad_norm": 2.9082300662994385, + "learning_rate": 4.359391832164927e-06, + "loss": 0.5259, + "step": 3034 + }, + { + "epoch": 1.4349881796690307, + "grad_norm": 2.6651999950408936, + "learning_rate": 4.3589747768961745e-06, + "loss": 0.537, + "step": 3035 + }, + { + "epoch": 1.4354609929078015, + "grad_norm": 2.577077865600586, + "learning_rate": 4.358557605877216e-06, + "loss": 0.5186, + "step": 3036 + }, + { + "epoch": 1.435933806146572, + "grad_norm": 2.7445287704467773, + "learning_rate": 4.3581403191340236e-06, + "loss": 0.5573, + "step": 3037 + }, + { + "epoch": 1.4364066193853429, + "grad_norm": 2.502086639404297, + "learning_rate": 4.357722916692582e-06, + "loss": 0.5039, + "step": 3038 + }, + { + "epoch": 1.4368794326241134, + "grad_norm": 2.4476163387298584, + "learning_rate": 4.357305398578879e-06, + "loss": 0.5638, + "step": 3039 + }, + { + "epoch": 1.4373522458628842, + "grad_norm": 2.7705588340759277, + "learning_rate": 4.356887764818915e-06, + "loss": 0.5485, + "step": 3040 + }, + { + "epoch": 1.4378250591016548, + "grad_norm": 2.498225450515747, + "learning_rate": 
4.356470015438691e-06, + "loss": 0.5486, + "step": 3041 + }, + { + "epoch": 1.4382978723404256, + "grad_norm": 2.394320011138916, + "learning_rate": 4.356052150464219e-06, + "loss": 0.512, + "step": 3042 + }, + { + "epoch": 1.4387706855791962, + "grad_norm": 2.8725767135620117, + "learning_rate": 4.3556341699215185e-06, + "loss": 0.5202, + "step": 3043 + }, + { + "epoch": 1.439243498817967, + "grad_norm": 3.1707918643951416, + "learning_rate": 4.355216073836615e-06, + "loss": 0.5229, + "step": 3044 + }, + { + "epoch": 1.4397163120567376, + "grad_norm": 2.532578468322754, + "learning_rate": 4.3547978622355415e-06, + "loss": 0.4569, + "step": 3045 + }, + { + "epoch": 1.4401891252955084, + "grad_norm": 3.0111029148101807, + "learning_rate": 4.354379535144338e-06, + "loss": 0.5801, + "step": 3046 + }, + { + "epoch": 1.440661938534279, + "grad_norm": 2.9554224014282227, + "learning_rate": 4.353961092589052e-06, + "loss": 0.5968, + "step": 3047 + }, + { + "epoch": 1.4411347517730497, + "grad_norm": 2.7562637329101562, + "learning_rate": 4.353542534595738e-06, + "loss": 0.5005, + "step": 3048 + }, + { + "epoch": 1.4416075650118203, + "grad_norm": 3.083254337310791, + "learning_rate": 4.3531238611904595e-06, + "loss": 0.5389, + "step": 3049 + }, + { + "epoch": 1.442080378250591, + "grad_norm": 2.7778005599975586, + "learning_rate": 4.352705072399282e-06, + "loss": 0.5342, + "step": 3050 + }, + { + "epoch": 1.4425531914893617, + "grad_norm": 2.6673996448516846, + "learning_rate": 4.3522861682482845e-06, + "loss": 0.5213, + "step": 3051 + }, + { + "epoch": 1.4430260047281322, + "grad_norm": 2.637605905532837, + "learning_rate": 4.351867148763548e-06, + "loss": 0.4893, + "step": 3052 + }, + { + "epoch": 1.443498817966903, + "grad_norm": 2.834469795227051, + "learning_rate": 4.351448013971166e-06, + "loss": 0.5391, + "step": 3053 + }, + { + "epoch": 1.4439716312056738, + "grad_norm": 2.824153184890747, + "learning_rate": 4.351028763897234e-06, + "loss": 0.6403, + "step": 3054 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 2.558966875076294, + "learning_rate": 4.350609398567857e-06, + "loss": 0.4912, + "step": 3055 + }, + { + "epoch": 1.444917257683215, + "grad_norm": 2.281726360321045, + "learning_rate": 4.3501899180091475e-06, + "loss": 0.4655, + "step": 3056 + }, + { + "epoch": 1.4453900709219858, + "grad_norm": 2.499472141265869, + "learning_rate": 4.349770322247225e-06, + "loss": 0.4878, + "step": 3057 + }, + { + "epoch": 1.4458628841607566, + "grad_norm": 2.578615188598633, + "learning_rate": 4.349350611308215e-06, + "loss": 0.4855, + "step": 3058 + }, + { + "epoch": 1.4463356973995272, + "grad_norm": 2.7111165523529053, + "learning_rate": 4.348930785218252e-06, + "loss": 0.5415, + "step": 3059 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 2.8081610202789307, + "learning_rate": 4.348510844003476e-06, + "loss": 0.4881, + "step": 3060 + }, + { + "epoch": 1.4472813238770685, + "grad_norm": 2.9439868927001953, + "learning_rate": 4.348090787690036e-06, + "loss": 0.5485, + "step": 3061 + }, + { + "epoch": 1.4477541371158393, + "grad_norm": 2.592532157897949, + "learning_rate": 4.347670616304085e-06, + "loss": 0.4912, + "step": 3062 + }, + { + "epoch": 1.44822695035461, + "grad_norm": 2.960592746734619, + "learning_rate": 4.347250329871787e-06, + "loss": 0.5473, + "step": 3063 + }, + { + "epoch": 1.4486997635933805, + "grad_norm": 2.5786688327789307, + "learning_rate": 4.3468299284193116e-06, + "loss": 0.5348, + "step": 3064 + }, + { + "epoch": 1.4491725768321513, + "grad_norm": 
2.6084046363830566, + "learning_rate": 4.346409411972834e-06, + "loss": 0.527, + "step": 3065 + }, + { + "epoch": 1.449645390070922, + "grad_norm": 2.489748239517212, + "learning_rate": 4.3459887805585385e-06, + "loss": 0.4943, + "step": 3066 + }, + { + "epoch": 1.4501182033096927, + "grad_norm": 2.452131986618042, + "learning_rate": 4.345568034202617e-06, + "loss": 0.4886, + "step": 3067 + }, + { + "epoch": 1.4505910165484632, + "grad_norm": 2.4034671783447266, + "learning_rate": 4.345147172931266e-06, + "loss": 0.4689, + "step": 3068 + }, + { + "epoch": 1.451063829787234, + "grad_norm": 2.6045448780059814, + "learning_rate": 4.344726196770691e-06, + "loss": 0.5842, + "step": 3069 + }, + { + "epoch": 1.4515366430260048, + "grad_norm": 2.697593927383423, + "learning_rate": 4.3443051057471045e-06, + "loss": 0.5358, + "step": 3070 + }, + { + "epoch": 1.4520094562647754, + "grad_norm": 2.6080820560455322, + "learning_rate": 4.343883899886727e-06, + "loss": 0.5361, + "step": 3071 + }, + { + "epoch": 1.452482269503546, + "grad_norm": 2.4605307579040527, + "learning_rate": 4.343462579215783e-06, + "loss": 0.4941, + "step": 3072 + }, + { + "epoch": 1.4529550827423168, + "grad_norm": 2.8025355339050293, + "learning_rate": 4.343041143760509e-06, + "loss": 0.5116, + "step": 3073 + }, + { + "epoch": 1.4534278959810876, + "grad_norm": 2.432515859603882, + "learning_rate": 4.3426195935471434e-06, + "loss": 0.4991, + "step": 3074 + }, + { + "epoch": 1.4539007092198581, + "grad_norm": 2.5838661193847656, + "learning_rate": 4.342197928601935e-06, + "loss": 0.4994, + "step": 3075 + }, + { + "epoch": 1.4543735224586287, + "grad_norm": 2.421692371368408, + "learning_rate": 4.341776148951141e-06, + "loss": 0.4945, + "step": 3076 + }, + { + "epoch": 1.4548463356973995, + "grad_norm": 2.5354676246643066, + "learning_rate": 4.341354254621021e-06, + "loss": 0.4859, + "step": 3077 + }, + { + "epoch": 1.4553191489361703, + "grad_norm": 2.7316789627075195, + "learning_rate": 4.340932245637846e-06, + "loss": 0.5136, + "step": 3078 + }, + { + "epoch": 1.455791962174941, + "grad_norm": 3.5903496742248535, + "learning_rate": 4.340510122027891e-06, + "loss": 0.6451, + "step": 3079 + }, + { + "epoch": 1.4562647754137115, + "grad_norm": 2.95190167427063, + "learning_rate": 4.340087883817442e-06, + "loss": 0.6354, + "step": 3080 + }, + { + "epoch": 1.4567375886524823, + "grad_norm": 2.8659214973449707, + "learning_rate": 4.339665531032789e-06, + "loss": 0.5514, + "step": 3081 + }, + { + "epoch": 1.457210401891253, + "grad_norm": 2.5681674480438232, + "learning_rate": 4.339243063700231e-06, + "loss": 0.5135, + "step": 3082 + }, + { + "epoch": 1.4576832151300236, + "grad_norm": 2.7353906631469727, + "learning_rate": 4.338820481846072e-06, + "loss": 0.4608, + "step": 3083 + }, + { + "epoch": 1.4581560283687942, + "grad_norm": 2.6116466522216797, + "learning_rate": 4.3383977854966245e-06, + "loss": 0.4924, + "step": 3084 + }, + { + "epoch": 1.458628841607565, + "grad_norm": 2.6676487922668457, + "learning_rate": 4.337974974678207e-06, + "loss": 0.5747, + "step": 3085 + }, + { + "epoch": 1.4591016548463358, + "grad_norm": 2.909031629562378, + "learning_rate": 4.337552049417147e-06, + "loss": 0.4618, + "step": 3086 + }, + { + "epoch": 1.4595744680851064, + "grad_norm": 2.7614190578460693, + "learning_rate": 4.33712900973978e-06, + "loss": 0.5154, + "step": 3087 + }, + { + "epoch": 1.460047281323877, + "grad_norm": 2.452188014984131, + "learning_rate": 4.336705855672444e-06, + "loss": 0.542, + "step": 3088 + }, + { + "epoch": 
1.4605200945626478, + "grad_norm": 3.0004117488861084, + "learning_rate": 4.336282587241488e-06, + "loss": 0.5857, + "step": 3089 + }, + { + "epoch": 1.4609929078014185, + "grad_norm": 2.870783567428589, + "learning_rate": 4.335859204473268e-06, + "loss": 0.5506, + "step": 3090 + }, + { + "epoch": 1.4614657210401891, + "grad_norm": 3.1078689098358154, + "learning_rate": 4.335435707394145e-06, + "loss": 0.5138, + "step": 3091 + }, + { + "epoch": 1.4619385342789597, + "grad_norm": 2.8516197204589844, + "learning_rate": 4.335012096030488e-06, + "loss": 0.5842, + "step": 3092 + }, + { + "epoch": 1.4624113475177305, + "grad_norm": 2.615922212600708, + "learning_rate": 4.334588370408675e-06, + "loss": 0.4896, + "step": 3093 + }, + { + "epoch": 1.4628841607565013, + "grad_norm": 3.1911802291870117, + "learning_rate": 4.334164530555088e-06, + "loss": 0.4974, + "step": 3094 + }, + { + "epoch": 1.4633569739952719, + "grad_norm": 3.075051784515381, + "learning_rate": 4.3337405764961186e-06, + "loss": 0.567, + "step": 3095 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 2.550625801086426, + "learning_rate": 4.333316508258163e-06, + "loss": 0.4887, + "step": 3096 + }, + { + "epoch": 1.4643026004728132, + "grad_norm": 2.3986475467681885, + "learning_rate": 4.332892325867629e-06, + "loss": 0.5047, + "step": 3097 + }, + { + "epoch": 1.464775413711584, + "grad_norm": 2.5045125484466553, + "learning_rate": 4.332468029350926e-06, + "loss": 0.4721, + "step": 3098 + }, + { + "epoch": 1.4652482269503546, + "grad_norm": 2.347365617752075, + "learning_rate": 4.332043618734474e-06, + "loss": 0.4913, + "step": 3099 + }, + { + "epoch": 1.4657210401891252, + "grad_norm": 2.459928512573242, + "learning_rate": 4.331619094044699e-06, + "loss": 0.523, + "step": 3100 + }, + { + "epoch": 1.466193853427896, + "grad_norm": 2.5771310329437256, + "learning_rate": 4.331194455308035e-06, + "loss": 0.593, + "step": 3101 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 3.1351823806762695, + "learning_rate": 4.330769702550921e-06, + "loss": 0.5852, + "step": 3102 + }, + { + "epoch": 1.4671394799054374, + "grad_norm": 2.589817523956299, + "learning_rate": 4.330344835799806e-06, + "loss": 0.508, + "step": 3103 + }, + { + "epoch": 1.467612293144208, + "grad_norm": 3.1140341758728027, + "learning_rate": 4.329919855081144e-06, + "loss": 0.469, + "step": 3104 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 2.8186635971069336, + "learning_rate": 4.329494760421396e-06, + "loss": 0.5088, + "step": 3105 + }, + { + "epoch": 1.4685579196217495, + "grad_norm": 2.676077365875244, + "learning_rate": 4.329069551847031e-06, + "loss": 0.52, + "step": 3106 + }, + { + "epoch": 1.46903073286052, + "grad_norm": 2.5543313026428223, + "learning_rate": 4.328644229384526e-06, + "loss": 0.5066, + "step": 3107 + }, + { + "epoch": 1.4695035460992907, + "grad_norm": 2.8176217079162598, + "learning_rate": 4.328218793060362e-06, + "loss": 0.6404, + "step": 3108 + }, + { + "epoch": 1.4699763593380615, + "grad_norm": 2.485217332839966, + "learning_rate": 4.3277932429010314e-06, + "loss": 0.4578, + "step": 3109 + }, + { + "epoch": 1.4704491725768323, + "grad_norm": 2.6741621494293213, + "learning_rate": 4.327367578933031e-06, + "loss": 0.5068, + "step": 3110 + }, + { + "epoch": 1.4709219858156029, + "grad_norm": 2.377242088317871, + "learning_rate": 4.326941801182863e-06, + "loss": 0.5249, + "step": 3111 + }, + { + "epoch": 1.4713947990543734, + "grad_norm": 2.790046215057373, + "learning_rate": 4.32651590967704e-06, + "loss": 0.5532, + "step": 
3112 + }, + { + "epoch": 1.4718676122931442, + "grad_norm": 2.78019642829895, + "learning_rate": 4.326089904442081e-06, + "loss": 0.5362, + "step": 3113 + }, + { + "epoch": 1.472340425531915, + "grad_norm": 2.5661380290985107, + "learning_rate": 4.32566378550451e-06, + "loss": 0.5041, + "step": 3114 + }, + { + "epoch": 1.4728132387706856, + "grad_norm": 2.522153615951538, + "learning_rate": 4.3252375528908605e-06, + "loss": 0.5074, + "step": 3115 + }, + { + "epoch": 1.4732860520094562, + "grad_norm": 2.874688148498535, + "learning_rate": 4.3248112066276725e-06, + "loss": 0.59, + "step": 3116 + }, + { + "epoch": 1.473758865248227, + "grad_norm": 3.067866802215576, + "learning_rate": 4.324384746741492e-06, + "loss": 0.5924, + "step": 3117 + }, + { + "epoch": 1.4742316784869978, + "grad_norm": 3.359463930130005, + "learning_rate": 4.323958173258873e-06, + "loss": 0.6346, + "step": 3118 + }, + { + "epoch": 1.4747044917257683, + "grad_norm": 2.193024158477783, + "learning_rate": 4.323531486206376e-06, + "loss": 0.4594, + "step": 3119 + }, + { + "epoch": 1.475177304964539, + "grad_norm": 2.886889934539795, + "learning_rate": 4.323104685610569e-06, + "loss": 0.523, + "step": 3120 + }, + { + "epoch": 1.4756501182033097, + "grad_norm": 2.7558681964874268, + "learning_rate": 4.322677771498028e-06, + "loss": 0.5387, + "step": 3121 + }, + { + "epoch": 1.4761229314420805, + "grad_norm": 2.639277935028076, + "learning_rate": 4.322250743895335e-06, + "loss": 0.5599, + "step": 3122 + }, + { + "epoch": 1.476595744680851, + "grad_norm": 2.786198616027832, + "learning_rate": 4.321823602829078e-06, + "loss": 0.5405, + "step": 3123 + }, + { + "epoch": 1.4770685579196217, + "grad_norm": 2.582315683364868, + "learning_rate": 4.321396348325853e-06, + "loss": 0.4452, + "step": 3124 + }, + { + "epoch": 1.4775413711583925, + "grad_norm": 2.8574297428131104, + "learning_rate": 4.320968980412265e-06, + "loss": 0.4846, + "step": 3125 + }, + { + "epoch": 1.4780141843971633, + "grad_norm": 2.705281972885132, + "learning_rate": 4.320541499114922e-06, + "loss": 0.5548, + "step": 3126 + }, + { + "epoch": 1.4784869976359338, + "grad_norm": 2.3152754306793213, + "learning_rate": 4.320113904460444e-06, + "loss": 0.5216, + "step": 3127 + }, + { + "epoch": 1.4789598108747044, + "grad_norm": 3.230764150619507, + "learning_rate": 4.319686196475453e-06, + "loss": 0.6192, + "step": 3128 + }, + { + "epoch": 1.4794326241134752, + "grad_norm": 2.463380813598633, + "learning_rate": 4.319258375186583e-06, + "loss": 0.4872, + "step": 3129 + }, + { + "epoch": 1.479905437352246, + "grad_norm": 2.8477656841278076, + "learning_rate": 4.31883044062047e-06, + "loss": 0.5371, + "step": 3130 + }, + { + "epoch": 1.4803782505910166, + "grad_norm": 2.393911123275757, + "learning_rate": 4.318402392803762e-06, + "loss": 0.5334, + "step": 3131 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 2.6113736629486084, + "learning_rate": 4.317974231763109e-06, + "loss": 0.5572, + "step": 3132 + }, + { + "epoch": 1.481323877068558, + "grad_norm": 2.3941731452941895, + "learning_rate": 4.317545957525173e-06, + "loss": 0.4849, + "step": 3133 + }, + { + "epoch": 1.4817966903073285, + "grad_norm": 2.9536755084991455, + "learning_rate": 4.317117570116619e-06, + "loss": 0.6058, + "step": 3134 + }, + { + "epoch": 1.4822695035460993, + "grad_norm": 2.595754623413086, + "learning_rate": 4.316689069564123e-06, + "loss": 0.5193, + "step": 3135 + }, + { + "epoch": 1.48274231678487, + "grad_norm": 2.569833993911743, + "learning_rate": 4.316260455894364e-06, + "loss": 
0.543, + "step": 3136 + }, + { + "epoch": 1.4832151300236407, + "grad_norm": 2.5137455463409424, + "learning_rate": 4.315831729134031e-06, + "loss": 0.5415, + "step": 3137 + }, + { + "epoch": 1.4836879432624113, + "grad_norm": 2.5582292079925537, + "learning_rate": 4.3154028893098176e-06, + "loss": 0.5338, + "step": 3138 + }, + { + "epoch": 1.484160756501182, + "grad_norm": 2.666426181793213, + "learning_rate": 4.3149739364484265e-06, + "loss": 0.5435, + "step": 3139 + }, + { + "epoch": 1.4846335697399526, + "grad_norm": 2.790851354598999, + "learning_rate": 4.314544870576568e-06, + "loss": 0.5746, + "step": 3140 + }, + { + "epoch": 1.4851063829787234, + "grad_norm": 2.620326042175293, + "learning_rate": 4.314115691720956e-06, + "loss": 0.5076, + "step": 3141 + }, + { + "epoch": 1.485579196217494, + "grad_norm": 3.075674533843994, + "learning_rate": 4.313686399908314e-06, + "loss": 0.5486, + "step": 3142 + }, + { + "epoch": 1.4860520094562648, + "grad_norm": 3.1347315311431885, + "learning_rate": 4.3132569951653745e-06, + "loss": 0.531, + "step": 3143 + }, + { + "epoch": 1.4865248226950354, + "grad_norm": 2.5783653259277344, + "learning_rate": 4.312827477518871e-06, + "loss": 0.5818, + "step": 3144 + }, + { + "epoch": 1.4869976359338062, + "grad_norm": 3.0247137546539307, + "learning_rate": 4.3123978469955505e-06, + "loss": 0.5347, + "step": 3145 + }, + { + "epoch": 1.4874704491725768, + "grad_norm": 2.4789345264434814, + "learning_rate": 4.311968103622163e-06, + "loss": 0.5, + "step": 3146 + }, + { + "epoch": 1.4879432624113476, + "grad_norm": 2.663341522216797, + "learning_rate": 4.311538247425466e-06, + "loss": 0.4825, + "step": 3147 + }, + { + "epoch": 1.4884160756501181, + "grad_norm": 2.633711099624634, + "learning_rate": 4.311108278432226e-06, + "loss": 0.5244, + "step": 3148 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 2.51312518119812, + "learning_rate": 4.310678196669216e-06, + "loss": 0.513, + "step": 3149 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 2.5263755321502686, + "learning_rate": 4.310248002163214e-06, + "loss": 0.5236, + "step": 3150 + }, + { + "epoch": 1.4898345153664303, + "grad_norm": 2.559216260910034, + "learning_rate": 4.309817694941007e-06, + "loss": 0.5107, + "step": 3151 + }, + { + "epoch": 1.4903073286052009, + "grad_norm": 2.5023303031921387, + "learning_rate": 4.309387275029386e-06, + "loss": 0.4685, + "step": 3152 + }, + { + "epoch": 1.4907801418439717, + "grad_norm": 3.0314254760742188, + "learning_rate": 4.308956742455155e-06, + "loss": 0.5462, + "step": 3153 + }, + { + "epoch": 1.4912529550827422, + "grad_norm": 2.675295114517212, + "learning_rate": 4.308526097245119e-06, + "loss": 0.5398, + "step": 3154 + }, + { + "epoch": 1.491725768321513, + "grad_norm": 2.6613399982452393, + "learning_rate": 4.308095339426094e-06, + "loss": 0.5376, + "step": 3155 + }, + { + "epoch": 1.4921985815602836, + "grad_norm": 2.58937668800354, + "learning_rate": 4.307664469024899e-06, + "loss": 0.5385, + "step": 3156 + }, + { + "epoch": 1.4926713947990544, + "grad_norm": 2.583631992340088, + "learning_rate": 4.3072334860683655e-06, + "loss": 0.4927, + "step": 3157 + }, + { + "epoch": 1.493144208037825, + "grad_norm": 2.5889222621917725, + "learning_rate": 4.306802390583327e-06, + "loss": 0.47, + "step": 3158 + }, + { + "epoch": 1.4936170212765958, + "grad_norm": 2.9362716674804688, + "learning_rate": 4.3063711825966244e-06, + "loss": 0.4902, + "step": 3159 + }, + { + "epoch": 1.4940898345153664, + "grad_norm": 2.5385425090789795, + "learning_rate": 
4.305939862135111e-06, + "loss": 0.5396, + "step": 3160 + }, + { + "epoch": 1.4945626477541372, + "grad_norm": 2.776326894760132, + "learning_rate": 4.305508429225641e-06, + "loss": 0.5169, + "step": 3161 + }, + { + "epoch": 1.4950354609929077, + "grad_norm": 2.575063467025757, + "learning_rate": 4.305076883895076e-06, + "loss": 0.4938, + "step": 3162 + }, + { + "epoch": 1.4955082742316785, + "grad_norm": 2.7552313804626465, + "learning_rate": 4.304645226170291e-06, + "loss": 0.6211, + "step": 3163 + }, + { + "epoch": 1.4959810874704491, + "grad_norm": 2.57149338722229, + "learning_rate": 4.30421345607816e-06, + "loss": 0.5241, + "step": 3164 + }, + { + "epoch": 1.49645390070922, + "grad_norm": 2.8142426013946533, + "learning_rate": 4.303781573645568e-06, + "loss": 0.5699, + "step": 3165 + }, + { + "epoch": 1.4969267139479905, + "grad_norm": 2.6344845294952393, + "learning_rate": 4.303349578899407e-06, + "loss": 0.5049, + "step": 3166 + }, + { + "epoch": 1.4973995271867613, + "grad_norm": 2.554410934448242, + "learning_rate": 4.302917471866575e-06, + "loss": 0.4404, + "step": 3167 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 2.896240711212158, + "learning_rate": 4.302485252573978e-06, + "loss": 0.602, + "step": 3168 + }, + { + "epoch": 1.4983451536643027, + "grad_norm": 2.4044477939605713, + "learning_rate": 4.302052921048527e-06, + "loss": 0.4857, + "step": 3169 + }, + { + "epoch": 1.4988179669030732, + "grad_norm": 2.7447879314422607, + "learning_rate": 4.301620477317144e-06, + "loss": 0.5438, + "step": 3170 + }, + { + "epoch": 1.499290780141844, + "grad_norm": 2.851820945739746, + "learning_rate": 4.301187921406752e-06, + "loss": 0.5245, + "step": 3171 + }, + { + "epoch": 1.4997635933806146, + "grad_norm": 3.247114419937134, + "learning_rate": 4.300755253344287e-06, + "loss": 0.504, + "step": 3172 + }, + { + "epoch": 1.5002364066193854, + "grad_norm": 3.117490291595459, + "learning_rate": 4.300322473156688e-06, + "loss": 0.4627, + "step": 3173 + }, + { + "epoch": 1.500709219858156, + "grad_norm": 2.558319330215454, + "learning_rate": 4.299889580870904e-06, + "loss": 0.5721, + "step": 3174 + }, + { + "epoch": 1.5011820330969265, + "grad_norm": 2.8983113765716553, + "learning_rate": 4.2994565765138865e-06, + "loss": 0.5257, + "step": 3175 + }, + { + "epoch": 1.5016548463356973, + "grad_norm": 2.744056463241577, + "learning_rate": 4.299023460112599e-06, + "loss": 0.4892, + "step": 3176 + }, + { + "epoch": 1.5021276595744681, + "grad_norm": 2.5506751537323, + "learning_rate": 4.29859023169401e-06, + "loss": 0.4933, + "step": 3177 + }, + { + "epoch": 1.5026004728132387, + "grad_norm": 2.842615842819214, + "learning_rate": 4.298156891285092e-06, + "loss": 0.6124, + "step": 3178 + }, + { + "epoch": 1.5030732860520093, + "grad_norm": 2.5355329513549805, + "learning_rate": 4.2977234389128305e-06, + "loss": 0.641, + "step": 3179 + }, + { + "epoch": 1.50354609929078, + "grad_norm": 2.674781084060669, + "learning_rate": 4.297289874604213e-06, + "loss": 0.475, + "step": 3180 + }, + { + "epoch": 1.5040189125295509, + "grad_norm": 2.6845548152923584, + "learning_rate": 4.296856198386235e-06, + "loss": 0.5328, + "step": 3181 + }, + { + "epoch": 1.5044917257683215, + "grad_norm": 2.9686241149902344, + "learning_rate": 4.296422410285902e-06, + "loss": 0.6216, + "step": 3182 + }, + { + "epoch": 1.504964539007092, + "grad_norm": 2.5095980167388916, + "learning_rate": 4.295988510330222e-06, + "loss": 0.4993, + "step": 3183 + }, + { + "epoch": 1.5054373522458628, + "grad_norm": 2.4906392097473145, 
+ "learning_rate": 4.2955544985462125e-06, + "loss": 0.4795, + "step": 3184 + }, + { + "epoch": 1.5059101654846336, + "grad_norm": 2.5593366622924805, + "learning_rate": 4.295120374960897e-06, + "loss": 0.5527, + "step": 3185 + }, + { + "epoch": 1.5063829787234042, + "grad_norm": 2.691495180130005, + "learning_rate": 4.294686139601308e-06, + "loss": 0.5646, + "step": 3186 + }, + { + "epoch": 1.5068557919621748, + "grad_norm": 2.74320387840271, + "learning_rate": 4.294251792494483e-06, + "loss": 0.6149, + "step": 3187 + }, + { + "epoch": 1.5073286052009456, + "grad_norm": 2.8827052116394043, + "learning_rate": 4.293817333667465e-06, + "loss": 0.5414, + "step": 3188 + }, + { + "epoch": 1.5078014184397164, + "grad_norm": 2.5652425289154053, + "learning_rate": 4.293382763147308e-06, + "loss": 0.5006, + "step": 3189 + }, + { + "epoch": 1.508274231678487, + "grad_norm": 2.729295253753662, + "learning_rate": 4.29294808096107e-06, + "loss": 0.522, + "step": 3190 + }, + { + "epoch": 1.5087470449172575, + "grad_norm": 2.348118305206299, + "learning_rate": 4.292513287135817e-06, + "loss": 0.4125, + "step": 3191 + }, + { + "epoch": 1.5092198581560283, + "grad_norm": 2.809551954269409, + "learning_rate": 4.292078381698621e-06, + "loss": 0.5577, + "step": 3192 + }, + { + "epoch": 1.5096926713947991, + "grad_norm": 2.6925361156463623, + "learning_rate": 4.291643364676563e-06, + "loss": 0.62, + "step": 3193 + }, + { + "epoch": 1.5101654846335697, + "grad_norm": 2.4200620651245117, + "learning_rate": 4.291208236096729e-06, + "loss": 0.5464, + "step": 3194 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 2.5659191608428955, + "learning_rate": 4.290772995986211e-06, + "loss": 0.5402, + "step": 3195 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 2.3877315521240234, + "learning_rate": 4.290337644372113e-06, + "loss": 0.463, + "step": 3196 + }, + { + "epoch": 1.5115839243498819, + "grad_norm": 2.7063233852386475, + "learning_rate": 4.289902181281538e-06, + "loss": 0.5253, + "step": 3197 + }, + { + "epoch": 1.5120567375886524, + "grad_norm": 2.56788969039917, + "learning_rate": 4.289466606741603e-06, + "loss": 0.5012, + "step": 3198 + }, + { + "epoch": 1.512529550827423, + "grad_norm": 2.637164831161499, + "learning_rate": 4.28903092077943e-06, + "loss": 0.5236, + "step": 3199 + }, + { + "epoch": 1.5130023640661938, + "grad_norm": 2.767526865005493, + "learning_rate": 4.288595123422146e-06, + "loss": 0.5832, + "step": 3200 + }, + { + "epoch": 1.5134751773049646, + "grad_norm": 2.33365535736084, + "learning_rate": 4.2881592146968866e-06, + "loss": 0.4548, + "step": 3201 + }, + { + "epoch": 1.5139479905437352, + "grad_norm": 2.544189453125, + "learning_rate": 4.287723194630793e-06, + "loss": 0.5115, + "step": 3202 + }, + { + "epoch": 1.5144208037825058, + "grad_norm": 2.588793992996216, + "learning_rate": 4.2872870632510155e-06, + "loss": 0.4766, + "step": 3203 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 2.5382184982299805, + "learning_rate": 4.286850820584709e-06, + "loss": 0.5401, + "step": 3204 + }, + { + "epoch": 1.5153664302600474, + "grad_norm": 2.597930669784546, + "learning_rate": 4.286414466659038e-06, + "loss": 0.5346, + "step": 3205 + }, + { + "epoch": 1.515839243498818, + "grad_norm": 2.8522393703460693, + "learning_rate": 4.28597800150117e-06, + "loss": 0.486, + "step": 3206 + }, + { + "epoch": 1.5163120567375885, + "grad_norm": 2.4801454544067383, + "learning_rate": 4.285541425138285e-06, + "loss": 0.5162, + "step": 3207 + }, + { + "epoch": 1.5167848699763593, + "grad_norm": 
2.353665351867676, + "learning_rate": 4.285104737597563e-06, + "loss": 0.5066, + "step": 3208 + }, + { + "epoch": 1.51725768321513, + "grad_norm": 2.767976760864258, + "learning_rate": 4.2846679389061975e-06, + "loss": 0.5331, + "step": 3209 + }, + { + "epoch": 1.5177304964539007, + "grad_norm": 2.9307682514190674, + "learning_rate": 4.284231029091385e-06, + "loss": 0.5291, + "step": 3210 + }, + { + "epoch": 1.5182033096926713, + "grad_norm": 2.39719820022583, + "learning_rate": 4.283794008180329e-06, + "loss": 0.4759, + "step": 3211 + }, + { + "epoch": 1.518676122931442, + "grad_norm": 2.452244758605957, + "learning_rate": 4.283356876200242e-06, + "loss": 0.4283, + "step": 3212 + }, + { + "epoch": 1.5191489361702128, + "grad_norm": 2.4911608695983887, + "learning_rate": 4.282919633178343e-06, + "loss": 0.4812, + "step": 3213 + }, + { + "epoch": 1.5196217494089834, + "grad_norm": 2.5813944339752197, + "learning_rate": 4.282482279141856e-06, + "loss": 0.4911, + "step": 3214 + }, + { + "epoch": 1.520094562647754, + "grad_norm": 2.503542184829712, + "learning_rate": 4.282044814118013e-06, + "loss": 0.4969, + "step": 3215 + }, + { + "epoch": 1.5205673758865248, + "grad_norm": 2.5090713500976562, + "learning_rate": 4.281607238134053e-06, + "loss": 0.5293, + "step": 3216 + }, + { + "epoch": 1.5210401891252956, + "grad_norm": 2.425994396209717, + "learning_rate": 4.281169551217223e-06, + "loss": 0.5365, + "step": 3217 + }, + { + "epoch": 1.5215130023640662, + "grad_norm": 2.637655258178711, + "learning_rate": 4.2807317533947765e-06, + "loss": 0.5589, + "step": 3218 + }, + { + "epoch": 1.5219858156028367, + "grad_norm": 2.9335296154022217, + "learning_rate": 4.28029384469397e-06, + "loss": 0.6071, + "step": 3219 + }, + { + "epoch": 1.5224586288416075, + "grad_norm": 2.898683547973633, + "learning_rate": 4.279855825142073e-06, + "loss": 0.5392, + "step": 3220 + }, + { + "epoch": 1.5229314420803783, + "grad_norm": 2.613914966583252, + "learning_rate": 4.279417694766359e-06, + "loss": 0.4968, + "step": 3221 + }, + { + "epoch": 1.523404255319149, + "grad_norm": 2.500682830810547, + "learning_rate": 4.278979453594106e-06, + "loss": 0.471, + "step": 3222 + }, + { + "epoch": 1.5238770685579195, + "grad_norm": 2.5269598960876465, + "learning_rate": 4.278541101652605e-06, + "loss": 0.471, + "step": 3223 + }, + { + "epoch": 1.5243498817966903, + "grad_norm": 2.8153114318847656, + "learning_rate": 4.2781026389691465e-06, + "loss": 0.5742, + "step": 3224 + }, + { + "epoch": 1.524822695035461, + "grad_norm": 2.5648019313812256, + "learning_rate": 4.277664065571034e-06, + "loss": 0.5315, + "step": 3225 + }, + { + "epoch": 1.5252955082742317, + "grad_norm": 2.778355836868286, + "learning_rate": 4.277225381485575e-06, + "loss": 0.5543, + "step": 3226 + }, + { + "epoch": 1.5257683215130022, + "grad_norm": 2.6736745834350586, + "learning_rate": 4.2767865867400846e-06, + "loss": 0.4947, + "step": 3227 + }, + { + "epoch": 1.526241134751773, + "grad_norm": 2.9560294151306152, + "learning_rate": 4.276347681361884e-06, + "loss": 0.5835, + "step": 3228 + }, + { + "epoch": 1.5267139479905438, + "grad_norm": 2.5580296516418457, + "learning_rate": 4.275908665378302e-06, + "loss": 0.4751, + "step": 3229 + }, + { + "epoch": 1.5271867612293144, + "grad_norm": 3.0705175399780273, + "learning_rate": 4.2754695388166755e-06, + "loss": 0.5327, + "step": 3230 + }, + { + "epoch": 1.527659574468085, + "grad_norm": 2.664652109146118, + "learning_rate": 4.275030301704346e-06, + "loss": 0.4934, + "step": 3231 + }, + { + "epoch": 
1.5281323877068558, + "grad_norm": 2.308499813079834, + "learning_rate": 4.274590954068663e-06, + "loss": 0.4412, + "step": 3232 + }, + { + "epoch": 1.5286052009456266, + "grad_norm": 2.871189594268799, + "learning_rate": 4.2741514959369815e-06, + "loss": 0.5001, + "step": 3233 + }, + { + "epoch": 1.5290780141843971, + "grad_norm": 2.5274453163146973, + "learning_rate": 4.273711927336666e-06, + "loss": 0.4938, + "step": 3234 + }, + { + "epoch": 1.5295508274231677, + "grad_norm": 2.8848133087158203, + "learning_rate": 4.273272248295087e-06, + "loss": 0.5397, + "step": 3235 + }, + { + "epoch": 1.5300236406619385, + "grad_norm": 2.3927090167999268, + "learning_rate": 4.27283245883962e-06, + "loss": 0.5497, + "step": 3236 + }, + { + "epoch": 1.5304964539007093, + "grad_norm": 2.5413873195648193, + "learning_rate": 4.27239255899765e-06, + "loss": 0.5108, + "step": 3237 + }, + { + "epoch": 1.53096926713948, + "grad_norm": 2.7692389488220215, + "learning_rate": 4.271952548796567e-06, + "loss": 0.5768, + "step": 3238 + }, + { + "epoch": 1.5314420803782505, + "grad_norm": 2.4621126651763916, + "learning_rate": 4.271512428263768e-06, + "loss": 0.4698, + "step": 3239 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 2.6423375606536865, + "learning_rate": 4.271072197426659e-06, + "loss": 0.4929, + "step": 3240 + }, + { + "epoch": 1.532387706855792, + "grad_norm": 2.7097692489624023, + "learning_rate": 4.270631856312649e-06, + "loss": 0.4836, + "step": 3241 + }, + { + "epoch": 1.5328605200945626, + "grad_norm": 2.545706272125244, + "learning_rate": 4.270191404949158e-06, + "loss": 0.4636, + "step": 3242 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 3.138781785964966, + "learning_rate": 4.26975084336361e-06, + "loss": 0.5988, + "step": 3243 + }, + { + "epoch": 1.533806146572104, + "grad_norm": 2.492715835571289, + "learning_rate": 4.269310171583438e-06, + "loss": 0.5095, + "step": 3244 + }, + { + "epoch": 1.5342789598108748, + "grad_norm": 2.5705838203430176, + "learning_rate": 4.268869389636077e-06, + "loss": 0.4818, + "step": 3245 + }, + { + "epoch": 1.5347517730496454, + "grad_norm": 2.7633554935455322, + "learning_rate": 4.268428497548979e-06, + "loss": 0.547, + "step": 3246 + }, + { + "epoch": 1.535224586288416, + "grad_norm": 2.654528856277466, + "learning_rate": 4.2679874953495905e-06, + "loss": 0.5261, + "step": 3247 + }, + { + "epoch": 1.5356973995271868, + "grad_norm": 2.5039751529693604, + "learning_rate": 4.2675463830653744e-06, + "loss": 0.4941, + "step": 3248 + }, + { + "epoch": 1.5361702127659576, + "grad_norm": 2.897268295288086, + "learning_rate": 4.267105160723794e-06, + "loss": 0.5404, + "step": 3249 + }, + { + "epoch": 1.5366430260047281, + "grad_norm": 2.500732421875, + "learning_rate": 4.266663828352324e-06, + "loss": 0.5375, + "step": 3250 + }, + { + "epoch": 1.5371158392434987, + "grad_norm": 2.6310064792633057, + "learning_rate": 4.266222385978444e-06, + "loss": 0.5217, + "step": 3251 + }, + { + "epoch": 1.5375886524822695, + "grad_norm": 2.7440476417541504, + "learning_rate": 4.265780833629642e-06, + "loss": 0.5419, + "step": 3252 + }, + { + "epoch": 1.5380614657210403, + "grad_norm": 2.7037577629089355, + "learning_rate": 4.2653391713334095e-06, + "loss": 0.5634, + "step": 3253 + }, + { + "epoch": 1.5385342789598109, + "grad_norm": 2.548525810241699, + "learning_rate": 4.264897399117248e-06, + "loss": 0.535, + "step": 3254 + }, + { + "epoch": 1.5390070921985815, + "grad_norm": 2.6127355098724365, + "learning_rate": 4.264455517008663e-06, + "loss": 0.4619, + 
"step": 3255 + }, + { + "epoch": 1.5394799054373522, + "grad_norm": 2.5597004890441895, + "learning_rate": 4.264013525035171e-06, + "loss": 0.4477, + "step": 3256 + }, + { + "epoch": 1.539952718676123, + "grad_norm": 2.642432689666748, + "learning_rate": 4.263571423224292e-06, + "loss": 0.4749, + "step": 3257 + }, + { + "epoch": 1.5404255319148936, + "grad_norm": 2.5121877193450928, + "learning_rate": 4.2631292116035526e-06, + "loss": 0.4693, + "step": 3258 + }, + { + "epoch": 1.5408983451536642, + "grad_norm": 2.390292167663574, + "learning_rate": 4.262686890200489e-06, + "loss": 0.4872, + "step": 3259 + }, + { + "epoch": 1.541371158392435, + "grad_norm": 2.5898337364196777, + "learning_rate": 4.2622444590426405e-06, + "loss": 0.5193, + "step": 3260 + }, + { + "epoch": 1.5418439716312058, + "grad_norm": 2.508821487426758, + "learning_rate": 4.261801918157558e-06, + "loss": 0.511, + "step": 3261 + }, + { + "epoch": 1.5423167848699764, + "grad_norm": 2.6992101669311523, + "learning_rate": 4.261359267572795e-06, + "loss": 0.5069, + "step": 3262 + }, + { + "epoch": 1.542789598108747, + "grad_norm": 2.6011030673980713, + "learning_rate": 4.2609165073159145e-06, + "loss": 0.5887, + "step": 3263 + }, + { + "epoch": 1.5432624113475177, + "grad_norm": 2.887053966522217, + "learning_rate": 4.260473637414483e-06, + "loss": 0.5556, + "step": 3264 + }, + { + "epoch": 1.5437352245862885, + "grad_norm": 2.6433887481689453, + "learning_rate": 4.260030657896079e-06, + "loss": 0.4728, + "step": 3265 + }, + { + "epoch": 1.544208037825059, + "grad_norm": 2.6134607791900635, + "learning_rate": 4.259587568788282e-06, + "loss": 0.483, + "step": 3266 + }, + { + "epoch": 1.5446808510638297, + "grad_norm": 2.5308640003204346, + "learning_rate": 4.259144370118684e-06, + "loss": 0.5115, + "step": 3267 + }, + { + "epoch": 1.5451536643026005, + "grad_norm": 2.8256733417510986, + "learning_rate": 4.258701061914879e-06, + "loss": 0.5414, + "step": 3268 + }, + { + "epoch": 1.5456264775413713, + "grad_norm": 2.8648319244384766, + "learning_rate": 4.258257644204471e-06, + "loss": 0.5695, + "step": 3269 + }, + { + "epoch": 1.5460992907801419, + "grad_norm": 2.8568081855773926, + "learning_rate": 4.257814117015069e-06, + "loss": 0.5264, + "step": 3270 + }, + { + "epoch": 1.5465721040189124, + "grad_norm": 2.6065011024475098, + "learning_rate": 4.257370480374289e-06, + "loss": 0.5646, + "step": 3271 + }, + { + "epoch": 1.5470449172576832, + "grad_norm": 2.7840216159820557, + "learning_rate": 4.256926734309756e-06, + "loss": 0.5191, + "step": 3272 + }, + { + "epoch": 1.547517730496454, + "grad_norm": 2.85906982421875, + "learning_rate": 4.256482878849099e-06, + "loss": 0.5911, + "step": 3273 + }, + { + "epoch": 1.5479905437352246, + "grad_norm": 2.916029930114746, + "learning_rate": 4.256038914019954e-06, + "loss": 0.5589, + "step": 3274 + }, + { + "epoch": 1.5484633569739952, + "grad_norm": 2.6748716831207275, + "learning_rate": 4.255594839849967e-06, + "loss": 0.5323, + "step": 3275 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 2.717212200164795, + "learning_rate": 4.255150656366787e-06, + "loss": 0.453, + "step": 3276 + }, + { + "epoch": 1.5494089834515368, + "grad_norm": 2.4974849224090576, + "learning_rate": 4.254706363598072e-06, + "loss": 0.4516, + "step": 3277 + }, + { + "epoch": 1.5498817966903073, + "grad_norm": 2.648151397705078, + "learning_rate": 4.254261961571485e-06, + "loss": 0.5452, + "step": 3278 + }, + { + "epoch": 1.550354609929078, + "grad_norm": 2.932905435562134, + "learning_rate": 
4.253817450314699e-06, + "loss": 0.4813, + "step": 3279 + }, + { + "epoch": 1.5508274231678487, + "grad_norm": 2.862912178039551, + "learning_rate": 4.25337282985539e-06, + "loss": 0.5689, + "step": 3280 + }, + { + "epoch": 1.5513002364066195, + "grad_norm": 2.532156467437744, + "learning_rate": 4.2529281002212436e-06, + "loss": 0.485, + "step": 3281 + }, + { + "epoch": 1.55177304964539, + "grad_norm": 2.583299160003662, + "learning_rate": 4.25248326143995e-06, + "loss": 0.4661, + "step": 3282 + }, + { + "epoch": 1.5522458628841607, + "grad_norm": 2.5790653228759766, + "learning_rate": 4.252038313539209e-06, + "loss": 0.5455, + "step": 3283 + }, + { + "epoch": 1.5527186761229315, + "grad_norm": 2.872864007949829, + "learning_rate": 4.251593256546724e-06, + "loss": 0.5317, + "step": 3284 + }, + { + "epoch": 1.5531914893617023, + "grad_norm": 3.0382463932037354, + "learning_rate": 4.251148090490208e-06, + "loss": 0.5131, + "step": 3285 + }, + { + "epoch": 1.5536643026004728, + "grad_norm": 2.574399709701538, + "learning_rate": 4.250702815397379e-06, + "loss": 0.5399, + "step": 3286 + }, + { + "epoch": 1.5541371158392434, + "grad_norm": 2.9784770011901855, + "learning_rate": 4.250257431295962e-06, + "loss": 0.5209, + "step": 3287 + }, + { + "epoch": 1.5546099290780142, + "grad_norm": 2.6482062339782715, + "learning_rate": 4.249811938213689e-06, + "loss": 0.5416, + "step": 3288 + }, + { + "epoch": 1.555082742316785, + "grad_norm": 2.82142972946167, + "learning_rate": 4.2493663361783e-06, + "loss": 0.594, + "step": 3289 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 2.815595865249634, + "learning_rate": 4.24892062521754e-06, + "loss": 0.5381, + "step": 3290 + }, + { + "epoch": 1.5560283687943262, + "grad_norm": 2.689764976501465, + "learning_rate": 4.248474805359161e-06, + "loss": 0.5141, + "step": 3291 + }, + { + "epoch": 1.556501182033097, + "grad_norm": 2.7718515396118164, + "learning_rate": 4.248028876630922e-06, + "loss": 0.5324, + "step": 3292 + }, + { + "epoch": 1.5569739952718678, + "grad_norm": 3.0196774005889893, + "learning_rate": 4.247582839060591e-06, + "loss": 0.4971, + "step": 3293 + }, + { + "epoch": 1.5574468085106383, + "grad_norm": 2.608475923538208, + "learning_rate": 4.247136692675939e-06, + "loss": 0.5795, + "step": 3294 + }, + { + "epoch": 1.557919621749409, + "grad_norm": 2.4912326335906982, + "learning_rate": 4.246690437504746e-06, + "loss": 0.5348, + "step": 3295 + }, + { + "epoch": 1.5583924349881797, + "grad_norm": 2.519303560256958, + "learning_rate": 4.246244073574799e-06, + "loss": 0.4953, + "step": 3296 + }, + { + "epoch": 1.5588652482269505, + "grad_norm": 2.5667171478271484, + "learning_rate": 4.24579760091389e-06, + "loss": 0.5353, + "step": 3297 + }, + { + "epoch": 1.559338061465721, + "grad_norm": 2.8835761547088623, + "learning_rate": 4.24535101954982e-06, + "loss": 0.578, + "step": 3298 + }, + { + "epoch": 1.5598108747044916, + "grad_norm": 3.0506930351257324, + "learning_rate": 4.244904329510395e-06, + "loss": 0.6418, + "step": 3299 + }, + { + "epoch": 1.5602836879432624, + "grad_norm": 2.579446315765381, + "learning_rate": 4.244457530823428e-06, + "loss": 0.5027, + "step": 3300 + }, + { + "epoch": 1.5607565011820332, + "grad_norm": 2.72012996673584, + "learning_rate": 4.24401062351674e-06, + "loss": 0.5438, + "step": 3301 + }, + { + "epoch": 1.5612293144208038, + "grad_norm": 2.527007818222046, + "learning_rate": 4.243563607618158e-06, + "loss": 0.5303, + "step": 3302 + }, + { + "epoch": 1.5617021276595744, + "grad_norm": 2.4415159225463867, + 
"learning_rate": 4.243116483155516e-06, + "loss": 0.4893, + "step": 3303 + }, + { + "epoch": 1.5621749408983452, + "grad_norm": 2.462256669998169, + "learning_rate": 4.242669250156653e-06, + "loss": 0.5671, + "step": 3304 + }, + { + "epoch": 1.562647754137116, + "grad_norm": 2.479865074157715, + "learning_rate": 4.242221908649418e-06, + "loss": 0.5038, + "step": 3305 + }, + { + "epoch": 1.5631205673758866, + "grad_norm": 2.74670672416687, + "learning_rate": 4.241774458661662e-06, + "loss": 0.5689, + "step": 3306 + }, + { + "epoch": 1.5635933806146571, + "grad_norm": 2.55938982963562, + "learning_rate": 4.24132690022125e-06, + "loss": 0.492, + "step": 3307 + }, + { + "epoch": 1.564066193853428, + "grad_norm": 2.634956121444702, + "learning_rate": 4.240879233356048e-06, + "loss": 0.503, + "step": 3308 + }, + { + "epoch": 1.5645390070921987, + "grad_norm": 2.381775140762329, + "learning_rate": 4.240431458093928e-06, + "loss": 0.4939, + "step": 3309 + }, + { + "epoch": 1.5650118203309693, + "grad_norm": 2.8176610469818115, + "learning_rate": 4.239983574462774e-06, + "loss": 0.5609, + "step": 3310 + }, + { + "epoch": 1.5654846335697399, + "grad_norm": 3.0268442630767822, + "learning_rate": 4.239535582490471e-06, + "loss": 0.5427, + "step": 3311 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 2.5881481170654297, + "learning_rate": 4.239087482204916e-06, + "loss": 0.5538, + "step": 3312 + }, + { + "epoch": 1.5664302600472815, + "grad_norm": 2.5317704677581787, + "learning_rate": 4.238639273634008e-06, + "loss": 0.4915, + "step": 3313 + }, + { + "epoch": 1.566903073286052, + "grad_norm": 2.9608731269836426, + "learning_rate": 4.238190956805658e-06, + "loss": 0.564, + "step": 3314 + }, + { + "epoch": 1.5673758865248226, + "grad_norm": 3.022686243057251, + "learning_rate": 4.237742531747777e-06, + "loss": 0.5503, + "step": 3315 + }, + { + "epoch": 1.5678486997635934, + "grad_norm": 2.763622283935547, + "learning_rate": 4.23729399848829e-06, + "loss": 0.5241, + "step": 3316 + }, + { + "epoch": 1.5683215130023642, + "grad_norm": 2.6112794876098633, + "learning_rate": 4.236845357055122e-06, + "loss": 0.4919, + "step": 3317 + }, + { + "epoch": 1.5687943262411348, + "grad_norm": 2.649829149246216, + "learning_rate": 4.23639660747621e-06, + "loss": 0.5472, + "step": 3318 + }, + { + "epoch": 1.5692671394799054, + "grad_norm": 2.8888115882873535, + "learning_rate": 4.2359477497794955e-06, + "loss": 0.5077, + "step": 3319 + }, + { + "epoch": 1.5697399527186762, + "grad_norm": 2.5666911602020264, + "learning_rate": 4.235498783992927e-06, + "loss": 0.5365, + "step": 3320 + }, + { + "epoch": 1.570212765957447, + "grad_norm": 2.448758363723755, + "learning_rate": 4.2350497101444575e-06, + "loss": 0.5043, + "step": 3321 + }, + { + "epoch": 1.5706855791962175, + "grad_norm": 2.595207691192627, + "learning_rate": 4.234600528262052e-06, + "loss": 0.5303, + "step": 3322 + }, + { + "epoch": 1.5711583924349881, + "grad_norm": 2.7814228534698486, + "learning_rate": 4.234151238373676e-06, + "loss": 0.4521, + "step": 3323 + }, + { + "epoch": 1.571631205673759, + "grad_norm": 2.781538724899292, + "learning_rate": 4.233701840507308e-06, + "loss": 0.5193, + "step": 3324 + }, + { + "epoch": 1.5721040189125297, + "grad_norm": 2.771907329559326, + "learning_rate": 4.233252334690928e-06, + "loss": 0.497, + "step": 3325 + }, + { + "epoch": 1.5725768321513003, + "grad_norm": 2.5557498931884766, + "learning_rate": 4.232802720952525e-06, + "loss": 0.4913, + "step": 3326 + }, + { + "epoch": 1.5730496453900709, + "grad_norm": 
2.478267192840576, + "learning_rate": 4.232352999320094e-06, + "loss": 0.4967, + "step": 3327 + }, + { + "epoch": 1.5735224586288417, + "grad_norm": 3.1548502445220947, + "learning_rate": 4.231903169821639e-06, + "loss": 0.5009, + "step": 3328 + }, + { + "epoch": 1.5739952718676125, + "grad_norm": 2.634824275970459, + "learning_rate": 4.231453232485168e-06, + "loss": 0.5223, + "step": 3329 + }, + { + "epoch": 1.574468085106383, + "grad_norm": 2.579102039337158, + "learning_rate": 4.231003187338695e-06, + "loss": 0.5513, + "step": 3330 + }, + { + "epoch": 1.5749408983451536, + "grad_norm": 2.8477070331573486, + "learning_rate": 4.230553034410245e-06, + "loss": 0.561, + "step": 3331 + }, + { + "epoch": 1.5754137115839244, + "grad_norm": 2.6714725494384766, + "learning_rate": 4.2301027737278446e-06, + "loss": 0.4687, + "step": 3332 + }, + { + "epoch": 1.5758865248226952, + "grad_norm": 2.6562764644622803, + "learning_rate": 4.229652405319532e-06, + "loss": 0.5925, + "step": 3333 + }, + { + "epoch": 1.5763593380614658, + "grad_norm": 2.750946283340454, + "learning_rate": 4.229201929213348e-06, + "loss": 0.4748, + "step": 3334 + }, + { + "epoch": 1.5768321513002364, + "grad_norm": 2.760470151901245, + "learning_rate": 4.228751345437342e-06, + "loss": 0.5989, + "step": 3335 + }, + { + "epoch": 1.5773049645390071, + "grad_norm": 3.1451845169067383, + "learning_rate": 4.2283006540195706e-06, + "loss": 0.562, + "step": 3336 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 2.563011407852173, + "learning_rate": 4.227849854988095e-06, + "loss": 0.5473, + "step": 3337 + }, + { + "epoch": 1.5782505910165483, + "grad_norm": 2.310469388961792, + "learning_rate": 4.2273989483709856e-06, + "loss": 0.5033, + "step": 3338 + }, + { + "epoch": 1.578723404255319, + "grad_norm": 2.677978754043579, + "learning_rate": 4.226947934196318e-06, + "loss": 0.5291, + "step": 3339 + }, + { + "epoch": 1.57919621749409, + "grad_norm": 3.0423545837402344, + "learning_rate": 4.226496812492176e-06, + "loss": 0.5201, + "step": 3340 + }, + { + "epoch": 1.5796690307328605, + "grad_norm": 2.357513904571533, + "learning_rate": 4.226045583286647e-06, + "loss": 0.4421, + "step": 3341 + }, + { + "epoch": 1.580141843971631, + "grad_norm": 2.719860315322876, + "learning_rate": 4.225594246607828e-06, + "loss": 0.4855, + "step": 3342 + }, + { + "epoch": 1.5806146572104018, + "grad_norm": 3.2645058631896973, + "learning_rate": 4.2251428024838215e-06, + "loss": 0.6654, + "step": 3343 + }, + { + "epoch": 1.5810874704491726, + "grad_norm": 2.2997004985809326, + "learning_rate": 4.224691250942737e-06, + "loss": 0.4565, + "step": 3344 + }, + { + "epoch": 1.5815602836879432, + "grad_norm": 2.8103034496307373, + "learning_rate": 4.2242395920126926e-06, + "loss": 0.5543, + "step": 3345 + }, + { + "epoch": 1.5820330969267138, + "grad_norm": 2.720254898071289, + "learning_rate": 4.223787825721808e-06, + "loss": 0.5028, + "step": 3346 + }, + { + "epoch": 1.5825059101654846, + "grad_norm": 2.735544204711914, + "learning_rate": 4.223335952098214e-06, + "loss": 0.5169, + "step": 3347 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 2.784254550933838, + "learning_rate": 4.222883971170047e-06, + "loss": 0.4989, + "step": 3348 + }, + { + "epoch": 1.583451536643026, + "grad_norm": 2.7192094326019287, + "learning_rate": 4.22243188296545e-06, + "loss": 0.502, + "step": 3349 + }, + { + "epoch": 1.5839243498817965, + "grad_norm": 2.716501474380493, + "learning_rate": 4.221979687512573e-06, + "loss": 0.5687, + "step": 3350 + }, + { + "epoch": 
1.5843971631205673, + "grad_norm": 2.8420114517211914, + "learning_rate": 4.22152738483957e-06, + "loss": 0.5903, + "step": 3351 + }, + { + "epoch": 1.5848699763593381, + "grad_norm": 2.734872579574585, + "learning_rate": 4.2210749749746065e-06, + "loss": 0.5397, + "step": 3352 + }, + { + "epoch": 1.5853427895981087, + "grad_norm": 2.4343836307525635, + "learning_rate": 4.220622457945851e-06, + "loss": 0.436, + "step": 3353 + }, + { + "epoch": 1.5858156028368793, + "grad_norm": 2.728177547454834, + "learning_rate": 4.2201698337814785e-06, + "loss": 0.5703, + "step": 3354 + }, + { + "epoch": 1.58628841607565, + "grad_norm": 2.502098560333252, + "learning_rate": 4.219717102509674e-06, + "loss": 0.5275, + "step": 3355 + }, + { + "epoch": 1.5867612293144209, + "grad_norm": 2.6595494747161865, + "learning_rate": 4.219264264158627e-06, + "loss": 0.4659, + "step": 3356 + }, + { + "epoch": 1.5872340425531914, + "grad_norm": 2.5307185649871826, + "learning_rate": 4.218811318756532e-06, + "loss": 0.5048, + "step": 3357 + }, + { + "epoch": 1.587706855791962, + "grad_norm": 2.9300129413604736, + "learning_rate": 4.218358266331593e-06, + "loss": 0.5137, + "step": 3358 + }, + { + "epoch": 1.5881796690307328, + "grad_norm": 2.686586618423462, + "learning_rate": 4.21790510691202e-06, + "loss": 0.4529, + "step": 3359 + }, + { + "epoch": 1.5886524822695036, + "grad_norm": 2.9981517791748047, + "learning_rate": 4.217451840526029e-06, + "loss": 0.6054, + "step": 3360 + }, + { + "epoch": 1.5891252955082742, + "grad_norm": 2.6943674087524414, + "learning_rate": 4.216998467201841e-06, + "loss": 0.5153, + "step": 3361 + }, + { + "epoch": 1.5895981087470448, + "grad_norm": 2.707084894180298, + "learning_rate": 4.216544986967689e-06, + "loss": 0.5235, + "step": 3362 + }, + { + "epoch": 1.5900709219858156, + "grad_norm": 2.6553728580474854, + "learning_rate": 4.216091399851808e-06, + "loss": 0.5275, + "step": 3363 + }, + { + "epoch": 1.5905437352245864, + "grad_norm": 2.9136953353881836, + "learning_rate": 4.215637705882439e-06, + "loss": 0.5834, + "step": 3364 + }, + { + "epoch": 1.591016548463357, + "grad_norm": 2.7647159099578857, + "learning_rate": 4.2151839050878325e-06, + "loss": 0.5641, + "step": 3365 + }, + { + "epoch": 1.5914893617021275, + "grad_norm": 2.4556827545166016, + "learning_rate": 4.214729997496246e-06, + "loss": 0.5636, + "step": 3366 + }, + { + "epoch": 1.5919621749408983, + "grad_norm": 2.6111652851104736, + "learning_rate": 4.2142759831359414e-06, + "loss": 0.5097, + "step": 3367 + }, + { + "epoch": 1.592434988179669, + "grad_norm": 2.4886903762817383, + "learning_rate": 4.213821862035189e-06, + "loss": 0.531, + "step": 3368 + }, + { + "epoch": 1.5929078014184397, + "grad_norm": 2.5245840549468994, + "learning_rate": 4.213367634222263e-06, + "loss": 0.5085, + "step": 3369 + }, + { + "epoch": 1.5933806146572103, + "grad_norm": 2.970214605331421, + "learning_rate": 4.212913299725447e-06, + "loss": 0.5851, + "step": 3370 + }, + { + "epoch": 1.593853427895981, + "grad_norm": 2.5433361530303955, + "learning_rate": 4.212458858573032e-06, + "loss": 0.48, + "step": 3371 + }, + { + "epoch": 1.5943262411347519, + "grad_norm": 2.3550102710723877, + "learning_rate": 4.212004310793312e-06, + "loss": 0.4405, + "step": 3372 + }, + { + "epoch": 1.5947990543735224, + "grad_norm": 2.4824719429016113, + "learning_rate": 4.2115496564145896e-06, + "loss": 0.4634, + "step": 3373 + }, + { + "epoch": 1.595271867612293, + "grad_norm": 2.4751930236816406, + "learning_rate": 4.211094895465176e-06, + "loss": 0.5662, + 
"step": 3374 + }, + { + "epoch": 1.5957446808510638, + "grad_norm": 2.4193356037139893, + "learning_rate": 4.210640027973386e-06, + "loss": 0.4441, + "step": 3375 + }, + { + "epoch": 1.5962174940898346, + "grad_norm": 2.4477498531341553, + "learning_rate": 4.210185053967543e-06, + "loss": 0.5205, + "step": 3376 + }, + { + "epoch": 1.5966903073286052, + "grad_norm": 2.7954161167144775, + "learning_rate": 4.209729973475976e-06, + "loss": 0.4951, + "step": 3377 + }, + { + "epoch": 1.5971631205673757, + "grad_norm": 3.1907570362091064, + "learning_rate": 4.209274786527019e-06, + "loss": 0.6024, + "step": 3378 + }, + { + "epoch": 1.5976359338061465, + "grad_norm": 2.485245704650879, + "learning_rate": 4.2088194931490165e-06, + "loss": 0.5652, + "step": 3379 + }, + { + "epoch": 1.5981087470449173, + "grad_norm": 2.589310884475708, + "learning_rate": 4.208364093370317e-06, + "loss": 0.5085, + "step": 3380 + }, + { + "epoch": 1.598581560283688, + "grad_norm": 2.8941214084625244, + "learning_rate": 4.207908587219276e-06, + "loss": 0.53, + "step": 3381 + }, + { + "epoch": 1.5990543735224585, + "grad_norm": 2.480509042739868, + "learning_rate": 4.207452974724258e-06, + "loss": 0.4543, + "step": 3382 + }, + { + "epoch": 1.5995271867612293, + "grad_norm": 2.7884905338287354, + "learning_rate": 4.206997255913629e-06, + "loss": 0.5483, + "step": 3383 + }, + { + "epoch": 1.6, + "grad_norm": 2.7976696491241455, + "learning_rate": 4.206541430815766e-06, + "loss": 0.4734, + "step": 3384 + }, + { + "epoch": 1.6004728132387707, + "grad_norm": 2.5463132858276367, + "learning_rate": 4.206085499459051e-06, + "loss": 0.4931, + "step": 3385 + }, + { + "epoch": 1.6009456264775412, + "grad_norm": 2.8384251594543457, + "learning_rate": 4.205629461871871e-06, + "loss": 0.5066, + "step": 3386 + }, + { + "epoch": 1.601418439716312, + "grad_norm": 2.8578574657440186, + "learning_rate": 4.205173318082626e-06, + "loss": 0.458, + "step": 3387 + }, + { + "epoch": 1.6018912529550828, + "grad_norm": 2.7779932022094727, + "learning_rate": 4.204717068119715e-06, + "loss": 0.5293, + "step": 3388 + }, + { + "epoch": 1.6023640661938534, + "grad_norm": 2.9123778343200684, + "learning_rate": 4.204260712011546e-06, + "loss": 0.4866, + "step": 3389 + }, + { + "epoch": 1.602836879432624, + "grad_norm": 2.757922887802124, + "learning_rate": 4.203804249786537e-06, + "loss": 0.4925, + "step": 3390 + }, + { + "epoch": 1.6033096926713948, + "grad_norm": 3.287733316421509, + "learning_rate": 4.203347681473107e-06, + "loss": 0.6694, + "step": 3391 + }, + { + "epoch": 1.6037825059101656, + "grad_norm": 3.2117912769317627, + "learning_rate": 4.202891007099687e-06, + "loss": 0.5269, + "step": 3392 + }, + { + "epoch": 1.6042553191489362, + "grad_norm": 2.8489456176757812, + "learning_rate": 4.20243422669471e-06, + "loss": 0.5073, + "step": 3393 + }, + { + "epoch": 1.6047281323877067, + "grad_norm": 2.7660224437713623, + "learning_rate": 4.201977340286619e-06, + "loss": 0.5014, + "step": 3394 + }, + { + "epoch": 1.6052009456264775, + "grad_norm": 2.68182110786438, + "learning_rate": 4.201520347903862e-06, + "loss": 0.4542, + "step": 3395 + }, + { + "epoch": 1.6056737588652483, + "grad_norm": 2.7546045780181885, + "learning_rate": 4.2010632495748934e-06, + "loss": 0.516, + "step": 3396 + }, + { + "epoch": 1.606146572104019, + "grad_norm": 2.744668483734131, + "learning_rate": 4.200606045328176e-06, + "loss": 0.5243, + "step": 3397 + }, + { + "epoch": 1.6066193853427895, + "grad_norm": 2.935343027114868, + "learning_rate": 4.200148735192177e-06, + 
"loss": 0.5624, + "step": 3398 + }, + { + "epoch": 1.6070921985815603, + "grad_norm": 2.7392852306365967, + "learning_rate": 4.19969131919537e-06, + "loss": 0.5796, + "step": 3399 + }, + { + "epoch": 1.607565011820331, + "grad_norm": 2.864750385284424, + "learning_rate": 4.199233797366239e-06, + "loss": 0.549, + "step": 3400 + }, + { + "epoch": 1.6080378250591016, + "grad_norm": 2.684157371520996, + "learning_rate": 4.198776169733269e-06, + "loss": 0.5532, + "step": 3401 + }, + { + "epoch": 1.6085106382978722, + "grad_norm": 2.4717135429382324, + "learning_rate": 4.198318436324957e-06, + "loss": 0.5174, + "step": 3402 + }, + { + "epoch": 1.608983451536643, + "grad_norm": 2.640242338180542, + "learning_rate": 4.197860597169802e-06, + "loss": 0.5117, + "step": 3403 + }, + { + "epoch": 1.6094562647754138, + "grad_norm": 2.4957473278045654, + "learning_rate": 4.197402652296313e-06, + "loss": 0.474, + "step": 3404 + }, + { + "epoch": 1.6099290780141844, + "grad_norm": 2.416138172149658, + "learning_rate": 4.196944601733004e-06, + "loss": 0.4858, + "step": 3405 + }, + { + "epoch": 1.610401891252955, + "grad_norm": 2.4498109817504883, + "learning_rate": 4.196486445508395e-06, + "loss": 0.5048, + "step": 3406 + }, + { + "epoch": 1.6108747044917258, + "grad_norm": 2.415895938873291, + "learning_rate": 4.196028183651014e-06, + "loss": 0.4745, + "step": 3407 + }, + { + "epoch": 1.6113475177304966, + "grad_norm": 2.843665838241577, + "learning_rate": 4.195569816189395e-06, + "loss": 0.5219, + "step": 3408 + }, + { + "epoch": 1.6118203309692671, + "grad_norm": 2.608579158782959, + "learning_rate": 4.195111343152079e-06, + "loss": 0.4941, + "step": 3409 + }, + { + "epoch": 1.6122931442080377, + "grad_norm": 2.643789529800415, + "learning_rate": 4.194652764567611e-06, + "loss": 0.515, + "step": 3410 + }, + { + "epoch": 1.6127659574468085, + "grad_norm": 2.8099429607391357, + "learning_rate": 4.194194080464547e-06, + "loss": 0.4935, + "step": 3411 + }, + { + "epoch": 1.6132387706855793, + "grad_norm": 2.595628261566162, + "learning_rate": 4.193735290871446e-06, + "loss": 0.5571, + "step": 3412 + }, + { + "epoch": 1.6137115839243499, + "grad_norm": 2.7903778553009033, + "learning_rate": 4.193276395816876e-06, + "loss": 0.5228, + "step": 3413 + }, + { + "epoch": 1.6141843971631205, + "grad_norm": 2.83910870552063, + "learning_rate": 4.192817395329409e-06, + "loss": 0.6124, + "step": 3414 + }, + { + "epoch": 1.6146572104018913, + "grad_norm": 2.6155734062194824, + "learning_rate": 4.192358289437626e-06, + "loss": 0.552, + "step": 3415 + }, + { + "epoch": 1.615130023640662, + "grad_norm": 2.795832872390747, + "learning_rate": 4.191899078170113e-06, + "loss": 0.5561, + "step": 3416 + }, + { + "epoch": 1.6156028368794326, + "grad_norm": 2.3402161598205566, + "learning_rate": 4.191439761555464e-06, + "loss": 0.4889, + "step": 3417 + }, + { + "epoch": 1.6160756501182032, + "grad_norm": 3.1183433532714844, + "learning_rate": 4.190980339622276e-06, + "loss": 0.5337, + "step": 3418 + }, + { + "epoch": 1.616548463356974, + "grad_norm": 2.6262872219085693, + "learning_rate": 4.190520812399158e-06, + "loss": 0.525, + "step": 3419 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 2.578340530395508, + "learning_rate": 4.190061179914722e-06, + "loss": 0.4975, + "step": 3420 + }, + { + "epoch": 1.6174940898345154, + "grad_norm": 3.19482159614563, + "learning_rate": 4.189601442197586e-06, + "loss": 0.5832, + "step": 3421 + }, + { + "epoch": 1.617966903073286, + "grad_norm": 2.6398792266845703, + "learning_rate": 
4.189141599276378e-06, + "loss": 0.4676, + "step": 3422 + }, + { + "epoch": 1.6184397163120567, + "grad_norm": 2.624865770339966, + "learning_rate": 4.1886816511797275e-06, + "loss": 0.4507, + "step": 3423 + }, + { + "epoch": 1.6189125295508275, + "grad_norm": 2.4136857986450195, + "learning_rate": 4.1882215979362775e-06, + "loss": 0.4616, + "step": 3424 + }, + { + "epoch": 1.6193853427895981, + "grad_norm": 2.6906614303588867, + "learning_rate": 4.18776143957467e-06, + "loss": 0.5142, + "step": 3425 + }, + { + "epoch": 1.6198581560283687, + "grad_norm": 2.5149154663085938, + "learning_rate": 4.187301176123558e-06, + "loss": 0.5252, + "step": 3426 + }, + { + "epoch": 1.6203309692671395, + "grad_norm": 2.677405834197998, + "learning_rate": 4.186840807611602e-06, + "loss": 0.4635, + "step": 3427 + }, + { + "epoch": 1.6208037825059103, + "grad_norm": 2.7164649963378906, + "learning_rate": 4.186380334067464e-06, + "loss": 0.5634, + "step": 3428 + }, + { + "epoch": 1.6212765957446809, + "grad_norm": 2.8299832344055176, + "learning_rate": 4.185919755519817e-06, + "loss": 0.5166, + "step": 3429 + }, + { + "epoch": 1.6217494089834514, + "grad_norm": 2.465848207473755, + "learning_rate": 4.18545907199734e-06, + "loss": 0.4696, + "step": 3430 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 2.407616376876831, + "learning_rate": 4.1849982835287175e-06, + "loss": 0.5111, + "step": 3431 + }, + { + "epoch": 1.622695035460993, + "grad_norm": 2.452146291732788, + "learning_rate": 4.184537390142639e-06, + "loss": 0.4574, + "step": 3432 + }, + { + "epoch": 1.6231678486997636, + "grad_norm": 2.653071165084839, + "learning_rate": 4.1840763918678055e-06, + "loss": 0.5611, + "step": 3433 + }, + { + "epoch": 1.6236406619385342, + "grad_norm": 2.5920350551605225, + "learning_rate": 4.183615288732919e-06, + "loss": 0.5437, + "step": 3434 + }, + { + "epoch": 1.624113475177305, + "grad_norm": 2.782900810241699, + "learning_rate": 4.18315408076669e-06, + "loss": 0.5824, + "step": 3435 + }, + { + "epoch": 1.6245862884160758, + "grad_norm": 2.8769774436950684, + "learning_rate": 4.1826927679978365e-06, + "loss": 0.5271, + "step": 3436 + }, + { + "epoch": 1.6250591016548463, + "grad_norm": 2.488598585128784, + "learning_rate": 4.182231350455084e-06, + "loss": 0.4684, + "step": 3437 + }, + { + "epoch": 1.625531914893617, + "grad_norm": 2.6472036838531494, + "learning_rate": 4.181769828167161e-06, + "loss": 0.5372, + "step": 3438 + }, + { + "epoch": 1.6260047281323877, + "grad_norm": 2.6498794555664062, + "learning_rate": 4.1813082011628045e-06, + "loss": 0.4805, + "step": 3439 + }, + { + "epoch": 1.6264775413711585, + "grad_norm": 2.5386533737182617, + "learning_rate": 4.1808464694707595e-06, + "loss": 0.5015, + "step": 3440 + }, + { + "epoch": 1.626950354609929, + "grad_norm": 2.8812551498413086, + "learning_rate": 4.180384633119775e-06, + "loss": 0.5225, + "step": 3441 + }, + { + "epoch": 1.6274231678486997, + "grad_norm": 2.870124578475952, + "learning_rate": 4.179922692138609e-06, + "loss": 0.537, + "step": 3442 + }, + { + "epoch": 1.6278959810874705, + "grad_norm": 2.5759785175323486, + "learning_rate": 4.179460646556021e-06, + "loss": 0.5142, + "step": 3443 + }, + { + "epoch": 1.6283687943262413, + "grad_norm": 2.629347324371338, + "learning_rate": 4.1789984964007836e-06, + "loss": 0.5007, + "step": 3444 + }, + { + "epoch": 1.6288416075650118, + "grad_norm": 2.751128673553467, + "learning_rate": 4.178536241701672e-06, + "loss": 0.5677, + "step": 3445 + }, + { + "epoch": 1.6293144208037824, + "grad_norm": 
2.7582364082336426, + "learning_rate": 4.178073882487469e-06, + "loss": 0.499, + "step": 3446 + }, + { + "epoch": 1.6297872340425532, + "grad_norm": 3.136711359024048, + "learning_rate": 4.177611418786963e-06, + "loss": 0.5294, + "step": 3447 + }, + { + "epoch": 1.630260047281324, + "grad_norm": 2.7363100051879883, + "learning_rate": 4.17714885062895e-06, + "loss": 0.5264, + "step": 3448 + }, + { + "epoch": 1.6307328605200946, + "grad_norm": 2.7305946350097656, + "learning_rate": 4.176686178042233e-06, + "loss": 0.5235, + "step": 3449 + }, + { + "epoch": 1.6312056737588652, + "grad_norm": 2.6500556468963623, + "learning_rate": 4.176223401055619e-06, + "loss": 0.5463, + "step": 3450 + }, + { + "epoch": 1.631678486997636, + "grad_norm": 2.756321907043457, + "learning_rate": 4.175760519697924e-06, + "loss": 0.545, + "step": 3451 + }, + { + "epoch": 1.6321513002364068, + "grad_norm": 2.6234960556030273, + "learning_rate": 4.17529753399797e-06, + "loss": 0.4927, + "step": 3452 + }, + { + "epoch": 1.6326241134751773, + "grad_norm": 2.6358842849731445, + "learning_rate": 4.174834443984584e-06, + "loss": 0.5445, + "step": 3453 + }, + { + "epoch": 1.633096926713948, + "grad_norm": 2.541147470474243, + "learning_rate": 4.174371249686601e-06, + "loss": 0.4691, + "step": 3454 + }, + { + "epoch": 1.6335697399527187, + "grad_norm": 2.566981077194214, + "learning_rate": 4.173907951132863e-06, + "loss": 0.4932, + "step": 3455 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 2.670940399169922, + "learning_rate": 4.173444548352216e-06, + "loss": 0.4979, + "step": 3456 + }, + { + "epoch": 1.63451536643026, + "grad_norm": 2.5440268516540527, + "learning_rate": 4.172981041373515e-06, + "loss": 0.4716, + "step": 3457 + }, + { + "epoch": 1.6349881796690307, + "grad_norm": 2.3801631927490234, + "learning_rate": 4.17251743022562e-06, + "loss": 0.5126, + "step": 3458 + }, + { + "epoch": 1.6354609929078014, + "grad_norm": 2.5051121711730957, + "learning_rate": 4.1720537149373985e-06, + "loss": 0.4964, + "step": 3459 + }, + { + "epoch": 1.6359338061465722, + "grad_norm": 3.5521697998046875, + "learning_rate": 4.171589895537724e-06, + "loss": 0.5447, + "step": 3460 + }, + { + "epoch": 1.6364066193853428, + "grad_norm": 2.6041572093963623, + "learning_rate": 4.171125972055477e-06, + "loss": 0.4637, + "step": 3461 + }, + { + "epoch": 1.6368794326241134, + "grad_norm": 2.2297258377075195, + "learning_rate": 4.170661944519543e-06, + "loss": 0.4702, + "step": 3462 + }, + { + "epoch": 1.6373522458628842, + "grad_norm": 2.6764535903930664, + "learning_rate": 4.170197812958815e-06, + "loss": 0.5111, + "step": 3463 + }, + { + "epoch": 1.637825059101655, + "grad_norm": 2.86892032623291, + "learning_rate": 4.169733577402193e-06, + "loss": 0.5437, + "step": 3464 + }, + { + "epoch": 1.6382978723404256, + "grad_norm": 2.9007070064544678, + "learning_rate": 4.1692692378785825e-06, + "loss": 0.5425, + "step": 3465 + }, + { + "epoch": 1.6387706855791961, + "grad_norm": 2.5902905464172363, + "learning_rate": 4.168804794416896e-06, + "loss": 0.5252, + "step": 3466 + }, + { + "epoch": 1.639243498817967, + "grad_norm": 2.821183681488037, + "learning_rate": 4.168340247046053e-06, + "loss": 0.5265, + "step": 3467 + }, + { + "epoch": 1.6397163120567377, + "grad_norm": 2.7928314208984375, + "learning_rate": 4.167875595794978e-06, + "loss": 0.5151, + "step": 3468 + }, + { + "epoch": 1.6401891252955083, + "grad_norm": 2.3130412101745605, + "learning_rate": 4.167410840692603e-06, + "loss": 0.4941, + "step": 3469 + }, + { + "epoch": 
1.6406619385342789, + "grad_norm": 2.6078619956970215, + "learning_rate": 4.1669459817678655e-06, + "loss": 0.493, + "step": 3470 + }, + { + "epoch": 1.6411347517730497, + "grad_norm": 2.5335731506347656, + "learning_rate": 4.166481019049712e-06, + "loss": 0.4969, + "step": 3471 + }, + { + "epoch": 1.6416075650118205, + "grad_norm": 2.8181469440460205, + "learning_rate": 4.166015952567093e-06, + "loss": 0.5062, + "step": 3472 + }, + { + "epoch": 1.642080378250591, + "grad_norm": 2.7256782054901123, + "learning_rate": 4.165550782348966e-06, + "loss": 0.5397, + "step": 3473 + }, + { + "epoch": 1.6425531914893616, + "grad_norm": 2.284345865249634, + "learning_rate": 4.1650855084242946e-06, + "loss": 0.4448, + "step": 3474 + }, + { + "epoch": 1.6430260047281324, + "grad_norm": 3.0383145809173584, + "learning_rate": 4.164620130822049e-06, + "loss": 0.5873, + "step": 3475 + }, + { + "epoch": 1.6434988179669032, + "grad_norm": 2.754448652267456, + "learning_rate": 4.1641546495712085e-06, + "loss": 0.4852, + "step": 3476 + }, + { + "epoch": 1.6439716312056738, + "grad_norm": 2.6820101737976074, + "learning_rate": 4.1636890647007535e-06, + "loss": 0.5325, + "step": 3477 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 2.6396398544311523, + "learning_rate": 4.163223376239676e-06, + "loss": 0.466, + "step": 3478 + }, + { + "epoch": 1.6449172576832152, + "grad_norm": 2.395049810409546, + "learning_rate": 4.162757584216972e-06, + "loss": 0.4531, + "step": 3479 + }, + { + "epoch": 1.645390070921986, + "grad_norm": 2.596670627593994, + "learning_rate": 4.162291688661645e-06, + "loss": 0.5207, + "step": 3480 + }, + { + "epoch": 1.6458628841607565, + "grad_norm": 2.4391872882843018, + "learning_rate": 4.161825689602703e-06, + "loss": 0.5133, + "step": 3481 + }, + { + "epoch": 1.6463356973995271, + "grad_norm": 2.6169841289520264, + "learning_rate": 4.161359587069162e-06, + "loss": 0.5096, + "step": 3482 + }, + { + "epoch": 1.646808510638298, + "grad_norm": 2.634089946746826, + "learning_rate": 4.1608933810900445e-06, + "loss": 0.4921, + "step": 3483 + }, + { + "epoch": 1.6472813238770687, + "grad_norm": 2.815877914428711, + "learning_rate": 4.160427071694379e-06, + "loss": 0.5045, + "step": 3484 + }, + { + "epoch": 1.6477541371158393, + "grad_norm": 2.417525053024292, + "learning_rate": 4.159960658911199e-06, + "loss": 0.4997, + "step": 3485 + }, + { + "epoch": 1.6482269503546099, + "grad_norm": 2.5713605880737305, + "learning_rate": 4.15949414276955e-06, + "loss": 0.5246, + "step": 3486 + }, + { + "epoch": 1.6486997635933807, + "grad_norm": 3.49833607673645, + "learning_rate": 4.159027523298475e-06, + "loss": 0.4901, + "step": 3487 + }, + { + "epoch": 1.6491725768321515, + "grad_norm": 2.985464334487915, + "learning_rate": 4.158560800527033e-06, + "loss": 0.5726, + "step": 3488 + }, + { + "epoch": 1.649645390070922, + "grad_norm": 2.72745680809021, + "learning_rate": 4.158093974484282e-06, + "loss": 0.5119, + "step": 3489 + }, + { + "epoch": 1.6501182033096926, + "grad_norm": 2.4885571002960205, + "learning_rate": 4.157627045199289e-06, + "loss": 0.4838, + "step": 3490 + }, + { + "epoch": 1.6505910165484634, + "grad_norm": 2.7622628211975098, + "learning_rate": 4.157160012701128e-06, + "loss": 0.5269, + "step": 3491 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 2.615122079849243, + "learning_rate": 4.156692877018879e-06, + "loss": 0.5501, + "step": 3492 + }, + { + "epoch": 1.6515366430260048, + "grad_norm": 2.827753782272339, + "learning_rate": 4.156225638181631e-06, + "loss": 0.5452, + 
"step": 3493 + }, + { + "epoch": 1.6520094562647754, + "grad_norm": 2.724820137023926, + "learning_rate": 4.155758296218474e-06, + "loss": 0.5155, + "step": 3494 + }, + { + "epoch": 1.6524822695035462, + "grad_norm": 2.5806174278259277, + "learning_rate": 4.155290851158508e-06, + "loss": 0.5292, + "step": 3495 + }, + { + "epoch": 1.652955082742317, + "grad_norm": 2.5655179023742676, + "learning_rate": 4.154823303030838e-06, + "loss": 0.4959, + "step": 3496 + }, + { + "epoch": 1.6534278959810875, + "grad_norm": 2.656548261642456, + "learning_rate": 4.154355651864579e-06, + "loss": 0.5703, + "step": 3497 + }, + { + "epoch": 1.653900709219858, + "grad_norm": 2.9085004329681396, + "learning_rate": 4.153887897688847e-06, + "loss": 0.5061, + "step": 3498 + }, + { + "epoch": 1.654373522458629, + "grad_norm": 2.608010768890381, + "learning_rate": 4.1534200405327665e-06, + "loss": 0.5165, + "step": 3499 + }, + { + "epoch": 1.6548463356973995, + "grad_norm": 2.600463628768921, + "learning_rate": 4.152952080425471e-06, + "loss": 0.4946, + "step": 3500 + }, + { + "epoch": 1.65531914893617, + "grad_norm": 2.5561563968658447, + "learning_rate": 4.152484017396098e-06, + "loss": 0.4804, + "step": 3501 + }, + { + "epoch": 1.6557919621749408, + "grad_norm": 2.788594961166382, + "learning_rate": 4.152015851473791e-06, + "loss": 0.5635, + "step": 3502 + }, + { + "epoch": 1.6562647754137116, + "grad_norm": 2.693302631378174, + "learning_rate": 4.151547582687699e-06, + "loss": 0.5139, + "step": 3503 + }, + { + "epoch": 1.6567375886524822, + "grad_norm": 2.7887485027313232, + "learning_rate": 4.1510792110669825e-06, + "loss": 0.4952, + "step": 3504 + }, + { + "epoch": 1.6572104018912528, + "grad_norm": 2.8982298374176025, + "learning_rate": 4.150610736640803e-06, + "loss": 0.4136, + "step": 3505 + }, + { + "epoch": 1.6576832151300236, + "grad_norm": 2.7569408416748047, + "learning_rate": 4.150142159438331e-06, + "loss": 0.5272, + "step": 3506 + }, + { + "epoch": 1.6581560283687944, + "grad_norm": 2.531648874282837, + "learning_rate": 4.149673479488742e-06, + "loss": 0.5016, + "step": 3507 + }, + { + "epoch": 1.658628841607565, + "grad_norm": 2.7706353664398193, + "learning_rate": 4.149204696821219e-06, + "loss": 0.5512, + "step": 3508 + }, + { + "epoch": 1.6591016548463355, + "grad_norm": 2.7307450771331787, + "learning_rate": 4.148735811464951e-06, + "loss": 0.4968, + "step": 3509 + }, + { + "epoch": 1.6595744680851063, + "grad_norm": 3.0097429752349854, + "learning_rate": 4.1482668234491335e-06, + "loss": 0.4797, + "step": 3510 + }, + { + "epoch": 1.6600472813238771, + "grad_norm": 2.6045308113098145, + "learning_rate": 4.147797732802969e-06, + "loss": 0.5496, + "step": 3511 + }, + { + "epoch": 1.6605200945626477, + "grad_norm": 2.702061176300049, + "learning_rate": 4.147328539555664e-06, + "loss": 0.5302, + "step": 3512 + }, + { + "epoch": 1.6609929078014183, + "grad_norm": 3.3724892139434814, + "learning_rate": 4.1468592437364356e-06, + "loss": 0.5124, + "step": 3513 + }, + { + "epoch": 1.661465721040189, + "grad_norm": 2.5117242336273193, + "learning_rate": 4.146389845374502e-06, + "loss": 0.4953, + "step": 3514 + }, + { + "epoch": 1.6619385342789599, + "grad_norm": 2.86547589302063, + "learning_rate": 4.145920344499092e-06, + "loss": 0.5337, + "step": 3515 + }, + { + "epoch": 1.6624113475177305, + "grad_norm": 2.745149850845337, + "learning_rate": 4.14545074113944e-06, + "loss": 0.5187, + "step": 3516 + }, + { + "epoch": 1.662884160756501, + "grad_norm": 2.5560994148254395, + "learning_rate": 
4.1449810353247855e-06, + "loss": 0.5183, + "step": 3517 + }, + { + "epoch": 1.6633569739952718, + "grad_norm": 2.2318122386932373, + "learning_rate": 4.144511227084374e-06, + "loss": 0.4452, + "step": 3518 + }, + { + "epoch": 1.6638297872340426, + "grad_norm": 2.6980903148651123, + "learning_rate": 4.14404131644746e-06, + "loss": 0.4974, + "step": 3519 + }, + { + "epoch": 1.6643026004728132, + "grad_norm": 2.6875357627868652, + "learning_rate": 4.1435713034433025e-06, + "loss": 0.4582, + "step": 3520 + }, + { + "epoch": 1.6647754137115838, + "grad_norm": 2.9430019855499268, + "learning_rate": 4.143101188101166e-06, + "loss": 0.5004, + "step": 3521 + }, + { + "epoch": 1.6652482269503546, + "grad_norm": 2.4447221755981445, + "learning_rate": 4.142630970450323e-06, + "loss": 0.5436, + "step": 3522 + }, + { + "epoch": 1.6657210401891254, + "grad_norm": 2.571023941040039, + "learning_rate": 4.142160650520053e-06, + "loss": 0.5307, + "step": 3523 + }, + { + "epoch": 1.666193853427896, + "grad_norm": 2.9725306034088135, + "learning_rate": 4.14169022833964e-06, + "loss": 0.5918, + "step": 3524 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.5958926677703857, + "learning_rate": 4.141219703938375e-06, + "loss": 0.5036, + "step": 3525 + }, + { + "epoch": 1.6671394799054373, + "grad_norm": 2.935788631439209, + "learning_rate": 4.140749077345556e-06, + "loss": 0.5773, + "step": 3526 + }, + { + "epoch": 1.6676122931442081, + "grad_norm": 2.5460526943206787, + "learning_rate": 4.140278348590485e-06, + "loss": 0.4762, + "step": 3527 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 2.5729143619537354, + "learning_rate": 4.139807517702475e-06, + "loss": 0.5515, + "step": 3528 + }, + { + "epoch": 1.6685579196217493, + "grad_norm": 2.4377381801605225, + "learning_rate": 4.13933658471084e-06, + "loss": 0.5383, + "step": 3529 + }, + { + "epoch": 1.66903073286052, + "grad_norm": 2.6284425258636475, + "learning_rate": 4.138865549644905e-06, + "loss": 0.5396, + "step": 3530 + }, + { + "epoch": 1.6695035460992909, + "grad_norm": 2.857250928878784, + "learning_rate": 4.138394412533998e-06, + "loss": 0.5861, + "step": 3531 + }, + { + "epoch": 1.6699763593380614, + "grad_norm": 2.9226012229919434, + "learning_rate": 4.137923173407456e-06, + "loss": 0.5262, + "step": 3532 + }, + { + "epoch": 1.670449172576832, + "grad_norm": 4.839131832122803, + "learning_rate": 4.137451832294619e-06, + "loss": 0.651, + "step": 3533 + }, + { + "epoch": 1.6709219858156028, + "grad_norm": 2.4727771282196045, + "learning_rate": 4.1369803892248375e-06, + "loss": 0.5149, + "step": 3534 + }, + { + "epoch": 1.6713947990543736, + "grad_norm": 2.5391688346862793, + "learning_rate": 4.1365088442274635e-06, + "loss": 0.4907, + "step": 3535 + }, + { + "epoch": 1.6718676122931442, + "grad_norm": 2.5168209075927734, + "learning_rate": 4.136037197331862e-06, + "loss": 0.5091, + "step": 3536 + }, + { + "epoch": 1.6723404255319148, + "grad_norm": 2.6278600692749023, + "learning_rate": 4.135565448567396e-06, + "loss": 0.4357, + "step": 3537 + }, + { + "epoch": 1.6728132387706856, + "grad_norm": 2.835184097290039, + "learning_rate": 4.135093597963441e-06, + "loss": 0.4786, + "step": 3538 + }, + { + "epoch": 1.6732860520094563, + "grad_norm": 2.385328531265259, + "learning_rate": 4.134621645549379e-06, + "loss": 0.4849, + "step": 3539 + }, + { + "epoch": 1.673758865248227, + "grad_norm": 2.6504149436950684, + "learning_rate": 4.134149591354593e-06, + "loss": 0.6037, + "step": 3540 + }, + { + "epoch": 1.6742316784869975, + "grad_norm": 
2.945634126663208, + "learning_rate": 4.1336774354084786e-06, + "loss": 0.532, + "step": 3541 + }, + { + "epoch": 1.6747044917257683, + "grad_norm": 2.8373215198516846, + "learning_rate": 4.133205177740434e-06, + "loss": 0.5138, + "step": 3542 + }, + { + "epoch": 1.675177304964539, + "grad_norm": 2.6616621017456055, + "learning_rate": 4.1327328183798634e-06, + "loss": 0.5543, + "step": 3543 + }, + { + "epoch": 1.6756501182033097, + "grad_norm": 3.0843071937561035, + "learning_rate": 4.13226035735618e-06, + "loss": 0.6585, + "step": 3544 + }, + { + "epoch": 1.6761229314420802, + "grad_norm": 2.2214272022247314, + "learning_rate": 4.131787794698802e-06, + "loss": 0.5413, + "step": 3545 + }, + { + "epoch": 1.676595744680851, + "grad_norm": 2.4515018463134766, + "learning_rate": 4.131315130437152e-06, + "loss": 0.4966, + "step": 3546 + }, + { + "epoch": 1.6770685579196218, + "grad_norm": 2.647414207458496, + "learning_rate": 4.130842364600663e-06, + "loss": 0.5401, + "step": 3547 + }, + { + "epoch": 1.6775413711583924, + "grad_norm": 2.648941993713379, + "learning_rate": 4.13036949721877e-06, + "loss": 0.4796, + "step": 3548 + }, + { + "epoch": 1.678014184397163, + "grad_norm": 2.7835679054260254, + "learning_rate": 4.129896528320919e-06, + "loss": 0.5653, + "step": 3549 + }, + { + "epoch": 1.6784869976359338, + "grad_norm": 2.995964288711548, + "learning_rate": 4.129423457936556e-06, + "loss": 0.4999, + "step": 3550 + }, + { + "epoch": 1.6789598108747046, + "grad_norm": 2.5980007648468018, + "learning_rate": 4.1289502860951405e-06, + "loss": 0.5177, + "step": 3551 + }, + { + "epoch": 1.6794326241134752, + "grad_norm": 2.442254066467285, + "learning_rate": 4.128477012826133e-06, + "loss": 0.5062, + "step": 3552 + }, + { + "epoch": 1.6799054373522457, + "grad_norm": 2.3007538318634033, + "learning_rate": 4.1280036381590025e-06, + "loss": 0.5029, + "step": 3553 + }, + { + "epoch": 1.6803782505910165, + "grad_norm": 2.4169347286224365, + "learning_rate": 4.1275301621232245e-06, + "loss": 0.515, + "step": 3554 + }, + { + "epoch": 1.6808510638297873, + "grad_norm": 2.6456379890441895, + "learning_rate": 4.127056584748279e-06, + "loss": 0.5343, + "step": 3555 + }, + { + "epoch": 1.681323877068558, + "grad_norm": 2.6406595706939697, + "learning_rate": 4.1265829060636546e-06, + "loss": 0.5047, + "step": 3556 + }, + { + "epoch": 1.6817966903073285, + "grad_norm": 2.9344475269317627, + "learning_rate": 4.126109126098846e-06, + "loss": 0.5501, + "step": 3557 + }, + { + "epoch": 1.6822695035460993, + "grad_norm": 2.3292455673217773, + "learning_rate": 4.125635244883351e-06, + "loss": 0.463, + "step": 3558 + }, + { + "epoch": 1.68274231678487, + "grad_norm": 2.4150657653808594, + "learning_rate": 4.125161262446677e-06, + "loss": 0.4802, + "step": 3559 + }, + { + "epoch": 1.6832151300236406, + "grad_norm": 2.604292392730713, + "learning_rate": 4.124687178818339e-06, + "loss": 0.5683, + "step": 3560 + }, + { + "epoch": 1.6836879432624112, + "grad_norm": 2.5676791667938232, + "learning_rate": 4.1242129940278544e-06, + "loss": 0.5519, + "step": 3561 + }, + { + "epoch": 1.684160756501182, + "grad_norm": 3.078514814376831, + "learning_rate": 4.123738708104748e-06, + "loss": 0.5194, + "step": 3562 + }, + { + "epoch": 1.6846335697399528, + "grad_norm": 2.893577814102173, + "learning_rate": 4.123264321078552e-06, + "loss": 0.5107, + "step": 3563 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 2.772413730621338, + "learning_rate": 4.122789832978804e-06, + "loss": 0.6147, + "step": 3564 + }, + { + "epoch": 
1.685579196217494, + "grad_norm": 2.5804643630981445, + "learning_rate": 4.12231524383505e-06, + "loss": 0.5057, + "step": 3565 + }, + { + "epoch": 1.6860520094562648, + "grad_norm": 2.599571466445923, + "learning_rate": 4.121840553676839e-06, + "loss": 0.5591, + "step": 3566 + }, + { + "epoch": 1.6865248226950356, + "grad_norm": 2.9124577045440674, + "learning_rate": 4.1213657625337275e-06, + "loss": 0.565, + "step": 3567 + }, + { + "epoch": 1.6869976359338061, + "grad_norm": 2.6582155227661133, + "learning_rate": 4.120890870435281e-06, + "loss": 0.4607, + "step": 3568 + }, + { + "epoch": 1.6874704491725767, + "grad_norm": 2.929227590560913, + "learning_rate": 4.120415877411066e-06, + "loss": 0.5705, + "step": 3569 + }, + { + "epoch": 1.6879432624113475, + "grad_norm": 2.4443247318267822, + "learning_rate": 4.11994078349066e-06, + "loss": 0.4592, + "step": 3570 + }, + { + "epoch": 1.6884160756501183, + "grad_norm": 2.4799163341522217, + "learning_rate": 4.119465588703645e-06, + "loss": 0.5361, + "step": 3571 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 2.9408936500549316, + "learning_rate": 4.1189902930796085e-06, + "loss": 0.5347, + "step": 3572 + }, + { + "epoch": 1.6893617021276595, + "grad_norm": 3.3348076343536377, + "learning_rate": 4.118514896648146e-06, + "loss": 0.5612, + "step": 3573 + }, + { + "epoch": 1.6898345153664303, + "grad_norm": 2.764889717102051, + "learning_rate": 4.118039399438857e-06, + "loss": 0.4745, + "step": 3574 + }, + { + "epoch": 1.690307328605201, + "grad_norm": 2.7023751735687256, + "learning_rate": 4.11756380148135e-06, + "loss": 0.5106, + "step": 3575 + }, + { + "epoch": 1.6907801418439716, + "grad_norm": 2.8816208839416504, + "learning_rate": 4.117088102805238e-06, + "loss": 0.6016, + "step": 3576 + }, + { + "epoch": 1.6912529550827422, + "grad_norm": 2.215733289718628, + "learning_rate": 4.11661230344014e-06, + "loss": 0.4404, + "step": 3577 + }, + { + "epoch": 1.691725768321513, + "grad_norm": 2.8190999031066895, + "learning_rate": 4.116136403415683e-06, + "loss": 0.5038, + "step": 3578 + }, + { + "epoch": 1.6921985815602838, + "grad_norm": 2.616424083709717, + "learning_rate": 4.115660402761499e-06, + "loss": 0.5493, + "step": 3579 + }, + { + "epoch": 1.6926713947990544, + "grad_norm": 2.7738113403320312, + "learning_rate": 4.115184301507226e-06, + "loss": 0.5416, + "step": 3580 + }, + { + "epoch": 1.693144208037825, + "grad_norm": 2.4793593883514404, + "learning_rate": 4.114708099682509e-06, + "loss": 0.4526, + "step": 3581 + }, + { + "epoch": 1.6936170212765957, + "grad_norm": 2.390652894973755, + "learning_rate": 4.114231797316999e-06, + "loss": 0.4908, + "step": 3582 + }, + { + "epoch": 1.6940898345153665, + "grad_norm": 2.513197660446167, + "learning_rate": 4.113755394440352e-06, + "loss": 0.4738, + "step": 3583 + }, + { + "epoch": 1.6945626477541371, + "grad_norm": 2.504497766494751, + "learning_rate": 4.113278891082234e-06, + "loss": 0.4661, + "step": 3584 + }, + { + "epoch": 1.6950354609929077, + "grad_norm": 2.4966917037963867, + "learning_rate": 4.112802287272314e-06, + "loss": 0.4979, + "step": 3585 + }, + { + "epoch": 1.6955082742316785, + "grad_norm": 2.3129689693450928, + "learning_rate": 4.112325583040265e-06, + "loss": 0.4933, + "step": 3586 + }, + { + "epoch": 1.6959810874704493, + "grad_norm": 2.822136878967285, + "learning_rate": 4.111848778415774e-06, + "loss": 0.5087, + "step": 3587 + }, + { + "epoch": 1.6964539007092199, + "grad_norm": 2.5181210041046143, + "learning_rate": 4.111371873428527e-06, + "loss": 0.4836, + 
"step": 3588 + }, + { + "epoch": 1.6969267139479904, + "grad_norm": 2.7564687728881836, + "learning_rate": 4.110894868108218e-06, + "loss": 0.5224, + "step": 3589 + }, + { + "epoch": 1.6973995271867612, + "grad_norm": 2.424421787261963, + "learning_rate": 4.11041776248455e-06, + "loss": 0.4552, + "step": 3590 + }, + { + "epoch": 1.697872340425532, + "grad_norm": 2.7013823986053467, + "learning_rate": 4.10994055658723e-06, + "loss": 0.5535, + "step": 3591 + }, + { + "epoch": 1.6983451536643026, + "grad_norm": 2.5660946369171143, + "learning_rate": 4.10946325044597e-06, + "loss": 0.5351, + "step": 3592 + }, + { + "epoch": 1.6988179669030732, + "grad_norm": 2.5598108768463135, + "learning_rate": 4.10898584409049e-06, + "loss": 0.5246, + "step": 3593 + }, + { + "epoch": 1.699290780141844, + "grad_norm": 2.6318907737731934, + "learning_rate": 4.108508337550518e-06, + "loss": 0.5002, + "step": 3594 + }, + { + "epoch": 1.6997635933806148, + "grad_norm": 2.527099132537842, + "learning_rate": 4.108030730855784e-06, + "loss": 0.5366, + "step": 3595 + }, + { + "epoch": 1.7002364066193854, + "grad_norm": 2.8629603385925293, + "learning_rate": 4.107553024036029e-06, + "loss": 0.5742, + "step": 3596 + }, + { + "epoch": 1.700709219858156, + "grad_norm": 2.8084018230438232, + "learning_rate": 4.107075217120994e-06, + "loss": 0.5618, + "step": 3597 + }, + { + "epoch": 1.7011820330969267, + "grad_norm": 3.6470065116882324, + "learning_rate": 4.1065973101404325e-06, + "loss": 0.508, + "step": 3598 + }, + { + "epoch": 1.7016548463356975, + "grad_norm": 3.0332422256469727, + "learning_rate": 4.106119303124102e-06, + "loss": 0.51, + "step": 3599 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 2.4887590408325195, + "learning_rate": 4.105641196101765e-06, + "loss": 0.5109, + "step": 3600 + }, + { + "epoch": 1.7026004728132387, + "grad_norm": 2.6102066040039062, + "learning_rate": 4.105162989103191e-06, + "loss": 0.5278, + "step": 3601 + }, + { + "epoch": 1.7030732860520095, + "grad_norm": 2.771578073501587, + "learning_rate": 4.104684682158156e-06, + "loss": 0.498, + "step": 3602 + }, + { + "epoch": 1.7035460992907803, + "grad_norm": 2.5452702045440674, + "learning_rate": 4.1042062752964425e-06, + "loss": 0.4939, + "step": 3603 + }, + { + "epoch": 1.7040189125295508, + "grad_norm": 2.4287021160125732, + "learning_rate": 4.103727768547838e-06, + "loss": 0.4819, + "step": 3604 + }, + { + "epoch": 1.7044917257683214, + "grad_norm": 2.412280321121216, + "learning_rate": 4.103249161942138e-06, + "loss": 0.5196, + "step": 3605 + }, + { + "epoch": 1.7049645390070922, + "grad_norm": 2.8850717544555664, + "learning_rate": 4.102770455509142e-06, + "loss": 0.5724, + "step": 3606 + }, + { + "epoch": 1.705437352245863, + "grad_norm": 2.7979609966278076, + "learning_rate": 4.102291649278659e-06, + "loss": 0.5295, + "step": 3607 + }, + { + "epoch": 1.7059101654846336, + "grad_norm": 2.762238025665283, + "learning_rate": 4.1018127432805e-06, + "loss": 0.5166, + "step": 3608 + }, + { + "epoch": 1.7063829787234042, + "grad_norm": 2.921586513519287, + "learning_rate": 4.101333737544485e-06, + "loss": 0.5607, + "step": 3609 + }, + { + "epoch": 1.706855791962175, + "grad_norm": 3.001929998397827, + "learning_rate": 4.100854632100439e-06, + "loss": 0.6255, + "step": 3610 + }, + { + "epoch": 1.7073286052009458, + "grad_norm": 2.752713918685913, + "learning_rate": 4.100375426978196e-06, + "loss": 0.5732, + "step": 3611 + }, + { + "epoch": 1.7078014184397163, + "grad_norm": 2.6496472358703613, + "learning_rate": 
4.099896122207593e-06, + "loss": 0.5138, + "step": 3612 + }, + { + "epoch": 1.708274231678487, + "grad_norm": 3.0079452991485596, + "learning_rate": 4.099416717818473e-06, + "loss": 0.5746, + "step": 3613 + }, + { + "epoch": 1.7087470449172577, + "grad_norm": 2.5762360095977783, + "learning_rate": 4.098937213840687e-06, + "loss": 0.5308, + "step": 3614 + }, + { + "epoch": 1.7092198581560285, + "grad_norm": 2.6026158332824707, + "learning_rate": 4.098457610304092e-06, + "loss": 0.4857, + "step": 3615 + }, + { + "epoch": 1.709692671394799, + "grad_norm": 2.587583541870117, + "learning_rate": 4.097977907238551e-06, + "loss": 0.4591, + "step": 3616 + }, + { + "epoch": 1.7101654846335697, + "grad_norm": 2.6996991634368896, + "learning_rate": 4.097498104673932e-06, + "loss": 0.5298, + "step": 3617 + }, + { + "epoch": 1.7106382978723405, + "grad_norm": 2.600029945373535, + "learning_rate": 4.097018202640111e-06, + "loss": 0.4726, + "step": 3618 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 2.8261220455169678, + "learning_rate": 4.096538201166969e-06, + "loss": 0.5242, + "step": 3619 + }, + { + "epoch": 1.7115839243498818, + "grad_norm": 3.053027629852295, + "learning_rate": 4.096058100284394e-06, + "loss": 0.5568, + "step": 3620 + }, + { + "epoch": 1.7120567375886524, + "grad_norm": 2.9638442993164062, + "learning_rate": 4.0955779000222805e-06, + "loss": 0.5325, + "step": 3621 + }, + { + "epoch": 1.7125295508274232, + "grad_norm": 2.731095790863037, + "learning_rate": 4.095097600410527e-06, + "loss": 0.4733, + "step": 3622 + }, + { + "epoch": 1.713002364066194, + "grad_norm": 2.632490873336792, + "learning_rate": 4.09461720147904e-06, + "loss": 0.5253, + "step": 3623 + }, + { + "epoch": 1.7134751773049646, + "grad_norm": 2.847689390182495, + "learning_rate": 4.094136703257732e-06, + "loss": 0.57, + "step": 3624 + }, + { + "epoch": 1.7139479905437351, + "grad_norm": 3.1078696250915527, + "learning_rate": 4.0936561057765215e-06, + "loss": 0.5368, + "step": 3625 + }, + { + "epoch": 1.714420803782506, + "grad_norm": 2.696349620819092, + "learning_rate": 4.0931754090653334e-06, + "loss": 0.491, + "step": 3626 + }, + { + "epoch": 1.7148936170212767, + "grad_norm": 2.712958812713623, + "learning_rate": 4.092694613154099e-06, + "loss": 0.5768, + "step": 3627 + }, + { + "epoch": 1.7153664302600473, + "grad_norm": 2.5421478748321533, + "learning_rate": 4.092213718072754e-06, + "loss": 0.4839, + "step": 3628 + }, + { + "epoch": 1.715839243498818, + "grad_norm": 2.5176162719726562, + "learning_rate": 4.091732723851243e-06, + "loss": 0.5049, + "step": 3629 + }, + { + "epoch": 1.7163120567375887, + "grad_norm": 2.642185926437378, + "learning_rate": 4.091251630519514e-06, + "loss": 0.589, + "step": 3630 + }, + { + "epoch": 1.7167848699763595, + "grad_norm": 2.587348461151123, + "learning_rate": 4.0907704381075245e-06, + "loss": 0.5281, + "step": 3631 + }, + { + "epoch": 1.71725768321513, + "grad_norm": 2.4628195762634277, + "learning_rate": 4.090289146645234e-06, + "loss": 0.5592, + "step": 3632 + }, + { + "epoch": 1.7177304964539006, + "grad_norm": 2.2751028537750244, + "learning_rate": 4.0898077561626125e-06, + "loss": 0.502, + "step": 3633 + }, + { + "epoch": 1.7182033096926714, + "grad_norm": 2.7712769508361816, + "learning_rate": 4.089326266689632e-06, + "loss": 0.5143, + "step": 3634 + }, + { + "epoch": 1.7186761229314422, + "grad_norm": 2.5297727584838867, + "learning_rate": 4.088844678256275e-06, + "loss": 0.5035, + "step": 3635 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 
2.739130735397339, + "learning_rate": 4.088362990892527e-06, + "loss": 0.5959, + "step": 3636 + }, + { + "epoch": 1.7196217494089834, + "grad_norm": 2.3708314895629883, + "learning_rate": 4.08788120462838e-06, + "loss": 0.4796, + "step": 3637 + }, + { + "epoch": 1.7200945626477542, + "grad_norm": 2.7664241790771484, + "learning_rate": 4.087399319493832e-06, + "loss": 0.6052, + "step": 3638 + }, + { + "epoch": 1.720567375886525, + "grad_norm": 2.5900204181671143, + "learning_rate": 4.0869173355188895e-06, + "loss": 0.4955, + "step": 3639 + }, + { + "epoch": 1.7210401891252955, + "grad_norm": 2.6771862506866455, + "learning_rate": 4.0864352527335635e-06, + "loss": 0.4889, + "step": 3640 + }, + { + "epoch": 1.7215130023640661, + "grad_norm": 2.888479471206665, + "learning_rate": 4.085953071167871e-06, + "loss": 0.5719, + "step": 3641 + }, + { + "epoch": 1.721985815602837, + "grad_norm": 2.5967187881469727, + "learning_rate": 4.085470790851833e-06, + "loss": 0.4959, + "step": 3642 + }, + { + "epoch": 1.7224586288416077, + "grad_norm": 2.5317695140838623, + "learning_rate": 4.084988411815483e-06, + "loss": 0.4596, + "step": 3643 + }, + { + "epoch": 1.7229314420803783, + "grad_norm": 2.6531455516815186, + "learning_rate": 4.084505934088853e-06, + "loss": 0.5346, + "step": 3644 + }, + { + "epoch": 1.7234042553191489, + "grad_norm": 2.6525208950042725, + "learning_rate": 4.084023357701987e-06, + "loss": 0.5178, + "step": 3645 + }, + { + "epoch": 1.7238770685579197, + "grad_norm": 2.461954116821289, + "learning_rate": 4.083540682684932e-06, + "loss": 0.4802, + "step": 3646 + }, + { + "epoch": 1.7243498817966905, + "grad_norm": 2.794696807861328, + "learning_rate": 4.083057909067743e-06, + "loss": 0.5148, + "step": 3647 + }, + { + "epoch": 1.724822695035461, + "grad_norm": 2.867572546005249, + "learning_rate": 4.082575036880479e-06, + "loss": 0.5352, + "step": 3648 + }, + { + "epoch": 1.7252955082742316, + "grad_norm": 2.642820358276367, + "learning_rate": 4.082092066153207e-06, + "loss": 0.4652, + "step": 3649 + }, + { + "epoch": 1.7257683215130024, + "grad_norm": 2.782142400741577, + "learning_rate": 4.081608996915999e-06, + "loss": 0.5591, + "step": 3650 + }, + { + "epoch": 1.7262411347517732, + "grad_norm": 2.327331304550171, + "learning_rate": 4.081125829198934e-06, + "loss": 0.4339, + "step": 3651 + }, + { + "epoch": 1.7267139479905438, + "grad_norm": 2.7959988117218018, + "learning_rate": 4.0806425630320965e-06, + "loss": 0.5783, + "step": 3652 + }, + { + "epoch": 1.7271867612293144, + "grad_norm": 2.595053195953369, + "learning_rate": 4.080159198445578e-06, + "loss": 0.4602, + "step": 3653 + }, + { + "epoch": 1.7276595744680852, + "grad_norm": 3.0968129634857178, + "learning_rate": 4.079675735469475e-06, + "loss": 0.5775, + "step": 3654 + }, + { + "epoch": 1.728132387706856, + "grad_norm": 2.628044605255127, + "learning_rate": 4.07919217413389e-06, + "loss": 0.486, + "step": 3655 + }, + { + "epoch": 1.7286052009456265, + "grad_norm": 2.782799005508423, + "learning_rate": 4.078708514468933e-06, + "loss": 0.5282, + "step": 3656 + }, + { + "epoch": 1.729078014184397, + "grad_norm": 2.655365467071533, + "learning_rate": 4.0782247565047205e-06, + "loss": 0.4873, + "step": 3657 + }, + { + "epoch": 1.729550827423168, + "grad_norm": 2.9461584091186523, + "learning_rate": 4.077740900271371e-06, + "loss": 0.548, + "step": 3658 + }, + { + "epoch": 1.7300236406619387, + "grad_norm": 2.5094761848449707, + "learning_rate": 4.077256945799015e-06, + "loss": 0.5437, + "step": 3659 + }, + { + "epoch": 
1.7304964539007093, + "grad_norm": 2.555793285369873, + "learning_rate": 4.0767728931177845e-06, + "loss": 0.5268, + "step": 3660 + }, + { + "epoch": 1.7309692671394799, + "grad_norm": 2.4433486461639404, + "learning_rate": 4.07628874225782e-06, + "loss": 0.5211, + "step": 3661 + }, + { + "epoch": 1.7314420803782506, + "grad_norm": 2.365206003189087, + "learning_rate": 4.075804493249267e-06, + "loss": 0.5084, + "step": 3662 + }, + { + "epoch": 1.7319148936170212, + "grad_norm": 2.514305830001831, + "learning_rate": 4.075320146122278e-06, + "loss": 0.4693, + "step": 3663 + }, + { + "epoch": 1.7323877068557918, + "grad_norm": 2.9270083904266357, + "learning_rate": 4.074835700907012e-06, + "loss": 0.5724, + "step": 3664 + }, + { + "epoch": 1.7328605200945626, + "grad_norm": 2.938692569732666, + "learning_rate": 4.0743511576336315e-06, + "loss": 0.5361, + "step": 3665 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 3.1978867053985596, + "learning_rate": 4.073866516332307e-06, + "loss": 0.6277, + "step": 3666 + }, + { + "epoch": 1.733806146572104, + "grad_norm": 2.3477370738983154, + "learning_rate": 4.073381777033217e-06, + "loss": 0.5139, + "step": 3667 + }, + { + "epoch": 1.7342789598108745, + "grad_norm": 2.5954184532165527, + "learning_rate": 4.072896939766543e-06, + "loss": 0.537, + "step": 3668 + }, + { + "epoch": 1.7347517730496453, + "grad_norm": 2.8999998569488525, + "learning_rate": 4.072412004562472e-06, + "loss": 0.5486, + "step": 3669 + }, + { + "epoch": 1.7352245862884161, + "grad_norm": 2.7320556640625, + "learning_rate": 4.071926971451201e-06, + "loss": 0.6025, + "step": 3670 + }, + { + "epoch": 1.7356973995271867, + "grad_norm": 2.499234676361084, + "learning_rate": 4.0714418404629304e-06, + "loss": 0.456, + "step": 3671 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 2.485924243927002, + "learning_rate": 4.070956611627867e-06, + "loss": 0.5097, + "step": 3672 + }, + { + "epoch": 1.736643026004728, + "grad_norm": 2.513723373413086, + "learning_rate": 4.070471284976225e-06, + "loss": 0.4744, + "step": 3673 + }, + { + "epoch": 1.7371158392434989, + "grad_norm": 2.281977653503418, + "learning_rate": 4.06998586053822e-06, + "loss": 0.5124, + "step": 3674 + }, + { + "epoch": 1.7375886524822695, + "grad_norm": 2.3683905601501465, + "learning_rate": 4.069500338344081e-06, + "loss": 0.4816, + "step": 3675 + }, + { + "epoch": 1.73806146572104, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.069014718424038e-06, + "loss": 0.5665, + "step": 3676 + }, + { + "epoch": 1.7385342789598108, + "grad_norm": 2.7308456897735596, + "learning_rate": 4.068529000808328e-06, + "loss": 0.534, + "step": 3677 + }, + { + "epoch": 1.7390070921985816, + "grad_norm": 2.788452625274658, + "learning_rate": 4.068043185527196e-06, + "loss": 0.5609, + "step": 3678 + }, + { + "epoch": 1.7394799054373522, + "grad_norm": 2.832368850708008, + "learning_rate": 4.067557272610889e-06, + "loss": 0.553, + "step": 3679 + }, + { + "epoch": 1.7399527186761228, + "grad_norm": 2.9987435340881348, + "learning_rate": 4.067071262089665e-06, + "loss": 0.5, + "step": 3680 + }, + { + "epoch": 1.7404255319148936, + "grad_norm": 3.04913067817688, + "learning_rate": 4.066585153993785e-06, + "loss": 0.5158, + "step": 3681 + }, + { + "epoch": 1.7408983451536644, + "grad_norm": 2.5177130699157715, + "learning_rate": 4.066098948353516e-06, + "loss": 0.4508, + "step": 3682 + }, + { + "epoch": 1.741371158392435, + "grad_norm": 2.8991222381591797, + "learning_rate": 4.065612645199133e-06, + "loss": 0.5268, + "step": 3683 
+ }, + { + "epoch": 1.7418439716312055, + "grad_norm": 2.4928159713745117, + "learning_rate": 4.0651262445609156e-06, + "loss": 0.5024, + "step": 3684 + }, + { + "epoch": 1.7423167848699763, + "grad_norm": 2.9737319946289062, + "learning_rate": 4.06463974646915e-06, + "loss": 0.5429, + "step": 3685 + }, + { + "epoch": 1.7427895981087471, + "grad_norm": 2.6485493183135986, + "learning_rate": 4.064153150954128e-06, + "loss": 0.5619, + "step": 3686 + }, + { + "epoch": 1.7432624113475177, + "grad_norm": 2.564861297607422, + "learning_rate": 4.063666458046148e-06, + "loss": 0.4878, + "step": 3687 + }, + { + "epoch": 1.7437352245862883, + "grad_norm": 2.6048383712768555, + "learning_rate": 4.063179667775514e-06, + "loss": 0.4836, + "step": 3688 + }, + { + "epoch": 1.744208037825059, + "grad_norm": 2.751638650894165, + "learning_rate": 4.062692780172536e-06, + "loss": 0.5558, + "step": 3689 + }, + { + "epoch": 1.7446808510638299, + "grad_norm": 3.3866634368896484, + "learning_rate": 4.062205795267531e-06, + "loss": 0.4825, + "step": 3690 + }, + { + "epoch": 1.7451536643026004, + "grad_norm": 3.0112249851226807, + "learning_rate": 4.061718713090822e-06, + "loss": 0.5732, + "step": 3691 + }, + { + "epoch": 1.745626477541371, + "grad_norm": 2.5889365673065186, + "learning_rate": 4.061231533672736e-06, + "loss": 0.483, + "step": 3692 + }, + { + "epoch": 1.7460992907801418, + "grad_norm": 2.624598979949951, + "learning_rate": 4.0607442570436085e-06, + "loss": 0.5706, + "step": 3693 + }, + { + "epoch": 1.7465721040189126, + "grad_norm": 2.9219250679016113, + "learning_rate": 4.060256883233779e-06, + "loss": 0.5153, + "step": 3694 + }, + { + "epoch": 1.7470449172576832, + "grad_norm": 3.2219252586364746, + "learning_rate": 4.059769412273595e-06, + "loss": 0.5184, + "step": 3695 + }, + { + "epoch": 1.7475177304964538, + "grad_norm": 2.890697956085205, + "learning_rate": 4.05928184419341e-06, + "loss": 0.5312, + "step": 3696 + }, + { + "epoch": 1.7479905437352246, + "grad_norm": 2.673809289932251, + "learning_rate": 4.0587941790235816e-06, + "loss": 0.4893, + "step": 3697 + }, + { + "epoch": 1.7484633569739954, + "grad_norm": 2.5339348316192627, + "learning_rate": 4.058306416794474e-06, + "loss": 0.5115, + "step": 3698 + }, + { + "epoch": 1.748936170212766, + "grad_norm": 2.6525840759277344, + "learning_rate": 4.05781855753646e-06, + "loss": 0.5256, + "step": 3699 + }, + { + "epoch": 1.7494089834515365, + "grad_norm": 2.7868754863739014, + "learning_rate": 4.057330601279914e-06, + "loss": 0.5227, + "step": 3700 + }, + { + "epoch": 1.7498817966903073, + "grad_norm": 3.1629884243011475, + "learning_rate": 4.056842548055221e-06, + "loss": 0.5617, + "step": 3701 + }, + { + "epoch": 1.750354609929078, + "grad_norm": 2.9350688457489014, + "learning_rate": 4.056354397892769e-06, + "loss": 0.4753, + "step": 3702 + }, + { + "epoch": 1.7508274231678487, + "grad_norm": 2.9688615798950195, + "learning_rate": 4.0558661508229525e-06, + "loss": 0.596, + "step": 3703 + }, + { + "epoch": 1.7513002364066192, + "grad_norm": 2.802205801010132, + "learning_rate": 4.055377806876174e-06, + "loss": 0.5793, + "step": 3704 + }, + { + "epoch": 1.75177304964539, + "grad_norm": 2.4933416843414307, + "learning_rate": 4.054889366082839e-06, + "loss": 0.4824, + "step": 3705 + }, + { + "epoch": 1.7522458628841608, + "grad_norm": 3.7904608249664307, + "learning_rate": 4.054400828473361e-06, + "loss": 0.5124, + "step": 3706 + }, + { + "epoch": 1.7527186761229314, + "grad_norm": 2.694838762283325, + "learning_rate": 4.053912194078159e-06, + 
"loss": 0.5604, + "step": 3707 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 2.3721256256103516, + "learning_rate": 4.053423462927659e-06, + "loss": 0.4978, + "step": 3708 + }, + { + "epoch": 1.7536643026004728, + "grad_norm": 2.718512773513794, + "learning_rate": 4.052934635052292e-06, + "loss": 0.5029, + "step": 3709 + }, + { + "epoch": 1.7541371158392436, + "grad_norm": 3.061558246612549, + "learning_rate": 4.052445710482493e-06, + "loss": 0.4886, + "step": 3710 + }, + { + "epoch": 1.7546099290780142, + "grad_norm": 3.0490729808807373, + "learning_rate": 4.051956689248709e-06, + "loss": 0.5363, + "step": 3711 + }, + { + "epoch": 1.7550827423167847, + "grad_norm": 2.611661672592163, + "learning_rate": 4.051467571381385e-06, + "loss": 0.5397, + "step": 3712 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 2.7829177379608154, + "learning_rate": 4.050978356910979e-06, + "loss": 0.4973, + "step": 3713 + }, + { + "epoch": 1.7560283687943263, + "grad_norm": 2.6228256225585938, + "learning_rate": 4.0504890458679525e-06, + "loss": 0.4551, + "step": 3714 + }, + { + "epoch": 1.756501182033097, + "grad_norm": 2.6801326274871826, + "learning_rate": 4.049999638282771e-06, + "loss": 0.5581, + "step": 3715 + }, + { + "epoch": 1.7569739952718675, + "grad_norm": 2.4476819038391113, + "learning_rate": 4.049510134185908e-06, + "loss": 0.5226, + "step": 3716 + }, + { + "epoch": 1.7574468085106383, + "grad_norm": 2.5661075115203857, + "learning_rate": 4.049020533607844e-06, + "loss": 0.5163, + "step": 3717 + }, + { + "epoch": 1.757919621749409, + "grad_norm": 2.3923349380493164, + "learning_rate": 4.048530836579065e-06, + "loss": 0.5076, + "step": 3718 + }, + { + "epoch": 1.7583924349881797, + "grad_norm": 2.8204405307769775, + "learning_rate": 4.0480410431300585e-06, + "loss": 0.5883, + "step": 3719 + }, + { + "epoch": 1.7588652482269502, + "grad_norm": 2.323107957839966, + "learning_rate": 4.047551153291325e-06, + "loss": 0.5116, + "step": 3720 + }, + { + "epoch": 1.759338061465721, + "grad_norm": 2.8306009769439697, + "learning_rate": 4.047061167093368e-06, + "loss": 0.5094, + "step": 3721 + }, + { + "epoch": 1.7598108747044918, + "grad_norm": 2.568765640258789, + "learning_rate": 4.046571084566695e-06, + "loss": 0.4725, + "step": 3722 + }, + { + "epoch": 1.7602836879432624, + "grad_norm": 2.7212061882019043, + "learning_rate": 4.046080905741822e-06, + "loss": 0.4741, + "step": 3723 + }, + { + "epoch": 1.760756501182033, + "grad_norm": 2.802917003631592, + "learning_rate": 4.04559063064927e-06, + "loss": 0.5691, + "step": 3724 + }, + { + "epoch": 1.7612293144208038, + "grad_norm": 3.1044139862060547, + "learning_rate": 4.0451002593195675e-06, + "loss": 0.5472, + "step": 3725 + }, + { + "epoch": 1.7617021276595746, + "grad_norm": 2.5855562686920166, + "learning_rate": 4.044609791783246e-06, + "loss": 0.4852, + "step": 3726 + }, + { + "epoch": 1.7621749408983451, + "grad_norm": 2.6235129833221436, + "learning_rate": 4.0441192280708465e-06, + "loss": 0.5269, + "step": 3727 + }, + { + "epoch": 1.7626477541371157, + "grad_norm": 3.535630464553833, + "learning_rate": 4.043628568212914e-06, + "loss": 0.5266, + "step": 3728 + }, + { + "epoch": 1.7631205673758865, + "grad_norm": 2.7783355712890625, + "learning_rate": 4.043137812239998e-06, + "loss": 0.5609, + "step": 3729 + }, + { + "epoch": 1.7635933806146573, + "grad_norm": 2.9344944953918457, + "learning_rate": 4.042646960182657e-06, + "loss": 0.5056, + "step": 3730 + }, + { + "epoch": 1.7640661938534279, + "grad_norm": 2.6205739974975586, + 
"learning_rate": 4.042156012071453e-06, + "loss": 0.4914, + "step": 3731 + }, + { + "epoch": 1.7645390070921985, + "grad_norm": 2.8004493713378906, + "learning_rate": 4.041664967936958e-06, + "loss": 0.4901, + "step": 3732 + }, + { + "epoch": 1.7650118203309693, + "grad_norm": 2.944589138031006, + "learning_rate": 4.041173827809745e-06, + "loss": 0.5572, + "step": 3733 + }, + { + "epoch": 1.76548463356974, + "grad_norm": 2.5021605491638184, + "learning_rate": 4.040682591720397e-06, + "loss": 0.4637, + "step": 3734 + }, + { + "epoch": 1.7659574468085106, + "grad_norm": 2.448030948638916, + "learning_rate": 4.040191259699497e-06, + "loss": 0.4785, + "step": 3735 + }, + { + "epoch": 1.7664302600472812, + "grad_norm": 2.7171032428741455, + "learning_rate": 4.039699831777643e-06, + "loss": 0.4919, + "step": 3736 + }, + { + "epoch": 1.766903073286052, + "grad_norm": 2.453118324279785, + "learning_rate": 4.03920830798543e-06, + "loss": 0.4326, + "step": 3737 + }, + { + "epoch": 1.7673758865248228, + "grad_norm": 3.112877368927002, + "learning_rate": 4.038716688353466e-06, + "loss": 0.5375, + "step": 3738 + }, + { + "epoch": 1.7678486997635934, + "grad_norm": 2.742239236831665, + "learning_rate": 4.038224972912361e-06, + "loss": 0.5267, + "step": 3739 + }, + { + "epoch": 1.768321513002364, + "grad_norm": 2.544785737991333, + "learning_rate": 4.037733161692731e-06, + "loss": 0.5032, + "step": 3740 + }, + { + "epoch": 1.7687943262411348, + "grad_norm": 2.4639062881469727, + "learning_rate": 4.037241254725201e-06, + "loss": 0.5532, + "step": 3741 + }, + { + "epoch": 1.7692671394799055, + "grad_norm": 2.866290330886841, + "learning_rate": 4.036749252040398e-06, + "loss": 0.5503, + "step": 3742 + }, + { + "epoch": 1.7697399527186761, + "grad_norm": 2.3466262817382812, + "learning_rate": 4.0362571536689575e-06, + "loss": 0.5286, + "step": 3743 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 2.246464967727661, + "learning_rate": 4.03576495964152e-06, + "loss": 0.4656, + "step": 3744 + }, + { + "epoch": 1.7706855791962175, + "grad_norm": 2.667558431625366, + "learning_rate": 4.035272669988733e-06, + "loss": 0.5205, + "step": 3745 + }, + { + "epoch": 1.7711583924349883, + "grad_norm": 2.974666118621826, + "learning_rate": 4.034780284741249e-06, + "loss": 0.6007, + "step": 3746 + }, + { + "epoch": 1.7716312056737589, + "grad_norm": 2.7164433002471924, + "learning_rate": 4.034287803929726e-06, + "loss": 0.4913, + "step": 3747 + }, + { + "epoch": 1.7721040189125294, + "grad_norm": 2.5923962593078613, + "learning_rate": 4.033795227584829e-06, + "loss": 0.5275, + "step": 3748 + }, + { + "epoch": 1.7725768321513002, + "grad_norm": 2.606027126312256, + "learning_rate": 4.033302555737229e-06, + "loss": 0.4869, + "step": 3749 + }, + { + "epoch": 1.773049645390071, + "grad_norm": 3.0110089778900146, + "learning_rate": 4.032809788417602e-06, + "loss": 0.4956, + "step": 3750 + }, + { + "epoch": 1.7735224586288416, + "grad_norm": 3.004598617553711, + "learning_rate": 4.032316925656632e-06, + "loss": 0.5159, + "step": 3751 + }, + { + "epoch": 1.7739952718676122, + "grad_norm": 2.731539249420166, + "learning_rate": 4.031823967485005e-06, + "loss": 0.5237, + "step": 3752 + }, + { + "epoch": 1.774468085106383, + "grad_norm": 2.7466373443603516, + "learning_rate": 4.0313309139334155e-06, + "loss": 0.4948, + "step": 3753 + }, + { + "epoch": 1.7749408983451538, + "grad_norm": 2.8596460819244385, + "learning_rate": 4.030837765032565e-06, + "loss": 0.5016, + "step": 3754 + }, + { + "epoch": 1.7754137115839244, + 
"grad_norm": 3.2886788845062256, + "learning_rate": 4.03034452081316e-06, + "loss": 0.5377, + "step": 3755 + }, + { + "epoch": 1.775886524822695, + "grad_norm": 2.5629258155822754, + "learning_rate": 4.029851181305912e-06, + "loss": 0.519, + "step": 3756 + }, + { + "epoch": 1.7763593380614657, + "grad_norm": 2.5988714694976807, + "learning_rate": 4.029357746541539e-06, + "loss": 0.5521, + "step": 3757 + }, + { + "epoch": 1.7768321513002365, + "grad_norm": 2.987884759902954, + "learning_rate": 4.028864216550765e-06, + "loss": 0.6225, + "step": 3758 + }, + { + "epoch": 1.777304964539007, + "grad_norm": 2.6875851154327393, + "learning_rate": 4.02837059136432e-06, + "loss": 0.5321, + "step": 3759 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.6414570808410645, + "learning_rate": 4.02787687101294e-06, + "loss": 0.4831, + "step": 3760 + }, + { + "epoch": 1.7782505910165485, + "grad_norm": 2.581475019454956, + "learning_rate": 4.027383055527368e-06, + "loss": 0.5204, + "step": 3761 + }, + { + "epoch": 1.7787234042553193, + "grad_norm": 2.811298131942749, + "learning_rate": 4.026889144938349e-06, + "loss": 0.5486, + "step": 3762 + }, + { + "epoch": 1.7791962174940898, + "grad_norm": 3.1589081287384033, + "learning_rate": 4.026395139276639e-06, + "loss": 0.4979, + "step": 3763 + }, + { + "epoch": 1.7796690307328604, + "grad_norm": 2.3773093223571777, + "learning_rate": 4.025901038572996e-06, + "loss": 0.503, + "step": 3764 + }, + { + "epoch": 1.7801418439716312, + "grad_norm": 2.962541341781616, + "learning_rate": 4.025406842858187e-06, + "loss": 0.4613, + "step": 3765 + }, + { + "epoch": 1.780614657210402, + "grad_norm": 2.603092908859253, + "learning_rate": 4.024912552162982e-06, + "loss": 0.5142, + "step": 3766 + }, + { + "epoch": 1.7810874704491726, + "grad_norm": 2.648927927017212, + "learning_rate": 4.024418166518159e-06, + "loss": 0.4491, + "step": 3767 + }, + { + "epoch": 1.7815602836879432, + "grad_norm": 3.3239917755126953, + "learning_rate": 4.023923685954502e-06, + "loss": 0.6272, + "step": 3768 + }, + { + "epoch": 1.782033096926714, + "grad_norm": 2.672821283340454, + "learning_rate": 4.023429110502798e-06, + "loss": 0.5171, + "step": 3769 + }, + { + "epoch": 1.7825059101654848, + "grad_norm": 2.364332437515259, + "learning_rate": 4.022934440193844e-06, + "loss": 0.4513, + "step": 3770 + }, + { + "epoch": 1.7829787234042553, + "grad_norm": 3.03108549118042, + "learning_rate": 4.022439675058441e-06, + "loss": 0.4324, + "step": 3771 + }, + { + "epoch": 1.783451536643026, + "grad_norm": 2.647557020187378, + "learning_rate": 4.021944815127393e-06, + "loss": 0.5162, + "step": 3772 + }, + { + "epoch": 1.7839243498817967, + "grad_norm": 2.4111907482147217, + "learning_rate": 4.021449860431517e-06, + "loss": 0.4712, + "step": 3773 + }, + { + "epoch": 1.7843971631205675, + "grad_norm": 2.796175718307495, + "learning_rate": 4.020954811001629e-06, + "loss": 0.5131, + "step": 3774 + }, + { + "epoch": 1.784869976359338, + "grad_norm": 2.4594924449920654, + "learning_rate": 4.020459666868553e-06, + "loss": 0.4739, + "step": 3775 + }, + { + "epoch": 1.7853427895981087, + "grad_norm": 2.5735671520233154, + "learning_rate": 4.0199644280631215e-06, + "loss": 0.4716, + "step": 3776 + }, + { + "epoch": 1.7858156028368795, + "grad_norm": 2.419990062713623, + "learning_rate": 4.01946909461617e-06, + "loss": 0.4866, + "step": 3777 + }, + { + "epoch": 1.7862884160756503, + "grad_norm": 2.5597951412200928, + "learning_rate": 4.01897366655854e-06, + "loss": 0.5569, + "step": 3778 + }, + { + "epoch": 
1.7867612293144208, + "grad_norm": 2.462383985519409, + "learning_rate": 4.018478143921081e-06, + "loss": 0.4588, + "step": 3779 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 2.536701202392578, + "learning_rate": 4.017982526734646e-06, + "loss": 0.5278, + "step": 3780 + }, + { + "epoch": 1.7877068557919622, + "grad_norm": 2.691077470779419, + "learning_rate": 4.017486815030095e-06, + "loss": 0.4815, + "step": 3781 + }, + { + "epoch": 1.788179669030733, + "grad_norm": 2.4277288913726807, + "learning_rate": 4.016991008838294e-06, + "loss": 0.4877, + "step": 3782 + }, + { + "epoch": 1.7886524822695036, + "grad_norm": 2.6740009784698486, + "learning_rate": 4.016495108190115e-06, + "loss": 0.572, + "step": 3783 + }, + { + "epoch": 1.7891252955082741, + "grad_norm": 3.179232120513916, + "learning_rate": 4.0159991131164355e-06, + "loss": 0.4821, + "step": 3784 + }, + { + "epoch": 1.789598108747045, + "grad_norm": 3.2747793197631836, + "learning_rate": 4.015503023648138e-06, + "loss": 0.5517, + "step": 3785 + }, + { + "epoch": 1.7900709219858157, + "grad_norm": 2.671367645263672, + "learning_rate": 4.015006839816113e-06, + "loss": 0.5158, + "step": 3786 + }, + { + "epoch": 1.7905437352245863, + "grad_norm": 2.6600193977355957, + "learning_rate": 4.014510561651256e-06, + "loss": 0.535, + "step": 3787 + }, + { + "epoch": 1.791016548463357, + "grad_norm": 2.481509208679199, + "learning_rate": 4.014014189184466e-06, + "loss": 0.5596, + "step": 3788 + }, + { + "epoch": 1.7914893617021277, + "grad_norm": 2.759816884994507, + "learning_rate": 4.013517722446652e-06, + "loss": 0.5201, + "step": 3789 + }, + { + "epoch": 1.7919621749408985, + "grad_norm": 2.6913561820983887, + "learning_rate": 4.013021161468724e-06, + "loss": 0.5758, + "step": 3790 + }, + { + "epoch": 1.792434988179669, + "grad_norm": 2.775087594985962, + "learning_rate": 4.0125245062816044e-06, + "loss": 0.499, + "step": 3791 + }, + { + "epoch": 1.7929078014184396, + "grad_norm": 2.6134777069091797, + "learning_rate": 4.012027756916216e-06, + "loss": 0.5659, + "step": 3792 + }, + { + "epoch": 1.7933806146572104, + "grad_norm": 2.7109756469726562, + "learning_rate": 4.0115309134034895e-06, + "loss": 0.5337, + "step": 3793 + }, + { + "epoch": 1.7938534278959812, + "grad_norm": 2.5389950275421143, + "learning_rate": 4.0110339757743595e-06, + "loss": 0.4501, + "step": 3794 + }, + { + "epoch": 1.7943262411347518, + "grad_norm": 2.634648561477661, + "learning_rate": 4.010536944059771e-06, + "loss": 0.4411, + "step": 3795 + }, + { + "epoch": 1.7947990543735224, + "grad_norm": 2.527070999145508, + "learning_rate": 4.0100398182906695e-06, + "loss": 0.5145, + "step": 3796 + }, + { + "epoch": 1.7952718676122932, + "grad_norm": 2.62988543510437, + "learning_rate": 4.0095425984980105e-06, + "loss": 0.4981, + "step": 3797 + }, + { + "epoch": 1.795744680851064, + "grad_norm": 2.6032519340515137, + "learning_rate": 4.009045284712752e-06, + "loss": 0.453, + "step": 3798 + }, + { + "epoch": 1.7962174940898346, + "grad_norm": 2.735173463821411, + "learning_rate": 4.008547876965863e-06, + "loss": 0.5925, + "step": 3799 + }, + { + "epoch": 1.7966903073286051, + "grad_norm": 2.6296730041503906, + "learning_rate": 4.00805037528831e-06, + "loss": 0.5651, + "step": 3800 + }, + { + "epoch": 1.797163120567376, + "grad_norm": 2.641214370727539, + "learning_rate": 4.0075527797110735e-06, + "loss": 0.4973, + "step": 3801 + }, + { + "epoch": 1.7976359338061467, + "grad_norm": 2.6104819774627686, + "learning_rate": 4.007055090265136e-06, + "loss": 0.4432, + 
"step": 3802 + }, + { + "epoch": 1.7981087470449173, + "grad_norm": 2.8200619220733643, + "learning_rate": 4.0065573069814865e-06, + "loss": 0.4899, + "step": 3803 + }, + { + "epoch": 1.7985815602836879, + "grad_norm": 2.982354164123535, + "learning_rate": 4.006059429891119e-06, + "loss": 0.5488, + "step": 3804 + }, + { + "epoch": 1.7990543735224587, + "grad_norm": 2.7561678886413574, + "learning_rate": 4.005561459025034e-06, + "loss": 0.5637, + "step": 3805 + }, + { + "epoch": 1.7995271867612295, + "grad_norm": 2.702212333679199, + "learning_rate": 4.005063394414241e-06, + "loss": 0.4804, + "step": 3806 + }, + { + "epoch": 1.8, + "grad_norm": 2.8655319213867188, + "learning_rate": 4.004565236089748e-06, + "loss": 0.5759, + "step": 3807 + }, + { + "epoch": 1.8004728132387706, + "grad_norm": 2.703676223754883, + "learning_rate": 4.0040669840825756e-06, + "loss": 0.4728, + "step": 3808 + }, + { + "epoch": 1.8009456264775414, + "grad_norm": 2.802645683288574, + "learning_rate": 4.003568638423747e-06, + "loss": 0.5421, + "step": 3809 + }, + { + "epoch": 1.8014184397163122, + "grad_norm": 2.4723124504089355, + "learning_rate": 4.003070199144292e-06, + "loss": 0.4944, + "step": 3810 + }, + { + "epoch": 1.8018912529550828, + "grad_norm": 2.4889068603515625, + "learning_rate": 4.0025716662752475e-06, + "loss": 0.4774, + "step": 3811 + }, + { + "epoch": 1.8023640661938534, + "grad_norm": 2.5408077239990234, + "learning_rate": 4.002073039847653e-06, + "loss": 0.5233, + "step": 3812 + }, + { + "epoch": 1.8028368794326242, + "grad_norm": 2.734602689743042, + "learning_rate": 4.001574319892557e-06, + "loss": 0.5403, + "step": 3813 + }, + { + "epoch": 1.803309692671395, + "grad_norm": 3.3786163330078125, + "learning_rate": 4.001075506441012e-06, + "loss": 0.6969, + "step": 3814 + }, + { + "epoch": 1.8037825059101655, + "grad_norm": 2.7375378608703613, + "learning_rate": 4.000576599524078e-06, + "loss": 0.4907, + "step": 3815 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 3.041804075241089, + "learning_rate": 4.000077599172818e-06, + "loss": 0.6021, + "step": 3816 + }, + { + "epoch": 1.804728132387707, + "grad_norm": 2.697599411010742, + "learning_rate": 3.999578505418305e-06, + "loss": 0.4743, + "step": 3817 + }, + { + "epoch": 1.8052009456264777, + "grad_norm": 2.276921272277832, + "learning_rate": 3.999079318291612e-06, + "loss": 0.4885, + "step": 3818 + }, + { + "epoch": 1.8056737588652483, + "grad_norm": 2.4896953105926514, + "learning_rate": 3.998580037823825e-06, + "loss": 0.503, + "step": 3819 + }, + { + "epoch": 1.8061465721040189, + "grad_norm": 2.6232175827026367, + "learning_rate": 3.998080664046029e-06, + "loss": 0.5058, + "step": 3820 + }, + { + "epoch": 1.8066193853427897, + "grad_norm": 2.695861339569092, + "learning_rate": 3.997581196989319e-06, + "loss": 0.4949, + "step": 3821 + }, + { + "epoch": 1.8070921985815604, + "grad_norm": 2.912886142730713, + "learning_rate": 3.997081636684795e-06, + "loss": 0.4971, + "step": 3822 + }, + { + "epoch": 1.807565011820331, + "grad_norm": 2.876500368118286, + "learning_rate": 3.996581983163561e-06, + "loss": 0.5584, + "step": 3823 + }, + { + "epoch": 1.8080378250591016, + "grad_norm": 2.857069730758667, + "learning_rate": 3.99608223645673e-06, + "loss": 0.5457, + "step": 3824 + }, + { + "epoch": 1.8085106382978724, + "grad_norm": 2.486743211746216, + "learning_rate": 3.995582396595419e-06, + "loss": 0.5291, + "step": 3825 + }, + { + "epoch": 1.808983451536643, + "grad_norm": 2.509441375732422, + "learning_rate": 3.9950824636107486e-06, + 
"loss": 0.4747, + "step": 3826 + }, + { + "epoch": 1.8094562647754135, + "grad_norm": 2.931394100189209, + "learning_rate": 3.99458243753385e-06, + "loss": 0.5116, + "step": 3827 + }, + { + "epoch": 1.8099290780141843, + "grad_norm": 2.4868650436401367, + "learning_rate": 3.994082318395856e-06, + "loss": 0.4671, + "step": 3828 + }, + { + "epoch": 1.8104018912529551, + "grad_norm": 2.5554752349853516, + "learning_rate": 3.993582106227907e-06, + "loss": 0.4969, + "step": 3829 + }, + { + "epoch": 1.8108747044917257, + "grad_norm": 2.8367133140563965, + "learning_rate": 3.99308180106115e-06, + "loss": 0.5507, + "step": 3830 + }, + { + "epoch": 1.8113475177304963, + "grad_norm": 2.68245792388916, + "learning_rate": 3.992581402926737e-06, + "loss": 0.5115, + "step": 3831 + }, + { + "epoch": 1.811820330969267, + "grad_norm": 2.406674385070801, + "learning_rate": 3.992080911855824e-06, + "loss": 0.545, + "step": 3832 + }, + { + "epoch": 1.8122931442080379, + "grad_norm": 2.5003464221954346, + "learning_rate": 3.991580327879575e-06, + "loss": 0.4331, + "step": 3833 + }, + { + "epoch": 1.8127659574468085, + "grad_norm": 2.49320912361145, + "learning_rate": 3.99107965102916e-06, + "loss": 0.5118, + "step": 3834 + }, + { + "epoch": 1.813238770685579, + "grad_norm": 2.6183295249938965, + "learning_rate": 3.990578881335752e-06, + "loss": 0.5286, + "step": 3835 + }, + { + "epoch": 1.8137115839243498, + "grad_norm": 3.1999518871307373, + "learning_rate": 3.990078018830534e-06, + "loss": 0.5048, + "step": 3836 + }, + { + "epoch": 1.8141843971631206, + "grad_norm": 2.4351117610931396, + "learning_rate": 3.9895770635446915e-06, + "loss": 0.514, + "step": 3837 + }, + { + "epoch": 1.8146572104018912, + "grad_norm": 2.6859259605407715, + "learning_rate": 3.989076015509416e-06, + "loss": 0.5575, + "step": 3838 + }, + { + "epoch": 1.8151300236406618, + "grad_norm": 2.790421962738037, + "learning_rate": 3.988574874755909e-06, + "loss": 0.5467, + "step": 3839 + }, + { + "epoch": 1.8156028368794326, + "grad_norm": 2.5202765464782715, + "learning_rate": 3.988073641315369e-06, + "loss": 0.5229, + "step": 3840 + }, + { + "epoch": 1.8160756501182034, + "grad_norm": 2.623652219772339, + "learning_rate": 3.987572315219009e-06, + "loss": 0.509, + "step": 3841 + }, + { + "epoch": 1.816548463356974, + "grad_norm": 2.6038360595703125, + "learning_rate": 3.987070896498044e-06, + "loss": 0.5304, + "step": 3842 + }, + { + "epoch": 1.8170212765957445, + "grad_norm": 2.9378011226654053, + "learning_rate": 3.9865693851836955e-06, + "loss": 0.5845, + "step": 3843 + }, + { + "epoch": 1.8174940898345153, + "grad_norm": 2.4061124324798584, + "learning_rate": 3.98606778130719e-06, + "loss": 0.4333, + "step": 3844 + }, + { + "epoch": 1.8179669030732861, + "grad_norm": 2.483489751815796, + "learning_rate": 3.985566084899759e-06, + "loss": 0.4827, + "step": 3845 + }, + { + "epoch": 1.8184397163120567, + "grad_norm": 2.7774932384490967, + "learning_rate": 3.985064295992642e-06, + "loss": 0.5016, + "step": 3846 + }, + { + "epoch": 1.8189125295508273, + "grad_norm": 2.5936765670776367, + "learning_rate": 3.984562414617083e-06, + "loss": 0.4448, + "step": 3847 + }, + { + "epoch": 1.819385342789598, + "grad_norm": 2.8608627319335938, + "learning_rate": 3.9840604408043325e-06, + "loss": 0.5735, + "step": 3848 + }, + { + "epoch": 1.8198581560283689, + "grad_norm": 2.6212472915649414, + "learning_rate": 3.983558374585646e-06, + "loss": 0.5091, + "step": 3849 + }, + { + "epoch": 1.8203309692671394, + "grad_norm": 2.832460641860962, + 
"learning_rate": 3.983056215992284e-06, + "loss": 0.5169, + "step": 3850 + }, + { + "epoch": 1.82080378250591, + "grad_norm": 2.5293610095977783, + "learning_rate": 3.982553965055514e-06, + "loss": 0.4708, + "step": 3851 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 2.9362871646881104, + "learning_rate": 3.982051621806611e-06, + "loss": 0.575, + "step": 3852 + }, + { + "epoch": 1.8217494089834516, + "grad_norm": 2.69073486328125, + "learning_rate": 3.98154918627685e-06, + "loss": 0.5278, + "step": 3853 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 2.6711034774780273, + "learning_rate": 3.98104665849752e-06, + "loss": 0.4918, + "step": 3854 + }, + { + "epoch": 1.8226950354609928, + "grad_norm": 2.571110963821411, + "learning_rate": 3.980544038499907e-06, + "loss": 0.5234, + "step": 3855 + }, + { + "epoch": 1.8231678486997636, + "grad_norm": 3.2603371143341064, + "learning_rate": 3.980041326315309e-06, + "loss": 0.5996, + "step": 3856 + }, + { + "epoch": 1.8236406619385344, + "grad_norm": 2.8472323417663574, + "learning_rate": 3.979538521975028e-06, + "loss": 0.4769, + "step": 3857 + }, + { + "epoch": 1.824113475177305, + "grad_norm": 2.6714751720428467, + "learning_rate": 3.979035625510371e-06, + "loss": 0.4826, + "step": 3858 + }, + { + "epoch": 1.8245862884160755, + "grad_norm": 2.6816468238830566, + "learning_rate": 3.97853263695265e-06, + "loss": 0.5127, + "step": 3859 + }, + { + "epoch": 1.8250591016548463, + "grad_norm": 2.6464123725891113, + "learning_rate": 3.978029556333185e-06, + "loss": 0.4925, + "step": 3860 + }, + { + "epoch": 1.825531914893617, + "grad_norm": 2.5317227840423584, + "learning_rate": 3.977526383683301e-06, + "loss": 0.4765, + "step": 3861 + }, + { + "epoch": 1.8260047281323877, + "grad_norm": 2.5052425861358643, + "learning_rate": 3.977023119034328e-06, + "loss": 0.4804, + "step": 3862 + }, + { + "epoch": 1.8264775413711583, + "grad_norm": 2.7022836208343506, + "learning_rate": 3.976519762417602e-06, + "loss": 0.4824, + "step": 3863 + }, + { + "epoch": 1.826950354609929, + "grad_norm": 2.7445900440216064, + "learning_rate": 3.976016313864464e-06, + "loss": 0.5698, + "step": 3864 + }, + { + "epoch": 1.8274231678486998, + "grad_norm": 2.442518711090088, + "learning_rate": 3.975512773406262e-06, + "loss": 0.5133, + "step": 3865 + }, + { + "epoch": 1.8278959810874704, + "grad_norm": 2.4100050926208496, + "learning_rate": 3.975009141074351e-06, + "loss": 0.5044, + "step": 3866 + }, + { + "epoch": 1.828368794326241, + "grad_norm": 2.9507648944854736, + "learning_rate": 3.974505416900088e-06, + "loss": 0.5367, + "step": 3867 + }, + { + "epoch": 1.8288416075650118, + "grad_norm": 2.5662600994110107, + "learning_rate": 3.974001600914837e-06, + "loss": 0.5878, + "step": 3868 + }, + { + "epoch": 1.8293144208037826, + "grad_norm": 2.4306657314300537, + "learning_rate": 3.973497693149971e-06, + "loss": 0.4647, + "step": 3869 + }, + { + "epoch": 1.8297872340425532, + "grad_norm": 2.974686622619629, + "learning_rate": 3.972993693636864e-06, + "loss": 0.4911, + "step": 3870 + }, + { + "epoch": 1.8302600472813237, + "grad_norm": 2.5711987018585205, + "learning_rate": 3.972489602406899e-06, + "loss": 0.5089, + "step": 3871 + }, + { + "epoch": 1.8307328605200945, + "grad_norm": 3.259617328643799, + "learning_rate": 3.971985419491463e-06, + "loss": 0.5966, + "step": 3872 + }, + { + "epoch": 1.8312056737588653, + "grad_norm": 2.7437000274658203, + "learning_rate": 3.971481144921949e-06, + "loss": 0.5097, + "step": 3873 + }, + { + "epoch": 1.831678486997636, + 
"grad_norm": 2.9597461223602295, + "learning_rate": 3.970976778729757e-06, + "loss": 0.5672, + "step": 3874 + }, + { + "epoch": 1.8321513002364065, + "grad_norm": 2.5775723457336426, + "learning_rate": 3.970472320946291e-06, + "loss": 0.4749, + "step": 3875 + }, + { + "epoch": 1.8326241134751773, + "grad_norm": 2.7381200790405273, + "learning_rate": 3.969967771602961e-06, + "loss": 0.5255, + "step": 3876 + }, + { + "epoch": 1.833096926713948, + "grad_norm": 2.651698350906372, + "learning_rate": 3.969463130731183e-06, + "loss": 0.5098, + "step": 3877 + }, + { + "epoch": 1.8335697399527187, + "grad_norm": 2.7277021408081055, + "learning_rate": 3.968958398362381e-06, + "loss": 0.5251, + "step": 3878 + }, + { + "epoch": 1.8340425531914892, + "grad_norm": 2.5184953212738037, + "learning_rate": 3.968453574527978e-06, + "loss": 0.5086, + "step": 3879 + }, + { + "epoch": 1.83451536643026, + "grad_norm": 2.8227882385253906, + "learning_rate": 3.967948659259412e-06, + "loss": 0.5742, + "step": 3880 + }, + { + "epoch": 1.8349881796690308, + "grad_norm": 2.547922134399414, + "learning_rate": 3.967443652588119e-06, + "loss": 0.5411, + "step": 3881 + }, + { + "epoch": 1.8354609929078014, + "grad_norm": 2.6572835445404053, + "learning_rate": 3.966938554545545e-06, + "loss": 0.4854, + "step": 3882 + }, + { + "epoch": 1.835933806146572, + "grad_norm": 2.9416658878326416, + "learning_rate": 3.966433365163139e-06, + "loss": 0.5236, + "step": 3883 + }, + { + "epoch": 1.8364066193853428, + "grad_norm": 2.344325304031372, + "learning_rate": 3.965928084472357e-06, + "loss": 0.4916, + "step": 3884 + }, + { + "epoch": 1.8368794326241136, + "grad_norm": 2.890418291091919, + "learning_rate": 3.965422712504662e-06, + "loss": 0.5287, + "step": 3885 + }, + { + "epoch": 1.8373522458628841, + "grad_norm": 2.6063363552093506, + "learning_rate": 3.96491724929152e-06, + "loss": 0.4842, + "step": 3886 + }, + { + "epoch": 1.8378250591016547, + "grad_norm": 2.5582427978515625, + "learning_rate": 3.964411694864404e-06, + "loss": 0.4768, + "step": 3887 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 2.84356951713562, + "learning_rate": 3.963906049254793e-06, + "loss": 0.5284, + "step": 3888 + }, + { + "epoch": 1.8387706855791963, + "grad_norm": 2.7048516273498535, + "learning_rate": 3.963400312494172e-06, + "loss": 0.5271, + "step": 3889 + }, + { + "epoch": 1.839243498817967, + "grad_norm": 2.5401699542999268, + "learning_rate": 3.962894484614031e-06, + "loss": 0.4734, + "step": 3890 + }, + { + "epoch": 1.8397163120567375, + "grad_norm": 2.208256244659424, + "learning_rate": 3.962388565645864e-06, + "loss": 0.4113, + "step": 3891 + }, + { + "epoch": 1.8401891252955083, + "grad_norm": 2.775139331817627, + "learning_rate": 3.961882555621173e-06, + "loss": 0.5172, + "step": 3892 + }, + { + "epoch": 1.840661938534279, + "grad_norm": 2.7540855407714844, + "learning_rate": 3.961376454571466e-06, + "loss": 0.5252, + "step": 3893 + }, + { + "epoch": 1.8411347517730496, + "grad_norm": 2.6731574535369873, + "learning_rate": 3.960870262528255e-06, + "loss": 0.4495, + "step": 3894 + }, + { + "epoch": 1.8416075650118202, + "grad_norm": 2.791492223739624, + "learning_rate": 3.960363979523058e-06, + "loss": 0.5457, + "step": 3895 + }, + { + "epoch": 1.842080378250591, + "grad_norm": 2.9280290603637695, + "learning_rate": 3.959857605587401e-06, + "loss": 0.5373, + "step": 3896 + }, + { + "epoch": 1.8425531914893618, + "grad_norm": 2.5652217864990234, + "learning_rate": 3.95935114075281e-06, + "loss": 0.5191, + "step": 3897 + }, + { + 
"epoch": 1.8430260047281324, + "grad_norm": 2.7297749519348145, + "learning_rate": 3.958844585050824e-06, + "loss": 0.5366, + "step": 3898 + }, + { + "epoch": 1.843498817966903, + "grad_norm": 2.5302982330322266, + "learning_rate": 3.958337938512983e-06, + "loss": 0.569, + "step": 3899 + }, + { + "epoch": 1.8439716312056738, + "grad_norm": 2.644777297973633, + "learning_rate": 3.957831201170832e-06, + "loss": 0.521, + "step": 3900 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 2.8375515937805176, + "learning_rate": 3.957324373055925e-06, + "loss": 0.573, + "step": 3901 + }, + { + "epoch": 1.8449172576832151, + "grad_norm": 2.512296676635742, + "learning_rate": 3.956817454199819e-06, + "loss": 0.5081, + "step": 3902 + }, + { + "epoch": 1.8453900709219857, + "grad_norm": 2.3662109375, + "learning_rate": 3.956310444634079e-06, + "loss": 0.4989, + "step": 3903 + }, + { + "epoch": 1.8458628841607565, + "grad_norm": 2.6849682331085205, + "learning_rate": 3.955803344390272e-06, + "loss": 0.5459, + "step": 3904 + }, + { + "epoch": 1.8463356973995273, + "grad_norm": 2.8364317417144775, + "learning_rate": 3.9552961534999756e-06, + "loss": 0.5704, + "step": 3905 + }, + { + "epoch": 1.8468085106382979, + "grad_norm": 2.6006948947906494, + "learning_rate": 3.954788871994768e-06, + "loss": 0.5696, + "step": 3906 + }, + { + "epoch": 1.8472813238770684, + "grad_norm": 2.558300018310547, + "learning_rate": 3.9542814999062375e-06, + "loss": 0.5047, + "step": 3907 + }, + { + "epoch": 1.8477541371158392, + "grad_norm": 2.6343321800231934, + "learning_rate": 3.953774037265974e-06, + "loss": 0.525, + "step": 3908 + }, + { + "epoch": 1.84822695035461, + "grad_norm": 2.5050008296966553, + "learning_rate": 3.953266484105576e-06, + "loss": 0.4867, + "step": 3909 + }, + { + "epoch": 1.8486997635933806, + "grad_norm": 2.3775103092193604, + "learning_rate": 3.952758840456647e-06, + "loss": 0.4349, + "step": 3910 + }, + { + "epoch": 1.8491725768321512, + "grad_norm": 2.508376359939575, + "learning_rate": 3.952251106350794e-06, + "loss": 0.539, + "step": 3911 + }, + { + "epoch": 1.849645390070922, + "grad_norm": 2.7403106689453125, + "learning_rate": 3.951743281819633e-06, + "loss": 0.4478, + "step": 3912 + }, + { + "epoch": 1.8501182033096928, + "grad_norm": 2.5332062244415283, + "learning_rate": 3.951235366894784e-06, + "loss": 0.4658, + "step": 3913 + }, + { + "epoch": 1.8505910165484634, + "grad_norm": 3.0137248039245605, + "learning_rate": 3.950727361607872e-06, + "loss": 0.5047, + "step": 3914 + }, + { + "epoch": 1.851063829787234, + "grad_norm": 2.5820653438568115, + "learning_rate": 3.950219265990528e-06, + "loss": 0.542, + "step": 3915 + }, + { + "epoch": 1.8515366430260047, + "grad_norm": 2.555133819580078, + "learning_rate": 3.949711080074389e-06, + "loss": 0.5253, + "step": 3916 + }, + { + "epoch": 1.8520094562647755, + "grad_norm": 2.876882791519165, + "learning_rate": 3.949202803891099e-06, + "loss": 0.5242, + "step": 3917 + }, + { + "epoch": 1.852482269503546, + "grad_norm": 2.5929203033447266, + "learning_rate": 3.948694437472305e-06, + "loss": 0.5358, + "step": 3918 + }, + { + "epoch": 1.8529550827423167, + "grad_norm": 2.468513250350952, + "learning_rate": 3.948185980849659e-06, + "loss": 0.5119, + "step": 3919 + }, + { + "epoch": 1.8534278959810875, + "grad_norm": 2.9259560108184814, + "learning_rate": 3.947677434054824e-06, + "loss": 0.4756, + "step": 3920 + }, + { + "epoch": 1.8539007092198583, + "grad_norm": 2.5247011184692383, + "learning_rate": 3.947168797119462e-06, + "loss": 0.4627, + 
"step": 3921 + }, + { + "epoch": 1.8543735224586289, + "grad_norm": 2.7396671772003174, + "learning_rate": 3.946660070075245e-06, + "loss": 0.5013, + "step": 3922 + }, + { + "epoch": 1.8548463356973994, + "grad_norm": 2.7059738636016846, + "learning_rate": 3.946151252953849e-06, + "loss": 0.5875, + "step": 3923 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 2.5638437271118164, + "learning_rate": 3.945642345786955e-06, + "loss": 0.5063, + "step": 3924 + }, + { + "epoch": 1.855791962174941, + "grad_norm": 2.6647839546203613, + "learning_rate": 3.945133348606251e-06, + "loss": 0.5421, + "step": 3925 + }, + { + "epoch": 1.8562647754137116, + "grad_norm": 3.7235286235809326, + "learning_rate": 3.944624261443431e-06, + "loss": 0.5958, + "step": 3926 + }, + { + "epoch": 1.8567375886524822, + "grad_norm": 2.769984245300293, + "learning_rate": 3.944115084330192e-06, + "loss": 0.5678, + "step": 3927 + }, + { + "epoch": 1.857210401891253, + "grad_norm": 2.567249059677124, + "learning_rate": 3.9436058172982395e-06, + "loss": 0.4767, + "step": 3928 + }, + { + "epoch": 1.8576832151300238, + "grad_norm": 2.6196048259735107, + "learning_rate": 3.943096460379283e-06, + "loss": 0.5345, + "step": 3929 + }, + { + "epoch": 1.8581560283687943, + "grad_norm": 2.5999555587768555, + "learning_rate": 3.942587013605037e-06, + "loss": 0.5482, + "step": 3930 + }, + { + "epoch": 1.858628841607565, + "grad_norm": 2.630387783050537, + "learning_rate": 3.942077477007224e-06, + "loss": 0.6023, + "step": 3931 + }, + { + "epoch": 1.8591016548463357, + "grad_norm": 2.543503761291504, + "learning_rate": 3.941567850617569e-06, + "loss": 0.5157, + "step": 3932 + }, + { + "epoch": 1.8595744680851065, + "grad_norm": 2.5109236240386963, + "learning_rate": 3.941058134467805e-06, + "loss": 0.4774, + "step": 3933 + }, + { + "epoch": 1.860047281323877, + "grad_norm": 2.5110230445861816, + "learning_rate": 3.94054832858967e-06, + "loss": 0.5064, + "step": 3934 + }, + { + "epoch": 1.8605200945626477, + "grad_norm": 2.4780776500701904, + "learning_rate": 3.940038433014908e-06, + "loss": 0.5216, + "step": 3935 + }, + { + "epoch": 1.8609929078014185, + "grad_norm": 2.4398856163024902, + "learning_rate": 3.939528447775266e-06, + "loss": 0.4958, + "step": 3936 + }, + { + "epoch": 1.8614657210401893, + "grad_norm": 2.449498176574707, + "learning_rate": 3.9390183729025e-06, + "loss": 0.5165, + "step": 3937 + }, + { + "epoch": 1.8619385342789598, + "grad_norm": 2.982544422149658, + "learning_rate": 3.938508208428371e-06, + "loss": 0.4803, + "step": 3938 + }, + { + "epoch": 1.8624113475177304, + "grad_norm": 2.6574015617370605, + "learning_rate": 3.937997954384641e-06, + "loss": 0.4797, + "step": 3939 + }, + { + "epoch": 1.8628841607565012, + "grad_norm": 2.7773542404174805, + "learning_rate": 3.937487610803086e-06, + "loss": 0.4843, + "step": 3940 + }, + { + "epoch": 1.863356973995272, + "grad_norm": 2.588937759399414, + "learning_rate": 3.9369771777154805e-06, + "loss": 0.5426, + "step": 3941 + }, + { + "epoch": 1.8638297872340426, + "grad_norm": 2.855442523956299, + "learning_rate": 3.936466655153607e-06, + "loss": 0.5443, + "step": 3942 + }, + { + "epoch": 1.8643026004728132, + "grad_norm": 2.554676055908203, + "learning_rate": 3.935956043149253e-06, + "loss": 0.5334, + "step": 3943 + }, + { + "epoch": 1.864775413711584, + "grad_norm": 2.901599884033203, + "learning_rate": 3.935445341734212e-06, + "loss": 0.5842, + "step": 3944 + }, + { + "epoch": 1.8652482269503547, + "grad_norm": 2.554485321044922, + "learning_rate": 
3.934934550940285e-06, + "loss": 0.4941, + "step": 3945 + }, + { + "epoch": 1.8657210401891253, + "grad_norm": 2.357203245162964, + "learning_rate": 3.934423670799275e-06, + "loss": 0.4402, + "step": 3946 + }, + { + "epoch": 1.866193853427896, + "grad_norm": 2.7036049365997314, + "learning_rate": 3.933912701342993e-06, + "loss": 0.4966, + "step": 3947 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 2.7817211151123047, + "learning_rate": 3.933401642603255e-06, + "loss": 0.4908, + "step": 3948 + }, + { + "epoch": 1.8671394799054375, + "grad_norm": 2.439490795135498, + "learning_rate": 3.932890494611882e-06, + "loss": 0.4322, + "step": 3949 + }, + { + "epoch": 1.867612293144208, + "grad_norm": 3.187152147293091, + "learning_rate": 3.9323792574007e-06, + "loss": 0.501, + "step": 3950 + }, + { + "epoch": 1.8680851063829786, + "grad_norm": 2.405773401260376, + "learning_rate": 3.931867931001543e-06, + "loss": 0.4477, + "step": 3951 + }, + { + "epoch": 1.8685579196217494, + "grad_norm": 2.4922525882720947, + "learning_rate": 3.931356515446248e-06, + "loss": 0.5098, + "step": 3952 + }, + { + "epoch": 1.8690307328605202, + "grad_norm": 2.7781267166137695, + "learning_rate": 3.93084501076666e-06, + "loss": 0.5815, + "step": 3953 + }, + { + "epoch": 1.8695035460992908, + "grad_norm": 2.74621844291687, + "learning_rate": 3.930333416994626e-06, + "loss": 0.5605, + "step": 3954 + }, + { + "epoch": 1.8699763593380614, + "grad_norm": 2.5527689456939697, + "learning_rate": 3.929821734162004e-06, + "loss": 0.5141, + "step": 3955 + }, + { + "epoch": 1.8704491725768322, + "grad_norm": 2.5730628967285156, + "learning_rate": 3.92930996230065e-06, + "loss": 0.5446, + "step": 3956 + }, + { + "epoch": 1.870921985815603, + "grad_norm": 2.7053353786468506, + "learning_rate": 3.9287981014424334e-06, + "loss": 0.4722, + "step": 3957 + }, + { + "epoch": 1.8713947990543736, + "grad_norm": 2.7591893672943115, + "learning_rate": 3.928286151619224e-06, + "loss": 0.509, + "step": 3958 + }, + { + "epoch": 1.8718676122931441, + "grad_norm": 2.6233739852905273, + "learning_rate": 3.927774112862898e-06, + "loss": 0.5266, + "step": 3959 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 2.7715370655059814, + "learning_rate": 3.9272619852053396e-06, + "loss": 0.5612, + "step": 3960 + }, + { + "epoch": 1.8728132387706857, + "grad_norm": 2.4815211296081543, + "learning_rate": 3.926749768678435e-06, + "loss": 0.5498, + "step": 3961 + }, + { + "epoch": 1.8732860520094563, + "grad_norm": 2.6819605827331543, + "learning_rate": 3.926237463314078e-06, + "loss": 0.5499, + "step": 3962 + }, + { + "epoch": 1.8737588652482269, + "grad_norm": 2.638664722442627, + "learning_rate": 3.925725069144168e-06, + "loss": 0.5429, + "step": 3963 + }, + { + "epoch": 1.8742316784869977, + "grad_norm": 2.527294874191284, + "learning_rate": 3.925212586200611e-06, + "loss": 0.5451, + "step": 3964 + }, + { + "epoch": 1.8747044917257685, + "grad_norm": 2.831638813018799, + "learning_rate": 3.924700014515315e-06, + "loss": 0.5276, + "step": 3965 + }, + { + "epoch": 1.875177304964539, + "grad_norm": 2.5906996726989746, + "learning_rate": 3.924187354120196e-06, + "loss": 0.5323, + "step": 3966 + }, + { + "epoch": 1.8756501182033096, + "grad_norm": 2.5482442378997803, + "learning_rate": 3.923674605047175e-06, + "loss": 0.4882, + "step": 3967 + }, + { + "epoch": 1.8761229314420804, + "grad_norm": 2.56402850151062, + "learning_rate": 3.923161767328179e-06, + "loss": 0.5111, + "step": 3968 + }, + { + "epoch": 1.8765957446808512, + "grad_norm": 
3.223782539367676, + "learning_rate": 3.9226488409951405e-06, + "loss": 0.5829, + "step": 3969 + }, + { + "epoch": 1.8770685579196218, + "grad_norm": 2.665964365005493, + "learning_rate": 3.922135826079997e-06, + "loss": 0.4739, + "step": 3970 + }, + { + "epoch": 1.8775413711583924, + "grad_norm": 2.602696418762207, + "learning_rate": 3.921622722614691e-06, + "loss": 0.5199, + "step": 3971 + }, + { + "epoch": 1.8780141843971632, + "grad_norm": 2.5384418964385986, + "learning_rate": 3.921109530631172e-06, + "loss": 0.5086, + "step": 3972 + }, + { + "epoch": 1.878486997635934, + "grad_norm": 2.7961080074310303, + "learning_rate": 3.920596250161394e-06, + "loss": 0.5454, + "step": 3973 + }, + { + "epoch": 1.8789598108747045, + "grad_norm": 3.022007465362549, + "learning_rate": 3.920082881237317e-06, + "loss": 0.5537, + "step": 3974 + }, + { + "epoch": 1.8794326241134751, + "grad_norm": 2.699885129928589, + "learning_rate": 3.9195694238909045e-06, + "loss": 0.5274, + "step": 3975 + }, + { + "epoch": 1.879905437352246, + "grad_norm": 2.3994593620300293, + "learning_rate": 3.919055878154129e-06, + "loss": 0.4134, + "step": 3976 + }, + { + "epoch": 1.8803782505910167, + "grad_norm": 4.093045711517334, + "learning_rate": 3.918542244058967e-06, + "loss": 0.5305, + "step": 3977 + }, + { + "epoch": 1.8808510638297873, + "grad_norm": 3.011643171310425, + "learning_rate": 3.9180285216374e-06, + "loss": 0.5481, + "step": 3978 + }, + { + "epoch": 1.8813238770685579, + "grad_norm": 2.6426854133605957, + "learning_rate": 3.917514710921414e-06, + "loss": 0.5415, + "step": 3979 + }, + { + "epoch": 1.8817966903073287, + "grad_norm": 2.4379019737243652, + "learning_rate": 3.917000811943002e-06, + "loss": 0.4566, + "step": 3980 + }, + { + "epoch": 1.8822695035460995, + "grad_norm": 3.18522047996521, + "learning_rate": 3.9164868247341634e-06, + "loss": 0.6079, + "step": 3981 + }, + { + "epoch": 1.88274231678487, + "grad_norm": 2.6451141834259033, + "learning_rate": 3.915972749326903e-06, + "loss": 0.515, + "step": 3982 + }, + { + "epoch": 1.8832151300236406, + "grad_norm": 2.565598726272583, + "learning_rate": 3.915458585753226e-06, + "loss": 0.4799, + "step": 3983 + }, + { + "epoch": 1.8836879432624114, + "grad_norm": 2.711651563644409, + "learning_rate": 3.91494433404515e-06, + "loss": 0.5595, + "step": 3984 + }, + { + "epoch": 1.8841607565011822, + "grad_norm": 2.749328851699829, + "learning_rate": 3.914429994234695e-06, + "loss": 0.495, + "step": 3985 + }, + { + "epoch": 1.8846335697399526, + "grad_norm": 2.9492287635803223, + "learning_rate": 3.913915566353886e-06, + "loss": 0.5683, + "step": 3986 + }, + { + "epoch": 1.8851063829787233, + "grad_norm": 3.07747745513916, + "learning_rate": 3.913401050434756e-06, + "loss": 0.4953, + "step": 3987 + }, + { + "epoch": 1.8855791962174941, + "grad_norm": 2.8746345043182373, + "learning_rate": 3.912886446509338e-06, + "loss": 0.4752, + "step": 3988 + }, + { + "epoch": 1.8860520094562647, + "grad_norm": 2.772954225540161, + "learning_rate": 3.912371754609677e-06, + "loss": 0.5473, + "step": 3989 + }, + { + "epoch": 1.8865248226950353, + "grad_norm": 2.8906044960021973, + "learning_rate": 3.911856974767821e-06, + "loss": 0.5285, + "step": 3990 + }, + { + "epoch": 1.886997635933806, + "grad_norm": 2.8992726802825928, + "learning_rate": 3.9113421070158206e-06, + "loss": 0.571, + "step": 3991 + }, + { + "epoch": 1.887470449172577, + "grad_norm": 2.624662160873413, + "learning_rate": 3.910827151385737e-06, + "loss": 0.5183, + "step": 3992 + }, + { + "epoch": 
1.8879432624113475, + "grad_norm": 2.4491732120513916, + "learning_rate": 3.910312107909632e-06, + "loss": 0.4205, + "step": 3993 + }, + { + "epoch": 1.888416075650118, + "grad_norm": 2.278259515762329, + "learning_rate": 3.909796976619575e-06, + "loss": 0.4464, + "step": 3994 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 2.6481523513793945, + "learning_rate": 3.909281757547644e-06, + "loss": 0.5023, + "step": 3995 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 2.6687493324279785, + "learning_rate": 3.908766450725917e-06, + "loss": 0.495, + "step": 3996 + }, + { + "epoch": 1.8898345153664302, + "grad_norm": 2.507525682449341, + "learning_rate": 3.908251056186481e-06, + "loss": 0.4155, + "step": 3997 + }, + { + "epoch": 1.8903073286052008, + "grad_norm": 2.7048323154449463, + "learning_rate": 3.907735573961426e-06, + "loss": 0.4601, + "step": 3998 + }, + { + "epoch": 1.8907801418439716, + "grad_norm": 2.6825389862060547, + "learning_rate": 3.907220004082848e-06, + "loss": 0.5067, + "step": 3999 + }, + { + "epoch": 1.8912529550827424, + "grad_norm": 2.775696039199829, + "learning_rate": 3.906704346582852e-06, + "loss": 0.5411, + "step": 4000 + }, + { + "epoch": 1.891725768321513, + "grad_norm": 2.4492077827453613, + "learning_rate": 3.906188601493545e-06, + "loss": 0.4931, + "step": 4001 + }, + { + "epoch": 1.8921985815602835, + "grad_norm": 2.320810556411743, + "learning_rate": 3.905672768847041e-06, + "loss": 0.4908, + "step": 4002 + }, + { + "epoch": 1.8926713947990543, + "grad_norm": 2.455162525177002, + "learning_rate": 3.905156848675455e-06, + "loss": 0.508, + "step": 4003 + }, + { + "epoch": 1.8931442080378251, + "grad_norm": 2.515921115875244, + "learning_rate": 3.904640841010915e-06, + "loss": 0.5318, + "step": 4004 + }, + { + "epoch": 1.8936170212765957, + "grad_norm": 2.7230770587921143, + "learning_rate": 3.904124745885548e-06, + "loss": 0.4793, + "step": 4005 + }, + { + "epoch": 1.8940898345153663, + "grad_norm": 2.519934892654419, + "learning_rate": 3.903608563331491e-06, + "loss": 0.5013, + "step": 4006 + }, + { + "epoch": 1.894562647754137, + "grad_norm": 2.719674587249756, + "learning_rate": 3.903092293380883e-06, + "loss": 0.516, + "step": 4007 + }, + { + "epoch": 1.8950354609929079, + "grad_norm": 3.2107343673706055, + "learning_rate": 3.902575936065869e-06, + "loss": 0.6297, + "step": 4008 + }, + { + "epoch": 1.8955082742316784, + "grad_norm": 2.9773149490356445, + "learning_rate": 3.902059491418603e-06, + "loss": 0.566, + "step": 4009 + }, + { + "epoch": 1.895981087470449, + "grad_norm": 2.6754770278930664, + "learning_rate": 3.90154295947124e-06, + "loss": 0.5187, + "step": 4010 + }, + { + "epoch": 1.8964539007092198, + "grad_norm": 2.457303762435913, + "learning_rate": 3.901026340255943e-06, + "loss": 0.5757, + "step": 4011 + }, + { + "epoch": 1.8969267139479906, + "grad_norm": 2.5944161415100098, + "learning_rate": 3.900509633804878e-06, + "loss": 0.5049, + "step": 4012 + }, + { + "epoch": 1.8973995271867612, + "grad_norm": 2.610445022583008, + "learning_rate": 3.89999284015022e-06, + "loss": 0.521, + "step": 4013 + }, + { + "epoch": 1.8978723404255318, + "grad_norm": 2.6949338912963867, + "learning_rate": 3.899475959324146e-06, + "loss": 0.5619, + "step": 4014 + }, + { + "epoch": 1.8983451536643026, + "grad_norm": 2.7889559268951416, + "learning_rate": 3.898958991358841e-06, + "loss": 0.5223, + "step": 4015 + }, + { + "epoch": 1.8988179669030734, + "grad_norm": 2.569265842437744, + "learning_rate": 3.898441936286493e-06, + "loss": 0.5724, + "step": 
4016 + }, + { + "epoch": 1.899290780141844, + "grad_norm": 2.3567774295806885, + "learning_rate": 3.897924794139299e-06, + "loss": 0.4784, + "step": 4017 + }, + { + "epoch": 1.8997635933806145, + "grad_norm": 2.9176526069641113, + "learning_rate": 3.897407564949457e-06, + "loss": 0.646, + "step": 4018 + }, + { + "epoch": 1.9002364066193853, + "grad_norm": 2.7870090007781982, + "learning_rate": 3.896890248749174e-06, + "loss": 0.4922, + "step": 4019 + }, + { + "epoch": 1.900709219858156, + "grad_norm": 2.8310980796813965, + "learning_rate": 3.89637284557066e-06, + "loss": 0.4746, + "step": 4020 + }, + { + "epoch": 1.9011820330969267, + "grad_norm": 2.434915542602539, + "learning_rate": 3.895855355446131e-06, + "loss": 0.4537, + "step": 4021 + }, + { + "epoch": 1.9016548463356973, + "grad_norm": 3.0547034740448, + "learning_rate": 3.89533777840781e-06, + "loss": 0.6161, + "step": 4022 + }, + { + "epoch": 1.902127659574468, + "grad_norm": 3.416774272918701, + "learning_rate": 3.894820114487925e-06, + "loss": 0.5448, + "step": 4023 + }, + { + "epoch": 1.9026004728132389, + "grad_norm": 2.606951951980591, + "learning_rate": 3.894302363718707e-06, + "loss": 0.5501, + "step": 4024 + }, + { + "epoch": 1.9030732860520094, + "grad_norm": 3.082165002822876, + "learning_rate": 3.8937845261323945e-06, + "loss": 0.6035, + "step": 4025 + }, + { + "epoch": 1.90354609929078, + "grad_norm": 2.616093397140503, + "learning_rate": 3.893266601761231e-06, + "loss": 0.5294, + "step": 4026 + }, + { + "epoch": 1.9040189125295508, + "grad_norm": 2.7141637802124023, + "learning_rate": 3.8927485906374654e-06, + "loss": 0.5481, + "step": 4027 + }, + { + "epoch": 1.9044917257683216, + "grad_norm": 2.5129404067993164, + "learning_rate": 3.892230492793352e-06, + "loss": 0.4958, + "step": 4028 + }, + { + "epoch": 1.9049645390070922, + "grad_norm": 2.703403949737549, + "learning_rate": 3.891712308261151e-06, + "loss": 0.4852, + "step": 4029 + }, + { + "epoch": 1.9054373522458627, + "grad_norm": 2.881058931350708, + "learning_rate": 3.891194037073127e-06, + "loss": 0.4662, + "step": 4030 + }, + { + "epoch": 1.9059101654846335, + "grad_norm": 3.216769218444824, + "learning_rate": 3.8906756792615505e-06, + "loss": 0.5076, + "step": 4031 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 2.442265748977661, + "learning_rate": 3.890157234858697e-06, + "loss": 0.4748, + "step": 4032 + }, + { + "epoch": 1.906855791962175, + "grad_norm": 3.088672399520874, + "learning_rate": 3.889638703896849e-06, + "loss": 0.5729, + "step": 4033 + }, + { + "epoch": 1.9073286052009455, + "grad_norm": 2.9304986000061035, + "learning_rate": 3.889120086408291e-06, + "loss": 0.603, + "step": 4034 + }, + { + "epoch": 1.9078014184397163, + "grad_norm": 2.686093807220459, + "learning_rate": 3.888601382425318e-06, + "loss": 0.4978, + "step": 4035 + }, + { + "epoch": 1.908274231678487, + "grad_norm": 2.5668389797210693, + "learning_rate": 3.888082591980225e-06, + "loss": 0.5086, + "step": 4036 + }, + { + "epoch": 1.9087470449172577, + "grad_norm": 2.530996561050415, + "learning_rate": 3.887563715105315e-06, + "loss": 0.4678, + "step": 4037 + }, + { + "epoch": 1.9092198581560282, + "grad_norm": 3.043342351913452, + "learning_rate": 3.887044751832897e-06, + "loss": 0.5452, + "step": 4038 + }, + { + "epoch": 1.909692671394799, + "grad_norm": 2.799734115600586, + "learning_rate": 3.886525702195284e-06, + "loss": 0.5265, + "step": 4039 + }, + { + "epoch": 1.9101654846335698, + "grad_norm": 2.890022039413452, + "learning_rate": 3.886006566224796e-06, + "loss": 
0.4634, + "step": 4040 + }, + { + "epoch": 1.9106382978723404, + "grad_norm": 2.6804237365722656, + "learning_rate": 3.8854873439537555e-06, + "loss": 0.5031, + "step": 4041 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 2.43038272857666, + "learning_rate": 3.884968035414495e-06, + "loss": 0.5098, + "step": 4042 + }, + { + "epoch": 1.9115839243498818, + "grad_norm": 2.589583396911621, + "learning_rate": 3.884448640639346e-06, + "loss": 0.498, + "step": 4043 + }, + { + "epoch": 1.9120567375886526, + "grad_norm": 2.4565231800079346, + "learning_rate": 3.8839291596606524e-06, + "loss": 0.4318, + "step": 4044 + }, + { + "epoch": 1.9125295508274232, + "grad_norm": 2.66762638092041, + "learning_rate": 3.8834095925107575e-06, + "loss": 0.5441, + "step": 4045 + }, + { + "epoch": 1.9130023640661937, + "grad_norm": 2.7334461212158203, + "learning_rate": 3.882889939222013e-06, + "loss": 0.5209, + "step": 4046 + }, + { + "epoch": 1.9134751773049645, + "grad_norm": 2.6398537158966064, + "learning_rate": 3.8823701998267765e-06, + "loss": 0.4874, + "step": 4047 + }, + { + "epoch": 1.9139479905437353, + "grad_norm": 2.82405161857605, + "learning_rate": 3.881850374357409e-06, + "loss": 0.4519, + "step": 4048 + }, + { + "epoch": 1.914420803782506, + "grad_norm": 2.7552523612976074, + "learning_rate": 3.8813304628462776e-06, + "loss": 0.547, + "step": 4049 + }, + { + "epoch": 1.9148936170212765, + "grad_norm": 2.5287928581237793, + "learning_rate": 3.880810465325755e-06, + "loss": 0.5226, + "step": 4050 + }, + { + "epoch": 1.9153664302600473, + "grad_norm": 2.7597358226776123, + "learning_rate": 3.88029038182822e-06, + "loss": 0.5171, + "step": 4051 + }, + { + "epoch": 1.915839243498818, + "grad_norm": 2.563899278640747, + "learning_rate": 3.879770212386055e-06, + "loss": 0.4911, + "step": 4052 + }, + { + "epoch": 1.9163120567375886, + "grad_norm": 2.499404191970825, + "learning_rate": 3.879249957031649e-06, + "loss": 0.5072, + "step": 4053 + }, + { + "epoch": 1.9167848699763592, + "grad_norm": 2.817713499069214, + "learning_rate": 3.878729615797396e-06, + "loss": 0.5452, + "step": 4054 + }, + { + "epoch": 1.91725768321513, + "grad_norm": 2.7152490615844727, + "learning_rate": 3.878209188715696e-06, + "loss": 0.4917, + "step": 4055 + }, + { + "epoch": 1.9177304964539008, + "grad_norm": 2.384265661239624, + "learning_rate": 3.877688675818953e-06, + "loss": 0.4823, + "step": 4056 + }, + { + "epoch": 1.9182033096926714, + "grad_norm": 2.61059308052063, + "learning_rate": 3.877168077139577e-06, + "loss": 0.478, + "step": 4057 + }, + { + "epoch": 1.918676122931442, + "grad_norm": 2.6107938289642334, + "learning_rate": 3.8766473927099824e-06, + "loss": 0.5202, + "step": 4058 + }, + { + "epoch": 1.9191489361702128, + "grad_norm": 2.2339766025543213, + "learning_rate": 3.876126622562592e-06, + "loss": 0.547, + "step": 4059 + }, + { + "epoch": 1.9196217494089836, + "grad_norm": 2.4324610233306885, + "learning_rate": 3.8756057667298304e-06, + "loss": 0.5333, + "step": 4060 + }, + { + "epoch": 1.9200945626477541, + "grad_norm": 2.5521230697631836, + "learning_rate": 3.875084825244131e-06, + "loss": 0.5503, + "step": 4061 + }, + { + "epoch": 1.9205673758865247, + "grad_norm": 2.6985747814178467, + "learning_rate": 3.874563798137928e-06, + "loss": 0.4944, + "step": 4062 + }, + { + "epoch": 1.9210401891252955, + "grad_norm": 2.422332525253296, + "learning_rate": 3.874042685443664e-06, + "loss": 0.4807, + "step": 4063 + }, + { + "epoch": 1.9215130023640663, + "grad_norm": 2.914553165435791, + "learning_rate": 
3.873521487193788e-06, + "loss": 0.4439, + "step": 4064 + }, + { + "epoch": 1.9219858156028369, + "grad_norm": 2.8098697662353516, + "learning_rate": 3.873000203420752e-06, + "loss": 0.5433, + "step": 4065 + }, + { + "epoch": 1.9224586288416075, + "grad_norm": 2.6124703884124756, + "learning_rate": 3.872478834157013e-06, + "loss": 0.4812, + "step": 4066 + }, + { + "epoch": 1.9229314420803783, + "grad_norm": 2.511059522628784, + "learning_rate": 3.871957379435035e-06, + "loss": 0.4666, + "step": 4067 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 2.950542688369751, + "learning_rate": 3.871435839287287e-06, + "loss": 0.5687, + "step": 4068 + }, + { + "epoch": 1.9238770685579196, + "grad_norm": 2.4969422817230225, + "learning_rate": 3.870914213746243e-06, + "loss": 0.5235, + "step": 4069 + }, + { + "epoch": 1.9243498817966902, + "grad_norm": 2.512152910232544, + "learning_rate": 3.870392502844382e-06, + "loss": 0.4524, + "step": 4070 + }, + { + "epoch": 1.924822695035461, + "grad_norm": 3.0212557315826416, + "learning_rate": 3.86987070661419e-06, + "loss": 0.4868, + "step": 4071 + }, + { + "epoch": 1.9252955082742318, + "grad_norm": 2.8949966430664062, + "learning_rate": 3.869348825088154e-06, + "loss": 0.5556, + "step": 4072 + }, + { + "epoch": 1.9257683215130024, + "grad_norm": 2.402043581008911, + "learning_rate": 3.868826858298772e-06, + "loss": 0.5307, + "step": 4073 + }, + { + "epoch": 1.926241134751773, + "grad_norm": 2.980992078781128, + "learning_rate": 3.868304806278543e-06, + "loss": 0.6313, + "step": 4074 + }, + { + "epoch": 1.9267139479905437, + "grad_norm": 2.7140514850616455, + "learning_rate": 3.867782669059975e-06, + "loss": 0.5359, + "step": 4075 + }, + { + "epoch": 1.9271867612293145, + "grad_norm": 2.499631643295288, + "learning_rate": 3.867260446675577e-06, + "loss": 0.4873, + "step": 4076 + }, + { + "epoch": 1.9276595744680851, + "grad_norm": 2.915583610534668, + "learning_rate": 3.866738139157866e-06, + "loss": 0.5736, + "step": 4077 + }, + { + "epoch": 1.9281323877068557, + "grad_norm": 2.4231131076812744, + "learning_rate": 3.866215746539363e-06, + "loss": 0.5096, + "step": 4078 + }, + { + "epoch": 1.9286052009456265, + "grad_norm": 2.360074996948242, + "learning_rate": 3.865693268852599e-06, + "loss": 0.4907, + "step": 4079 + }, + { + "epoch": 1.9290780141843973, + "grad_norm": 2.5410032272338867, + "learning_rate": 3.865170706130101e-06, + "loss": 0.473, + "step": 4080 + }, + { + "epoch": 1.9295508274231679, + "grad_norm": 2.780090808868408, + "learning_rate": 3.86464805840441e-06, + "loss": 0.5213, + "step": 4081 + }, + { + "epoch": 1.9300236406619384, + "grad_norm": 2.7318382263183594, + "learning_rate": 3.864125325708068e-06, + "loss": 0.5617, + "step": 4082 + }, + { + "epoch": 1.9304964539007092, + "grad_norm": 2.76509165763855, + "learning_rate": 3.863602508073623e-06, + "loss": 0.52, + "step": 4083 + }, + { + "epoch": 1.93096926713948, + "grad_norm": 2.8041110038757324, + "learning_rate": 3.863079605533631e-06, + "loss": 0.5343, + "step": 4084 + }, + { + "epoch": 1.9314420803782506, + "grad_norm": 2.4462404251098633, + "learning_rate": 3.862556618120647e-06, + "loss": 0.4657, + "step": 4085 + }, + { + "epoch": 1.9319148936170212, + "grad_norm": 2.460864305496216, + "learning_rate": 3.862033545867238e-06, + "loss": 0.517, + "step": 4086 + }, + { + "epoch": 1.932387706855792, + "grad_norm": 2.6480276584625244, + "learning_rate": 3.8615103888059715e-06, + "loss": 0.4702, + "step": 4087 + }, + { + "epoch": 1.9328605200945628, + "grad_norm": 
2.7175381183624268, + "learning_rate": 3.860987146969424e-06, + "loss": 0.5073, + "step": 4088 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 2.4963486194610596, + "learning_rate": 3.860463820390175e-06, + "loss": 0.4491, + "step": 4089 + }, + { + "epoch": 1.933806146572104, + "grad_norm": 2.548135757446289, + "learning_rate": 3.8599404091008075e-06, + "loss": 0.5134, + "step": 4090 + }, + { + "epoch": 1.9342789598108747, + "grad_norm": 2.8693668842315674, + "learning_rate": 3.859416913133916e-06, + "loss": 0.5467, + "step": 4091 + }, + { + "epoch": 1.9347517730496455, + "grad_norm": 2.711273670196533, + "learning_rate": 3.858893332522092e-06, + "loss": 0.6287, + "step": 4092 + }, + { + "epoch": 1.935224586288416, + "grad_norm": 2.8604533672332764, + "learning_rate": 3.858369667297941e-06, + "loss": 0.5661, + "step": 4093 + }, + { + "epoch": 1.9356973995271867, + "grad_norm": 2.936988353729248, + "learning_rate": 3.857845917494066e-06, + "loss": 0.5311, + "step": 4094 + }, + { + "epoch": 1.9361702127659575, + "grad_norm": 2.414093494415283, + "learning_rate": 3.857322083143079e-06, + "loss": 0.505, + "step": 4095 + }, + { + "epoch": 1.9366430260047283, + "grad_norm": 2.5528934001922607, + "learning_rate": 3.856798164277599e-06, + "loss": 0.4759, + "step": 4096 + }, + { + "epoch": 1.9371158392434988, + "grad_norm": 2.592893600463867, + "learning_rate": 3.8562741609302456e-06, + "loss": 0.4932, + "step": 4097 + }, + { + "epoch": 1.9375886524822694, + "grad_norm": 2.9619107246398926, + "learning_rate": 3.855750073133648e-06, + "loss": 0.5563, + "step": 4098 + }, + { + "epoch": 1.9380614657210402, + "grad_norm": 2.864889621734619, + "learning_rate": 3.855225900920438e-06, + "loss": 0.5069, + "step": 4099 + }, + { + "epoch": 1.938534278959811, + "grad_norm": 2.3951032161712646, + "learning_rate": 3.854701644323253e-06, + "loss": 0.4883, + "step": 4100 + }, + { + "epoch": 1.9390070921985816, + "grad_norm": 2.6339633464813232, + "learning_rate": 3.854177303374737e-06, + "loss": 0.5207, + "step": 4101 + }, + { + "epoch": 1.9394799054373522, + "grad_norm": 2.6435508728027344, + "learning_rate": 3.853652878107539e-06, + "loss": 0.4679, + "step": 4102 + }, + { + "epoch": 1.939952718676123, + "grad_norm": 2.4635629653930664, + "learning_rate": 3.853128368554311e-06, + "loss": 0.5639, + "step": 4103 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 2.664635419845581, + "learning_rate": 3.852603774747714e-06, + "loss": 0.5697, + "step": 4104 + }, + { + "epoch": 1.9408983451536643, + "grad_norm": 2.7020363807678223, + "learning_rate": 3.8520790967204095e-06, + "loss": 0.5462, + "step": 4105 + }, + { + "epoch": 1.941371158392435, + "grad_norm": 3.529282331466675, + "learning_rate": 3.851554334505069e-06, + "loss": 0.54, + "step": 4106 + }, + { + "epoch": 1.9418439716312057, + "grad_norm": 2.7125768661499023, + "learning_rate": 3.851029488134367e-06, + "loss": 0.5355, + "step": 4107 + }, + { + "epoch": 1.9423167848699765, + "grad_norm": 2.5226643085479736, + "learning_rate": 3.850504557640981e-06, + "loss": 0.5106, + "step": 4108 + }, + { + "epoch": 1.942789598108747, + "grad_norm": 2.834352731704712, + "learning_rate": 3.8499795430575995e-06, + "loss": 0.6069, + "step": 4109 + }, + { + "epoch": 1.9432624113475176, + "grad_norm": 2.8484177589416504, + "learning_rate": 3.849454444416911e-06, + "loss": 0.5542, + "step": 4110 + }, + { + "epoch": 1.9437352245862884, + "grad_norm": 2.402539014816284, + "learning_rate": 3.848929261751612e-06, + "loss": 0.47, + "step": 4111 + }, + { + "epoch": 
1.9442080378250592, + "grad_norm": 2.7010042667388916, + "learning_rate": 3.848403995094402e-06, + "loss": 0.5263, + "step": 4112 + }, + { + "epoch": 1.9446808510638298, + "grad_norm": 2.441689968109131, + "learning_rate": 3.847878644477988e-06, + "loss": 0.5607, + "step": 4113 + }, + { + "epoch": 1.9451536643026004, + "grad_norm": 2.5994722843170166, + "learning_rate": 3.847353209935081e-06, + "loss": 0.5103, + "step": 4114 + }, + { + "epoch": 1.9456264775413712, + "grad_norm": 2.452242136001587, + "learning_rate": 3.8468276914983975e-06, + "loss": 0.4409, + "step": 4115 + }, + { + "epoch": 1.946099290780142, + "grad_norm": 2.421023368835449, + "learning_rate": 3.84630208920066e-06, + "loss": 0.4429, + "step": 4116 + }, + { + "epoch": 1.9465721040189126, + "grad_norm": 2.696399688720703, + "learning_rate": 3.8457764030745945e-06, + "loss": 0.5352, + "step": 4117 + }, + { + "epoch": 1.9470449172576831, + "grad_norm": 2.3963489532470703, + "learning_rate": 3.845250633152933e-06, + "loss": 0.4505, + "step": 4118 + }, + { + "epoch": 1.947517730496454, + "grad_norm": 2.610649585723877, + "learning_rate": 3.8447247794684135e-06, + "loss": 0.501, + "step": 4119 + }, + { + "epoch": 1.9479905437352247, + "grad_norm": 2.740412712097168, + "learning_rate": 3.8441988420537775e-06, + "loss": 0.5362, + "step": 4120 + }, + { + "epoch": 1.9484633569739953, + "grad_norm": 2.2614004611968994, + "learning_rate": 3.8436728209417755e-06, + "loss": 0.4199, + "step": 4121 + }, + { + "epoch": 1.9489361702127659, + "grad_norm": 3.0683481693267822, + "learning_rate": 3.843146716165158e-06, + "loss": 0.5248, + "step": 4122 + }, + { + "epoch": 1.9494089834515367, + "grad_norm": 3.005174398422241, + "learning_rate": 3.842620527756684e-06, + "loss": 0.5246, + "step": 4123 + }, + { + "epoch": 1.9498817966903075, + "grad_norm": 2.672896385192871, + "learning_rate": 3.842094255749117e-06, + "loss": 0.5586, + "step": 4124 + }, + { + "epoch": 1.950354609929078, + "grad_norm": 2.5481197834014893, + "learning_rate": 3.8415679001752255e-06, + "loss": 0.5061, + "step": 4125 + }, + { + "epoch": 1.9508274231678486, + "grad_norm": 2.515789270401001, + "learning_rate": 3.8410414610677835e-06, + "loss": 0.4645, + "step": 4126 + }, + { + "epoch": 1.9513002364066194, + "grad_norm": 2.7236077785491943, + "learning_rate": 3.84051493845957e-06, + "loss": 0.5623, + "step": 4127 + }, + { + "epoch": 1.9517730496453902, + "grad_norm": 2.6252009868621826, + "learning_rate": 3.839988332383369e-06, + "loss": 0.5078, + "step": 4128 + }, + { + "epoch": 1.9522458628841608, + "grad_norm": 2.719196081161499, + "learning_rate": 3.83946164287197e-06, + "loss": 0.5481, + "step": 4129 + }, + { + "epoch": 1.9527186761229314, + "grad_norm": 2.484163284301758, + "learning_rate": 3.838934869958169e-06, + "loss": 0.5332, + "step": 4130 + }, + { + "epoch": 1.9531914893617022, + "grad_norm": 2.615382671356201, + "learning_rate": 3.838408013674764e-06, + "loss": 0.4742, + "step": 4131 + }, + { + "epoch": 1.953664302600473, + "grad_norm": 2.735321044921875, + "learning_rate": 3.83788107405456e-06, + "loss": 0.421, + "step": 4132 + }, + { + "epoch": 1.9541371158392435, + "grad_norm": 2.892652750015259, + "learning_rate": 3.837354051130369e-06, + "loss": 0.5326, + "step": 4133 + }, + { + "epoch": 1.9546099290780141, + "grad_norm": 2.6800546646118164, + "learning_rate": 3.8368269449350055e-06, + "loss": 0.5041, + "step": 4134 + }, + { + "epoch": 1.955082742316785, + "grad_norm": 2.362470865249634, + "learning_rate": 3.836299755501289e-06, + "loss": 0.4697, + 
"step": 4135 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 2.3855135440826416, + "learning_rate": 3.835772482862047e-06, + "loss": 0.5148, + "step": 4136 + }, + { + "epoch": 1.9560283687943263, + "grad_norm": 2.3338418006896973, + "learning_rate": 3.83524512705011e-06, + "loss": 0.4643, + "step": 4137 + }, + { + "epoch": 1.9565011820330969, + "grad_norm": 2.261355400085449, + "learning_rate": 3.834717688098313e-06, + "loss": 0.5573, + "step": 4138 + }, + { + "epoch": 1.9569739952718677, + "grad_norm": 2.8166391849517822, + "learning_rate": 3.834190166039498e-06, + "loss": 0.4868, + "step": 4139 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 2.4155869483947754, + "learning_rate": 3.833662560906512e-06, + "loss": 0.4923, + "step": 4140 + }, + { + "epoch": 1.957919621749409, + "grad_norm": 2.3977696895599365, + "learning_rate": 3.833134872732206e-06, + "loss": 0.5106, + "step": 4141 + }, + { + "epoch": 1.9583924349881796, + "grad_norm": 2.9541378021240234, + "learning_rate": 3.832607101549438e-06, + "loss": 0.4683, + "step": 4142 + }, + { + "epoch": 1.9588652482269504, + "grad_norm": 2.5862700939178467, + "learning_rate": 3.832079247391068e-06, + "loss": 0.4453, + "step": 4143 + }, + { + "epoch": 1.9593380614657212, + "grad_norm": 2.7459371089935303, + "learning_rate": 3.8315513102899644e-06, + "loss": 0.5511, + "step": 4144 + }, + { + "epoch": 1.9598108747044918, + "grad_norm": 2.904869556427002, + "learning_rate": 3.831023290279e-06, + "loss": 0.5348, + "step": 4145 + }, + { + "epoch": 1.9602836879432624, + "grad_norm": 3.092846632003784, + "learning_rate": 3.830495187391051e-06, + "loss": 0.5664, + "step": 4146 + }, + { + "epoch": 1.9607565011820332, + "grad_norm": 3.2838528156280518, + "learning_rate": 3.829967001659001e-06, + "loss": 0.5115, + "step": 4147 + }, + { + "epoch": 1.961229314420804, + "grad_norm": 2.7799549102783203, + "learning_rate": 3.829438733115738e-06, + "loss": 0.5145, + "step": 4148 + }, + { + "epoch": 1.9617021276595743, + "grad_norm": 2.436084270477295, + "learning_rate": 3.828910381794154e-06, + "loss": 0.4718, + "step": 4149 + }, + { + "epoch": 1.962174940898345, + "grad_norm": 2.6662371158599854, + "learning_rate": 3.828381947727148e-06, + "loss": 0.6129, + "step": 4150 + }, + { + "epoch": 1.962647754137116, + "grad_norm": 2.937000036239624, + "learning_rate": 3.827853430947622e-06, + "loss": 0.522, + "step": 4151 + }, + { + "epoch": 1.9631205673758865, + "grad_norm": 2.5737369060516357, + "learning_rate": 3.827324831488486e-06, + "loss": 0.4916, + "step": 4152 + }, + { + "epoch": 1.963593380614657, + "grad_norm": 2.70232892036438, + "learning_rate": 3.826796149382653e-06, + "loss": 0.4726, + "step": 4153 + }, + { + "epoch": 1.9640661938534278, + "grad_norm": 2.6899707317352295, + "learning_rate": 3.826267384663042e-06, + "loss": 0.529, + "step": 4154 + }, + { + "epoch": 1.9645390070921986, + "grad_norm": 2.6142728328704834, + "learning_rate": 3.825738537362575e-06, + "loss": 0.4999, + "step": 4155 + }, + { + "epoch": 1.9650118203309692, + "grad_norm": 2.43949818611145, + "learning_rate": 3.825209607514183e-06, + "loss": 0.5035, + "step": 4156 + }, + { + "epoch": 1.9654846335697398, + "grad_norm": 2.3735458850860596, + "learning_rate": 3.824680595150801e-06, + "loss": 0.4779, + "step": 4157 + }, + { + "epoch": 1.9659574468085106, + "grad_norm": 2.444307565689087, + "learning_rate": 3.824151500305365e-06, + "loss": 0.4825, + "step": 4158 + }, + { + "epoch": 1.9664302600472814, + "grad_norm": 2.8219668865203857, + "learning_rate": 
3.8236223230108224e-06, + "loss": 0.5354, + "step": 4159 + }, + { + "epoch": 1.966903073286052, + "grad_norm": 2.720721483230591, + "learning_rate": 3.823093063300121e-06, + "loss": 0.5064, + "step": 4160 + }, + { + "epoch": 1.9673758865248225, + "grad_norm": 2.324190616607666, + "learning_rate": 3.822563721206217e-06, + "loss": 0.5348, + "step": 4161 + }, + { + "epoch": 1.9678486997635933, + "grad_norm": 2.702155351638794, + "learning_rate": 3.8220342967620695e-06, + "loss": 0.5388, + "step": 4162 + }, + { + "epoch": 1.9683215130023641, + "grad_norm": 2.4956369400024414, + "learning_rate": 3.821504790000642e-06, + "loss": 0.5071, + "step": 4163 + }, + { + "epoch": 1.9687943262411347, + "grad_norm": 2.568039655685425, + "learning_rate": 3.820975200954906e-06, + "loss": 0.5133, + "step": 4164 + }, + { + "epoch": 1.9692671394799053, + "grad_norm": 2.810868978500366, + "learning_rate": 3.820445529657837e-06, + "loss": 0.4856, + "step": 4165 + }, + { + "epoch": 1.969739952718676, + "grad_norm": 2.66365647315979, + "learning_rate": 3.819915776142415e-06, + "loss": 0.5235, + "step": 4166 + }, + { + "epoch": 1.9702127659574469, + "grad_norm": 2.2982139587402344, + "learning_rate": 3.8193859404416265e-06, + "loss": 0.4361, + "step": 4167 + }, + { + "epoch": 1.9706855791962175, + "grad_norm": 2.585672378540039, + "learning_rate": 3.818856022588458e-06, + "loss": 0.4842, + "step": 4168 + }, + { + "epoch": 1.971158392434988, + "grad_norm": 2.57857346534729, + "learning_rate": 3.81832602261591e-06, + "loss": 0.5249, + "step": 4169 + }, + { + "epoch": 1.9716312056737588, + "grad_norm": 2.6947224140167236, + "learning_rate": 3.817795940556981e-06, + "loss": 0.5234, + "step": 4170 + }, + { + "epoch": 1.9721040189125296, + "grad_norm": 2.7453415393829346, + "learning_rate": 3.8172657764446764e-06, + "loss": 0.5219, + "step": 4171 + }, + { + "epoch": 1.9725768321513002, + "grad_norm": 8.424073219299316, + "learning_rate": 3.816735530312009e-06, + "loss": 0.5162, + "step": 4172 + }, + { + "epoch": 1.9730496453900708, + "grad_norm": 2.8229739665985107, + "learning_rate": 3.816205202191993e-06, + "loss": 0.4621, + "step": 4173 + }, + { + "epoch": 1.9735224586288416, + "grad_norm": 2.5969009399414062, + "learning_rate": 3.815674792117651e-06, + "loss": 0.5044, + "step": 4174 + }, + { + "epoch": 1.9739952718676124, + "grad_norm": 2.646024227142334, + "learning_rate": 3.815144300122009e-06, + "loss": 0.5094, + "step": 4175 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 2.4950616359710693, + "learning_rate": 3.814613726238097e-06, + "loss": 0.4827, + "step": 4176 + }, + { + "epoch": 1.9749408983451535, + "grad_norm": 2.5636119842529297, + "learning_rate": 3.8140830704989535e-06, + "loss": 0.5241, + "step": 4177 + }, + { + "epoch": 1.9754137115839243, + "grad_norm": 2.7936553955078125, + "learning_rate": 3.813552332937619e-06, + "loss": 0.5344, + "step": 4178 + }, + { + "epoch": 1.9758865248226951, + "grad_norm": 2.8085341453552246, + "learning_rate": 3.8130215135871405e-06, + "loss": 0.5647, + "step": 4179 + }, + { + "epoch": 1.9763593380614657, + "grad_norm": 2.4776322841644287, + "learning_rate": 3.8124906124805694e-06, + "loss": 0.542, + "step": 4180 + }, + { + "epoch": 1.9768321513002363, + "grad_norm": 2.3227856159210205, + "learning_rate": 3.8119596296509635e-06, + "loss": 0.4618, + "step": 4181 + }, + { + "epoch": 1.977304964539007, + "grad_norm": 2.5157814025878906, + "learning_rate": 3.8114285651313848e-06, + "loss": 0.538, + "step": 4182 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 
2.5630218982696533, + "learning_rate": 3.8108974189548987e-06, + "loss": 0.5254, + "step": 4183 + }, + { + "epoch": 1.9782505910165484, + "grad_norm": 2.703237533569336, + "learning_rate": 3.8103661911545787e-06, + "loss": 0.4859, + "step": 4184 + }, + { + "epoch": 1.978723404255319, + "grad_norm": 2.8808000087738037, + "learning_rate": 3.809834881763502e-06, + "loss": 0.5585, + "step": 4185 + }, + { + "epoch": 1.9791962174940898, + "grad_norm": 2.9047577381134033, + "learning_rate": 3.8093034908147507e-06, + "loss": 0.5022, + "step": 4186 + }, + { + "epoch": 1.9796690307328606, + "grad_norm": 2.7417640686035156, + "learning_rate": 3.8087720183414125e-06, + "loss": 0.5275, + "step": 4187 + }, + { + "epoch": 1.9801418439716312, + "grad_norm": 2.952012062072754, + "learning_rate": 3.8082404643765786e-06, + "loss": 0.543, + "step": 4188 + }, + { + "epoch": 1.9806146572104018, + "grad_norm": 2.538376569747925, + "learning_rate": 3.807708828953348e-06, + "loss": 0.4969, + "step": 4189 + }, + { + "epoch": 1.9810874704491725, + "grad_norm": 2.3476181030273438, + "learning_rate": 3.807177112104823e-06, + "loss": 0.4979, + "step": 4190 + }, + { + "epoch": 1.9815602836879433, + "grad_norm": 2.6480464935302734, + "learning_rate": 3.80664531386411e-06, + "loss": 0.4894, + "step": 4191 + }, + { + "epoch": 1.982033096926714, + "grad_norm": 2.792916774749756, + "learning_rate": 3.8061134342643235e-06, + "loss": 0.5468, + "step": 4192 + }, + { + "epoch": 1.9825059101654845, + "grad_norm": 2.368736743927002, + "learning_rate": 3.805581473338581e-06, + "loss": 0.4672, + "step": 4193 + }, + { + "epoch": 1.9829787234042553, + "grad_norm": 2.379084348678589, + "learning_rate": 3.8050494311200037e-06, + "loss": 0.4577, + "step": 4194 + }, + { + "epoch": 1.983451536643026, + "grad_norm": 2.722471237182617, + "learning_rate": 3.804517307641722e-06, + "loss": 0.4988, + "step": 4195 + }, + { + "epoch": 1.9839243498817967, + "grad_norm": 2.356649875640869, + "learning_rate": 3.8039851029368674e-06, + "loss": 0.4933, + "step": 4196 + }, + { + "epoch": 1.9843971631205672, + "grad_norm": 2.9182281494140625, + "learning_rate": 3.8034528170385776e-06, + "loss": 0.4873, + "step": 4197 + }, + { + "epoch": 1.984869976359338, + "grad_norm": 2.6232199668884277, + "learning_rate": 3.8029204499799976e-06, + "loss": 0.4425, + "step": 4198 + }, + { + "epoch": 1.9853427895981088, + "grad_norm": 2.667541980743408, + "learning_rate": 3.802388001794274e-06, + "loss": 0.5022, + "step": 4199 + }, + { + "epoch": 1.9858156028368794, + "grad_norm": 3.168470621109009, + "learning_rate": 3.8018554725145596e-06, + "loss": 0.5505, + "step": 4200 + }, + { + "epoch": 1.98628841607565, + "grad_norm": 2.716625452041626, + "learning_rate": 3.8013228621740132e-06, + "loss": 0.4937, + "step": 4201 + }, + { + "epoch": 1.9867612293144208, + "grad_norm": 2.3014442920684814, + "learning_rate": 3.800790170805799e-06, + "loss": 0.4734, + "step": 4202 + }, + { + "epoch": 1.9872340425531916, + "grad_norm": 2.9426841735839844, + "learning_rate": 3.8002573984430847e-06, + "loss": 0.4983, + "step": 4203 + }, + { + "epoch": 1.9877068557919622, + "grad_norm": 2.5598278045654297, + "learning_rate": 3.7997245451190435e-06, + "loss": 0.4834, + "step": 4204 + }, + { + "epoch": 1.9881796690307327, + "grad_norm": 2.86458420753479, + "learning_rate": 3.7991916108668538e-06, + "loss": 0.5613, + "step": 4205 + }, + { + "epoch": 1.9886524822695035, + "grad_norm": 2.842914342880249, + "learning_rate": 3.7986585957196997e-06, + "loss": 0.4951, + "step": 4206 + }, + { + 
"epoch": 1.9891252955082743, + "grad_norm": 3.1828150749206543, + "learning_rate": 3.7981254997107686e-06, + "loss": 0.5913, + "step": 4207 + }, + { + "epoch": 1.989598108747045, + "grad_norm": 2.5765931606292725, + "learning_rate": 3.7975923228732547e-06, + "loss": 0.5544, + "step": 4208 + }, + { + "epoch": 1.9900709219858155, + "grad_norm": 2.492234945297241, + "learning_rate": 3.797059065240357e-06, + "loss": 0.5046, + "step": 4209 + }, + { + "epoch": 1.9905437352245863, + "grad_norm": 2.870346784591675, + "learning_rate": 3.7965257268452795e-06, + "loss": 0.5354, + "step": 4210 + }, + { + "epoch": 1.991016548463357, + "grad_norm": 2.4989993572235107, + "learning_rate": 3.795992307721229e-06, + "loss": 0.4677, + "step": 4211 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 2.931114673614502, + "learning_rate": 3.7954588079014206e-06, + "loss": 0.5504, + "step": 4212 + }, + { + "epoch": 1.9919621749408982, + "grad_norm": 2.5247652530670166, + "learning_rate": 3.794925227419073e-06, + "loss": 0.4736, + "step": 4213 + }, + { + "epoch": 1.992434988179669, + "grad_norm": 2.6238436698913574, + "learning_rate": 3.794391566307409e-06, + "loss": 0.4591, + "step": 4214 + }, + { + "epoch": 1.9929078014184398, + "grad_norm": 2.654886245727539, + "learning_rate": 3.7938578245996584e-06, + "loss": 0.5149, + "step": 4215 + }, + { + "epoch": 1.9933806146572104, + "grad_norm": 2.509164810180664, + "learning_rate": 3.793324002329054e-06, + "loss": 0.4951, + "step": 4216 + }, + { + "epoch": 1.993853427895981, + "grad_norm": 2.909632921218872, + "learning_rate": 3.7927900995288345e-06, + "loss": 0.5131, + "step": 4217 + }, + { + "epoch": 1.9943262411347518, + "grad_norm": 2.4354615211486816, + "learning_rate": 3.7922561162322456e-06, + "loss": 0.4716, + "step": 4218 + }, + { + "epoch": 1.9947990543735226, + "grad_norm": 2.6514649391174316, + "learning_rate": 3.791722052472534e-06, + "loss": 0.5714, + "step": 4219 + }, + { + "epoch": 1.9952718676122931, + "grad_norm": 2.77089262008667, + "learning_rate": 3.791187908282954e-06, + "loss": 0.5736, + "step": 4220 + }, + { + "epoch": 1.9957446808510637, + "grad_norm": 2.7651021480560303, + "learning_rate": 3.7906536836967657e-06, + "loss": 0.4948, + "step": 4221 + }, + { + "epoch": 1.9962174940898345, + "grad_norm": 2.7536795139312744, + "learning_rate": 3.7901193787472306e-06, + "loss": 0.512, + "step": 4222 + }, + { + "epoch": 1.9966903073286053, + "grad_norm": 2.684893846511841, + "learning_rate": 3.78958499346762e-06, + "loss": 0.5118, + "step": 4223 + }, + { + "epoch": 1.9971631205673759, + "grad_norm": 2.7616753578186035, + "learning_rate": 3.7890505278912054e-06, + "loss": 0.4516, + "step": 4224 + }, + { + "epoch": 1.9976359338061465, + "grad_norm": 2.4731967449188232, + "learning_rate": 3.7885159820512666e-06, + "loss": 0.4736, + "step": 4225 + }, + { + "epoch": 1.9981087470449173, + "grad_norm": 2.366631031036377, + "learning_rate": 3.7879813559810884e-06, + "loss": 0.4999, + "step": 4226 + }, + { + "epoch": 1.998581560283688, + "grad_norm": 2.994624137878418, + "learning_rate": 3.7874466497139582e-06, + "loss": 0.5273, + "step": 4227 + }, + { + "epoch": 1.9990543735224586, + "grad_norm": 2.4499242305755615, + "learning_rate": 3.7869118632831712e-06, + "loss": 0.5761, + "step": 4228 + }, + { + "epoch": 1.9995271867612292, + "grad_norm": 2.3370113372802734, + "learning_rate": 3.7863769967220243e-06, + "loss": 0.4673, + "step": 4229 + }, + { + "epoch": 2.0, + "grad_norm": 3.1131203174591064, + "learning_rate": 3.7858420500638236e-06, + "loss": 
0.5118, + "step": 4230 + } + ], + "logging_steps": 1, + "max_steps": 12690, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 2115, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0684503718121964e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4230/training_args.bin b/checkpoint-4230/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc --- /dev/null +++ b/checkpoint-4230/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6 +size 8056 diff --git a/checkpoint-4230/zero_to_fp32.py b/checkpoint-4230/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-4230/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
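# Note: checkpoint folders in this repository (e.g. checkpoint-6345) also contain a `latest` file
# naming the global_step sub-directory that holds the ZeRO shards, so the example invocation above
# (python zero_to_fp32.py . pytorch_model.bin) can be run from inside such a folder once deepspeed
# is installed in the environment.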
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + 
state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + 
frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * 
world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. 
i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-6345/README.md b/checkpoint-6345/README.md new file mode 100644 index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4 --- /dev/null +++ b/checkpoint-6345/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
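The snippet below is a minimal, illustrative sketch of loading this LoRA adapter on top of the base model with Transformers and PEFT. The adapter repository id is a placeholder (replace it with the actual repo id or a local checkpoint path), the prompt is only an example, and loading in bf16 with `device_map="auto"` is one reasonable choice rather than a requirement.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_id = "meta-llama/Llama-3.1-8B"
adapter_id = "your-username/your-adapter-repo"  # placeholder: repo id or local path of this adapter

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Attach the trained LoRA adapter (which also carries the saved embed_tokens / lm_head modules).
model = PeftModel.from_pretrained(base_model, adapter_id)
model.eval()

prompt = "Summarize the eligibility criteria of this clinical trial in one sentence."  # example prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```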
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-6345/adapter_config.json b/checkpoint-6345/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a --- /dev/null +++ b/checkpoint-6345/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-6345/adapter_model.safetensors b/checkpoint-6345/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a53ac6eab0e0f6a4c9380ba3986fc09997e7c966 --- /dev/null +++ b/checkpoint-6345/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3db9057267cf2d082b32710b5fe435b23b00f8298e615fe927c3a0adc70c4767 +size 3443586272 diff --git a/checkpoint-6345/global_step6345/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 
b/checkpoint-6345/global_step6345/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d16de72224564ee5a203b10247bcc7645b7fbf44 --- /dev/null +++ b/checkpoint-6345/global_step6345/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d957c7c6d3a677cf4693895155fa37c363f329306246b9a663e31cae9a23672 +size 20661195036 diff --git a/checkpoint-6345/global_step6345/mp_rank_00_model_states.pt b/checkpoint-6345/global_step6345/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..157a53c213e78312f6246ec75059d2ac3e78ff5b --- /dev/null +++ b/checkpoint-6345/global_step6345/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e51765e012d7c7d3bbdd93b2bc0bf866d582a03c65957556bf9c5fa12d9ab138 +size 3555326841 diff --git a/checkpoint-6345/latest b/checkpoint-6345/latest new file mode 100644 index 0000000000000000000000000000000000000000..fd3df6b13e10c2bf305cd21c1bc31f479846db0c --- /dev/null +++ b/checkpoint-6345/latest @@ -0,0 +1 @@ +global_step6345 \ No newline at end of file diff --git a/checkpoint-6345/rng_state.pth b/checkpoint-6345/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..130c6b431a345102d115f1e8198f9da92ce28bc5 --- /dev/null +++ b/checkpoint-6345/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dfe287174d835b24c21de3b28711ba5592cb0f747c67f0d46f820548100d9cb +size 14244 diff --git a/checkpoint-6345/scheduler.pt b/checkpoint-6345/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bca9b33032354b22350732af9e832e85170752f --- /dev/null +++ b/checkpoint-6345/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db2cf1b2eb6f1e141df847d489467e491cfee5e0bebdaddda4c4564e7cfa498b +size 1064 diff --git a/checkpoint-6345/special_tokens_map.json b/checkpoint-6345/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-6345/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-6345/tokenizer.json b/checkpoint-6345/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-6345/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-6345/tokenizer_config.json b/checkpoint-6345/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5 --- /dev/null +++ b/checkpoint-6345/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": 
"<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": 
"<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": 
"<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": 
"<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": 
"<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": 
"<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": 
"<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": 
"<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": 
"<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": 
"<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-6345/trainer_state.json b/checkpoint-6345/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f6b3d8eecb764dad678e9a60e3411302a3f63f2c --- /dev/null +++ b/checkpoint-6345/trainer_state.json @@ -0,0 +1,44448 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 6345, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00047281323877068556, + "grad_norm": 5.163570880889893, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.4628, + "step": 1 + }, + { + "epoch": 0.0009456264775413711, + "grad_norm": 6.298020839691162, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.5003, + "step": 2 + }, + { + "epoch": 0.0014184397163120568, + "grad_norm": 5.853623390197754, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.4495, + "step": 3 + }, + { + "epoch": 0.0018912529550827422, + "grad_norm": 5.456025123596191, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.3798, + "step": 4 + }, + { + "epoch": 0.002364066193853428, + "grad_norm": 5.757407188415527, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.4515, + "step": 5 + }, + { + "epoch": 0.0028368794326241137, + "grad_norm": 5.872277736663818, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4424, + "step": 6 + }, + { + "epoch": 0.003309692671394799, + "grad_norm": 6.7816009521484375, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.4004, + "step": 7 + }, + { + "epoch": 0.0037825059101654845, + "grad_norm": 6.229667663574219, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.4494, + "step": 8 + }, + { + "epoch": 0.00425531914893617, + "grad_norm": 5.336202621459961, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.3916, + "step": 9 + }, + { + "epoch": 0.004728132387706856, + "grad_norm": 5.589445114135742, + "learning_rate": 5.000000000000001e-07, + "loss": 1.2318, + "step": 10 + }, + { + "epoch": 0.005200945626477541, + "grad_norm": 5.720539569854736, + "learning_rate": 5.5e-07, + "loss": 1.4367, + "step": 11 + }, + { + "epoch": 0.005673758865248227, + "grad_norm": 5.913913726806641, + "learning_rate": 6.000000000000001e-07, + "loss": 1.342, + "step": 12 + }, + { + "epoch": 0.006146572104018913, + "grad_norm": 5.899744987487793, + "learning_rate": 6.5e-07, + "loss": 1.4307, + "step": 13 + }, + { + "epoch": 0.006619385342789598, + "grad_norm": 5.571037292480469, + "learning_rate": 7.000000000000001e-07, + "loss": 1.3372, + "step": 14 + }, + { + "epoch": 0.0070921985815602835, + "grad_norm": 5.480010509490967, + "learning_rate": 7.5e-07, + "loss": 1.3923, + "step": 15 + }, + { + "epoch": 0.007565011820330969, + "grad_norm": 5.254702091217041, + "learning_rate": 8.000000000000001e-07, + "loss": 1.2928, + "step": 16 + }, + { + "epoch": 0.008037825059101654, + "grad_norm": 
6.090312480926514, + "learning_rate": 8.500000000000001e-07, + "loss": 1.4984, + "step": 17 + }, + { + "epoch": 0.00851063829787234, + "grad_norm": 5.689319610595703, + "learning_rate": 9.000000000000001e-07, + "loss": 1.4108, + "step": 18 + }, + { + "epoch": 0.008983451536643027, + "grad_norm": 5.386685848236084, + "learning_rate": 9.500000000000001e-07, + "loss": 1.425, + "step": 19 + }, + { + "epoch": 0.009456264775413711, + "grad_norm": 6.451584815979004, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5507, + "step": 20 + }, + { + "epoch": 0.009929078014184398, + "grad_norm": 5.37647008895874, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.4109, + "step": 21 + }, + { + "epoch": 0.010401891252955082, + "grad_norm": 4.716553211212158, + "learning_rate": 1.1e-06, + "loss": 1.2028, + "step": 22 + }, + { + "epoch": 0.010874704491725768, + "grad_norm": 4.950989723205566, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.3043, + "step": 23 + }, + { + "epoch": 0.011347517730496455, + "grad_norm": 4.688975811004639, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.2708, + "step": 24 + }, + { + "epoch": 0.01182033096926714, + "grad_norm": 4.905868053436279, + "learning_rate": 1.25e-06, + "loss": 1.3268, + "step": 25 + }, + { + "epoch": 0.012293144208037825, + "grad_norm": 4.503395080566406, + "learning_rate": 1.3e-06, + "loss": 1.1799, + "step": 26 + }, + { + "epoch": 0.01276595744680851, + "grad_norm": 4.77382230758667, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.3882, + "step": 27 + }, + { + "epoch": 0.013238770685579196, + "grad_norm": 4.734329700469971, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.3476, + "step": 28 + }, + { + "epoch": 0.013711583924349883, + "grad_norm": 4.775066375732422, + "learning_rate": 1.45e-06, + "loss": 1.2429, + "step": 29 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 4.978334426879883, + "learning_rate": 1.5e-06, + "loss": 1.2119, + "step": 30 + }, + { + "epoch": 0.014657210401891253, + "grad_norm": 4.506785869598389, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.3157, + "step": 31 + }, + { + "epoch": 0.015130023640661938, + "grad_norm": 4.007757186889648, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.1451, + "step": 32 + }, + { + "epoch": 0.015602836879432624, + "grad_norm": 3.6621618270874023, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.093, + "step": 33 + }, + { + "epoch": 0.01607565011820331, + "grad_norm": 3.8733766078948975, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.2289, + "step": 34 + }, + { + "epoch": 0.016548463356973995, + "grad_norm": 4.3391900062561035, + "learning_rate": 1.75e-06, + "loss": 1.1453, + "step": 35 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 3.287623643875122, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.0257, + "step": 36 + }, + { + "epoch": 0.017494089834515367, + "grad_norm": 3.591721773147583, + "learning_rate": 1.85e-06, + "loss": 0.9976, + "step": 37 + }, + { + "epoch": 0.017966903073286054, + "grad_norm": 4.028271675109863, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.0773, + "step": 38 + }, + { + "epoch": 0.018439716312056736, + "grad_norm": 3.3543951511383057, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.1677, + "step": 39 + }, + { + "epoch": 0.018912529550827423, + "grad_norm": 3.807624340057373, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1232, + "step": 40 + }, + { + "epoch": 0.01938534278959811, + "grad_norm": 4.242797374725342, + "learning_rate": 2.05e-06, + "loss": 
1.1819, + "step": 41 + }, + { + "epoch": 0.019858156028368795, + "grad_norm": 3.4574992656707764, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9878, + "step": 42 + }, + { + "epoch": 0.02033096926713948, + "grad_norm": 3.906695604324341, + "learning_rate": 2.15e-06, + "loss": 1.0592, + "step": 43 + }, + { + "epoch": 0.020803782505910164, + "grad_norm": 3.7543163299560547, + "learning_rate": 2.2e-06, + "loss": 1.0309, + "step": 44 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 3.3777148723602295, + "learning_rate": 2.25e-06, + "loss": 1.0664, + "step": 45 + }, + { + "epoch": 0.021749408983451537, + "grad_norm": 3.6003634929656982, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.0482, + "step": 46 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 3.3961377143859863, + "learning_rate": 2.35e-06, + "loss": 1.0252, + "step": 47 + }, + { + "epoch": 0.02269503546099291, + "grad_norm": 3.1601035594940186, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.0435, + "step": 48 + }, + { + "epoch": 0.023167848699763592, + "grad_norm": 3.4192967414855957, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.0935, + "step": 49 + }, + { + "epoch": 0.02364066193853428, + "grad_norm": 3.1225922107696533, + "learning_rate": 2.5e-06, + "loss": 0.8988, + "step": 50 + }, + { + "epoch": 0.024113475177304965, + "grad_norm": 3.1423380374908447, + "learning_rate": 2.55e-06, + "loss": 1.0159, + "step": 51 + }, + { + "epoch": 0.02458628841607565, + "grad_norm": 3.4782402515411377, + "learning_rate": 2.6e-06, + "loss": 1.0231, + "step": 52 + }, + { + "epoch": 0.025059101654846337, + "grad_norm": 3.8362693786621094, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.0725, + "step": 53 + }, + { + "epoch": 0.02553191489361702, + "grad_norm": 3.033294916152954, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.9377, + "step": 54 + }, + { + "epoch": 0.026004728132387706, + "grad_norm": 3.849741220474243, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.0046, + "step": 55 + }, + { + "epoch": 0.026477541371158392, + "grad_norm": 3.141876220703125, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9226, + "step": 56 + }, + { + "epoch": 0.02695035460992908, + "grad_norm": 2.773594856262207, + "learning_rate": 2.85e-06, + "loss": 0.8662, + "step": 57 + }, + { + "epoch": 0.027423167848699765, + "grad_norm": 3.1460225582122803, + "learning_rate": 2.9e-06, + "loss": 0.9304, + "step": 58 + }, + { + "epoch": 0.027895981087470448, + "grad_norm": 3.293583631515503, + "learning_rate": 2.95e-06, + "loss": 1.0374, + "step": 59 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 3.8190863132476807, + "learning_rate": 3e-06, + "loss": 0.971, + "step": 60 + }, + { + "epoch": 0.02884160756501182, + "grad_norm": 3.4566776752471924, + "learning_rate": 3.05e-06, + "loss": 0.9631, + "step": 61 + }, + { + "epoch": 0.029314420803782507, + "grad_norm": 3.355741500854492, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.0097, + "step": 62 + }, + { + "epoch": 0.029787234042553193, + "grad_norm": 3.29746675491333, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.9459, + "step": 63 + }, + { + "epoch": 0.030260047281323876, + "grad_norm": 3.3122968673706055, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.8594, + "step": 64 + }, + { + "epoch": 0.030732860520094562, + "grad_norm": 3.477701187133789, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.9197, + "step": 65 + }, + { + "epoch": 0.031205673758865248, + "grad_norm": 3.3363406658172607, + "learning_rate": 
3.3000000000000006e-06, + "loss": 0.9478, + "step": 66 + }, + { + "epoch": 0.03167848699763593, + "grad_norm": 4.143295764923096, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0534, + "step": 67 + }, + { + "epoch": 0.03215130023640662, + "grad_norm": 3.2363274097442627, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9454, + "step": 68 + }, + { + "epoch": 0.032624113475177303, + "grad_norm": 3.198746681213379, + "learning_rate": 3.45e-06, + "loss": 0.9388, + "step": 69 + }, + { + "epoch": 0.03309692671394799, + "grad_norm": 3.5751023292541504, + "learning_rate": 3.5e-06, + "loss": 0.9444, + "step": 70 + }, + { + "epoch": 0.033569739952718676, + "grad_norm": 3.1745729446411133, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8683, + "step": 71 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 3.3210883140563965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.8811, + "step": 72 + }, + { + "epoch": 0.03451536643026005, + "grad_norm": 3.2502429485321045, + "learning_rate": 3.65e-06, + "loss": 1.0012, + "step": 73 + }, + { + "epoch": 0.034988179669030735, + "grad_norm": 3.44598126411438, + "learning_rate": 3.7e-06, + "loss": 0.9217, + "step": 74 + }, + { + "epoch": 0.03546099290780142, + "grad_norm": 3.439117431640625, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8976, + "step": 75 + }, + { + "epoch": 0.03593380614657211, + "grad_norm": 3.523627758026123, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8996, + "step": 76 + }, + { + "epoch": 0.03640661938534279, + "grad_norm": 3.3716015815734863, + "learning_rate": 3.85e-06, + "loss": 0.9061, + "step": 77 + }, + { + "epoch": 0.03687943262411347, + "grad_norm": 3.33518385887146, + "learning_rate": 3.900000000000001e-06, + "loss": 0.9371, + "step": 78 + }, + { + "epoch": 0.03735224586288416, + "grad_norm": 3.833829879760742, + "learning_rate": 3.95e-06, + "loss": 0.9669, + "step": 79 + }, + { + "epoch": 0.037825059101654845, + "grad_norm": 3.260446786880493, + "learning_rate": 4.000000000000001e-06, + "loss": 0.9449, + "step": 80 + }, + { + "epoch": 0.03829787234042553, + "grad_norm": 3.532451629638672, + "learning_rate": 4.05e-06, + "loss": 0.897, + "step": 81 + }, + { + "epoch": 0.03877068557919622, + "grad_norm": 3.1156492233276367, + "learning_rate": 4.1e-06, + "loss": 0.8463, + "step": 82 + }, + { + "epoch": 0.039243498817966904, + "grad_norm": 2.8801751136779785, + "learning_rate": 4.15e-06, + "loss": 0.8616, + "step": 83 + }, + { + "epoch": 0.03971631205673759, + "grad_norm": 3.072476863861084, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.8387, + "step": 84 + }, + { + "epoch": 0.04018912529550828, + "grad_norm": 2.9601376056671143, + "learning_rate": 4.25e-06, + "loss": 0.8538, + "step": 85 + }, + { + "epoch": 0.04066193853427896, + "grad_norm": 3.521664619445801, + "learning_rate": 4.3e-06, + "loss": 0.8894, + "step": 86 + }, + { + "epoch": 0.04113475177304964, + "grad_norm": 3.2670981884002686, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8387, + "step": 87 + }, + { + "epoch": 0.04160756501182033, + "grad_norm": 3.422089099884033, + "learning_rate": 4.4e-06, + "loss": 0.7728, + "step": 88 + }, + { + "epoch": 0.042080378250591015, + "grad_norm": 3.414034128189087, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7968, + "step": 89 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 4.234285354614258, + "learning_rate": 4.5e-06, + "loss": 0.8502, + "step": 90 + }, + { + "epoch": 0.04302600472813239, + "grad_norm": 3.1446919441223145, + "learning_rate": 
4.5500000000000005e-06, + "loss": 0.8236, + "step": 91 + }, + { + "epoch": 0.043498817966903074, + "grad_norm": 3.683443307876587, + "learning_rate": 4.600000000000001e-06, + "loss": 0.9792, + "step": 92 + }, + { + "epoch": 0.04397163120567376, + "grad_norm": 3.664219617843628, + "learning_rate": 4.65e-06, + "loss": 0.8743, + "step": 93 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 3.369479179382324, + "learning_rate": 4.7e-06, + "loss": 0.8741, + "step": 94 + }, + { + "epoch": 0.04491725768321513, + "grad_norm": 3.694949150085449, + "learning_rate": 4.75e-06, + "loss": 0.7574, + "step": 95 + }, + { + "epoch": 0.04539007092198582, + "grad_norm": 3.5144498348236084, + "learning_rate": 4.800000000000001e-06, + "loss": 0.9934, + "step": 96 + }, + { + "epoch": 0.0458628841607565, + "grad_norm": 3.164451837539673, + "learning_rate": 4.85e-06, + "loss": 0.7463, + "step": 97 + }, + { + "epoch": 0.046335697399527184, + "grad_norm": 3.222785472869873, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7698, + "step": 98 + }, + { + "epoch": 0.04680851063829787, + "grad_norm": 2.9129555225372314, + "learning_rate": 4.95e-06, + "loss": 0.7856, + "step": 99 + }, + { + "epoch": 0.04728132387706856, + "grad_norm": 3.5061235427856445, + "learning_rate": 5e-06, + "loss": 0.8588, + "step": 100 + }, + { + "epoch": 0.04775413711583924, + "grad_norm": 3.2805044651031494, + "learning_rate": 4.999999922167982e-06, + "loss": 0.7643, + "step": 101 + }, + { + "epoch": 0.04822695035460993, + "grad_norm": 3.5461678504943848, + "learning_rate": 4.999999688671929e-06, + "loss": 0.8253, + "step": 102 + }, + { + "epoch": 0.048699763593380616, + "grad_norm": 3.2238264083862305, + "learning_rate": 4.99999929951186e-06, + "loss": 0.7622, + "step": 103 + }, + { + "epoch": 0.0491725768321513, + "grad_norm": 3.818955898284912, + "learning_rate": 4.999998754687795e-06, + "loss": 0.8471, + "step": 104 + }, + { + "epoch": 0.04964539007092199, + "grad_norm": 3.1252424716949463, + "learning_rate": 4.99999805419977e-06, + "loss": 0.8409, + "step": 105 + }, + { + "epoch": 0.050118203309692674, + "grad_norm": 3.604283571243286, + "learning_rate": 4.999997198047828e-06, + "loss": 0.9027, + "step": 106 + }, + { + "epoch": 0.050591016548463354, + "grad_norm": 3.6752424240112305, + "learning_rate": 4.999996186232023e-06, + "loss": 0.9336, + "step": 107 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 3.517557144165039, + "learning_rate": 4.9999950187524184e-06, + "loss": 0.8351, + "step": 108 + }, + { + "epoch": 0.051536643026004726, + "grad_norm": 3.427285671234131, + "learning_rate": 4.999993695609085e-06, + "loss": 0.8457, + "step": 109 + }, + { + "epoch": 0.05200945626477541, + "grad_norm": 3.2792510986328125, + "learning_rate": 4.999992216802107e-06, + "loss": 0.8391, + "step": 110 + }, + { + "epoch": 0.0524822695035461, + "grad_norm": 3.581094741821289, + "learning_rate": 4.999990582331576e-06, + "loss": 0.7533, + "step": 111 + }, + { + "epoch": 0.052955082742316785, + "grad_norm": 3.1667377948760986, + "learning_rate": 4.999988792197593e-06, + "loss": 0.9562, + "step": 112 + }, + { + "epoch": 0.05342789598108747, + "grad_norm": 3.3609890937805176, + "learning_rate": 4.99998684640027e-06, + "loss": 0.8181, + "step": 113 + }, + { + "epoch": 0.05390070921985816, + "grad_norm": 3.260627269744873, + "learning_rate": 4.999984744939729e-06, + "loss": 0.8012, + "step": 114 + }, + { + "epoch": 0.054373522458628844, + "grad_norm": 3.4535653591156006, + "learning_rate": 4.9999824878160985e-06, + "loss": 0.919, + "step": 
115 + }, + { + "epoch": 0.05484633569739953, + "grad_norm": 3.4880740642547607, + "learning_rate": 4.999980075029522e-06, + "loss": 0.8114, + "step": 116 + }, + { + "epoch": 0.05531914893617021, + "grad_norm": 3.2546932697296143, + "learning_rate": 4.999977506580147e-06, + "loss": 0.8274, + "step": 117 + }, + { + "epoch": 0.055791962174940896, + "grad_norm": 3.2762744426727295, + "learning_rate": 4.999974782468136e-06, + "loss": 0.9018, + "step": 118 + }, + { + "epoch": 0.05626477541371158, + "grad_norm": 3.42825984954834, + "learning_rate": 4.999971902693657e-06, + "loss": 0.8262, + "step": 119 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 3.082496404647827, + "learning_rate": 4.99996886725689e-06, + "loss": 0.8181, + "step": 120 + }, + { + "epoch": 0.057210401891252954, + "grad_norm": 3.322869300842285, + "learning_rate": 4.9999656761580225e-06, + "loss": 0.8382, + "step": 121 + }, + { + "epoch": 0.05768321513002364, + "grad_norm": 3.6365339756011963, + "learning_rate": 4.9999623293972555e-06, + "loss": 0.7489, + "step": 122 + }, + { + "epoch": 0.05815602836879433, + "grad_norm": 3.376352548599243, + "learning_rate": 4.999958826974796e-06, + "loss": 0.9012, + "step": 123 + }, + { + "epoch": 0.05862884160756501, + "grad_norm": 3.49088716506958, + "learning_rate": 4.999955168890862e-06, + "loss": 0.8999, + "step": 124 + }, + { + "epoch": 0.0591016548463357, + "grad_norm": 3.3265068531036377, + "learning_rate": 4.999951355145682e-06, + "loss": 0.8161, + "step": 125 + }, + { + "epoch": 0.059574468085106386, + "grad_norm": 3.697282314300537, + "learning_rate": 4.999947385739493e-06, + "loss": 0.9623, + "step": 126 + }, + { + "epoch": 0.06004728132387707, + "grad_norm": 2.7901928424835205, + "learning_rate": 4.999943260672542e-06, + "loss": 0.7371, + "step": 127 + }, + { + "epoch": 0.06052009456264775, + "grad_norm": 3.110319137573242, + "learning_rate": 4.999938979945086e-06, + "loss": 0.715, + "step": 128 + }, + { + "epoch": 0.06099290780141844, + "grad_norm": 3.2211520671844482, + "learning_rate": 4.999934543557392e-06, + "loss": 0.8888, + "step": 129 + }, + { + "epoch": 0.061465721040189124, + "grad_norm": 3.2466187477111816, + "learning_rate": 4.999929951509735e-06, + "loss": 0.9389, + "step": 130 + }, + { + "epoch": 0.06193853427895981, + "grad_norm": 3.3574399948120117, + "learning_rate": 4.999925203802403e-06, + "loss": 0.8263, + "step": 131 + }, + { + "epoch": 0.062411347517730496, + "grad_norm": 3.275601625442505, + "learning_rate": 4.99992030043569e-06, + "loss": 0.8338, + "step": 132 + }, + { + "epoch": 0.06288416075650118, + "grad_norm": 3.6011312007904053, + "learning_rate": 4.999915241409902e-06, + "loss": 0.8351, + "step": 133 + }, + { + "epoch": 0.06335697399527186, + "grad_norm": 2.969011068344116, + "learning_rate": 4.999910026725352e-06, + "loss": 0.79, + "step": 134 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 3.690784454345703, + "learning_rate": 4.999904656382369e-06, + "loss": 0.8209, + "step": 135 + }, + { + "epoch": 0.06430260047281323, + "grad_norm": 3.3363115787506104, + "learning_rate": 4.999899130381283e-06, + "loss": 0.858, + "step": 136 + }, + { + "epoch": 0.06477541371158392, + "grad_norm": 3.206881523132324, + "learning_rate": 4.9998934487224405e-06, + "loss": 0.834, + "step": 137 + }, + { + "epoch": 0.06524822695035461, + "grad_norm": 2.773146152496338, + "learning_rate": 4.999887611406195e-06, + "loss": 0.7576, + "step": 138 + }, + { + "epoch": 0.0657210401891253, + "grad_norm": 3.307725667953491, + "learning_rate": 
4.999881618432908e-06, + "loss": 0.7487, + "step": 139 + }, + { + "epoch": 0.06619385342789598, + "grad_norm": 4.273657321929932, + "learning_rate": 4.999875469802956e-06, + "loss": 0.8176, + "step": 140 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 3.0898005962371826, + "learning_rate": 4.999869165516719e-06, + "loss": 0.7578, + "step": 141 + }, + { + "epoch": 0.06713947990543735, + "grad_norm": 3.25150990486145, + "learning_rate": 4.9998627055745915e-06, + "loss": 0.7873, + "step": 142 + }, + { + "epoch": 0.06761229314420804, + "grad_norm": 2.9705755710601807, + "learning_rate": 4.999856089976974e-06, + "loss": 0.6473, + "step": 143 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 3.5658507347106934, + "learning_rate": 4.9998493187242804e-06, + "loss": 0.855, + "step": 144 + }, + { + "epoch": 0.06855791962174941, + "grad_norm": 3.3994076251983643, + "learning_rate": 4.99984239181693e-06, + "loss": 0.7926, + "step": 145 + }, + { + "epoch": 0.0690307328605201, + "grad_norm": 2.8266260623931885, + "learning_rate": 4.999835309255357e-06, + "loss": 0.7564, + "step": 146 + }, + { + "epoch": 0.06950354609929078, + "grad_norm": 3.1143875122070312, + "learning_rate": 4.999828071039999e-06, + "loss": 0.8398, + "step": 147 + }, + { + "epoch": 0.06997635933806147, + "grad_norm": 2.9364278316497803, + "learning_rate": 4.99982067717131e-06, + "loss": 0.7381, + "step": 148 + }, + { + "epoch": 0.07044917257683216, + "grad_norm": 3.4155616760253906, + "learning_rate": 4.999813127649748e-06, + "loss": 0.7933, + "step": 149 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 4.371236324310303, + "learning_rate": 4.999805422475784e-06, + "loss": 0.8292, + "step": 150 + }, + { + "epoch": 0.07139479905437353, + "grad_norm": 3.3967185020446777, + "learning_rate": 4.999797561649897e-06, + "loss": 0.8712, + "step": 151 + }, + { + "epoch": 0.07186761229314421, + "grad_norm": 3.343303680419922, + "learning_rate": 4.999789545172578e-06, + "loss": 0.8177, + "step": 152 + }, + { + "epoch": 0.07234042553191489, + "grad_norm": 3.040235757827759, + "learning_rate": 4.999781373044325e-06, + "loss": 0.7379, + "step": 153 + }, + { + "epoch": 0.07281323877068557, + "grad_norm": 3.4069204330444336, + "learning_rate": 4.999773045265647e-06, + "loss": 0.7939, + "step": 154 + }, + { + "epoch": 0.07328605200945626, + "grad_norm": 3.1939475536346436, + "learning_rate": 4.999764561837063e-06, + "loss": 0.8037, + "step": 155 + }, + { + "epoch": 0.07375886524822695, + "grad_norm": 4.452004909515381, + "learning_rate": 4.999755922759101e-06, + "loss": 0.8421, + "step": 156 + }, + { + "epoch": 0.07423167848699763, + "grad_norm": 3.2031240463256836, + "learning_rate": 4.999747128032298e-06, + "loss": 0.794, + "step": 157 + }, + { + "epoch": 0.07470449172576832, + "grad_norm": 3.175920009613037, + "learning_rate": 4.999738177657203e-06, + "loss": 0.759, + "step": 158 + }, + { + "epoch": 0.075177304964539, + "grad_norm": 3.7679688930511475, + "learning_rate": 4.9997290716343725e-06, + "loss": 0.8174, + "step": 159 + }, + { + "epoch": 0.07565011820330969, + "grad_norm": 3.7020037174224854, + "learning_rate": 4.999719809964373e-06, + "loss": 0.7116, + "step": 160 + }, + { + "epoch": 0.07612293144208038, + "grad_norm": 4.357471942901611, + "learning_rate": 4.999710392647783e-06, + "loss": 0.7649, + "step": 161 + }, + { + "epoch": 0.07659574468085106, + "grad_norm": 3.3439087867736816, + "learning_rate": 4.999700819685187e-06, + "loss": 0.7907, + "step": 162 + }, + { + "epoch": 0.07706855791962175, + "grad_norm": 
3.210815191268921, + "learning_rate": 4.999691091077182e-06, + "loss": 0.8446, + "step": 163 + }, + { + "epoch": 0.07754137115839244, + "grad_norm": 3.1029553413391113, + "learning_rate": 4.9996812068243735e-06, + "loss": 0.7232, + "step": 164 + }, + { + "epoch": 0.07801418439716312, + "grad_norm": 2.9389400482177734, + "learning_rate": 4.999671166927378e-06, + "loss": 0.7413, + "step": 165 + }, + { + "epoch": 0.07848699763593381, + "grad_norm": 3.7062697410583496, + "learning_rate": 4.9996609713868185e-06, + "loss": 0.8773, + "step": 166 + }, + { + "epoch": 0.0789598108747045, + "grad_norm": 3.2768924236297607, + "learning_rate": 4.999650620203332e-06, + "loss": 0.8046, + "step": 167 + }, + { + "epoch": 0.07943262411347518, + "grad_norm": 3.380373001098633, + "learning_rate": 4.999640113377561e-06, + "loss": 0.7529, + "step": 168 + }, + { + "epoch": 0.07990543735224587, + "grad_norm": 3.520022392272949, + "learning_rate": 4.999629450910162e-06, + "loss": 0.7352, + "step": 169 + }, + { + "epoch": 0.08037825059101655, + "grad_norm": 3.43269419670105, + "learning_rate": 4.999618632801796e-06, + "loss": 0.9371, + "step": 170 + }, + { + "epoch": 0.08085106382978724, + "grad_norm": 3.555877923965454, + "learning_rate": 4.99960765905314e-06, + "loss": 0.8276, + "step": 171 + }, + { + "epoch": 0.08132387706855793, + "grad_norm": 3.597050189971924, + "learning_rate": 4.999596529664874e-06, + "loss": 0.8164, + "step": 172 + }, + { + "epoch": 0.0817966903073286, + "grad_norm": 3.2002956867218018, + "learning_rate": 4.999585244637693e-06, + "loss": 0.7824, + "step": 173 + }, + { + "epoch": 0.08226950354609928, + "grad_norm": 3.527275562286377, + "learning_rate": 4.999573803972299e-06, + "loss": 0.8033, + "step": 174 + }, + { + "epoch": 0.08274231678486997, + "grad_norm": 3.5184452533721924, + "learning_rate": 4.999562207669405e-06, + "loss": 0.724, + "step": 175 + }, + { + "epoch": 0.08321513002364066, + "grad_norm": 3.6635067462921143, + "learning_rate": 4.999550455729732e-06, + "loss": 0.819, + "step": 176 + }, + { + "epoch": 0.08368794326241134, + "grad_norm": 3.192399740219116, + "learning_rate": 4.999538548154012e-06, + "loss": 0.7999, + "step": 177 + }, + { + "epoch": 0.08416075650118203, + "grad_norm": 3.0946953296661377, + "learning_rate": 4.999526484942988e-06, + "loss": 0.7367, + "step": 178 + }, + { + "epoch": 0.08463356973995272, + "grad_norm": 2.847198009490967, + "learning_rate": 4.99951426609741e-06, + "loss": 0.7536, + "step": 179 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.7674827575683594, + "learning_rate": 4.999501891618037e-06, + "loss": 0.701, + "step": 180 + }, + { + "epoch": 0.08557919621749409, + "grad_norm": 3.357933521270752, + "learning_rate": 4.999489361505643e-06, + "loss": 0.8331, + "step": 181 + }, + { + "epoch": 0.08605200945626477, + "grad_norm": 3.1464426517486572, + "learning_rate": 4.999476675761004e-06, + "loss": 0.7931, + "step": 182 + }, + { + "epoch": 0.08652482269503546, + "grad_norm": 3.310697078704834, + "learning_rate": 4.999463834384915e-06, + "loss": 0.753, + "step": 183 + }, + { + "epoch": 0.08699763593380615, + "grad_norm": 2.9794881343841553, + "learning_rate": 4.999450837378171e-06, + "loss": 0.7091, + "step": 184 + }, + { + "epoch": 0.08747044917257683, + "grad_norm": 3.0776889324188232, + "learning_rate": 4.999437684741584e-06, + "loss": 0.7226, + "step": 185 + }, + { + "epoch": 0.08794326241134752, + "grad_norm": 3.6657519340515137, + "learning_rate": 4.999424376475972e-06, + "loss": 0.845, + "step": 186 + }, + { + "epoch": 
0.0884160756501182, + "grad_norm": 3.872718572616577, + "learning_rate": 4.999410912582164e-06, + "loss": 0.812, + "step": 187 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.9184508323669434, + "learning_rate": 4.9993972930609976e-06, + "loss": 0.6823, + "step": 188 + }, + { + "epoch": 0.08936170212765958, + "grad_norm": 3.5567142963409424, + "learning_rate": 4.999383517913321e-06, + "loss": 0.7614, + "step": 189 + }, + { + "epoch": 0.08983451536643026, + "grad_norm": 3.3688533306121826, + "learning_rate": 4.999369587139992e-06, + "loss": 0.858, + "step": 190 + }, + { + "epoch": 0.09030732860520095, + "grad_norm": 2.893223524093628, + "learning_rate": 4.99935550074188e-06, + "loss": 0.6761, + "step": 191 + }, + { + "epoch": 0.09078014184397164, + "grad_norm": 3.400225877761841, + "learning_rate": 4.999341258719859e-06, + "loss": 0.7531, + "step": 192 + }, + { + "epoch": 0.09125295508274232, + "grad_norm": 3.6167714595794678, + "learning_rate": 4.999326861074817e-06, + "loss": 0.8164, + "step": 193 + }, + { + "epoch": 0.091725768321513, + "grad_norm": 4.325016498565674, + "learning_rate": 4.9993123078076506e-06, + "loss": 0.7069, + "step": 194 + }, + { + "epoch": 0.09219858156028368, + "grad_norm": 3.195317029953003, + "learning_rate": 4.999297598919266e-06, + "loss": 0.726, + "step": 195 + }, + { + "epoch": 0.09267139479905437, + "grad_norm": 3.146530866622925, + "learning_rate": 4.999282734410579e-06, + "loss": 0.7888, + "step": 196 + }, + { + "epoch": 0.09314420803782505, + "grad_norm": 3.5166752338409424, + "learning_rate": 4.999267714282515e-06, + "loss": 0.8473, + "step": 197 + }, + { + "epoch": 0.09361702127659574, + "grad_norm": 3.3140196800231934, + "learning_rate": 4.99925253853601e-06, + "loss": 0.7233, + "step": 198 + }, + { + "epoch": 0.09408983451536643, + "grad_norm": 3.0318164825439453, + "learning_rate": 4.999237207172008e-06, + "loss": 0.7543, + "step": 199 + }, + { + "epoch": 0.09456264775413711, + "grad_norm": 3.662214756011963, + "learning_rate": 4.999221720191464e-06, + "loss": 0.7783, + "step": 200 + }, + { + "epoch": 0.0950354609929078, + "grad_norm": 3.452078104019165, + "learning_rate": 4.9992060775953425e-06, + "loss": 0.7868, + "step": 201 + }, + { + "epoch": 0.09550827423167849, + "grad_norm": 3.4051287174224854, + "learning_rate": 4.999190279384617e-06, + "loss": 0.7849, + "step": 202 + }, + { + "epoch": 0.09598108747044917, + "grad_norm": 3.1377196311950684, + "learning_rate": 4.999174325560271e-06, + "loss": 0.8364, + "step": 203 + }, + { + "epoch": 0.09645390070921986, + "grad_norm": 3.129473924636841, + "learning_rate": 4.999158216123299e-06, + "loss": 0.7458, + "step": 204 + }, + { + "epoch": 0.09692671394799054, + "grad_norm": 3.169548749923706, + "learning_rate": 4.999141951074703e-06, + "loss": 0.7256, + "step": 205 + }, + { + "epoch": 0.09739952718676123, + "grad_norm": 3.186009168624878, + "learning_rate": 4.999125530415495e-06, + "loss": 0.783, + "step": 206 + }, + { + "epoch": 0.09787234042553192, + "grad_norm": 3.0995123386383057, + "learning_rate": 4.9991089541467e-06, + "loss": 0.7519, + "step": 207 + }, + { + "epoch": 0.0983451536643026, + "grad_norm": 3.1854088306427, + "learning_rate": 4.999092222269348e-06, + "loss": 0.7444, + "step": 208 + }, + { + "epoch": 0.09881796690307329, + "grad_norm": 3.1512246131896973, + "learning_rate": 4.999075334784482e-06, + "loss": 0.7882, + "step": 209 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 3.6199698448181152, + "learning_rate": 4.999058291693153e-06, + "loss": 0.8048, + "step": 
210 + }, + { + "epoch": 0.09976359338061466, + "grad_norm": 2.956907272338867, + "learning_rate": 4.999041092996422e-06, + "loss": 0.7663, + "step": 211 + }, + { + "epoch": 0.10023640661938535, + "grad_norm": 3.3493971824645996, + "learning_rate": 4.99902373869536e-06, + "loss": 0.7639, + "step": 212 + }, + { + "epoch": 0.10070921985815603, + "grad_norm": 3.144812822341919, + "learning_rate": 4.9990062287910475e-06, + "loss": 0.7953, + "step": 213 + }, + { + "epoch": 0.10118203309692671, + "grad_norm": 3.5986971855163574, + "learning_rate": 4.998988563284576e-06, + "loss": 0.8297, + "step": 214 + }, + { + "epoch": 0.1016548463356974, + "grad_norm": 3.447584867477417, + "learning_rate": 4.998970742177044e-06, + "loss": 0.808, + "step": 215 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 3.791353940963745, + "learning_rate": 4.998952765469562e-06, + "loss": 0.8005, + "step": 216 + }, + { + "epoch": 0.10260047281323877, + "grad_norm": 3.4490807056427, + "learning_rate": 4.998934633163247e-06, + "loss": 0.8135, + "step": 217 + }, + { + "epoch": 0.10307328605200945, + "grad_norm": 3.1053314208984375, + "learning_rate": 4.998916345259232e-06, + "loss": 0.7888, + "step": 218 + }, + { + "epoch": 0.10354609929078014, + "grad_norm": 3.407862663269043, + "learning_rate": 4.9988979017586514e-06, + "loss": 0.7099, + "step": 219 + }, + { + "epoch": 0.10401891252955082, + "grad_norm": 3.116656541824341, + "learning_rate": 4.998879302662658e-06, + "loss": 0.8344, + "step": 220 + }, + { + "epoch": 0.10449172576832151, + "grad_norm": 3.339264154434204, + "learning_rate": 4.998860547972406e-06, + "loss": 0.8496, + "step": 221 + }, + { + "epoch": 0.1049645390070922, + "grad_norm": 3.251892566680908, + "learning_rate": 4.998841637689066e-06, + "loss": 0.7455, + "step": 222 + }, + { + "epoch": 0.10543735224586288, + "grad_norm": 4.098135471343994, + "learning_rate": 4.998822571813814e-06, + "loss": 0.7772, + "step": 223 + }, + { + "epoch": 0.10591016548463357, + "grad_norm": 3.9871134757995605, + "learning_rate": 4.998803350347837e-06, + "loss": 0.8261, + "step": 224 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 3.2822303771972656, + "learning_rate": 4.998783973292333e-06, + "loss": 0.8623, + "step": 225 + }, + { + "epoch": 0.10685579196217494, + "grad_norm": 3.0356857776641846, + "learning_rate": 4.998764440648507e-06, + "loss": 0.7426, + "step": 226 + }, + { + "epoch": 0.10732860520094563, + "grad_norm": 2.8932785987854004, + "learning_rate": 4.998744752417576e-06, + "loss": 0.6741, + "step": 227 + }, + { + "epoch": 0.10780141843971631, + "grad_norm": 3.085820436477661, + "learning_rate": 4.998724908600767e-06, + "loss": 0.6549, + "step": 228 + }, + { + "epoch": 0.108274231678487, + "grad_norm": 3.135829210281372, + "learning_rate": 4.998704909199314e-06, + "loss": 0.6702, + "step": 229 + }, + { + "epoch": 0.10874704491725769, + "grad_norm": 5.016134262084961, + "learning_rate": 4.9986847542144625e-06, + "loss": 0.7852, + "step": 230 + }, + { + "epoch": 0.10921985815602837, + "grad_norm": 3.9056200981140137, + "learning_rate": 4.998664443647468e-06, + "loss": 0.9654, + "step": 231 + }, + { + "epoch": 0.10969267139479906, + "grad_norm": 3.0880749225616455, + "learning_rate": 4.998643977499595e-06, + "loss": 0.7579, + "step": 232 + }, + { + "epoch": 0.11016548463356975, + "grad_norm": 3.6893601417541504, + "learning_rate": 4.998623355772118e-06, + "loss": 0.713, + "step": 233 + }, + { + "epoch": 0.11063829787234042, + "grad_norm": 4.181536674499512, + "learning_rate": 4.998602578466319e-06, + 
"loss": 0.7331, + "step": 234 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 3.036386728286743, + "learning_rate": 4.998581645583496e-06, + "loss": 0.7115, + "step": 235 + }, + { + "epoch": 0.11158392434988179, + "grad_norm": 3.6333255767822266, + "learning_rate": 4.998560557124948e-06, + "loss": 0.7544, + "step": 236 + }, + { + "epoch": 0.11205673758865248, + "grad_norm": 2.926417827606201, + "learning_rate": 4.9985393130919915e-06, + "loss": 0.715, + "step": 237 + }, + { + "epoch": 0.11252955082742316, + "grad_norm": 2.969158172607422, + "learning_rate": 4.998517913485946e-06, + "loss": 0.7304, + "step": 238 + }, + { + "epoch": 0.11300236406619385, + "grad_norm": 3.5254971981048584, + "learning_rate": 4.9984963583081466e-06, + "loss": 0.7725, + "step": 239 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 3.7840335369110107, + "learning_rate": 4.998474647559936e-06, + "loss": 0.8685, + "step": 240 + }, + { + "epoch": 0.11394799054373522, + "grad_norm": 3.0333125591278076, + "learning_rate": 4.9984527812426625e-06, + "loss": 0.7793, + "step": 241 + }, + { + "epoch": 0.11442080378250591, + "grad_norm": 3.290159225463867, + "learning_rate": 4.99843075935769e-06, + "loss": 0.7158, + "step": 242 + }, + { + "epoch": 0.1148936170212766, + "grad_norm": 3.3935494422912598, + "learning_rate": 4.99840858190639e-06, + "loss": 0.7643, + "step": 243 + }, + { + "epoch": 0.11536643026004728, + "grad_norm": 3.333965539932251, + "learning_rate": 4.998386248890142e-06, + "loss": 0.7255, + "step": 244 + }, + { + "epoch": 0.11583924349881797, + "grad_norm": 2.8129613399505615, + "learning_rate": 4.998363760310339e-06, + "loss": 0.768, + "step": 245 + }, + { + "epoch": 0.11631205673758865, + "grad_norm": 2.8678107261657715, + "learning_rate": 4.998341116168378e-06, + "loss": 0.7403, + "step": 246 + }, + { + "epoch": 0.11678486997635934, + "grad_norm": 2.8898239135742188, + "learning_rate": 4.998318316465672e-06, + "loss": 0.6844, + "step": 247 + }, + { + "epoch": 0.11725768321513003, + "grad_norm": 3.139777898788452, + "learning_rate": 4.998295361203637e-06, + "loss": 0.7936, + "step": 248 + }, + { + "epoch": 0.11773049645390071, + "grad_norm": 3.393721103668213, + "learning_rate": 4.998272250383707e-06, + "loss": 0.8173, + "step": 249 + }, + { + "epoch": 0.1182033096926714, + "grad_norm": 3.240973949432373, + "learning_rate": 4.998248984007318e-06, + "loss": 0.8252, + "step": 250 + }, + { + "epoch": 0.11867612293144209, + "grad_norm": 3.384855031967163, + "learning_rate": 4.998225562075918e-06, + "loss": 0.7244, + "step": 251 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 3.1881816387176514, + "learning_rate": 4.9982019845909675e-06, + "loss": 0.6818, + "step": 252 + }, + { + "epoch": 0.11962174940898346, + "grad_norm": 2.888364553451538, + "learning_rate": 4.998178251553934e-06, + "loss": 0.6753, + "step": 253 + }, + { + "epoch": 0.12009456264775414, + "grad_norm": 3.630093812942505, + "learning_rate": 4.9981543629662944e-06, + "loss": 0.7995, + "step": 254 + }, + { + "epoch": 0.12056737588652482, + "grad_norm": 2.9820947647094727, + "learning_rate": 4.998130318829537e-06, + "loss": 0.7478, + "step": 255 + }, + { + "epoch": 0.1210401891252955, + "grad_norm": 2.7094738483428955, + "learning_rate": 4.998106119145159e-06, + "loss": 0.7237, + "step": 256 + }, + { + "epoch": 0.12151300236406619, + "grad_norm": 3.1808104515075684, + "learning_rate": 4.9980817639146665e-06, + "loss": 0.7915, + "step": 257 + }, + { + "epoch": 0.12198581560283688, + "grad_norm": 3.1661291122436523, + 
"learning_rate": 4.998057253139575e-06, + "loss": 0.8053, + "step": 258 + }, + { + "epoch": 0.12245862884160756, + "grad_norm": 3.528749942779541, + "learning_rate": 4.998032586821413e-06, + "loss": 0.7946, + "step": 259 + }, + { + "epoch": 0.12293144208037825, + "grad_norm": 3.125964879989624, + "learning_rate": 4.998007764961716e-06, + "loss": 0.7569, + "step": 260 + }, + { + "epoch": 0.12340425531914893, + "grad_norm": 3.0778942108154297, + "learning_rate": 4.997982787562029e-06, + "loss": 0.7184, + "step": 261 + }, + { + "epoch": 0.12387706855791962, + "grad_norm": 3.3531930446624756, + "learning_rate": 4.997957654623906e-06, + "loss": 0.7586, + "step": 262 + }, + { + "epoch": 0.1243498817966903, + "grad_norm": 3.229278564453125, + "learning_rate": 4.997932366148913e-06, + "loss": 0.6092, + "step": 263 + }, + { + "epoch": 0.12482269503546099, + "grad_norm": 3.7286155223846436, + "learning_rate": 4.997906922138626e-06, + "loss": 0.7965, + "step": 264 + }, + { + "epoch": 0.12529550827423167, + "grad_norm": 3.300311803817749, + "learning_rate": 4.997881322594628e-06, + "loss": 0.7665, + "step": 265 + }, + { + "epoch": 0.12576832151300235, + "grad_norm": 3.411482572555542, + "learning_rate": 4.9978555675185115e-06, + "loss": 0.7253, + "step": 266 + }, + { + "epoch": 0.12624113475177304, + "grad_norm": 3.0884511470794678, + "learning_rate": 4.9978296569118825e-06, + "loss": 0.659, + "step": 267 + }, + { + "epoch": 0.12671394799054372, + "grad_norm": 3.0652925968170166, + "learning_rate": 4.9978035907763535e-06, + "loss": 0.6739, + "step": 268 + }, + { + "epoch": 0.1271867612293144, + "grad_norm": 3.280555009841919, + "learning_rate": 4.997777369113547e-06, + "loss": 0.8003, + "step": 269 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 2.980860948562622, + "learning_rate": 4.997750991925096e-06, + "loss": 0.7097, + "step": 270 + }, + { + "epoch": 0.12813238770685578, + "grad_norm": 3.301760673522949, + "learning_rate": 4.997724459212644e-06, + "loss": 0.7894, + "step": 271 + }, + { + "epoch": 0.12860520094562647, + "grad_norm": 2.9584903717041016, + "learning_rate": 4.997697770977841e-06, + "loss": 0.733, + "step": 272 + }, + { + "epoch": 0.12907801418439716, + "grad_norm": 3.5632214546203613, + "learning_rate": 4.99767092722235e-06, + "loss": 0.7228, + "step": 273 + }, + { + "epoch": 0.12955082742316784, + "grad_norm": 3.5900983810424805, + "learning_rate": 4.997643927947843e-06, + "loss": 0.7634, + "step": 274 + }, + { + "epoch": 0.13002364066193853, + "grad_norm": 3.332650661468506, + "learning_rate": 4.997616773156e-06, + "loss": 0.797, + "step": 275 + }, + { + "epoch": 0.13049645390070921, + "grad_norm": 3.1094167232513428, + "learning_rate": 4.997589462848512e-06, + "loss": 0.7849, + "step": 276 + }, + { + "epoch": 0.1309692671394799, + "grad_norm": 3.5359463691711426, + "learning_rate": 4.99756199702708e-06, + "loss": 0.6871, + "step": 277 + }, + { + "epoch": 0.1314420803782506, + "grad_norm": 3.190441846847534, + "learning_rate": 4.997534375693414e-06, + "loss": 0.6883, + "step": 278 + }, + { + "epoch": 0.13191489361702127, + "grad_norm": 3.063518762588501, + "learning_rate": 4.997506598849234e-06, + "loss": 0.7586, + "step": 279 + }, + { + "epoch": 0.13238770685579196, + "grad_norm": 3.4112050533294678, + "learning_rate": 4.997478666496269e-06, + "loss": 0.796, + "step": 280 + }, + { + "epoch": 0.13286052009456265, + "grad_norm": 3.231886386871338, + "learning_rate": 4.997450578636259e-06, + "loss": 0.7714, + "step": 281 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 
3.279425621032715, + "learning_rate": 4.9974223352709515e-06, + "loss": 0.7793, + "step": 282 + }, + { + "epoch": 0.13380614657210402, + "grad_norm": 3.2154316902160645, + "learning_rate": 4.9973939364021075e-06, + "loss": 0.791, + "step": 283 + }, + { + "epoch": 0.1342789598108747, + "grad_norm": 3.2090768814086914, + "learning_rate": 4.9973653820314925e-06, + "loss": 0.6433, + "step": 284 + }, + { + "epoch": 0.1347517730496454, + "grad_norm": 3.1712026596069336, + "learning_rate": 4.997336672160886e-06, + "loss": 0.8128, + "step": 285 + }, + { + "epoch": 0.13522458628841608, + "grad_norm": 2.929229497909546, + "learning_rate": 4.997307806792076e-06, + "loss": 0.7594, + "step": 286 + }, + { + "epoch": 0.13569739952718676, + "grad_norm": 3.0363314151763916, + "learning_rate": 4.997278785926859e-06, + "loss": 0.7336, + "step": 287 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 3.1352357864379883, + "learning_rate": 4.997249609567042e-06, + "loss": 0.7225, + "step": 288 + }, + { + "epoch": 0.13664302600472814, + "grad_norm": 3.3171157836914062, + "learning_rate": 4.997220277714442e-06, + "loss": 0.7777, + "step": 289 + }, + { + "epoch": 0.13711583924349882, + "grad_norm": 3.050717353820801, + "learning_rate": 4.997190790370885e-06, + "loss": 0.6836, + "step": 290 + }, + { + "epoch": 0.1375886524822695, + "grad_norm": 3.0297694206237793, + "learning_rate": 4.997161147538208e-06, + "loss": 0.6883, + "step": 291 + }, + { + "epoch": 0.1380614657210402, + "grad_norm": 3.0566554069519043, + "learning_rate": 4.997131349218256e-06, + "loss": 0.6674, + "step": 292 + }, + { + "epoch": 0.13853427895981088, + "grad_norm": 3.799111843109131, + "learning_rate": 4.997101395412885e-06, + "loss": 0.8256, + "step": 293 + }, + { + "epoch": 0.13900709219858157, + "grad_norm": 3.1394248008728027, + "learning_rate": 4.9970712861239576e-06, + "loss": 0.7306, + "step": 294 + }, + { + "epoch": 0.13947990543735225, + "grad_norm": 3.0605666637420654, + "learning_rate": 4.997041021353352e-06, + "loss": 0.7212, + "step": 295 + }, + { + "epoch": 0.13995271867612294, + "grad_norm": 3.8813397884368896, + "learning_rate": 4.997010601102951e-06, + "loss": 0.769, + "step": 296 + }, + { + "epoch": 0.14042553191489363, + "grad_norm": 3.0514819622039795, + "learning_rate": 4.996980025374649e-06, + "loss": 0.7422, + "step": 297 + }, + { + "epoch": 0.1408983451536643, + "grad_norm": 2.9544146060943604, + "learning_rate": 4.99694929417035e-06, + "loss": 0.6912, + "step": 298 + }, + { + "epoch": 0.141371158392435, + "grad_norm": 3.2635602951049805, + "learning_rate": 4.996918407491966e-06, + "loss": 0.7395, + "step": 299 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 3.373882532119751, + "learning_rate": 4.996887365341423e-06, + "loss": 0.7799, + "step": 300 + }, + { + "epoch": 0.14231678486997637, + "grad_norm": 3.001128673553467, + "learning_rate": 4.996856167720652e-06, + "loss": 0.7168, + "step": 301 + }, + { + "epoch": 0.14278959810874706, + "grad_norm": 3.1026835441589355, + "learning_rate": 4.996824814631595e-06, + "loss": 0.7492, + "step": 302 + }, + { + "epoch": 0.14326241134751774, + "grad_norm": 3.41947603225708, + "learning_rate": 4.996793306076205e-06, + "loss": 0.6659, + "step": 303 + }, + { + "epoch": 0.14373522458628843, + "grad_norm": 3.2272400856018066, + "learning_rate": 4.996761642056444e-06, + "loss": 0.7184, + "step": 304 + }, + { + "epoch": 0.14420803782505912, + "grad_norm": 2.9488935470581055, + "learning_rate": 4.996729822574284e-06, + "loss": 0.7451, + "step": 305 + }, + { + "epoch": 
0.14468085106382977, + "grad_norm": 3.268231153488159, + "learning_rate": 4.9966978476317065e-06, + "loss": 0.7798, + "step": 306 + }, + { + "epoch": 0.14515366430260046, + "grad_norm": 3.9086556434631348, + "learning_rate": 4.996665717230701e-06, + "loss": 0.7871, + "step": 307 + }, + { + "epoch": 0.14562647754137115, + "grad_norm": 3.3483879566192627, + "learning_rate": 4.996633431373269e-06, + "loss": 0.7415, + "step": 308 + }, + { + "epoch": 0.14609929078014183, + "grad_norm": 2.839400053024292, + "learning_rate": 4.99660099006142e-06, + "loss": 0.7192, + "step": 309 + }, + { + "epoch": 0.14657210401891252, + "grad_norm": 3.177302598953247, + "learning_rate": 4.996568393297175e-06, + "loss": 0.755, + "step": 310 + }, + { + "epoch": 0.1470449172576832, + "grad_norm": 3.5477044582366943, + "learning_rate": 4.996535641082563e-06, + "loss": 0.7531, + "step": 311 + }, + { + "epoch": 0.1475177304964539, + "grad_norm": 3.418576717376709, + "learning_rate": 4.996502733419624e-06, + "loss": 0.8009, + "step": 312 + }, + { + "epoch": 0.14799054373522458, + "grad_norm": 3.711341619491577, + "learning_rate": 4.996469670310407e-06, + "loss": 0.7362, + "step": 313 + }, + { + "epoch": 0.14846335697399526, + "grad_norm": 3.2419373989105225, + "learning_rate": 4.99643645175697e-06, + "loss": 0.7761, + "step": 314 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 3.121858835220337, + "learning_rate": 4.996403077761381e-06, + "loss": 0.6495, + "step": 315 + }, + { + "epoch": 0.14940898345153664, + "grad_norm": 3.123054265975952, + "learning_rate": 4.996369548325719e-06, + "loss": 0.7444, + "step": 316 + }, + { + "epoch": 0.14988179669030732, + "grad_norm": 2.780880928039551, + "learning_rate": 4.996335863452072e-06, + "loss": 0.672, + "step": 317 + }, + { + "epoch": 0.150354609929078, + "grad_norm": 3.3738629817962646, + "learning_rate": 4.996302023142536e-06, + "loss": 0.7972, + "step": 318 + }, + { + "epoch": 0.1508274231678487, + "grad_norm": 3.4874777793884277, + "learning_rate": 4.99626802739922e-06, + "loss": 0.8252, + "step": 319 + }, + { + "epoch": 0.15130023640661938, + "grad_norm": 3.7074787616729736, + "learning_rate": 4.9962338762242395e-06, + "loss": 0.8216, + "step": 320 + }, + { + "epoch": 0.15177304964539007, + "grad_norm": 3.281912326812744, + "learning_rate": 4.996199569619721e-06, + "loss": 0.8175, + "step": 321 + }, + { + "epoch": 0.15224586288416075, + "grad_norm": 2.9485340118408203, + "learning_rate": 4.996165107587801e-06, + "loss": 0.707, + "step": 322 + }, + { + "epoch": 0.15271867612293144, + "grad_norm": 3.3757646083831787, + "learning_rate": 4.996130490130625e-06, + "loss": 0.7955, + "step": 323 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 2.962181568145752, + "learning_rate": 4.996095717250349e-06, + "loss": 0.7067, + "step": 324 + }, + { + "epoch": 0.1536643026004728, + "grad_norm": 3.114272356033325, + "learning_rate": 4.996060788949136e-06, + "loss": 0.7486, + "step": 325 + }, + { + "epoch": 0.1541371158392435, + "grad_norm": 3.0621590614318848, + "learning_rate": 4.996025705229165e-06, + "loss": 0.6547, + "step": 326 + }, + { + "epoch": 0.15460992907801419, + "grad_norm": 2.8745882511138916, + "learning_rate": 4.995990466092616e-06, + "loss": 0.6435, + "step": 327 + }, + { + "epoch": 0.15508274231678487, + "grad_norm": 2.90841007232666, + "learning_rate": 4.995955071541686e-06, + "loss": 0.7331, + "step": 328 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 2.694580316543579, + "learning_rate": 4.9959195215785784e-06, + "loss": 0.6731, + "step": 
329 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 3.158083438873291, + "learning_rate": 4.995883816205507e-06, + "loss": 0.7257, + "step": 330 + }, + { + "epoch": 0.15650118203309693, + "grad_norm": 3.3234715461730957, + "learning_rate": 4.995847955424694e-06, + "loss": 0.7389, + "step": 331 + }, + { + "epoch": 0.15697399527186762, + "grad_norm": 2.9406495094299316, + "learning_rate": 4.995811939238373e-06, + "loss": 0.643, + "step": 332 + }, + { + "epoch": 0.1574468085106383, + "grad_norm": 3.3191726207733154, + "learning_rate": 4.995775767648785e-06, + "loss": 0.7879, + "step": 333 + }, + { + "epoch": 0.157919621749409, + "grad_norm": 3.711925745010376, + "learning_rate": 4.995739440658185e-06, + "loss": 0.7586, + "step": 334 + }, + { + "epoch": 0.15839243498817968, + "grad_norm": 9.573421478271484, + "learning_rate": 4.995702958268833e-06, + "loss": 0.7842, + "step": 335 + }, + { + "epoch": 0.15886524822695036, + "grad_norm": 3.4154508113861084, + "learning_rate": 4.995666320483001e-06, + "loss": 0.6735, + "step": 336 + }, + { + "epoch": 0.15933806146572105, + "grad_norm": 3.4169859886169434, + "learning_rate": 4.995629527302971e-06, + "loss": 0.741, + "step": 337 + }, + { + "epoch": 0.15981087470449173, + "grad_norm": 3.287503242492676, + "learning_rate": 4.9955925787310335e-06, + "loss": 0.7139, + "step": 338 + }, + { + "epoch": 0.16028368794326242, + "grad_norm": 3.288409471511841, + "learning_rate": 4.995555474769488e-06, + "loss": 0.7636, + "step": 339 + }, + { + "epoch": 0.1607565011820331, + "grad_norm": 2.8021693229675293, + "learning_rate": 4.995518215420646e-06, + "loss": 0.5883, + "step": 340 + }, + { + "epoch": 0.1612293144208038, + "grad_norm": 2.7038564682006836, + "learning_rate": 4.995480800686827e-06, + "loss": 0.657, + "step": 341 + }, + { + "epoch": 0.16170212765957448, + "grad_norm": 3.2370235919952393, + "learning_rate": 4.9954432305703615e-06, + "loss": 0.6999, + "step": 342 + }, + { + "epoch": 0.16217494089834517, + "grad_norm": 2.8666412830352783, + "learning_rate": 4.995405505073588e-06, + "loss": 0.7199, + "step": 343 + }, + { + "epoch": 0.16264775413711585, + "grad_norm": 3.6467232704162598, + "learning_rate": 4.995367624198856e-06, + "loss": 0.7317, + "step": 344 + }, + { + "epoch": 0.16312056737588654, + "grad_norm": 2.7576327323913574, + "learning_rate": 4.9953295879485246e-06, + "loss": 0.647, + "step": 345 + }, + { + "epoch": 0.1635933806146572, + "grad_norm": 2.922232151031494, + "learning_rate": 4.995291396324959e-06, + "loss": 0.6686, + "step": 346 + }, + { + "epoch": 0.16406619385342788, + "grad_norm": 2.8693501949310303, + "learning_rate": 4.995253049330542e-06, + "loss": 0.6756, + "step": 347 + }, + { + "epoch": 0.16453900709219857, + "grad_norm": 3.671865701675415, + "learning_rate": 4.995214546967658e-06, + "loss": 0.7347, + "step": 348 + }, + { + "epoch": 0.16501182033096926, + "grad_norm": 3.024219274520874, + "learning_rate": 4.995175889238706e-06, + "loss": 0.7547, + "step": 349 + }, + { + "epoch": 0.16548463356973994, + "grad_norm": 2.8470778465270996, + "learning_rate": 4.995137076146091e-06, + "loss": 0.6764, + "step": 350 + }, + { + "epoch": 0.16595744680851063, + "grad_norm": 2.905057907104492, + "learning_rate": 4.9950981076922324e-06, + "loss": 0.6814, + "step": 351 + }, + { + "epoch": 0.16643026004728131, + "grad_norm": 3.504377841949463, + "learning_rate": 4.995058983879555e-06, + "loss": 0.7145, + "step": 352 + }, + { + "epoch": 0.166903073286052, + "grad_norm": 3.0029661655426025, + "learning_rate": 4.995019704710495e-06, 
+ "loss": 0.7114, + "step": 353 + }, + { + "epoch": 0.1673758865248227, + "grad_norm": 2.8666274547576904, + "learning_rate": 4.994980270187499e-06, + "loss": 0.7416, + "step": 354 + }, + { + "epoch": 0.16784869976359337, + "grad_norm": 3.1644718647003174, + "learning_rate": 4.994940680313021e-06, + "loss": 0.661, + "step": 355 + }, + { + "epoch": 0.16832151300236406, + "grad_norm": 3.050391674041748, + "learning_rate": 4.994900935089527e-06, + "loss": 0.7243, + "step": 356 + }, + { + "epoch": 0.16879432624113475, + "grad_norm": 2.985466480255127, + "learning_rate": 4.994861034519491e-06, + "loss": 0.6917, + "step": 357 + }, + { + "epoch": 0.16926713947990543, + "grad_norm": 2.909342050552368, + "learning_rate": 4.9948209786053995e-06, + "loss": 0.6636, + "step": 358 + }, + { + "epoch": 0.16973995271867612, + "grad_norm": 3.2214784622192383, + "learning_rate": 4.9947807673497435e-06, + "loss": 0.7903, + "step": 359 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 2.5654983520507812, + "learning_rate": 4.994740400755029e-06, + "loss": 0.6129, + "step": 360 + }, + { + "epoch": 0.1706855791962175, + "grad_norm": 3.775646448135376, + "learning_rate": 4.99469987882377e-06, + "loss": 0.7145, + "step": 361 + }, + { + "epoch": 0.17115839243498818, + "grad_norm": 2.8965413570404053, + "learning_rate": 4.994659201558487e-06, + "loss": 0.7177, + "step": 362 + }, + { + "epoch": 0.17163120567375886, + "grad_norm": 3.485597848892212, + "learning_rate": 4.9946183689617146e-06, + "loss": 0.8107, + "step": 363 + }, + { + "epoch": 0.17210401891252955, + "grad_norm": 3.277839183807373, + "learning_rate": 4.994577381035995e-06, + "loss": 0.691, + "step": 364 + }, + { + "epoch": 0.17257683215130024, + "grad_norm": 2.8807685375213623, + "learning_rate": 4.99453623778388e-06, + "loss": 0.7627, + "step": 365 + }, + { + "epoch": 0.17304964539007092, + "grad_norm": 3.0659940242767334, + "learning_rate": 4.994494939207932e-06, + "loss": 0.6858, + "step": 366 + }, + { + "epoch": 0.1735224586288416, + "grad_norm": 3.0881855487823486, + "learning_rate": 4.994453485310723e-06, + "loss": 0.8212, + "step": 367 + }, + { + "epoch": 0.1739952718676123, + "grad_norm": 2.7199201583862305, + "learning_rate": 4.994411876094832e-06, + "loss": 0.6516, + "step": 368 + }, + { + "epoch": 0.17446808510638298, + "grad_norm": 2.955889940261841, + "learning_rate": 4.994370111562851e-06, + "loss": 0.6579, + "step": 369 + }, + { + "epoch": 0.17494089834515367, + "grad_norm": 3.1321663856506348, + "learning_rate": 4.994328191717382e-06, + "loss": 0.6891, + "step": 370 + }, + { + "epoch": 0.17541371158392435, + "grad_norm": 3.0560388565063477, + "learning_rate": 4.994286116561034e-06, + "loss": 0.7243, + "step": 371 + }, + { + "epoch": 0.17588652482269504, + "grad_norm": 3.1560704708099365, + "learning_rate": 4.994243886096425e-06, + "loss": 0.7262, + "step": 372 + }, + { + "epoch": 0.17635933806146573, + "grad_norm": 2.913541316986084, + "learning_rate": 4.994201500326187e-06, + "loss": 0.7318, + "step": 373 + }, + { + "epoch": 0.1768321513002364, + "grad_norm": 3.098376512527466, + "learning_rate": 4.994158959252958e-06, + "loss": 0.6419, + "step": 374 + }, + { + "epoch": 0.1773049645390071, + "grad_norm": 2.977508544921875, + "learning_rate": 4.994116262879387e-06, + "loss": 0.6709, + "step": 375 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 3.168186902999878, + "learning_rate": 4.994073411208133e-06, + "loss": 0.6608, + "step": 376 + }, + { + "epoch": 0.17825059101654847, + "grad_norm": 3.436844825744629, + 
"learning_rate": 4.994030404241864e-06, + "loss": 0.7227, + "step": 377 + }, + { + "epoch": 0.17872340425531916, + "grad_norm": 2.8998289108276367, + "learning_rate": 4.993987241983258e-06, + "loss": 0.6512, + "step": 378 + }, + { + "epoch": 0.17919621749408984, + "grad_norm": 3.407191514968872, + "learning_rate": 4.993943924435002e-06, + "loss": 0.616, + "step": 379 + }, + { + "epoch": 0.17966903073286053, + "grad_norm": 3.744858741760254, + "learning_rate": 4.993900451599793e-06, + "loss": 0.8599, + "step": 380 + }, + { + "epoch": 0.18014184397163122, + "grad_norm": 3.486283779144287, + "learning_rate": 4.993856823480338e-06, + "loss": 0.6634, + "step": 381 + }, + { + "epoch": 0.1806146572104019, + "grad_norm": 2.895719051361084, + "learning_rate": 4.993813040079355e-06, + "loss": 0.6972, + "step": 382 + }, + { + "epoch": 0.1810874704491726, + "grad_norm": 2.814133882522583, + "learning_rate": 4.993769101399569e-06, + "loss": 0.6271, + "step": 383 + }, + { + "epoch": 0.18156028368794327, + "grad_norm": 2.8609800338745117, + "learning_rate": 4.993725007443715e-06, + "loss": 0.6481, + "step": 384 + }, + { + "epoch": 0.18203309692671396, + "grad_norm": 3.2829644680023193, + "learning_rate": 4.99368075821454e-06, + "loss": 0.7999, + "step": 385 + }, + { + "epoch": 0.18250591016548465, + "grad_norm": 3.1417458057403564, + "learning_rate": 4.993636353714798e-06, + "loss": 0.6972, + "step": 386 + }, + { + "epoch": 0.1829787234042553, + "grad_norm": 3.0679385662078857, + "learning_rate": 4.993591793947256e-06, + "loss": 0.667, + "step": 387 + }, + { + "epoch": 0.183451536643026, + "grad_norm": 3.1387410163879395, + "learning_rate": 4.993547078914686e-06, + "loss": 0.7618, + "step": 388 + }, + { + "epoch": 0.18392434988179668, + "grad_norm": 2.9181406497955322, + "learning_rate": 4.993502208619872e-06, + "loss": 0.7391, + "step": 389 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 2.8952157497406006, + "learning_rate": 4.993457183065611e-06, + "loss": 0.6988, + "step": 390 + }, + { + "epoch": 0.18486997635933805, + "grad_norm": 3.2274813652038574, + "learning_rate": 4.993412002254704e-06, + "loss": 0.688, + "step": 391 + }, + { + "epoch": 0.18534278959810874, + "grad_norm": 3.4693779945373535, + "learning_rate": 4.993366666189965e-06, + "loss": 0.6634, + "step": 392 + }, + { + "epoch": 0.18581560283687942, + "grad_norm": 3.5358526706695557, + "learning_rate": 4.993321174874217e-06, + "loss": 0.7343, + "step": 393 + }, + { + "epoch": 0.1862884160756501, + "grad_norm": 3.013338088989258, + "learning_rate": 4.993275528310292e-06, + "loss": 0.7579, + "step": 394 + }, + { + "epoch": 0.1867612293144208, + "grad_norm": 2.694772720336914, + "learning_rate": 4.993229726501033e-06, + "loss": 0.718, + "step": 395 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 3.070612907409668, + "learning_rate": 4.9931837694492915e-06, + "loss": 0.6438, + "step": 396 + }, + { + "epoch": 0.18770685579196217, + "grad_norm": 2.9193027019500732, + "learning_rate": 4.993137657157928e-06, + "loss": 0.6788, + "step": 397 + }, + { + "epoch": 0.18817966903073285, + "grad_norm": 3.047682046890259, + "learning_rate": 4.993091389629816e-06, + "loss": 0.6826, + "step": 398 + }, + { + "epoch": 0.18865248226950354, + "grad_norm": 2.9629905223846436, + "learning_rate": 4.993044966867834e-06, + "loss": 0.7196, + "step": 399 + }, + { + "epoch": 0.18912529550827423, + "grad_norm": 3.0692050457000732, + "learning_rate": 4.992998388874874e-06, + "loss": 0.7015, + "step": 400 + }, + { + "epoch": 0.1895981087470449, + 
"grad_norm": 3.5427212715148926, + "learning_rate": 4.992951655653836e-06, + "loss": 0.8292, + "step": 401 + }, + { + "epoch": 0.1900709219858156, + "grad_norm": 2.643526554107666, + "learning_rate": 4.992904767207629e-06, + "loss": 0.624, + "step": 402 + }, + { + "epoch": 0.19054373522458629, + "grad_norm": 3.1185996532440186, + "learning_rate": 4.992857723539173e-06, + "loss": 0.7354, + "step": 403 + }, + { + "epoch": 0.19101654846335697, + "grad_norm": 3.006856679916382, + "learning_rate": 4.992810524651398e-06, + "loss": 0.7752, + "step": 404 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 2.9913275241851807, + "learning_rate": 4.9927631705472425e-06, + "loss": 0.7306, + "step": 405 + }, + { + "epoch": 0.19196217494089834, + "grad_norm": 2.6794071197509766, + "learning_rate": 4.992715661229655e-06, + "loss": 0.6136, + "step": 406 + }, + { + "epoch": 0.19243498817966903, + "grad_norm": 3.5933966636657715, + "learning_rate": 4.992667996701593e-06, + "loss": 0.7024, + "step": 407 + }, + { + "epoch": 0.19290780141843972, + "grad_norm": 2.862187623977661, + "learning_rate": 4.992620176966025e-06, + "loss": 0.692, + "step": 408 + }, + { + "epoch": 0.1933806146572104, + "grad_norm": 3.076845407485962, + "learning_rate": 4.9925722020259286e-06, + "loss": 0.7475, + "step": 409 + }, + { + "epoch": 0.1938534278959811, + "grad_norm": 3.372919797897339, + "learning_rate": 4.9925240718842895e-06, + "loss": 0.6886, + "step": 410 + }, + { + "epoch": 0.19432624113475178, + "grad_norm": 2.922977924346924, + "learning_rate": 4.992475786544108e-06, + "loss": 0.7049, + "step": 411 + }, + { + "epoch": 0.19479905437352246, + "grad_norm": 2.908034324645996, + "learning_rate": 4.992427346008387e-06, + "loss": 0.6498, + "step": 412 + }, + { + "epoch": 0.19527186761229315, + "grad_norm": 3.096723794937134, + "learning_rate": 4.992378750280144e-06, + "loss": 0.7151, + "step": 413 + }, + { + "epoch": 0.19574468085106383, + "grad_norm": 2.895237684249878, + "learning_rate": 4.992329999362405e-06, + "loss": 0.7277, + "step": 414 + }, + { + "epoch": 0.19621749408983452, + "grad_norm": 2.718230724334717, + "learning_rate": 4.9922810932582065e-06, + "loss": 0.6375, + "step": 415 + }, + { + "epoch": 0.1966903073286052, + "grad_norm": 3.187743663787842, + "learning_rate": 4.992232031970592e-06, + "loss": 0.6528, + "step": 416 + }, + { + "epoch": 0.1971631205673759, + "grad_norm": 2.996406316757202, + "learning_rate": 4.992182815502616e-06, + "loss": 0.6552, + "step": 417 + }, + { + "epoch": 0.19763593380614658, + "grad_norm": 3.301084041595459, + "learning_rate": 4.992133443857345e-06, + "loss": 0.7061, + "step": 418 + }, + { + "epoch": 0.19810874704491727, + "grad_norm": 3.7874677181243896, + "learning_rate": 4.992083917037853e-06, + "loss": 0.7859, + "step": 419 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 3.124253511428833, + "learning_rate": 4.992034235047222e-06, + "loss": 0.7615, + "step": 420 + }, + { + "epoch": 0.19905437352245864, + "grad_norm": 3.0488970279693604, + "learning_rate": 4.991984397888546e-06, + "loss": 0.6916, + "step": 421 + }, + { + "epoch": 0.19952718676122932, + "grad_norm": 3.1241321563720703, + "learning_rate": 4.991934405564929e-06, + "loss": 0.7055, + "step": 422 + }, + { + "epoch": 0.2, + "grad_norm": 3.396632432937622, + "learning_rate": 4.991884258079484e-06, + "loss": 0.7675, + "step": 423 + }, + { + "epoch": 0.2004728132387707, + "grad_norm": 3.7776873111724854, + "learning_rate": 4.9918339554353316e-06, + "loss": 0.7371, + "step": 424 + }, + { + "epoch": 
0.20094562647754138, + "grad_norm": 3.3356032371520996, + "learning_rate": 4.991783497635606e-06, + "loss": 0.6778, + "step": 425 + }, + { + "epoch": 0.20141843971631207, + "grad_norm": 2.988856792449951, + "learning_rate": 4.9917328846834474e-06, + "loss": 0.6795, + "step": 426 + }, + { + "epoch": 0.20189125295508276, + "grad_norm": 3.264183282852173, + "learning_rate": 4.99168211658201e-06, + "loss": 0.7707, + "step": 427 + }, + { + "epoch": 0.20236406619385341, + "grad_norm": 3.878068208694458, + "learning_rate": 4.991631193334451e-06, + "loss": 0.857, + "step": 428 + }, + { + "epoch": 0.2028368794326241, + "grad_norm": 3.6377553939819336, + "learning_rate": 4.991580114943943e-06, + "loss": 0.8033, + "step": 429 + }, + { + "epoch": 0.2033096926713948, + "grad_norm": 2.95393967628479, + "learning_rate": 4.991528881413667e-06, + "loss": 0.6809, + "step": 430 + }, + { + "epoch": 0.20378250591016547, + "grad_norm": 3.058704376220703, + "learning_rate": 4.9914774927468125e-06, + "loss": 0.6664, + "step": 431 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 2.7783217430114746, + "learning_rate": 4.9914259489465795e-06, + "loss": 0.6478, + "step": 432 + }, + { + "epoch": 0.20472813238770685, + "grad_norm": 2.4825217723846436, + "learning_rate": 4.991374250016177e-06, + "loss": 0.6598, + "step": 433 + }, + { + "epoch": 0.20520094562647753, + "grad_norm": 2.8753600120544434, + "learning_rate": 4.991322395958824e-06, + "loss": 0.6947, + "step": 434 + }, + { + "epoch": 0.20567375886524822, + "grad_norm": 3.2339367866516113, + "learning_rate": 4.99127038677775e-06, + "loss": 0.8201, + "step": 435 + }, + { + "epoch": 0.2061465721040189, + "grad_norm": 2.9065537452697754, + "learning_rate": 4.991218222476193e-06, + "loss": 0.6679, + "step": 436 + }, + { + "epoch": 0.2066193853427896, + "grad_norm": 3.283228874206543, + "learning_rate": 4.991165903057401e-06, + "loss": 0.8039, + "step": 437 + }, + { + "epoch": 0.20709219858156028, + "grad_norm": 3.429872751235962, + "learning_rate": 4.991113428524631e-06, + "loss": 0.7392, + "step": 438 + }, + { + "epoch": 0.20756501182033096, + "grad_norm": 3.118943452835083, + "learning_rate": 4.991060798881152e-06, + "loss": 0.6794, + "step": 439 + }, + { + "epoch": 0.20803782505910165, + "grad_norm": 3.395970106124878, + "learning_rate": 4.99100801413024e-06, + "loss": 0.6862, + "step": 440 + }, + { + "epoch": 0.20851063829787234, + "grad_norm": 2.869191884994507, + "learning_rate": 4.99095507427518e-06, + "loss": 0.6076, + "step": 441 + }, + { + "epoch": 0.20898345153664302, + "grad_norm": 3.1934661865234375, + "learning_rate": 4.990901979319272e-06, + "loss": 0.6927, + "step": 442 + }, + { + "epoch": 0.2094562647754137, + "grad_norm": 2.9068603515625, + "learning_rate": 4.990848729265819e-06, + "loss": 0.6864, + "step": 443 + }, + { + "epoch": 0.2099290780141844, + "grad_norm": 3.0535948276519775, + "learning_rate": 4.9907953241181375e-06, + "loss": 0.6396, + "step": 444 + }, + { + "epoch": 0.21040189125295508, + "grad_norm": 2.871511459350586, + "learning_rate": 4.990741763879554e-06, + "loss": 0.6743, + "step": 445 + }, + { + "epoch": 0.21087470449172577, + "grad_norm": 2.9184393882751465, + "learning_rate": 4.9906880485534015e-06, + "loss": 0.6786, + "step": 446 + }, + { + "epoch": 0.21134751773049645, + "grad_norm": 3.0628271102905273, + "learning_rate": 4.990634178143026e-06, + "loss": 0.6326, + "step": 447 + }, + { + "epoch": 0.21182033096926714, + "grad_norm": 3.7878305912017822, + "learning_rate": 4.990580152651782e-06, + "loss": 0.7944, + "step": 
448 + }, + { + "epoch": 0.21229314420803783, + "grad_norm": 2.8577189445495605, + "learning_rate": 4.990525972083031e-06, + "loss": 0.71, + "step": 449 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 3.307769775390625, + "learning_rate": 4.99047163644015e-06, + "loss": 0.6893, + "step": 450 + }, + { + "epoch": 0.2132387706855792, + "grad_norm": 2.7391717433929443, + "learning_rate": 4.990417145726519e-06, + "loss": 0.712, + "step": 451 + }, + { + "epoch": 0.21371158392434988, + "grad_norm": 2.938044786453247, + "learning_rate": 4.990362499945534e-06, + "loss": 0.7516, + "step": 452 + }, + { + "epoch": 0.21418439716312057, + "grad_norm": 2.7831056118011475, + "learning_rate": 4.990307699100595e-06, + "loss": 0.6168, + "step": 453 + }, + { + "epoch": 0.21465721040189126, + "grad_norm": 2.907977342605591, + "learning_rate": 4.990252743195116e-06, + "loss": 0.6706, + "step": 454 + }, + { + "epoch": 0.21513002364066194, + "grad_norm": 3.7882161140441895, + "learning_rate": 4.990197632232517e-06, + "loss": 0.6847, + "step": 455 + }, + { + "epoch": 0.21560283687943263, + "grad_norm": 2.899716854095459, + "learning_rate": 4.990142366216232e-06, + "loss": 0.6699, + "step": 456 + }, + { + "epoch": 0.21607565011820332, + "grad_norm": 2.907003879547119, + "learning_rate": 4.990086945149701e-06, + "loss": 0.6864, + "step": 457 + }, + { + "epoch": 0.216548463356974, + "grad_norm": 3.2407333850860596, + "learning_rate": 4.9900313690363736e-06, + "loss": 0.692, + "step": 458 + }, + { + "epoch": 0.2170212765957447, + "grad_norm": 2.9055583477020264, + "learning_rate": 4.989975637879712e-06, + "loss": 0.7113, + "step": 459 + }, + { + "epoch": 0.21749408983451538, + "grad_norm": 2.9836206436157227, + "learning_rate": 4.989919751683184e-06, + "loss": 0.6673, + "step": 460 + }, + { + "epoch": 0.21796690307328606, + "grad_norm": 3.371035575866699, + "learning_rate": 4.989863710450273e-06, + "loss": 0.7181, + "step": 461 + }, + { + "epoch": 0.21843971631205675, + "grad_norm": 2.9636635780334473, + "learning_rate": 4.989807514184465e-06, + "loss": 0.6082, + "step": 462 + }, + { + "epoch": 0.21891252955082743, + "grad_norm": 2.9634664058685303, + "learning_rate": 4.9897511628892615e-06, + "loss": 0.7086, + "step": 463 + }, + { + "epoch": 0.21938534278959812, + "grad_norm": 3.154763698577881, + "learning_rate": 4.98969465656817e-06, + "loss": 0.7027, + "step": 464 + }, + { + "epoch": 0.2198581560283688, + "grad_norm": 2.9959890842437744, + "learning_rate": 4.98963799522471e-06, + "loss": 0.6498, + "step": 465 + }, + { + "epoch": 0.2203309692671395, + "grad_norm": 3.5470590591430664, + "learning_rate": 4.989581178862408e-06, + "loss": 0.7199, + "step": 466 + }, + { + "epoch": 0.22080378250591018, + "grad_norm": 7.1873369216918945, + "learning_rate": 4.989524207484802e-06, + "loss": 0.6676, + "step": 467 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 3.1099541187286377, + "learning_rate": 4.98946708109544e-06, + "loss": 0.6785, + "step": 468 + }, + { + "epoch": 0.22174940898345152, + "grad_norm": 2.830991506576538, + "learning_rate": 4.9894097996978795e-06, + "loss": 0.6456, + "step": 469 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.0212316513061523, + "learning_rate": 4.989352363295687e-06, + "loss": 0.6048, + "step": 470 + }, + { + "epoch": 0.2226950354609929, + "grad_norm": 3.18776798248291, + "learning_rate": 4.989294771892437e-06, + "loss": 0.7078, + "step": 471 + }, + { + "epoch": 0.22316784869976358, + "grad_norm": 2.9972598552703857, + "learning_rate": 4.989237025491717e-06, + 
"loss": 0.7082, + "step": 472 + }, + { + "epoch": 0.22364066193853427, + "grad_norm": 3.4935688972473145, + "learning_rate": 4.989179124097123e-06, + "loss": 0.8199, + "step": 473 + }, + { + "epoch": 0.22411347517730495, + "grad_norm": 2.6485543251037598, + "learning_rate": 4.9891210677122595e-06, + "loss": 0.6371, + "step": 474 + }, + { + "epoch": 0.22458628841607564, + "grad_norm": 2.969233512878418, + "learning_rate": 4.989062856340742e-06, + "loss": 0.6879, + "step": 475 + }, + { + "epoch": 0.22505910165484633, + "grad_norm": 2.881875514984131, + "learning_rate": 4.989004489986194e-06, + "loss": 0.7415, + "step": 476 + }, + { + "epoch": 0.225531914893617, + "grad_norm": 2.624540090560913, + "learning_rate": 4.98894596865225e-06, + "loss": 0.6522, + "step": 477 + }, + { + "epoch": 0.2260047281323877, + "grad_norm": 3.61075496673584, + "learning_rate": 4.988887292342555e-06, + "loss": 0.7109, + "step": 478 + }, + { + "epoch": 0.2264775413711584, + "grad_norm": 2.9368972778320312, + "learning_rate": 4.988828461060762e-06, + "loss": 0.6843, + "step": 479 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 3.0670197010040283, + "learning_rate": 4.988769474810533e-06, + "loss": 0.6807, + "step": 480 + }, + { + "epoch": 0.22742316784869976, + "grad_norm": 2.9662792682647705, + "learning_rate": 4.988710333595542e-06, + "loss": 0.6796, + "step": 481 + }, + { + "epoch": 0.22789598108747045, + "grad_norm": 2.971235752105713, + "learning_rate": 4.988651037419472e-06, + "loss": 0.696, + "step": 482 + }, + { + "epoch": 0.22836879432624113, + "grad_norm": 2.931884527206421, + "learning_rate": 4.988591586286013e-06, + "loss": 0.7323, + "step": 483 + }, + { + "epoch": 0.22884160756501182, + "grad_norm": 2.8114213943481445, + "learning_rate": 4.988531980198868e-06, + "loss": 0.6584, + "step": 484 + }, + { + "epoch": 0.2293144208037825, + "grad_norm": 3.2785916328430176, + "learning_rate": 4.98847221916175e-06, + "loss": 0.7514, + "step": 485 + }, + { + "epoch": 0.2297872340425532, + "grad_norm": 3.0520215034484863, + "learning_rate": 4.988412303178377e-06, + "loss": 0.7564, + "step": 486 + }, + { + "epoch": 0.23026004728132388, + "grad_norm": 3.181002616882324, + "learning_rate": 4.988352232252483e-06, + "loss": 0.6768, + "step": 487 + }, + { + "epoch": 0.23073286052009456, + "grad_norm": 3.4953625202178955, + "learning_rate": 4.988292006387805e-06, + "loss": 0.7143, + "step": 488 + }, + { + "epoch": 0.23120567375886525, + "grad_norm": 3.326571226119995, + "learning_rate": 4.988231625588096e-06, + "loss": 0.7318, + "step": 489 + }, + { + "epoch": 0.23167848699763594, + "grad_norm": 3.09614634513855, + "learning_rate": 4.988171089857113e-06, + "loss": 0.6574, + "step": 490 + }, + { + "epoch": 0.23215130023640662, + "grad_norm": 2.7439446449279785, + "learning_rate": 4.9881103991986265e-06, + "loss": 0.6637, + "step": 491 + }, + { + "epoch": 0.2326241134751773, + "grad_norm": 3.0681190490722656, + "learning_rate": 4.988049553616416e-06, + "loss": 0.6326, + "step": 492 + }, + { + "epoch": 0.233096926713948, + "grad_norm": 3.0757341384887695, + "learning_rate": 4.98798855311427e-06, + "loss": 0.695, + "step": 493 + }, + { + "epoch": 0.23356973995271868, + "grad_norm": 2.8637635707855225, + "learning_rate": 4.987927397695985e-06, + "loss": 0.6598, + "step": 494 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 3.3641068935394287, + "learning_rate": 4.9878660873653715e-06, + "loss": 0.7435, + "step": 495 + }, + { + "epoch": 0.23451536643026005, + "grad_norm": 3.5025596618652344, + "learning_rate": 
4.987804622126245e-06, + "loss": 0.735, + "step": 496 + }, + { + "epoch": 0.23498817966903074, + "grad_norm": 2.9298837184906006, + "learning_rate": 4.987743001982434e-06, + "loss": 0.7063, + "step": 497 + }, + { + "epoch": 0.23546099290780143, + "grad_norm": 2.70358943939209, + "learning_rate": 4.987681226937774e-06, + "loss": 0.6799, + "step": 498 + }, + { + "epoch": 0.2359338061465721, + "grad_norm": 3.027871608734131, + "learning_rate": 4.9876192969961125e-06, + "loss": 0.6881, + "step": 499 + }, + { + "epoch": 0.2364066193853428, + "grad_norm": 3.362306594848633, + "learning_rate": 4.987557212161304e-06, + "loss": 0.7906, + "step": 500 + }, + { + "epoch": 0.23687943262411348, + "grad_norm": 3.3136050701141357, + "learning_rate": 4.987494972437217e-06, + "loss": 0.6878, + "step": 501 + }, + { + "epoch": 0.23735224586288417, + "grad_norm": 3.017089605331421, + "learning_rate": 4.9874325778277255e-06, + "loss": 0.7279, + "step": 502 + }, + { + "epoch": 0.23782505910165486, + "grad_norm": 2.8300516605377197, + "learning_rate": 4.987370028336714e-06, + "loss": 0.6864, + "step": 503 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 3.201860189437866, + "learning_rate": 4.987307323968077e-06, + "loss": 0.7531, + "step": 504 + }, + { + "epoch": 0.23877068557919623, + "grad_norm": 2.685396194458008, + "learning_rate": 4.987244464725721e-06, + "loss": 0.5849, + "step": 505 + }, + { + "epoch": 0.23924349881796692, + "grad_norm": 2.8715312480926514, + "learning_rate": 4.987181450613557e-06, + "loss": 0.675, + "step": 506 + }, + { + "epoch": 0.2397163120567376, + "grad_norm": 2.813908815383911, + "learning_rate": 4.987118281635511e-06, + "loss": 0.6841, + "step": 507 + }, + { + "epoch": 0.2401891252955083, + "grad_norm": 3.2738473415374756, + "learning_rate": 4.987054957795514e-06, + "loss": 0.7158, + "step": 508 + }, + { + "epoch": 0.24066193853427895, + "grad_norm": 2.896134376525879, + "learning_rate": 4.986991479097511e-06, + "loss": 0.7542, + "step": 509 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 3.0390403270721436, + "learning_rate": 4.986927845545454e-06, + "loss": 0.6733, + "step": 510 + }, + { + "epoch": 0.24160756501182032, + "grad_norm": 3.0300254821777344, + "learning_rate": 4.9868640571433044e-06, + "loss": 0.722, + "step": 511 + }, + { + "epoch": 0.242080378250591, + "grad_norm": 3.3037352561950684, + "learning_rate": 4.986800113895035e-06, + "loss": 0.6811, + "step": 512 + }, + { + "epoch": 0.2425531914893617, + "grad_norm": 3.0358474254608154, + "learning_rate": 4.986736015804627e-06, + "loss": 0.7348, + "step": 513 + }, + { + "epoch": 0.24302600472813238, + "grad_norm": 3.108792304992676, + "learning_rate": 4.986671762876071e-06, + "loss": 0.6096, + "step": 514 + }, + { + "epoch": 0.24349881796690306, + "grad_norm": 3.1316237449645996, + "learning_rate": 4.986607355113367e-06, + "loss": 0.6357, + "step": 515 + }, + { + "epoch": 0.24397163120567375, + "grad_norm": 3.3095219135284424, + "learning_rate": 4.986542792520528e-06, + "loss": 0.7515, + "step": 516 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 3.4775984287261963, + "learning_rate": 4.986478075101572e-06, + "loss": 0.7104, + "step": 517 + }, + { + "epoch": 0.24491725768321512, + "grad_norm": 3.341708183288574, + "learning_rate": 4.986413202860528e-06, + "loss": 0.7339, + "step": 518 + }, + { + "epoch": 0.2453900709219858, + "grad_norm": 2.9646966457366943, + "learning_rate": 4.986348175801438e-06, + "loss": 0.6032, + "step": 519 + }, + { + "epoch": 0.2458628841607565, + "grad_norm": 
3.1853902339935303, + "learning_rate": 4.986282993928349e-06, + "loss": 0.6925, + "step": 520 + }, + { + "epoch": 0.24633569739952718, + "grad_norm": 3.286909818649292, + "learning_rate": 4.98621765724532e-06, + "loss": 0.7447, + "step": 521 + }, + { + "epoch": 0.24680851063829787, + "grad_norm": 3.2255051136016846, + "learning_rate": 4.986152165756419e-06, + "loss": 0.7747, + "step": 522 + }, + { + "epoch": 0.24728132387706855, + "grad_norm": 3.002352237701416, + "learning_rate": 4.986086519465724e-06, + "loss": 0.6472, + "step": 523 + }, + { + "epoch": 0.24775413711583924, + "grad_norm": 3.4738974571228027, + "learning_rate": 4.986020718377322e-06, + "loss": 0.7381, + "step": 524 + }, + { + "epoch": 0.24822695035460993, + "grad_norm": 3.4470200538635254, + "learning_rate": 4.985954762495312e-06, + "loss": 0.6878, + "step": 525 + }, + { + "epoch": 0.2486997635933806, + "grad_norm": 2.9219350814819336, + "learning_rate": 4.985888651823799e-06, + "loss": 0.6317, + "step": 526 + }, + { + "epoch": 0.2491725768321513, + "grad_norm": 3.061767101287842, + "learning_rate": 4.985822386366899e-06, + "loss": 0.6842, + "step": 527 + }, + { + "epoch": 0.24964539007092199, + "grad_norm": 3.0291247367858887, + "learning_rate": 4.985755966128742e-06, + "loss": 0.6852, + "step": 528 + }, + { + "epoch": 0.25011820330969264, + "grad_norm": 2.964280843734741, + "learning_rate": 4.985689391113457e-06, + "loss": 0.7738, + "step": 529 + }, + { + "epoch": 0.25059101654846333, + "grad_norm": 3.058302164077759, + "learning_rate": 4.9856226613251955e-06, + "loss": 0.6677, + "step": 530 + }, + { + "epoch": 0.251063829787234, + "grad_norm": 3.345141649246216, + "learning_rate": 4.985555776768109e-06, + "loss": 0.7837, + "step": 531 + }, + { + "epoch": 0.2515366430260047, + "grad_norm": 3.565031051635742, + "learning_rate": 4.9854887374463636e-06, + "loss": 0.7231, + "step": 532 + }, + { + "epoch": 0.2520094562647754, + "grad_norm": 2.7953789234161377, + "learning_rate": 4.985421543364132e-06, + "loss": 0.6102, + "step": 533 + }, + { + "epoch": 0.2524822695035461, + "grad_norm": 2.887606620788574, + "learning_rate": 4.9853541945256e-06, + "loss": 0.6289, + "step": 534 + }, + { + "epoch": 0.25295508274231676, + "grad_norm": 3.1480495929718018, + "learning_rate": 4.985286690934961e-06, + "loss": 0.6348, + "step": 535 + }, + { + "epoch": 0.25342789598108745, + "grad_norm": 2.8912761211395264, + "learning_rate": 4.985219032596416e-06, + "loss": 0.595, + "step": 536 + }, + { + "epoch": 0.25390070921985813, + "grad_norm": 2.947936534881592, + "learning_rate": 4.98515121951418e-06, + "loss": 0.6196, + "step": 537 + }, + { + "epoch": 0.2543735224586288, + "grad_norm": 3.1085827350616455, + "learning_rate": 4.985083251692474e-06, + "loss": 0.6387, + "step": 538 + }, + { + "epoch": 0.2548463356973995, + "grad_norm": 3.1688334941864014, + "learning_rate": 4.985015129135531e-06, + "loss": 0.7055, + "step": 539 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 3.075042963027954, + "learning_rate": 4.984946851847593e-06, + "loss": 0.7515, + "step": 540 + }, + { + "epoch": 0.2557919621749409, + "grad_norm": 3.1933093070983887, + "learning_rate": 4.98487841983291e-06, + "loss": 0.7054, + "step": 541 + }, + { + "epoch": 0.25626477541371157, + "grad_norm": 3.043473958969116, + "learning_rate": 4.984809833095744e-06, + "loss": 0.6281, + "step": 542 + }, + { + "epoch": 0.25673758865248225, + "grad_norm": 3.0532584190368652, + "learning_rate": 4.9847410916403645e-06, + "loss": 0.6155, + "step": 543 + }, + { + "epoch": 
0.25721040189125294, + "grad_norm": 3.608480215072632, + "learning_rate": 4.984672195471053e-06, + "loss": 0.7363, + "step": 544 + }, + { + "epoch": 0.2576832151300236, + "grad_norm": 2.7491862773895264, + "learning_rate": 4.9846031445921e-06, + "loss": 0.6594, + "step": 545 + }, + { + "epoch": 0.2581560283687943, + "grad_norm": 2.8602418899536133, + "learning_rate": 4.984533939007802e-06, + "loss": 0.6742, + "step": 546 + }, + { + "epoch": 0.258628841607565, + "grad_norm": 3.1782007217407227, + "learning_rate": 4.98446457872247e-06, + "loss": 0.731, + "step": 547 + }, + { + "epoch": 0.2591016548463357, + "grad_norm": 2.796147584915161, + "learning_rate": 4.984395063740423e-06, + "loss": 0.6617, + "step": 548 + }, + { + "epoch": 0.25957446808510637, + "grad_norm": 2.8392202854156494, + "learning_rate": 4.984325394065991e-06, + "loss": 0.6753, + "step": 549 + }, + { + "epoch": 0.26004728132387706, + "grad_norm": 3.134672164916992, + "learning_rate": 4.984255569703508e-06, + "loss": 0.7222, + "step": 550 + }, + { + "epoch": 0.26052009456264774, + "grad_norm": 2.734330177307129, + "learning_rate": 4.984185590657325e-06, + "loss": 0.6098, + "step": 551 + }, + { + "epoch": 0.26099290780141843, + "grad_norm": 3.739010810852051, + "learning_rate": 4.984115456931798e-06, + "loss": 0.7457, + "step": 552 + }, + { + "epoch": 0.2614657210401891, + "grad_norm": 2.8412528038024902, + "learning_rate": 4.9840451685312925e-06, + "loss": 0.6972, + "step": 553 + }, + { + "epoch": 0.2619385342789598, + "grad_norm": 3.017395496368408, + "learning_rate": 4.983974725460188e-06, + "loss": 0.6887, + "step": 554 + }, + { + "epoch": 0.2624113475177305, + "grad_norm": 3.2746949195861816, + "learning_rate": 4.98390412772287e-06, + "loss": 0.7047, + "step": 555 + }, + { + "epoch": 0.2628841607565012, + "grad_norm": 3.1561965942382812, + "learning_rate": 4.983833375323732e-06, + "loss": 0.7726, + "step": 556 + }, + { + "epoch": 0.26335697399527186, + "grad_norm": 3.2367217540740967, + "learning_rate": 4.9837624682671816e-06, + "loss": 0.6348, + "step": 557 + }, + { + "epoch": 0.26382978723404255, + "grad_norm": 2.8195858001708984, + "learning_rate": 4.983691406557633e-06, + "loss": 0.6387, + "step": 558 + }, + { + "epoch": 0.26430260047281323, + "grad_norm": 3.349820852279663, + "learning_rate": 4.983620190199511e-06, + "loss": 0.6776, + "step": 559 + }, + { + "epoch": 0.2647754137115839, + "grad_norm": 2.8025588989257812, + "learning_rate": 4.98354881919725e-06, + "loss": 0.6512, + "step": 560 + }, + { + "epoch": 0.2652482269503546, + "grad_norm": 2.9125499725341797, + "learning_rate": 4.983477293555295e-06, + "loss": 0.7024, + "step": 561 + }, + { + "epoch": 0.2657210401891253, + "grad_norm": 3.3479275703430176, + "learning_rate": 4.983405613278098e-06, + "loss": 0.688, + "step": 562 + }, + { + "epoch": 0.266193853427896, + "grad_norm": 3.123971462249756, + "learning_rate": 4.983333778370123e-06, + "loss": 0.6743, + "step": 563 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.891625165939331, + "learning_rate": 4.983261788835843e-06, + "loss": 0.5971, + "step": 564 + }, + { + "epoch": 0.26713947990543735, + "grad_norm": 3.5066864490509033, + "learning_rate": 4.98318964467974e-06, + "loss": 0.6958, + "step": 565 + }, + { + "epoch": 0.26761229314420804, + "grad_norm": 2.570547342300415, + "learning_rate": 4.983117345906306e-06, + "loss": 0.609, + "step": 566 + }, + { + "epoch": 0.2680851063829787, + "grad_norm": 3.005106210708618, + "learning_rate": 4.983044892520044e-06, + "loss": 0.6791, + "step": 567 + }, + 
{ + "epoch": 0.2685579196217494, + "grad_norm": 3.429675340652466, + "learning_rate": 4.982972284525463e-06, + "loss": 0.6625, + "step": 568 + }, + { + "epoch": 0.2690307328605201, + "grad_norm": 3.825657367706299, + "learning_rate": 4.982899521927086e-06, + "loss": 0.6368, + "step": 569 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 2.8699095249176025, + "learning_rate": 4.982826604729443e-06, + "loss": 0.6425, + "step": 570 + }, + { + "epoch": 0.26997635933806147, + "grad_norm": 3.1688714027404785, + "learning_rate": 4.982753532937074e-06, + "loss": 0.6904, + "step": 571 + }, + { + "epoch": 0.27044917257683215, + "grad_norm": 3.3889992237091064, + "learning_rate": 4.98268030655453e-06, + "loss": 0.7575, + "step": 572 + }, + { + "epoch": 0.27092198581560284, + "grad_norm": 3.108315944671631, + "learning_rate": 4.982606925586367e-06, + "loss": 0.6648, + "step": 573 + }, + { + "epoch": 0.2713947990543735, + "grad_norm": 3.209831953048706, + "learning_rate": 4.982533390037159e-06, + "loss": 0.657, + "step": 574 + }, + { + "epoch": 0.2718676122931442, + "grad_norm": 3.1740927696228027, + "learning_rate": 4.982459699911482e-06, + "loss": 0.7262, + "step": 575 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 3.0190417766571045, + "learning_rate": 4.982385855213924e-06, + "loss": 0.6368, + "step": 576 + }, + { + "epoch": 0.2728132387706856, + "grad_norm": 3.05049467086792, + "learning_rate": 4.982311855949084e-06, + "loss": 0.72, + "step": 577 + }, + { + "epoch": 0.27328605200945627, + "grad_norm": 2.984816551208496, + "learning_rate": 4.98223770212157e-06, + "loss": 0.6856, + "step": 578 + }, + { + "epoch": 0.27375886524822696, + "grad_norm": 2.744969606399536, + "learning_rate": 4.982163393735998e-06, + "loss": 0.6023, + "step": 579 + }, + { + "epoch": 0.27423167848699764, + "grad_norm": 3.170564889907837, + "learning_rate": 4.982088930796996e-06, + "loss": 0.6678, + "step": 580 + }, + { + "epoch": 0.27470449172576833, + "grad_norm": 2.8686118125915527, + "learning_rate": 4.982014313309199e-06, + "loss": 0.6157, + "step": 581 + }, + { + "epoch": 0.275177304964539, + "grad_norm": 2.8768694400787354, + "learning_rate": 4.981939541277254e-06, + "loss": 0.6566, + "step": 582 + }, + { + "epoch": 0.2756501182033097, + "grad_norm": 2.621481418609619, + "learning_rate": 4.981864614705818e-06, + "loss": 0.7372, + "step": 583 + }, + { + "epoch": 0.2761229314420804, + "grad_norm": 3.527374267578125, + "learning_rate": 4.981789533599554e-06, + "loss": 0.6485, + "step": 584 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 3.3141074180603027, + "learning_rate": 4.981714297963138e-06, + "loss": 0.6816, + "step": 585 + }, + { + "epoch": 0.27706855791962176, + "grad_norm": 2.9247069358825684, + "learning_rate": 4.981638907801255e-06, + "loss": 0.7217, + "step": 586 + }, + { + "epoch": 0.27754137115839245, + "grad_norm": 2.875236749649048, + "learning_rate": 4.981563363118599e-06, + "loss": 0.6662, + "step": 587 + }, + { + "epoch": 0.27801418439716313, + "grad_norm": 2.9540364742279053, + "learning_rate": 4.981487663919874e-06, + "loss": 0.7225, + "step": 588 + }, + { + "epoch": 0.2784869976359338, + "grad_norm": 2.90889310836792, + "learning_rate": 4.981411810209793e-06, + "loss": 0.6054, + "step": 589 + }, + { + "epoch": 0.2789598108747045, + "grad_norm": 2.8541409969329834, + "learning_rate": 4.981335801993078e-06, + "loss": 0.6539, + "step": 590 + }, + { + "epoch": 0.2794326241134752, + "grad_norm": 3.1600730419158936, + "learning_rate": 4.981259639274465e-06, + "loss": 0.6415, + "step": 
591 + }, + { + "epoch": 0.2799054373522459, + "grad_norm": 3.569376230239868, + "learning_rate": 4.981183322058693e-06, + "loss": 0.6944, + "step": 592 + }, + { + "epoch": 0.28037825059101656, + "grad_norm": 3.067667007446289, + "learning_rate": 4.981106850350515e-06, + "loss": 0.7378, + "step": 593 + }, + { + "epoch": 0.28085106382978725, + "grad_norm": 3.082073450088501, + "learning_rate": 4.981030224154693e-06, + "loss": 0.693, + "step": 594 + }, + { + "epoch": 0.28132387706855794, + "grad_norm": 2.902932643890381, + "learning_rate": 4.980953443475998e-06, + "loss": 0.6549, + "step": 595 + }, + { + "epoch": 0.2817966903073286, + "grad_norm": 2.6821181774139404, + "learning_rate": 4.980876508319211e-06, + "loss": 0.6231, + "step": 596 + }, + { + "epoch": 0.2822695035460993, + "grad_norm": 3.1747355461120605, + "learning_rate": 4.9807994186891215e-06, + "loss": 0.6826, + "step": 597 + }, + { + "epoch": 0.28274231678487, + "grad_norm": 2.6975860595703125, + "learning_rate": 4.980722174590531e-06, + "loss": 0.6669, + "step": 598 + }, + { + "epoch": 0.2832151300236407, + "grad_norm": 2.924285650253296, + "learning_rate": 4.9806447760282486e-06, + "loss": 0.689, + "step": 599 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 2.941417694091797, + "learning_rate": 4.980567223007093e-06, + "loss": 0.6672, + "step": 600 + }, + { + "epoch": 0.28416075650118205, + "grad_norm": 2.8582186698913574, + "learning_rate": 4.980489515531892e-06, + "loss": 0.6229, + "step": 601 + }, + { + "epoch": 0.28463356973995274, + "grad_norm": 2.6462013721466064, + "learning_rate": 4.9804116536074865e-06, + "loss": 0.606, + "step": 602 + }, + { + "epoch": 0.2851063829787234, + "grad_norm": 2.9029998779296875, + "learning_rate": 4.980333637238723e-06, + "loss": 0.5915, + "step": 603 + }, + { + "epoch": 0.2855791962174941, + "grad_norm": 3.9359042644500732, + "learning_rate": 4.980255466430462e-06, + "loss": 0.7035, + "step": 604 + }, + { + "epoch": 0.2860520094562648, + "grad_norm": 3.200524091720581, + "learning_rate": 4.980177141187566e-06, + "loss": 0.7156, + "step": 605 + }, + { + "epoch": 0.2865248226950355, + "grad_norm": 3.1708686351776123, + "learning_rate": 4.980098661514916e-06, + "loss": 0.746, + "step": 606 + }, + { + "epoch": 0.28699763593380617, + "grad_norm": 2.8926830291748047, + "learning_rate": 4.980020027417397e-06, + "loss": 0.6282, + "step": 607 + }, + { + "epoch": 0.28747044917257686, + "grad_norm": 3.0526294708251953, + "learning_rate": 4.979941238899906e-06, + "loss": 0.6594, + "step": 608 + }, + { + "epoch": 0.28794326241134754, + "grad_norm": 2.9869306087493896, + "learning_rate": 4.9798622959673486e-06, + "loss": 0.7771, + "step": 609 + }, + { + "epoch": 0.28841607565011823, + "grad_norm": 2.7894513607025146, + "learning_rate": 4.979783198624638e-06, + "loss": 0.6819, + "step": 610 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 2.958575963973999, + "learning_rate": 4.9797039468767025e-06, + "loss": 0.6474, + "step": 611 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 3.423748016357422, + "learning_rate": 4.979624540728475e-06, + "loss": 0.7389, + "step": 612 + }, + { + "epoch": 0.28983451536643023, + "grad_norm": 2.9641635417938232, + "learning_rate": 4.9795449801849e-06, + "loss": 0.6005, + "step": 613 + }, + { + "epoch": 0.2903073286052009, + "grad_norm": 3.02274227142334, + "learning_rate": 4.979465265250933e-06, + "loss": 0.6358, + "step": 614 + }, + { + "epoch": 0.2907801418439716, + "grad_norm": 3.0562758445739746, + "learning_rate": 4.979385395931534e-06, + 
"loss": 0.6313, + "step": 615 + }, + { + "epoch": 0.2912529550827423, + "grad_norm": 3.301816701889038, + "learning_rate": 4.97930537223168e-06, + "loss": 0.7264, + "step": 616 + }, + { + "epoch": 0.291725768321513, + "grad_norm": 2.975360870361328, + "learning_rate": 4.979225194156351e-06, + "loss": 0.613, + "step": 617 + }, + { + "epoch": 0.29219858156028367, + "grad_norm": 2.9245030879974365, + "learning_rate": 4.97914486171054e-06, + "loss": 0.6646, + "step": 618 + }, + { + "epoch": 0.29267139479905435, + "grad_norm": 3.1336188316345215, + "learning_rate": 4.979064374899249e-06, + "loss": 0.6421, + "step": 619 + }, + { + "epoch": 0.29314420803782504, + "grad_norm": 3.6298763751983643, + "learning_rate": 4.978983733727491e-06, + "loss": 0.6433, + "step": 620 + }, + { + "epoch": 0.2936170212765957, + "grad_norm": 2.919597625732422, + "learning_rate": 4.9789029382002845e-06, + "loss": 0.6288, + "step": 621 + }, + { + "epoch": 0.2940898345153664, + "grad_norm": 3.2206127643585205, + "learning_rate": 4.978821988322662e-06, + "loss": 0.7102, + "step": 622 + }, + { + "epoch": 0.2945626477541371, + "grad_norm": 3.1767101287841797, + "learning_rate": 4.978740884099664e-06, + "loss": 0.6722, + "step": 623 + }, + { + "epoch": 0.2950354609929078, + "grad_norm": 3.3425452709198, + "learning_rate": 4.97865962553634e-06, + "loss": 0.6492, + "step": 624 + }, + { + "epoch": 0.29550827423167847, + "grad_norm": 3.0408358573913574, + "learning_rate": 4.97857821263775e-06, + "loss": 0.6522, + "step": 625 + }, + { + "epoch": 0.29598108747044916, + "grad_norm": 2.8144783973693848, + "learning_rate": 4.978496645408963e-06, + "loss": 0.7237, + "step": 626 + }, + { + "epoch": 0.29645390070921984, + "grad_norm": 3.7010560035705566, + "learning_rate": 4.978414923855057e-06, + "loss": 0.7509, + "step": 627 + }, + { + "epoch": 0.29692671394799053, + "grad_norm": 2.9438371658325195, + "learning_rate": 4.978333047981122e-06, + "loss": 0.6244, + "step": 628 + }, + { + "epoch": 0.2973995271867612, + "grad_norm": 3.285982370376587, + "learning_rate": 4.978251017792255e-06, + "loss": 0.7553, + "step": 629 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 3.7021138668060303, + "learning_rate": 4.978168833293564e-06, + "loss": 0.7859, + "step": 630 + }, + { + "epoch": 0.2983451536643026, + "grad_norm": 3.481858730316162, + "learning_rate": 4.9780864944901654e-06, + "loss": 0.7146, + "step": 631 + }, + { + "epoch": 0.2988179669030733, + "grad_norm": 3.693824529647827, + "learning_rate": 4.978004001387188e-06, + "loss": 0.6608, + "step": 632 + }, + { + "epoch": 0.29929078014184396, + "grad_norm": 3.0069146156311035, + "learning_rate": 4.9779213539897665e-06, + "loss": 0.6506, + "step": 633 + }, + { + "epoch": 0.29976359338061465, + "grad_norm": 3.037644147872925, + "learning_rate": 4.977838552303048e-06, + "loss": 0.6487, + "step": 634 + }, + { + "epoch": 0.30023640661938533, + "grad_norm": 3.018554449081421, + "learning_rate": 4.977755596332188e-06, + "loss": 0.6128, + "step": 635 + }, + { + "epoch": 0.300709219858156, + "grad_norm": 3.000312089920044, + "learning_rate": 4.977672486082351e-06, + "loss": 0.6431, + "step": 636 + }, + { + "epoch": 0.3011820330969267, + "grad_norm": 2.836803913116455, + "learning_rate": 4.977589221558713e-06, + "loss": 0.5914, + "step": 637 + }, + { + "epoch": 0.3016548463356974, + "grad_norm": 3.080469846725464, + "learning_rate": 4.977505802766457e-06, + "loss": 0.7265, + "step": 638 + }, + { + "epoch": 0.3021276595744681, + "grad_norm": 3.2245471477508545, + "learning_rate": 
4.97742222971078e-06, + "loss": 0.6895, + "step": 639 + }, + { + "epoch": 0.30260047281323876, + "grad_norm": 3.559006452560425, + "learning_rate": 4.977338502396882e-06, + "loss": 0.7439, + "step": 640 + }, + { + "epoch": 0.30307328605200945, + "grad_norm": 2.9116289615631104, + "learning_rate": 4.9772546208299795e-06, + "loss": 0.6907, + "step": 641 + }, + { + "epoch": 0.30354609929078014, + "grad_norm": 3.3645524978637695, + "learning_rate": 4.977170585015295e-06, + "loss": 0.6983, + "step": 642 + }, + { + "epoch": 0.3040189125295508, + "grad_norm": 3.080148458480835, + "learning_rate": 4.977086394958058e-06, + "loss": 0.7016, + "step": 643 + }, + { + "epoch": 0.3044917257683215, + "grad_norm": 2.9276750087738037, + "learning_rate": 4.977002050663515e-06, + "loss": 0.6509, + "step": 644 + }, + { + "epoch": 0.3049645390070922, + "grad_norm": 3.183609962463379, + "learning_rate": 4.976917552136914e-06, + "loss": 0.6814, + "step": 645 + }, + { + "epoch": 0.3054373522458629, + "grad_norm": 3.0980000495910645, + "learning_rate": 4.976832899383519e-06, + "loss": 0.6319, + "step": 646 + }, + { + "epoch": 0.30591016548463357, + "grad_norm": 3.211376190185547, + "learning_rate": 4.9767480924086e-06, + "loss": 0.6365, + "step": 647 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 3.214430093765259, + "learning_rate": 4.976663131217437e-06, + "loss": 0.6006, + "step": 648 + }, + { + "epoch": 0.30685579196217494, + "grad_norm": 3.0914318561553955, + "learning_rate": 4.976578015815321e-06, + "loss": 0.7162, + "step": 649 + }, + { + "epoch": 0.3073286052009456, + "grad_norm": 2.7644500732421875, + "learning_rate": 4.976492746207551e-06, + "loss": 0.6045, + "step": 650 + }, + { + "epoch": 0.3078014184397163, + "grad_norm": 3.1913280487060547, + "learning_rate": 4.9764073223994374e-06, + "loss": 0.6796, + "step": 651 + }, + { + "epoch": 0.308274231678487, + "grad_norm": 2.8919692039489746, + "learning_rate": 4.976321744396299e-06, + "loss": 0.6683, + "step": 652 + }, + { + "epoch": 0.3087470449172577, + "grad_norm": 2.862234115600586, + "learning_rate": 4.976236012203463e-06, + "loss": 0.6631, + "step": 653 + }, + { + "epoch": 0.30921985815602837, + "grad_norm": 2.9708092212677, + "learning_rate": 4.976150125826268e-06, + "loss": 0.6326, + "step": 654 + }, + { + "epoch": 0.30969267139479906, + "grad_norm": 2.892465353012085, + "learning_rate": 4.976064085270063e-06, + "loss": 0.6574, + "step": 655 + }, + { + "epoch": 0.31016548463356974, + "grad_norm": 3.9215126037597656, + "learning_rate": 4.975977890540205e-06, + "loss": 0.7351, + "step": 656 + }, + { + "epoch": 0.31063829787234043, + "grad_norm": 2.9544081687927246, + "learning_rate": 4.975891541642059e-06, + "loss": 0.7264, + "step": 657 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 2.995035409927368, + "learning_rate": 4.975805038581005e-06, + "loss": 0.7405, + "step": 658 + }, + { + "epoch": 0.3115839243498818, + "grad_norm": 2.9653120040893555, + "learning_rate": 4.975718381362427e-06, + "loss": 0.679, + "step": 659 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 2.93976092338562, + "learning_rate": 4.9756315699917205e-06, + "loss": 0.627, + "step": 660 + }, + { + "epoch": 0.3125295508274232, + "grad_norm": 3.106522560119629, + "learning_rate": 4.9755446044742915e-06, + "loss": 0.6329, + "step": 661 + }, + { + "epoch": 0.31300236406619386, + "grad_norm": 3.0238280296325684, + "learning_rate": 4.975457484815554e-06, + "loss": 0.6643, + "step": 662 + }, + { + "epoch": 0.31347517730496455, + "grad_norm": 2.943528175354004, + 
"learning_rate": 4.9753702110209356e-06, + "loss": 0.668, + "step": 663 + }, + { + "epoch": 0.31394799054373523, + "grad_norm": 2.6840121746063232, + "learning_rate": 4.9752827830958676e-06, + "loss": 0.5482, + "step": 664 + }, + { + "epoch": 0.3144208037825059, + "grad_norm": 2.823875904083252, + "learning_rate": 4.975195201045794e-06, + "loss": 0.7017, + "step": 665 + }, + { + "epoch": 0.3148936170212766, + "grad_norm": 3.148181200027466, + "learning_rate": 4.975107464876168e-06, + "loss": 0.747, + "step": 666 + }, + { + "epoch": 0.3153664302600473, + "grad_norm": 2.630584478378296, + "learning_rate": 4.9750195745924545e-06, + "loss": 0.5987, + "step": 667 + }, + { + "epoch": 0.315839243498818, + "grad_norm": 3.075866460800171, + "learning_rate": 4.974931530200124e-06, + "loss": 0.664, + "step": 668 + }, + { + "epoch": 0.31631205673758866, + "grad_norm": 2.947197914123535, + "learning_rate": 4.974843331704659e-06, + "loss": 0.631, + "step": 669 + }, + { + "epoch": 0.31678486997635935, + "grad_norm": 3.519646644592285, + "learning_rate": 4.974754979111552e-06, + "loss": 0.7154, + "step": 670 + }, + { + "epoch": 0.31725768321513004, + "grad_norm": 2.8687186241149902, + "learning_rate": 4.974666472426305e-06, + "loss": 0.6366, + "step": 671 + }, + { + "epoch": 0.3177304964539007, + "grad_norm": 2.6966612339019775, + "learning_rate": 4.974577811654426e-06, + "loss": 0.7112, + "step": 672 + }, + { + "epoch": 0.3182033096926714, + "grad_norm": 3.1390228271484375, + "learning_rate": 4.974488996801439e-06, + "loss": 0.6882, + "step": 673 + }, + { + "epoch": 0.3186761229314421, + "grad_norm": 3.4667599201202393, + "learning_rate": 4.974400027872871e-06, + "loss": 0.7153, + "step": 674 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 2.9632184505462646, + "learning_rate": 4.974310904874265e-06, + "loss": 0.7081, + "step": 675 + }, + { + "epoch": 0.31962174940898347, + "grad_norm": 3.46150279045105, + "learning_rate": 4.9742216278111666e-06, + "loss": 0.6242, + "step": 676 + }, + { + "epoch": 0.32009456264775416, + "grad_norm": 3.380403757095337, + "learning_rate": 4.974132196689137e-06, + "loss": 0.6863, + "step": 677 + }, + { + "epoch": 0.32056737588652484, + "grad_norm": 3.4279606342315674, + "learning_rate": 4.974042611513746e-06, + "loss": 0.6388, + "step": 678 + }, + { + "epoch": 0.3210401891252955, + "grad_norm": 2.634523391723633, + "learning_rate": 4.973952872290568e-06, + "loss": 0.6038, + "step": 679 + }, + { + "epoch": 0.3215130023640662, + "grad_norm": 3.19693922996521, + "learning_rate": 4.973862979025194e-06, + "loss": 0.6383, + "step": 680 + }, + { + "epoch": 0.3219858156028369, + "grad_norm": 3.437692165374756, + "learning_rate": 4.973772931723218e-06, + "loss": 0.7288, + "step": 681 + }, + { + "epoch": 0.3224586288416076, + "grad_norm": 2.506301164627075, + "learning_rate": 4.97368273039025e-06, + "loss": 0.5707, + "step": 682 + }, + { + "epoch": 0.3229314420803783, + "grad_norm": 3.0942845344543457, + "learning_rate": 4.9735923750319044e-06, + "loss": 0.6348, + "step": 683 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 3.0889835357666016, + "learning_rate": 4.973501865653809e-06, + "loss": 0.6697, + "step": 684 + }, + { + "epoch": 0.32387706855791965, + "grad_norm": 3.0391931533813477, + "learning_rate": 4.973411202261598e-06, + "loss": 0.7091, + "step": 685 + }, + { + "epoch": 0.32434988179669033, + "grad_norm": 3.0333497524261475, + "learning_rate": 4.973320384860917e-06, + "loss": 0.6403, + "step": 686 + }, + { + "epoch": 0.324822695035461, + "grad_norm": 
2.9714622497558594, + "learning_rate": 4.973229413457421e-06, + "loss": 0.6977, + "step": 687 + }, + { + "epoch": 0.3252955082742317, + "grad_norm": 3.057558298110962, + "learning_rate": 4.973138288056774e-06, + "loss": 0.7236, + "step": 688 + }, + { + "epoch": 0.3257683215130024, + "grad_norm": 2.921093463897705, + "learning_rate": 4.97304700866465e-06, + "loss": 0.576, + "step": 689 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 3.0287256240844727, + "learning_rate": 4.972955575286732e-06, + "loss": 0.7077, + "step": 690 + }, + { + "epoch": 0.32671394799054376, + "grad_norm": 2.8621346950531006, + "learning_rate": 4.972863987928716e-06, + "loss": 0.6952, + "step": 691 + }, + { + "epoch": 0.3271867612293144, + "grad_norm": 2.631359100341797, + "learning_rate": 4.9727722465963006e-06, + "loss": 0.6931, + "step": 692 + }, + { + "epoch": 0.3276595744680851, + "grad_norm": 2.8484320640563965, + "learning_rate": 4.972680351295201e-06, + "loss": 0.6292, + "step": 693 + }, + { + "epoch": 0.32813238770685577, + "grad_norm": 2.593001365661621, + "learning_rate": 4.972588302031138e-06, + "loss": 0.5942, + "step": 694 + }, + { + "epoch": 0.32860520094562645, + "grad_norm": 2.6321065425872803, + "learning_rate": 4.972496098809844e-06, + "loss": 0.65, + "step": 695 + }, + { + "epoch": 0.32907801418439714, + "grad_norm": 3.2516732215881348, + "learning_rate": 4.972403741637059e-06, + "loss": 0.7385, + "step": 696 + }, + { + "epoch": 0.3295508274231678, + "grad_norm": 3.180854320526123, + "learning_rate": 4.972311230518535e-06, + "loss": 0.6569, + "step": 697 + }, + { + "epoch": 0.3300236406619385, + "grad_norm": 4.161016941070557, + "learning_rate": 4.972218565460031e-06, + "loss": 0.6416, + "step": 698 + }, + { + "epoch": 0.3304964539007092, + "grad_norm": 3.153897762298584, + "learning_rate": 4.972125746467317e-06, + "loss": 0.7196, + "step": 699 + }, + { + "epoch": 0.3309692671394799, + "grad_norm": 2.9595556259155273, + "learning_rate": 4.972032773546173e-06, + "loss": 0.7093, + "step": 700 + }, + { + "epoch": 0.33144208037825057, + "grad_norm": 3.1086833477020264, + "learning_rate": 4.9719396467023875e-06, + "loss": 0.6963, + "step": 701 + }, + { + "epoch": 0.33191489361702126, + "grad_norm": 2.958921432495117, + "learning_rate": 4.971846365941759e-06, + "loss": 0.6518, + "step": 702 + }, + { + "epoch": 0.33238770685579194, + "grad_norm": 2.8745479583740234, + "learning_rate": 4.971752931270096e-06, + "loss": 0.696, + "step": 703 + }, + { + "epoch": 0.33286052009456263, + "grad_norm": 3.224358558654785, + "learning_rate": 4.971659342693217e-06, + "loss": 0.6769, + "step": 704 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.696319580078125, + "learning_rate": 4.9715656002169486e-06, + "loss": 0.6833, + "step": 705 + }, + { + "epoch": 0.333806146572104, + "grad_norm": 2.9283502101898193, + "learning_rate": 4.971471703847127e-06, + "loss": 0.6784, + "step": 706 + }, + { + "epoch": 0.3342789598108747, + "grad_norm": 2.654914140701294, + "learning_rate": 4.9713776535896e-06, + "loss": 0.6337, + "step": 707 + }, + { + "epoch": 0.3347517730496454, + "grad_norm": 3.041555643081665, + "learning_rate": 4.971283449450224e-06, + "loss": 0.6227, + "step": 708 + }, + { + "epoch": 0.33522458628841606, + "grad_norm": 2.893008232116699, + "learning_rate": 4.971189091434863e-06, + "loss": 0.655, + "step": 709 + }, + { + "epoch": 0.33569739952718675, + "grad_norm": 2.8806653022766113, + "learning_rate": 4.971094579549393e-06, + "loss": 0.7077, + "step": 710 + }, + { + "epoch": 0.33617021276595743, + 
"grad_norm": 3.4830048084259033, + "learning_rate": 4.9709999137996986e-06, + "loss": 0.7461, + "step": 711 + }, + { + "epoch": 0.3366430260047281, + "grad_norm": 3.155444860458374, + "learning_rate": 4.970905094191674e-06, + "loss": 0.652, + "step": 712 + }, + { + "epoch": 0.3371158392434988, + "grad_norm": 2.7608706951141357, + "learning_rate": 4.970810120731225e-06, + "loss": 0.684, + "step": 713 + }, + { + "epoch": 0.3375886524822695, + "grad_norm": 2.8209474086761475, + "learning_rate": 4.970714993424265e-06, + "loss": 0.6009, + "step": 714 + }, + { + "epoch": 0.3380614657210402, + "grad_norm": 3.6532654762268066, + "learning_rate": 4.9706197122767145e-06, + "loss": 0.702, + "step": 715 + }, + { + "epoch": 0.33853427895981086, + "grad_norm": 2.6276566982269287, + "learning_rate": 4.970524277294508e-06, + "loss": 0.6338, + "step": 716 + }, + { + "epoch": 0.33900709219858155, + "grad_norm": 3.509871482849121, + "learning_rate": 4.970428688483589e-06, + "loss": 0.6853, + "step": 717 + }, + { + "epoch": 0.33947990543735224, + "grad_norm": 5.332682132720947, + "learning_rate": 4.970332945849906e-06, + "loss": 0.6684, + "step": 718 + }, + { + "epoch": 0.3399527186761229, + "grad_norm": 2.718801975250244, + "learning_rate": 4.970237049399424e-06, + "loss": 0.6676, + "step": 719 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 3.891003131866455, + "learning_rate": 4.970140999138112e-06, + "loss": 0.7043, + "step": 720 + }, + { + "epoch": 0.3408983451536643, + "grad_norm": 2.8863155841827393, + "learning_rate": 4.970044795071951e-06, + "loss": 0.6563, + "step": 721 + }, + { + "epoch": 0.341371158392435, + "grad_norm": 3.2527518272399902, + "learning_rate": 4.969948437206932e-06, + "loss": 0.7244, + "step": 722 + }, + { + "epoch": 0.34184397163120567, + "grad_norm": 2.9726758003234863, + "learning_rate": 4.969851925549054e-06, + "loss": 0.6548, + "step": 723 + }, + { + "epoch": 0.34231678486997635, + "grad_norm": 3.118309497833252, + "learning_rate": 4.969755260104327e-06, + "loss": 0.7293, + "step": 724 + }, + { + "epoch": 0.34278959810874704, + "grad_norm": 3.373068332672119, + "learning_rate": 4.969658440878769e-06, + "loss": 0.6444, + "step": 725 + }, + { + "epoch": 0.3432624113475177, + "grad_norm": 2.7157437801361084, + "learning_rate": 4.969561467878409e-06, + "loss": 0.642, + "step": 726 + }, + { + "epoch": 0.3437352245862884, + "grad_norm": 2.58929705619812, + "learning_rate": 4.969464341109285e-06, + "loss": 0.6165, + "step": 727 + }, + { + "epoch": 0.3442080378250591, + "grad_norm": 2.8811306953430176, + "learning_rate": 4.969367060577445e-06, + "loss": 0.7127, + "step": 728 + }, + { + "epoch": 0.3446808510638298, + "grad_norm": 3.494358539581299, + "learning_rate": 4.969269626288946e-06, + "loss": 0.7103, + "step": 729 + }, + { + "epoch": 0.34515366430260047, + "grad_norm": 2.9753928184509277, + "learning_rate": 4.969172038249855e-06, + "loss": 0.6911, + "step": 730 + }, + { + "epoch": 0.34562647754137116, + "grad_norm": 3.2885913848876953, + "learning_rate": 4.969074296466247e-06, + "loss": 0.6968, + "step": 731 + }, + { + "epoch": 0.34609929078014184, + "grad_norm": 2.7564568519592285, + "learning_rate": 4.968976400944211e-06, + "loss": 0.6843, + "step": 732 + }, + { + "epoch": 0.34657210401891253, + "grad_norm": 2.9255006313323975, + "learning_rate": 4.96887835168984e-06, + "loss": 0.6024, + "step": 733 + }, + { + "epoch": 0.3470449172576832, + "grad_norm": 3.1808290481567383, + "learning_rate": 4.968780148709239e-06, + "loss": 0.7377, + "step": 734 + }, + { + "epoch": 
0.3475177304964539, + "grad_norm": 2.956666946411133, + "learning_rate": 4.968681792008523e-06, + "loss": 0.65, + "step": 735 + }, + { + "epoch": 0.3479905437352246, + "grad_norm": 2.9631855487823486, + "learning_rate": 4.9685832815938175e-06, + "loss": 0.677, + "step": 736 + }, + { + "epoch": 0.3484633569739953, + "grad_norm": 2.501917600631714, + "learning_rate": 4.968484617471256e-06, + "loss": 0.6282, + "step": 737 + }, + { + "epoch": 0.34893617021276596, + "grad_norm": 2.750779628753662, + "learning_rate": 4.968385799646981e-06, + "loss": 0.6507, + "step": 738 + }, + { + "epoch": 0.34940898345153665, + "grad_norm": 2.872300624847412, + "learning_rate": 4.968286828127146e-06, + "loss": 0.5949, + "step": 739 + }, + { + "epoch": 0.34988179669030733, + "grad_norm": 2.6316142082214355, + "learning_rate": 4.9681877029179124e-06, + "loss": 0.6328, + "step": 740 + }, + { + "epoch": 0.350354609929078, + "grad_norm": 3.244364023208618, + "learning_rate": 4.968088424025454e-06, + "loss": 0.7393, + "step": 741 + }, + { + "epoch": 0.3508274231678487, + "grad_norm": 2.620465040206909, + "learning_rate": 4.967988991455951e-06, + "loss": 0.6797, + "step": 742 + }, + { + "epoch": 0.3513002364066194, + "grad_norm": 2.854513645172119, + "learning_rate": 4.967889405215596e-06, + "loss": 0.6368, + "step": 743 + }, + { + "epoch": 0.3517730496453901, + "grad_norm": 2.579854726791382, + "learning_rate": 4.9677896653105886e-06, + "loss": 0.6489, + "step": 744 + }, + { + "epoch": 0.35224586288416077, + "grad_norm": 3.0697381496429443, + "learning_rate": 4.96768977174714e-06, + "loss": 0.6313, + "step": 745 + }, + { + "epoch": 0.35271867612293145, + "grad_norm": 3.369338035583496, + "learning_rate": 4.96758972453147e-06, + "loss": 0.7416, + "step": 746 + }, + { + "epoch": 0.35319148936170214, + "grad_norm": 2.836221933364868, + "learning_rate": 4.967489523669807e-06, + "loss": 0.6422, + "step": 747 + }, + { + "epoch": 0.3536643026004728, + "grad_norm": 2.929579496383667, + "learning_rate": 4.967389169168392e-06, + "loss": 0.6482, + "step": 748 + }, + { + "epoch": 0.3541371158392435, + "grad_norm": 2.9243831634521484, + "learning_rate": 4.967288661033472e-06, + "loss": 0.5813, + "step": 749 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 3.7555336952209473, + "learning_rate": 4.967187999271306e-06, + "loss": 0.6501, + "step": 750 + }, + { + "epoch": 0.3550827423167849, + "grad_norm": 3.4279143810272217, + "learning_rate": 4.9670871838881615e-06, + "loss": 0.6326, + "step": 751 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 2.875066041946411, + "learning_rate": 4.9669862148903166e-06, + "loss": 0.664, + "step": 752 + }, + { + "epoch": 0.35602836879432626, + "grad_norm": 3.130394697189331, + "learning_rate": 4.966885092284057e-06, + "loss": 0.706, + "step": 753 + }, + { + "epoch": 0.35650118203309694, + "grad_norm": 2.9606287479400635, + "learning_rate": 4.96678381607568e-06, + "loss": 0.693, + "step": 754 + }, + { + "epoch": 0.35697399527186763, + "grad_norm": 3.0584909915924072, + "learning_rate": 4.966682386271491e-06, + "loss": 0.6034, + "step": 755 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 2.8215200901031494, + "learning_rate": 4.966580802877805e-06, + "loss": 0.6217, + "step": 756 + }, + { + "epoch": 0.357919621749409, + "grad_norm": 2.7348055839538574, + "learning_rate": 4.966479065900949e-06, + "loss": 0.6194, + "step": 757 + }, + { + "epoch": 0.3583924349881797, + "grad_norm": 3.2347466945648193, + "learning_rate": 4.966377175347257e-06, + "loss": 0.6377, + "step": 758 + }, + 
{ + "epoch": 0.3588652482269504, + "grad_norm": 3.311845302581787, + "learning_rate": 4.966275131223072e-06, + "loss": 0.6234, + "step": 759 + }, + { + "epoch": 0.35933806146572106, + "grad_norm": 3.0384368896484375, + "learning_rate": 4.96617293353475e-06, + "loss": 0.609, + "step": 760 + }, + { + "epoch": 0.35981087470449175, + "grad_norm": 3.516854763031006, + "learning_rate": 4.966070582288653e-06, + "loss": 0.6627, + "step": 761 + }, + { + "epoch": 0.36028368794326243, + "grad_norm": 3.2425215244293213, + "learning_rate": 4.9659680774911534e-06, + "loss": 0.7355, + "step": 762 + }, + { + "epoch": 0.3607565011820331, + "grad_norm": 3.2665750980377197, + "learning_rate": 4.965865419148636e-06, + "loss": 0.6787, + "step": 763 + }, + { + "epoch": 0.3612293144208038, + "grad_norm": 2.729428291320801, + "learning_rate": 4.96576260726749e-06, + "loss": 0.6272, + "step": 764 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 3.299969434738159, + "learning_rate": 4.965659641854119e-06, + "loss": 0.6552, + "step": 765 + }, + { + "epoch": 0.3621749408983452, + "grad_norm": 2.7090916633605957, + "learning_rate": 4.965556522914934e-06, + "loss": 0.6661, + "step": 766 + }, + { + "epoch": 0.36264775413711586, + "grad_norm": 2.488846778869629, + "learning_rate": 4.965453250456355e-06, + "loss": 0.5821, + "step": 767 + }, + { + "epoch": 0.36312056737588655, + "grad_norm": 2.5267233848571777, + "learning_rate": 4.965349824484813e-06, + "loss": 0.5593, + "step": 768 + }, + { + "epoch": 0.36359338061465724, + "grad_norm": 3.0646679401397705, + "learning_rate": 4.965246245006748e-06, + "loss": 0.6341, + "step": 769 + }, + { + "epoch": 0.3640661938534279, + "grad_norm": 2.9877712726593018, + "learning_rate": 4.965142512028609e-06, + "loss": 0.7202, + "step": 770 + }, + { + "epoch": 0.3645390070921986, + "grad_norm": 3.7494113445281982, + "learning_rate": 4.965038625556854e-06, + "loss": 0.7643, + "step": 771 + }, + { + "epoch": 0.3650118203309693, + "grad_norm": 2.8382890224456787, + "learning_rate": 4.964934585597954e-06, + "loss": 0.6522, + "step": 772 + }, + { + "epoch": 0.3654846335697399, + "grad_norm": 3.091655731201172, + "learning_rate": 4.9648303921583854e-06, + "loss": 0.7117, + "step": 773 + }, + { + "epoch": 0.3659574468085106, + "grad_norm": 3.0608325004577637, + "learning_rate": 4.964726045244635e-06, + "loss": 0.6538, + "step": 774 + }, + { + "epoch": 0.3664302600472813, + "grad_norm": 2.8492867946624756, + "learning_rate": 4.964621544863203e-06, + "loss": 0.6079, + "step": 775 + }, + { + "epoch": 0.366903073286052, + "grad_norm": 3.0669894218444824, + "learning_rate": 4.964516891020594e-06, + "loss": 0.6223, + "step": 776 + }, + { + "epoch": 0.36737588652482267, + "grad_norm": 3.089984893798828, + "learning_rate": 4.964412083723325e-06, + "loss": 0.671, + "step": 777 + }, + { + "epoch": 0.36784869976359336, + "grad_norm": 2.905242443084717, + "learning_rate": 4.964307122977921e-06, + "loss": 0.62, + "step": 778 + }, + { + "epoch": 0.36832151300236404, + "grad_norm": 3.954436779022217, + "learning_rate": 4.964202008790918e-06, + "loss": 0.6535, + "step": 779 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 2.6026058197021484, + "learning_rate": 4.9640967411688615e-06, + "loss": 0.5865, + "step": 780 + }, + { + "epoch": 0.3692671394799054, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.963991320118306e-06, + "loss": 0.6698, + "step": 781 + }, + { + "epoch": 0.3697399527186761, + "grad_norm": 2.9411263465881348, + "learning_rate": 4.963885745645815e-06, + "loss": 0.6173, + 
"step": 782 + }, + { + "epoch": 0.3702127659574468, + "grad_norm": 2.5679805278778076, + "learning_rate": 4.963780017757962e-06, + "loss": 0.6285, + "step": 783 + }, + { + "epoch": 0.3706855791962175, + "grad_norm": 3.3100640773773193, + "learning_rate": 4.963674136461332e-06, + "loss": 0.5968, + "step": 784 + }, + { + "epoch": 0.37115839243498816, + "grad_norm": 3.1293699741363525, + "learning_rate": 4.963568101762515e-06, + "loss": 0.697, + "step": 785 + }, + { + "epoch": 0.37163120567375885, + "grad_norm": 3.043853759765625, + "learning_rate": 4.963461913668115e-06, + "loss": 0.5881, + "step": 786 + }, + { + "epoch": 0.37210401891252953, + "grad_norm": 3.07351016998291, + "learning_rate": 4.963355572184744e-06, + "loss": 0.6307, + "step": 787 + }, + { + "epoch": 0.3725768321513002, + "grad_norm": 2.7381317615509033, + "learning_rate": 4.9632490773190225e-06, + "loss": 0.716, + "step": 788 + }, + { + "epoch": 0.3730496453900709, + "grad_norm": 2.892221450805664, + "learning_rate": 4.963142429077582e-06, + "loss": 0.6867, + "step": 789 + }, + { + "epoch": 0.3735224586288416, + "grad_norm": 3.133122205734253, + "learning_rate": 4.963035627467064e-06, + "loss": 0.659, + "step": 790 + }, + { + "epoch": 0.3739952718676123, + "grad_norm": 3.032599925994873, + "learning_rate": 4.962928672494116e-06, + "loss": 0.6848, + "step": 791 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 3.0076355934143066, + "learning_rate": 4.9628215641654e-06, + "loss": 0.6549, + "step": 792 + }, + { + "epoch": 0.37494089834515365, + "grad_norm": 2.8904454708099365, + "learning_rate": 4.962714302487585e-06, + "loss": 0.6484, + "step": 793 + }, + { + "epoch": 0.37541371158392434, + "grad_norm": 2.881364107131958, + "learning_rate": 4.9626068874673486e-06, + "loss": 0.721, + "step": 794 + }, + { + "epoch": 0.375886524822695, + "grad_norm": 3.11668062210083, + "learning_rate": 4.962499319111379e-06, + "loss": 0.7824, + "step": 795 + }, + { + "epoch": 0.3763593380614657, + "grad_norm": 2.9201436042785645, + "learning_rate": 4.962391597426374e-06, + "loss": 0.6911, + "step": 796 + }, + { + "epoch": 0.3768321513002364, + "grad_norm": 2.926598072052002, + "learning_rate": 4.962283722419043e-06, + "loss": 0.6715, + "step": 797 + }, + { + "epoch": 0.3773049645390071, + "grad_norm": 2.7267675399780273, + "learning_rate": 4.962175694096101e-06, + "loss": 0.6111, + "step": 798 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 3.194031000137329, + "learning_rate": 4.962067512464275e-06, + "loss": 0.6558, + "step": 799 + }, + { + "epoch": 0.37825059101654845, + "grad_norm": 2.6249136924743652, + "learning_rate": 4.9619591775303e-06, + "loss": 0.6166, + "step": 800 + }, + { + "epoch": 0.37872340425531914, + "grad_norm": 2.6356167793273926, + "learning_rate": 4.961850689300923e-06, + "loss": 0.6112, + "step": 801 + }, + { + "epoch": 0.3791962174940898, + "grad_norm": 3.030724287033081, + "learning_rate": 4.961742047782898e-06, + "loss": 0.6511, + "step": 802 + }, + { + "epoch": 0.3796690307328605, + "grad_norm": 3.4987757205963135, + "learning_rate": 4.96163325298299e-06, + "loss": 0.5888, + "step": 803 + }, + { + "epoch": 0.3801418439716312, + "grad_norm": 3.0371780395507812, + "learning_rate": 4.961524304907974e-06, + "loss": 0.6385, + "step": 804 + }, + { + "epoch": 0.3806146572104019, + "grad_norm": 3.302570104598999, + "learning_rate": 4.961415203564632e-06, + "loss": 0.6515, + "step": 805 + }, + { + "epoch": 0.38108747044917257, + "grad_norm": 2.7597038745880127, + "learning_rate": 4.961305948959759e-06, + "loss": 
0.6126, + "step": 806 + }, + { + "epoch": 0.38156028368794326, + "grad_norm": 2.789811849594116, + "learning_rate": 4.9611965411001575e-06, + "loss": 0.6601, + "step": 807 + }, + { + "epoch": 0.38203309692671394, + "grad_norm": 3.0403921604156494, + "learning_rate": 4.961086979992639e-06, + "loss": 0.6947, + "step": 808 + }, + { + "epoch": 0.38250591016548463, + "grad_norm": 3.2139980792999268, + "learning_rate": 4.960977265644026e-06, + "loss": 0.6876, + "step": 809 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.918515205383301, + "learning_rate": 4.960867398061149e-06, + "loss": 0.5997, + "step": 810 + }, + { + "epoch": 0.383451536643026, + "grad_norm": 3.197636604309082, + "learning_rate": 4.9607573772508495e-06, + "loss": 0.5754, + "step": 811 + }, + { + "epoch": 0.3839243498817967, + "grad_norm": 2.8848466873168945, + "learning_rate": 4.960647203219979e-06, + "loss": 0.6424, + "step": 812 + }, + { + "epoch": 0.3843971631205674, + "grad_norm": 3.4810187816619873, + "learning_rate": 4.960536875975397e-06, + "loss": 0.6851, + "step": 813 + }, + { + "epoch": 0.38486997635933806, + "grad_norm": 3.713934898376465, + "learning_rate": 4.960426395523972e-06, + "loss": 0.6122, + "step": 814 + }, + { + "epoch": 0.38534278959810875, + "grad_norm": 2.862600803375244, + "learning_rate": 4.960315761872585e-06, + "loss": 0.6493, + "step": 815 + }, + { + "epoch": 0.38581560283687943, + "grad_norm": 3.133882522583008, + "learning_rate": 4.960204975028123e-06, + "loss": 0.7535, + "step": 816 + }, + { + "epoch": 0.3862884160756501, + "grad_norm": 3.1526732444763184, + "learning_rate": 4.960094034997485e-06, + "loss": 0.6512, + "step": 817 + }, + { + "epoch": 0.3867612293144208, + "grad_norm": 2.7213544845581055, + "learning_rate": 4.959982941787579e-06, + "loss": 0.6121, + "step": 818 + }, + { + "epoch": 0.3872340425531915, + "grad_norm": 3.4935851097106934, + "learning_rate": 4.9598716954053214e-06, + "loss": 0.7852, + "step": 819 + }, + { + "epoch": 0.3877068557919622, + "grad_norm": 2.691016435623169, + "learning_rate": 4.9597602958576395e-06, + "loss": 0.6861, + "step": 820 + }, + { + "epoch": 0.38817966903073287, + "grad_norm": 2.8621015548706055, + "learning_rate": 4.959648743151469e-06, + "loss": 0.6262, + "step": 821 + }, + { + "epoch": 0.38865248226950355, + "grad_norm": 3.3887462615966797, + "learning_rate": 4.959537037293758e-06, + "loss": 0.7103, + "step": 822 + }, + { + "epoch": 0.38912529550827424, + "grad_norm": 2.7565438747406006, + "learning_rate": 4.95942517829146e-06, + "loss": 0.6471, + "step": 823 + }, + { + "epoch": 0.3895981087470449, + "grad_norm": 2.7920358180999756, + "learning_rate": 4.959313166151541e-06, + "loss": 0.6239, + "step": 824 + }, + { + "epoch": 0.3900709219858156, + "grad_norm": 3.18904185295105, + "learning_rate": 4.959201000880973e-06, + "loss": 0.7461, + "step": 825 + }, + { + "epoch": 0.3905437352245863, + "grad_norm": 2.727872371673584, + "learning_rate": 4.959088682486743e-06, + "loss": 0.6333, + "step": 826 + }, + { + "epoch": 0.391016548463357, + "grad_norm": 2.906378746032715, + "learning_rate": 4.958976210975844e-06, + "loss": 0.7547, + "step": 827 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 2.96482515335083, + "learning_rate": 4.958863586355278e-06, + "loss": 0.6312, + "step": 828 + }, + { + "epoch": 0.39196217494089836, + "grad_norm": 3.2890889644622803, + "learning_rate": 4.958750808632059e-06, + "loss": 0.6943, + "step": 829 + }, + { + "epoch": 0.39243498817966904, + "grad_norm": 2.7004311084747314, + "learning_rate": 
4.958637877813207e-06, + "loss": 0.5918, + "step": 830 + }, + { + "epoch": 0.39290780141843973, + "grad_norm": 2.7487950325012207, + "learning_rate": 4.9585247939057566e-06, + "loss": 0.6201, + "step": 831 + }, + { + "epoch": 0.3933806146572104, + "grad_norm": 2.7873897552490234, + "learning_rate": 4.958411556916747e-06, + "loss": 0.6268, + "step": 832 + }, + { + "epoch": 0.3938534278959811, + "grad_norm": 2.8501343727111816, + "learning_rate": 4.958298166853229e-06, + "loss": 0.7119, + "step": 833 + }, + { + "epoch": 0.3943262411347518, + "grad_norm": 3.0391547679901123, + "learning_rate": 4.958184623722265e-06, + "loss": 0.6375, + "step": 834 + }, + { + "epoch": 0.3947990543735225, + "grad_norm": 2.850520133972168, + "learning_rate": 4.958070927530922e-06, + "loss": 0.5962, + "step": 835 + }, + { + "epoch": 0.39527186761229316, + "grad_norm": 3.351914644241333, + "learning_rate": 4.957957078286281e-06, + "loss": 0.7247, + "step": 836 + }, + { + "epoch": 0.39574468085106385, + "grad_norm": 2.9559543132781982, + "learning_rate": 4.957843075995431e-06, + "loss": 0.6571, + "step": 837 + }, + { + "epoch": 0.39621749408983453, + "grad_norm": 3.225785255432129, + "learning_rate": 4.95772892066547e-06, + "loss": 0.7074, + "step": 838 + }, + { + "epoch": 0.3966903073286052, + "grad_norm": 2.7842373847961426, + "learning_rate": 4.957614612303505e-06, + "loss": 0.6469, + "step": 839 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 4.249724864959717, + "learning_rate": 4.957500150916655e-06, + "loss": 0.741, + "step": 840 + }, + { + "epoch": 0.3976359338061466, + "grad_norm": 3.138221263885498, + "learning_rate": 4.957385536512046e-06, + "loss": 0.6676, + "step": 841 + }, + { + "epoch": 0.3981087470449173, + "grad_norm": 3.456423759460449, + "learning_rate": 4.957270769096816e-06, + "loss": 0.6877, + "step": 842 + }, + { + "epoch": 0.39858156028368796, + "grad_norm": 2.8676278591156006, + "learning_rate": 4.957155848678109e-06, + "loss": 0.5986, + "step": 843 + }, + { + "epoch": 0.39905437352245865, + "grad_norm": 2.705324411392212, + "learning_rate": 4.957040775263082e-06, + "loss": 0.6356, + "step": 844 + }, + { + "epoch": 0.39952718676122934, + "grad_norm": 3.0767486095428467, + "learning_rate": 4.9569255488589e-06, + "loss": 0.6844, + "step": 845 + }, + { + "epoch": 0.4, + "grad_norm": 2.7787704467773438, + "learning_rate": 4.956810169472736e-06, + "loss": 0.6641, + "step": 846 + }, + { + "epoch": 0.4004728132387707, + "grad_norm": 2.584277868270874, + "learning_rate": 4.956694637111777e-06, + "loss": 0.6256, + "step": 847 + }, + { + "epoch": 0.4009456264775414, + "grad_norm": 2.751641273498535, + "learning_rate": 4.956578951783215e-06, + "loss": 0.5954, + "step": 848 + }, + { + "epoch": 0.4014184397163121, + "grad_norm": 3.0181658267974854, + "learning_rate": 4.956463113494253e-06, + "loss": 0.6569, + "step": 849 + }, + { + "epoch": 0.40189125295508277, + "grad_norm": 3.0933220386505127, + "learning_rate": 4.956347122252104e-06, + "loss": 0.6248, + "step": 850 + }, + { + "epoch": 0.40236406619385345, + "grad_norm": 3.3767428398132324, + "learning_rate": 4.956230978063991e-06, + "loss": 0.719, + "step": 851 + }, + { + "epoch": 0.40283687943262414, + "grad_norm": 3.7666573524475098, + "learning_rate": 4.956114680937145e-06, + "loss": 0.6467, + "step": 852 + }, + { + "epoch": 0.4033096926713948, + "grad_norm": 2.9836843013763428, + "learning_rate": 4.955998230878808e-06, + "loss": 0.6993, + "step": 853 + }, + { + "epoch": 0.4037825059101655, + "grad_norm": 2.981497049331665, + 
"learning_rate": 4.955881627896229e-06, + "loss": 0.6578, + "step": 854 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 3.1369056701660156, + "learning_rate": 4.955764871996672e-06, + "loss": 0.6763, + "step": 855 + }, + { + "epoch": 0.40472813238770683, + "grad_norm": 2.7675817012786865, + "learning_rate": 4.9556479631874036e-06, + "loss": 0.6488, + "step": 856 + }, + { + "epoch": 0.4052009456264775, + "grad_norm": 3.035334825515747, + "learning_rate": 4.9555309014757034e-06, + "loss": 0.7076, + "step": 857 + }, + { + "epoch": 0.4056737588652482, + "grad_norm": 3.493704319000244, + "learning_rate": 4.955413686868862e-06, + "loss": 0.6773, + "step": 858 + }, + { + "epoch": 0.4061465721040189, + "grad_norm": 3.245487928390503, + "learning_rate": 4.9552963193741765e-06, + "loss": 0.6915, + "step": 859 + }, + { + "epoch": 0.4066193853427896, + "grad_norm": 3.189969539642334, + "learning_rate": 4.955178798998956e-06, + "loss": 0.7318, + "step": 860 + }, + { + "epoch": 0.40709219858156026, + "grad_norm": 2.7987146377563477, + "learning_rate": 4.955061125750517e-06, + "loss": 0.6162, + "step": 861 + }, + { + "epoch": 0.40756501182033095, + "grad_norm": 3.020118474960327, + "learning_rate": 4.954943299636187e-06, + "loss": 0.6678, + "step": 862 + }, + { + "epoch": 0.40803782505910163, + "grad_norm": 2.715463876724243, + "learning_rate": 4.954825320663302e-06, + "loss": 0.668, + "step": 863 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 2.595050096511841, + "learning_rate": 4.9547071888392085e-06, + "loss": 0.6557, + "step": 864 + }, + { + "epoch": 0.408983451536643, + "grad_norm": 3.131596088409424, + "learning_rate": 4.954588904171261e-06, + "loss": 0.6548, + "step": 865 + }, + { + "epoch": 0.4094562647754137, + "grad_norm": 2.5742313861846924, + "learning_rate": 4.954470466666827e-06, + "loss": 0.6592, + "step": 866 + }, + { + "epoch": 0.4099290780141844, + "grad_norm": 2.8612802028656006, + "learning_rate": 4.9543518763332785e-06, + "loss": 0.5391, + "step": 867 + }, + { + "epoch": 0.41040189125295506, + "grad_norm": 2.8973186016082764, + "learning_rate": 4.954233133178001e-06, + "loss": 0.6649, + "step": 868 + }, + { + "epoch": 0.41087470449172575, + "grad_norm": 2.802525043487549, + "learning_rate": 4.954114237208388e-06, + "loss": 0.6212, + "step": 869 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 2.5919506549835205, + "learning_rate": 4.953995188431843e-06, + "loss": 0.6596, + "step": 870 + }, + { + "epoch": 0.4118203309692671, + "grad_norm": 3.139169454574585, + "learning_rate": 4.953875986855777e-06, + "loss": 0.6799, + "step": 871 + }, + { + "epoch": 0.4122931442080378, + "grad_norm": 3.99727725982666, + "learning_rate": 4.953756632487614e-06, + "loss": 0.6519, + "step": 872 + }, + { + "epoch": 0.4127659574468085, + "grad_norm": 3.238706350326538, + "learning_rate": 4.953637125334784e-06, + "loss": 0.7361, + "step": 873 + }, + { + "epoch": 0.4132387706855792, + "grad_norm": 2.780019998550415, + "learning_rate": 4.9535174654047295e-06, + "loss": 0.6406, + "step": 874 + }, + { + "epoch": 0.41371158392434987, + "grad_norm": 2.7629551887512207, + "learning_rate": 4.953397652704901e-06, + "loss": 0.6131, + "step": 875 + }, + { + "epoch": 0.41418439716312055, + "grad_norm": 2.8008246421813965, + "learning_rate": 4.9532776872427585e-06, + "loss": 0.6464, + "step": 876 + }, + { + "epoch": 0.41465721040189124, + "grad_norm": 3.0970115661621094, + "learning_rate": 4.953157569025772e-06, + "loss": 0.7066, + "step": 877 + }, + { + "epoch": 0.4151300236406619, + 
"grad_norm": 2.8375589847564697, + "learning_rate": 4.9530372980614195e-06, + "loss": 0.6551, + "step": 878 + }, + { + "epoch": 0.4156028368794326, + "grad_norm": 2.718843936920166, + "learning_rate": 4.952916874357191e-06, + "loss": 0.5947, + "step": 879 + }, + { + "epoch": 0.4160756501182033, + "grad_norm": 2.7104697227478027, + "learning_rate": 4.952796297920585e-06, + "loss": 0.6708, + "step": 880 + }, + { + "epoch": 0.416548463356974, + "grad_norm": 2.8223445415496826, + "learning_rate": 4.952675568759108e-06, + "loss": 0.6214, + "step": 881 + }, + { + "epoch": 0.41702127659574467, + "grad_norm": 2.6598153114318848, + "learning_rate": 4.952554686880279e-06, + "loss": 0.6116, + "step": 882 + }, + { + "epoch": 0.41749408983451536, + "grad_norm": 2.8639824390411377, + "learning_rate": 4.952433652291623e-06, + "loss": 0.5971, + "step": 883 + }, + { + "epoch": 0.41796690307328604, + "grad_norm": 2.9578304290771484, + "learning_rate": 4.952312465000677e-06, + "loss": 0.6785, + "step": 884 + }, + { + "epoch": 0.41843971631205673, + "grad_norm": 2.872144937515259, + "learning_rate": 4.952191125014987e-06, + "loss": 0.6772, + "step": 885 + }, + { + "epoch": 0.4189125295508274, + "grad_norm": 2.7513675689697266, + "learning_rate": 4.952069632342108e-06, + "loss": 0.702, + "step": 886 + }, + { + "epoch": 0.4193853427895981, + "grad_norm": 2.9275078773498535, + "learning_rate": 4.951947986989606e-06, + "loss": 0.589, + "step": 887 + }, + { + "epoch": 0.4198581560283688, + "grad_norm": 2.740549325942993, + "learning_rate": 4.951826188965053e-06, + "loss": 0.5942, + "step": 888 + }, + { + "epoch": 0.4203309692671395, + "grad_norm": 2.92452073097229, + "learning_rate": 4.951704238276035e-06, + "loss": 0.6819, + "step": 889 + }, + { + "epoch": 0.42080378250591016, + "grad_norm": 2.842491865158081, + "learning_rate": 4.951582134930144e-06, + "loss": 0.6304, + "step": 890 + }, + { + "epoch": 0.42127659574468085, + "grad_norm": 2.613478422164917, + "learning_rate": 4.951459878934983e-06, + "loss": 0.6912, + "step": 891 + }, + { + "epoch": 0.42174940898345153, + "grad_norm": 3.2408607006073, + "learning_rate": 4.951337470298165e-06, + "loss": 0.6755, + "step": 892 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 3.1022439002990723, + "learning_rate": 4.9512149090273125e-06, + "loss": 0.6138, + "step": 893 + }, + { + "epoch": 0.4226950354609929, + "grad_norm": 2.6418895721435547, + "learning_rate": 4.951092195130055e-06, + "loss": 0.639, + "step": 894 + }, + { + "epoch": 0.4231678486997636, + "grad_norm": 3.010744333267212, + "learning_rate": 4.950969328614035e-06, + "loss": 0.7102, + "step": 895 + }, + { + "epoch": 0.4236406619385343, + "grad_norm": 2.673292636871338, + "learning_rate": 4.950846309486901e-06, + "loss": 0.5676, + "step": 896 + }, + { + "epoch": 0.42411347517730497, + "grad_norm": 3.6974737644195557, + "learning_rate": 4.950723137756314e-06, + "loss": 0.5722, + "step": 897 + }, + { + "epoch": 0.42458628841607565, + "grad_norm": 3.69028902053833, + "learning_rate": 4.9505998134299435e-06, + "loss": 0.6337, + "step": 898 + }, + { + "epoch": 0.42505910165484634, + "grad_norm": 3.2136125564575195, + "learning_rate": 4.950476336515469e-06, + "loss": 0.6469, + "step": 899 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.7396016120910645, + "learning_rate": 4.950352707020577e-06, + "loss": 0.6656, + "step": 900 + }, + { + "epoch": 0.4260047281323877, + "grad_norm": 2.825416088104248, + "learning_rate": 4.950228924952967e-06, + "loss": 0.6298, + "step": 901 + }, + { + "epoch": 
0.4264775413711584, + "grad_norm": 3.401658535003662, + "learning_rate": 4.950104990320345e-06, + "loss": 0.778, + "step": 902 + }, + { + "epoch": 0.4269503546099291, + "grad_norm": 2.7002272605895996, + "learning_rate": 4.9499809031304294e-06, + "loss": 0.6536, + "step": 903 + }, + { + "epoch": 0.42742316784869977, + "grad_norm": 2.62386417388916, + "learning_rate": 4.949856663390945e-06, + "loss": 0.6629, + "step": 904 + }, + { + "epoch": 0.42789598108747046, + "grad_norm": 2.584247589111328, + "learning_rate": 4.94973227110963e-06, + "loss": 0.5813, + "step": 905 + }, + { + "epoch": 0.42836879432624114, + "grad_norm": 3.4365768432617188, + "learning_rate": 4.9496077262942265e-06, + "loss": 0.7648, + "step": 906 + }, + { + "epoch": 0.42884160756501183, + "grad_norm": 2.8993639945983887, + "learning_rate": 4.949483028952492e-06, + "loss": 0.6696, + "step": 907 + }, + { + "epoch": 0.4293144208037825, + "grad_norm": 2.922809362411499, + "learning_rate": 4.94935817909219e-06, + "loss": 0.6892, + "step": 908 + }, + { + "epoch": 0.4297872340425532, + "grad_norm": 2.85478138923645, + "learning_rate": 4.9492331767210944e-06, + "loss": 0.536, + "step": 909 + }, + { + "epoch": 0.4302600472813239, + "grad_norm": 2.8639259338378906, + "learning_rate": 4.949108021846988e-06, + "loss": 0.634, + "step": 910 + }, + { + "epoch": 0.4307328605200946, + "grad_norm": 3.0533697605133057, + "learning_rate": 4.948982714477664e-06, + "loss": 0.6318, + "step": 911 + }, + { + "epoch": 0.43120567375886526, + "grad_norm": 2.331674814224243, + "learning_rate": 4.9488572546209255e-06, + "loss": 0.6562, + "step": 912 + }, + { + "epoch": 0.43167848699763595, + "grad_norm": 3.0154623985290527, + "learning_rate": 4.9487316422845835e-06, + "loss": 0.6675, + "step": 913 + }, + { + "epoch": 0.43215130023640663, + "grad_norm": 2.7354514598846436, + "learning_rate": 4.948605877476459e-06, + "loss": 0.6012, + "step": 914 + }, + { + "epoch": 0.4326241134751773, + "grad_norm": 2.863736629486084, + "learning_rate": 4.948479960204383e-06, + "loss": 0.6062, + "step": 915 + }, + { + "epoch": 0.433096926713948, + "grad_norm": 3.01998233795166, + "learning_rate": 4.948353890476197e-06, + "loss": 0.6749, + "step": 916 + }, + { + "epoch": 0.4335697399527187, + "grad_norm": 2.7550456523895264, + "learning_rate": 4.94822766829975e-06, + "loss": 0.6507, + "step": 917 + }, + { + "epoch": 0.4340425531914894, + "grad_norm": 3.370572805404663, + "learning_rate": 4.948101293682901e-06, + "loss": 0.714, + "step": 918 + }, + { + "epoch": 0.43451536643026006, + "grad_norm": 2.9736790657043457, + "learning_rate": 4.947974766633519e-06, + "loss": 0.729, + "step": 919 + }, + { + "epoch": 0.43498817966903075, + "grad_norm": 3.1036548614501953, + "learning_rate": 4.947848087159483e-06, + "loss": 0.7547, + "step": 920 + }, + { + "epoch": 0.43546099290780144, + "grad_norm": 2.895094871520996, + "learning_rate": 4.947721255268679e-06, + "loss": 0.6089, + "step": 921 + }, + { + "epoch": 0.4359338061465721, + "grad_norm": 2.798476219177246, + "learning_rate": 4.947594270969005e-06, + "loss": 0.5432, + "step": 922 + }, + { + "epoch": 0.4364066193853428, + "grad_norm": 2.7675702571868896, + "learning_rate": 4.94746713426837e-06, + "loss": 0.5693, + "step": 923 + }, + { + "epoch": 0.4368794326241135, + "grad_norm": 2.6851553916931152, + "learning_rate": 4.947339845174687e-06, + "loss": 0.6503, + "step": 924 + }, + { + "epoch": 0.4373522458628842, + "grad_norm": 2.909635543823242, + "learning_rate": 4.947212403695883e-06, + "loss": 0.6494, + "step": 925 + }, + { 
+ "epoch": 0.43782505910165487, + "grad_norm": 2.604526996612549, + "learning_rate": 4.947084809839894e-06, + "loss": 0.6349, + "step": 926 + }, + { + "epoch": 0.43829787234042555, + "grad_norm": 3.118149518966675, + "learning_rate": 4.946957063614664e-06, + "loss": 0.6219, + "step": 927 + }, + { + "epoch": 0.43877068557919624, + "grad_norm": 2.7452616691589355, + "learning_rate": 4.9468291650281465e-06, + "loss": 0.6096, + "step": 928 + }, + { + "epoch": 0.4392434988179669, + "grad_norm": 3.30098819732666, + "learning_rate": 4.946701114088307e-06, + "loss": 0.6277, + "step": 929 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 2.789482593536377, + "learning_rate": 4.946572910803116e-06, + "loss": 0.7, + "step": 930 + }, + { + "epoch": 0.4401891252955083, + "grad_norm": 2.7283935546875, + "learning_rate": 4.946444555180559e-06, + "loss": 0.5375, + "step": 931 + }, + { + "epoch": 0.440661938534279, + "grad_norm": 3.101304054260254, + "learning_rate": 4.946316047228627e-06, + "loss": 0.6131, + "step": 932 + }, + { + "epoch": 0.44113475177304967, + "grad_norm": 3.573908805847168, + "learning_rate": 4.946187386955321e-06, + "loss": 0.7073, + "step": 933 + }, + { + "epoch": 0.44160756501182036, + "grad_norm": 3.214979648590088, + "learning_rate": 4.946058574368653e-06, + "loss": 0.6508, + "step": 934 + }, + { + "epoch": 0.44208037825059104, + "grad_norm": 3.145082712173462, + "learning_rate": 4.945929609476643e-06, + "loss": 0.64, + "step": 935 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 2.991780996322632, + "learning_rate": 4.945800492287321e-06, + "loss": 0.6315, + "step": 936 + }, + { + "epoch": 0.44302600472813236, + "grad_norm": 3.2441139221191406, + "learning_rate": 4.945671222808727e-06, + "loss": 0.7144, + "step": 937 + }, + { + "epoch": 0.44349881796690305, + "grad_norm": 2.9397029876708984, + "learning_rate": 4.94554180104891e-06, + "loss": 0.6818, + "step": 938 + }, + { + "epoch": 0.44397163120567373, + "grad_norm": 3.2471461296081543, + "learning_rate": 4.945412227015929e-06, + "loss": 0.6921, + "step": 939 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 3.0882487297058105, + "learning_rate": 4.945282500717851e-06, + "loss": 0.718, + "step": 940 + }, + { + "epoch": 0.4449172576832151, + "grad_norm": 2.6035783290863037, + "learning_rate": 4.945152622162753e-06, + "loss": 0.621, + "step": 941 + }, + { + "epoch": 0.4453900709219858, + "grad_norm": 2.83659029006958, + "learning_rate": 4.945022591358724e-06, + "loss": 0.6403, + "step": 942 + }, + { + "epoch": 0.4458628841607565, + "grad_norm": 2.824463129043579, + "learning_rate": 4.944892408313859e-06, + "loss": 0.6594, + "step": 943 + }, + { + "epoch": 0.44633569739952716, + "grad_norm": 2.753735065460205, + "learning_rate": 4.9447620730362645e-06, + "loss": 0.6116, + "step": 944 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 3.0659725666046143, + "learning_rate": 4.944631585534056e-06, + "loss": 0.5983, + "step": 945 + }, + { + "epoch": 0.44728132387706854, + "grad_norm": 2.969113349914551, + "learning_rate": 4.944500945815357e-06, + "loss": 0.6859, + "step": 946 + }, + { + "epoch": 0.4477541371158392, + "grad_norm": 2.810303211212158, + "learning_rate": 4.944370153888303e-06, + "loss": 0.7025, + "step": 947 + }, + { + "epoch": 0.4482269503546099, + "grad_norm": 3.027721643447876, + "learning_rate": 4.944239209761038e-06, + "loss": 0.7268, + "step": 948 + }, + { + "epoch": 0.4486997635933806, + "grad_norm": 2.661503314971924, + "learning_rate": 4.944108113441716e-06, + "loss": 0.6702, + "step": 949 + 
}, + { + "epoch": 0.4491725768321513, + "grad_norm": 2.738591432571411, + "learning_rate": 4.943976864938498e-06, + "loss": 0.6728, + "step": 950 + }, + { + "epoch": 0.44964539007092197, + "grad_norm": 3.447505474090576, + "learning_rate": 4.943845464259557e-06, + "loss": 0.6586, + "step": 951 + }, + { + "epoch": 0.45011820330969265, + "grad_norm": 3.0968854427337646, + "learning_rate": 4.943713911413075e-06, + "loss": 0.7666, + "step": 952 + }, + { + "epoch": 0.45059101654846334, + "grad_norm": 2.4113779067993164, + "learning_rate": 4.943582206407244e-06, + "loss": 0.6173, + "step": 953 + }, + { + "epoch": 0.451063829787234, + "grad_norm": 2.6357979774475098, + "learning_rate": 4.943450349250263e-06, + "loss": 0.5589, + "step": 954 + }, + { + "epoch": 0.4515366430260047, + "grad_norm": 2.9182233810424805, + "learning_rate": 4.9433183399503425e-06, + "loss": 0.6252, + "step": 955 + }, + { + "epoch": 0.4520094562647754, + "grad_norm": 2.832740306854248, + "learning_rate": 4.943186178515703e-06, + "loss": 0.6882, + "step": 956 + }, + { + "epoch": 0.4524822695035461, + "grad_norm": 2.9508981704711914, + "learning_rate": 4.943053864954574e-06, + "loss": 0.5722, + "step": 957 + }, + { + "epoch": 0.4529550827423168, + "grad_norm": 3.044729471206665, + "learning_rate": 4.9429213992751925e-06, + "loss": 0.6772, + "step": 958 + }, + { + "epoch": 0.45342789598108746, + "grad_norm": 2.606003522872925, + "learning_rate": 4.9427887814858075e-06, + "loss": 0.6445, + "step": 959 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 2.4634225368499756, + "learning_rate": 4.942656011594676e-06, + "loss": 0.6151, + "step": 960 + }, + { + "epoch": 0.45437352245862883, + "grad_norm": 2.8872334957122803, + "learning_rate": 4.942523089610066e-06, + "loss": 0.6255, + "step": 961 + }, + { + "epoch": 0.4548463356973995, + "grad_norm": 2.870605707168579, + "learning_rate": 4.942390015540253e-06, + "loss": 0.7481, + "step": 962 + }, + { + "epoch": 0.4553191489361702, + "grad_norm": 2.952680826187134, + "learning_rate": 4.942256789393524e-06, + "loss": 0.5556, + "step": 963 + }, + { + "epoch": 0.4557919621749409, + "grad_norm": 2.623680353164673, + "learning_rate": 4.9421234111781725e-06, + "loss": 0.6115, + "step": 964 + }, + { + "epoch": 0.4562647754137116, + "grad_norm": 2.6933600902557373, + "learning_rate": 4.941989880902505e-06, + "loss": 0.6102, + "step": 965 + }, + { + "epoch": 0.45673758865248226, + "grad_norm": 2.6047189235687256, + "learning_rate": 4.941856198574836e-06, + "loss": 0.612, + "step": 966 + }, + { + "epoch": 0.45721040189125295, + "grad_norm": 2.779186725616455, + "learning_rate": 4.9417223642034885e-06, + "loss": 0.5424, + "step": 967 + }, + { + "epoch": 0.45768321513002364, + "grad_norm": 2.6177165508270264, + "learning_rate": 4.941588377796795e-06, + "loss": 0.4661, + "step": 968 + }, + { + "epoch": 0.4581560283687943, + "grad_norm": 2.959676742553711, + "learning_rate": 4.941454239363101e-06, + "loss": 0.6966, + "step": 969 + }, + { + "epoch": 0.458628841607565, + "grad_norm": 2.9788379669189453, + "learning_rate": 4.941319948910756e-06, + "loss": 0.6181, + "step": 970 + }, + { + "epoch": 0.4591016548463357, + "grad_norm": 4.642750263214111, + "learning_rate": 4.941185506448122e-06, + "loss": 0.5602, + "step": 971 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 2.793002128601074, + "learning_rate": 4.941050911983572e-06, + "loss": 0.602, + "step": 972 + }, + { + "epoch": 0.46004728132387707, + "grad_norm": 2.6833035945892334, + "learning_rate": 4.9409161655254845e-06, + "loss": 
0.5549, + "step": 973 + }, + { + "epoch": 0.46052009456264775, + "grad_norm": 3.905032157897949, + "learning_rate": 4.94078126708225e-06, + "loss": 0.6335, + "step": 974 + }, + { + "epoch": 0.46099290780141844, + "grad_norm": 2.922609329223633, + "learning_rate": 4.94064621666227e-06, + "loss": 0.5839, + "step": 975 + }, + { + "epoch": 0.4614657210401891, + "grad_norm": 2.8277416229248047, + "learning_rate": 4.940511014273952e-06, + "loss": 0.629, + "step": 976 + }, + { + "epoch": 0.4619385342789598, + "grad_norm": 3.07511043548584, + "learning_rate": 4.940375659925714e-06, + "loss": 0.7058, + "step": 977 + }, + { + "epoch": 0.4624113475177305, + "grad_norm": 3.65043044090271, + "learning_rate": 4.940240153625984e-06, + "loss": 0.7174, + "step": 978 + }, + { + "epoch": 0.4628841607565012, + "grad_norm": 2.755167245864868, + "learning_rate": 4.9401044953832e-06, + "loss": 0.6548, + "step": 979 + }, + { + "epoch": 0.46335697399527187, + "grad_norm": 2.9881057739257812, + "learning_rate": 4.939968685205808e-06, + "loss": 0.6245, + "step": 980 + }, + { + "epoch": 0.46382978723404256, + "grad_norm": 2.9484212398529053, + "learning_rate": 4.939832723102266e-06, + "loss": 0.655, + "step": 981 + }, + { + "epoch": 0.46430260047281324, + "grad_norm": 2.898918628692627, + "learning_rate": 4.939696609081038e-06, + "loss": 0.6178, + "step": 982 + }, + { + "epoch": 0.46477541371158393, + "grad_norm": 2.7052435874938965, + "learning_rate": 4.9395603431506e-06, + "loss": 0.6393, + "step": 983 + }, + { + "epoch": 0.4652482269503546, + "grad_norm": 2.5610013008117676, + "learning_rate": 4.939423925319436e-06, + "loss": 0.4847, + "step": 984 + }, + { + "epoch": 0.4657210401891253, + "grad_norm": 3.229083299636841, + "learning_rate": 4.939287355596042e-06, + "loss": 0.6473, + "step": 985 + }, + { + "epoch": 0.466193853427896, + "grad_norm": 2.907097816467285, + "learning_rate": 4.9391506339889195e-06, + "loss": 0.652, + "step": 986 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 2.6929478645324707, + "learning_rate": 4.939013760506582e-06, + "loss": 0.6175, + "step": 987 + }, + { + "epoch": 0.46713947990543736, + "grad_norm": 3.414813280105591, + "learning_rate": 4.938876735157554e-06, + "loss": 0.7597, + "step": 988 + }, + { + "epoch": 0.46761229314420805, + "grad_norm": 3.297360420227051, + "learning_rate": 4.938739557950365e-06, + "loss": 0.6824, + "step": 989 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 3.083155393600464, + "learning_rate": 4.938602228893557e-06, + "loss": 0.6505, + "step": 990 + }, + { + "epoch": 0.4685579196217494, + "grad_norm": 2.9781153202056885, + "learning_rate": 4.938464747995681e-06, + "loss": 0.666, + "step": 991 + }, + { + "epoch": 0.4690307328605201, + "grad_norm": 3.1494534015655518, + "learning_rate": 4.9383271152652975e-06, + "loss": 0.6422, + "step": 992 + }, + { + "epoch": 0.4695035460992908, + "grad_norm": 2.547868490219116, + "learning_rate": 4.938189330710976e-06, + "loss": 0.5766, + "step": 993 + }, + { + "epoch": 0.4699763593380615, + "grad_norm": 2.684736967086792, + "learning_rate": 4.938051394341297e-06, + "loss": 0.6407, + "step": 994 + }, + { + "epoch": 0.47044917257683216, + "grad_norm": 2.9619693756103516, + "learning_rate": 4.937913306164847e-06, + "loss": 0.6936, + "step": 995 + }, + { + "epoch": 0.47092198581560285, + "grad_norm": 2.9698498249053955, + "learning_rate": 4.937775066190227e-06, + "loss": 0.6464, + "step": 996 + }, + { + "epoch": 0.47139479905437354, + "grad_norm": 3.121049642562866, + "learning_rate": 4.937636674426042e-06, + 
"loss": 0.6383, + "step": 997 + }, + { + "epoch": 0.4718676122931442, + "grad_norm": 3.113672971725464, + "learning_rate": 4.93749813088091e-06, + "loss": 0.6892, + "step": 998 + }, + { + "epoch": 0.4723404255319149, + "grad_norm": 3.126113176345825, + "learning_rate": 4.937359435563458e-06, + "loss": 0.6728, + "step": 999 + }, + { + "epoch": 0.4728132387706856, + "grad_norm": 3.353966236114502, + "learning_rate": 4.937220588482321e-06, + "loss": 0.6041, + "step": 1000 + }, + { + "epoch": 0.4732860520094563, + "grad_norm": 2.8860628604888916, + "learning_rate": 4.937081589646144e-06, + "loss": 0.6798, + "step": 1001 + }, + { + "epoch": 0.47375886524822697, + "grad_norm": 3.0510590076446533, + "learning_rate": 4.936942439063584e-06, + "loss": 0.5841, + "step": 1002 + }, + { + "epoch": 0.47423167848699765, + "grad_norm": 2.6998369693756104, + "learning_rate": 4.936803136743303e-06, + "loss": 0.6403, + "step": 1003 + }, + { + "epoch": 0.47470449172576834, + "grad_norm": 2.875347137451172, + "learning_rate": 4.9366636826939765e-06, + "loss": 0.5811, + "step": 1004 + }, + { + "epoch": 0.475177304964539, + "grad_norm": 2.9122262001037598, + "learning_rate": 4.936524076924287e-06, + "loss": 0.6852, + "step": 1005 + }, + { + "epoch": 0.4756501182033097, + "grad_norm": 2.5167057514190674, + "learning_rate": 4.9363843194429265e-06, + "loss": 0.5367, + "step": 1006 + }, + { + "epoch": 0.4761229314420804, + "grad_norm": 2.5745551586151123, + "learning_rate": 4.9362444102585985e-06, + "loss": 0.6241, + "step": 1007 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 2.5024216175079346, + "learning_rate": 4.9361043493800125e-06, + "loss": 0.6133, + "step": 1008 + }, + { + "epoch": 0.47706855791962177, + "grad_norm": 2.7281384468078613, + "learning_rate": 4.935964136815892e-06, + "loss": 0.6834, + "step": 1009 + }, + { + "epoch": 0.47754137115839246, + "grad_norm": 3.0118913650512695, + "learning_rate": 4.935823772574965e-06, + "loss": 0.6922, + "step": 1010 + }, + { + "epoch": 0.47801418439716314, + "grad_norm": 3.016216993331909, + "learning_rate": 4.935683256665973e-06, + "loss": 0.6653, + "step": 1011 + }, + { + "epoch": 0.47848699763593383, + "grad_norm": 2.9526784420013428, + "learning_rate": 4.9355425890976636e-06, + "loss": 0.6423, + "step": 1012 + }, + { + "epoch": 0.4789598108747045, + "grad_norm": 6.222797393798828, + "learning_rate": 4.9354017698787985e-06, + "loss": 0.5884, + "step": 1013 + }, + { + "epoch": 0.4794326241134752, + "grad_norm": 2.6553597450256348, + "learning_rate": 4.935260799018143e-06, + "loss": 0.6624, + "step": 1014 + }, + { + "epoch": 0.4799054373522459, + "grad_norm": 3.0942065715789795, + "learning_rate": 4.935119676524475e-06, + "loss": 0.6623, + "step": 1015 + }, + { + "epoch": 0.4803782505910166, + "grad_norm": 2.626359224319458, + "learning_rate": 4.934978402406585e-06, + "loss": 0.6195, + "step": 1016 + }, + { + "epoch": 0.4808510638297872, + "grad_norm": 2.7954699993133545, + "learning_rate": 4.934836976673265e-06, + "loss": 0.5545, + "step": 1017 + }, + { + "epoch": 0.4813238770685579, + "grad_norm": 2.913557291030884, + "learning_rate": 4.934695399333324e-06, + "loss": 0.6288, + "step": 1018 + }, + { + "epoch": 0.4817966903073286, + "grad_norm": 3.1043739318847656, + "learning_rate": 4.9345536703955746e-06, + "loss": 0.6771, + "step": 1019 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 2.789357900619507, + "learning_rate": 4.934411789868845e-06, + "loss": 0.6227, + "step": 1020 + }, + { + "epoch": 0.48274231678486995, + "grad_norm": 
2.480609655380249, + "learning_rate": 4.934269757761967e-06, + "loss": 0.5779, + "step": 1021 + }, + { + "epoch": 0.48321513002364064, + "grad_norm": 2.7946252822875977, + "learning_rate": 4.934127574083785e-06, + "loss": 0.6166, + "step": 1022 + }, + { + "epoch": 0.4836879432624113, + "grad_norm": 3.0670509338378906, + "learning_rate": 4.933985238843153e-06, + "loss": 0.7766, + "step": 1023 + }, + { + "epoch": 0.484160756501182, + "grad_norm": 2.8567559719085693, + "learning_rate": 4.933842752048932e-06, + "loss": 0.5088, + "step": 1024 + }, + { + "epoch": 0.4846335697399527, + "grad_norm": 2.5674657821655273, + "learning_rate": 4.933700113709996e-06, + "loss": 0.6036, + "step": 1025 + }, + { + "epoch": 0.4851063829787234, + "grad_norm": 2.782339096069336, + "learning_rate": 4.933557323835224e-06, + "loss": 0.5335, + "step": 1026 + }, + { + "epoch": 0.48557919621749407, + "grad_norm": 2.6334071159362793, + "learning_rate": 4.93341438243351e-06, + "loss": 0.6327, + "step": 1027 + }, + { + "epoch": 0.48605200945626476, + "grad_norm": 3.0853965282440186, + "learning_rate": 4.933271289513751e-06, + "loss": 0.7102, + "step": 1028 + }, + { + "epoch": 0.48652482269503544, + "grad_norm": 2.619997501373291, + "learning_rate": 4.933128045084859e-06, + "loss": 0.6138, + "step": 1029 + }, + { + "epoch": 0.48699763593380613, + "grad_norm": 2.8316116333007812, + "learning_rate": 4.932984649155753e-06, + "loss": 0.6346, + "step": 1030 + }, + { + "epoch": 0.4874704491725768, + "grad_norm": 3.153486490249634, + "learning_rate": 4.932841101735361e-06, + "loss": 0.7626, + "step": 1031 + }, + { + "epoch": 0.4879432624113475, + "grad_norm": 3.1831274032592773, + "learning_rate": 4.9326974028326214e-06, + "loss": 0.6607, + "step": 1032 + }, + { + "epoch": 0.4884160756501182, + "grad_norm": 2.791078567504883, + "learning_rate": 4.932553552456481e-06, + "loss": 0.6141, + "step": 1033 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 2.627263307571411, + "learning_rate": 4.932409550615898e-06, + "loss": 0.6777, + "step": 1034 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.8550007343292236, + "learning_rate": 4.932265397319838e-06, + "loss": 0.6379, + "step": 1035 + }, + { + "epoch": 0.48983451536643025, + "grad_norm": 4.505824089050293, + "learning_rate": 4.932121092577276e-06, + "loss": 0.5892, + "step": 1036 + }, + { + "epoch": 0.49030732860520093, + "grad_norm": 3.100191116333008, + "learning_rate": 4.931976636397199e-06, + "loss": 0.6443, + "step": 1037 + }, + { + "epoch": 0.4907801418439716, + "grad_norm": 2.921494245529175, + "learning_rate": 4.9318320287886e-06, + "loss": 0.6821, + "step": 1038 + }, + { + "epoch": 0.4912529550827423, + "grad_norm": 4.577807903289795, + "learning_rate": 4.931687269760485e-06, + "loss": 0.5946, + "step": 1039 + }, + { + "epoch": 0.491725768321513, + "grad_norm": 2.7347636222839355, + "learning_rate": 4.931542359321865e-06, + "loss": 0.5689, + "step": 1040 + }, + { + "epoch": 0.4921985815602837, + "grad_norm": 2.5289158821105957, + "learning_rate": 4.931397297481765e-06, + "loss": 0.5632, + "step": 1041 + }, + { + "epoch": 0.49267139479905436, + "grad_norm": 3.3518471717834473, + "learning_rate": 4.9312520842492165e-06, + "loss": 0.6349, + "step": 1042 + }, + { + "epoch": 0.49314420803782505, + "grad_norm": 3.0469748973846436, + "learning_rate": 4.931106719633261e-06, + "loss": 0.5734, + "step": 1043 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 3.104682445526123, + "learning_rate": 4.930961203642951e-06, + "loss": 0.6101, + "step": 1044 + }, + { + 
"epoch": 0.4940898345153664, + "grad_norm": 2.776705503463745, + "learning_rate": 4.930815536287346e-06, + "loss": 0.6397, + "step": 1045 + }, + { + "epoch": 0.4945626477541371, + "grad_norm": 2.760380983352661, + "learning_rate": 4.930669717575516e-06, + "loss": 0.668, + "step": 1046 + }, + { + "epoch": 0.4950354609929078, + "grad_norm": 2.70084547996521, + "learning_rate": 4.930523747516541e-06, + "loss": 0.5729, + "step": 1047 + }, + { + "epoch": 0.4955082742316785, + "grad_norm": 2.7319583892822266, + "learning_rate": 4.930377626119511e-06, + "loss": 0.6258, + "step": 1048 + }, + { + "epoch": 0.49598108747044917, + "grad_norm": 3.2515223026275635, + "learning_rate": 4.930231353393521e-06, + "loss": 0.7412, + "step": 1049 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 3.0646486282348633, + "learning_rate": 4.930084929347682e-06, + "loss": 0.5809, + "step": 1050 + }, + { + "epoch": 0.49692671394799054, + "grad_norm": 3.1621921062469482, + "learning_rate": 4.9299383539911096e-06, + "loss": 0.6282, + "step": 1051 + }, + { + "epoch": 0.4973995271867612, + "grad_norm": 2.864713191986084, + "learning_rate": 4.929791627332931e-06, + "loss": 0.6263, + "step": 1052 + }, + { + "epoch": 0.4978723404255319, + "grad_norm": 3.181016683578491, + "learning_rate": 4.929644749382283e-06, + "loss": 0.5697, + "step": 1053 + }, + { + "epoch": 0.4983451536643026, + "grad_norm": 2.9064836502075195, + "learning_rate": 4.929497720148309e-06, + "loss": 0.6161, + "step": 1054 + }, + { + "epoch": 0.4988179669030733, + "grad_norm": 3.058112859725952, + "learning_rate": 4.9293505396401655e-06, + "loss": 0.6477, + "step": 1055 + }, + { + "epoch": 0.49929078014184397, + "grad_norm": 2.5227596759796143, + "learning_rate": 4.929203207867016e-06, + "loss": 0.5819, + "step": 1056 + }, + { + "epoch": 0.49976359338061466, + "grad_norm": 3.386862277984619, + "learning_rate": 4.929055724838035e-06, + "loss": 0.7342, + "step": 1057 + }, + { + "epoch": 0.5002364066193853, + "grad_norm": 3.368346929550171, + "learning_rate": 4.928908090562404e-06, + "loss": 0.6622, + "step": 1058 + }, + { + "epoch": 0.500709219858156, + "grad_norm": 2.9108314514160156, + "learning_rate": 4.928760305049317e-06, + "loss": 0.6598, + "step": 1059 + }, + { + "epoch": 0.5011820330969267, + "grad_norm": 2.822305917739868, + "learning_rate": 4.928612368307977e-06, + "loss": 0.5841, + "step": 1060 + }, + { + "epoch": 0.5016548463356973, + "grad_norm": 2.689131259918213, + "learning_rate": 4.928464280347592e-06, + "loss": 0.6631, + "step": 1061 + }, + { + "epoch": 0.502127659574468, + "grad_norm": 3.337214946746826, + "learning_rate": 4.9283160411773864e-06, + "loss": 0.6105, + "step": 1062 + }, + { + "epoch": 0.5026004728132387, + "grad_norm": 3.035911798477173, + "learning_rate": 4.928167650806588e-06, + "loss": 0.6981, + "step": 1063 + }, + { + "epoch": 0.5030732860520094, + "grad_norm": 2.8820855617523193, + "learning_rate": 4.9280191092444375e-06, + "loss": 0.6408, + "step": 1064 + }, + { + "epoch": 0.5035460992907801, + "grad_norm": 3.080432415008545, + "learning_rate": 4.927870416500183e-06, + "loss": 0.6398, + "step": 1065 + }, + { + "epoch": 0.5040189125295508, + "grad_norm": 2.761612892150879, + "learning_rate": 4.927721572583084e-06, + "loss": 0.6126, + "step": 1066 + }, + { + "epoch": 0.5044917257683215, + "grad_norm": 2.8561882972717285, + "learning_rate": 4.927572577502408e-06, + "loss": 0.584, + "step": 1067 + }, + { + "epoch": 0.5049645390070922, + "grad_norm": 3.3386311531066895, + "learning_rate": 4.927423431267432e-06, + 
"loss": 0.6666, + "step": 1068 + }, + { + "epoch": 0.5054373522458628, + "grad_norm": 2.632906675338745, + "learning_rate": 4.927274133887443e-06, + "loss": 0.632, + "step": 1069 + }, + { + "epoch": 0.5059101654846335, + "grad_norm": 2.8737308979034424, + "learning_rate": 4.927124685371737e-06, + "loss": 0.6051, + "step": 1070 + }, + { + "epoch": 0.5063829787234042, + "grad_norm": 3.042222738265991, + "learning_rate": 4.926975085729619e-06, + "loss": 0.6954, + "step": 1071 + }, + { + "epoch": 0.5068557919621749, + "grad_norm": 3.3341481685638428, + "learning_rate": 4.926825334970404e-06, + "loss": 0.7148, + "step": 1072 + }, + { + "epoch": 0.5073286052009456, + "grad_norm": 2.7415387630462646, + "learning_rate": 4.926675433103418e-06, + "loss": 0.5456, + "step": 1073 + }, + { + "epoch": 0.5078014184397163, + "grad_norm": 2.7545325756073, + "learning_rate": 4.926525380137993e-06, + "loss": 0.6213, + "step": 1074 + }, + { + "epoch": 0.508274231678487, + "grad_norm": 2.9153690338134766, + "learning_rate": 4.926375176083472e-06, + "loss": 0.6466, + "step": 1075 + }, + { + "epoch": 0.5087470449172576, + "grad_norm": 4.210638523101807, + "learning_rate": 4.926224820949209e-06, + "loss": 0.6192, + "step": 1076 + }, + { + "epoch": 0.5092198581560283, + "grad_norm": 2.4357898235321045, + "learning_rate": 4.926074314744565e-06, + "loss": 0.594, + "step": 1077 + }, + { + "epoch": 0.509692671394799, + "grad_norm": 2.8004701137542725, + "learning_rate": 4.92592365747891e-06, + "loss": 0.6276, + "step": 1078 + }, + { + "epoch": 0.5101654846335697, + "grad_norm": 2.920675039291382, + "learning_rate": 4.925772849161628e-06, + "loss": 0.6043, + "step": 1079 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.791555404663086, + "learning_rate": 4.9256218898021055e-06, + "loss": 0.6837, + "step": 1080 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 3.1702463626861572, + "learning_rate": 4.925470779409746e-06, + "loss": 0.668, + "step": 1081 + }, + { + "epoch": 0.5115839243498818, + "grad_norm": 2.7149479389190674, + "learning_rate": 4.925319517993955e-06, + "loss": 0.5842, + "step": 1082 + }, + { + "epoch": 0.5120567375886524, + "grad_norm": 2.916311025619507, + "learning_rate": 4.925168105564153e-06, + "loss": 0.6893, + "step": 1083 + }, + { + "epoch": 0.5125295508274231, + "grad_norm": 2.917654514312744, + "learning_rate": 4.925016542129767e-06, + "loss": 0.6513, + "step": 1084 + }, + { + "epoch": 0.5130023640661938, + "grad_norm": 2.5568928718566895, + "learning_rate": 4.924864827700234e-06, + "loss": 0.6177, + "step": 1085 + }, + { + "epoch": 0.5134751773049645, + "grad_norm": 2.816720485687256, + "learning_rate": 4.924712962285001e-06, + "loss": 0.5833, + "step": 1086 + }, + { + "epoch": 0.5139479905437352, + "grad_norm": 2.6989188194274902, + "learning_rate": 4.9245609458935235e-06, + "loss": 0.6332, + "step": 1087 + }, + { + "epoch": 0.5144208037825059, + "grad_norm": 2.959599494934082, + "learning_rate": 4.924408778535268e-06, + "loss": 0.626, + "step": 1088 + }, + { + "epoch": 0.5148936170212766, + "grad_norm": 2.872814416885376, + "learning_rate": 4.924256460219708e-06, + "loss": 0.6407, + "step": 1089 + }, + { + "epoch": 0.5153664302600472, + "grad_norm": 2.6989097595214844, + "learning_rate": 4.924103990956329e-06, + "loss": 0.6391, + "step": 1090 + }, + { + "epoch": 0.5158392434988179, + "grad_norm": 2.986492156982422, + "learning_rate": 4.9239513707546235e-06, + "loss": 0.6911, + "step": 1091 + }, + { + "epoch": 0.5163120567375886, + "grad_norm": 3.069920301437378, + 
"learning_rate": 4.9237985996240954e-06, + "loss": 0.671, + "step": 1092 + }, + { + "epoch": 0.5167848699763593, + "grad_norm": 2.8214917182922363, + "learning_rate": 4.9236456775742555e-06, + "loss": 0.5885, + "step": 1093 + }, + { + "epoch": 0.51725768321513, + "grad_norm": 2.9416961669921875, + "learning_rate": 4.923492604614627e-06, + "loss": 0.6293, + "step": 1094 + }, + { + "epoch": 0.5177304964539007, + "grad_norm": 2.761780023574829, + "learning_rate": 4.923339380754741e-06, + "loss": 0.649, + "step": 1095 + }, + { + "epoch": 0.5182033096926714, + "grad_norm": 2.7648792266845703, + "learning_rate": 4.923186006004138e-06, + "loss": 0.5906, + "step": 1096 + }, + { + "epoch": 0.518676122931442, + "grad_norm": 3.5535428524017334, + "learning_rate": 4.923032480372367e-06, + "loss": 0.7138, + "step": 1097 + }, + { + "epoch": 0.5191489361702127, + "grad_norm": 2.6252479553222656, + "learning_rate": 4.922878803868988e-06, + "loss": 0.5499, + "step": 1098 + }, + { + "epoch": 0.5196217494089834, + "grad_norm": 2.901002883911133, + "learning_rate": 4.9227249765035715e-06, + "loss": 0.6991, + "step": 1099 + }, + { + "epoch": 0.5200945626477541, + "grad_norm": 2.621877431869507, + "learning_rate": 4.9225709982856925e-06, + "loss": 0.6269, + "step": 1100 + }, + { + "epoch": 0.5205673758865248, + "grad_norm": 2.872483015060425, + "learning_rate": 4.92241686922494e-06, + "loss": 0.6657, + "step": 1101 + }, + { + "epoch": 0.5210401891252955, + "grad_norm": 2.730447769165039, + "learning_rate": 4.922262589330912e-06, + "loss": 0.6061, + "step": 1102 + }, + { + "epoch": 0.5215130023640662, + "grad_norm": 2.646247386932373, + "learning_rate": 4.922108158613213e-06, + "loss": 0.5923, + "step": 1103 + }, + { + "epoch": 0.5219858156028369, + "grad_norm": 2.6488895416259766, + "learning_rate": 4.92195357708146e-06, + "loss": 0.6293, + "step": 1104 + }, + { + "epoch": 0.5224586288416075, + "grad_norm": 2.756338357925415, + "learning_rate": 4.921798844745278e-06, + "loss": 0.6374, + "step": 1105 + }, + { + "epoch": 0.5229314420803782, + "grad_norm": 3.1441280841827393, + "learning_rate": 4.921643961614301e-06, + "loss": 0.6652, + "step": 1106 + }, + { + "epoch": 0.5234042553191489, + "grad_norm": 3.050002098083496, + "learning_rate": 4.921488927698172e-06, + "loss": 0.6809, + "step": 1107 + }, + { + "epoch": 0.5238770685579196, + "grad_norm": 2.71750807762146, + "learning_rate": 4.921333743006547e-06, + "loss": 0.6266, + "step": 1108 + }, + { + "epoch": 0.5243498817966903, + "grad_norm": 2.8439245223999023, + "learning_rate": 4.921178407549086e-06, + "loss": 0.5663, + "step": 1109 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 3.0722241401672363, + "learning_rate": 4.921022921335464e-06, + "loss": 0.6791, + "step": 1110 + }, + { + "epoch": 0.5252955082742317, + "grad_norm": 3.4381656646728516, + "learning_rate": 4.920867284375358e-06, + "loss": 0.6687, + "step": 1111 + }, + { + "epoch": 0.5257683215130023, + "grad_norm": 2.819812774658203, + "learning_rate": 4.920711496678463e-06, + "loss": 0.6299, + "step": 1112 + }, + { + "epoch": 0.526241134751773, + "grad_norm": 3.6587414741516113, + "learning_rate": 4.9205555582544765e-06, + "loss": 0.7392, + "step": 1113 + }, + { + "epoch": 0.5267139479905437, + "grad_norm": 2.774296522140503, + "learning_rate": 4.920399469113109e-06, + "loss": 0.6652, + "step": 1114 + }, + { + "epoch": 0.5271867612293144, + "grad_norm": 2.7480580806732178, + "learning_rate": 4.920243229264081e-06, + "loss": 0.596, + "step": 1115 + }, + { + "epoch": 0.5276595744680851, + 
"grad_norm": 3.213057518005371, + "learning_rate": 4.920086838717119e-06, + "loss": 0.6986, + "step": 1116 + }, + { + "epoch": 0.5281323877068558, + "grad_norm": 2.940546989440918, + "learning_rate": 4.919930297481962e-06, + "loss": 0.6481, + "step": 1117 + }, + { + "epoch": 0.5286052009456265, + "grad_norm": 2.5970494747161865, + "learning_rate": 4.9197736055683555e-06, + "loss": 0.5658, + "step": 1118 + }, + { + "epoch": 0.5290780141843971, + "grad_norm": 4.49385404586792, + "learning_rate": 4.919616762986057e-06, + "loss": 0.605, + "step": 1119 + }, + { + "epoch": 0.5295508274231678, + "grad_norm": 2.971857786178589, + "learning_rate": 4.919459769744833e-06, + "loss": 0.6539, + "step": 1120 + }, + { + "epoch": 0.5300236406619385, + "grad_norm": 2.6192965507507324, + "learning_rate": 4.919302625854457e-06, + "loss": 0.6226, + "step": 1121 + }, + { + "epoch": 0.5304964539007092, + "grad_norm": 2.665088176727295, + "learning_rate": 4.919145331324716e-06, + "loss": 0.6647, + "step": 1122 + }, + { + "epoch": 0.5309692671394799, + "grad_norm": 2.612126111984253, + "learning_rate": 4.918987886165403e-06, + "loss": 0.6965, + "step": 1123 + }, + { + "epoch": 0.5314420803782506, + "grad_norm": 3.80017352104187, + "learning_rate": 4.9188302903863205e-06, + "loss": 0.7396, + "step": 1124 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 2.781752824783325, + "learning_rate": 4.918672543997282e-06, + "loss": 0.5985, + "step": 1125 + }, + { + "epoch": 0.532387706855792, + "grad_norm": 2.6067914962768555, + "learning_rate": 4.91851464700811e-06, + "loss": 0.6159, + "step": 1126 + }, + { + "epoch": 0.5328605200945626, + "grad_norm": 2.670807123184204, + "learning_rate": 4.918356599428636e-06, + "loss": 0.5958, + "step": 1127 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 2.608611822128296, + "learning_rate": 4.9181984012687e-06, + "loss": 0.5768, + "step": 1128 + }, + { + "epoch": 0.533806146572104, + "grad_norm": 2.586764097213745, + "learning_rate": 4.918040052538154e-06, + "loss": 0.661, + "step": 1129 + }, + { + "epoch": 0.5342789598108747, + "grad_norm": 3.1317451000213623, + "learning_rate": 4.917881553246856e-06, + "loss": 0.6626, + "step": 1130 + }, + { + "epoch": 0.5347517730496454, + "grad_norm": 2.7135281562805176, + "learning_rate": 4.917722903404676e-06, + "loss": 0.6572, + "step": 1131 + }, + { + "epoch": 0.5352245862884161, + "grad_norm": 3.4546358585357666, + "learning_rate": 4.917564103021493e-06, + "loss": 0.5597, + "step": 1132 + }, + { + "epoch": 0.5356973995271868, + "grad_norm": 3.0943493843078613, + "learning_rate": 4.917405152107193e-06, + "loss": 0.7258, + "step": 1133 + }, + { + "epoch": 0.5361702127659574, + "grad_norm": 2.6069352626800537, + "learning_rate": 4.917246050671674e-06, + "loss": 0.6209, + "step": 1134 + }, + { + "epoch": 0.5366430260047281, + "grad_norm": 2.584883689880371, + "learning_rate": 4.917086798724844e-06, + "loss": 0.658, + "step": 1135 + }, + { + "epoch": 0.5371158392434988, + "grad_norm": 3.001976490020752, + "learning_rate": 4.9169273962766166e-06, + "loss": 0.6306, + "step": 1136 + }, + { + "epoch": 0.5375886524822695, + "grad_norm": 2.5013928413391113, + "learning_rate": 4.916767843336918e-06, + "loss": 0.572, + "step": 1137 + }, + { + "epoch": 0.5380614657210402, + "grad_norm": 2.9114553928375244, + "learning_rate": 4.916608139915684e-06, + "loss": 0.5841, + "step": 1138 + }, + { + "epoch": 0.5385342789598109, + "grad_norm": 2.8878467082977295, + "learning_rate": 4.9164482860228564e-06, + "loss": 0.6654, + "step": 1139 + }, + { + 
"epoch": 0.5390070921985816, + "grad_norm": 2.9827866554260254, + "learning_rate": 4.91628828166839e-06, + "loss": 0.6674, + "step": 1140 + }, + { + "epoch": 0.5394799054373522, + "grad_norm": 3.8696281909942627, + "learning_rate": 4.916128126862248e-06, + "loss": 0.6241, + "step": 1141 + }, + { + "epoch": 0.5399527186761229, + "grad_norm": 2.9556291103363037, + "learning_rate": 4.915967821614402e-06, + "loss": 0.6478, + "step": 1142 + }, + { + "epoch": 0.5404255319148936, + "grad_norm": 2.392942428588867, + "learning_rate": 4.915807365934834e-06, + "loss": 0.6097, + "step": 1143 + }, + { + "epoch": 0.5408983451536643, + "grad_norm": 3.032235860824585, + "learning_rate": 4.915646759833534e-06, + "loss": 0.7193, + "step": 1144 + }, + { + "epoch": 0.541371158392435, + "grad_norm": 2.840416193008423, + "learning_rate": 4.915486003320501e-06, + "loss": 0.5506, + "step": 1145 + }, + { + "epoch": 0.5418439716312057, + "grad_norm": 2.5438895225524902, + "learning_rate": 4.915325096405747e-06, + "loss": 0.6487, + "step": 1146 + }, + { + "epoch": 0.5423167848699764, + "grad_norm": 2.544334650039673, + "learning_rate": 4.9151640390992905e-06, + "loss": 0.6168, + "step": 1147 + }, + { + "epoch": 0.542789598108747, + "grad_norm": 2.8535678386688232, + "learning_rate": 4.91500283141116e-06, + "loss": 0.678, + "step": 1148 + }, + { + "epoch": 0.5432624113475177, + "grad_norm": 2.8086955547332764, + "learning_rate": 4.9148414733513915e-06, + "loss": 0.6473, + "step": 1149 + }, + { + "epoch": 0.5437352245862884, + "grad_norm": 2.4709885120391846, + "learning_rate": 4.914679964930034e-06, + "loss": 0.6797, + "step": 1150 + }, + { + "epoch": 0.5442080378250591, + "grad_norm": 2.8546934127807617, + "learning_rate": 4.9145183061571435e-06, + "loss": 0.6247, + "step": 1151 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 2.991184711456299, + "learning_rate": 4.9143564970427844e-06, + "loss": 0.5977, + "step": 1152 + }, + { + "epoch": 0.5451536643026005, + "grad_norm": 3.011216402053833, + "learning_rate": 4.914194537597033e-06, + "loss": 0.7005, + "step": 1153 + }, + { + "epoch": 0.5456264775413712, + "grad_norm": 2.807521343231201, + "learning_rate": 4.9140324278299744e-06, + "loss": 0.5412, + "step": 1154 + }, + { + "epoch": 0.5460992907801419, + "grad_norm": 3.0401229858398438, + "learning_rate": 4.913870167751701e-06, + "loss": 0.6394, + "step": 1155 + }, + { + "epoch": 0.5465721040189125, + "grad_norm": 2.853914976119995, + "learning_rate": 4.913707757372317e-06, + "loss": 0.6745, + "step": 1156 + }, + { + "epoch": 0.5470449172576832, + "grad_norm": 4.505620956420898, + "learning_rate": 4.913545196701935e-06, + "loss": 0.6668, + "step": 1157 + }, + { + "epoch": 0.5475177304964539, + "grad_norm": 3.0505781173706055, + "learning_rate": 4.913382485750676e-06, + "loss": 0.6926, + "step": 1158 + }, + { + "epoch": 0.5479905437352246, + "grad_norm": 2.798435688018799, + "learning_rate": 4.913219624528672e-06, + "loss": 0.605, + "step": 1159 + }, + { + "epoch": 0.5484633569739953, + "grad_norm": 2.7814908027648926, + "learning_rate": 4.913056613046065e-06, + "loss": 0.6678, + "step": 1160 + }, + { + "epoch": 0.548936170212766, + "grad_norm": 3.2089321613311768, + "learning_rate": 4.9128934513130025e-06, + "loss": 0.5995, + "step": 1161 + }, + { + "epoch": 0.5494089834515367, + "grad_norm": 2.7699952125549316, + "learning_rate": 4.9127301393396455e-06, + "loss": 0.7062, + "step": 1162 + }, + { + "epoch": 0.5498817966903073, + "grad_norm": 2.859368324279785, + "learning_rate": 4.912566677136162e-06, + "loss": 
0.6063, + "step": 1163 + }, + { + "epoch": 0.550354609929078, + "grad_norm": 2.727334499359131, + "learning_rate": 4.91240306471273e-06, + "loss": 0.6848, + "step": 1164 + }, + { + "epoch": 0.5508274231678487, + "grad_norm": 2.6017510890960693, + "learning_rate": 4.912239302079537e-06, + "loss": 0.5808, + "step": 1165 + }, + { + "epoch": 0.5513002364066194, + "grad_norm": 3.539583206176758, + "learning_rate": 4.912075389246781e-06, + "loss": 0.7053, + "step": 1166 + }, + { + "epoch": 0.5517730496453901, + "grad_norm": 2.918280601501465, + "learning_rate": 4.911911326224666e-06, + "loss": 0.5904, + "step": 1167 + }, + { + "epoch": 0.5522458628841608, + "grad_norm": 3.0067362785339355, + "learning_rate": 4.9117471130234095e-06, + "loss": 0.6392, + "step": 1168 + }, + { + "epoch": 0.5527186761229315, + "grad_norm": 2.4374797344207764, + "learning_rate": 4.911582749653236e-06, + "loss": 0.5793, + "step": 1169 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 3.121182918548584, + "learning_rate": 4.911418236124378e-06, + "loss": 0.6636, + "step": 1170 + }, + { + "epoch": 0.5536643026004728, + "grad_norm": 3.1289851665496826, + "learning_rate": 4.91125357244708e-06, + "loss": 0.656, + "step": 1171 + }, + { + "epoch": 0.5541371158392435, + "grad_norm": 2.7034592628479004, + "learning_rate": 4.911088758631596e-06, + "loss": 0.6001, + "step": 1172 + }, + { + "epoch": 0.5546099290780142, + "grad_norm": 2.710146188735962, + "learning_rate": 4.910923794688187e-06, + "loss": 0.6007, + "step": 1173 + }, + { + "epoch": 0.5550827423167849, + "grad_norm": 2.5424487590789795, + "learning_rate": 4.910758680627124e-06, + "loss": 0.5193, + "step": 1174 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.615893602371216, + "learning_rate": 4.91059341645869e-06, + "loss": 0.5525, + "step": 1175 + }, + { + "epoch": 0.5560283687943263, + "grad_norm": 3.3179728984832764, + "learning_rate": 4.910428002193174e-06, + "loss": 0.7285, + "step": 1176 + }, + { + "epoch": 0.556501182033097, + "grad_norm": 2.7234175205230713, + "learning_rate": 4.910262437840875e-06, + "loss": 0.574, + "step": 1177 + }, + { + "epoch": 0.5569739952718676, + "grad_norm": 3.0416605472564697, + "learning_rate": 4.9100967234121034e-06, + "loss": 0.5623, + "step": 1178 + }, + { + "epoch": 0.5574468085106383, + "grad_norm": 3.067786455154419, + "learning_rate": 4.909930858917177e-06, + "loss": 0.6491, + "step": 1179 + }, + { + "epoch": 0.557919621749409, + "grad_norm": 3.0037379264831543, + "learning_rate": 4.909764844366422e-06, + "loss": 0.5696, + "step": 1180 + }, + { + "epoch": 0.5583924349881797, + "grad_norm": 2.966179609298706, + "learning_rate": 4.909598679770178e-06, + "loss": 0.6042, + "step": 1181 + }, + { + "epoch": 0.5588652482269504, + "grad_norm": 2.6000657081604004, + "learning_rate": 4.909432365138789e-06, + "loss": 0.5883, + "step": 1182 + }, + { + "epoch": 0.5593380614657211, + "grad_norm": 2.6794495582580566, + "learning_rate": 4.909265900482612e-06, + "loss": 0.6809, + "step": 1183 + }, + { + "epoch": 0.5598108747044918, + "grad_norm": 2.6765122413635254, + "learning_rate": 4.9090992858120115e-06, + "loss": 0.6601, + "step": 1184 + }, + { + "epoch": 0.5602836879432624, + "grad_norm": 2.6051928997039795, + "learning_rate": 4.908932521137363e-06, + "loss": 0.5946, + "step": 1185 + }, + { + "epoch": 0.5607565011820331, + "grad_norm": 3.0405542850494385, + "learning_rate": 4.908765606469048e-06, + "loss": 0.6998, + "step": 1186 + }, + { + "epoch": 0.5612293144208038, + "grad_norm": 2.7975668907165527, + "learning_rate": 
4.908598541817462e-06, + "loss": 0.6218, + "step": 1187 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 2.5367627143859863, + "learning_rate": 4.908431327193005e-06, + "loss": 0.6354, + "step": 1188 + }, + { + "epoch": 0.5621749408983452, + "grad_norm": 3.7939631938934326, + "learning_rate": 4.908263962606091e-06, + "loss": 0.6376, + "step": 1189 + }, + { + "epoch": 0.5626477541371159, + "grad_norm": 2.864079475402832, + "learning_rate": 4.908096448067139e-06, + "loss": 0.5485, + "step": 1190 + }, + { + "epoch": 0.5631205673758866, + "grad_norm": 2.7855563163757324, + "learning_rate": 4.9079287835865804e-06, + "loss": 0.6645, + "step": 1191 + }, + { + "epoch": 0.5635933806146572, + "grad_norm": 2.6156625747680664, + "learning_rate": 4.9077609691748556e-06, + "loss": 0.5751, + "step": 1192 + }, + { + "epoch": 0.5640661938534279, + "grad_norm": 3.0475659370422363, + "learning_rate": 4.907593004842412e-06, + "loss": 0.6739, + "step": 1193 + }, + { + "epoch": 0.5645390070921986, + "grad_norm": 2.9176738262176514, + "learning_rate": 4.9074248905997104e-06, + "loss": 0.6493, + "step": 1194 + }, + { + "epoch": 0.5650118203309693, + "grad_norm": 2.6168384552001953, + "learning_rate": 4.907256626457216e-06, + "loss": 0.6154, + "step": 1195 + }, + { + "epoch": 0.56548463356974, + "grad_norm": 2.893980026245117, + "learning_rate": 4.907088212425408e-06, + "loss": 0.5808, + "step": 1196 + }, + { + "epoch": 0.5659574468085107, + "grad_norm": 3.3832836151123047, + "learning_rate": 4.90691964851477e-06, + "loss": 0.7888, + "step": 1197 + }, + { + "epoch": 0.5664302600472814, + "grad_norm": 3.088932752609253, + "learning_rate": 4.906750934735801e-06, + "loss": 0.6516, + "step": 1198 + }, + { + "epoch": 0.566903073286052, + "grad_norm": 2.494471549987793, + "learning_rate": 4.906582071099004e-06, + "loss": 0.6286, + "step": 1199 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 2.716550588607788, + "learning_rate": 4.906413057614895e-06, + "loss": 0.5939, + "step": 1200 + }, + { + "epoch": 0.5678486997635934, + "grad_norm": 2.5821073055267334, + "learning_rate": 4.906243894293995e-06, + "loss": 0.6668, + "step": 1201 + }, + { + "epoch": 0.5683215130023641, + "grad_norm": 3.651787042617798, + "learning_rate": 4.90607458114684e-06, + "loss": 0.6124, + "step": 1202 + }, + { + "epoch": 0.5687943262411348, + "grad_norm": 2.7567858695983887, + "learning_rate": 4.9059051181839705e-06, + "loss": 0.6656, + "step": 1203 + }, + { + "epoch": 0.5692671394799055, + "grad_norm": 2.8067586421966553, + "learning_rate": 4.90573550541594e-06, + "loss": 0.6306, + "step": 1204 + }, + { + "epoch": 0.5697399527186762, + "grad_norm": 2.6136393547058105, + "learning_rate": 4.905565742853307e-06, + "loss": 0.5992, + "step": 1205 + }, + { + "epoch": 0.5702127659574469, + "grad_norm": 2.899049758911133, + "learning_rate": 4.905395830506644e-06, + "loss": 0.621, + "step": 1206 + }, + { + "epoch": 0.5706855791962175, + "grad_norm": 3.036583185195923, + "learning_rate": 4.9052257683865294e-06, + "loss": 0.652, + "step": 1207 + }, + { + "epoch": 0.5711583924349882, + "grad_norm": 2.7947216033935547, + "learning_rate": 4.905055556503553e-06, + "loss": 0.6636, + "step": 1208 + }, + { + "epoch": 0.5716312056737589, + "grad_norm": 3.1646955013275146, + "learning_rate": 4.9048851948683135e-06, + "loss": 0.6376, + "step": 1209 + }, + { + "epoch": 0.5721040189125296, + "grad_norm": 2.8175766468048096, + "learning_rate": 4.904714683491417e-06, + "loss": 0.5929, + "step": 1210 + }, + { + "epoch": 0.5725768321513003, + "grad_norm": 
2.923923969268799, + "learning_rate": 4.904544022383483e-06, + "loss": 0.6633, + "step": 1211 + }, + { + "epoch": 0.573049645390071, + "grad_norm": 2.7471134662628174, + "learning_rate": 4.9043732115551356e-06, + "loss": 0.6551, + "step": 1212 + }, + { + "epoch": 0.5735224586288417, + "grad_norm": 2.8660807609558105, + "learning_rate": 4.90420225101701e-06, + "loss": 0.6423, + "step": 1213 + }, + { + "epoch": 0.5739952718676123, + "grad_norm": 2.769247531890869, + "learning_rate": 4.904031140779754e-06, + "loss": 0.5982, + "step": 1214 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.9043145179748535, + "learning_rate": 4.90385988085402e-06, + "loss": 0.5843, + "step": 1215 + }, + { + "epoch": 0.5749408983451537, + "grad_norm": 2.6639609336853027, + "learning_rate": 4.903688471250471e-06, + "loss": 0.5858, + "step": 1216 + }, + { + "epoch": 0.5754137115839244, + "grad_norm": 2.6967573165893555, + "learning_rate": 4.903516911979781e-06, + "loss": 0.5755, + "step": 1217 + }, + { + "epoch": 0.5758865248226951, + "grad_norm": 2.8865857124328613, + "learning_rate": 4.903345203052633e-06, + "loss": 0.6051, + "step": 1218 + }, + { + "epoch": 0.5763593380614658, + "grad_norm": 2.381979465484619, + "learning_rate": 4.903173344479717e-06, + "loss": 0.5727, + "step": 1219 + }, + { + "epoch": 0.5768321513002365, + "grad_norm": 2.7717981338500977, + "learning_rate": 4.903001336271734e-06, + "loss": 0.6406, + "step": 1220 + }, + { + "epoch": 0.577304964539007, + "grad_norm": 2.6431570053100586, + "learning_rate": 4.902829178439395e-06, + "loss": 0.6226, + "step": 1221 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 2.8090415000915527, + "learning_rate": 4.902656870993419e-06, + "loss": 0.5761, + "step": 1222 + }, + { + "epoch": 0.5782505910165484, + "grad_norm": 2.4769368171691895, + "learning_rate": 4.902484413944535e-06, + "loss": 0.5602, + "step": 1223 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 2.693316698074341, + "learning_rate": 4.902311807303481e-06, + "loss": 0.5222, + "step": 1224 + }, + { + "epoch": 0.5791962174940898, + "grad_norm": 2.7623913288116455, + "learning_rate": 4.902139051081004e-06, + "loss": 0.6978, + "step": 1225 + }, + { + "epoch": 0.5796690307328605, + "grad_norm": 2.6133766174316406, + "learning_rate": 4.901966145287863e-06, + "loss": 0.5802, + "step": 1226 + }, + { + "epoch": 0.5801418439716312, + "grad_norm": 2.7345972061157227, + "learning_rate": 4.901793089934821e-06, + "loss": 0.6294, + "step": 1227 + }, + { + "epoch": 0.5806146572104018, + "grad_norm": 2.7545835971832275, + "learning_rate": 4.9016198850326555e-06, + "loss": 0.6085, + "step": 1228 + }, + { + "epoch": 0.5810874704491725, + "grad_norm": 2.6947758197784424, + "learning_rate": 4.90144653059215e-06, + "loss": 0.6025, + "step": 1229 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 2.692967414855957, + "learning_rate": 4.901273026624099e-06, + "loss": 0.5715, + "step": 1230 + }, + { + "epoch": 0.5820330969267139, + "grad_norm": 2.78347110748291, + "learning_rate": 4.901099373139307e-06, + "loss": 0.6063, + "step": 1231 + }, + { + "epoch": 0.5825059101654846, + "grad_norm": 2.346496343612671, + "learning_rate": 4.900925570148585e-06, + "loss": 0.5869, + "step": 1232 + }, + { + "epoch": 0.5829787234042553, + "grad_norm": 2.606639862060547, + "learning_rate": 4.900751617662755e-06, + "loss": 0.6197, + "step": 1233 + }, + { + "epoch": 0.583451536643026, + "grad_norm": 2.5825929641723633, + "learning_rate": 4.900577515692649e-06, + "loss": 0.6721, + "step": 1234 + }, + { + "epoch": 
0.5839243498817966, + "grad_norm": 2.731349468231201, + "learning_rate": 4.900403264249107e-06, + "loss": 0.6273, + "step": 1235 + }, + { + "epoch": 0.5843971631205673, + "grad_norm": 3.2133874893188477, + "learning_rate": 4.90022886334298e-06, + "loss": 0.6231, + "step": 1236 + }, + { + "epoch": 0.584869976359338, + "grad_norm": 2.9213852882385254, + "learning_rate": 4.900054312985127e-06, + "loss": 0.6677, + "step": 1237 + }, + { + "epoch": 0.5853427895981087, + "grad_norm": 2.815425157546997, + "learning_rate": 4.899879613186414e-06, + "loss": 0.6405, + "step": 1238 + }, + { + "epoch": 0.5858156028368794, + "grad_norm": 2.730782985687256, + "learning_rate": 4.899704763957721e-06, + "loss": 0.6233, + "step": 1239 + }, + { + "epoch": 0.5862884160756501, + "grad_norm": 2.6432766914367676, + "learning_rate": 4.899529765309936e-06, + "loss": 0.6267, + "step": 1240 + }, + { + "epoch": 0.5867612293144208, + "grad_norm": 2.616215229034424, + "learning_rate": 4.899354617253953e-06, + "loss": 0.6268, + "step": 1241 + }, + { + "epoch": 0.5872340425531914, + "grad_norm": 2.7630255222320557, + "learning_rate": 4.899179319800679e-06, + "loss": 0.6348, + "step": 1242 + }, + { + "epoch": 0.5877068557919621, + "grad_norm": 2.785095453262329, + "learning_rate": 4.899003872961029e-06, + "loss": 0.5839, + "step": 1243 + }, + { + "epoch": 0.5881796690307328, + "grad_norm": 2.9050328731536865, + "learning_rate": 4.898828276745927e-06, + "loss": 0.651, + "step": 1244 + }, + { + "epoch": 0.5886524822695035, + "grad_norm": 2.958092212677002, + "learning_rate": 4.8986525311663065e-06, + "loss": 0.6395, + "step": 1245 + }, + { + "epoch": 0.5891252955082742, + "grad_norm": 2.952310800552368, + "learning_rate": 4.898476636233111e-06, + "loss": 0.6731, + "step": 1246 + }, + { + "epoch": 0.5895981087470449, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.898300591957293e-06, + "loss": 0.7015, + "step": 1247 + }, + { + "epoch": 0.5900709219858156, + "grad_norm": 2.8941752910614014, + "learning_rate": 4.898124398349813e-06, + "loss": 0.6452, + "step": 1248 + }, + { + "epoch": 0.5905437352245863, + "grad_norm": 2.9809536933898926, + "learning_rate": 4.897948055421642e-06, + "loss": 0.5736, + "step": 1249 + }, + { + "epoch": 0.5910165484633569, + "grad_norm": 2.927046775817871, + "learning_rate": 4.897771563183761e-06, + "loss": 0.5918, + "step": 1250 + }, + { + "epoch": 0.5914893617021276, + "grad_norm": 2.865020275115967, + "learning_rate": 4.897594921647158e-06, + "loss": 0.6924, + "step": 1251 + }, + { + "epoch": 0.5919621749408983, + "grad_norm": 2.7406699657440186, + "learning_rate": 4.897418130822832e-06, + "loss": 0.509, + "step": 1252 + }, + { + "epoch": 0.592434988179669, + "grad_norm": 2.781606912612915, + "learning_rate": 4.897241190721791e-06, + "loss": 0.5555, + "step": 1253 + }, + { + "epoch": 0.5929078014184397, + "grad_norm": 2.79209303855896, + "learning_rate": 4.8970641013550535e-06, + "loss": 0.6722, + "step": 1254 + }, + { + "epoch": 0.5933806146572104, + "grad_norm": 3.0672268867492676, + "learning_rate": 4.896886862733645e-06, + "loss": 0.6366, + "step": 1255 + }, + { + "epoch": 0.5938534278959811, + "grad_norm": 2.7456953525543213, + "learning_rate": 4.896709474868602e-06, + "loss": 0.6246, + "step": 1256 + }, + { + "epoch": 0.5943262411347517, + "grad_norm": 3.6731202602386475, + "learning_rate": 4.896531937770968e-06, + "loss": 0.668, + "step": 1257 + }, + { + "epoch": 0.5947990543735224, + "grad_norm": 2.6056087017059326, + "learning_rate": 4.8963542514518e-06, + "loss": 0.5815, + 
"step": 1258 + }, + { + "epoch": 0.5952718676122931, + "grad_norm": 2.719698905944824, + "learning_rate": 4.89617641592216e-06, + "loss": 0.6058, + "step": 1259 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 2.625838279724121, + "learning_rate": 4.895998431193121e-06, + "loss": 0.6143, + "step": 1260 + }, + { + "epoch": 0.5962174940898345, + "grad_norm": 2.7166085243225098, + "learning_rate": 4.895820297275767e-06, + "loss": 0.5187, + "step": 1261 + }, + { + "epoch": 0.5966903073286052, + "grad_norm": 2.7544102668762207, + "learning_rate": 4.8956420141811875e-06, + "loss": 0.5928, + "step": 1262 + }, + { + "epoch": 0.5971631205673759, + "grad_norm": 2.6678333282470703, + "learning_rate": 4.895463581920484e-06, + "loss": 0.611, + "step": 1263 + }, + { + "epoch": 0.5976359338061465, + "grad_norm": 2.853384494781494, + "learning_rate": 4.895285000504768e-06, + "loss": 0.642, + "step": 1264 + }, + { + "epoch": 0.5981087470449172, + "grad_norm": 2.637852430343628, + "learning_rate": 4.895106269945158e-06, + "loss": 0.6308, + "step": 1265 + }, + { + "epoch": 0.5985815602836879, + "grad_norm": 2.9880387783050537, + "learning_rate": 4.8949273902527826e-06, + "loss": 0.5781, + "step": 1266 + }, + { + "epoch": 0.5990543735224586, + "grad_norm": 3.5984015464782715, + "learning_rate": 4.89474836143878e-06, + "loss": 0.5865, + "step": 1267 + }, + { + "epoch": 0.5995271867612293, + "grad_norm": 2.719855546951294, + "learning_rate": 4.8945691835142975e-06, + "loss": 0.6393, + "step": 1268 + }, + { + "epoch": 0.6, + "grad_norm": 2.7885141372680664, + "learning_rate": 4.894389856490492e-06, + "loss": 0.66, + "step": 1269 + }, + { + "epoch": 0.6004728132387707, + "grad_norm": 2.698819875717163, + "learning_rate": 4.894210380378529e-06, + "loss": 0.6144, + "step": 1270 + }, + { + "epoch": 0.6009456264775414, + "grad_norm": 2.278045654296875, + "learning_rate": 4.894030755189584e-06, + "loss": 0.5609, + "step": 1271 + }, + { + "epoch": 0.601418439716312, + "grad_norm": 2.8729357719421387, + "learning_rate": 4.893850980934841e-06, + "loss": 0.6715, + "step": 1272 + }, + { + "epoch": 0.6018912529550827, + "grad_norm": 2.8541221618652344, + "learning_rate": 4.893671057625495e-06, + "loss": 0.6787, + "step": 1273 + }, + { + "epoch": 0.6023640661938534, + "grad_norm": 2.4561476707458496, + "learning_rate": 4.893490985272748e-06, + "loss": 0.6331, + "step": 1274 + }, + { + "epoch": 0.6028368794326241, + "grad_norm": 2.565739154815674, + "learning_rate": 4.893310763887812e-06, + "loss": 0.587, + "step": 1275 + }, + { + "epoch": 0.6033096926713948, + "grad_norm": 2.384951591491699, + "learning_rate": 4.8931303934819095e-06, + "loss": 0.5358, + "step": 1276 + }, + { + "epoch": 0.6037825059101655, + "grad_norm": 2.380808115005493, + "learning_rate": 4.89294987406627e-06, + "loss": 0.5402, + "step": 1277 + }, + { + "epoch": 0.6042553191489362, + "grad_norm": 2.764815092086792, + "learning_rate": 4.892769205652136e-06, + "loss": 0.6103, + "step": 1278 + }, + { + "epoch": 0.6047281323877068, + "grad_norm": 2.463350296020508, + "learning_rate": 4.892588388250754e-06, + "loss": 0.5937, + "step": 1279 + }, + { + "epoch": 0.6052009456264775, + "grad_norm": 3.099689245223999, + "learning_rate": 4.8924074218733855e-06, + "loss": 0.6354, + "step": 1280 + }, + { + "epoch": 0.6056737588652482, + "grad_norm": 2.804450035095215, + "learning_rate": 4.892226306531297e-06, + "loss": 0.6595, + "step": 1281 + }, + { + "epoch": 0.6061465721040189, + "grad_norm": 3.1559767723083496, + "learning_rate": 4.892045042235765e-06, + 
"loss": 0.6664, + "step": 1282 + }, + { + "epoch": 0.6066193853427896, + "grad_norm": 2.844341993331909, + "learning_rate": 4.891863628998079e-06, + "loss": 0.7454, + "step": 1283 + }, + { + "epoch": 0.6070921985815603, + "grad_norm": 2.686602830886841, + "learning_rate": 4.891682066829532e-06, + "loss": 0.6755, + "step": 1284 + }, + { + "epoch": 0.607565011820331, + "grad_norm": 2.736457347869873, + "learning_rate": 4.8915003557414285e-06, + "loss": 0.6305, + "step": 1285 + }, + { + "epoch": 0.6080378250591016, + "grad_norm": 2.661362409591675, + "learning_rate": 4.891318495745086e-06, + "loss": 0.5958, + "step": 1286 + }, + { + "epoch": 0.6085106382978723, + "grad_norm": 2.707348108291626, + "learning_rate": 4.8911364868518255e-06, + "loss": 0.5824, + "step": 1287 + }, + { + "epoch": 0.608983451536643, + "grad_norm": 2.9798858165740967, + "learning_rate": 4.890954329072981e-06, + "loss": 0.5981, + "step": 1288 + }, + { + "epoch": 0.6094562647754137, + "grad_norm": 2.6285455226898193, + "learning_rate": 4.890772022419895e-06, + "loss": 0.6194, + "step": 1289 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 2.9254322052001953, + "learning_rate": 4.890589566903917e-06, + "loss": 0.6002, + "step": 1290 + }, + { + "epoch": 0.6104018912529551, + "grad_norm": 2.6458325386047363, + "learning_rate": 4.89040696253641e-06, + "loss": 0.5457, + "step": 1291 + }, + { + "epoch": 0.6108747044917258, + "grad_norm": 2.508242607116699, + "learning_rate": 4.890224209328743e-06, + "loss": 0.6168, + "step": 1292 + }, + { + "epoch": 0.6113475177304964, + "grad_norm": 3.034785509109497, + "learning_rate": 4.890041307292296e-06, + "loss": 0.664, + "step": 1293 + }, + { + "epoch": 0.6118203309692671, + "grad_norm": 3.52469539642334, + "learning_rate": 4.889858256438455e-06, + "loss": 0.7301, + "step": 1294 + }, + { + "epoch": 0.6122931442080378, + "grad_norm": 2.9145348072052, + "learning_rate": 4.889675056778622e-06, + "loss": 0.6494, + "step": 1295 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 2.831829071044922, + "learning_rate": 4.8894917083242e-06, + "loss": 0.6064, + "step": 1296 + }, + { + "epoch": 0.6132387706855792, + "grad_norm": 2.6883130073547363, + "learning_rate": 4.889308211086608e-06, + "loss": 0.5642, + "step": 1297 + }, + { + "epoch": 0.6137115839243499, + "grad_norm": 3.0605485439300537, + "learning_rate": 4.889124565077269e-06, + "loss": 0.6695, + "step": 1298 + }, + { + "epoch": 0.6141843971631206, + "grad_norm": 3.44062876701355, + "learning_rate": 4.88894077030762e-06, + "loss": 0.6415, + "step": 1299 + }, + { + "epoch": 0.6146572104018913, + "grad_norm": 2.5970818996429443, + "learning_rate": 4.888756826789105e-06, + "loss": 0.6518, + "step": 1300 + }, + { + "epoch": 0.6151300236406619, + "grad_norm": 4.2233567237854, + "learning_rate": 4.8885727345331755e-06, + "loss": 0.6555, + "step": 1301 + }, + { + "epoch": 0.6156028368794326, + "grad_norm": 2.645385503768921, + "learning_rate": 4.888388493551297e-06, + "loss": 0.6762, + "step": 1302 + }, + { + "epoch": 0.6160756501182033, + "grad_norm": 2.907954454421997, + "learning_rate": 4.8882041038549385e-06, + "loss": 0.6526, + "step": 1303 + }, + { + "epoch": 0.616548463356974, + "grad_norm": 2.482771873474121, + "learning_rate": 4.888019565455583e-06, + "loss": 0.628, + "step": 1304 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 2.7165915966033936, + "learning_rate": 4.88783487836472e-06, + "loss": 0.5743, + "step": 1305 + }, + { + "epoch": 0.6174940898345154, + "grad_norm": 3.095627546310425, + "learning_rate": 
4.88765004259385e-06, + "loss": 0.627, + "step": 1306 + }, + { + "epoch": 0.6179669030732861, + "grad_norm": 2.5018465518951416, + "learning_rate": 4.8874650581544805e-06, + "loss": 0.5215, + "step": 1307 + }, + { + "epoch": 0.6184397163120567, + "grad_norm": 3.094337224960327, + "learning_rate": 4.8872799250581316e-06, + "loss": 0.6979, + "step": 1308 + }, + { + "epoch": 0.6189125295508274, + "grad_norm": 3.1002209186553955, + "learning_rate": 4.887094643316329e-06, + "loss": 0.6565, + "step": 1309 + }, + { + "epoch": 0.6193853427895981, + "grad_norm": 2.551431894302368, + "learning_rate": 4.88690921294061e-06, + "loss": 0.5748, + "step": 1310 + }, + { + "epoch": 0.6198581560283688, + "grad_norm": 2.8282904624938965, + "learning_rate": 4.886723633942521e-06, + "loss": 0.676, + "step": 1311 + }, + { + "epoch": 0.6203309692671395, + "grad_norm": 2.8887810707092285, + "learning_rate": 4.886537906333617e-06, + "loss": 0.5971, + "step": 1312 + }, + { + "epoch": 0.6208037825059102, + "grad_norm": 2.9989118576049805, + "learning_rate": 4.886352030125462e-06, + "loss": 0.6341, + "step": 1313 + }, + { + "epoch": 0.6212765957446809, + "grad_norm": 2.8042776584625244, + "learning_rate": 4.886166005329629e-06, + "loss": 0.6578, + "step": 1314 + }, + { + "epoch": 0.6217494089834515, + "grad_norm": 2.4980967044830322, + "learning_rate": 4.8859798319577026e-06, + "loss": 0.6711, + "step": 1315 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 2.762369155883789, + "learning_rate": 4.885793510021274e-06, + "loss": 0.5747, + "step": 1316 + }, + { + "epoch": 0.6226950354609929, + "grad_norm": 3.136327028274536, + "learning_rate": 4.885607039531945e-06, + "loss": 0.7544, + "step": 1317 + }, + { + "epoch": 0.6231678486997636, + "grad_norm": 2.8736963272094727, + "learning_rate": 4.885420420501327e-06, + "loss": 0.6603, + "step": 1318 + }, + { + "epoch": 0.6236406619385343, + "grad_norm": 2.766237497329712, + "learning_rate": 4.885233652941039e-06, + "loss": 0.581, + "step": 1319 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 2.4740939140319824, + "learning_rate": 4.88504673686271e-06, + "loss": 0.6335, + "step": 1320 + }, + { + "epoch": 0.6245862884160757, + "grad_norm": 3.324795961380005, + "learning_rate": 4.884859672277978e-06, + "loss": 0.6019, + "step": 1321 + }, + { + "epoch": 0.6250591016548463, + "grad_norm": 3.521327257156372, + "learning_rate": 4.884672459198493e-06, + "loss": 0.6104, + "step": 1322 + }, + { + "epoch": 0.625531914893617, + "grad_norm": 2.7728071212768555, + "learning_rate": 4.884485097635909e-06, + "loss": 0.6714, + "step": 1323 + }, + { + "epoch": 0.6260047281323877, + "grad_norm": 3.0738155841827393, + "learning_rate": 4.884297587601895e-06, + "loss": 0.604, + "step": 1324 + }, + { + "epoch": 0.6264775413711584, + "grad_norm": 2.719240427017212, + "learning_rate": 4.884109929108124e-06, + "loss": 0.6795, + "step": 1325 + }, + { + "epoch": 0.6269503546099291, + "grad_norm": 2.4108200073242188, + "learning_rate": 4.883922122166282e-06, + "loss": 0.5846, + "step": 1326 + }, + { + "epoch": 0.6274231678486998, + "grad_norm": 2.393899917602539, + "learning_rate": 4.883734166788063e-06, + "loss": 0.6188, + "step": 1327 + }, + { + "epoch": 0.6278959810874705, + "grad_norm": 4.555255889892578, + "learning_rate": 4.883546062985169e-06, + "loss": 0.5962, + "step": 1328 + }, + { + "epoch": 0.6283687943262412, + "grad_norm": 2.571075439453125, + "learning_rate": 4.883357810769315e-06, + "loss": 0.6165, + "step": 1329 + }, + { + "epoch": 0.6288416075650118, + "grad_norm": 
2.553115129470825, + "learning_rate": 4.8831694101522185e-06, + "loss": 0.6787, + "step": 1330 + }, + { + "epoch": 0.6293144208037825, + "grad_norm": 3.2564642429351807, + "learning_rate": 4.882980861145614e-06, + "loss": 0.659, + "step": 1331 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 2.535216808319092, + "learning_rate": 4.882792163761241e-06, + "loss": 0.6176, + "step": 1332 + }, + { + "epoch": 0.6302600472813239, + "grad_norm": 3.097921848297119, + "learning_rate": 4.882603318010847e-06, + "loss": 0.6822, + "step": 1333 + }, + { + "epoch": 0.6307328605200946, + "grad_norm": 2.8135175704956055, + "learning_rate": 4.882414323906192e-06, + "loss": 0.6782, + "step": 1334 + }, + { + "epoch": 0.6312056737588653, + "grad_norm": 2.724634885787964, + "learning_rate": 4.882225181459044e-06, + "loss": 0.6545, + "step": 1335 + }, + { + "epoch": 0.631678486997636, + "grad_norm": 2.9585227966308594, + "learning_rate": 4.882035890681179e-06, + "loss": 0.6218, + "step": 1336 + }, + { + "epoch": 0.6321513002364066, + "grad_norm": 2.6952011585235596, + "learning_rate": 4.881846451584385e-06, + "loss": 0.6, + "step": 1337 + }, + { + "epoch": 0.6326241134751773, + "grad_norm": 3.1400704383850098, + "learning_rate": 4.881656864180455e-06, + "loss": 0.6687, + "step": 1338 + }, + { + "epoch": 0.633096926713948, + "grad_norm": 2.8382487297058105, + "learning_rate": 4.881467128481197e-06, + "loss": 0.574, + "step": 1339 + }, + { + "epoch": 0.6335697399527187, + "grad_norm": 2.8520095348358154, + "learning_rate": 4.881277244498422e-06, + "loss": 0.6582, + "step": 1340 + }, + { + "epoch": 0.6340425531914894, + "grad_norm": 2.703498363494873, + "learning_rate": 4.881087212243956e-06, + "loss": 0.7224, + "step": 1341 + }, + { + "epoch": 0.6345153664302601, + "grad_norm": 3.697205066680908, + "learning_rate": 4.880897031729629e-06, + "loss": 0.6582, + "step": 1342 + }, + { + "epoch": 0.6349881796690308, + "grad_norm": 2.7625808715820312, + "learning_rate": 4.880706702967284e-06, + "loss": 0.574, + "step": 1343 + }, + { + "epoch": 0.6354609929078014, + "grad_norm": 2.949984073638916, + "learning_rate": 4.880516225968771e-06, + "loss": 0.66, + "step": 1344 + }, + { + "epoch": 0.6359338061465721, + "grad_norm": 2.548269748687744, + "learning_rate": 4.8803256007459525e-06, + "loss": 0.642, + "step": 1345 + }, + { + "epoch": 0.6364066193853428, + "grad_norm": 2.5102174282073975, + "learning_rate": 4.8801348273106945e-06, + "loss": 0.6238, + "step": 1346 + }, + { + "epoch": 0.6368794326241135, + "grad_norm": 2.9847946166992188, + "learning_rate": 4.8799439056748786e-06, + "loss": 0.5416, + "step": 1347 + }, + { + "epoch": 0.6373522458628842, + "grad_norm": 2.8711049556732178, + "learning_rate": 4.879752835850391e-06, + "loss": 0.6427, + "step": 1348 + }, + { + "epoch": 0.6378250591016549, + "grad_norm": 2.7901716232299805, + "learning_rate": 4.879561617849129e-06, + "loss": 0.6026, + "step": 1349 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 2.659778356552124, + "learning_rate": 4.879370251682999e-06, + "loss": 0.6623, + "step": 1350 + }, + { + "epoch": 0.6387706855791963, + "grad_norm": 3.224386692047119, + "learning_rate": 4.879178737363917e-06, + "loss": 0.6485, + "step": 1351 + }, + { + "epoch": 0.6392434988179669, + "grad_norm": 2.6385605335235596, + "learning_rate": 4.8789870749038076e-06, + "loss": 0.5866, + "step": 1352 + }, + { + "epoch": 0.6397163120567376, + "grad_norm": 2.807713270187378, + "learning_rate": 4.8787952643146045e-06, + "loss": 0.6537, + "step": 1353 + }, + { + "epoch": 
0.6401891252955083, + "grad_norm": 2.5689280033111572, + "learning_rate": 4.878603305608251e-06, + "loss": 0.6216, + "step": 1354 + }, + { + "epoch": 0.640661938534279, + "grad_norm": 2.7347843647003174, + "learning_rate": 4.8784111987967e-06, + "loss": 0.6318, + "step": 1355 + }, + { + "epoch": 0.6411347517730497, + "grad_norm": 2.5210378170013428, + "learning_rate": 4.878218943891911e-06, + "loss": 0.5472, + "step": 1356 + }, + { + "epoch": 0.6416075650118204, + "grad_norm": 2.866785764694214, + "learning_rate": 4.878026540905858e-06, + "loss": 0.7108, + "step": 1357 + }, + { + "epoch": 0.642080378250591, + "grad_norm": 2.923314332962036, + "learning_rate": 4.877833989850519e-06, + "loss": 0.5557, + "step": 1358 + }, + { + "epoch": 0.6425531914893617, + "grad_norm": 2.925463914871216, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.6382, + "step": 1359 + }, + { + "epoch": 0.6430260047281324, + "grad_norm": 2.909644365310669, + "learning_rate": 4.877448443579952e-06, + "loss": 0.5603, + "step": 1360 + }, + { + "epoch": 0.6434988179669031, + "grad_norm": 3.501148223876953, + "learning_rate": 4.8772554483887306e-06, + "loss": 0.6722, + "step": 1361 + }, + { + "epoch": 0.6439716312056738, + "grad_norm": 2.823765516281128, + "learning_rate": 4.877062305176235e-06, + "loss": 0.6408, + "step": 1362 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 2.9807584285736084, + "learning_rate": 4.8768690139544935e-06, + "loss": 0.5984, + "step": 1363 + }, + { + "epoch": 0.6449172576832152, + "grad_norm": 2.8411378860473633, + "learning_rate": 4.8766755747355405e-06, + "loss": 0.6231, + "step": 1364 + }, + { + "epoch": 0.6453900709219859, + "grad_norm": 3.158952236175537, + "learning_rate": 4.8764819875314215e-06, + "loss": 0.6441, + "step": 1365 + }, + { + "epoch": 0.6458628841607565, + "grad_norm": 2.9614369869232178, + "learning_rate": 4.876288252354189e-06, + "loss": 0.6308, + "step": 1366 + }, + { + "epoch": 0.6463356973995272, + "grad_norm": 3.073805570602417, + "learning_rate": 4.876094369215907e-06, + "loss": 0.6046, + "step": 1367 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 2.719189405441284, + "learning_rate": 4.875900338128648e-06, + "loss": 0.6082, + "step": 1368 + }, + { + "epoch": 0.6472813238770686, + "grad_norm": 2.676726818084717, + "learning_rate": 4.8757061591044914e-06, + "loss": 0.6344, + "step": 1369 + }, + { + "epoch": 0.6477541371158393, + "grad_norm": 2.955256938934326, + "learning_rate": 4.87551183215553e-06, + "loss": 0.6506, + "step": 1370 + }, + { + "epoch": 0.64822695035461, + "grad_norm": 2.5672218799591064, + "learning_rate": 4.875317357293864e-06, + "loss": 0.5284, + "step": 1371 + }, + { + "epoch": 0.6486997635933807, + "grad_norm": 2.5860238075256348, + "learning_rate": 4.875122734531602e-06, + "loss": 0.667, + "step": 1372 + }, + { + "epoch": 0.6491725768321513, + "grad_norm": 3.1037003993988037, + "learning_rate": 4.8749279638808605e-06, + "loss": 0.6902, + "step": 1373 + }, + { + "epoch": 0.649645390070922, + "grad_norm": 2.7715282440185547, + "learning_rate": 4.874733045353769e-06, + "loss": 0.6291, + "step": 1374 + }, + { + "epoch": 0.6501182033096927, + "grad_norm": 2.527071475982666, + "learning_rate": 4.874537978962463e-06, + "loss": 0.5565, + "step": 1375 + }, + { + "epoch": 0.6505910165484634, + "grad_norm": 2.722092628479004, + "learning_rate": 4.874342764719091e-06, + "loss": 0.5724, + "step": 1376 + }, + { + "epoch": 0.6510638297872341, + "grad_norm": 2.6342411041259766, + "learning_rate": 4.874147402635805e-06, + "loss": 0.6308, + 
"step": 1377 + }, + { + "epoch": 0.6515366430260048, + "grad_norm": 2.3850719928741455, + "learning_rate": 4.8739518927247695e-06, + "loss": 0.5692, + "step": 1378 + }, + { + "epoch": 0.6520094562647755, + "grad_norm": 2.9787259101867676, + "learning_rate": 4.873756234998161e-06, + "loss": 0.6953, + "step": 1379 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 2.634141683578491, + "learning_rate": 4.873560429468159e-06, + "loss": 0.6077, + "step": 1380 + }, + { + "epoch": 0.6529550827423168, + "grad_norm": 2.803046941757202, + "learning_rate": 4.873364476146958e-06, + "loss": 0.6657, + "step": 1381 + }, + { + "epoch": 0.6534278959810875, + "grad_norm": 2.762827157974243, + "learning_rate": 4.8731683750467574e-06, + "loss": 0.6061, + "step": 1382 + }, + { + "epoch": 0.6539007092198581, + "grad_norm": 2.6654391288757324, + "learning_rate": 4.872972126179768e-06, + "loss": 0.6387, + "step": 1383 + }, + { + "epoch": 0.6543735224586288, + "grad_norm": 2.4363625049591064, + "learning_rate": 4.872775729558209e-06, + "loss": 0.5623, + "step": 1384 + }, + { + "epoch": 0.6548463356973995, + "grad_norm": 2.528959035873413, + "learning_rate": 4.87257918519431e-06, + "loss": 0.5609, + "step": 1385 + }, + { + "epoch": 0.6553191489361702, + "grad_norm": 2.718383312225342, + "learning_rate": 4.872382493100309e-06, + "loss": 0.5575, + "step": 1386 + }, + { + "epoch": 0.6557919621749408, + "grad_norm": 2.660841226577759, + "learning_rate": 4.872185653288453e-06, + "loss": 0.6106, + "step": 1387 + }, + { + "epoch": 0.6562647754137115, + "grad_norm": 2.508753538131714, + "learning_rate": 4.871988665770997e-06, + "loss": 0.5705, + "step": 1388 + }, + { + "epoch": 0.6567375886524822, + "grad_norm": 2.5134334564208984, + "learning_rate": 4.871791530560208e-06, + "loss": 0.5592, + "step": 1389 + }, + { + "epoch": 0.6572104018912529, + "grad_norm": 2.7475597858428955, + "learning_rate": 4.871594247668361e-06, + "loss": 0.6277, + "step": 1390 + }, + { + "epoch": 0.6576832151300236, + "grad_norm": 2.793616533279419, + "learning_rate": 4.871396817107739e-06, + "loss": 0.595, + "step": 1391 + }, + { + "epoch": 0.6581560283687943, + "grad_norm": 2.8285086154937744, + "learning_rate": 4.871199238890635e-06, + "loss": 0.6094, + "step": 1392 + }, + { + "epoch": 0.658628841607565, + "grad_norm": 2.74124813079834, + "learning_rate": 4.871001513029352e-06, + "loss": 0.6296, + "step": 1393 + }, + { + "epoch": 0.6591016548463356, + "grad_norm": 2.761237621307373, + "learning_rate": 4.870803639536202e-06, + "loss": 0.5702, + "step": 1394 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 2.761038064956665, + "learning_rate": 4.870605618423504e-06, + "loss": 0.6195, + "step": 1395 + }, + { + "epoch": 0.660047281323877, + "grad_norm": 2.8812482357025146, + "learning_rate": 4.870407449703589e-06, + "loss": 0.616, + "step": 1396 + }, + { + "epoch": 0.6605200945626477, + "grad_norm": 2.9966578483581543, + "learning_rate": 4.870209133388797e-06, + "loss": 0.6547, + "step": 1397 + }, + { + "epoch": 0.6609929078014184, + "grad_norm": 2.7969017028808594, + "learning_rate": 4.870010669491474e-06, + "loss": 0.5762, + "step": 1398 + }, + { + "epoch": 0.6614657210401891, + "grad_norm": 2.557783842086792, + "learning_rate": 4.86981205802398e-06, + "loss": 0.6184, + "step": 1399 + }, + { + "epoch": 0.6619385342789598, + "grad_norm": 2.5393927097320557, + "learning_rate": 4.86961329899868e-06, + "loss": 0.5953, + "step": 1400 + }, + { + "epoch": 0.6624113475177305, + "grad_norm": 2.7745981216430664, + "learning_rate": 
4.86941439242795e-06, + "loss": 0.5967, + "step": 1401 + }, + { + "epoch": 0.6628841607565011, + "grad_norm": 2.650381326675415, + "learning_rate": 4.869215338324176e-06, + "loss": 0.5667, + "step": 1402 + }, + { + "epoch": 0.6633569739952718, + "grad_norm": 2.583169937133789, + "learning_rate": 4.869016136699751e-06, + "loss": 0.549, + "step": 1403 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 2.984978437423706, + "learning_rate": 4.868816787567079e-06, + "loss": 0.5931, + "step": 1404 + }, + { + "epoch": 0.6643026004728132, + "grad_norm": 3.1947181224823, + "learning_rate": 4.868617290938573e-06, + "loss": 0.5473, + "step": 1405 + }, + { + "epoch": 0.6647754137115839, + "grad_norm": 2.562927007675171, + "learning_rate": 4.868417646826654e-06, + "loss": 0.6878, + "step": 1406 + }, + { + "epoch": 0.6652482269503546, + "grad_norm": 2.8741261959075928, + "learning_rate": 4.868217855243754e-06, + "loss": 0.6312, + "step": 1407 + }, + { + "epoch": 0.6657210401891253, + "grad_norm": 2.9834797382354736, + "learning_rate": 4.868017916202312e-06, + "loss": 0.5624, + "step": 1408 + }, + { + "epoch": 0.6661938534278959, + "grad_norm": 2.6935982704162598, + "learning_rate": 4.8678178297147785e-06, + "loss": 0.5857, + "step": 1409 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.8200576305389404, + "learning_rate": 4.86761759579361e-06, + "loss": 0.6153, + "step": 1410 + }, + { + "epoch": 0.6671394799054373, + "grad_norm": 2.831425189971924, + "learning_rate": 4.867417214451276e-06, + "loss": 0.6495, + "step": 1411 + }, + { + "epoch": 0.667612293144208, + "grad_norm": 2.733565092086792, + "learning_rate": 4.867216685700253e-06, + "loss": 0.6036, + "step": 1412 + }, + { + "epoch": 0.6680851063829787, + "grad_norm": 3.0609400272369385, + "learning_rate": 4.867016009553027e-06, + "loss": 0.6773, + "step": 1413 + }, + { + "epoch": 0.6685579196217494, + "grad_norm": 2.665452241897583, + "learning_rate": 4.866815186022093e-06, + "loss": 0.6256, + "step": 1414 + }, + { + "epoch": 0.6690307328605201, + "grad_norm": 2.9480721950531006, + "learning_rate": 4.866614215119956e-06, + "loss": 0.535, + "step": 1415 + }, + { + "epoch": 0.6695035460992907, + "grad_norm": 2.5514180660247803, + "learning_rate": 4.866413096859128e-06, + "loss": 0.6588, + "step": 1416 + }, + { + "epoch": 0.6699763593380614, + "grad_norm": 3.3442373275756836, + "learning_rate": 4.866211831252134e-06, + "loss": 0.5754, + "step": 1417 + }, + { + "epoch": 0.6704491725768321, + "grad_norm": 2.521467685699463, + "learning_rate": 4.866010418311504e-06, + "loss": 0.5546, + "step": 1418 + }, + { + "epoch": 0.6709219858156028, + "grad_norm": 2.930706262588501, + "learning_rate": 4.865808858049781e-06, + "loss": 0.589, + "step": 1419 + }, + { + "epoch": 0.6713947990543735, + "grad_norm": 2.6298375129699707, + "learning_rate": 4.865607150479513e-06, + "loss": 0.5915, + "step": 1420 + }, + { + "epoch": 0.6718676122931442, + "grad_norm": 2.9554293155670166, + "learning_rate": 4.8654052956132615e-06, + "loss": 0.6654, + "step": 1421 + }, + { + "epoch": 0.6723404255319149, + "grad_norm": 3.2706902027130127, + "learning_rate": 4.865203293463593e-06, + "loss": 0.7115, + "step": 1422 + }, + { + "epoch": 0.6728132387706856, + "grad_norm": 3.041539430618286, + "learning_rate": 4.865001144043088e-06, + "loss": 0.5818, + "step": 1423 + }, + { + "epoch": 0.6732860520094562, + "grad_norm": 3.1314544677734375, + "learning_rate": 4.864798847364331e-06, + "loss": 0.5822, + "step": 1424 + }, + { + "epoch": 0.6737588652482269, + "grad_norm": 
2.5301461219787598, + "learning_rate": 4.86459640343992e-06, + "loss": 0.5525, + "step": 1425 + }, + { + "epoch": 0.6742316784869976, + "grad_norm": 2.809295892715454, + "learning_rate": 4.864393812282458e-06, + "loss": 0.6768, + "step": 1426 + }, + { + "epoch": 0.6747044917257683, + "grad_norm": 2.794664144515991, + "learning_rate": 4.864191073904562e-06, + "loss": 0.5793, + "step": 1427 + }, + { + "epoch": 0.675177304964539, + "grad_norm": 2.7771105766296387, + "learning_rate": 4.863988188318854e-06, + "loss": 0.6453, + "step": 1428 + }, + { + "epoch": 0.6756501182033097, + "grad_norm": 2.6431946754455566, + "learning_rate": 4.863785155537967e-06, + "loss": 0.5877, + "step": 1429 + }, + { + "epoch": 0.6761229314420804, + "grad_norm": 2.951353073120117, + "learning_rate": 4.863581975574544e-06, + "loss": 0.6793, + "step": 1430 + }, + { + "epoch": 0.676595744680851, + "grad_norm": 3.1336071491241455, + "learning_rate": 4.863378648441235e-06, + "loss": 0.6695, + "step": 1431 + }, + { + "epoch": 0.6770685579196217, + "grad_norm": 2.735982656478882, + "learning_rate": 4.8631751741507e-06, + "loss": 0.5239, + "step": 1432 + }, + { + "epoch": 0.6775413711583924, + "grad_norm": 2.7085206508636475, + "learning_rate": 4.862971552715611e-06, + "loss": 0.6837, + "step": 1433 + }, + { + "epoch": 0.6780141843971631, + "grad_norm": 3.136528730392456, + "learning_rate": 4.8627677841486436e-06, + "loss": 0.683, + "step": 1434 + }, + { + "epoch": 0.6784869976359338, + "grad_norm": 2.7879369258880615, + "learning_rate": 4.862563868462486e-06, + "loss": 0.608, + "step": 1435 + }, + { + "epoch": 0.6789598108747045, + "grad_norm": 2.7937729358673096, + "learning_rate": 4.862359805669837e-06, + "loss": 0.6131, + "step": 1436 + }, + { + "epoch": 0.6794326241134752, + "grad_norm": 2.5988364219665527, + "learning_rate": 4.862155595783401e-06, + "loss": 0.6303, + "step": 1437 + }, + { + "epoch": 0.6799054373522458, + "grad_norm": 3.251070499420166, + "learning_rate": 4.861951238815894e-06, + "loss": 0.7246, + "step": 1438 + }, + { + "epoch": 0.6803782505910165, + "grad_norm": 2.646759271621704, + "learning_rate": 4.861746734780039e-06, + "loss": 0.6313, + "step": 1439 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 2.773866891860962, + "learning_rate": 4.861542083688573e-06, + "loss": 0.6463, + "step": 1440 + }, + { + "epoch": 0.6813238770685579, + "grad_norm": 2.759965658187866, + "learning_rate": 4.861337285554235e-06, + "loss": 0.5428, + "step": 1441 + }, + { + "epoch": 0.6817966903073286, + "grad_norm": 3.3250818252563477, + "learning_rate": 4.861132340389779e-06, + "loss": 0.6522, + "step": 1442 + }, + { + "epoch": 0.6822695035460993, + "grad_norm": 2.661797523498535, + "learning_rate": 4.860927248207965e-06, + "loss": 0.5871, + "step": 1443 + }, + { + "epoch": 0.68274231678487, + "grad_norm": 2.706289052963257, + "learning_rate": 4.860722009021563e-06, + "loss": 0.6651, + "step": 1444 + }, + { + "epoch": 0.6832151300236406, + "grad_norm": 2.8459298610687256, + "learning_rate": 4.860516622843354e-06, + "loss": 0.5827, + "step": 1445 + }, + { + "epoch": 0.6836879432624113, + "grad_norm": 3.1041831970214844, + "learning_rate": 4.860311089686125e-06, + "loss": 0.6727, + "step": 1446 + }, + { + "epoch": 0.684160756501182, + "grad_norm": 2.9382801055908203, + "learning_rate": 4.8601054095626746e-06, + "loss": 0.6002, + "step": 1447 + }, + { + "epoch": 0.6846335697399527, + "grad_norm": 2.782475471496582, + "learning_rate": 4.859899582485808e-06, + "loss": 0.6951, + "step": 1448 + }, + { + "epoch": 
0.6851063829787234, + "grad_norm": 3.313894510269165, + "learning_rate": 4.859693608468343e-06, + "loss": 0.6363, + "step": 1449 + }, + { + "epoch": 0.6855791962174941, + "grad_norm": 3.1639695167541504, + "learning_rate": 4.8594874875231045e-06, + "loss": 0.7002, + "step": 1450 + }, + { + "epoch": 0.6860520094562648, + "grad_norm": 2.6762218475341797, + "learning_rate": 4.859281219662926e-06, + "loss": 0.6246, + "step": 1451 + }, + { + "epoch": 0.6865248226950355, + "grad_norm": 2.8368663787841797, + "learning_rate": 4.85907480490065e-06, + "loss": 0.5906, + "step": 1452 + }, + { + "epoch": 0.6869976359338061, + "grad_norm": 2.887373208999634, + "learning_rate": 4.858868243249131e-06, + "loss": 0.5931, + "step": 1453 + }, + { + "epoch": 0.6874704491725768, + "grad_norm": 2.8115322589874268, + "learning_rate": 4.858661534721229e-06, + "loss": 0.6337, + "step": 1454 + }, + { + "epoch": 0.6879432624113475, + "grad_norm": 2.8470499515533447, + "learning_rate": 4.8584546793298174e-06, + "loss": 0.632, + "step": 1455 + }, + { + "epoch": 0.6884160756501182, + "grad_norm": 2.8229613304138184, + "learning_rate": 4.8582476770877725e-06, + "loss": 0.6494, + "step": 1456 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 2.4235479831695557, + "learning_rate": 4.858040528007987e-06, + "loss": 0.5709, + "step": 1457 + }, + { + "epoch": 0.6893617021276596, + "grad_norm": 2.9348199367523193, + "learning_rate": 4.857833232103356e-06, + "loss": 0.5404, + "step": 1458 + }, + { + "epoch": 0.6898345153664303, + "grad_norm": 2.8274219036102295, + "learning_rate": 4.857625789386789e-06, + "loss": 0.701, + "step": 1459 + }, + { + "epoch": 0.6903073286052009, + "grad_norm": 3.136929988861084, + "learning_rate": 4.857418199871203e-06, + "loss": 0.6971, + "step": 1460 + }, + { + "epoch": 0.6907801418439716, + "grad_norm": 2.8987185955047607, + "learning_rate": 4.8572104635695214e-06, + "loss": 0.6613, + "step": 1461 + }, + { + "epoch": 0.6912529550827423, + "grad_norm": 2.5073442459106445, + "learning_rate": 4.857002580494681e-06, + "loss": 0.6032, + "step": 1462 + }, + { + "epoch": 0.691725768321513, + "grad_norm": 2.7019522190093994, + "learning_rate": 4.856794550659625e-06, + "loss": 0.567, + "step": 1463 + }, + { + "epoch": 0.6921985815602837, + "grad_norm": 2.4795594215393066, + "learning_rate": 4.8565863740773054e-06, + "loss": 0.5777, + "step": 1464 + }, + { + "epoch": 0.6926713947990544, + "grad_norm": 3.032506227493286, + "learning_rate": 4.856378050760687e-06, + "loss": 0.607, + "step": 1465 + }, + { + "epoch": 0.6931442080378251, + "grad_norm": 3.052091121673584, + "learning_rate": 4.85616958072274e-06, + "loss": 0.591, + "step": 1466 + }, + { + "epoch": 0.6936170212765957, + "grad_norm": 2.704831838607788, + "learning_rate": 4.855960963976443e-06, + "loss": 0.6528, + "step": 1467 + }, + { + "epoch": 0.6940898345153664, + "grad_norm": 2.680995225906372, + "learning_rate": 4.855752200534788e-06, + "loss": 0.6294, + "step": 1468 + }, + { + "epoch": 0.6945626477541371, + "grad_norm": 2.3948659896850586, + "learning_rate": 4.855543290410774e-06, + "loss": 0.6091, + "step": 1469 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 2.6407411098480225, + "learning_rate": 4.855334233617407e-06, + "loss": 0.5572, + "step": 1470 + }, + { + "epoch": 0.6955082742316785, + "grad_norm": 2.5526835918426514, + "learning_rate": 4.8551250301677064e-06, + "loss": 0.5432, + "step": 1471 + }, + { + "epoch": 0.6959810874704492, + "grad_norm": 3.1237430572509766, + "learning_rate": 4.8549156800746965e-06, + "loss": 
0.5944, + "step": 1472 + }, + { + "epoch": 0.6964539007092199, + "grad_norm": 2.8112540245056152, + "learning_rate": 4.854706183351412e-06, + "loss": 0.604, + "step": 1473 + }, + { + "epoch": 0.6969267139479906, + "grad_norm": 2.664644479751587, + "learning_rate": 4.8544965400109e-06, + "loss": 0.5647, + "step": 1474 + }, + { + "epoch": 0.6973995271867612, + "grad_norm": 3.26310133934021, + "learning_rate": 4.854286750066212e-06, + "loss": 0.6999, + "step": 1475 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 2.9717442989349365, + "learning_rate": 4.8540768135304115e-06, + "loss": 0.6655, + "step": 1476 + }, + { + "epoch": 0.6983451536643026, + "grad_norm": 2.5302982330322266, + "learning_rate": 4.85386673041657e-06, + "loss": 0.6384, + "step": 1477 + }, + { + "epoch": 0.6988179669030733, + "grad_norm": 2.864877700805664, + "learning_rate": 4.853656500737769e-06, + "loss": 0.6834, + "step": 1478 + }, + { + "epoch": 0.699290780141844, + "grad_norm": 2.5522031784057617, + "learning_rate": 4.853446124507098e-06, + "loss": 0.5929, + "step": 1479 + }, + { + "epoch": 0.6997635933806147, + "grad_norm": 3.096477746963501, + "learning_rate": 4.853235601737656e-06, + "loss": 0.5737, + "step": 1480 + }, + { + "epoch": 0.7002364066193854, + "grad_norm": 2.884779214859009, + "learning_rate": 4.853024932442552e-06, + "loss": 0.6362, + "step": 1481 + }, + { + "epoch": 0.700709219858156, + "grad_norm": 3.368558406829834, + "learning_rate": 4.852814116634903e-06, + "loss": 0.6721, + "step": 1482 + }, + { + "epoch": 0.7011820330969267, + "grad_norm": 2.742414951324463, + "learning_rate": 4.852603154327837e-06, + "loss": 0.6212, + "step": 1483 + }, + { + "epoch": 0.7016548463356974, + "grad_norm": 2.53454852104187, + "learning_rate": 4.8523920455344864e-06, + "loss": 0.6675, + "step": 1484 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 2.9354238510131836, + "learning_rate": 4.852180790267999e-06, + "loss": 0.6692, + "step": 1485 + }, + { + "epoch": 0.7026004728132388, + "grad_norm": 2.585070848464966, + "learning_rate": 4.8519693885415274e-06, + "loss": 0.6215, + "step": 1486 + }, + { + "epoch": 0.7030732860520095, + "grad_norm": 2.9047999382019043, + "learning_rate": 4.851757840368235e-06, + "loss": 0.6231, + "step": 1487 + }, + { + "epoch": 0.7035460992907802, + "grad_norm": 3.0930933952331543, + "learning_rate": 4.851546145761295e-06, + "loss": 0.7267, + "step": 1488 + }, + { + "epoch": 0.7040189125295508, + "grad_norm": 3.0224719047546387, + "learning_rate": 4.8513343047338875e-06, + "loss": 0.6293, + "step": 1489 + }, + { + "epoch": 0.7044917257683215, + "grad_norm": 2.5758471488952637, + "learning_rate": 4.851122317299203e-06, + "loss": 0.5855, + "step": 1490 + }, + { + "epoch": 0.7049645390070922, + "grad_norm": 2.579272508621216, + "learning_rate": 4.850910183470441e-06, + "loss": 0.582, + "step": 1491 + }, + { + "epoch": 0.7054373522458629, + "grad_norm": 2.8148300647735596, + "learning_rate": 4.85069790326081e-06, + "loss": 0.6396, + "step": 1492 + }, + { + "epoch": 0.7059101654846336, + "grad_norm": 2.6380527019500732, + "learning_rate": 4.850485476683528e-06, + "loss": 0.6114, + "step": 1493 + }, + { + "epoch": 0.7063829787234043, + "grad_norm": 2.7736263275146484, + "learning_rate": 4.850272903751823e-06, + "loss": 0.6683, + "step": 1494 + }, + { + "epoch": 0.706855791962175, + "grad_norm": 3.1958179473876953, + "learning_rate": 4.8500601844789285e-06, + "loss": 0.6265, + "step": 1495 + }, + { + "epoch": 0.7073286052009456, + "grad_norm": 3.783212423324585, + "learning_rate": 
4.8498473188780916e-06, + "loss": 0.6078, + "step": 1496 + }, + { + "epoch": 0.7078014184397163, + "grad_norm": 2.6656646728515625, + "learning_rate": 4.849634306962566e-06, + "loss": 0.5756, + "step": 1497 + }, + { + "epoch": 0.708274231678487, + "grad_norm": 2.757141590118408, + "learning_rate": 4.849421148745615e-06, + "loss": 0.5596, + "step": 1498 + }, + { + "epoch": 0.7087470449172577, + "grad_norm": 3.0391886234283447, + "learning_rate": 4.849207844240511e-06, + "loss": 0.5293, + "step": 1499 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 2.981912851333618, + "learning_rate": 4.848994393460535e-06, + "loss": 0.598, + "step": 1500 + }, + { + "epoch": 0.7096926713947991, + "grad_norm": 2.5470798015594482, + "learning_rate": 4.848780796418978e-06, + "loss": 0.6266, + "step": 1501 + }, + { + "epoch": 0.7101654846335698, + "grad_norm": 2.8394415378570557, + "learning_rate": 4.8485670531291415e-06, + "loss": 0.6844, + "step": 1502 + }, + { + "epoch": 0.7106382978723405, + "grad_norm": 3.2023508548736572, + "learning_rate": 4.848353163604331e-06, + "loss": 0.6134, + "step": 1503 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 2.98245906829834, + "learning_rate": 4.848139127857867e-06, + "loss": 0.7084, + "step": 1504 + }, + { + "epoch": 0.7115839243498818, + "grad_norm": 2.5917441844940186, + "learning_rate": 4.847924945903076e-06, + "loss": 0.5676, + "step": 1505 + }, + { + "epoch": 0.7120567375886525, + "grad_norm": 2.8736681938171387, + "learning_rate": 4.847710617753294e-06, + "loss": 0.6304, + "step": 1506 + }, + { + "epoch": 0.7125295508274232, + "grad_norm": 2.7832682132720947, + "learning_rate": 4.847496143421866e-06, + "loss": 0.5705, + "step": 1507 + }, + { + "epoch": 0.7130023640661939, + "grad_norm": 2.480560779571533, + "learning_rate": 4.847281522922147e-06, + "loss": 0.5595, + "step": 1508 + }, + { + "epoch": 0.7134751773049646, + "grad_norm": 2.357675313949585, + "learning_rate": 4.847066756267499e-06, + "loss": 0.5065, + "step": 1509 + }, + { + "epoch": 0.7139479905437353, + "grad_norm": 2.632669448852539, + "learning_rate": 4.846851843471296e-06, + "loss": 0.6949, + "step": 1510 + }, + { + "epoch": 0.7144208037825059, + "grad_norm": 2.7691073417663574, + "learning_rate": 4.84663678454692e-06, + "loss": 0.6638, + "step": 1511 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 2.5647685527801514, + "learning_rate": 4.846421579507761e-06, + "loss": 0.6098, + "step": 1512 + }, + { + "epoch": 0.7153664302600473, + "grad_norm": 2.476701021194458, + "learning_rate": 4.846206228367218e-06, + "loss": 0.592, + "step": 1513 + }, + { + "epoch": 0.715839243498818, + "grad_norm": 2.805727958679199, + "learning_rate": 4.845990731138702e-06, + "loss": 0.5466, + "step": 1514 + }, + { + "epoch": 0.7163120567375887, + "grad_norm": 2.551392078399658, + "learning_rate": 4.84577508783563e-06, + "loss": 0.6039, + "step": 1515 + }, + { + "epoch": 0.7167848699763594, + "grad_norm": 2.6861350536346436, + "learning_rate": 4.845559298471429e-06, + "loss": 0.6427, + "step": 1516 + }, + { + "epoch": 0.7172576832151301, + "grad_norm": 3.1908371448516846, + "learning_rate": 4.845343363059535e-06, + "loss": 0.5447, + "step": 1517 + }, + { + "epoch": 0.7177304964539007, + "grad_norm": 2.9021761417388916, + "learning_rate": 4.845127281613394e-06, + "loss": 0.5836, + "step": 1518 + }, + { + "epoch": 0.7182033096926714, + "grad_norm": 2.476670742034912, + "learning_rate": 4.844911054146461e-06, + "loss": 0.5863, + "step": 1519 + }, + { + "epoch": 0.7186761229314421, + "grad_norm": 
2.662935495376587, + "learning_rate": 4.844694680672198e-06, + "loss": 0.5678, + "step": 1520 + }, + { + "epoch": 0.7191489361702128, + "grad_norm": 2.677896738052368, + "learning_rate": 4.844478161204079e-06, + "loss": 0.6195, + "step": 1521 + }, + { + "epoch": 0.7196217494089835, + "grad_norm": 2.781921863555908, + "learning_rate": 4.844261495755585e-06, + "loss": 0.643, + "step": 1522 + }, + { + "epoch": 0.7200945626477542, + "grad_norm": 3.0157392024993896, + "learning_rate": 4.844044684340206e-06, + "loss": 0.7559, + "step": 1523 + }, + { + "epoch": 0.7205673758865249, + "grad_norm": 2.8109354972839355, + "learning_rate": 4.843827726971444e-06, + "loss": 0.6264, + "step": 1524 + }, + { + "epoch": 0.7210401891252955, + "grad_norm": 3.0953569412231445, + "learning_rate": 4.8436106236628064e-06, + "loss": 0.6429, + "step": 1525 + }, + { + "epoch": 0.7215130023640662, + "grad_norm": 2.6850643157958984, + "learning_rate": 4.843393374427812e-06, + "loss": 0.6598, + "step": 1526 + }, + { + "epoch": 0.7219858156028369, + "grad_norm": 3.043480634689331, + "learning_rate": 4.8431759792799874e-06, + "loss": 0.6331, + "step": 1527 + }, + { + "epoch": 0.7224586288416076, + "grad_norm": 2.723870038986206, + "learning_rate": 4.842958438232868e-06, + "loss": 0.6259, + "step": 1528 + }, + { + "epoch": 0.7229314420803783, + "grad_norm": 2.822492837905884, + "learning_rate": 4.842740751300002e-06, + "loss": 0.6554, + "step": 1529 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.7866315841674805, + "learning_rate": 4.842522918494941e-06, + "loss": 0.6991, + "step": 1530 + }, + { + "epoch": 0.7238770685579197, + "grad_norm": 2.8881826400756836, + "learning_rate": 4.84230493983125e-06, + "loss": 0.5876, + "step": 1531 + }, + { + "epoch": 0.7243498817966904, + "grad_norm": 2.7456939220428467, + "learning_rate": 4.8420868153225e-06, + "loss": 0.6188, + "step": 1532 + }, + { + "epoch": 0.724822695035461, + "grad_norm": 3.0257532596588135, + "learning_rate": 4.841868544982274e-06, + "loss": 0.63, + "step": 1533 + }, + { + "epoch": 0.7252955082742317, + "grad_norm": 3.1581954956054688, + "learning_rate": 4.841650128824164e-06, + "loss": 0.7214, + "step": 1534 + }, + { + "epoch": 0.7257683215130024, + "grad_norm": 2.9174306392669678, + "learning_rate": 4.841431566861767e-06, + "loss": 0.704, + "step": 1535 + }, + { + "epoch": 0.7262411347517731, + "grad_norm": 2.5019054412841797, + "learning_rate": 4.8412128591086935e-06, + "loss": 0.6298, + "step": 1536 + }, + { + "epoch": 0.7267139479905438, + "grad_norm": 2.724285125732422, + "learning_rate": 4.840994005578562e-06, + "loss": 0.6289, + "step": 1537 + }, + { + "epoch": 0.7271867612293145, + "grad_norm": 2.5882341861724854, + "learning_rate": 4.840775006284998e-06, + "loss": 0.6355, + "step": 1538 + }, + { + "epoch": 0.7276595744680852, + "grad_norm": 3.1281991004943848, + "learning_rate": 4.840555861241638e-06, + "loss": 0.5551, + "step": 1539 + }, + { + "epoch": 0.7281323877068558, + "grad_norm": 2.6064817905426025, + "learning_rate": 4.840336570462127e-06, + "loss": 0.5543, + "step": 1540 + }, + { + "epoch": 0.7286052009456265, + "grad_norm": 2.67112398147583, + "learning_rate": 4.840117133960122e-06, + "loss": 0.6044, + "step": 1541 + }, + { + "epoch": 0.7290780141843972, + "grad_norm": 2.838022232055664, + "learning_rate": 4.839897551749282e-06, + "loss": 0.6814, + "step": 1542 + }, + { + "epoch": 0.7295508274231679, + "grad_norm": 2.8897151947021484, + "learning_rate": 4.839677823843283e-06, + "loss": 0.593, + "step": 1543 + }, + { + "epoch": 
0.7300236406619386, + "grad_norm": 2.9238014221191406, + "learning_rate": 4.839457950255805e-06, + "loss": 0.5544, + "step": 1544 + }, + { + "epoch": 0.7304964539007093, + "grad_norm": 3.016876459121704, + "learning_rate": 4.839237931000538e-06, + "loss": 0.6099, + "step": 1545 + }, + { + "epoch": 0.7309692671394799, + "grad_norm": 2.9415392875671387, + "learning_rate": 4.839017766091182e-06, + "loss": 0.6413, + "step": 1546 + }, + { + "epoch": 0.7314420803782505, + "grad_norm": 2.658067226409912, + "learning_rate": 4.838797455541446e-06, + "loss": 0.6534, + "step": 1547 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 2.460358142852783, + "learning_rate": 4.838576999365049e-06, + "loss": 0.5307, + "step": 1548 + }, + { + "epoch": 0.7323877068557919, + "grad_norm": 2.5818674564361572, + "learning_rate": 4.838356397575716e-06, + "loss": 0.6265, + "step": 1549 + }, + { + "epoch": 0.7328605200945626, + "grad_norm": 3.009197473526001, + "learning_rate": 4.838135650187183e-06, + "loss": 0.6957, + "step": 1550 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 2.738543748855591, + "learning_rate": 4.837914757213196e-06, + "loss": 0.646, + "step": 1551 + }, + { + "epoch": 0.733806146572104, + "grad_norm": 2.8208494186401367, + "learning_rate": 4.837693718667508e-06, + "loss": 0.5936, + "step": 1552 + }, + { + "epoch": 0.7342789598108747, + "grad_norm": 3.1574649810791016, + "learning_rate": 4.837472534563883e-06, + "loss": 0.6455, + "step": 1553 + }, + { + "epoch": 0.7347517730496453, + "grad_norm": 2.6737420558929443, + "learning_rate": 4.837251204916093e-06, + "loss": 0.5921, + "step": 1554 + }, + { + "epoch": 0.735224586288416, + "grad_norm": 2.424983024597168, + "learning_rate": 4.837029729737918e-06, + "loss": 0.6346, + "step": 1555 + }, + { + "epoch": 0.7356973995271867, + "grad_norm": 2.5163493156433105, + "learning_rate": 4.836808109043151e-06, + "loss": 0.6061, + "step": 1556 + }, + { + "epoch": 0.7361702127659574, + "grad_norm": 2.8377044200897217, + "learning_rate": 4.836586342845588e-06, + "loss": 0.611, + "step": 1557 + }, + { + "epoch": 0.7366430260047281, + "grad_norm": 2.5929181575775146, + "learning_rate": 4.83636443115904e-06, + "loss": 0.5496, + "step": 1558 + }, + { + "epoch": 0.7371158392434988, + "grad_norm": 2.5017223358154297, + "learning_rate": 4.836142373997323e-06, + "loss": 0.6235, + "step": 1559 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 2.822500228881836, + "learning_rate": 4.835920171374265e-06, + "loss": 0.6147, + "step": 1560 + }, + { + "epoch": 0.7380614657210401, + "grad_norm": 2.7234230041503906, + "learning_rate": 4.8356978233037e-06, + "loss": 0.6228, + "step": 1561 + }, + { + "epoch": 0.7385342789598108, + "grad_norm": 2.9565515518188477, + "learning_rate": 4.835475329799472e-06, + "loss": 0.5728, + "step": 1562 + }, + { + "epoch": 0.7390070921985815, + "grad_norm": 2.4356038570404053, + "learning_rate": 4.835252690875438e-06, + "loss": 0.6723, + "step": 1563 + }, + { + "epoch": 0.7394799054373522, + "grad_norm": 2.765913248062134, + "learning_rate": 4.835029906545458e-06, + "loss": 0.5805, + "step": 1564 + }, + { + "epoch": 0.7399527186761229, + "grad_norm": 2.4481914043426514, + "learning_rate": 4.834806976823405e-06, + "loss": 0.599, + "step": 1565 + }, + { + "epoch": 0.7404255319148936, + "grad_norm": 2.620779514312744, + "learning_rate": 4.834583901723158e-06, + "loss": 0.63, + "step": 1566 + }, + { + "epoch": 0.7408983451536643, + "grad_norm": 2.654426097869873, + "learning_rate": 4.83436068125861e-06, + "loss": 0.6544, + "step": 
1567 + }, + { + "epoch": 0.741371158392435, + "grad_norm": 2.589623212814331, + "learning_rate": 4.834137315443656e-06, + "loss": 0.5596, + "step": 1568 + }, + { + "epoch": 0.7418439716312056, + "grad_norm": 2.572883129119873, + "learning_rate": 4.833913804292209e-06, + "loss": 0.5974, + "step": 1569 + }, + { + "epoch": 0.7423167848699763, + "grad_norm": 2.8744914531707764, + "learning_rate": 4.833690147818181e-06, + "loss": 0.5364, + "step": 1570 + }, + { + "epoch": 0.742789598108747, + "grad_norm": 2.9800851345062256, + "learning_rate": 4.833466346035502e-06, + "loss": 0.6287, + "step": 1571 + }, + { + "epoch": 0.7432624113475177, + "grad_norm": 2.627784490585327, + "learning_rate": 4.833242398958105e-06, + "loss": 0.621, + "step": 1572 + }, + { + "epoch": 0.7437352245862884, + "grad_norm": 2.5187721252441406, + "learning_rate": 4.833018306599933e-06, + "loss": 0.5901, + "step": 1573 + }, + { + "epoch": 0.7442080378250591, + "grad_norm": 2.4843688011169434, + "learning_rate": 4.832794068974944e-06, + "loss": 0.6336, + "step": 1574 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 2.774911880493164, + "learning_rate": 4.832569686097096e-06, + "loss": 0.6091, + "step": 1575 + }, + { + "epoch": 0.7451536643026004, + "grad_norm": 3.2562527656555176, + "learning_rate": 4.8323451579803615e-06, + "loss": 0.7686, + "step": 1576 + }, + { + "epoch": 0.7456264775413711, + "grad_norm": 2.799570083618164, + "learning_rate": 4.832120484638721e-06, + "loss": 0.6233, + "step": 1577 + }, + { + "epoch": 0.7460992907801418, + "grad_norm": 2.661893367767334, + "learning_rate": 4.831895666086164e-06, + "loss": 0.5841, + "step": 1578 + }, + { + "epoch": 0.7465721040189125, + "grad_norm": 3.0382652282714844, + "learning_rate": 4.831670702336689e-06, + "loss": 0.5769, + "step": 1579 + }, + { + "epoch": 0.7470449172576832, + "grad_norm": 2.676398515701294, + "learning_rate": 4.831445593404304e-06, + "loss": 0.619, + "step": 1580 + }, + { + "epoch": 0.7475177304964539, + "grad_norm": 2.717916965484619, + "learning_rate": 4.831220339303024e-06, + "loss": 0.5787, + "step": 1581 + }, + { + "epoch": 0.7479905437352246, + "grad_norm": 2.3918066024780273, + "learning_rate": 4.830994940046876e-06, + "loss": 0.5108, + "step": 1582 + }, + { + "epoch": 0.7484633569739952, + "grad_norm": 2.709144115447998, + "learning_rate": 4.830769395649895e-06, + "loss": 0.6875, + "step": 1583 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 2.8711116313934326, + "learning_rate": 4.830543706126123e-06, + "loss": 0.6745, + "step": 1584 + }, + { + "epoch": 0.7494089834515366, + "grad_norm": 2.612339496612549, + "learning_rate": 4.830317871489614e-06, + "loss": 0.5738, + "step": 1585 + }, + { + "epoch": 0.7498817966903073, + "grad_norm": 2.4355857372283936, + "learning_rate": 4.830091891754429e-06, + "loss": 0.5907, + "step": 1586 + }, + { + "epoch": 0.750354609929078, + "grad_norm": 2.676051378250122, + "learning_rate": 4.829865766934638e-06, + "loss": 0.6628, + "step": 1587 + }, + { + "epoch": 0.7508274231678487, + "grad_norm": 2.66489839553833, + "learning_rate": 4.829639497044323e-06, + "loss": 0.5984, + "step": 1588 + }, + { + "epoch": 0.7513002364066194, + "grad_norm": 2.5358035564422607, + "learning_rate": 4.829413082097572e-06, + "loss": 0.5867, + "step": 1589 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 2.6530144214630127, + "learning_rate": 4.8291865221084815e-06, + "loss": 0.5917, + "step": 1590 + }, + { + "epoch": 0.7522458628841607, + "grad_norm": 2.5160958766937256, + "learning_rate": 4.82895981709116e-06, + 
"loss": 0.6347, + "step": 1591 + }, + { + "epoch": 0.7527186761229314, + "grad_norm": 2.61592698097229, + "learning_rate": 4.8287329670597225e-06, + "loss": 0.5472, + "step": 1592 + }, + { + "epoch": 0.7531914893617021, + "grad_norm": 2.7528622150421143, + "learning_rate": 4.828505972028296e-06, + "loss": 0.5842, + "step": 1593 + }, + { + "epoch": 0.7536643026004728, + "grad_norm": 2.8154072761535645, + "learning_rate": 4.828278832011011e-06, + "loss": 0.5757, + "step": 1594 + }, + { + "epoch": 0.7541371158392435, + "grad_norm": 3.118515729904175, + "learning_rate": 4.828051547022013e-06, + "loss": 0.6472, + "step": 1595 + }, + { + "epoch": 0.7546099290780142, + "grad_norm": 2.452033758163452, + "learning_rate": 4.827824117075453e-06, + "loss": 0.5571, + "step": 1596 + }, + { + "epoch": 0.7550827423167848, + "grad_norm": 2.984388828277588, + "learning_rate": 4.827596542185492e-06, + "loss": 0.6656, + "step": 1597 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 2.61356782913208, + "learning_rate": 4.8273688223663014e-06, + "loss": 0.6444, + "step": 1598 + }, + { + "epoch": 0.7560283687943262, + "grad_norm": 2.8967196941375732, + "learning_rate": 4.8271409576320595e-06, + "loss": 0.6457, + "step": 1599 + }, + { + "epoch": 0.7565011820330969, + "grad_norm": 2.852367639541626, + "learning_rate": 4.826912947996954e-06, + "loss": 0.5629, + "step": 1600 + }, + { + "epoch": 0.7569739952718676, + "grad_norm": 2.905280590057373, + "learning_rate": 4.826684793475182e-06, + "loss": 0.6245, + "step": 1601 + }, + { + "epoch": 0.7574468085106383, + "grad_norm": 2.6156530380249023, + "learning_rate": 4.826456494080951e-06, + "loss": 0.5869, + "step": 1602 + }, + { + "epoch": 0.757919621749409, + "grad_norm": 2.6490228176116943, + "learning_rate": 4.826228049828475e-06, + "loss": 0.5461, + "step": 1603 + }, + { + "epoch": 0.7583924349881797, + "grad_norm": 2.9626693725585938, + "learning_rate": 4.825999460731978e-06, + "loss": 0.6842, + "step": 1604 + }, + { + "epoch": 0.7588652482269503, + "grad_norm": 2.6866023540496826, + "learning_rate": 4.825770726805695e-06, + "loss": 0.5726, + "step": 1605 + }, + { + "epoch": 0.759338061465721, + "grad_norm": 2.5525858402252197, + "learning_rate": 4.825541848063866e-06, + "loss": 0.6061, + "step": 1606 + }, + { + "epoch": 0.7598108747044917, + "grad_norm": 2.703977584838867, + "learning_rate": 4.825312824520743e-06, + "loss": 0.6726, + "step": 1607 + }, + { + "epoch": 0.7602836879432624, + "grad_norm": 2.856534957885742, + "learning_rate": 4.825083656190588e-06, + "loss": 0.625, + "step": 1608 + }, + { + "epoch": 0.7607565011820331, + "grad_norm": 2.8564887046813965, + "learning_rate": 4.824854343087668e-06, + "loss": 0.7251, + "step": 1609 + }, + { + "epoch": 0.7612293144208038, + "grad_norm": 2.327650308609009, + "learning_rate": 4.824624885226262e-06, + "loss": 0.526, + "step": 1610 + }, + { + "epoch": 0.7617021276595745, + "grad_norm": 3.0025737285614014, + "learning_rate": 4.824395282620659e-06, + "loss": 0.6043, + "step": 1611 + }, + { + "epoch": 0.7621749408983451, + "grad_norm": 2.5441737174987793, + "learning_rate": 4.824165535285152e-06, + "loss": 0.6276, + "step": 1612 + }, + { + "epoch": 0.7626477541371158, + "grad_norm": 2.4177372455596924, + "learning_rate": 4.823935643234049e-06, + "loss": 0.6419, + "step": 1613 + }, + { + "epoch": 0.7631205673758865, + "grad_norm": 2.9210550785064697, + "learning_rate": 4.823705606481664e-06, + "loss": 0.5663, + "step": 1614 + }, + { + "epoch": 0.7635933806146572, + "grad_norm": 2.6353724002838135, + 
"learning_rate": 4.82347542504232e-06, + "loss": 0.5669, + "step": 1615 + }, + { + "epoch": 0.7640661938534279, + "grad_norm": 2.419081926345825, + "learning_rate": 4.823245098930349e-06, + "loss": 0.5777, + "step": 1616 + }, + { + "epoch": 0.7645390070921986, + "grad_norm": 2.5077571868896484, + "learning_rate": 4.823014628160093e-06, + "loss": 0.5924, + "step": 1617 + }, + { + "epoch": 0.7650118203309693, + "grad_norm": 2.816056251525879, + "learning_rate": 4.822784012745902e-06, + "loss": 0.7273, + "step": 1618 + }, + { + "epoch": 0.76548463356974, + "grad_norm": 2.7163147926330566, + "learning_rate": 4.8225532527021366e-06, + "loss": 0.5545, + "step": 1619 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 2.4784302711486816, + "learning_rate": 4.822322348043164e-06, + "loss": 0.556, + "step": 1620 + }, + { + "epoch": 0.7664302600472813, + "grad_norm": 2.712467670440674, + "learning_rate": 4.822091298783361e-06, + "loss": 0.6501, + "step": 1621 + }, + { + "epoch": 0.766903073286052, + "grad_norm": 2.7217724323272705, + "learning_rate": 4.821860104937115e-06, + "loss": 0.5989, + "step": 1622 + }, + { + "epoch": 0.7673758865248227, + "grad_norm": 2.5622854232788086, + "learning_rate": 4.821628766518821e-06, + "loss": 0.5263, + "step": 1623 + }, + { + "epoch": 0.7678486997635934, + "grad_norm": 3.230923891067505, + "learning_rate": 4.821397283542884e-06, + "loss": 0.6707, + "step": 1624 + }, + { + "epoch": 0.7683215130023641, + "grad_norm": 2.37929105758667, + "learning_rate": 4.821165656023718e-06, + "loss": 0.6124, + "step": 1625 + }, + { + "epoch": 0.7687943262411348, + "grad_norm": 2.9811325073242188, + "learning_rate": 4.820933883975745e-06, + "loss": 0.6435, + "step": 1626 + }, + { + "epoch": 0.7692671394799054, + "grad_norm": 2.887380838394165, + "learning_rate": 4.820701967413395e-06, + "loss": 0.621, + "step": 1627 + }, + { + "epoch": 0.7697399527186761, + "grad_norm": 2.6762876510620117, + "learning_rate": 4.820469906351109e-06, + "loss": 0.5713, + "step": 1628 + }, + { + "epoch": 0.7702127659574468, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.820237700803337e-06, + "loss": 0.6136, + "step": 1629 + }, + { + "epoch": 0.7706855791962175, + "grad_norm": 2.7244746685028076, + "learning_rate": 4.820005350784539e-06, + "loss": 0.5816, + "step": 1630 + }, + { + "epoch": 0.7711583924349882, + "grad_norm": 2.9293999671936035, + "learning_rate": 4.8197728563091795e-06, + "loss": 0.6649, + "step": 1631 + }, + { + "epoch": 0.7716312056737589, + "grad_norm": 2.4402127265930176, + "learning_rate": 4.819540217391736e-06, + "loss": 0.6481, + "step": 1632 + }, + { + "epoch": 0.7721040189125296, + "grad_norm": 3.083941698074341, + "learning_rate": 4.819307434046694e-06, + "loss": 0.6951, + "step": 1633 + }, + { + "epoch": 0.7725768321513002, + "grad_norm": 2.544952392578125, + "learning_rate": 4.819074506288548e-06, + "loss": 0.539, + "step": 1634 + }, + { + "epoch": 0.7730496453900709, + "grad_norm": 2.7791268825531006, + "learning_rate": 4.818841434131801e-06, + "loss": 0.5827, + "step": 1635 + }, + { + "epoch": 0.7735224586288416, + "grad_norm": 2.7349796295166016, + "learning_rate": 4.818608217590967e-06, + "loss": 0.5584, + "step": 1636 + }, + { + "epoch": 0.7739952718676123, + "grad_norm": 2.637652635574341, + "learning_rate": 4.818374856680565e-06, + "loss": 0.6386, + "step": 1637 + }, + { + "epoch": 0.774468085106383, + "grad_norm": 2.9821584224700928, + "learning_rate": 4.818141351415127e-06, + "loss": 0.6734, + "step": 1638 + }, + { + "epoch": 0.7749408983451537, + 
"grad_norm": 2.992938995361328, + "learning_rate": 4.817907701809192e-06, + "loss": 0.5899, + "step": 1639 + }, + { + "epoch": 0.7754137115839244, + "grad_norm": 4.35719633102417, + "learning_rate": 4.8176739078773076e-06, + "loss": 0.6281, + "step": 1640 + }, + { + "epoch": 0.775886524822695, + "grad_norm": 2.838146209716797, + "learning_rate": 4.8174399696340315e-06, + "loss": 0.5766, + "step": 1641 + }, + { + "epoch": 0.7763593380614657, + "grad_norm": 3.3116989135742188, + "learning_rate": 4.81720588709393e-06, + "loss": 0.6409, + "step": 1642 + }, + { + "epoch": 0.7768321513002364, + "grad_norm": 2.9843590259552, + "learning_rate": 4.816971660271579e-06, + "loss": 0.6108, + "step": 1643 + }, + { + "epoch": 0.7773049645390071, + "grad_norm": 2.843770742416382, + "learning_rate": 4.816737289181562e-06, + "loss": 0.6053, + "step": 1644 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 2.7608556747436523, + "learning_rate": 4.816502773838473e-06, + "loss": 0.5854, + "step": 1645 + }, + { + "epoch": 0.7782505910165485, + "grad_norm": 3.343682289123535, + "learning_rate": 4.816268114256914e-06, + "loss": 0.6329, + "step": 1646 + }, + { + "epoch": 0.7787234042553192, + "grad_norm": 2.769768476486206, + "learning_rate": 4.816033310451496e-06, + "loss": 0.6242, + "step": 1647 + }, + { + "epoch": 0.7791962174940898, + "grad_norm": 2.989851713180542, + "learning_rate": 4.815798362436838e-06, + "loss": 0.6493, + "step": 1648 + }, + { + "epoch": 0.7796690307328605, + "grad_norm": 3.170736312866211, + "learning_rate": 4.8155632702275716e-06, + "loss": 0.6341, + "step": 1649 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 2.7372522354125977, + "learning_rate": 4.815328033838334e-06, + "loss": 0.5445, + "step": 1650 + }, + { + "epoch": 0.7806146572104019, + "grad_norm": 2.6947238445281982, + "learning_rate": 4.8150926532837715e-06, + "loss": 0.6437, + "step": 1651 + }, + { + "epoch": 0.7810874704491726, + "grad_norm": 2.472323179244995, + "learning_rate": 4.81485712857854e-06, + "loss": 0.5751, + "step": 1652 + }, + { + "epoch": 0.7815602836879433, + "grad_norm": 2.791114091873169, + "learning_rate": 4.814621459737308e-06, + "loss": 0.5996, + "step": 1653 + }, + { + "epoch": 0.782033096926714, + "grad_norm": 3.1957521438598633, + "learning_rate": 4.814385646774745e-06, + "loss": 0.5803, + "step": 1654 + }, + { + "epoch": 0.7825059101654847, + "grad_norm": 2.4120798110961914, + "learning_rate": 4.8141496897055364e-06, + "loss": 0.5814, + "step": 1655 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 2.9262423515319824, + "learning_rate": 4.813913588544374e-06, + "loss": 0.6292, + "step": 1656 + }, + { + "epoch": 0.783451536643026, + "grad_norm": 2.8251047134399414, + "learning_rate": 4.813677343305959e-06, + "loss": 0.6787, + "step": 1657 + }, + { + "epoch": 0.7839243498817967, + "grad_norm": 2.931659698486328, + "learning_rate": 4.8134409540050005e-06, + "loss": 0.6163, + "step": 1658 + }, + { + "epoch": 0.7843971631205674, + "grad_norm": 2.7160706520080566, + "learning_rate": 4.813204420656219e-06, + "loss": 0.6831, + "step": 1659 + }, + { + "epoch": 0.7848699763593381, + "grad_norm": 3.2134454250335693, + "learning_rate": 4.81296774327434e-06, + "loss": 0.6002, + "step": 1660 + }, + { + "epoch": 0.7853427895981088, + "grad_norm": 2.4002513885498047, + "learning_rate": 4.812730921874103e-06, + "loss": 0.5488, + "step": 1661 + }, + { + "epoch": 0.7858156028368795, + "grad_norm": 2.5559282302856445, + "learning_rate": 4.812493956470251e-06, + "loss": 0.5802, + "step": 1662 + }, + { + 
"epoch": 0.7862884160756501, + "grad_norm": 2.57478404045105, + "learning_rate": 4.812256847077541e-06, + "loss": 0.646, + "step": 1663 + }, + { + "epoch": 0.7867612293144208, + "grad_norm": 2.811851978302002, + "learning_rate": 4.812019593710736e-06, + "loss": 0.6245, + "step": 1664 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 2.5228829383850098, + "learning_rate": 4.811782196384609e-06, + "loss": 0.5949, + "step": 1665 + }, + { + "epoch": 0.7877068557919622, + "grad_norm": 2.744096040725708, + "learning_rate": 4.8115446551139415e-06, + "loss": 0.6006, + "step": 1666 + }, + { + "epoch": 0.7881796690307329, + "grad_norm": 3.129242420196533, + "learning_rate": 4.811306969913524e-06, + "loss": 0.7251, + "step": 1667 + }, + { + "epoch": 0.7886524822695036, + "grad_norm": 2.7855660915374756, + "learning_rate": 4.811069140798156e-06, + "loss": 0.6534, + "step": 1668 + }, + { + "epoch": 0.7891252955082743, + "grad_norm": 2.836603879928589, + "learning_rate": 4.810831167782647e-06, + "loss": 0.6661, + "step": 1669 + }, + { + "epoch": 0.789598108747045, + "grad_norm": 2.5339887142181396, + "learning_rate": 4.810593050881813e-06, + "loss": 0.5354, + "step": 1670 + }, + { + "epoch": 0.7900709219858156, + "grad_norm": 2.9553709030151367, + "learning_rate": 4.810354790110482e-06, + "loss": 0.6001, + "step": 1671 + }, + { + "epoch": 0.7905437352245863, + "grad_norm": 2.6581788063049316, + "learning_rate": 4.8101163854834885e-06, + "loss": 0.6802, + "step": 1672 + }, + { + "epoch": 0.791016548463357, + "grad_norm": 3.2002551555633545, + "learning_rate": 4.809877837015677e-06, + "loss": 0.6641, + "step": 1673 + }, + { + "epoch": 0.7914893617021277, + "grad_norm": 2.918792963027954, + "learning_rate": 4.809639144721902e-06, + "loss": 0.6758, + "step": 1674 + }, + { + "epoch": 0.7919621749408984, + "grad_norm": 2.7993946075439453, + "learning_rate": 4.8094003086170245e-06, + "loss": 0.5889, + "step": 1675 + }, + { + "epoch": 0.7924349881796691, + "grad_norm": 2.3698952198028564, + "learning_rate": 4.809161328715916e-06, + "loss": 0.6244, + "step": 1676 + }, + { + "epoch": 0.7929078014184398, + "grad_norm": 2.8891594409942627, + "learning_rate": 4.808922205033458e-06, + "loss": 0.5835, + "step": 1677 + }, + { + "epoch": 0.7933806146572104, + "grad_norm": 2.838345766067505, + "learning_rate": 4.808682937584537e-06, + "loss": 0.6907, + "step": 1678 + }, + { + "epoch": 0.7938534278959811, + "grad_norm": 2.8443174362182617, + "learning_rate": 4.808443526384053e-06, + "loss": 0.6692, + "step": 1679 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 2.7355034351348877, + "learning_rate": 4.808203971446913e-06, + "loss": 0.5799, + "step": 1680 + }, + { + "epoch": 0.7947990543735225, + "grad_norm": 2.7108020782470703, + "learning_rate": 4.807964272788033e-06, + "loss": 0.652, + "step": 1681 + }, + { + "epoch": 0.7952718676122932, + "grad_norm": 2.397650957107544, + "learning_rate": 4.807724430422338e-06, + "loss": 0.5418, + "step": 1682 + }, + { + "epoch": 0.7957446808510639, + "grad_norm": 2.4981582164764404, + "learning_rate": 4.807484444364762e-06, + "loss": 0.5731, + "step": 1683 + }, + { + "epoch": 0.7962174940898346, + "grad_norm": 2.7943713665008545, + "learning_rate": 4.8072443146302475e-06, + "loss": 0.5913, + "step": 1684 + }, + { + "epoch": 0.7966903073286052, + "grad_norm": 2.5691423416137695, + "learning_rate": 4.807004041233746e-06, + "loss": 0.6475, + "step": 1685 + }, + { + "epoch": 0.7971631205673759, + "grad_norm": 3.2367498874664307, + "learning_rate": 4.8067636241902195e-06, + 
"loss": 0.675, + "step": 1686 + }, + { + "epoch": 0.7976359338061466, + "grad_norm": 3.000595808029175, + "learning_rate": 4.806523063514637e-06, + "loss": 0.5481, + "step": 1687 + }, + { + "epoch": 0.7981087470449173, + "grad_norm": 2.702014207839966, + "learning_rate": 4.806282359221976e-06, + "loss": 0.5993, + "step": 1688 + }, + { + "epoch": 0.798581560283688, + "grad_norm": 2.383671998977661, + "learning_rate": 4.806041511327226e-06, + "loss": 0.562, + "step": 1689 + }, + { + "epoch": 0.7990543735224587, + "grad_norm": 2.6965041160583496, + "learning_rate": 4.8058005198453834e-06, + "loss": 0.5955, + "step": 1690 + }, + { + "epoch": 0.7995271867612294, + "grad_norm": 2.5906765460968018, + "learning_rate": 4.805559384791453e-06, + "loss": 0.5151, + "step": 1691 + }, + { + "epoch": 0.8, + "grad_norm": 2.5454652309417725, + "learning_rate": 4.8053181061804475e-06, + "loss": 0.5843, + "step": 1692 + }, + { + "epoch": 0.8004728132387707, + "grad_norm": 2.661343812942505, + "learning_rate": 4.8050766840273935e-06, + "loss": 0.5995, + "step": 1693 + }, + { + "epoch": 0.8009456264775414, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.8048351183473215e-06, + "loss": 0.5676, + "step": 1694 + }, + { + "epoch": 0.8014184397163121, + "grad_norm": 2.5936667919158936, + "learning_rate": 4.804593409155274e-06, + "loss": 0.6291, + "step": 1695 + }, + { + "epoch": 0.8018912529550828, + "grad_norm": 2.6902432441711426, + "learning_rate": 4.804351556466299e-06, + "loss": 0.6114, + "step": 1696 + }, + { + "epoch": 0.8023640661938535, + "grad_norm": 2.7764673233032227, + "learning_rate": 4.804109560295457e-06, + "loss": 0.5768, + "step": 1697 + }, + { + "epoch": 0.8028368794326242, + "grad_norm": 2.9587221145629883, + "learning_rate": 4.803867420657816e-06, + "loss": 0.6048, + "step": 1698 + }, + { + "epoch": 0.8033096926713948, + "grad_norm": 2.9238998889923096, + "learning_rate": 4.803625137568453e-06, + "loss": 0.6329, + "step": 1699 + }, + { + "epoch": 0.8037825059101655, + "grad_norm": 2.70473313331604, + "learning_rate": 4.803382711042455e-06, + "loss": 0.5427, + "step": 1700 + }, + { + "epoch": 0.8042553191489362, + "grad_norm": 3.1604979038238525, + "learning_rate": 4.803140141094914e-06, + "loss": 0.626, + "step": 1701 + }, + { + "epoch": 0.8047281323877069, + "grad_norm": 2.9567699432373047, + "learning_rate": 4.802897427740936e-06, + "loss": 0.5319, + "step": 1702 + }, + { + "epoch": 0.8052009456264776, + "grad_norm": 2.90983247756958, + "learning_rate": 4.802654570995632e-06, + "loss": 0.586, + "step": 1703 + }, + { + "epoch": 0.8056737588652483, + "grad_norm": 2.783480167388916, + "learning_rate": 4.8024115708741255e-06, + "loss": 0.5773, + "step": 1704 + }, + { + "epoch": 0.806146572104019, + "grad_norm": 3.3307793140411377, + "learning_rate": 4.802168427391547e-06, + "loss": 0.6257, + "step": 1705 + }, + { + "epoch": 0.8066193853427897, + "grad_norm": 3.0475001335144043, + "learning_rate": 4.801925140563034e-06, + "loss": 0.6612, + "step": 1706 + }, + { + "epoch": 0.8070921985815603, + "grad_norm": 2.8278894424438477, + "learning_rate": 4.8016817104037375e-06, + "loss": 0.6449, + "step": 1707 + }, + { + "epoch": 0.807565011820331, + "grad_norm": 2.760244369506836, + "learning_rate": 4.801438136928812e-06, + "loss": 0.7007, + "step": 1708 + }, + { + "epoch": 0.8080378250591016, + "grad_norm": 2.827634572982788, + "learning_rate": 4.801194420153427e-06, + "loss": 0.6418, + "step": 1709 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.8655009269714355, + "learning_rate": 
4.800950560092754e-06, + "loss": 0.6231, + "step": 1710 + }, + { + "epoch": 0.808983451536643, + "grad_norm": 2.738112688064575, + "learning_rate": 4.800706556761981e-06, + "loss": 0.6463, + "step": 1711 + }, + { + "epoch": 0.8094562647754137, + "grad_norm": 2.4781179428100586, + "learning_rate": 4.800462410176296e-06, + "loss": 0.5365, + "step": 1712 + }, + { + "epoch": 0.8099290780141843, + "grad_norm": 2.6049838066101074, + "learning_rate": 4.800218120350906e-06, + "loss": 0.6035, + "step": 1713 + }, + { + "epoch": 0.810401891252955, + "grad_norm": 2.9089980125427246, + "learning_rate": 4.79997368730102e-06, + "loss": 0.5828, + "step": 1714 + }, + { + "epoch": 0.8108747044917257, + "grad_norm": 2.831871747970581, + "learning_rate": 4.799729111041857e-06, + "loss": 0.5953, + "step": 1715 + }, + { + "epoch": 0.8113475177304964, + "grad_norm": 2.5611300468444824, + "learning_rate": 4.799484391588647e-06, + "loss": 0.6302, + "step": 1716 + }, + { + "epoch": 0.8118203309692671, + "grad_norm": 2.744070053100586, + "learning_rate": 4.799239528956625e-06, + "loss": 0.5561, + "step": 1717 + }, + { + "epoch": 0.8122931442080378, + "grad_norm": 2.7344231605529785, + "learning_rate": 4.798994523161041e-06, + "loss": 0.6317, + "step": 1718 + }, + { + "epoch": 0.8127659574468085, + "grad_norm": 2.3420889377593994, + "learning_rate": 4.798749374217149e-06, + "loss": 0.5415, + "step": 1719 + }, + { + "epoch": 0.8132387706855791, + "grad_norm": 2.57384991645813, + "learning_rate": 4.798504082140212e-06, + "loss": 0.6383, + "step": 1720 + }, + { + "epoch": 0.8137115839243498, + "grad_norm": 2.8819844722747803, + "learning_rate": 4.798258646945505e-06, + "loss": 0.6355, + "step": 1721 + }, + { + "epoch": 0.8141843971631205, + "grad_norm": 2.908123254776001, + "learning_rate": 4.79801306864831e-06, + "loss": 0.701, + "step": 1722 + }, + { + "epoch": 0.8146572104018912, + "grad_norm": 2.6500701904296875, + "learning_rate": 4.797767347263917e-06, + "loss": 0.6152, + "step": 1723 + }, + { + "epoch": 0.8151300236406619, + "grad_norm": 2.5513017177581787, + "learning_rate": 4.797521482807628e-06, + "loss": 0.6241, + "step": 1724 + }, + { + "epoch": 0.8156028368794326, + "grad_norm": 2.6239185333251953, + "learning_rate": 4.7972754752947495e-06, + "loss": 0.6072, + "step": 1725 + }, + { + "epoch": 0.8160756501182033, + "grad_norm": 2.673436403274536, + "learning_rate": 4.797029324740601e-06, + "loss": 0.5802, + "step": 1726 + }, + { + "epoch": 0.816548463356974, + "grad_norm": 2.533831834793091, + "learning_rate": 4.796783031160508e-06, + "loss": 0.5566, + "step": 1727 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 2.9806582927703857, + "learning_rate": 4.796536594569807e-06, + "loss": 0.6945, + "step": 1728 + }, + { + "epoch": 0.8174940898345153, + "grad_norm": 2.7093560695648193, + "learning_rate": 4.796290014983842e-06, + "loss": 0.7143, + "step": 1729 + }, + { + "epoch": 0.817966903073286, + "grad_norm": 2.814507246017456, + "learning_rate": 4.796043292417967e-06, + "loss": 0.6122, + "step": 1730 + }, + { + "epoch": 0.8184397163120567, + "grad_norm": 2.537156820297241, + "learning_rate": 4.795796426887543e-06, + "loss": 0.6229, + "step": 1731 + }, + { + "epoch": 0.8189125295508274, + "grad_norm": 2.4878013134002686, + "learning_rate": 4.795549418407944e-06, + "loss": 0.5442, + "step": 1732 + }, + { + "epoch": 0.8193853427895981, + "grad_norm": 2.839383363723755, + "learning_rate": 4.795302266994548e-06, + "loss": 0.6717, + "step": 1733 + }, + { + "epoch": 0.8198581560283688, + "grad_norm": 
3.1981801986694336, + "learning_rate": 4.795054972662744e-06, + "loss": 0.6596, + "step": 1734 + }, + { + "epoch": 0.8203309692671394, + "grad_norm": 2.781730890274048, + "learning_rate": 4.79480753542793e-06, + "loss": 0.5845, + "step": 1735 + }, + { + "epoch": 0.8208037825059101, + "grad_norm": 2.689948558807373, + "learning_rate": 4.794559955305513e-06, + "loss": 0.5928, + "step": 1736 + }, + { + "epoch": 0.8212765957446808, + "grad_norm": 2.7267637252807617, + "learning_rate": 4.7943122323109105e-06, + "loss": 0.5224, + "step": 1737 + }, + { + "epoch": 0.8217494089834515, + "grad_norm": 2.4346601963043213, + "learning_rate": 4.794064366459544e-06, + "loss": 0.6431, + "step": 1738 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 2.7440176010131836, + "learning_rate": 4.793816357766849e-06, + "loss": 0.6083, + "step": 1739 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 2.6558027267456055, + "learning_rate": 4.793568206248268e-06, + "loss": 0.698, + "step": 1740 + }, + { + "epoch": 0.8231678486997636, + "grad_norm": 2.591658353805542, + "learning_rate": 4.793319911919251e-06, + "loss": 0.6601, + "step": 1741 + }, + { + "epoch": 0.8236406619385342, + "grad_norm": 2.5431172847747803, + "learning_rate": 4.79307147479526e-06, + "loss": 0.5917, + "step": 1742 + }, + { + "epoch": 0.8241134751773049, + "grad_norm": 2.7335588932037354, + "learning_rate": 4.792822894891762e-06, + "loss": 0.5925, + "step": 1743 + }, + { + "epoch": 0.8245862884160756, + "grad_norm": 2.2500839233398438, + "learning_rate": 4.792574172224237e-06, + "loss": 0.4984, + "step": 1744 + }, + { + "epoch": 0.8250591016548463, + "grad_norm": 2.691343069076538, + "learning_rate": 4.79232530680817e-06, + "loss": 0.6262, + "step": 1745 + }, + { + "epoch": 0.825531914893617, + "grad_norm": 2.612204074859619, + "learning_rate": 4.792076298659058e-06, + "loss": 0.5822, + "step": 1746 + }, + { + "epoch": 0.8260047281323877, + "grad_norm": 3.0163519382476807, + "learning_rate": 4.791827147792406e-06, + "loss": 0.6263, + "step": 1747 + }, + { + "epoch": 0.8264775413711584, + "grad_norm": 2.742183208465576, + "learning_rate": 4.791577854223727e-06, + "loss": 0.6628, + "step": 1748 + }, + { + "epoch": 0.826950354609929, + "grad_norm": 2.872213840484619, + "learning_rate": 4.791328417968542e-06, + "loss": 0.6332, + "step": 1749 + }, + { + "epoch": 0.8274231678486997, + "grad_norm": 2.725006580352783, + "learning_rate": 4.7910788390423844e-06, + "loss": 0.6266, + "step": 1750 + }, + { + "epoch": 0.8278959810874704, + "grad_norm": 3.0366697311401367, + "learning_rate": 4.790829117460793e-06, + "loss": 0.6403, + "step": 1751 + }, + { + "epoch": 0.8283687943262411, + "grad_norm": 2.594881772994995, + "learning_rate": 4.790579253239318e-06, + "loss": 0.521, + "step": 1752 + }, + { + "epoch": 0.8288416075650118, + "grad_norm": 2.4496347904205322, + "learning_rate": 4.790329246393517e-06, + "loss": 0.54, + "step": 1753 + }, + { + "epoch": 0.8293144208037825, + "grad_norm": 3.102278470993042, + "learning_rate": 4.790079096938956e-06, + "loss": 0.6142, + "step": 1754 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 2.4645912647247314, + "learning_rate": 4.789828804891212e-06, + "loss": 0.5212, + "step": 1755 + }, + { + "epoch": 0.8302600472813239, + "grad_norm": 2.7482516765594482, + "learning_rate": 4.789578370265868e-06, + "loss": 0.6712, + "step": 1756 + }, + { + "epoch": 0.8307328605200945, + "grad_norm": 2.61360502243042, + "learning_rate": 4.7893277930785195e-06, + "loss": 0.6367, + "step": 1757 + }, + { + "epoch": 
0.8312056737588652, + "grad_norm": 2.79028058052063, + "learning_rate": 4.789077073344767e-06, + "loss": 0.5099, + "step": 1758 + }, + { + "epoch": 0.8316784869976359, + "grad_norm": 2.647662401199341, + "learning_rate": 4.788826211080222e-06, + "loss": 0.6698, + "step": 1759 + }, + { + "epoch": 0.8321513002364066, + "grad_norm": 3.0214831829071045, + "learning_rate": 4.7885752063005055e-06, + "loss": 0.6121, + "step": 1760 + }, + { + "epoch": 0.8326241134751773, + "grad_norm": 2.8244032859802246, + "learning_rate": 4.788324059021247e-06, + "loss": 0.6921, + "step": 1761 + }, + { + "epoch": 0.833096926713948, + "grad_norm": 3.1501076221466064, + "learning_rate": 4.788072769258082e-06, + "loss": 0.6872, + "step": 1762 + }, + { + "epoch": 0.8335697399527187, + "grad_norm": 2.6989903450012207, + "learning_rate": 4.7878213370266594e-06, + "loss": 0.5884, + "step": 1763 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 2.6982665061950684, + "learning_rate": 4.787569762342633e-06, + "loss": 0.6112, + "step": 1764 + }, + { + "epoch": 0.83451536643026, + "grad_norm": 2.6918323040008545, + "learning_rate": 4.7873180452216685e-06, + "loss": 0.5315, + "step": 1765 + }, + { + "epoch": 0.8349881796690307, + "grad_norm": 2.5494401454925537, + "learning_rate": 4.78706618567944e-06, + "loss": 0.5909, + "step": 1766 + }, + { + "epoch": 0.8354609929078014, + "grad_norm": 2.7532095909118652, + "learning_rate": 4.786814183731627e-06, + "loss": 0.5566, + "step": 1767 + }, + { + "epoch": 0.8359338061465721, + "grad_norm": 2.550865888595581, + "learning_rate": 4.786562039393923e-06, + "loss": 0.555, + "step": 1768 + }, + { + "epoch": 0.8364066193853428, + "grad_norm": 2.4477791786193848, + "learning_rate": 4.786309752682028e-06, + "loss": 0.5844, + "step": 1769 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 2.6982262134552, + "learning_rate": 4.7860573236116485e-06, + "loss": 0.6136, + "step": 1770 + }, + { + "epoch": 0.8373522458628841, + "grad_norm": 2.456263542175293, + "learning_rate": 4.785804752198503e-06, + "loss": 0.5055, + "step": 1771 + }, + { + "epoch": 0.8378250591016548, + "grad_norm": 2.428544521331787, + "learning_rate": 4.78555203845832e-06, + "loss": 0.5859, + "step": 1772 + }, + { + "epoch": 0.8382978723404255, + "grad_norm": 2.1782307624816895, + "learning_rate": 4.785299182406833e-06, + "loss": 0.5325, + "step": 1773 + }, + { + "epoch": 0.8387706855791962, + "grad_norm": 3.137956142425537, + "learning_rate": 4.785046184059786e-06, + "loss": 0.6097, + "step": 1774 + }, + { + "epoch": 0.8392434988179669, + "grad_norm": 2.6269001960754395, + "learning_rate": 4.7847930434329336e-06, + "loss": 0.5972, + "step": 1775 + }, + { + "epoch": 0.8397163120567376, + "grad_norm": 2.732659339904785, + "learning_rate": 4.784539760542037e-06, + "loss": 0.6054, + "step": 1776 + }, + { + "epoch": 0.8401891252955083, + "grad_norm": 2.5346736907958984, + "learning_rate": 4.784286335402866e-06, + "loss": 0.5521, + "step": 1777 + }, + { + "epoch": 0.840661938534279, + "grad_norm": 3.1420228481292725, + "learning_rate": 4.784032768031202e-06, + "loss": 0.6165, + "step": 1778 + }, + { + "epoch": 0.8411347517730496, + "grad_norm": 3.073793411254883, + "learning_rate": 4.783779058442831e-06, + "loss": 0.6414, + "step": 1779 + }, + { + "epoch": 0.8416075650118203, + "grad_norm": 2.6621336936950684, + "learning_rate": 4.783525206653554e-06, + "loss": 0.5836, + "step": 1780 + }, + { + "epoch": 0.842080378250591, + "grad_norm": 2.7029049396514893, + "learning_rate": 4.7832712126791745e-06, + "loss": 0.5897, + 
"step": 1781 + }, + { + "epoch": 0.8425531914893617, + "grad_norm": 2.4733822345733643, + "learning_rate": 4.783017076535509e-06, + "loss": 0.5913, + "step": 1782 + }, + { + "epoch": 0.8430260047281324, + "grad_norm": 2.8119473457336426, + "learning_rate": 4.782762798238381e-06, + "loss": 0.6105, + "step": 1783 + }, + { + "epoch": 0.8434988179669031, + "grad_norm": 2.5290818214416504, + "learning_rate": 4.782508377803622e-06, + "loss": 0.6119, + "step": 1784 + }, + { + "epoch": 0.8439716312056738, + "grad_norm": 3.193472385406494, + "learning_rate": 4.782253815247076e-06, + "loss": 0.6665, + "step": 1785 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 3.206759452819824, + "learning_rate": 4.781999110584592e-06, + "loss": 0.6012, + "step": 1786 + }, + { + "epoch": 0.8449172576832151, + "grad_norm": 2.6227457523345947, + "learning_rate": 4.781744263832029e-06, + "loss": 0.5845, + "step": 1787 + }, + { + "epoch": 0.8453900709219858, + "grad_norm": 2.838365316390991, + "learning_rate": 4.781489275005257e-06, + "loss": 0.5695, + "step": 1788 + }, + { + "epoch": 0.8458628841607565, + "grad_norm": 2.8348326683044434, + "learning_rate": 4.78123414412015e-06, + "loss": 0.6136, + "step": 1789 + }, + { + "epoch": 0.8463356973995272, + "grad_norm": 2.5698344707489014, + "learning_rate": 4.780978871192597e-06, + "loss": 0.6576, + "step": 1790 + }, + { + "epoch": 0.8468085106382979, + "grad_norm": 2.5198330879211426, + "learning_rate": 4.780723456238492e-06, + "loss": 0.5521, + "step": 1791 + }, + { + "epoch": 0.8472813238770686, + "grad_norm": 3.001325845718384, + "learning_rate": 4.780467899273737e-06, + "loss": 0.6075, + "step": 1792 + }, + { + "epoch": 0.8477541371158392, + "grad_norm": 2.7732746601104736, + "learning_rate": 4.780212200314247e-06, + "loss": 0.6245, + "step": 1793 + }, + { + "epoch": 0.8482269503546099, + "grad_norm": 2.6950337886810303, + "learning_rate": 4.77995635937594e-06, + "loss": 0.5723, + "step": 1794 + }, + { + "epoch": 0.8486997635933806, + "grad_norm": 2.82051420211792, + "learning_rate": 4.779700376474749e-06, + "loss": 0.6184, + "step": 1795 + }, + { + "epoch": 0.8491725768321513, + "grad_norm": 2.757791757583618, + "learning_rate": 4.779444251626611e-06, + "loss": 0.608, + "step": 1796 + }, + { + "epoch": 0.849645390070922, + "grad_norm": 2.394108533859253, + "learning_rate": 4.779187984847475e-06, + "loss": 0.6174, + "step": 1797 + }, + { + "epoch": 0.8501182033096927, + "grad_norm": 2.427562713623047, + "learning_rate": 4.778931576153296e-06, + "loss": 0.5618, + "step": 1798 + }, + { + "epoch": 0.8505910165484634, + "grad_norm": 2.891268491744995, + "learning_rate": 4.778675025560042e-06, + "loss": 0.6865, + "step": 1799 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.665534257888794, + "learning_rate": 4.778418333083685e-06, + "loss": 0.5852, + "step": 1800 + }, + { + "epoch": 0.8515366430260047, + "grad_norm": 2.5492889881134033, + "learning_rate": 4.7781614987402095e-06, + "loss": 0.5161, + "step": 1801 + }, + { + "epoch": 0.8520094562647754, + "grad_norm": 2.400177001953125, + "learning_rate": 4.777904522545607e-06, + "loss": 0.5128, + "step": 1802 + }, + { + "epoch": 0.8524822695035461, + "grad_norm": 2.3949809074401855, + "learning_rate": 4.777647404515878e-06, + "loss": 0.571, + "step": 1803 + }, + { + "epoch": 0.8529550827423168, + "grad_norm": 2.3624472618103027, + "learning_rate": 4.7773901446670325e-06, + "loss": 0.5486, + "step": 1804 + }, + { + "epoch": 0.8534278959810875, + "grad_norm": 2.711366891860962, + "learning_rate": 
4.7771327430150885e-06, + "loss": 0.5667, + "step": 1805 + }, + { + "epoch": 0.8539007092198582, + "grad_norm": 2.7681493759155273, + "learning_rate": 4.776875199576073e-06, + "loss": 0.5686, + "step": 1806 + }, + { + "epoch": 0.8543735224586289, + "grad_norm": 3.0369436740875244, + "learning_rate": 4.776617514366023e-06, + "loss": 0.6635, + "step": 1807 + }, + { + "epoch": 0.8548463356973995, + "grad_norm": 2.919649600982666, + "learning_rate": 4.776359687400983e-06, + "loss": 0.5749, + "step": 1808 + }, + { + "epoch": 0.8553191489361702, + "grad_norm": 2.7986185550689697, + "learning_rate": 4.776101718697007e-06, + "loss": 0.559, + "step": 1809 + }, + { + "epoch": 0.8557919621749409, + "grad_norm": 2.5951223373413086, + "learning_rate": 4.775843608270158e-06, + "loss": 0.5654, + "step": 1810 + }, + { + "epoch": 0.8562647754137116, + "grad_norm": 2.674138069152832, + "learning_rate": 4.775585356136505e-06, + "loss": 0.5286, + "step": 1811 + }, + { + "epoch": 0.8567375886524823, + "grad_norm": 3.045437812805176, + "learning_rate": 4.775326962312131e-06, + "loss": 0.6185, + "step": 1812 + }, + { + "epoch": 0.857210401891253, + "grad_norm": 2.6145293712615967, + "learning_rate": 4.775068426813124e-06, + "loss": 0.6075, + "step": 1813 + }, + { + "epoch": 0.8576832151300237, + "grad_norm": 2.6320106983184814, + "learning_rate": 4.7748097496555824e-06, + "loss": 0.561, + "step": 1814 + }, + { + "epoch": 0.8581560283687943, + "grad_norm": 2.5038623809814453, + "learning_rate": 4.774550930855612e-06, + "loss": 0.593, + "step": 1815 + }, + { + "epoch": 0.858628841607565, + "grad_norm": 2.8168089389801025, + "learning_rate": 4.774291970429329e-06, + "loss": 0.5196, + "step": 1816 + }, + { + "epoch": 0.8591016548463357, + "grad_norm": 2.778130292892456, + "learning_rate": 4.774032868392858e-06, + "loss": 0.5984, + "step": 1817 + }, + { + "epoch": 0.8595744680851064, + "grad_norm": 2.536458730697632, + "learning_rate": 4.7737736247623305e-06, + "loss": 0.568, + "step": 1818 + }, + { + "epoch": 0.8600472813238771, + "grad_norm": 2.6669719219207764, + "learning_rate": 4.77351423955389e-06, + "loss": 0.6233, + "step": 1819 + }, + { + "epoch": 0.8605200945626478, + "grad_norm": 2.578242540359497, + "learning_rate": 4.773254712783687e-06, + "loss": 0.579, + "step": 1820 + }, + { + "epoch": 0.8609929078014185, + "grad_norm": 2.816664457321167, + "learning_rate": 4.772995044467881e-06, + "loss": 0.6635, + "step": 1821 + }, + { + "epoch": 0.8614657210401891, + "grad_norm": 3.1111979484558105, + "learning_rate": 4.77273523462264e-06, + "loss": 0.6372, + "step": 1822 + }, + { + "epoch": 0.8619385342789598, + "grad_norm": 2.764552354812622, + "learning_rate": 4.772475283264142e-06, + "loss": 0.6216, + "step": 1823 + }, + { + "epoch": 0.8624113475177305, + "grad_norm": 2.9126830101013184, + "learning_rate": 4.772215190408572e-06, + "loss": 0.6396, + "step": 1824 + }, + { + "epoch": 0.8628841607565012, + "grad_norm": 2.7502307891845703, + "learning_rate": 4.7719549560721264e-06, + "loss": 0.6186, + "step": 1825 + }, + { + "epoch": 0.8633569739952719, + "grad_norm": 2.6279006004333496, + "learning_rate": 4.771694580271007e-06, + "loss": 0.5557, + "step": 1826 + }, + { + "epoch": 0.8638297872340426, + "grad_norm": 2.996563196182251, + "learning_rate": 4.7714340630214276e-06, + "loss": 0.6259, + "step": 1827 + }, + { + "epoch": 0.8643026004728133, + "grad_norm": 3.231323480606079, + "learning_rate": 4.771173404339609e-06, + "loss": 0.5473, + "step": 1828 + }, + { + "epoch": 0.864775413711584, + "grad_norm": 
3.143519878387451, + "learning_rate": 4.770912604241781e-06, + "loss": 0.593, + "step": 1829 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 2.515484094619751, + "learning_rate": 4.770651662744184e-06, + "loss": 0.538, + "step": 1830 + }, + { + "epoch": 0.8657210401891253, + "grad_norm": 2.629058837890625, + "learning_rate": 4.770390579863064e-06, + "loss": 0.5745, + "step": 1831 + }, + { + "epoch": 0.866193853427896, + "grad_norm": 2.5826802253723145, + "learning_rate": 4.770129355614677e-06, + "loss": 0.6397, + "step": 1832 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 2.954623222351074, + "learning_rate": 4.769867990015289e-06, + "loss": 0.6106, + "step": 1833 + }, + { + "epoch": 0.8671394799054374, + "grad_norm": 2.742192268371582, + "learning_rate": 4.769606483081175e-06, + "loss": 0.6902, + "step": 1834 + }, + { + "epoch": 0.8676122931442081, + "grad_norm": 2.2619097232818604, + "learning_rate": 4.769344834828618e-06, + "loss": 0.5414, + "step": 1835 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 2.7384188175201416, + "learning_rate": 4.769083045273908e-06, + "loss": 0.5787, + "step": 1836 + }, + { + "epoch": 0.8685579196217494, + "grad_norm": 2.6734485626220703, + "learning_rate": 4.768821114433346e-06, + "loss": 0.5923, + "step": 1837 + }, + { + "epoch": 0.8690307328605201, + "grad_norm": 2.286140203475952, + "learning_rate": 4.768559042323243e-06, + "loss": 0.5822, + "step": 1838 + }, + { + "epoch": 0.8695035460992908, + "grad_norm": 3.0243725776672363, + "learning_rate": 4.768296828959915e-06, + "loss": 0.6623, + "step": 1839 + }, + { + "epoch": 0.8699763593380615, + "grad_norm": 2.4026312828063965, + "learning_rate": 4.768034474359689e-06, + "loss": 0.5554, + "step": 1840 + }, + { + "epoch": 0.8704491725768322, + "grad_norm": 2.7469029426574707, + "learning_rate": 4.767771978538903e-06, + "loss": 0.6316, + "step": 1841 + }, + { + "epoch": 0.8709219858156029, + "grad_norm": 2.729659080505371, + "learning_rate": 4.767509341513899e-06, + "loss": 0.5807, + "step": 1842 + }, + { + "epoch": 0.8713947990543736, + "grad_norm": 2.5336945056915283, + "learning_rate": 4.76724656330103e-06, + "loss": 0.6109, + "step": 1843 + }, + { + "epoch": 0.8718676122931442, + "grad_norm": 2.519880533218384, + "learning_rate": 4.76698364391666e-06, + "loss": 0.5313, + "step": 1844 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 2.698862075805664, + "learning_rate": 4.766720583377159e-06, + "loss": 0.5953, + "step": 1845 + }, + { + "epoch": 0.8728132387706856, + "grad_norm": 3.0195560455322266, + "learning_rate": 4.766457381698907e-06, + "loss": 0.5965, + "step": 1846 + }, + { + "epoch": 0.8732860520094563, + "grad_norm": 2.5972697734832764, + "learning_rate": 4.766194038898291e-06, + "loss": 0.6014, + "step": 1847 + }, + { + "epoch": 0.873758865248227, + "grad_norm": 2.7132294178009033, + "learning_rate": 4.76593055499171e-06, + "loss": 0.5638, + "step": 1848 + }, + { + "epoch": 0.8742316784869977, + "grad_norm": 2.7134575843811035, + "learning_rate": 4.765666929995568e-06, + "loss": 0.52, + "step": 1849 + }, + { + "epoch": 0.8747044917257684, + "grad_norm": 2.3804993629455566, + "learning_rate": 4.765403163926282e-06, + "loss": 0.5435, + "step": 1850 + }, + { + "epoch": 0.875177304964539, + "grad_norm": 2.8782761096954346, + "learning_rate": 4.765139256800274e-06, + "loss": 0.5843, + "step": 1851 + }, + { + "epoch": 0.8756501182033097, + "grad_norm": 2.836209774017334, + "learning_rate": 4.764875208633977e-06, + "loss": 0.6667, + "step": 1852 + }, + { + "epoch": 
0.8761229314420804, + "grad_norm": 2.608851194381714, + "learning_rate": 4.764611019443831e-06, + "loss": 0.5436, + "step": 1853 + }, + { + "epoch": 0.8765957446808511, + "grad_norm": 2.788738965988159, + "learning_rate": 4.764346689246288e-06, + "loss": 0.7331, + "step": 1854 + }, + { + "epoch": 0.8770685579196218, + "grad_norm": 2.524277687072754, + "learning_rate": 4.764082218057805e-06, + "loss": 0.5067, + "step": 1855 + }, + { + "epoch": 0.8775413711583925, + "grad_norm": 3.7559316158294678, + "learning_rate": 4.763817605894851e-06, + "loss": 0.6809, + "step": 1856 + }, + { + "epoch": 0.8780141843971632, + "grad_norm": 2.9070613384246826, + "learning_rate": 4.763552852773899e-06, + "loss": 0.5913, + "step": 1857 + }, + { + "epoch": 0.8784869976359339, + "grad_norm": 2.7050609588623047, + "learning_rate": 4.7632879587114386e-06, + "loss": 0.6074, + "step": 1858 + }, + { + "epoch": 0.8789598108747045, + "grad_norm": 2.891134262084961, + "learning_rate": 4.76302292372396e-06, + "loss": 0.5939, + "step": 1859 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 2.8581702709198, + "learning_rate": 4.762757747827968e-06, + "loss": 0.5972, + "step": 1860 + }, + { + "epoch": 0.8799054373522459, + "grad_norm": 2.8266196250915527, + "learning_rate": 4.762492431039971e-06, + "loss": 0.5993, + "step": 1861 + }, + { + "epoch": 0.8803782505910166, + "grad_norm": 2.4853954315185547, + "learning_rate": 4.762226973376493e-06, + "loss": 0.6388, + "step": 1862 + }, + { + "epoch": 0.8808510638297873, + "grad_norm": 3.2212886810302734, + "learning_rate": 4.761961374854059e-06, + "loss": 0.6698, + "step": 1863 + }, + { + "epoch": 0.881323877068558, + "grad_norm": 3.1254501342773438, + "learning_rate": 4.761695635489211e-06, + "loss": 0.5263, + "step": 1864 + }, + { + "epoch": 0.8817966903073287, + "grad_norm": 2.6891462802886963, + "learning_rate": 4.761429755298491e-06, + "loss": 0.5359, + "step": 1865 + }, + { + "epoch": 0.8822695035460993, + "grad_norm": 2.8557538986206055, + "learning_rate": 4.761163734298457e-06, + "loss": 0.5933, + "step": 1866 + }, + { + "epoch": 0.88274231678487, + "grad_norm": 2.53548264503479, + "learning_rate": 4.7608975725056724e-06, + "loss": 0.6397, + "step": 1867 + }, + { + "epoch": 0.8832151300236407, + "grad_norm": 3.0237956047058105, + "learning_rate": 4.76063126993671e-06, + "loss": 0.6845, + "step": 1868 + }, + { + "epoch": 0.8836879432624114, + "grad_norm": 3.222886800765991, + "learning_rate": 4.76036482660815e-06, + "loss": 0.6055, + "step": 1869 + }, + { + "epoch": 0.8841607565011821, + "grad_norm": 3.1867551803588867, + "learning_rate": 4.760098242536584e-06, + "loss": 0.6592, + "step": 1870 + }, + { + "epoch": 0.8846335697399527, + "grad_norm": 2.782209873199463, + "learning_rate": 4.7598315177386115e-06, + "loss": 0.5833, + "step": 1871 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 2.899871587753296, + "learning_rate": 4.759564652230838e-06, + "loss": 0.6129, + "step": 1872 + }, + { + "epoch": 0.885579196217494, + "grad_norm": 2.5690579414367676, + "learning_rate": 4.759297646029882e-06, + "loss": 0.5827, + "step": 1873 + }, + { + "epoch": 0.8860520094562647, + "grad_norm": 2.666130304336548, + "learning_rate": 4.759030499152368e-06, + "loss": 0.5272, + "step": 1874 + }, + { + "epoch": 0.8865248226950354, + "grad_norm": 2.7030911445617676, + "learning_rate": 4.758763211614932e-06, + "loss": 0.6415, + "step": 1875 + }, + { + "epoch": 0.8869976359338061, + "grad_norm": 2.717512845993042, + "learning_rate": 4.7584957834342135e-06, + "loss": 0.5827, + 
"step": 1876 + }, + { + "epoch": 0.8874704491725768, + "grad_norm": 2.665823459625244, + "learning_rate": 4.758228214626867e-06, + "loss": 0.6209, + "step": 1877 + }, + { + "epoch": 0.8879432624113475, + "grad_norm": 2.636653184890747, + "learning_rate": 4.75796050520955e-06, + "loss": 0.6413, + "step": 1878 + }, + { + "epoch": 0.8884160756501182, + "grad_norm": 2.585115671157837, + "learning_rate": 4.7576926551989345e-06, + "loss": 0.5518, + "step": 1879 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.808526039123535, + "learning_rate": 4.757424664611697e-06, + "loss": 0.5717, + "step": 1880 + }, + { + "epoch": 0.8893617021276595, + "grad_norm": 3.5957939624786377, + "learning_rate": 4.757156533464524e-06, + "loss": 0.6323, + "step": 1881 + }, + { + "epoch": 0.8898345153664302, + "grad_norm": 2.5003883838653564, + "learning_rate": 4.756888261774111e-06, + "loss": 0.5937, + "step": 1882 + }, + { + "epoch": 0.8903073286052009, + "grad_norm": 2.749061346054077, + "learning_rate": 4.756619849557161e-06, + "loss": 0.6642, + "step": 1883 + }, + { + "epoch": 0.8907801418439716, + "grad_norm": 2.6757891178131104, + "learning_rate": 4.756351296830389e-06, + "loss": 0.5887, + "step": 1884 + }, + { + "epoch": 0.8912529550827423, + "grad_norm": 2.811925172805786, + "learning_rate": 4.756082603610516e-06, + "loss": 0.6571, + "step": 1885 + }, + { + "epoch": 0.891725768321513, + "grad_norm": 2.5054616928100586, + "learning_rate": 4.755813769914271e-06, + "loss": 0.6312, + "step": 1886 + }, + { + "epoch": 0.8921985815602836, + "grad_norm": 2.7518467903137207, + "learning_rate": 4.755544795758395e-06, + "loss": 0.6685, + "step": 1887 + }, + { + "epoch": 0.8926713947990543, + "grad_norm": 2.7527287006378174, + "learning_rate": 4.755275681159634e-06, + "loss": 0.5886, + "step": 1888 + }, + { + "epoch": 0.893144208037825, + "grad_norm": 2.6162452697753906, + "learning_rate": 4.755006426134745e-06, + "loss": 0.546, + "step": 1889 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 2.4016737937927246, + "learning_rate": 4.754737030700495e-06, + "loss": 0.5726, + "step": 1890 + }, + { + "epoch": 0.8940898345153664, + "grad_norm": 2.528327703475952, + "learning_rate": 4.754467494873656e-06, + "loss": 0.5682, + "step": 1891 + }, + { + "epoch": 0.8945626477541371, + "grad_norm": 2.3139286041259766, + "learning_rate": 4.7541978186710115e-06, + "loss": 0.6108, + "step": 1892 + }, + { + "epoch": 0.8950354609929078, + "grad_norm": 2.7269136905670166, + "learning_rate": 4.753928002109354e-06, + "loss": 0.5875, + "step": 1893 + }, + { + "epoch": 0.8955082742316784, + "grad_norm": 4.425495147705078, + "learning_rate": 4.753658045205482e-06, + "loss": 0.5572, + "step": 1894 + }, + { + "epoch": 0.8959810874704491, + "grad_norm": 2.535409927368164, + "learning_rate": 4.753387947976206e-06, + "loss": 0.5868, + "step": 1895 + }, + { + "epoch": 0.8964539007092198, + "grad_norm": 2.722458600997925, + "learning_rate": 4.753117710438343e-06, + "loss": 0.5935, + "step": 1896 + }, + { + "epoch": 0.8969267139479905, + "grad_norm": 2.743861436843872, + "learning_rate": 4.75284733260872e-06, + "loss": 0.572, + "step": 1897 + }, + { + "epoch": 0.8973995271867612, + "grad_norm": 2.60640549659729, + "learning_rate": 4.752576814504173e-06, + "loss": 0.567, + "step": 1898 + }, + { + "epoch": 0.8978723404255319, + "grad_norm": 2.7486042976379395, + "learning_rate": 4.7523061561415435e-06, + "loss": 0.5768, + "step": 1899 + }, + { + "epoch": 0.8983451536643026, + "grad_norm": 3.8410251140594482, + "learning_rate": 
4.752035357537686e-06, + "loss": 0.6034, + "step": 1900 + }, + { + "epoch": 0.8988179669030733, + "grad_norm": 3.0935890674591064, + "learning_rate": 4.751764418709462e-06, + "loss": 0.5644, + "step": 1901 + }, + { + "epoch": 0.8992907801418439, + "grad_norm": 2.7989892959594727, + "learning_rate": 4.751493339673742e-06, + "loss": 0.656, + "step": 1902 + }, + { + "epoch": 0.8997635933806146, + "grad_norm": 3.6940557956695557, + "learning_rate": 4.751222120447403e-06, + "loss": 0.6632, + "step": 1903 + }, + { + "epoch": 0.9002364066193853, + "grad_norm": 2.3428797721862793, + "learning_rate": 4.750950761047335e-06, + "loss": 0.4485, + "step": 1904 + }, + { + "epoch": 0.900709219858156, + "grad_norm": 2.622544050216675, + "learning_rate": 4.750679261490432e-06, + "loss": 0.5857, + "step": 1905 + }, + { + "epoch": 0.9011820330969267, + "grad_norm": 2.4911322593688965, + "learning_rate": 4.750407621793601e-06, + "loss": 0.5618, + "step": 1906 + }, + { + "epoch": 0.9016548463356974, + "grad_norm": 2.6434662342071533, + "learning_rate": 4.750135841973755e-06, + "loss": 0.6057, + "step": 1907 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 3.115443706512451, + "learning_rate": 4.749863922047817e-06, + "loss": 0.6064, + "step": 1908 + }, + { + "epoch": 0.9026004728132387, + "grad_norm": 2.5671091079711914, + "learning_rate": 4.749591862032718e-06, + "loss": 0.5625, + "step": 1909 + }, + { + "epoch": 0.9030732860520094, + "grad_norm": 3.2008655071258545, + "learning_rate": 4.749319661945398e-06, + "loss": 0.5547, + "step": 1910 + }, + { + "epoch": 0.9035460992907801, + "grad_norm": 2.905987024307251, + "learning_rate": 4.749047321802805e-06, + "loss": 0.6033, + "step": 1911 + }, + { + "epoch": 0.9040189125295508, + "grad_norm": 3.1456053256988525, + "learning_rate": 4.748774841621897e-06, + "loss": 0.5651, + "step": 1912 + }, + { + "epoch": 0.9044917257683215, + "grad_norm": 2.8116416931152344, + "learning_rate": 4.748502221419641e-06, + "loss": 0.5853, + "step": 1913 + }, + { + "epoch": 0.9049645390070922, + "grad_norm": 3.123835325241089, + "learning_rate": 4.748229461213011e-06, + "loss": 0.5427, + "step": 1914 + }, + { + "epoch": 0.9054373522458629, + "grad_norm": 2.4750146865844727, + "learning_rate": 4.747956561018989e-06, + "loss": 0.6517, + "step": 1915 + }, + { + "epoch": 0.9059101654846335, + "grad_norm": 2.6174299716949463, + "learning_rate": 4.7476835208545705e-06, + "loss": 0.6119, + "step": 1916 + }, + { + "epoch": 0.9063829787234042, + "grad_norm": 2.7390382289886475, + "learning_rate": 4.747410340736755e-06, + "loss": 0.5664, + "step": 1917 + }, + { + "epoch": 0.9068557919621749, + "grad_norm": 2.7940444946289062, + "learning_rate": 4.747137020682552e-06, + "loss": 0.5628, + "step": 1918 + }, + { + "epoch": 0.9073286052009456, + "grad_norm": 2.477365016937256, + "learning_rate": 4.7468635607089795e-06, + "loss": 0.5261, + "step": 1919 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 2.7016685009002686, + "learning_rate": 4.746589960833066e-06, + "loss": 0.5576, + "step": 1920 + }, + { + "epoch": 0.908274231678487, + "grad_norm": 2.8806519508361816, + "learning_rate": 4.746316221071846e-06, + "loss": 0.5925, + "step": 1921 + }, + { + "epoch": 0.9087470449172577, + "grad_norm": 3.0315234661102295, + "learning_rate": 4.746042341442365e-06, + "loss": 0.6142, + "step": 1922 + }, + { + "epoch": 0.9092198581560283, + "grad_norm": 4.2446160316467285, + "learning_rate": 4.745768321961676e-06, + "loss": 0.5352, + "step": 1923 + }, + { + "epoch": 0.909692671394799, + "grad_norm": 
2.6517012119293213, + "learning_rate": 4.745494162646841e-06, + "loss": 0.6118, + "step": 1924 + }, + { + "epoch": 0.9101654846335697, + "grad_norm": 2.774900197982788, + "learning_rate": 4.7452198635149304e-06, + "loss": 0.572, + "step": 1925 + }, + { + "epoch": 0.9106382978723404, + "grad_norm": 3.0133683681488037, + "learning_rate": 4.744945424583024e-06, + "loss": 0.5897, + "step": 1926 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 2.7344839572906494, + "learning_rate": 4.744670845868211e-06, + "loss": 0.6207, + "step": 1927 + }, + { + "epoch": 0.9115839243498818, + "grad_norm": 2.636578321456909, + "learning_rate": 4.744396127387586e-06, + "loss": 0.6687, + "step": 1928 + }, + { + "epoch": 0.9120567375886525, + "grad_norm": 2.8663458824157715, + "learning_rate": 4.744121269158255e-06, + "loss": 0.5002, + "step": 1929 + }, + { + "epoch": 0.9125295508274232, + "grad_norm": 2.661079168319702, + "learning_rate": 4.743846271197333e-06, + "loss": 0.5848, + "step": 1930 + }, + { + "epoch": 0.9130023640661938, + "grad_norm": 2.881256341934204, + "learning_rate": 4.743571133521943e-06, + "loss": 0.5911, + "step": 1931 + }, + { + "epoch": 0.9134751773049645, + "grad_norm": 2.5540573596954346, + "learning_rate": 4.743295856149217e-06, + "loss": 0.5647, + "step": 1932 + }, + { + "epoch": 0.9139479905437352, + "grad_norm": 2.7060387134552, + "learning_rate": 4.743020439096293e-06, + "loss": 0.6267, + "step": 1933 + }, + { + "epoch": 0.9144208037825059, + "grad_norm": 2.694481372833252, + "learning_rate": 4.742744882380323e-06, + "loss": 0.6283, + "step": 1934 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 2.711555242538452, + "learning_rate": 4.7424691860184625e-06, + "loss": 0.5784, + "step": 1935 + }, + { + "epoch": 0.9153664302600473, + "grad_norm": 2.9077224731445312, + "learning_rate": 4.742193350027879e-06, + "loss": 0.5948, + "step": 1936 + }, + { + "epoch": 0.915839243498818, + "grad_norm": 2.9824187755584717, + "learning_rate": 4.7419173744257476e-06, + "loss": 0.6115, + "step": 1937 + }, + { + "epoch": 0.9163120567375886, + "grad_norm": 2.5127830505371094, + "learning_rate": 4.7416412592292515e-06, + "loss": 0.5803, + "step": 1938 + }, + { + "epoch": 0.9167848699763593, + "grad_norm": 3.1307175159454346, + "learning_rate": 4.741365004455583e-06, + "loss": 0.5657, + "step": 1939 + }, + { + "epoch": 0.91725768321513, + "grad_norm": 2.8205273151397705, + "learning_rate": 4.741088610121944e-06, + "loss": 0.6145, + "step": 1940 + }, + { + "epoch": 0.9177304964539007, + "grad_norm": 2.6119720935821533, + "learning_rate": 4.7408120762455444e-06, + "loss": 0.6058, + "step": 1941 + }, + { + "epoch": 0.9182033096926714, + "grad_norm": 2.421276092529297, + "learning_rate": 4.7405354028436025e-06, + "loss": 0.5973, + "step": 1942 + }, + { + "epoch": 0.9186761229314421, + "grad_norm": 2.9846808910369873, + "learning_rate": 4.740258589933346e-06, + "loss": 0.6892, + "step": 1943 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 2.6899871826171875, + "learning_rate": 4.739981637532009e-06, + "loss": 0.5705, + "step": 1944 + }, + { + "epoch": 0.9196217494089834, + "grad_norm": 2.8636131286621094, + "learning_rate": 4.739704545656839e-06, + "loss": 0.5775, + "step": 1945 + }, + { + "epoch": 0.9200945626477541, + "grad_norm": 2.7659449577331543, + "learning_rate": 4.739427314325087e-06, + "loss": 0.5823, + "step": 1946 + }, + { + "epoch": 0.9205673758865248, + "grad_norm": 4.71295166015625, + "learning_rate": 4.739149943554016e-06, + "loss": 0.5601, + "step": 1947 + }, + { + "epoch": 
0.9210401891252955, + "grad_norm": 2.642636775970459, + "learning_rate": 4.738872433360896e-06, + "loss": 0.5278, + "step": 1948 + }, + { + "epoch": 0.9215130023640662, + "grad_norm": 2.4658217430114746, + "learning_rate": 4.7385947837630065e-06, + "loss": 0.6392, + "step": 1949 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 2.851602792739868, + "learning_rate": 4.738316994777636e-06, + "loss": 0.6164, + "step": 1950 + }, + { + "epoch": 0.9224586288416076, + "grad_norm": 2.394226551055908, + "learning_rate": 4.738039066422081e-06, + "loss": 0.5556, + "step": 1951 + }, + { + "epoch": 0.9229314420803783, + "grad_norm": 2.7985100746154785, + "learning_rate": 4.737760998713647e-06, + "loss": 0.5799, + "step": 1952 + }, + { + "epoch": 0.9234042553191489, + "grad_norm": 2.5974674224853516, + "learning_rate": 4.737482791669648e-06, + "loss": 0.6984, + "step": 1953 + }, + { + "epoch": 0.9238770685579196, + "grad_norm": 2.707636594772339, + "learning_rate": 4.737204445307406e-06, + "loss": 0.5548, + "step": 1954 + }, + { + "epoch": 0.9243498817966903, + "grad_norm": 2.7882707118988037, + "learning_rate": 4.736925959644254e-06, + "loss": 0.6026, + "step": 1955 + }, + { + "epoch": 0.924822695035461, + "grad_norm": 2.474482774734497, + "learning_rate": 4.7366473346975304e-06, + "loss": 0.5832, + "step": 1956 + }, + { + "epoch": 0.9252955082742317, + "grad_norm": 2.6196324825286865, + "learning_rate": 4.736368570484585e-06, + "loss": 0.5861, + "step": 1957 + }, + { + "epoch": 0.9257683215130024, + "grad_norm": 2.826864004135132, + "learning_rate": 4.736089667022775e-06, + "loss": 0.6173, + "step": 1958 + }, + { + "epoch": 0.926241134751773, + "grad_norm": 2.414473056793213, + "learning_rate": 4.735810624329466e-06, + "loss": 0.5753, + "step": 1959 + }, + { + "epoch": 0.9267139479905437, + "grad_norm": 2.8037970066070557, + "learning_rate": 4.7355314424220335e-06, + "loss": 0.6207, + "step": 1960 + }, + { + "epoch": 0.9271867612293144, + "grad_norm": 2.645458698272705, + "learning_rate": 4.735252121317861e-06, + "loss": 0.5959, + "step": 1961 + }, + { + "epoch": 0.9276595744680851, + "grad_norm": 2.7983884811401367, + "learning_rate": 4.734972661034339e-06, + "loss": 0.5696, + "step": 1962 + }, + { + "epoch": 0.9281323877068558, + "grad_norm": 3.0568997859954834, + "learning_rate": 4.73469306158887e-06, + "loss": 0.6194, + "step": 1963 + }, + { + "epoch": 0.9286052009456265, + "grad_norm": 2.7205135822296143, + "learning_rate": 4.734413322998863e-06, + "loss": 0.5292, + "step": 1964 + }, + { + "epoch": 0.9290780141843972, + "grad_norm": 3.3168489933013916, + "learning_rate": 4.734133445281735e-06, + "loss": 0.5654, + "step": 1965 + }, + { + "epoch": 0.9295508274231679, + "grad_norm": 3.0095653533935547, + "learning_rate": 4.733853428454916e-06, + "loss": 0.6508, + "step": 1966 + }, + { + "epoch": 0.9300236406619385, + "grad_norm": 2.7726712226867676, + "learning_rate": 4.733573272535838e-06, + "loss": 0.644, + "step": 1967 + }, + { + "epoch": 0.9304964539007092, + "grad_norm": 2.474397659301758, + "learning_rate": 4.7332929775419456e-06, + "loss": 0.5479, + "step": 1968 + }, + { + "epoch": 0.9309692671394799, + "grad_norm": 2.4518635272979736, + "learning_rate": 4.733012543490693e-06, + "loss": 0.6, + "step": 1969 + }, + { + "epoch": 0.9314420803782506, + "grad_norm": 2.9292192459106445, + "learning_rate": 4.73273197039954e-06, + "loss": 0.6647, + "step": 1970 + }, + { + "epoch": 0.9319148936170213, + "grad_norm": 2.425004720687866, + "learning_rate": 4.732451258285958e-06, + "loss": 0.6338, + 
"step": 1971 + }, + { + "epoch": 0.932387706855792, + "grad_norm": 2.904479503631592, + "learning_rate": 4.7321704071674255e-06, + "loss": 0.5923, + "step": 1972 + }, + { + "epoch": 0.9328605200945627, + "grad_norm": 2.477085590362549, + "learning_rate": 4.731889417061428e-06, + "loss": 0.5984, + "step": 1973 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 2.585240364074707, + "learning_rate": 4.731608287985465e-06, + "loss": 0.558, + "step": 1974 + }, + { + "epoch": 0.933806146572104, + "grad_norm": 2.658714532852173, + "learning_rate": 4.731327019957039e-06, + "loss": 0.5567, + "step": 1975 + }, + { + "epoch": 0.9342789598108747, + "grad_norm": 2.7593026161193848, + "learning_rate": 4.731045612993662e-06, + "loss": 0.5772, + "step": 1976 + }, + { + "epoch": 0.9347517730496454, + "grad_norm": 2.4386026859283447, + "learning_rate": 4.7307640671128585e-06, + "loss": 0.6199, + "step": 1977 + }, + { + "epoch": 0.9352245862884161, + "grad_norm": 2.681910514831543, + "learning_rate": 4.730482382332158e-06, + "loss": 0.5971, + "step": 1978 + }, + { + "epoch": 0.9356973995271868, + "grad_norm": 3.7593860626220703, + "learning_rate": 4.7302005586691e-06, + "loss": 0.6346, + "step": 1979 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.5789096355438232, + "learning_rate": 4.729918596141232e-06, + "loss": 0.5684, + "step": 1980 + }, + { + "epoch": 0.9366430260047282, + "grad_norm": 3.0607335567474365, + "learning_rate": 4.729636494766111e-06, + "loss": 0.6223, + "step": 1981 + }, + { + "epoch": 0.9371158392434988, + "grad_norm": 2.906643867492676, + "learning_rate": 4.729354254561303e-06, + "loss": 0.6513, + "step": 1982 + }, + { + "epoch": 0.9375886524822695, + "grad_norm": 3.192430019378662, + "learning_rate": 4.7290718755443795e-06, + "loss": 0.5095, + "step": 1983 + }, + { + "epoch": 0.9380614657210402, + "grad_norm": 2.661536931991577, + "learning_rate": 4.7287893577329255e-06, + "loss": 0.5525, + "step": 1984 + }, + { + "epoch": 0.9385342789598109, + "grad_norm": 2.8436734676361084, + "learning_rate": 4.728506701144531e-06, + "loss": 0.6323, + "step": 1985 + }, + { + "epoch": 0.9390070921985816, + "grad_norm": 2.75544810295105, + "learning_rate": 4.728223905796796e-06, + "loss": 0.6018, + "step": 1986 + }, + { + "epoch": 0.9394799054373523, + "grad_norm": 3.0652759075164795, + "learning_rate": 4.727940971707329e-06, + "loss": 0.62, + "step": 1987 + }, + { + "epoch": 0.939952718676123, + "grad_norm": 2.802567720413208, + "learning_rate": 4.727657898893747e-06, + "loss": 0.5809, + "step": 1988 + }, + { + "epoch": 0.9404255319148936, + "grad_norm": 2.6208512783050537, + "learning_rate": 4.7273746873736745e-06, + "loss": 0.5762, + "step": 1989 + }, + { + "epoch": 0.9408983451536643, + "grad_norm": 2.5901873111724854, + "learning_rate": 4.727091337164748e-06, + "loss": 0.6111, + "step": 1990 + }, + { + "epoch": 0.941371158392435, + "grad_norm": 3.002347707748413, + "learning_rate": 4.726807848284609e-06, + "loss": 0.6419, + "step": 1991 + }, + { + "epoch": 0.9418439716312057, + "grad_norm": 2.522151470184326, + "learning_rate": 4.72652422075091e-06, + "loss": 0.642, + "step": 1992 + }, + { + "epoch": 0.9423167848699764, + "grad_norm": 2.5571532249450684, + "learning_rate": 4.726240454581311e-06, + "loss": 0.5729, + "step": 1993 + }, + { + "epoch": 0.9427895981087471, + "grad_norm": 2.7704918384552, + "learning_rate": 4.72595654979348e-06, + "loss": 0.6816, + "step": 1994 + }, + { + "epoch": 0.9432624113475178, + "grad_norm": 2.517040491104126, + "learning_rate": 
4.7256725064050955e-06, + "loss": 0.5782, + "step": 1995 + }, + { + "epoch": 0.9437352245862884, + "grad_norm": 2.613955020904541, + "learning_rate": 4.725388324433843e-06, + "loss": 0.6291, + "step": 1996 + }, + { + "epoch": 0.9442080378250591, + "grad_norm": 2.848891258239746, + "learning_rate": 4.725104003897418e-06, + "loss": 0.6544, + "step": 1997 + }, + { + "epoch": 0.9446808510638298, + "grad_norm": 3.0162429809570312, + "learning_rate": 4.724819544813523e-06, + "loss": 0.6301, + "step": 1998 + }, + { + "epoch": 0.9451536643026005, + "grad_norm": 2.613614559173584, + "learning_rate": 4.72453494719987e-06, + "loss": 0.5829, + "step": 1999 + }, + { + "epoch": 0.9456264775413712, + "grad_norm": 2.4838767051696777, + "learning_rate": 4.724250211074182e-06, + "loss": 0.6042, + "step": 2000 + }, + { + "epoch": 0.9460992907801419, + "grad_norm": 2.526470899581909, + "learning_rate": 4.723965336454185e-06, + "loss": 0.6167, + "step": 2001 + }, + { + "epoch": 0.9465721040189126, + "grad_norm": 2.504506826400757, + "learning_rate": 4.723680323357618e-06, + "loss": 0.6061, + "step": 2002 + }, + { + "epoch": 0.9470449172576832, + "grad_norm": 3.0547544956207275, + "learning_rate": 4.723395171802228e-06, + "loss": 0.6619, + "step": 2003 + }, + { + "epoch": 0.9475177304964539, + "grad_norm": 2.8692407608032227, + "learning_rate": 4.723109881805771e-06, + "loss": 0.5985, + "step": 2004 + }, + { + "epoch": 0.9479905437352246, + "grad_norm": 2.7929654121398926, + "learning_rate": 4.7228244533860094e-06, + "loss": 0.5869, + "step": 2005 + }, + { + "epoch": 0.9484633569739953, + "grad_norm": 2.764869451522827, + "learning_rate": 4.7225388865607146e-06, + "loss": 0.6288, + "step": 2006 + }, + { + "epoch": 0.948936170212766, + "grad_norm": 2.7656404972076416, + "learning_rate": 4.722253181347671e-06, + "loss": 0.5831, + "step": 2007 + }, + { + "epoch": 0.9494089834515367, + "grad_norm": 2.6698336601257324, + "learning_rate": 4.7219673377646635e-06, + "loss": 0.6087, + "step": 2008 + }, + { + "epoch": 0.9498817966903074, + "grad_norm": 2.524935722351074, + "learning_rate": 4.7216813558294946e-06, + "loss": 0.5675, + "step": 2009 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 2.5998785495758057, + "learning_rate": 4.721395235559969e-06, + "loss": 0.5667, + "step": 2010 + }, + { + "epoch": 0.9508274231678487, + "grad_norm": 2.758021354675293, + "learning_rate": 4.721108976973902e-06, + "loss": 0.4931, + "step": 2011 + }, + { + "epoch": 0.9513002364066194, + "grad_norm": 2.767695903778076, + "learning_rate": 4.72082258008912e-06, + "loss": 0.5778, + "step": 2012 + }, + { + "epoch": 0.9517730496453901, + "grad_norm": 2.982314348220825, + "learning_rate": 4.720536044923453e-06, + "loss": 0.6096, + "step": 2013 + }, + { + "epoch": 0.9522458628841608, + "grad_norm": 2.7608799934387207, + "learning_rate": 4.720249371494743e-06, + "loss": 0.6242, + "step": 2014 + }, + { + "epoch": 0.9527186761229315, + "grad_norm": 2.60054349899292, + "learning_rate": 4.71996255982084e-06, + "loss": 0.6249, + "step": 2015 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 2.654355764389038, + "learning_rate": 4.719675609919603e-06, + "loss": 0.6327, + "step": 2016 + }, + { + "epoch": 0.9536643026004729, + "grad_norm": 2.589404582977295, + "learning_rate": 4.719388521808899e-06, + "loss": 0.6357, + "step": 2017 + }, + { + "epoch": 0.9541371158392435, + "grad_norm": 2.8016581535339355, + "learning_rate": 4.719101295506603e-06, + "loss": 0.5901, + "step": 2018 + }, + { + "epoch": 0.9546099290780142, + "grad_norm": 
3.1408045291900635, + "learning_rate": 4.7188139310306e-06, + "loss": 0.598, + "step": 2019 + }, + { + "epoch": 0.9550827423167849, + "grad_norm": 2.7432665824890137, + "learning_rate": 4.718526428398783e-06, + "loss": 0.5508, + "step": 2020 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 2.947800874710083, + "learning_rate": 4.718238787629053e-06, + "loss": 0.6439, + "step": 2021 + }, + { + "epoch": 0.9560283687943263, + "grad_norm": 2.50828218460083, + "learning_rate": 4.71795100873932e-06, + "loss": 0.5441, + "step": 2022 + }, + { + "epoch": 0.956501182033097, + "grad_norm": 2.8558974266052246, + "learning_rate": 4.717663091747503e-06, + "loss": 0.5416, + "step": 2023 + }, + { + "epoch": 0.9569739952718677, + "grad_norm": 2.4803316593170166, + "learning_rate": 4.71737503667153e-06, + "loss": 0.5317, + "step": 2024 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 4.36754035949707, + "learning_rate": 4.717086843529336e-06, + "loss": 0.5808, + "step": 2025 + }, + { + "epoch": 0.957919621749409, + "grad_norm": 2.730185031890869, + "learning_rate": 4.7167985123388665e-06, + "loss": 0.5257, + "step": 2026 + }, + { + "epoch": 0.9583924349881797, + "grad_norm": 2.8136069774627686, + "learning_rate": 4.716510043118074e-06, + "loss": 0.5836, + "step": 2027 + }, + { + "epoch": 0.9588652482269504, + "grad_norm": 2.793975353240967, + "learning_rate": 4.71622143588492e-06, + "loss": 0.5706, + "step": 2028 + }, + { + "epoch": 0.9593380614657211, + "grad_norm": 2.3883821964263916, + "learning_rate": 4.7159326906573745e-06, + "loss": 0.5291, + "step": 2029 + }, + { + "epoch": 0.9598108747044918, + "grad_norm": 2.6135976314544678, + "learning_rate": 4.715643807453417e-06, + "loss": 0.6199, + "step": 2030 + }, + { + "epoch": 0.9602836879432625, + "grad_norm": 2.6245670318603516, + "learning_rate": 4.715354786291035e-06, + "loss": 0.5585, + "step": 2031 + }, + { + "epoch": 0.9607565011820332, + "grad_norm": 2.7870967388153076, + "learning_rate": 4.715065627188225e-06, + "loss": 0.6196, + "step": 2032 + }, + { + "epoch": 0.9612293144208038, + "grad_norm": 2.6983911991119385, + "learning_rate": 4.714776330162991e-06, + "loss": 0.6424, + "step": 2033 + }, + { + "epoch": 0.9617021276595744, + "grad_norm": 2.3221919536590576, + "learning_rate": 4.7144868952333465e-06, + "loss": 0.568, + "step": 2034 + }, + { + "epoch": 0.9621749408983451, + "grad_norm": 2.9408178329467773, + "learning_rate": 4.714197322417314e-06, + "loss": 0.6175, + "step": 2035 + }, + { + "epoch": 0.9626477541371158, + "grad_norm": 2.404057264328003, + "learning_rate": 4.713907611732921e-06, + "loss": 0.4943, + "step": 2036 + }, + { + "epoch": 0.9631205673758865, + "grad_norm": 3.547607660293579, + "learning_rate": 4.71361776319821e-06, + "loss": 0.5488, + "step": 2037 + }, + { + "epoch": 0.9635933806146572, + "grad_norm": 2.679614543914795, + "learning_rate": 4.713327776831227e-06, + "loss": 0.6234, + "step": 2038 + }, + { + "epoch": 0.9640661938534278, + "grad_norm": 2.526914119720459, + "learning_rate": 4.7130376526500286e-06, + "loss": 0.5891, + "step": 2039 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 2.6953470706939697, + "learning_rate": 4.71274739067268e-06, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 0.9650118203309692, + "grad_norm": 2.546660900115967, + "learning_rate": 4.712456990917254e-06, + "loss": 0.6185, + "step": 2041 + }, + { + "epoch": 0.9654846335697399, + "grad_norm": 3.3920490741729736, + "learning_rate": 4.712166453401832e-06, + "loss": 0.587, + "step": 2042 + }, + { + "epoch": 
0.9659574468085106, + "grad_norm": 2.5961573123931885, + "learning_rate": 4.711875778144504e-06, + "loss": 0.6105, + "step": 2043 + }, + { + "epoch": 0.9664302600472813, + "grad_norm": 2.5111498832702637, + "learning_rate": 4.711584965163372e-06, + "loss": 0.5533, + "step": 2044 + }, + { + "epoch": 0.966903073286052, + "grad_norm": 2.4878132343292236, + "learning_rate": 4.7112940144765405e-06, + "loss": 0.5604, + "step": 2045 + }, + { + "epoch": 0.9673758865248226, + "grad_norm": 2.5714077949523926, + "learning_rate": 4.711002926102128e-06, + "loss": 0.5794, + "step": 2046 + }, + { + "epoch": 0.9678486997635933, + "grad_norm": 2.7069091796875, + "learning_rate": 4.710711700058257e-06, + "loss": 0.594, + "step": 2047 + }, + { + "epoch": 0.968321513002364, + "grad_norm": 2.8104631900787354, + "learning_rate": 4.710420336363063e-06, + "loss": 0.6247, + "step": 2048 + }, + { + "epoch": 0.9687943262411347, + "grad_norm": 2.8464386463165283, + "learning_rate": 4.7101288350346865e-06, + "loss": 0.6162, + "step": 2049 + }, + { + "epoch": 0.9692671394799054, + "grad_norm": 2.7187976837158203, + "learning_rate": 4.709837196091279e-06, + "loss": 0.6109, + "step": 2050 + }, + { + "epoch": 0.9697399527186761, + "grad_norm": 2.556734085083008, + "learning_rate": 4.709545419550999e-06, + "loss": 0.6297, + "step": 2051 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 2.937195062637329, + "learning_rate": 4.709253505432014e-06, + "loss": 0.6862, + "step": 2052 + }, + { + "epoch": 0.9706855791962175, + "grad_norm": 2.792175531387329, + "learning_rate": 4.7089614537525015e-06, + "loss": 0.6105, + "step": 2053 + }, + { + "epoch": 0.9711583924349881, + "grad_norm": 2.625636100769043, + "learning_rate": 4.708669264530644e-06, + "loss": 0.5849, + "step": 2054 + }, + { + "epoch": 0.9716312056737588, + "grad_norm": 2.6752610206604004, + "learning_rate": 4.708376937784637e-06, + "loss": 0.5949, + "step": 2055 + }, + { + "epoch": 0.9721040189125295, + "grad_norm": 2.6072793006896973, + "learning_rate": 4.708084473532681e-06, + "loss": 0.5776, + "step": 2056 + }, + { + "epoch": 0.9725768321513002, + "grad_norm": 2.728632926940918, + "learning_rate": 4.707791871792988e-06, + "loss": 0.6352, + "step": 2057 + }, + { + "epoch": 0.9730496453900709, + "grad_norm": 2.5841758251190186, + "learning_rate": 4.707499132583775e-06, + "loss": 0.5488, + "step": 2058 + }, + { + "epoch": 0.9735224586288416, + "grad_norm": 2.8464293479919434, + "learning_rate": 4.707206255923271e-06, + "loss": 0.7051, + "step": 2059 + }, + { + "epoch": 0.9739952718676123, + "grad_norm": 2.547297239303589, + "learning_rate": 4.706913241829712e-06, + "loss": 0.5937, + "step": 2060 + }, + { + "epoch": 0.9744680851063829, + "grad_norm": 2.6572306156158447, + "learning_rate": 4.706620090321341e-06, + "loss": 0.6041, + "step": 2061 + }, + { + "epoch": 0.9749408983451536, + "grad_norm": 2.3262805938720703, + "learning_rate": 4.706326801416414e-06, + "loss": 0.5144, + "step": 2062 + }, + { + "epoch": 0.9754137115839243, + "grad_norm": 2.9693965911865234, + "learning_rate": 4.706033375133191e-06, + "loss": 0.551, + "step": 2063 + }, + { + "epoch": 0.975886524822695, + "grad_norm": 2.5993731021881104, + "learning_rate": 4.7057398114899435e-06, + "loss": 0.6143, + "step": 2064 + }, + { + "epoch": 0.9763593380614657, + "grad_norm": 2.453336477279663, + "learning_rate": 4.70544611050495e-06, + "loss": 0.6093, + "step": 2065 + }, + { + "epoch": 0.9768321513002364, + "grad_norm": 2.898629665374756, + "learning_rate": 4.705152272196497e-06, + "loss": 0.6007, + 
"step": 2066 + }, + { + "epoch": 0.9773049645390071, + "grad_norm": 2.7990612983703613, + "learning_rate": 4.7048582965828815e-06, + "loss": 0.6687, + "step": 2067 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 2.635284423828125, + "learning_rate": 4.704564183682408e-06, + "loss": 0.5564, + "step": 2068 + }, + { + "epoch": 0.9782505910165484, + "grad_norm": 3.014547109603882, + "learning_rate": 4.704269933513389e-06, + "loss": 0.6084, + "step": 2069 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 2.659357786178589, + "learning_rate": 4.703975546094147e-06, + "loss": 0.6031, + "step": 2070 + }, + { + "epoch": 0.9791962174940898, + "grad_norm": 2.326932668685913, + "learning_rate": 4.703681021443013e-06, + "loss": 0.5859, + "step": 2071 + }, + { + "epoch": 0.9796690307328605, + "grad_norm": 2.958803653717041, + "learning_rate": 4.7033863595783235e-06, + "loss": 0.5586, + "step": 2072 + }, + { + "epoch": 0.9801418439716312, + "grad_norm": 2.921386957168579, + "learning_rate": 4.703091560518427e-06, + "loss": 0.6126, + "step": 2073 + }, + { + "epoch": 0.9806146572104019, + "grad_norm": 2.6500775814056396, + "learning_rate": 4.702796624281679e-06, + "loss": 0.5678, + "step": 2074 + }, + { + "epoch": 0.9810874704491725, + "grad_norm": 2.7740228176116943, + "learning_rate": 4.702501550886445e-06, + "loss": 0.6067, + "step": 2075 + }, + { + "epoch": 0.9815602836879432, + "grad_norm": 2.3296213150024414, + "learning_rate": 4.702206340351096e-06, + "loss": 0.5247, + "step": 2076 + }, + { + "epoch": 0.9820330969267139, + "grad_norm": 2.748300790786743, + "learning_rate": 4.701910992694016e-06, + "loss": 0.5197, + "step": 2077 + }, + { + "epoch": 0.9825059101654846, + "grad_norm": 2.250985622406006, + "learning_rate": 4.7016155079335926e-06, + "loss": 0.5214, + "step": 2078 + }, + { + "epoch": 0.9829787234042553, + "grad_norm": 2.389845848083496, + "learning_rate": 4.701319886088226e-06, + "loss": 0.519, + "step": 2079 + }, + { + "epoch": 0.983451536643026, + "grad_norm": 2.818220853805542, + "learning_rate": 4.701024127176322e-06, + "loss": 0.607, + "step": 2080 + }, + { + "epoch": 0.9839243498817967, + "grad_norm": 3.4058034420013428, + "learning_rate": 4.700728231216297e-06, + "loss": 0.5711, + "step": 2081 + }, + { + "epoch": 0.9843971631205674, + "grad_norm": 2.5297787189483643, + "learning_rate": 4.700432198226575e-06, + "loss": 0.5979, + "step": 2082 + }, + { + "epoch": 0.984869976359338, + "grad_norm": 3.0548105239868164, + "learning_rate": 4.7001360282255885e-06, + "loss": 0.6041, + "step": 2083 + }, + { + "epoch": 0.9853427895981087, + "grad_norm": 2.8983733654022217, + "learning_rate": 4.699839721231779e-06, + "loss": 0.5926, + "step": 2084 + }, + { + "epoch": 0.9858156028368794, + "grad_norm": 3.2717764377593994, + "learning_rate": 4.699543277263596e-06, + "loss": 0.6477, + "step": 2085 + }, + { + "epoch": 0.9862884160756501, + "grad_norm": 3.03729248046875, + "learning_rate": 4.699246696339497e-06, + "loss": 0.6786, + "step": 2086 + }, + { + "epoch": 0.9867612293144208, + "grad_norm": 2.852301597595215, + "learning_rate": 4.698949978477951e-06, + "loss": 0.6565, + "step": 2087 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 2.843485116958618, + "learning_rate": 4.698653123697431e-06, + "loss": 0.6627, + "step": 2088 + }, + { + "epoch": 0.9877068557919622, + "grad_norm": 2.6315064430236816, + "learning_rate": 4.698356132016423e-06, + "loss": 0.6577, + "step": 2089 + }, + { + "epoch": 0.9881796690307328, + "grad_norm": 2.7482151985168457, + "learning_rate": 
4.698059003453417e-06, + "loss": 0.5514, + "step": 2090 + }, + { + "epoch": 0.9886524822695035, + "grad_norm": 2.826673746109009, + "learning_rate": 4.6977617380269145e-06, + "loss": 0.565, + "step": 2091 + }, + { + "epoch": 0.9891252955082742, + "grad_norm": 3.0273752212524414, + "learning_rate": 4.697464335755427e-06, + "loss": 0.6331, + "step": 2092 + }, + { + "epoch": 0.9895981087470449, + "grad_norm": 2.7551653385162354, + "learning_rate": 4.6971667966574695e-06, + "loss": 0.6486, + "step": 2093 + }, + { + "epoch": 0.9900709219858156, + "grad_norm": 2.656299114227295, + "learning_rate": 4.696869120751571e-06, + "loss": 0.6562, + "step": 2094 + }, + { + "epoch": 0.9905437352245863, + "grad_norm": 2.785322904586792, + "learning_rate": 4.696571308056265e-06, + "loss": 0.5892, + "step": 2095 + }, + { + "epoch": 0.991016548463357, + "grad_norm": 2.9334635734558105, + "learning_rate": 4.696273358590095e-06, + "loss": 0.6346, + "step": 2096 + }, + { + "epoch": 0.9914893617021276, + "grad_norm": 2.7944300174713135, + "learning_rate": 4.695975272371613e-06, + "loss": 0.5859, + "step": 2097 + }, + { + "epoch": 0.9919621749408983, + "grad_norm": 2.5416972637176514, + "learning_rate": 4.695677049419381e-06, + "loss": 0.5658, + "step": 2098 + }, + { + "epoch": 0.992434988179669, + "grad_norm": 2.4056856632232666, + "learning_rate": 4.695378689751966e-06, + "loss": 0.5121, + "step": 2099 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 2.614548683166504, + "learning_rate": 4.695080193387948e-06, + "loss": 0.5961, + "step": 2100 + }, + { + "epoch": 0.9933806146572104, + "grad_norm": 2.8966517448425293, + "learning_rate": 4.69478156034591e-06, + "loss": 0.5985, + "step": 2101 + }, + { + "epoch": 0.9938534278959811, + "grad_norm": 2.9514098167419434, + "learning_rate": 4.694482790644448e-06, + "loss": 0.5677, + "step": 2102 + }, + { + "epoch": 0.9943262411347518, + "grad_norm": 2.4326791763305664, + "learning_rate": 4.694183884302165e-06, + "loss": 0.5698, + "step": 2103 + }, + { + "epoch": 0.9947990543735225, + "grad_norm": 2.9242892265319824, + "learning_rate": 4.6938848413376735e-06, + "loss": 0.6245, + "step": 2104 + }, + { + "epoch": 0.9952718676122931, + "grad_norm": 2.9134104251861572, + "learning_rate": 4.693585661769593e-06, + "loss": 0.6164, + "step": 2105 + }, + { + "epoch": 0.9957446808510638, + "grad_norm": 2.472564458847046, + "learning_rate": 4.693286345616551e-06, + "loss": 0.5616, + "step": 2106 + }, + { + "epoch": 0.9962174940898345, + "grad_norm": 3.2456448078155518, + "learning_rate": 4.692986892897186e-06, + "loss": 0.6977, + "step": 2107 + }, + { + "epoch": 0.9966903073286052, + "grad_norm": 3.4032769203186035, + "learning_rate": 4.692687303630143e-06, + "loss": 0.643, + "step": 2108 + }, + { + "epoch": 0.9971631205673759, + "grad_norm": 2.722200870513916, + "learning_rate": 4.692387577834076e-06, + "loss": 0.5873, + "step": 2109 + }, + { + "epoch": 0.9976359338061466, + "grad_norm": 2.687532663345337, + "learning_rate": 4.692087715527648e-06, + "loss": 0.5423, + "step": 2110 + }, + { + "epoch": 0.9981087470449173, + "grad_norm": 2.578613042831421, + "learning_rate": 4.6917877167295305e-06, + "loss": 0.5689, + "step": 2111 + }, + { + "epoch": 0.9985815602836879, + "grad_norm": 3.1806094646453857, + "learning_rate": 4.691487581458402e-06, + "loss": 0.6133, + "step": 2112 + }, + { + "epoch": 0.9990543735224586, + "grad_norm": 2.4449520111083984, + "learning_rate": 4.691187309732952e-06, + "loss": 0.5841, + "step": 2113 + }, + { + "epoch": 0.9995271867612293, + "grad_norm": 
2.908749580383301, + "learning_rate": 4.690886901571875e-06, + "loss": 0.534, + "step": 2114 + }, + { + "epoch": 1.0, + "grad_norm": 4.019968032836914, + "learning_rate": 4.6905863569938785e-06, + "loss": 0.596, + "step": 2115 + }, + { + "epoch": 1.0004728132387706, + "grad_norm": 2.4319307804107666, + "learning_rate": 4.690285676017675e-06, + "loss": 0.4973, + "step": 2116 + }, + { + "epoch": 1.0009456264775414, + "grad_norm": 2.6366477012634277, + "learning_rate": 4.689984858661986e-06, + "loss": 0.5682, + "step": 2117 + }, + { + "epoch": 1.001418439716312, + "grad_norm": 2.815114974975586, + "learning_rate": 4.689683904945542e-06, + "loss": 0.5616, + "step": 2118 + }, + { + "epoch": 1.0018912529550827, + "grad_norm": 2.6680490970611572, + "learning_rate": 4.689382814887084e-06, + "loss": 0.5161, + "step": 2119 + }, + { + "epoch": 1.0023640661938533, + "grad_norm": 2.7406351566314697, + "learning_rate": 4.689081588505358e-06, + "loss": 0.4937, + "step": 2120 + }, + { + "epoch": 1.0028368794326241, + "grad_norm": 2.2832298278808594, + "learning_rate": 4.68878022581912e-06, + "loss": 0.4986, + "step": 2121 + }, + { + "epoch": 1.0033096926713947, + "grad_norm": 2.5525307655334473, + "learning_rate": 4.688478726847136e-06, + "loss": 0.4909, + "step": 2122 + }, + { + "epoch": 1.0037825059101655, + "grad_norm": 2.9843199253082275, + "learning_rate": 4.688177091608176e-06, + "loss": 0.6046, + "step": 2123 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 2.5231106281280518, + "learning_rate": 4.687875320121024e-06, + "loss": 0.5423, + "step": 2124 + }, + { + "epoch": 1.0047281323877069, + "grad_norm": 2.567599058151245, + "learning_rate": 4.68757341240447e-06, + "loss": 0.5092, + "step": 2125 + }, + { + "epoch": 1.0052009456264774, + "grad_norm": 2.768111228942871, + "learning_rate": 4.687271368477311e-06, + "loss": 0.5175, + "step": 2126 + }, + { + "epoch": 1.0056737588652482, + "grad_norm": 2.7223286628723145, + "learning_rate": 4.686969188358355e-06, + "loss": 0.5412, + "step": 2127 + }, + { + "epoch": 1.0061465721040188, + "grad_norm": 2.488299608230591, + "learning_rate": 4.686666872066418e-06, + "loss": 0.5288, + "step": 2128 + }, + { + "epoch": 1.0066193853427896, + "grad_norm": 2.882981777191162, + "learning_rate": 4.6863644196203215e-06, + "loss": 0.6117, + "step": 2129 + }, + { + "epoch": 1.0070921985815602, + "grad_norm": 3.0019447803497314, + "learning_rate": 4.686061831038901e-06, + "loss": 0.5308, + "step": 2130 + }, + { + "epoch": 1.007565011820331, + "grad_norm": 3.0056138038635254, + "learning_rate": 4.685759106340996e-06, + "loss": 0.5833, + "step": 2131 + }, + { + "epoch": 1.0080378250591016, + "grad_norm": 2.5709075927734375, + "learning_rate": 4.685456245545454e-06, + "loss": 0.5071, + "step": 2132 + }, + { + "epoch": 1.0085106382978724, + "grad_norm": 2.4641504287719727, + "learning_rate": 4.685153248671136e-06, + "loss": 0.4813, + "step": 2133 + }, + { + "epoch": 1.008983451536643, + "grad_norm": 2.374413013458252, + "learning_rate": 4.684850115736906e-06, + "loss": 0.5179, + "step": 2134 + }, + { + "epoch": 1.0094562647754137, + "grad_norm": 2.6504571437835693, + "learning_rate": 4.684546846761641e-06, + "loss": 0.437, + "step": 2135 + }, + { + "epoch": 1.0099290780141843, + "grad_norm": 2.5977871417999268, + "learning_rate": 4.684243441764221e-06, + "loss": 0.497, + "step": 2136 + }, + { + "epoch": 1.010401891252955, + "grad_norm": 2.4950785636901855, + "learning_rate": 4.683939900763541e-06, + "loss": 0.5624, + "step": 2137 + }, + { + "epoch": 1.0108747044917257, + 
"grad_norm": 3.065718412399292, + "learning_rate": 4.6836362237785e-06, + "loss": 0.512, + "step": 2138 + }, + { + "epoch": 1.0113475177304965, + "grad_norm": 2.7419207096099854, + "learning_rate": 4.6833324108280045e-06, + "loss": 0.5585, + "step": 2139 + }, + { + "epoch": 1.011820330969267, + "grad_norm": 2.623610496520996, + "learning_rate": 4.6830284619309744e-06, + "loss": 0.5163, + "step": 2140 + }, + { + "epoch": 1.0122931442080378, + "grad_norm": 2.774322986602783, + "learning_rate": 4.682724377106334e-06, + "loss": 0.527, + "step": 2141 + }, + { + "epoch": 1.0127659574468084, + "grad_norm": 2.959935188293457, + "learning_rate": 4.682420156373017e-06, + "loss": 0.6166, + "step": 2142 + }, + { + "epoch": 1.0132387706855792, + "grad_norm": 2.584026336669922, + "learning_rate": 4.682115799749968e-06, + "loss": 0.5086, + "step": 2143 + }, + { + "epoch": 1.0137115839243498, + "grad_norm": 2.6039700508117676, + "learning_rate": 4.6818113072561346e-06, + "loss": 0.49, + "step": 2144 + }, + { + "epoch": 1.0141843971631206, + "grad_norm": 2.466381072998047, + "learning_rate": 4.681506678910479e-06, + "loss": 0.4959, + "step": 2145 + }, + { + "epoch": 1.0146572104018912, + "grad_norm": 2.432636260986328, + "learning_rate": 4.681201914731969e-06, + "loss": 0.5057, + "step": 2146 + }, + { + "epoch": 1.015130023640662, + "grad_norm": 2.6134090423583984, + "learning_rate": 4.680897014739579e-06, + "loss": 0.4874, + "step": 2147 + }, + { + "epoch": 1.0156028368794325, + "grad_norm": 2.774481773376465, + "learning_rate": 4.680591978952295e-06, + "loss": 0.4967, + "step": 2148 + }, + { + "epoch": 1.0160756501182033, + "grad_norm": 2.66050124168396, + "learning_rate": 4.68028680738911e-06, + "loss": 0.4932, + "step": 2149 + }, + { + "epoch": 1.016548463356974, + "grad_norm": 3.020594835281372, + "learning_rate": 4.679981500069026e-06, + "loss": 0.5788, + "step": 2150 + }, + { + "epoch": 1.0170212765957447, + "grad_norm": 2.697758436203003, + "learning_rate": 4.679676057011053e-06, + "loss": 0.5441, + "step": 2151 + }, + { + "epoch": 1.0174940898345153, + "grad_norm": 6.986445903778076, + "learning_rate": 4.679370478234209e-06, + "loss": 0.6483, + "step": 2152 + }, + { + "epoch": 1.017966903073286, + "grad_norm": 2.6637115478515625, + "learning_rate": 4.679064763757522e-06, + "loss": 0.5859, + "step": 2153 + }, + { + "epoch": 1.0184397163120567, + "grad_norm": 2.7501862049102783, + "learning_rate": 4.678758913600027e-06, + "loss": 0.5745, + "step": 2154 + }, + { + "epoch": 1.0189125295508275, + "grad_norm": 2.7959372997283936, + "learning_rate": 4.678452927780768e-06, + "loss": 0.5076, + "step": 2155 + }, + { + "epoch": 1.019385342789598, + "grad_norm": 2.4377388954162598, + "learning_rate": 4.678146806318798e-06, + "loss": 0.5061, + "step": 2156 + }, + { + "epoch": 1.0198581560283688, + "grad_norm": 2.5478947162628174, + "learning_rate": 4.677840549233176e-06, + "loss": 0.4941, + "step": 2157 + }, + { + "epoch": 1.0203309692671394, + "grad_norm": 3.0956528186798096, + "learning_rate": 4.677534156542973e-06, + "loss": 0.5879, + "step": 2158 + }, + { + "epoch": 1.0208037825059102, + "grad_norm": 2.5247607231140137, + "learning_rate": 4.6772276282672666e-06, + "loss": 0.5532, + "step": 2159 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 3.1972787380218506, + "learning_rate": 4.676920964425143e-06, + "loss": 0.6081, + "step": 2160 + }, + { + "epoch": 1.0217494089834516, + "grad_norm": 2.6173388957977295, + "learning_rate": 4.6766141650356955e-06, + "loss": 0.5001, + "step": 2161 + }, + { + 
"epoch": 1.0222222222222221, + "grad_norm": 2.9914398193359375, + "learning_rate": 4.676307230118029e-06, + "loss": 0.5566, + "step": 2162 + }, + { + "epoch": 1.022695035460993, + "grad_norm": 2.8011834621429443, + "learning_rate": 4.676000159691254e-06, + "loss": 0.4909, + "step": 2163 + }, + { + "epoch": 1.0231678486997635, + "grad_norm": 2.6049559116363525, + "learning_rate": 4.67569295377449e-06, + "loss": 0.5018, + "step": 2164 + }, + { + "epoch": 1.0236406619385343, + "grad_norm": 2.8175013065338135, + "learning_rate": 4.675385612386866e-06, + "loss": 0.5309, + "step": 2165 + }, + { + "epoch": 1.0241134751773049, + "grad_norm": 2.854696750640869, + "learning_rate": 4.675078135547519e-06, + "loss": 0.5627, + "step": 2166 + }, + { + "epoch": 1.0245862884160757, + "grad_norm": 3.1856436729431152, + "learning_rate": 4.674770523275594e-06, + "loss": 0.5475, + "step": 2167 + }, + { + "epoch": 1.0250591016548463, + "grad_norm": 2.8289129734039307, + "learning_rate": 4.674462775590244e-06, + "loss": 0.5878, + "step": 2168 + }, + { + "epoch": 1.025531914893617, + "grad_norm": 2.8824517726898193, + "learning_rate": 4.6741548925106325e-06, + "loss": 0.4392, + "step": 2169 + }, + { + "epoch": 1.0260047281323876, + "grad_norm": 2.7044589519500732, + "learning_rate": 4.673846874055928e-06, + "loss": 0.5264, + "step": 2170 + }, + { + "epoch": 1.0264775413711584, + "grad_norm": 2.575035810470581, + "learning_rate": 4.673538720245312e-06, + "loss": 0.4615, + "step": 2171 + }, + { + "epoch": 1.026950354609929, + "grad_norm": 2.48168683052063, + "learning_rate": 4.67323043109797e-06, + "loss": 0.4404, + "step": 2172 + }, + { + "epoch": 1.0274231678486998, + "grad_norm": 2.926593065261841, + "learning_rate": 4.672922006633098e-06, + "loss": 0.54, + "step": 2173 + }, + { + "epoch": 1.0278959810874704, + "grad_norm": 2.4610698223114014, + "learning_rate": 4.672613446869901e-06, + "loss": 0.5555, + "step": 2174 + }, + { + "epoch": 1.0283687943262412, + "grad_norm": 3.026901960372925, + "learning_rate": 4.672304751827592e-06, + "loss": 0.62, + "step": 2175 + }, + { + "epoch": 1.0288416075650118, + "grad_norm": 2.3946213722229004, + "learning_rate": 4.671995921525391e-06, + "loss": 0.5228, + "step": 2176 + }, + { + "epoch": 1.0293144208037825, + "grad_norm": 2.985020399093628, + "learning_rate": 4.671686955982528e-06, + "loss": 0.6256, + "step": 2177 + }, + { + "epoch": 1.0297872340425531, + "grad_norm": 3.0910139083862305, + "learning_rate": 4.671377855218239e-06, + "loss": 0.5893, + "step": 2178 + }, + { + "epoch": 1.030260047281324, + "grad_norm": 2.507805109024048, + "learning_rate": 4.6710686192517744e-06, + "loss": 0.5329, + "step": 2179 + }, + { + "epoch": 1.0307328605200945, + "grad_norm": 2.4514641761779785, + "learning_rate": 4.670759248102386e-06, + "loss": 0.4585, + "step": 2180 + }, + { + "epoch": 1.0312056737588653, + "grad_norm": 2.742838144302368, + "learning_rate": 4.670449741789337e-06, + "loss": 0.6255, + "step": 2181 + }, + { + "epoch": 1.0316784869976359, + "grad_norm": 2.374349594116211, + "learning_rate": 4.670140100331901e-06, + "loss": 0.5049, + "step": 2182 + }, + { + "epoch": 1.0321513002364067, + "grad_norm": 2.78894305229187, + "learning_rate": 4.669830323749356e-06, + "loss": 0.6061, + "step": 2183 + }, + { + "epoch": 1.0326241134751772, + "grad_norm": 2.7195091247558594, + "learning_rate": 4.6695204120609905e-06, + "loss": 0.592, + "step": 2184 + }, + { + "epoch": 1.033096926713948, + "grad_norm": 2.824411630630493, + "learning_rate": 4.6692103652861035e-06, + "loss": 0.5666, + 
"step": 2185 + }, + { + "epoch": 1.0335697399527186, + "grad_norm": 2.4981014728546143, + "learning_rate": 4.6689001834439975e-06, + "loss": 0.5045, + "step": 2186 + }, + { + "epoch": 1.0340425531914894, + "grad_norm": 2.7375214099884033, + "learning_rate": 4.668589866553988e-06, + "loss": 0.5305, + "step": 2187 + }, + { + "epoch": 1.03451536643026, + "grad_norm": 2.625345468521118, + "learning_rate": 4.668279414635396e-06, + "loss": 0.4819, + "step": 2188 + }, + { + "epoch": 1.0349881796690308, + "grad_norm": 2.60479736328125, + "learning_rate": 4.667968827707553e-06, + "loss": 0.55, + "step": 2189 + }, + { + "epoch": 1.0354609929078014, + "grad_norm": 2.642014741897583, + "learning_rate": 4.667658105789797e-06, + "loss": 0.5264, + "step": 2190 + }, + { + "epoch": 1.0359338061465722, + "grad_norm": 2.5439083576202393, + "learning_rate": 4.667347248901476e-06, + "loss": 0.4657, + "step": 2191 + }, + { + "epoch": 1.0364066193853427, + "grad_norm": 2.5537586212158203, + "learning_rate": 4.667036257061945e-06, + "loss": 0.527, + "step": 2192 + }, + { + "epoch": 1.0368794326241135, + "grad_norm": 2.595466375350952, + "learning_rate": 4.666725130290569e-06, + "loss": 0.5336, + "step": 2193 + }, + { + "epoch": 1.037352245862884, + "grad_norm": 3.5106313228607178, + "learning_rate": 4.666413868606719e-06, + "loss": 0.5176, + "step": 2194 + }, + { + "epoch": 1.037825059101655, + "grad_norm": 2.931553363800049, + "learning_rate": 4.666102472029778e-06, + "loss": 0.549, + "step": 2195 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 2.4325125217437744, + "learning_rate": 4.665790940579133e-06, + "loss": 0.5095, + "step": 2196 + }, + { + "epoch": 1.0387706855791963, + "grad_norm": 2.708477258682251, + "learning_rate": 4.665479274274184e-06, + "loss": 0.5264, + "step": 2197 + }, + { + "epoch": 1.0392434988179668, + "grad_norm": 2.905977487564087, + "learning_rate": 4.665167473134335e-06, + "loss": 0.5575, + "step": 2198 + }, + { + "epoch": 1.0397163120567376, + "grad_norm": 2.428938865661621, + "learning_rate": 4.664855537179003e-06, + "loss": 0.5099, + "step": 2199 + }, + { + "epoch": 1.0401891252955082, + "grad_norm": 2.8432137966156006, + "learning_rate": 4.6645434664276075e-06, + "loss": 0.5331, + "step": 2200 + }, + { + "epoch": 1.040661938534279, + "grad_norm": 2.5185136795043945, + "learning_rate": 4.6642312608995825e-06, + "loss": 0.5217, + "step": 2201 + }, + { + "epoch": 1.0411347517730496, + "grad_norm": 2.556607723236084, + "learning_rate": 4.663918920614366e-06, + "loss": 0.4431, + "step": 2202 + }, + { + "epoch": 1.0416075650118204, + "grad_norm": 3.1271166801452637, + "learning_rate": 4.663606445591407e-06, + "loss": 0.5398, + "step": 2203 + }, + { + "epoch": 1.042080378250591, + "grad_norm": 2.573680877685547, + "learning_rate": 4.663293835850162e-06, + "loss": 0.4713, + "step": 2204 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 2.5230324268341064, + "learning_rate": 4.662981091410096e-06, + "loss": 0.5571, + "step": 2205 + }, + { + "epoch": 1.0430260047281323, + "grad_norm": 2.552182912826538, + "learning_rate": 4.662668212290681e-06, + "loss": 0.5173, + "step": 2206 + }, + { + "epoch": 1.0434988179669031, + "grad_norm": 2.832345724105835, + "learning_rate": 4.6623551985113995e-06, + "loss": 0.525, + "step": 2207 + }, + { + "epoch": 1.0439716312056737, + "grad_norm": 2.9729080200195312, + "learning_rate": 4.6620420500917416e-06, + "loss": 0.6308, + "step": 2208 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 2.618187665939331, + "learning_rate": 
4.661728767051206e-06, + "loss": 0.4942, + "step": 2209 + }, + { + "epoch": 1.044917257683215, + "grad_norm": 2.515566349029541, + "learning_rate": 4.661415349409299e-06, + "loss": 0.5229, + "step": 2210 + }, + { + "epoch": 1.0453900709219859, + "grad_norm": 2.8651459217071533, + "learning_rate": 4.6611017971855356e-06, + "loss": 0.5029, + "step": 2211 + }, + { + "epoch": 1.0458628841607565, + "grad_norm": 2.502405881881714, + "learning_rate": 4.660788110399439e-06, + "loss": 0.4732, + "step": 2212 + }, + { + "epoch": 1.0463356973995273, + "grad_norm": 2.540668249130249, + "learning_rate": 4.660474289070541e-06, + "loss": 0.547, + "step": 2213 + }, + { + "epoch": 1.0468085106382978, + "grad_norm": 2.803469181060791, + "learning_rate": 4.660160333218384e-06, + "loss": 0.5441, + "step": 2214 + }, + { + "epoch": 1.0472813238770686, + "grad_norm": 3.233325481414795, + "learning_rate": 4.659846242862514e-06, + "loss": 0.4457, + "step": 2215 + }, + { + "epoch": 1.0477541371158392, + "grad_norm": 2.549548387527466, + "learning_rate": 4.659532018022489e-06, + "loss": 0.5684, + "step": 2216 + }, + { + "epoch": 1.04822695035461, + "grad_norm": 2.6112852096557617, + "learning_rate": 4.659217658717875e-06, + "loss": 0.5323, + "step": 2217 + }, + { + "epoch": 1.0486997635933806, + "grad_norm": 2.347418785095215, + "learning_rate": 4.658903164968245e-06, + "loss": 0.5349, + "step": 2218 + }, + { + "epoch": 1.0491725768321514, + "grad_norm": 2.695502281188965, + "learning_rate": 4.658588536793182e-06, + "loss": 0.4883, + "step": 2219 + }, + { + "epoch": 1.049645390070922, + "grad_norm": 2.7575674057006836, + "learning_rate": 4.658273774212275e-06, + "loss": 0.5517, + "step": 2220 + }, + { + "epoch": 1.0501182033096927, + "grad_norm": 2.787855386734009, + "learning_rate": 4.6579588772451245e-06, + "loss": 0.5744, + "step": 2221 + }, + { + "epoch": 1.0505910165484633, + "grad_norm": 3.0699398517608643, + "learning_rate": 4.657643845911337e-06, + "loss": 0.5258, + "step": 2222 + }, + { + "epoch": 1.0510638297872341, + "grad_norm": 2.652040719985962, + "learning_rate": 4.657328680230527e-06, + "loss": 0.5141, + "step": 2223 + }, + { + "epoch": 1.0515366430260047, + "grad_norm": 2.6896369457244873, + "learning_rate": 4.657013380222322e-06, + "loss": 0.5139, + "step": 2224 + }, + { + "epoch": 1.0520094562647755, + "grad_norm": 2.551839590072632, + "learning_rate": 4.65669794590635e-06, + "loss": 0.5099, + "step": 2225 + }, + { + "epoch": 1.052482269503546, + "grad_norm": 2.8543262481689453, + "learning_rate": 4.656382377302255e-06, + "loss": 0.6085, + "step": 2226 + }, + { + "epoch": 1.0529550827423169, + "grad_norm": 2.871469259262085, + "learning_rate": 4.656066674429685e-06, + "loss": 0.6108, + "step": 2227 + }, + { + "epoch": 1.0534278959810874, + "grad_norm": 2.4840824604034424, + "learning_rate": 4.655750837308296e-06, + "loss": 0.4994, + "step": 2228 + }, + { + "epoch": 1.0539007092198582, + "grad_norm": 2.5203280448913574, + "learning_rate": 4.6554348659577555e-06, + "loss": 0.4928, + "step": 2229 + }, + { + "epoch": 1.0543735224586288, + "grad_norm": 2.9327683448791504, + "learning_rate": 4.655118760397737e-06, + "loss": 0.6324, + "step": 2230 + }, + { + "epoch": 1.0548463356973996, + "grad_norm": 2.6766855716705322, + "learning_rate": 4.654802520647924e-06, + "loss": 0.5178, + "step": 2231 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 2.8438873291015625, + "learning_rate": 4.654486146728006e-06, + "loss": 0.509, + "step": 2232 + }, + { + "epoch": 1.055791962174941, + "grad_norm": 
2.538661241531372, + "learning_rate": 4.6541696386576826e-06, + "loss": 0.5463, + "step": 2233 + }, + { + "epoch": 1.0562647754137116, + "grad_norm": 2.829030990600586, + "learning_rate": 4.653852996456662e-06, + "loss": 0.5404, + "step": 2234 + }, + { + "epoch": 1.0567375886524824, + "grad_norm": 2.5657269954681396, + "learning_rate": 4.653536220144659e-06, + "loss": 0.5479, + "step": 2235 + }, + { + "epoch": 1.057210401891253, + "grad_norm": 2.6641297340393066, + "learning_rate": 4.653219309741399e-06, + "loss": 0.5503, + "step": 2236 + }, + { + "epoch": 1.0576832151300237, + "grad_norm": 2.966350555419922, + "learning_rate": 4.652902265266615e-06, + "loss": 0.6404, + "step": 2237 + }, + { + "epoch": 1.0581560283687943, + "grad_norm": 2.462430000305176, + "learning_rate": 4.6525850867400455e-06, + "loss": 0.4885, + "step": 2238 + }, + { + "epoch": 1.058628841607565, + "grad_norm": 2.1791880130767822, + "learning_rate": 4.652267774181443e-06, + "loss": 0.4405, + "step": 2239 + }, + { + "epoch": 1.0591016548463357, + "grad_norm": 2.5473732948303223, + "learning_rate": 4.651950327610563e-06, + "loss": 0.5295, + "step": 2240 + }, + { + "epoch": 1.0595744680851065, + "grad_norm": 2.70904803276062, + "learning_rate": 4.651632747047172e-06, + "loss": 0.5169, + "step": 2241 + }, + { + "epoch": 1.060047281323877, + "grad_norm": 3.8442928791046143, + "learning_rate": 4.651315032511045e-06, + "loss": 0.5473, + "step": 2242 + }, + { + "epoch": 1.0605200945626478, + "grad_norm": 2.8613383769989014, + "learning_rate": 4.650997184021963e-06, + "loss": 0.5445, + "step": 2243 + }, + { + "epoch": 1.0609929078014184, + "grad_norm": 2.5995829105377197, + "learning_rate": 4.6506792015997184e-06, + "loss": 0.5525, + "step": 2244 + }, + { + "epoch": 1.0614657210401892, + "grad_norm": 2.5465996265411377, + "learning_rate": 4.650361085264111e-06, + "loss": 0.5093, + "step": 2245 + }, + { + "epoch": 1.0619385342789598, + "grad_norm": 2.46553111076355, + "learning_rate": 4.650042835034948e-06, + "loss": 0.5375, + "step": 2246 + }, + { + "epoch": 1.0624113475177306, + "grad_norm": 2.6907830238342285, + "learning_rate": 4.649724450932045e-06, + "loss": 0.572, + "step": 2247 + }, + { + "epoch": 1.0628841607565012, + "grad_norm": 3.0671346187591553, + "learning_rate": 4.649405932975226e-06, + "loss": 0.4974, + "step": 2248 + }, + { + "epoch": 1.063356973995272, + "grad_norm": 2.5392491817474365, + "learning_rate": 4.649087281184325e-06, + "loss": 0.524, + "step": 2249 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 2.7498562335968018, + "learning_rate": 4.648768495579183e-06, + "loss": 0.5801, + "step": 2250 + }, + { + "epoch": 1.0643026004728133, + "grad_norm": 2.8536248207092285, + "learning_rate": 4.648449576179649e-06, + "loss": 0.5384, + "step": 2251 + }, + { + "epoch": 1.064775413711584, + "grad_norm": 2.7062792778015137, + "learning_rate": 4.64813052300558e-06, + "loss": 0.5262, + "step": 2252 + }, + { + "epoch": 1.0652482269503547, + "grad_norm": 2.798650026321411, + "learning_rate": 4.647811336076841e-06, + "loss": 0.5719, + "step": 2253 + }, + { + "epoch": 1.0657210401891253, + "grad_norm": 2.9793951511383057, + "learning_rate": 4.647492015413311e-06, + "loss": 0.5377, + "step": 2254 + }, + { + "epoch": 1.066193853427896, + "grad_norm": 2.572129011154175, + "learning_rate": 4.647172561034868e-06, + "loss": 0.4791, + "step": 2255 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 3.7490930557250977, + "learning_rate": 4.646852972961405e-06, + "loss": 0.5423, + "step": 2256 + }, + { + "epoch": 
1.0671394799054374, + "grad_norm": 2.626255750656128, + "learning_rate": 4.646533251212821e-06, + "loss": 0.5558, + "step": 2257 + }, + { + "epoch": 1.067612293144208, + "grad_norm": 2.8408126831054688, + "learning_rate": 4.646213395809023e-06, + "loss": 0.55, + "step": 2258 + }, + { + "epoch": 1.0680851063829788, + "grad_norm": 3.255606174468994, + "learning_rate": 4.645893406769929e-06, + "loss": 0.547, + "step": 2259 + }, + { + "epoch": 1.0685579196217494, + "grad_norm": 2.4352102279663086, + "learning_rate": 4.645573284115461e-06, + "loss": 0.4898, + "step": 2260 + }, + { + "epoch": 1.0690307328605202, + "grad_norm": 2.408634662628174, + "learning_rate": 4.6452530278655535e-06, + "loss": 0.5264, + "step": 2261 + }, + { + "epoch": 1.0695035460992908, + "grad_norm": 2.4220449924468994, + "learning_rate": 4.644932638040146e-06, + "loss": 0.5166, + "step": 2262 + }, + { + "epoch": 1.0699763593380616, + "grad_norm": 2.9188082218170166, + "learning_rate": 4.644612114659188e-06, + "loss": 0.5611, + "step": 2263 + }, + { + "epoch": 1.0704491725768321, + "grad_norm": 2.906557083129883, + "learning_rate": 4.644291457742638e-06, + "loss": 0.5515, + "step": 2264 + }, + { + "epoch": 1.070921985815603, + "grad_norm": 2.9039015769958496, + "learning_rate": 4.643970667310462e-06, + "loss": 0.5732, + "step": 2265 + }, + { + "epoch": 1.0713947990543735, + "grad_norm": 2.9985480308532715, + "learning_rate": 4.643649743382632e-06, + "loss": 0.563, + "step": 2266 + }, + { + "epoch": 1.0718676122931443, + "grad_norm": 2.5780906677246094, + "learning_rate": 4.6433286859791335e-06, + "loss": 0.502, + "step": 2267 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 2.590209722518921, + "learning_rate": 4.643007495119955e-06, + "loss": 0.4995, + "step": 2268 + }, + { + "epoch": 1.0728132387706855, + "grad_norm": 2.378894805908203, + "learning_rate": 4.642686170825097e-06, + "loss": 0.4886, + "step": 2269 + }, + { + "epoch": 1.0732860520094563, + "grad_norm": 2.6826229095458984, + "learning_rate": 4.642364713114567e-06, + "loss": 0.465, + "step": 2270 + }, + { + "epoch": 1.073758865248227, + "grad_norm": 2.627819538116455, + "learning_rate": 4.64204312200838e-06, + "loss": 0.4954, + "step": 2271 + }, + { + "epoch": 1.0742316784869976, + "grad_norm": 2.993021249771118, + "learning_rate": 4.641721397526561e-06, + "loss": 0.5073, + "step": 2272 + }, + { + "epoch": 1.0747044917257682, + "grad_norm": 2.719052791595459, + "learning_rate": 4.64139953968914e-06, + "loss": 0.538, + "step": 2273 + }, + { + "epoch": 1.075177304964539, + "grad_norm": 2.729252576828003, + "learning_rate": 4.6410775485161605e-06, + "loss": 0.552, + "step": 2274 + }, + { + "epoch": 1.0756501182033098, + "grad_norm": 2.924142599105835, + "learning_rate": 4.640755424027671e-06, + "loss": 0.522, + "step": 2275 + }, + { + "epoch": 1.0761229314420804, + "grad_norm": 3.329162120819092, + "learning_rate": 4.640433166243728e-06, + "loss": 0.5965, + "step": 2276 + }, + { + "epoch": 1.076595744680851, + "grad_norm": 2.9810245037078857, + "learning_rate": 4.640110775184396e-06, + "loss": 0.5653, + "step": 2277 + }, + { + "epoch": 1.0770685579196217, + "grad_norm": 2.61772084236145, + "learning_rate": 4.639788250869751e-06, + "loss": 0.5382, + "step": 2278 + }, + { + "epoch": 1.0775413711583925, + "grad_norm": 2.741225004196167, + "learning_rate": 4.639465593319874e-06, + "loss": 0.4866, + "step": 2279 + }, + { + "epoch": 1.0780141843971631, + "grad_norm": 2.7945218086242676, + "learning_rate": 4.639142802554856e-06, + "loss": 0.4711, + "step": 2280 + 
}, + { + "epoch": 1.0784869976359337, + "grad_norm": 2.4282329082489014, + "learning_rate": 4.638819878594795e-06, + "loss": 0.4911, + "step": 2281 + }, + { + "epoch": 1.0789598108747045, + "grad_norm": 2.551741361618042, + "learning_rate": 4.638496821459799e-06, + "loss": 0.453, + "step": 2282 + }, + { + "epoch": 1.0794326241134753, + "grad_norm": 2.5622754096984863, + "learning_rate": 4.638173631169983e-06, + "loss": 0.5983, + "step": 2283 + }, + { + "epoch": 1.0799054373522459, + "grad_norm": 2.7748284339904785, + "learning_rate": 4.6378503077454715e-06, + "loss": 0.5143, + "step": 2284 + }, + { + "epoch": 1.0803782505910164, + "grad_norm": 2.7693238258361816, + "learning_rate": 4.637526851206394e-06, + "loss": 0.5929, + "step": 2285 + }, + { + "epoch": 1.0808510638297872, + "grad_norm": 2.705548048019409, + "learning_rate": 4.637203261572893e-06, + "loss": 0.5577, + "step": 2286 + }, + { + "epoch": 1.081323877068558, + "grad_norm": 2.739307165145874, + "learning_rate": 4.636879538865117e-06, + "loss": 0.5676, + "step": 2287 + }, + { + "epoch": 1.0817966903073286, + "grad_norm": 2.514059543609619, + "learning_rate": 4.636555683103221e-06, + "loss": 0.5001, + "step": 2288 + }, + { + "epoch": 1.0822695035460992, + "grad_norm": 2.7166874408721924, + "learning_rate": 4.636231694307372e-06, + "loss": 0.5411, + "step": 2289 + }, + { + "epoch": 1.08274231678487, + "grad_norm": 2.7661683559417725, + "learning_rate": 4.635907572497741e-06, + "loss": 0.6353, + "step": 2290 + }, + { + "epoch": 1.0832151300236406, + "grad_norm": 2.598381996154785, + "learning_rate": 4.635583317694512e-06, + "loss": 0.5213, + "step": 2291 + }, + { + "epoch": 1.0836879432624114, + "grad_norm": 2.821491003036499, + "learning_rate": 4.6352589299178744e-06, + "loss": 0.6172, + "step": 2292 + }, + { + "epoch": 1.084160756501182, + "grad_norm": 2.5422823429107666, + "learning_rate": 4.634934409188025e-06, + "loss": 0.5245, + "step": 2293 + }, + { + "epoch": 1.0846335697399527, + "grad_norm": 2.8264620304107666, + "learning_rate": 4.634609755525173e-06, + "loss": 0.5004, + "step": 2294 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 2.3286643028259277, + "learning_rate": 4.63428496894953e-06, + "loss": 0.4561, + "step": 2295 + }, + { + "epoch": 1.085579196217494, + "grad_norm": 2.462005376815796, + "learning_rate": 4.633960049481321e-06, + "loss": 0.4948, + "step": 2296 + }, + { + "epoch": 1.0860520094562647, + "grad_norm": 2.760258913040161, + "learning_rate": 4.633634997140777e-06, + "loss": 0.5407, + "step": 2297 + }, + { + "epoch": 1.0865248226950355, + "grad_norm": 3.0234217643737793, + "learning_rate": 4.633309811948138e-06, + "loss": 0.4914, + "step": 2298 + }, + { + "epoch": 1.086997635933806, + "grad_norm": 2.8380849361419678, + "learning_rate": 4.63298449392365e-06, + "loss": 0.5562, + "step": 2299 + }, + { + "epoch": 1.0874704491725768, + "grad_norm": 2.6201648712158203, + "learning_rate": 4.632659043087572e-06, + "loss": 0.5882, + "step": 2300 + }, + { + "epoch": 1.0879432624113474, + "grad_norm": 2.586339235305786, + "learning_rate": 4.632333459460165e-06, + "loss": 0.4991, + "step": 2301 + }, + { + "epoch": 1.0884160756501182, + "grad_norm": 2.500115394592285, + "learning_rate": 4.632007743061705e-06, + "loss": 0.552, + "step": 2302 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 2.816390037536621, + "learning_rate": 4.63168189391247e-06, + "loss": 0.5301, + "step": 2303 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 2.975400924682617, + "learning_rate": 4.631355912032753e-06, + "loss": 
0.6056, + "step": 2304 + }, + { + "epoch": 1.0898345153664302, + "grad_norm": 2.747985363006592, + "learning_rate": 4.631029797442846e-06, + "loss": 0.5335, + "step": 2305 + }, + { + "epoch": 1.090307328605201, + "grad_norm": 2.609281539916992, + "learning_rate": 4.630703550163059e-06, + "loss": 0.5189, + "step": 2306 + }, + { + "epoch": 1.0907801418439715, + "grad_norm": 2.624131202697754, + "learning_rate": 4.630377170213705e-06, + "loss": 0.5646, + "step": 2307 + }, + { + "epoch": 1.0912529550827423, + "grad_norm": 2.6186959743499756, + "learning_rate": 4.630050657615107e-06, + "loss": 0.5187, + "step": 2308 + }, + { + "epoch": 1.091725768321513, + "grad_norm": 2.9961764812469482, + "learning_rate": 4.629724012387594e-06, + "loss": 0.6207, + "step": 2309 + }, + { + "epoch": 1.0921985815602837, + "grad_norm": 2.665799140930176, + "learning_rate": 4.629397234551505e-06, + "loss": 0.5046, + "step": 2310 + }, + { + "epoch": 1.0926713947990543, + "grad_norm": 2.6154725551605225, + "learning_rate": 4.629070324127187e-06, + "loss": 0.5553, + "step": 2311 + }, + { + "epoch": 1.093144208037825, + "grad_norm": 2.702967643737793, + "learning_rate": 4.628743281134996e-06, + "loss": 0.5159, + "step": 2312 + }, + { + "epoch": 1.0936170212765957, + "grad_norm": 2.578080177307129, + "learning_rate": 4.628416105595295e-06, + "loss": 0.4934, + "step": 2313 + }, + { + "epoch": 1.0940898345153665, + "grad_norm": 2.8763060569763184, + "learning_rate": 4.628088797528456e-06, + "loss": 0.5404, + "step": 2314 + }, + { + "epoch": 1.094562647754137, + "grad_norm": 2.5301198959350586, + "learning_rate": 4.6277613569548585e-06, + "loss": 0.524, + "step": 2315 + }, + { + "epoch": 1.0950354609929078, + "grad_norm": 2.559903144836426, + "learning_rate": 4.627433783894892e-06, + "loss": 0.5177, + "step": 2316 + }, + { + "epoch": 1.0955082742316784, + "grad_norm": 2.430863380432129, + "learning_rate": 4.627106078368952e-06, + "loss": 0.5368, + "step": 2317 + }, + { + "epoch": 1.0959810874704492, + "grad_norm": 2.687567949295044, + "learning_rate": 4.626778240397444e-06, + "loss": 0.5385, + "step": 2318 + }, + { + "epoch": 1.0964539007092198, + "grad_norm": 3.053466558456421, + "learning_rate": 4.62645027000078e-06, + "loss": 0.5814, + "step": 2319 + }, + { + "epoch": 1.0969267139479906, + "grad_norm": 2.4612979888916016, + "learning_rate": 4.6261221671993815e-06, + "loss": 0.5069, + "step": 2320 + }, + { + "epoch": 1.0973995271867611, + "grad_norm": 2.6153628826141357, + "learning_rate": 4.625793932013679e-06, + "loss": 0.5422, + "step": 2321 + }, + { + "epoch": 1.097872340425532, + "grad_norm": 2.8918874263763428, + "learning_rate": 4.62546556446411e-06, + "loss": 0.5326, + "step": 2322 + }, + { + "epoch": 1.0983451536643025, + "grad_norm": 3.62565279006958, + "learning_rate": 4.625137064571119e-06, + "loss": 0.5164, + "step": 2323 + }, + { + "epoch": 1.0988179669030733, + "grad_norm": 2.4285085201263428, + "learning_rate": 4.624808432355164e-06, + "loss": 0.5084, + "step": 2324 + }, + { + "epoch": 1.099290780141844, + "grad_norm": 2.593979835510254, + "learning_rate": 4.624479667836702e-06, + "loss": 0.4986, + "step": 2325 + }, + { + "epoch": 1.0997635933806147, + "grad_norm": 2.490752935409546, + "learning_rate": 4.624150771036208e-06, + "loss": 0.5296, + "step": 2326 + }, + { + "epoch": 1.1002364066193853, + "grad_norm": 2.67694091796875, + "learning_rate": 4.6238217419741595e-06, + "loss": 0.5229, + "step": 2327 + }, + { + "epoch": 1.100709219858156, + "grad_norm": 2.594147205352783, + "learning_rate": 
4.623492580671044e-06, + "loss": 0.4916, + "step": 2328 + }, + { + "epoch": 1.1011820330969266, + "grad_norm": 2.943472385406494, + "learning_rate": 4.623163287147356e-06, + "loss": 0.5591, + "step": 2329 + }, + { + "epoch": 1.1016548463356974, + "grad_norm": 2.569410562515259, + "learning_rate": 4.622833861423601e-06, + "loss": 0.4648, + "step": 2330 + }, + { + "epoch": 1.102127659574468, + "grad_norm": 2.5490405559539795, + "learning_rate": 4.6225043035202886e-06, + "loss": 0.5493, + "step": 2331 + }, + { + "epoch": 1.1026004728132388, + "grad_norm": 2.5964598655700684, + "learning_rate": 4.622174613457941e-06, + "loss": 0.5358, + "step": 2332 + }, + { + "epoch": 1.1030732860520094, + "grad_norm": 2.6456820964813232, + "learning_rate": 4.621844791257085e-06, + "loss": 0.5864, + "step": 2333 + }, + { + "epoch": 1.1035460992907802, + "grad_norm": 2.861180067062378, + "learning_rate": 4.621514836938259e-06, + "loss": 0.6064, + "step": 2334 + }, + { + "epoch": 1.1040189125295508, + "grad_norm": 2.8199548721313477, + "learning_rate": 4.621184750522005e-06, + "loss": 0.5244, + "step": 2335 + }, + { + "epoch": 1.1044917257683216, + "grad_norm": 2.7398853302001953, + "learning_rate": 4.6208545320288795e-06, + "loss": 0.5496, + "step": 2336 + }, + { + "epoch": 1.1049645390070921, + "grad_norm": 2.7941031455993652, + "learning_rate": 4.620524181479441e-06, + "loss": 0.5496, + "step": 2337 + }, + { + "epoch": 1.105437352245863, + "grad_norm": 2.973785161972046, + "learning_rate": 4.620193698894259e-06, + "loss": 0.5492, + "step": 2338 + }, + { + "epoch": 1.1059101654846335, + "grad_norm": 2.650355815887451, + "learning_rate": 4.6198630842939144e-06, + "loss": 0.5392, + "step": 2339 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 2.9092214107513428, + "learning_rate": 4.61953233769899e-06, + "loss": 0.5305, + "step": 2340 + }, + { + "epoch": 1.1068557919621749, + "grad_norm": 2.6329731941223145, + "learning_rate": 4.61920145913008e-06, + "loss": 0.5031, + "step": 2341 + }, + { + "epoch": 1.1073286052009457, + "grad_norm": 2.7214207649230957, + "learning_rate": 4.618870448607788e-06, + "loss": 0.5536, + "step": 2342 + }, + { + "epoch": 1.1078014184397162, + "grad_norm": 2.873119592666626, + "learning_rate": 4.618539306152724e-06, + "loss": 0.4531, + "step": 2343 + }, + { + "epoch": 1.108274231678487, + "grad_norm": 2.701042413711548, + "learning_rate": 4.618208031785507e-06, + "loss": 0.5217, + "step": 2344 + }, + { + "epoch": 1.1087470449172576, + "grad_norm": 2.7189881801605225, + "learning_rate": 4.6178766255267635e-06, + "loss": 0.6205, + "step": 2345 + }, + { + "epoch": 1.1092198581560284, + "grad_norm": 2.546382188796997, + "learning_rate": 4.61754508739713e-06, + "loss": 0.5475, + "step": 2346 + }, + { + "epoch": 1.109692671394799, + "grad_norm": 2.8429276943206787, + "learning_rate": 4.617213417417249e-06, + "loss": 0.4809, + "step": 2347 + }, + { + "epoch": 1.1101654846335698, + "grad_norm": 2.9515812397003174, + "learning_rate": 4.616881615607772e-06, + "loss": 0.5067, + "step": 2348 + }, + { + "epoch": 1.1106382978723404, + "grad_norm": 2.5910723209381104, + "learning_rate": 4.616549681989358e-06, + "loss": 0.5368, + "step": 2349 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 2.80855655670166, + "learning_rate": 4.616217616582678e-06, + "loss": 0.5827, + "step": 2350 + }, + { + "epoch": 1.1115839243498817, + "grad_norm": 2.604383945465088, + "learning_rate": 4.6158854194084044e-06, + "loss": 0.5716, + "step": 2351 + }, + { + "epoch": 1.1120567375886525, + "grad_norm": 
3.0585904121398926, + "learning_rate": 4.6155530904872246e-06, + "loss": 0.4998, + "step": 2352 + }, + { + "epoch": 1.112529550827423, + "grad_norm": 2.660961627960205, + "learning_rate": 4.61522062983983e-06, + "loss": 0.4533, + "step": 2353 + }, + { + "epoch": 1.113002364066194, + "grad_norm": 2.8042070865631104, + "learning_rate": 4.614888037486923e-06, + "loss": 0.5592, + "step": 2354 + }, + { + "epoch": 1.1134751773049645, + "grad_norm": 2.681664228439331, + "learning_rate": 4.61455531344921e-06, + "loss": 0.5439, + "step": 2355 + }, + { + "epoch": 1.1139479905437353, + "grad_norm": 2.905054807662964, + "learning_rate": 4.61422245774741e-06, + "loss": 0.5497, + "step": 2356 + }, + { + "epoch": 1.1144208037825059, + "grad_norm": 2.7979753017425537, + "learning_rate": 4.6138894704022484e-06, + "loss": 0.5374, + "step": 2357 + }, + { + "epoch": 1.1148936170212767, + "grad_norm": 2.965611696243286, + "learning_rate": 4.613556351434458e-06, + "loss": 0.5145, + "step": 2358 + }, + { + "epoch": 1.1153664302600472, + "grad_norm": 2.583134889602661, + "learning_rate": 4.613223100864782e-06, + "loss": 0.535, + "step": 2359 + }, + { + "epoch": 1.115839243498818, + "grad_norm": 2.5979621410369873, + "learning_rate": 4.61288971871397e-06, + "loss": 0.5514, + "step": 2360 + }, + { + "epoch": 1.1163120567375886, + "grad_norm": 3.0117669105529785, + "learning_rate": 4.612556205002779e-06, + "loss": 0.5266, + "step": 2361 + }, + { + "epoch": 1.1167848699763594, + "grad_norm": 2.425133466720581, + "learning_rate": 4.612222559751976e-06, + "loss": 0.4838, + "step": 2362 + }, + { + "epoch": 1.11725768321513, + "grad_norm": 2.5102691650390625, + "learning_rate": 4.611888782982337e-06, + "loss": 0.3947, + "step": 2363 + }, + { + "epoch": 1.1177304964539008, + "grad_norm": 3.0327367782592773, + "learning_rate": 4.611554874714645e-06, + "loss": 0.5753, + "step": 2364 + }, + { + "epoch": 1.1182033096926713, + "grad_norm": 2.4561009407043457, + "learning_rate": 4.6112208349696875e-06, + "loss": 0.5054, + "step": 2365 + }, + { + "epoch": 1.1186761229314421, + "grad_norm": 3.3898050785064697, + "learning_rate": 4.610886663768267e-06, + "loss": 0.5946, + "step": 2366 + }, + { + "epoch": 1.1191489361702127, + "grad_norm": 2.8112242221832275, + "learning_rate": 4.61055236113119e-06, + "loss": 0.5475, + "step": 2367 + }, + { + "epoch": 1.1196217494089835, + "grad_norm": 3.152946710586548, + "learning_rate": 4.610217927079272e-06, + "loss": 0.5165, + "step": 2368 + }, + { + "epoch": 1.120094562647754, + "grad_norm": 2.7847867012023926, + "learning_rate": 4.609883361633336e-06, + "loss": 0.5533, + "step": 2369 + }, + { + "epoch": 1.1205673758865249, + "grad_norm": 2.6376686096191406, + "learning_rate": 4.6095486648142155e-06, + "loss": 0.4942, + "step": 2370 + }, + { + "epoch": 1.1210401891252955, + "grad_norm": 3.123072862625122, + "learning_rate": 4.609213836642749e-06, + "loss": 0.616, + "step": 2371 + }, + { + "epoch": 1.1215130023640663, + "grad_norm": 2.802694320678711, + "learning_rate": 4.608878877139786e-06, + "loss": 0.5323, + "step": 2372 + }, + { + "epoch": 1.1219858156028368, + "grad_norm": 2.3567938804626465, + "learning_rate": 4.6085437863261825e-06, + "loss": 0.4822, + "step": 2373 + }, + { + "epoch": 1.1224586288416076, + "grad_norm": 2.553112030029297, + "learning_rate": 4.608208564222804e-06, + "loss": 0.5447, + "step": 2374 + }, + { + "epoch": 1.1229314420803782, + "grad_norm": 3.0020132064819336, + "learning_rate": 4.607873210850521e-06, + "loss": 0.6486, + "step": 2375 + }, + { + "epoch": 
1.123404255319149, + "grad_norm": 2.832442045211792, + "learning_rate": 4.607537726230216e-06, + "loss": 0.5257, + "step": 2376 + }, + { + "epoch": 1.1238770685579196, + "grad_norm": 2.471527099609375, + "learning_rate": 4.607202110382778e-06, + "loss": 0.4816, + "step": 2377 + }, + { + "epoch": 1.1243498817966904, + "grad_norm": 2.4232118129730225, + "learning_rate": 4.606866363329105e-06, + "loss": 0.5533, + "step": 2378 + }, + { + "epoch": 1.124822695035461, + "grad_norm": 2.477506637573242, + "learning_rate": 4.6065304850901025e-06, + "loss": 0.5223, + "step": 2379 + }, + { + "epoch": 1.1252955082742317, + "grad_norm": 3.54127836227417, + "learning_rate": 4.6061944756866824e-06, + "loss": 0.6514, + "step": 2380 + }, + { + "epoch": 1.1257683215130023, + "grad_norm": 2.5148677825927734, + "learning_rate": 4.605858335139768e-06, + "loss": 0.4864, + "step": 2381 + }, + { + "epoch": 1.1262411347517731, + "grad_norm": 2.8363659381866455, + "learning_rate": 4.605522063470289e-06, + "loss": 0.5034, + "step": 2382 + }, + { + "epoch": 1.1267139479905437, + "grad_norm": 2.4996654987335205, + "learning_rate": 4.605185660699184e-06, + "loss": 0.4126, + "step": 2383 + }, + { + "epoch": 1.1271867612293145, + "grad_norm": 2.352543830871582, + "learning_rate": 4.604849126847398e-06, + "loss": 0.5224, + "step": 2384 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 2.60101056098938, + "learning_rate": 4.6045124619358875e-06, + "loss": 0.4867, + "step": 2385 + }, + { + "epoch": 1.1281323877068559, + "grad_norm": 2.9471068382263184, + "learning_rate": 4.604175665985613e-06, + "loss": 0.6474, + "step": 2386 + }, + { + "epoch": 1.1286052009456264, + "grad_norm": 2.5933351516723633, + "learning_rate": 4.603838739017546e-06, + "loss": 0.5081, + "step": 2387 + }, + { + "epoch": 1.1290780141843972, + "grad_norm": 2.3740346431732178, + "learning_rate": 4.6035016810526665e-06, + "loss": 0.4438, + "step": 2388 + }, + { + "epoch": 1.1295508274231678, + "grad_norm": 2.675020217895508, + "learning_rate": 4.6031644921119614e-06, + "loss": 0.4968, + "step": 2389 + }, + { + "epoch": 1.1300236406619386, + "grad_norm": 2.599472999572754, + "learning_rate": 4.602827172216424e-06, + "loss": 0.5131, + "step": 2390 + }, + { + "epoch": 1.1304964539007092, + "grad_norm": 2.8176097869873047, + "learning_rate": 4.602489721387061e-06, + "loss": 0.5549, + "step": 2391 + }, + { + "epoch": 1.13096926713948, + "grad_norm": 2.466914176940918, + "learning_rate": 4.602152139644881e-06, + "loss": 0.5052, + "step": 2392 + }, + { + "epoch": 1.1314420803782506, + "grad_norm": 2.8938796520233154, + "learning_rate": 4.601814427010905e-06, + "loss": 0.6181, + "step": 2393 + }, + { + "epoch": 1.1319148936170214, + "grad_norm": 2.7390825748443604, + "learning_rate": 4.601476583506161e-06, + "loss": 0.5178, + "step": 2394 + }, + { + "epoch": 1.132387706855792, + "grad_norm": 3.180112838745117, + "learning_rate": 4.601138609151685e-06, + "loss": 0.6071, + "step": 2395 + }, + { + "epoch": 1.1328605200945627, + "grad_norm": 2.9282350540161133, + "learning_rate": 4.600800503968521e-06, + "loss": 0.5557, + "step": 2396 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 2.6689717769622803, + "learning_rate": 4.6004622679777215e-06, + "loss": 0.4679, + "step": 2397 + }, + { + "epoch": 1.133806146572104, + "grad_norm": 2.651582956314087, + "learning_rate": 4.600123901200347e-06, + "loss": 0.4907, + "step": 2398 + }, + { + "epoch": 1.1342789598108747, + "grad_norm": 2.5702924728393555, + "learning_rate": 4.599785403657464e-06, + "loss": 0.4919, + 
"step": 2399 + }, + { + "epoch": 1.1347517730496455, + "grad_norm": 2.636812448501587, + "learning_rate": 4.599446775370153e-06, + "loss": 0.5091, + "step": 2400 + }, + { + "epoch": 1.135224586288416, + "grad_norm": 2.5965442657470703, + "learning_rate": 4.599108016359497e-06, + "loss": 0.5035, + "step": 2401 + }, + { + "epoch": 1.1356973995271868, + "grad_norm": 2.689732313156128, + "learning_rate": 4.5987691266465885e-06, + "loss": 0.5307, + "step": 2402 + }, + { + "epoch": 1.1361702127659574, + "grad_norm": 2.7256956100463867, + "learning_rate": 4.59843010625253e-06, + "loss": 0.5066, + "step": 2403 + }, + { + "epoch": 1.1366430260047282, + "grad_norm": 2.726020574569702, + "learning_rate": 4.59809095519843e-06, + "loss": 0.4805, + "step": 2404 + }, + { + "epoch": 1.1371158392434988, + "grad_norm": 2.703339099884033, + "learning_rate": 4.597751673505406e-06, + "loss": 0.4992, + "step": 2405 + }, + { + "epoch": 1.1375886524822696, + "grad_norm": 2.54455304145813, + "learning_rate": 4.5974122611945835e-06, + "loss": 0.5251, + "step": 2406 + }, + { + "epoch": 1.1380614657210402, + "grad_norm": 2.623507022857666, + "learning_rate": 4.597072718287096e-06, + "loss": 0.4831, + "step": 2407 + }, + { + "epoch": 1.138534278959811, + "grad_norm": 2.653590202331543, + "learning_rate": 4.596733044804086e-06, + "loss": 0.5646, + "step": 2408 + }, + { + "epoch": 1.1390070921985815, + "grad_norm": 2.8230600357055664, + "learning_rate": 4.5963932407667035e-06, + "loss": 0.514, + "step": 2409 + }, + { + "epoch": 1.1394799054373523, + "grad_norm": 2.6077451705932617, + "learning_rate": 4.5960533061961065e-06, + "loss": 0.4713, + "step": 2410 + }, + { + "epoch": 1.139952718676123, + "grad_norm": 2.3945798873901367, + "learning_rate": 4.595713241113461e-06, + "loss": 0.466, + "step": 2411 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 2.8100006580352783, + "learning_rate": 4.595373045539941e-06, + "loss": 0.5365, + "step": 2412 + }, + { + "epoch": 1.1408983451536643, + "grad_norm": 2.6825881004333496, + "learning_rate": 4.59503271949673e-06, + "loss": 0.4457, + "step": 2413 + }, + { + "epoch": 1.141371158392435, + "grad_norm": 2.969435691833496, + "learning_rate": 4.594692263005016e-06, + "loss": 0.5459, + "step": 2414 + }, + { + "epoch": 1.1418439716312057, + "grad_norm": 2.4103164672851562, + "learning_rate": 4.594351676086002e-06, + "loss": 0.4573, + "step": 2415 + }, + { + "epoch": 1.1423167848699765, + "grad_norm": 2.9450128078460693, + "learning_rate": 4.594010958760892e-06, + "loss": 0.5529, + "step": 2416 + }, + { + "epoch": 1.142789598108747, + "grad_norm": 2.6416335105895996, + "learning_rate": 4.593670111050901e-06, + "loss": 0.5153, + "step": 2417 + }, + { + "epoch": 1.1432624113475178, + "grad_norm": 2.473177194595337, + "learning_rate": 4.593329132977253e-06, + "loss": 0.4962, + "step": 2418 + }, + { + "epoch": 1.1437352245862884, + "grad_norm": 2.4494502544403076, + "learning_rate": 4.592988024561179e-06, + "loss": 0.5182, + "step": 2419 + }, + { + "epoch": 1.1442080378250592, + "grad_norm": 2.773930311203003, + "learning_rate": 4.592646785823918e-06, + "loss": 0.4442, + "step": 2420 + }, + { + "epoch": 1.1446808510638298, + "grad_norm": 2.4733314514160156, + "learning_rate": 4.592305416786718e-06, + "loss": 0.5106, + "step": 2421 + }, + { + "epoch": 1.1451536643026006, + "grad_norm": 2.6870038509368896, + "learning_rate": 4.591963917470834e-06, + "loss": 0.5316, + "step": 2422 + }, + { + "epoch": 1.1456264775413711, + "grad_norm": 2.8989531993865967, + "learning_rate": 
4.591622287897529e-06, + "loss": 0.5906, + "step": 2423 + }, + { + "epoch": 1.1460992907801417, + "grad_norm": 2.6349124908447266, + "learning_rate": 4.591280528088077e-06, + "loss": 0.6225, + "step": 2424 + }, + { + "epoch": 1.1465721040189125, + "grad_norm": 3.19022274017334, + "learning_rate": 4.5909386380637555e-06, + "loss": 0.555, + "step": 2425 + }, + { + "epoch": 1.1470449172576833, + "grad_norm": 3.1473541259765625, + "learning_rate": 4.5905966178458535e-06, + "loss": 0.537, + "step": 2426 + }, + { + "epoch": 1.147517730496454, + "grad_norm": 2.6996145248413086, + "learning_rate": 4.590254467455667e-06, + "loss": 0.565, + "step": 2427 + }, + { + "epoch": 1.1479905437352245, + "grad_norm": 2.830188274383545, + "learning_rate": 4.5899121869145015e-06, + "loss": 0.6773, + "step": 2428 + }, + { + "epoch": 1.1484633569739953, + "grad_norm": 2.4937260150909424, + "learning_rate": 4.589569776243667e-06, + "loss": 0.5484, + "step": 2429 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 2.54011869430542, + "learning_rate": 4.589227235464486e-06, + "loss": 0.5307, + "step": 2430 + }, + { + "epoch": 1.1494089834515366, + "grad_norm": 2.8764214515686035, + "learning_rate": 4.5888845645982845e-06, + "loss": 0.5296, + "step": 2431 + }, + { + "epoch": 1.1498817966903072, + "grad_norm": 2.637033462524414, + "learning_rate": 4.588541763666402e-06, + "loss": 0.5975, + "step": 2432 + }, + { + "epoch": 1.150354609929078, + "grad_norm": 2.8534255027770996, + "learning_rate": 4.5881988326901815e-06, + "loss": 0.5431, + "step": 2433 + }, + { + "epoch": 1.1508274231678488, + "grad_norm": 2.8546559810638428, + "learning_rate": 4.587855771690976e-06, + "loss": 0.469, + "step": 2434 + }, + { + "epoch": 1.1513002364066194, + "grad_norm": 2.9084973335266113, + "learning_rate": 4.587512580690146e-06, + "loss": 0.5566, + "step": 2435 + }, + { + "epoch": 1.15177304964539, + "grad_norm": 3.0993130207061768, + "learning_rate": 4.587169259709063e-06, + "loss": 0.5612, + "step": 2436 + }, + { + "epoch": 1.1522458628841608, + "grad_norm": 10.847400665283203, + "learning_rate": 4.5868258087691e-06, + "loss": 0.4678, + "step": 2437 + }, + { + "epoch": 1.1527186761229316, + "grad_norm": 2.6648571491241455, + "learning_rate": 4.586482227891645e-06, + "loss": 0.5951, + "step": 2438 + }, + { + "epoch": 1.1531914893617021, + "grad_norm": 2.529043197631836, + "learning_rate": 4.586138517098091e-06, + "loss": 0.5048, + "step": 2439 + }, + { + "epoch": 1.1536643026004727, + "grad_norm": 2.833904504776001, + "learning_rate": 4.585794676409839e-06, + "loss": 0.536, + "step": 2440 + }, + { + "epoch": 1.1541371158392435, + "grad_norm": 3.507657766342163, + "learning_rate": 4.585450705848298e-06, + "loss": 0.5954, + "step": 2441 + }, + { + "epoch": 1.1546099290780143, + "grad_norm": 2.6108388900756836, + "learning_rate": 4.585106605434887e-06, + "loss": 0.5684, + "step": 2442 + }, + { + "epoch": 1.1550827423167849, + "grad_norm": 2.490708589553833, + "learning_rate": 4.58476237519103e-06, + "loss": 0.4678, + "step": 2443 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 2.8192343711853027, + "learning_rate": 4.584418015138161e-06, + "loss": 0.5291, + "step": 2444 + }, + { + "epoch": 1.1560283687943262, + "grad_norm": 3.0878679752349854, + "learning_rate": 4.584073525297722e-06, + "loss": 0.5691, + "step": 2445 + }, + { + "epoch": 1.156501182033097, + "grad_norm": 3.1444318294525146, + "learning_rate": 4.583728905691163e-06, + "loss": 0.5643, + "step": 2446 + }, + { + "epoch": 1.1569739952718676, + "grad_norm": 
3.02382230758667, + "learning_rate": 4.583384156339942e-06, + "loss": 0.6008, + "step": 2447 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 2.5942490100860596, + "learning_rate": 4.583039277265525e-06, + "loss": 0.5105, + "step": 2448 + }, + { + "epoch": 1.157919621749409, + "grad_norm": 2.938608407974243, + "learning_rate": 4.582694268489386e-06, + "loss": 0.5123, + "step": 2449 + }, + { + "epoch": 1.1583924349881798, + "grad_norm": 2.4622268676757812, + "learning_rate": 4.5823491300330075e-06, + "loss": 0.4538, + "step": 2450 + }, + { + "epoch": 1.1588652482269504, + "grad_norm": 2.4380505084991455, + "learning_rate": 4.5820038619178795e-06, + "loss": 0.4682, + "step": 2451 + }, + { + "epoch": 1.159338061465721, + "grad_norm": 2.479896068572998, + "learning_rate": 4.581658464165501e-06, + "loss": 0.4877, + "step": 2452 + }, + { + "epoch": 1.1598108747044917, + "grad_norm": 2.3373546600341797, + "learning_rate": 4.5813129367973765e-06, + "loss": 0.445, + "step": 2453 + }, + { + "epoch": 1.1602836879432625, + "grad_norm": 2.8586013317108154, + "learning_rate": 4.5809672798350214e-06, + "loss": 0.5232, + "step": 2454 + }, + { + "epoch": 1.160756501182033, + "grad_norm": 3.2302439212799072, + "learning_rate": 4.5806214932999595e-06, + "loss": 0.5336, + "step": 2455 + }, + { + "epoch": 1.1612293144208037, + "grad_norm": 3.1005783081054688, + "learning_rate": 4.580275577213721e-06, + "loss": 0.5123, + "step": 2456 + }, + { + "epoch": 1.1617021276595745, + "grad_norm": 2.7131073474884033, + "learning_rate": 4.579929531597842e-06, + "loss": 0.5648, + "step": 2457 + }, + { + "epoch": 1.1621749408983453, + "grad_norm": 2.5067050457000732, + "learning_rate": 4.579583356473874e-06, + "loss": 0.5324, + "step": 2458 + }, + { + "epoch": 1.1626477541371159, + "grad_norm": 2.7870543003082275, + "learning_rate": 4.579237051863366e-06, + "loss": 0.5094, + "step": 2459 + }, + { + "epoch": 1.1631205673758864, + "grad_norm": 2.739196300506592, + "learning_rate": 4.578890617787887e-06, + "loss": 0.5103, + "step": 2460 + }, + { + "epoch": 1.1635933806146572, + "grad_norm": 2.7108185291290283, + "learning_rate": 4.578544054269003e-06, + "loss": 0.533, + "step": 2461 + }, + { + "epoch": 1.1640661938534278, + "grad_norm": 3.028005361557007, + "learning_rate": 4.578197361328295e-06, + "loss": 0.636, + "step": 2462 + }, + { + "epoch": 1.1645390070921986, + "grad_norm": 2.4855129718780518, + "learning_rate": 4.5778505389873505e-06, + "loss": 0.501, + "step": 2463 + }, + { + "epoch": 1.1650118203309692, + "grad_norm": 2.6314198970794678, + "learning_rate": 4.577503587267764e-06, + "loss": 0.5812, + "step": 2464 + }, + { + "epoch": 1.16548463356974, + "grad_norm": 2.4209671020507812, + "learning_rate": 4.5771565061911385e-06, + "loss": 0.5168, + "step": 2465 + }, + { + "epoch": 1.1659574468085105, + "grad_norm": 2.526388645172119, + "learning_rate": 4.576809295779085e-06, + "loss": 0.5047, + "step": 2466 + }, + { + "epoch": 1.1664302600472813, + "grad_norm": 2.8278191089630127, + "learning_rate": 4.576461956053224e-06, + "loss": 0.4759, + "step": 2467 + }, + { + "epoch": 1.166903073286052, + "grad_norm": 2.7862167358398438, + "learning_rate": 4.576114487035182e-06, + "loss": 0.5492, + "step": 2468 + }, + { + "epoch": 1.1673758865248227, + "grad_norm": 2.6303019523620605, + "learning_rate": 4.575766888746594e-06, + "loss": 0.5538, + "step": 2469 + }, + { + "epoch": 1.1678486997635933, + "grad_norm": 2.613104820251465, + "learning_rate": 4.5754191612091034e-06, + "loss": 0.5114, + "step": 2470 + }, + { + "epoch": 
1.168321513002364, + "grad_norm": 2.653958320617676, + "learning_rate": 4.5750713044443625e-06, + "loss": 0.5858, + "step": 2471 + }, + { + "epoch": 1.1687943262411347, + "grad_norm": 3.1143975257873535, + "learning_rate": 4.574723318474031e-06, + "loss": 0.5193, + "step": 2472 + }, + { + "epoch": 1.1692671394799055, + "grad_norm": 3.05454421043396, + "learning_rate": 4.574375203319775e-06, + "loss": 0.464, + "step": 2473 + }, + { + "epoch": 1.169739952718676, + "grad_norm": 2.66626238822937, + "learning_rate": 4.574026959003272e-06, + "loss": 0.4988, + "step": 2474 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 2.8871963024139404, + "learning_rate": 4.573678585546203e-06, + "loss": 0.5557, + "step": 2475 + }, + { + "epoch": 1.1706855791962174, + "grad_norm": 2.592949628829956, + "learning_rate": 4.573330082970262e-06, + "loss": 0.5178, + "step": 2476 + }, + { + "epoch": 1.1711583924349882, + "grad_norm": 2.9111456871032715, + "learning_rate": 4.572981451297148e-06, + "loss": 0.5712, + "step": 2477 + }, + { + "epoch": 1.1716312056737588, + "grad_norm": 2.8152248859405518, + "learning_rate": 4.57263269054857e-06, + "loss": 0.5548, + "step": 2478 + }, + { + "epoch": 1.1721040189125296, + "grad_norm": 3.0292418003082275, + "learning_rate": 4.572283800746241e-06, + "loss": 0.5937, + "step": 2479 + }, + { + "epoch": 1.1725768321513002, + "grad_norm": 3.454618215560913, + "learning_rate": 4.571934781911886e-06, + "loss": 0.5537, + "step": 2480 + }, + { + "epoch": 1.173049645390071, + "grad_norm": 2.7817866802215576, + "learning_rate": 4.571585634067239e-06, + "loss": 0.5649, + "step": 2481 + }, + { + "epoch": 1.1735224586288415, + "grad_norm": 2.7989349365234375, + "learning_rate": 4.571236357234037e-06, + "loss": 0.5448, + "step": 2482 + }, + { + "epoch": 1.1739952718676123, + "grad_norm": 2.8863933086395264, + "learning_rate": 4.57088695143403e-06, + "loss": 0.63, + "step": 2483 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 2.5738039016723633, + "learning_rate": 4.570537416688972e-06, + "loss": 0.4702, + "step": 2484 + }, + { + "epoch": 1.1749408983451537, + "grad_norm": 3.003643274307251, + "learning_rate": 4.570187753020629e-06, + "loss": 0.5918, + "step": 2485 + }, + { + "epoch": 1.1754137115839243, + "grad_norm": 2.8619167804718018, + "learning_rate": 4.569837960450772e-06, + "loss": 0.5268, + "step": 2486 + }, + { + "epoch": 1.175886524822695, + "grad_norm": 2.876077175140381, + "learning_rate": 4.569488039001181e-06, + "loss": 0.4915, + "step": 2487 + }, + { + "epoch": 1.1763593380614656, + "grad_norm": 3.407115936279297, + "learning_rate": 4.569137988693644e-06, + "loss": 0.5761, + "step": 2488 + }, + { + "epoch": 1.1768321513002364, + "grad_norm": 2.7292826175689697, + "learning_rate": 4.568787809549958e-06, + "loss": 0.541, + "step": 2489 + }, + { + "epoch": 1.177304964539007, + "grad_norm": 2.8805999755859375, + "learning_rate": 4.568437501591926e-06, + "loss": 0.6223, + "step": 2490 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 2.9264373779296875, + "learning_rate": 4.56808706484136e-06, + "loss": 0.6081, + "step": 2491 + }, + { + "epoch": 1.1782505910165484, + "grad_norm": 2.5167033672332764, + "learning_rate": 4.567736499320082e-06, + "loss": 0.5393, + "step": 2492 + }, + { + "epoch": 1.1787234042553192, + "grad_norm": 3.4647862911224365, + "learning_rate": 4.567385805049918e-06, + "loss": 0.4826, + "step": 2493 + }, + { + "epoch": 1.1791962174940898, + "grad_norm": 2.9824202060699463, + "learning_rate": 4.5670349820527055e-06, + "loss": 0.541, + "step": 
2494 + }, + { + "epoch": 1.1796690307328606, + "grad_norm": 2.997105836868286, + "learning_rate": 4.5666840303502885e-06, + "loss": 0.5771, + "step": 2495 + }, + { + "epoch": 1.1801418439716311, + "grad_norm": 2.8728017807006836, + "learning_rate": 4.56633294996452e-06, + "loss": 0.4877, + "step": 2496 + }, + { + "epoch": 1.180614657210402, + "grad_norm": 2.626498222351074, + "learning_rate": 4.5659817409172565e-06, + "loss": 0.5296, + "step": 2497 + }, + { + "epoch": 1.1810874704491725, + "grad_norm": 2.87037992477417, + "learning_rate": 4.565630403230371e-06, + "loss": 0.539, + "step": 2498 + }, + { + "epoch": 1.1815602836879433, + "grad_norm": 2.5719685554504395, + "learning_rate": 4.5652789369257375e-06, + "loss": 0.5653, + "step": 2499 + }, + { + "epoch": 1.1820330969267139, + "grad_norm": 2.4842135906219482, + "learning_rate": 4.56492734202524e-06, + "loss": 0.515, + "step": 2500 + }, + { + "epoch": 1.1825059101654847, + "grad_norm": 2.640951156616211, + "learning_rate": 4.564575618550773e-06, + "loss": 0.5601, + "step": 2501 + }, + { + "epoch": 1.1829787234042553, + "grad_norm": 2.624394655227661, + "learning_rate": 4.564223766524234e-06, + "loss": 0.5551, + "step": 2502 + }, + { + "epoch": 1.183451536643026, + "grad_norm": 3.014537811279297, + "learning_rate": 4.563871785967533e-06, + "loss": 0.5212, + "step": 2503 + }, + { + "epoch": 1.1839243498817966, + "grad_norm": 2.8756890296936035, + "learning_rate": 4.563519676902585e-06, + "loss": 0.5132, + "step": 2504 + }, + { + "epoch": 1.1843971631205674, + "grad_norm": 2.636781692504883, + "learning_rate": 4.5631674393513145e-06, + "loss": 0.5323, + "step": 2505 + }, + { + "epoch": 1.184869976359338, + "grad_norm": 2.7233786582946777, + "learning_rate": 4.562815073335655e-06, + "loss": 0.5608, + "step": 2506 + }, + { + "epoch": 1.1853427895981088, + "grad_norm": 2.7158713340759277, + "learning_rate": 4.562462578877546e-06, + "loss": 0.5373, + "step": 2507 + }, + { + "epoch": 1.1858156028368794, + "grad_norm": 2.9754762649536133, + "learning_rate": 4.562109955998936e-06, + "loss": 0.5712, + "step": 2508 + }, + { + "epoch": 1.1862884160756502, + "grad_norm": 2.8815054893493652, + "learning_rate": 4.561757204721781e-06, + "loss": 0.6126, + "step": 2509 + }, + { + "epoch": 1.1867612293144207, + "grad_norm": 2.866319417953491, + "learning_rate": 4.561404325068045e-06, + "loss": 0.506, + "step": 2510 + }, + { + "epoch": 1.1872340425531915, + "grad_norm": 2.6187376976013184, + "learning_rate": 4.561051317059701e-06, + "loss": 0.4674, + "step": 2511 + }, + { + "epoch": 1.1877068557919621, + "grad_norm": 2.642552137374878, + "learning_rate": 4.560698180718729e-06, + "loss": 0.4793, + "step": 2512 + }, + { + "epoch": 1.188179669030733, + "grad_norm": 2.7815041542053223, + "learning_rate": 4.560344916067117e-06, + "loss": 0.5034, + "step": 2513 + }, + { + "epoch": 1.1886524822695035, + "grad_norm": 2.70853590965271, + "learning_rate": 4.559991523126862e-06, + "loss": 0.4811, + "step": 2514 + }, + { + "epoch": 1.1891252955082743, + "grad_norm": 2.7049436569213867, + "learning_rate": 4.559638001919967e-06, + "loss": 0.547, + "step": 2515 + }, + { + "epoch": 1.1895981087470449, + "grad_norm": 2.766773223876953, + "learning_rate": 4.559284352468445e-06, + "loss": 0.5362, + "step": 2516 + }, + { + "epoch": 1.1900709219858157, + "grad_norm": 3.0064334869384766, + "learning_rate": 4.558930574794316e-06, + "loss": 0.5915, + "step": 2517 + }, + { + "epoch": 1.1905437352245862, + "grad_norm": 2.4899885654449463, + "learning_rate": 4.558576668919609e-06, + 
"loss": 0.4379, + "step": 2518 + }, + { + "epoch": 1.191016548463357, + "grad_norm": 2.925963878631592, + "learning_rate": 4.558222634866358e-06, + "loss": 0.5389, + "step": 2519 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 6.087667465209961, + "learning_rate": 4.55786847265661e-06, + "loss": 0.4777, + "step": 2520 + }, + { + "epoch": 1.1919621749408984, + "grad_norm": 2.4560582637786865, + "learning_rate": 4.5575141823124145e-06, + "loss": 0.5576, + "step": 2521 + }, + { + "epoch": 1.192434988179669, + "grad_norm": 3.184252977371216, + "learning_rate": 4.557159763855834e-06, + "loss": 0.5151, + "step": 2522 + }, + { + "epoch": 1.1929078014184398, + "grad_norm": 2.359722137451172, + "learning_rate": 4.556805217308935e-06, + "loss": 0.478, + "step": 2523 + }, + { + "epoch": 1.1933806146572103, + "grad_norm": 3.0821568965911865, + "learning_rate": 4.5564505426937935e-06, + "loss": 0.5784, + "step": 2524 + }, + { + "epoch": 1.1938534278959811, + "grad_norm": 2.9905128479003906, + "learning_rate": 4.5560957400324936e-06, + "loss": 0.6087, + "step": 2525 + }, + { + "epoch": 1.1943262411347517, + "grad_norm": 2.462102174758911, + "learning_rate": 4.555740809347128e-06, + "loss": 0.4739, + "step": 2526 + }, + { + "epoch": 1.1947990543735225, + "grad_norm": 2.7931067943573, + "learning_rate": 4.555385750659796e-06, + "loss": 0.4961, + "step": 2527 + }, + { + "epoch": 1.195271867612293, + "grad_norm": 2.660320997238159, + "learning_rate": 4.555030563992607e-06, + "loss": 0.487, + "step": 2528 + }, + { + "epoch": 1.195744680851064, + "grad_norm": 2.8135557174682617, + "learning_rate": 4.554675249367675e-06, + "loss": 0.5269, + "step": 2529 + }, + { + "epoch": 1.1962174940898345, + "grad_norm": 2.661933422088623, + "learning_rate": 4.554319806807126e-06, + "loss": 0.4723, + "step": 2530 + }, + { + "epoch": 1.1966903073286053, + "grad_norm": 2.568176507949829, + "learning_rate": 4.553964236333089e-06, + "loss": 0.5258, + "step": 2531 + }, + { + "epoch": 1.1971631205673758, + "grad_norm": 2.6890947818756104, + "learning_rate": 4.553608537967705e-06, + "loss": 0.4965, + "step": 2532 + }, + { + "epoch": 1.1976359338061466, + "grad_norm": 3.133470058441162, + "learning_rate": 4.553252711733124e-06, + "loss": 0.5423, + "step": 2533 + }, + { + "epoch": 1.1981087470449172, + "grad_norm": 2.7086687088012695, + "learning_rate": 4.552896757651498e-06, + "loss": 0.5326, + "step": 2534 + }, + { + "epoch": 1.198581560283688, + "grad_norm": 2.8411715030670166, + "learning_rate": 4.552540675744994e-06, + "loss": 0.5793, + "step": 2535 + }, + { + "epoch": 1.1990543735224586, + "grad_norm": 3.041077136993408, + "learning_rate": 4.552184466035782e-06, + "loss": 0.5068, + "step": 2536 + }, + { + "epoch": 1.1995271867612294, + "grad_norm": 2.5921192169189453, + "learning_rate": 4.551828128546041e-06, + "loss": 0.5189, + "step": 2537 + }, + { + "epoch": 1.2, + "grad_norm": 2.923305034637451, + "learning_rate": 4.5514716632979605e-06, + "loss": 0.516, + "step": 2538 + }, + { + "epoch": 1.2004728132387708, + "grad_norm": 2.7083024978637695, + "learning_rate": 4.551115070313734e-06, + "loss": 0.4825, + "step": 2539 + }, + { + "epoch": 1.2009456264775413, + "grad_norm": 2.746842384338379, + "learning_rate": 4.550758349615567e-06, + "loss": 0.5691, + "step": 2540 + }, + { + "epoch": 1.2014184397163121, + "grad_norm": 2.6596429347991943, + "learning_rate": 4.550401501225669e-06, + "loss": 0.5983, + "step": 2541 + }, + { + "epoch": 1.2018912529550827, + "grad_norm": 2.9057931900024414, + "learning_rate": 
4.550044525166261e-06, + "loss": 0.5069, + "step": 2542 + }, + { + "epoch": 1.2023640661938535, + "grad_norm": 2.6139039993286133, + "learning_rate": 4.5496874214595686e-06, + "loss": 0.5102, + "step": 2543 + }, + { + "epoch": 1.202836879432624, + "grad_norm": 2.630286455154419, + "learning_rate": 4.5493301901278285e-06, + "loss": 0.4902, + "step": 2544 + }, + { + "epoch": 1.2033096926713949, + "grad_norm": 2.639174222946167, + "learning_rate": 4.548972831193284e-06, + "loss": 0.4566, + "step": 2545 + }, + { + "epoch": 1.2037825059101654, + "grad_norm": 2.9569664001464844, + "learning_rate": 4.548615344678186e-06, + "loss": 0.5636, + "step": 2546 + }, + { + "epoch": 1.2042553191489362, + "grad_norm": 2.981734037399292, + "learning_rate": 4.5482577306047924e-06, + "loss": 0.4884, + "step": 2547 + }, + { + "epoch": 1.2047281323877068, + "grad_norm": 2.6760342121124268, + "learning_rate": 4.547899988995371e-06, + "loss": 0.5426, + "step": 2548 + }, + { + "epoch": 1.2052009456264776, + "grad_norm": 2.825805902481079, + "learning_rate": 4.547542119872198e-06, + "loss": 0.4989, + "step": 2549 + }, + { + "epoch": 1.2056737588652482, + "grad_norm": 2.856426954269409, + "learning_rate": 4.547184123257555e-06, + "loss": 0.5734, + "step": 2550 + }, + { + "epoch": 1.206146572104019, + "grad_norm": 2.555682420730591, + "learning_rate": 4.5468259991737334e-06, + "loss": 0.5299, + "step": 2551 + }, + { + "epoch": 1.2066193853427896, + "grad_norm": 2.6324024200439453, + "learning_rate": 4.546467747643032e-06, + "loss": 0.5906, + "step": 2552 + }, + { + "epoch": 1.2070921985815604, + "grad_norm": 3.4145350456237793, + "learning_rate": 4.546109368687757e-06, + "loss": 0.5153, + "step": 2553 + }, + { + "epoch": 1.207565011820331, + "grad_norm": 2.658691644668579, + "learning_rate": 4.545750862330225e-06, + "loss": 0.5759, + "step": 2554 + }, + { + "epoch": 1.2080378250591017, + "grad_norm": 3.162605047225952, + "learning_rate": 4.545392228592755e-06, + "loss": 0.5379, + "step": 2555 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 2.8631198406219482, + "learning_rate": 4.545033467497681e-06, + "loss": 0.5959, + "step": 2556 + }, + { + "epoch": 1.208983451536643, + "grad_norm": 2.457109212875366, + "learning_rate": 4.54467457906734e-06, + "loss": 0.4864, + "step": 2557 + }, + { + "epoch": 1.2094562647754137, + "grad_norm": 2.5307061672210693, + "learning_rate": 4.544315563324078e-06, + "loss": 0.5308, + "step": 2558 + }, + { + "epoch": 1.2099290780141845, + "grad_norm": 2.8482773303985596, + "learning_rate": 4.543956420290251e-06, + "loss": 0.5126, + "step": 2559 + }, + { + "epoch": 1.210401891252955, + "grad_norm": 2.4990832805633545, + "learning_rate": 4.5435971499882195e-06, + "loss": 0.4534, + "step": 2560 + }, + { + "epoch": 1.2108747044917259, + "grad_norm": 2.6292665004730225, + "learning_rate": 4.543237752440354e-06, + "loss": 0.4434, + "step": 2561 + }, + { + "epoch": 1.2113475177304964, + "grad_norm": 2.865983247756958, + "learning_rate": 4.542878227669033e-06, + "loss": 0.5667, + "step": 2562 + }, + { + "epoch": 1.2118203309692672, + "grad_norm": 2.745614528656006, + "learning_rate": 4.542518575696644e-06, + "loss": 0.4724, + "step": 2563 + }, + { + "epoch": 1.2122931442080378, + "grad_norm": 2.8562581539154053, + "learning_rate": 4.5421587965455785e-06, + "loss": 0.5405, + "step": 2564 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 2.6670095920562744, + "learning_rate": 4.5417988902382385e-06, + "loss": 0.5432, + "step": 2565 + }, + { + "epoch": 1.2132387706855792, + "grad_norm": 
2.9320743083953857, + "learning_rate": 4.541438856797036e-06, + "loss": 0.5862, + "step": 2566 + }, + { + "epoch": 1.21371158392435, + "grad_norm": 2.577505588531494, + "learning_rate": 4.541078696244386e-06, + "loss": 0.4742, + "step": 2567 + }, + { + "epoch": 1.2141843971631205, + "grad_norm": 3.4476120471954346, + "learning_rate": 4.540718408602717e-06, + "loss": 0.5903, + "step": 2568 + }, + { + "epoch": 1.2146572104018913, + "grad_norm": 2.816210985183716, + "learning_rate": 4.540357993894459e-06, + "loss": 0.5033, + "step": 2569 + }, + { + "epoch": 1.215130023640662, + "grad_norm": 3.0806639194488525, + "learning_rate": 4.539997452142058e-06, + "loss": 0.6064, + "step": 2570 + }, + { + "epoch": 1.2156028368794327, + "grad_norm": 2.563060760498047, + "learning_rate": 4.5396367833679586e-06, + "loss": 0.5597, + "step": 2571 + }, + { + "epoch": 1.2160756501182033, + "grad_norm": 3.1014397144317627, + "learning_rate": 4.5392759875946215e-06, + "loss": 0.54, + "step": 2572 + }, + { + "epoch": 1.216548463356974, + "grad_norm": 3.124190330505371, + "learning_rate": 4.53891506484451e-06, + "loss": 0.5122, + "step": 2573 + }, + { + "epoch": 1.2170212765957447, + "grad_norm": 2.6688716411590576, + "learning_rate": 4.538554015140097e-06, + "loss": 0.5615, + "step": 2574 + }, + { + "epoch": 1.2174940898345155, + "grad_norm": 2.775543689727783, + "learning_rate": 4.538192838503866e-06, + "loss": 0.496, + "step": 2575 + }, + { + "epoch": 1.217966903073286, + "grad_norm": 2.7877283096313477, + "learning_rate": 4.537831534958303e-06, + "loss": 0.4995, + "step": 2576 + }, + { + "epoch": 1.2184397163120568, + "grad_norm": 2.824810028076172, + "learning_rate": 4.537470104525906e-06, + "loss": 0.5481, + "step": 2577 + }, + { + "epoch": 1.2189125295508274, + "grad_norm": 2.801269292831421, + "learning_rate": 4.53710854722918e-06, + "loss": 0.5628, + "step": 2578 + }, + { + "epoch": 1.2193853427895982, + "grad_norm": 2.7780683040618896, + "learning_rate": 4.536746863090637e-06, + "loss": 0.4845, + "step": 2579 + }, + { + "epoch": 1.2198581560283688, + "grad_norm": 2.536010265350342, + "learning_rate": 4.536385052132798e-06, + "loss": 0.4771, + "step": 2580 + }, + { + "epoch": 1.2203309692671396, + "grad_norm": 2.768775701522827, + "learning_rate": 4.536023114378191e-06, + "loss": 0.5366, + "step": 2581 + }, + { + "epoch": 1.2208037825059102, + "grad_norm": 2.658125877380371, + "learning_rate": 4.535661049849352e-06, + "loss": 0.524, + "step": 2582 + }, + { + "epoch": 1.2212765957446807, + "grad_norm": 2.558696746826172, + "learning_rate": 4.535298858568825e-06, + "loss": 0.5482, + "step": 2583 + }, + { + "epoch": 1.2217494089834515, + "grad_norm": 2.5284535884857178, + "learning_rate": 4.534936540559164e-06, + "loss": 0.4454, + "step": 2584 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 7.617330074310303, + "learning_rate": 4.534574095842927e-06, + "loss": 0.5615, + "step": 2585 + }, + { + "epoch": 1.222695035460993, + "grad_norm": 2.9120311737060547, + "learning_rate": 4.534211524442682e-06, + "loss": 0.5624, + "step": 2586 + }, + { + "epoch": 1.2231678486997635, + "grad_norm": 2.5004289150238037, + "learning_rate": 4.533848826381005e-06, + "loss": 0.4743, + "step": 2587 + }, + { + "epoch": 1.2236406619385343, + "grad_norm": 2.8395533561706543, + "learning_rate": 4.53348600168048e-06, + "loss": 0.4457, + "step": 2588 + }, + { + "epoch": 1.224113475177305, + "grad_norm": 2.832211494445801, + "learning_rate": 4.533123050363699e-06, + "loss": 0.5559, + "step": 2589 + }, + { + "epoch": 
1.2245862884160756, + "grad_norm": 2.6318583488464355, + "learning_rate": 4.53275997245326e-06, + "loss": 0.5281, + "step": 2590 + }, + { + "epoch": 1.2250591016548462, + "grad_norm": 3.0509233474731445, + "learning_rate": 4.532396767971771e-06, + "loss": 0.6003, + "step": 2591 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 2.6863620281219482, + "learning_rate": 4.532033436941847e-06, + "loss": 0.5219, + "step": 2592 + }, + { + "epoch": 1.2260047281323878, + "grad_norm": 2.401463747024536, + "learning_rate": 4.5316699793861104e-06, + "loss": 0.5994, + "step": 2593 + }, + { + "epoch": 1.2264775413711584, + "grad_norm": 2.613517999649048, + "learning_rate": 4.531306395327194e-06, + "loss": 0.5785, + "step": 2594 + }, + { + "epoch": 1.226950354609929, + "grad_norm": 2.5016374588012695, + "learning_rate": 4.530942684787735e-06, + "loss": 0.5695, + "step": 2595 + }, + { + "epoch": 1.2274231678486998, + "grad_norm": 2.576464891433716, + "learning_rate": 4.53057884779038e-06, + "loss": 0.4427, + "step": 2596 + }, + { + "epoch": 1.2278959810874706, + "grad_norm": 2.5688700675964355, + "learning_rate": 4.530214884357785e-06, + "loss": 0.4966, + "step": 2597 + }, + { + "epoch": 1.2283687943262411, + "grad_norm": 3.179013729095459, + "learning_rate": 4.52985079451261e-06, + "loss": 0.5239, + "step": 2598 + }, + { + "epoch": 1.2288416075650117, + "grad_norm": 2.6015284061431885, + "learning_rate": 4.529486578277527e-06, + "loss": 0.5135, + "step": 2599 + }, + { + "epoch": 1.2293144208037825, + "grad_norm": 2.3029589653015137, + "learning_rate": 4.529122235675214e-06, + "loss": 0.4044, + "step": 2600 + }, + { + "epoch": 1.2297872340425533, + "grad_norm": 2.994093656539917, + "learning_rate": 4.528757766728357e-06, + "loss": 0.5419, + "step": 2601 + }, + { + "epoch": 1.2302600472813239, + "grad_norm": 2.6297390460968018, + "learning_rate": 4.52839317145965e-06, + "loss": 0.488, + "step": 2602 + }, + { + "epoch": 1.2307328605200945, + "grad_norm": 2.4814043045043945, + "learning_rate": 4.528028449891793e-06, + "loss": 0.4917, + "step": 2603 + }, + { + "epoch": 1.2312056737588652, + "grad_norm": 3.6052863597869873, + "learning_rate": 4.527663602047499e-06, + "loss": 0.5301, + "step": 2604 + }, + { + "epoch": 1.231678486997636, + "grad_norm": 2.6984751224517822, + "learning_rate": 4.5272986279494825e-06, + "loss": 0.5253, + "step": 2605 + }, + { + "epoch": 1.2321513002364066, + "grad_norm": 2.514000415802002, + "learning_rate": 4.526933527620469e-06, + "loss": 0.5661, + "step": 2606 + }, + { + "epoch": 1.2326241134751772, + "grad_norm": 2.890921115875244, + "learning_rate": 4.526568301083195e-06, + "loss": 0.5585, + "step": 2607 + }, + { + "epoch": 1.233096926713948, + "grad_norm": 2.6390011310577393, + "learning_rate": 4.526202948360397e-06, + "loss": 0.5168, + "step": 2608 + }, + { + "epoch": 1.2335697399527188, + "grad_norm": 2.7370636463165283, + "learning_rate": 4.5258374694748266e-06, + "loss": 0.5453, + "step": 2609 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 2.8203976154327393, + "learning_rate": 4.52547186444924e-06, + "loss": 0.5763, + "step": 2610 + }, + { + "epoch": 1.23451536643026, + "grad_norm": 2.7567849159240723, + "learning_rate": 4.5251061333064025e-06, + "loss": 0.5194, + "step": 2611 + }, + { + "epoch": 1.2349881796690307, + "grad_norm": 2.767519474029541, + "learning_rate": 4.524740276069085e-06, + "loss": 0.5355, + "step": 2612 + }, + { + "epoch": 1.2354609929078015, + "grad_norm": 3.072035312652588, + "learning_rate": 4.5243742927600695e-06, + "loss": 0.5391, + 
"step": 2613 + }, + { + "epoch": 1.2359338061465721, + "grad_norm": 2.5957462787628174, + "learning_rate": 4.524008183402143e-06, + "loss": 0.5645, + "step": 2614 + }, + { + "epoch": 1.2364066193853427, + "grad_norm": 2.774897575378418, + "learning_rate": 4.523641948018101e-06, + "loss": 0.5576, + "step": 2615 + }, + { + "epoch": 1.2368794326241135, + "grad_norm": 2.635887622833252, + "learning_rate": 4.5232755866307496e-06, + "loss": 0.5254, + "step": 2616 + }, + { + "epoch": 1.2373522458628843, + "grad_norm": 2.4860997200012207, + "learning_rate": 4.522909099262899e-06, + "loss": 0.4692, + "step": 2617 + }, + { + "epoch": 1.2378250591016549, + "grad_norm": 2.595513105392456, + "learning_rate": 4.522542485937369e-06, + "loss": 0.5166, + "step": 2618 + }, + { + "epoch": 1.2382978723404254, + "grad_norm": 2.961474895477295, + "learning_rate": 4.522175746676986e-06, + "loss": 0.5455, + "step": 2619 + }, + { + "epoch": 1.2387706855791962, + "grad_norm": 2.813889741897583, + "learning_rate": 4.521808881504588e-06, + "loss": 0.5249, + "step": 2620 + }, + { + "epoch": 1.239243498817967, + "grad_norm": 2.8434813022613525, + "learning_rate": 4.521441890443015e-06, + "loss": 0.472, + "step": 2621 + }, + { + "epoch": 1.2397163120567376, + "grad_norm": 2.4264845848083496, + "learning_rate": 4.521074773515119e-06, + "loss": 0.4783, + "step": 2622 + }, + { + "epoch": 1.2401891252955082, + "grad_norm": 2.615169048309326, + "learning_rate": 4.520707530743761e-06, + "loss": 0.5324, + "step": 2623 + }, + { + "epoch": 1.240661938534279, + "grad_norm": 2.6772537231445312, + "learning_rate": 4.520340162151803e-06, + "loss": 0.5224, + "step": 2624 + }, + { + "epoch": 1.2411347517730495, + "grad_norm": 2.683393955230713, + "learning_rate": 4.519972667762124e-06, + "loss": 0.4863, + "step": 2625 + }, + { + "epoch": 1.2416075650118203, + "grad_norm": 3.0335750579833984, + "learning_rate": 4.519605047597603e-06, + "loss": 0.544, + "step": 2626 + }, + { + "epoch": 1.242080378250591, + "grad_norm": 2.8694353103637695, + "learning_rate": 4.519237301681132e-06, + "loss": 0.5576, + "step": 2627 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 3.217808246612549, + "learning_rate": 4.518869430035609e-06, + "loss": 0.5459, + "step": 2628 + }, + { + "epoch": 1.2430260047281323, + "grad_norm": 2.7700083255767822, + "learning_rate": 4.518501432683937e-06, + "loss": 0.5579, + "step": 2629 + }, + { + "epoch": 1.243498817966903, + "grad_norm": 2.4759175777435303, + "learning_rate": 4.5181333096490335e-06, + "loss": 0.5049, + "step": 2630 + }, + { + "epoch": 1.2439716312056737, + "grad_norm": 2.8652584552764893, + "learning_rate": 4.517765060953818e-06, + "loss": 0.5366, + "step": 2631 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 2.776334524154663, + "learning_rate": 4.517396686621218e-06, + "loss": 0.5677, + "step": 2632 + }, + { + "epoch": 1.244917257683215, + "grad_norm": 2.676708221435547, + "learning_rate": 4.517028186674174e-06, + "loss": 0.5055, + "step": 2633 + }, + { + "epoch": 1.2453900709219858, + "grad_norm": 2.6851537227630615, + "learning_rate": 4.516659561135629e-06, + "loss": 0.5537, + "step": 2634 + }, + { + "epoch": 1.2458628841607564, + "grad_norm": 2.619971513748169, + "learning_rate": 4.516290810028536e-06, + "loss": 0.5765, + "step": 2635 + }, + { + "epoch": 1.2463356973995272, + "grad_norm": 2.7302334308624268, + "learning_rate": 4.515921933375855e-06, + "loss": 0.5611, + "step": 2636 + }, + { + "epoch": 1.2468085106382978, + "grad_norm": 2.5005829334259033, + "learning_rate": 
4.5155529312005554e-06, + "loss": 0.442, + "step": 2637 + }, + { + "epoch": 1.2472813238770686, + "grad_norm": 2.713587522506714, + "learning_rate": 4.515183803525612e-06, + "loss": 0.5023, + "step": 2638 + }, + { + "epoch": 1.2477541371158392, + "grad_norm": 2.5146236419677734, + "learning_rate": 4.514814550374009e-06, + "loss": 0.5195, + "step": 2639 + }, + { + "epoch": 1.24822695035461, + "grad_norm": 2.761060953140259, + "learning_rate": 4.51444517176874e-06, + "loss": 0.5138, + "step": 2640 + }, + { + "epoch": 1.2486997635933805, + "grad_norm": 3.082329273223877, + "learning_rate": 4.5140756677328026e-06, + "loss": 0.6105, + "step": 2641 + }, + { + "epoch": 1.2491725768321513, + "grad_norm": 2.6933493614196777, + "learning_rate": 4.513706038289205e-06, + "loss": 0.5185, + "step": 2642 + }, + { + "epoch": 1.249645390070922, + "grad_norm": 2.515856981277466, + "learning_rate": 4.513336283460962e-06, + "loss": 0.5375, + "step": 2643 + }, + { + "epoch": 1.2501182033096927, + "grad_norm": 2.8553731441497803, + "learning_rate": 4.512966403271096e-06, + "loss": 0.5582, + "step": 2644 + }, + { + "epoch": 1.2505910165484633, + "grad_norm": 2.640880823135376, + "learning_rate": 4.5125963977426405e-06, + "loss": 0.5125, + "step": 2645 + }, + { + "epoch": 1.251063829787234, + "grad_norm": 2.9845943450927734, + "learning_rate": 4.512226266898631e-06, + "loss": 0.4749, + "step": 2646 + }, + { + "epoch": 1.2515366430260046, + "grad_norm": 2.5131032466888428, + "learning_rate": 4.511856010762116e-06, + "loss": 0.4764, + "step": 2647 + }, + { + "epoch": 1.2520094562647754, + "grad_norm": 2.370638370513916, + "learning_rate": 4.511485629356148e-06, + "loss": 0.5153, + "step": 2648 + }, + { + "epoch": 1.252482269503546, + "grad_norm": 2.912461996078491, + "learning_rate": 4.511115122703791e-06, + "loss": 0.6117, + "step": 2649 + }, + { + "epoch": 1.2529550827423168, + "grad_norm": 2.7308082580566406, + "learning_rate": 4.510744490828113e-06, + "loss": 0.5076, + "step": 2650 + }, + { + "epoch": 1.2534278959810874, + "grad_norm": 2.8524296283721924, + "learning_rate": 4.510373733752193e-06, + "loss": 0.542, + "step": 2651 + }, + { + "epoch": 1.2539007092198582, + "grad_norm": 2.799377202987671, + "learning_rate": 4.5100028514991145e-06, + "loss": 0.486, + "step": 2652 + }, + { + "epoch": 1.2543735224586288, + "grad_norm": 2.7248027324676514, + "learning_rate": 4.509631844091973e-06, + "loss": 0.4972, + "step": 2653 + }, + { + "epoch": 1.2548463356973996, + "grad_norm": 2.8041458129882812, + "learning_rate": 4.5092607115538686e-06, + "loss": 0.588, + "step": 2654 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 2.679417133331299, + "learning_rate": 4.50888945390791e-06, + "loss": 0.4639, + "step": 2655 + }, + { + "epoch": 1.255791962174941, + "grad_norm": 3.1049270629882812, + "learning_rate": 4.508518071177214e-06, + "loss": 0.5857, + "step": 2656 + }, + { + "epoch": 1.2562647754137115, + "grad_norm": 2.8590362071990967, + "learning_rate": 4.508146563384904e-06, + "loss": 0.5451, + "step": 2657 + }, + { + "epoch": 1.2567375886524823, + "grad_norm": 2.9774081707000732, + "learning_rate": 4.507774930554114e-06, + "loss": 0.5493, + "step": 2658 + }, + { + "epoch": 1.2572104018912529, + "grad_norm": 2.617643356323242, + "learning_rate": 4.507403172707983e-06, + "loss": 0.5472, + "step": 2659 + }, + { + "epoch": 1.2576832151300237, + "grad_norm": 2.9195587635040283, + "learning_rate": 4.507031289869658e-06, + "loss": 0.5403, + "step": 2660 + }, + { + "epoch": 1.2581560283687943, + "grad_norm": 
2.706089496612549, + "learning_rate": 4.506659282062295e-06, + "loss": 0.4899, + "step": 2661 + }, + { + "epoch": 1.258628841607565, + "grad_norm": 2.8229358196258545, + "learning_rate": 4.506287149309057e-06, + "loss": 0.5336, + "step": 2662 + }, + { + "epoch": 1.2591016548463356, + "grad_norm": 2.5295674800872803, + "learning_rate": 4.505914891633117e-06, + "loss": 0.4806, + "step": 2663 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 3.098208427429199, + "learning_rate": 4.505542509057651e-06, + "loss": 0.6039, + "step": 2664 + }, + { + "epoch": 1.260047281323877, + "grad_norm": 2.5118041038513184, + "learning_rate": 4.5051700016058475e-06, + "loss": 0.5279, + "step": 2665 + }, + { + "epoch": 1.2605200945626478, + "grad_norm": 2.6901369094848633, + "learning_rate": 4.5047973693009005e-06, + "loss": 0.5515, + "step": 2666 + }, + { + "epoch": 1.2609929078014184, + "grad_norm": 2.5622377395629883, + "learning_rate": 4.504424612166012e-06, + "loss": 0.5405, + "step": 2667 + }, + { + "epoch": 1.2614657210401892, + "grad_norm": 2.685751438140869, + "learning_rate": 4.5040517302243915e-06, + "loss": 0.5797, + "step": 2668 + }, + { + "epoch": 1.2619385342789597, + "grad_norm": 2.8525350093841553, + "learning_rate": 4.503678723499259e-06, + "loss": 0.5561, + "step": 2669 + }, + { + "epoch": 1.2624113475177305, + "grad_norm": 2.803386926651001, + "learning_rate": 4.503305592013836e-06, + "loss": 0.5376, + "step": 2670 + }, + { + "epoch": 1.2628841607565011, + "grad_norm": 2.78633189201355, + "learning_rate": 4.502932335791359e-06, + "loss": 0.4739, + "step": 2671 + }, + { + "epoch": 1.263356973995272, + "grad_norm": 2.8337297439575195, + "learning_rate": 4.502558954855069e-06, + "loss": 0.5406, + "step": 2672 + }, + { + "epoch": 1.2638297872340425, + "grad_norm": 2.610275983810425, + "learning_rate": 4.502185449228213e-06, + "loss": 0.5343, + "step": 2673 + }, + { + "epoch": 1.2643026004728133, + "grad_norm": 2.7842252254486084, + "learning_rate": 4.501811818934048e-06, + "loss": 0.532, + "step": 2674 + }, + { + "epoch": 1.2647754137115839, + "grad_norm": 2.4472389221191406, + "learning_rate": 4.501438063995839e-06, + "loss": 0.4976, + "step": 2675 + }, + { + "epoch": 1.2652482269503547, + "grad_norm": 3.076580762863159, + "learning_rate": 4.501064184436858e-06, + "loss": 0.507, + "step": 2676 + }, + { + "epoch": 1.2657210401891252, + "grad_norm": 2.5952908992767334, + "learning_rate": 4.500690180280384e-06, + "loss": 0.5498, + "step": 2677 + }, + { + "epoch": 1.266193853427896, + "grad_norm": 2.476943016052246, + "learning_rate": 4.500316051549706e-06, + "loss": 0.557, + "step": 2678 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 2.730579376220703, + "learning_rate": 4.499941798268118e-06, + "loss": 0.4975, + "step": 2679 + }, + { + "epoch": 1.2671394799054374, + "grad_norm": 2.7916698455810547, + "learning_rate": 4.499567420458924e-06, + "loss": 0.5673, + "step": 2680 + }, + { + "epoch": 1.267612293144208, + "grad_norm": 2.4249091148376465, + "learning_rate": 4.4991929181454355e-06, + "loss": 0.4836, + "step": 2681 + }, + { + "epoch": 1.2680851063829788, + "grad_norm": 2.661911725997925, + "learning_rate": 4.498818291350969e-06, + "loss": 0.5332, + "step": 2682 + }, + { + "epoch": 1.2685579196217494, + "grad_norm": 2.693657875061035, + "learning_rate": 4.498443540098852e-06, + "loss": 0.5257, + "step": 2683 + }, + { + "epoch": 1.2690307328605201, + "grad_norm": 2.609386682510376, + "learning_rate": 4.4980686644124195e-06, + "loss": 0.4918, + "step": 2684 + }, + { + "epoch": 
1.2695035460992907, + "grad_norm": 3.2104930877685547, + "learning_rate": 4.4976936643150124e-06, + "loss": 0.6097, + "step": 2685 + }, + { + "epoch": 1.2699763593380615, + "grad_norm": 2.707860231399536, + "learning_rate": 4.49731853982998e-06, + "loss": 0.5109, + "step": 2686 + }, + { + "epoch": 1.270449172576832, + "grad_norm": 3.5046379566192627, + "learning_rate": 4.49694329098068e-06, + "loss": 0.5883, + "step": 2687 + }, + { + "epoch": 1.270921985815603, + "grad_norm": 2.5362324714660645, + "learning_rate": 4.496567917790477e-06, + "loss": 0.5301, + "step": 2688 + }, + { + "epoch": 1.2713947990543735, + "grad_norm": 2.7095518112182617, + "learning_rate": 4.496192420282746e-06, + "loss": 0.4772, + "step": 2689 + }, + { + "epoch": 1.2718676122931443, + "grad_norm": 2.416433095932007, + "learning_rate": 4.495816798480865e-06, + "loss": 0.5012, + "step": 2690 + }, + { + "epoch": 1.2723404255319148, + "grad_norm": 2.5362391471862793, + "learning_rate": 4.495441052408224e-06, + "loss": 0.5197, + "step": 2691 + }, + { + "epoch": 1.2728132387706856, + "grad_norm": 2.9093947410583496, + "learning_rate": 4.495065182088218e-06, + "loss": 0.4893, + "step": 2692 + }, + { + "epoch": 1.2732860520094562, + "grad_norm": 2.520470142364502, + "learning_rate": 4.494689187544251e-06, + "loss": 0.5072, + "step": 2693 + }, + { + "epoch": 1.273758865248227, + "grad_norm": 2.4385125637054443, + "learning_rate": 4.494313068799735e-06, + "loss": 0.4923, + "step": 2694 + }, + { + "epoch": 1.2742316784869976, + "grad_norm": 2.636852502822876, + "learning_rate": 4.493936825878089e-06, + "loss": 0.5409, + "step": 2695 + }, + { + "epoch": 1.2747044917257684, + "grad_norm": 2.7027053833007812, + "learning_rate": 4.493560458802741e-06, + "loss": 0.5906, + "step": 2696 + }, + { + "epoch": 1.275177304964539, + "grad_norm": 2.58752179145813, + "learning_rate": 4.493183967597123e-06, + "loss": 0.5292, + "step": 2697 + }, + { + "epoch": 1.2756501182033098, + "grad_norm": 2.7658379077911377, + "learning_rate": 4.49280735228468e-06, + "loss": 0.5613, + "step": 2698 + }, + { + "epoch": 1.2761229314420803, + "grad_norm": 3.272688388824463, + "learning_rate": 4.492430612888861e-06, + "loss": 0.5654, + "step": 2699 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 2.806819438934326, + "learning_rate": 4.492053749433125e-06, + "loss": 0.5388, + "step": 2700 + }, + { + "epoch": 1.2770685579196217, + "grad_norm": 2.879727602005005, + "learning_rate": 4.491676761940936e-06, + "loss": 0.5033, + "step": 2701 + }, + { + "epoch": 1.2775413711583925, + "grad_norm": 2.733347177505493, + "learning_rate": 4.4912996504357695e-06, + "loss": 0.5113, + "step": 2702 + }, + { + "epoch": 1.278014184397163, + "grad_norm": 2.7431252002716064, + "learning_rate": 4.490922414941104e-06, + "loss": 0.5417, + "step": 2703 + }, + { + "epoch": 1.2784869976359339, + "grad_norm": 2.9287240505218506, + "learning_rate": 4.490545055480431e-06, + "loss": 0.5875, + "step": 2704 + }, + { + "epoch": 1.2789598108747045, + "grad_norm": 2.576775550842285, + "learning_rate": 4.490167572077244e-06, + "loss": 0.5176, + "step": 2705 + }, + { + "epoch": 1.2794326241134752, + "grad_norm": 2.4335594177246094, + "learning_rate": 4.4897899647550505e-06, + "loss": 0.4749, + "step": 2706 + }, + { + "epoch": 1.2799054373522458, + "grad_norm": 2.6798062324523926, + "learning_rate": 4.489412233537361e-06, + "loss": 0.5439, + "step": 2707 + }, + { + "epoch": 1.2803782505910166, + "grad_norm": 2.8440675735473633, + "learning_rate": 4.489034378447693e-06, + "loss": 0.552, + 
"step": 2708 + }, + { + "epoch": 1.2808510638297872, + "grad_norm": 2.9059503078460693, + "learning_rate": 4.488656399509577e-06, + "loss": 0.5667, + "step": 2709 + }, + { + "epoch": 1.281323877068558, + "grad_norm": 2.7415006160736084, + "learning_rate": 4.488278296746548e-06, + "loss": 0.5676, + "step": 2710 + }, + { + "epoch": 1.2817966903073286, + "grad_norm": 2.4584875106811523, + "learning_rate": 4.487900070182147e-06, + "loss": 0.4787, + "step": 2711 + }, + { + "epoch": 1.2822695035460994, + "grad_norm": 2.990940809249878, + "learning_rate": 4.487521719839924e-06, + "loss": 0.5239, + "step": 2712 + }, + { + "epoch": 1.28274231678487, + "grad_norm": 3.075201988220215, + "learning_rate": 4.487143245743441e-06, + "loss": 0.5103, + "step": 2713 + }, + { + "epoch": 1.2832151300236407, + "grad_norm": 2.543341875076294, + "learning_rate": 4.486764647916259e-06, + "loss": 0.5475, + "step": 2714 + }, + { + "epoch": 1.2836879432624113, + "grad_norm": 2.9927213191986084, + "learning_rate": 4.486385926381957e-06, + "loss": 0.4923, + "step": 2715 + }, + { + "epoch": 1.284160756501182, + "grad_norm": 2.4220657348632812, + "learning_rate": 4.486007081164111e-06, + "loss": 0.543, + "step": 2716 + }, + { + "epoch": 1.2846335697399527, + "grad_norm": 2.468214988708496, + "learning_rate": 4.4856281122863134e-06, + "loss": 0.5248, + "step": 2717 + }, + { + "epoch": 1.2851063829787235, + "grad_norm": 2.633711099624634, + "learning_rate": 4.48524901977216e-06, + "loss": 0.4764, + "step": 2718 + }, + { + "epoch": 1.285579196217494, + "grad_norm": 2.8399546146392822, + "learning_rate": 4.484869803645254e-06, + "loss": 0.5503, + "step": 2719 + }, + { + "epoch": 1.2860520094562649, + "grad_norm": 2.769063949584961, + "learning_rate": 4.484490463929209e-06, + "loss": 0.5468, + "step": 2720 + }, + { + "epoch": 1.2865248226950354, + "grad_norm": 2.617863893508911, + "learning_rate": 4.4841110006476465e-06, + "loss": 0.5906, + "step": 2721 + }, + { + "epoch": 1.2869976359338062, + "grad_norm": 2.7639541625976562, + "learning_rate": 4.4837314138241905e-06, + "loss": 0.552, + "step": 2722 + }, + { + "epoch": 1.2874704491725768, + "grad_norm": 2.7711129188537598, + "learning_rate": 4.483351703482478e-06, + "loss": 0.5229, + "step": 2723 + }, + { + "epoch": 1.2879432624113476, + "grad_norm": 2.611205577850342, + "learning_rate": 4.482971869646152e-06, + "loss": 0.5055, + "step": 2724 + }, + { + "epoch": 1.2884160756501182, + "grad_norm": 2.8602211475372314, + "learning_rate": 4.482591912338862e-06, + "loss": 0.5561, + "step": 2725 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 2.5882298946380615, + "learning_rate": 4.4822118315842675e-06, + "loss": 0.5555, + "step": 2726 + }, + { + "epoch": 1.2893617021276595, + "grad_norm": 2.7533531188964844, + "learning_rate": 4.481831627406033e-06, + "loss": 0.5346, + "step": 2727 + }, + { + "epoch": 1.2898345153664303, + "grad_norm": 2.4296958446502686, + "learning_rate": 4.481451299827835e-06, + "loss": 0.4915, + "step": 2728 + }, + { + "epoch": 1.290307328605201, + "grad_norm": 2.4403445720672607, + "learning_rate": 4.481070848873352e-06, + "loss": 0.5648, + "step": 2729 + }, + { + "epoch": 1.2907801418439715, + "grad_norm": 2.473224401473999, + "learning_rate": 4.480690274566274e-06, + "loss": 0.4849, + "step": 2730 + }, + { + "epoch": 1.2912529550827423, + "grad_norm": 2.637899875640869, + "learning_rate": 4.480309576930297e-06, + "loss": 0.4968, + "step": 2731 + }, + { + "epoch": 1.291725768321513, + "grad_norm": 2.7156927585601807, + "learning_rate": 
4.479928755989127e-06, + "loss": 0.4759, + "step": 2732 + }, + { + "epoch": 1.2921985815602837, + "grad_norm": 2.632786989212036, + "learning_rate": 4.479547811766475e-06, + "loss": 0.5468, + "step": 2733 + }, + { + "epoch": 1.2926713947990542, + "grad_norm": 2.529218912124634, + "learning_rate": 4.479166744286061e-06, + "loss": 0.4852, + "step": 2734 + }, + { + "epoch": 1.293144208037825, + "grad_norm": 2.561978340148926, + "learning_rate": 4.4787855535716115e-06, + "loss": 0.546, + "step": 2735 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 2.3684909343719482, + "learning_rate": 4.478404239646862e-06, + "loss": 0.5369, + "step": 2736 + }, + { + "epoch": 1.2940898345153664, + "grad_norm": 2.8940367698669434, + "learning_rate": 4.4780228025355566e-06, + "loss": 0.568, + "step": 2737 + }, + { + "epoch": 1.294562647754137, + "grad_norm": 2.6950316429138184, + "learning_rate": 4.477641242261445e-06, + "loss": 0.4576, + "step": 2738 + }, + { + "epoch": 1.2950354609929078, + "grad_norm": 2.4211716651916504, + "learning_rate": 4.4772595588482835e-06, + "loss": 0.4341, + "step": 2739 + }, + { + "epoch": 1.2955082742316786, + "grad_norm": 3.141097068786621, + "learning_rate": 4.47687775231984e-06, + "loss": 0.5944, + "step": 2740 + }, + { + "epoch": 1.2959810874704492, + "grad_norm": 3.077522039413452, + "learning_rate": 4.476495822699887e-06, + "loss": 0.5786, + "step": 2741 + }, + { + "epoch": 1.2964539007092197, + "grad_norm": 2.708139419555664, + "learning_rate": 4.476113770012206e-06, + "loss": 0.5014, + "step": 2742 + }, + { + "epoch": 1.2969267139479905, + "grad_norm": 2.7572035789489746, + "learning_rate": 4.475731594280586e-06, + "loss": 0.594, + "step": 2743 + }, + { + "epoch": 1.2973995271867613, + "grad_norm": 2.673126459121704, + "learning_rate": 4.475349295528822e-06, + "loss": 0.5317, + "step": 2744 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 2.6757819652557373, + "learning_rate": 4.4749668737807195e-06, + "loss": 0.5614, + "step": 2745 + }, + { + "epoch": 1.2983451536643025, + "grad_norm": 2.7077620029449463, + "learning_rate": 4.47458432906009e-06, + "loss": 0.4916, + "step": 2746 + }, + { + "epoch": 1.2988179669030733, + "grad_norm": 2.446570873260498, + "learning_rate": 4.474201661390752e-06, + "loss": 0.5005, + "step": 2747 + }, + { + "epoch": 1.299290780141844, + "grad_norm": 2.642695665359497, + "learning_rate": 4.473818870796533e-06, + "loss": 0.5048, + "step": 2748 + }, + { + "epoch": 1.2997635933806146, + "grad_norm": 2.519824743270874, + "learning_rate": 4.4734359573012686e-06, + "loss": 0.5131, + "step": 2749 + }, + { + "epoch": 1.3002364066193852, + "grad_norm": 2.5901925563812256, + "learning_rate": 4.4730529209287995e-06, + "loss": 0.4582, + "step": 2750 + }, + { + "epoch": 1.300709219858156, + "grad_norm": 2.6789121627807617, + "learning_rate": 4.472669761702978e-06, + "loss": 0.5685, + "step": 2751 + }, + { + "epoch": 1.3011820330969268, + "grad_norm": 2.408003807067871, + "learning_rate": 4.472286479647659e-06, + "loss": 0.4329, + "step": 2752 + }, + { + "epoch": 1.3016548463356974, + "grad_norm": 2.681403398513794, + "learning_rate": 4.47190307478671e-06, + "loss": 0.4853, + "step": 2753 + }, + { + "epoch": 1.302127659574468, + "grad_norm": 2.9923183917999268, + "learning_rate": 4.4715195471440025e-06, + "loss": 0.5184, + "step": 2754 + }, + { + "epoch": 1.3026004728132388, + "grad_norm": 2.5100321769714355, + "learning_rate": 4.471135896743418e-06, + "loss": 0.5148, + "step": 2755 + }, + { + "epoch": 1.3030732860520096, + "grad_norm": 
2.267881393432617, + "learning_rate": 4.4707521236088444e-06, + "loss": 0.5028, + "step": 2756 + }, + { + "epoch": 1.3035460992907801, + "grad_norm": 2.7779829502105713, + "learning_rate": 4.4703682277641775e-06, + "loss": 0.5724, + "step": 2757 + }, + { + "epoch": 1.3040189125295507, + "grad_norm": 2.4262194633483887, + "learning_rate": 4.4699842092333205e-06, + "loss": 0.5341, + "step": 2758 + }, + { + "epoch": 1.3044917257683215, + "grad_norm": 2.8682050704956055, + "learning_rate": 4.469600068040185e-06, + "loss": 0.6114, + "step": 2759 + }, + { + "epoch": 1.3049645390070923, + "grad_norm": 2.647853374481201, + "learning_rate": 4.46921580420869e-06, + "loss": 0.5107, + "step": 2760 + }, + { + "epoch": 1.3054373522458629, + "grad_norm": 2.561998128890991, + "learning_rate": 4.468831417762762e-06, + "loss": 0.6019, + "step": 2761 + }, + { + "epoch": 1.3059101654846335, + "grad_norm": 2.763425350189209, + "learning_rate": 4.468446908726334e-06, + "loss": 0.572, + "step": 2762 + }, + { + "epoch": 1.3063829787234043, + "grad_norm": 2.7052934169769287, + "learning_rate": 4.468062277123348e-06, + "loss": 0.4876, + "step": 2763 + }, + { + "epoch": 1.306855791962175, + "grad_norm": 2.997845411300659, + "learning_rate": 4.467677522977755e-06, + "loss": 0.5683, + "step": 2764 + }, + { + "epoch": 1.3073286052009456, + "grad_norm": 2.503129005432129, + "learning_rate": 4.46729264631351e-06, + "loss": 0.4951, + "step": 2765 + }, + { + "epoch": 1.3078014184397162, + "grad_norm": 2.617492437362671, + "learning_rate": 4.466907647154578e-06, + "loss": 0.5054, + "step": 2766 + }, + { + "epoch": 1.308274231678487, + "grad_norm": 2.934967279434204, + "learning_rate": 4.4665225255249315e-06, + "loss": 0.5299, + "step": 2767 + }, + { + "epoch": 1.3087470449172578, + "grad_norm": 2.787252187728882, + "learning_rate": 4.46613728144855e-06, + "loss": 0.4652, + "step": 2768 + }, + { + "epoch": 1.3092198581560284, + "grad_norm": 2.567439556121826, + "learning_rate": 4.465751914949422e-06, + "loss": 0.538, + "step": 2769 + }, + { + "epoch": 1.309692671394799, + "grad_norm": 2.6386024951934814, + "learning_rate": 4.4653664260515416e-06, + "loss": 0.464, + "step": 2770 + }, + { + "epoch": 1.3101654846335697, + "grad_norm": 2.966848134994507, + "learning_rate": 4.464980814778912e-06, + "loss": 0.4889, + "step": 2771 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 2.571256637573242, + "learning_rate": 4.464595081155542e-06, + "loss": 0.4979, + "step": 2772 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 2.774203062057495, + "learning_rate": 4.4642092252054515e-06, + "loss": 0.5366, + "step": 2773 + }, + { + "epoch": 1.3115839243498817, + "grad_norm": 2.682969331741333, + "learning_rate": 4.463823246952666e-06, + "loss": 0.5118, + "step": 2774 + }, + { + "epoch": 1.3120567375886525, + "grad_norm": 2.4873905181884766, + "learning_rate": 4.463437146421217e-06, + "loss": 0.5548, + "step": 2775 + }, + { + "epoch": 1.3125295508274233, + "grad_norm": 2.6769661903381348, + "learning_rate": 4.463050923635147e-06, + "loss": 0.5023, + "step": 2776 + }, + { + "epoch": 1.3130023640661939, + "grad_norm": 2.7190892696380615, + "learning_rate": 4.462664578618503e-06, + "loss": 0.5546, + "step": 2777 + }, + { + "epoch": 1.3134751773049644, + "grad_norm": 2.8193624019622803, + "learning_rate": 4.462278111395343e-06, + "loss": 0.5265, + "step": 2778 + }, + { + "epoch": 1.3139479905437352, + "grad_norm": 2.7324538230895996, + "learning_rate": 4.461891521989728e-06, + "loss": 0.5449, + "step": 2779 + }, + { + "epoch": 
1.314420803782506, + "grad_norm": 2.87320876121521, + "learning_rate": 4.4615048104257305e-06, + "loss": 0.5367, + "step": 2780 + }, + { + "epoch": 1.3148936170212766, + "grad_norm": 2.6777031421661377, + "learning_rate": 4.4611179767274306e-06, + "loss": 0.5026, + "step": 2781 + }, + { + "epoch": 1.3153664302600472, + "grad_norm": 3.714524269104004, + "learning_rate": 4.460731020918913e-06, + "loss": 0.569, + "step": 2782 + }, + { + "epoch": 1.315839243498818, + "grad_norm": 2.7493600845336914, + "learning_rate": 4.460343943024273e-06, + "loss": 0.5826, + "step": 2783 + }, + { + "epoch": 1.3163120567375888, + "grad_norm": 2.6544079780578613, + "learning_rate": 4.459956743067609e-06, + "loss": 0.5399, + "step": 2784 + }, + { + "epoch": 1.3167848699763594, + "grad_norm": 2.4338037967681885, + "learning_rate": 4.459569421073036e-06, + "loss": 0.5186, + "step": 2785 + }, + { + "epoch": 1.31725768321513, + "grad_norm": 2.9312374591827393, + "learning_rate": 4.459181977064665e-06, + "loss": 0.5571, + "step": 2786 + }, + { + "epoch": 1.3177304964539007, + "grad_norm": 2.5988922119140625, + "learning_rate": 4.458794411066624e-06, + "loss": 0.5926, + "step": 2787 + }, + { + "epoch": 1.3182033096926715, + "grad_norm": 2.5193772315979004, + "learning_rate": 4.458406723103044e-06, + "loss": 0.5243, + "step": 2788 + }, + { + "epoch": 1.318676122931442, + "grad_norm": 2.8653743267059326, + "learning_rate": 4.458018913198066e-06, + "loss": 0.5421, + "step": 2789 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 2.486245632171631, + "learning_rate": 4.457630981375834e-06, + "loss": 0.4862, + "step": 2790 + }, + { + "epoch": 1.3196217494089835, + "grad_norm": 3.155435800552368, + "learning_rate": 4.457242927660506e-06, + "loss": 0.5386, + "step": 2791 + }, + { + "epoch": 1.3200945626477543, + "grad_norm": 3.102023124694824, + "learning_rate": 4.456854752076242e-06, + "loss": 0.5527, + "step": 2792 + }, + { + "epoch": 1.3205673758865248, + "grad_norm": 2.7995986938476562, + "learning_rate": 4.456466454647215e-06, + "loss": 0.4364, + "step": 2793 + }, + { + "epoch": 1.3210401891252954, + "grad_norm": 2.8328311443328857, + "learning_rate": 4.456078035397599e-06, + "loss": 0.5516, + "step": 2794 + }, + { + "epoch": 1.3215130023640662, + "grad_norm": 2.606161594390869, + "learning_rate": 4.455689494351581e-06, + "loss": 0.5042, + "step": 2795 + }, + { + "epoch": 1.321985815602837, + "grad_norm": 2.6344757080078125, + "learning_rate": 4.455300831533354e-06, + "loss": 0.4807, + "step": 2796 + }, + { + "epoch": 1.3224586288416076, + "grad_norm": 2.8539786338806152, + "learning_rate": 4.454912046967118e-06, + "loss": 0.4694, + "step": 2797 + }, + { + "epoch": 1.3229314420803782, + "grad_norm": 2.849066734313965, + "learning_rate": 4.454523140677081e-06, + "loss": 0.5037, + "step": 2798 + }, + { + "epoch": 1.323404255319149, + "grad_norm": 2.6803371906280518, + "learning_rate": 4.454134112687458e-06, + "loss": 0.4959, + "step": 2799 + }, + { + "epoch": 1.3238770685579198, + "grad_norm": 3.0546066761016846, + "learning_rate": 4.453744963022473e-06, + "loss": 0.5935, + "step": 2800 + }, + { + "epoch": 1.3243498817966903, + "grad_norm": 2.625602960586548, + "learning_rate": 4.453355691706356e-06, + "loss": 0.5349, + "step": 2801 + }, + { + "epoch": 1.324822695035461, + "grad_norm": 2.7568554878234863, + "learning_rate": 4.452966298763345e-06, + "loss": 0.5012, + "step": 2802 + }, + { + "epoch": 1.3252955082742317, + "grad_norm": 2.940427303314209, + "learning_rate": 4.452576784217686e-06, + "loss": 0.5246, + 
"step": 2803 + }, + { + "epoch": 1.3257683215130025, + "grad_norm": 2.5485289096832275, + "learning_rate": 4.452187148093633e-06, + "loss": 0.5282, + "step": 2804 + }, + { + "epoch": 1.326241134751773, + "grad_norm": 2.8152987957000732, + "learning_rate": 4.4517973904154455e-06, + "loss": 0.5468, + "step": 2805 + }, + { + "epoch": 1.3267139479905437, + "grad_norm": 2.9399688243865967, + "learning_rate": 4.451407511207393e-06, + "loss": 0.5586, + "step": 2806 + }, + { + "epoch": 1.3271867612293144, + "grad_norm": 2.3870036602020264, + "learning_rate": 4.451017510493751e-06, + "loss": 0.4807, + "step": 2807 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 3.4667887687683105, + "learning_rate": 4.450627388298805e-06, + "loss": 0.5571, + "step": 2808 + }, + { + "epoch": 1.3281323877068558, + "grad_norm": 2.685986042022705, + "learning_rate": 4.450237144646844e-06, + "loss": 0.5525, + "step": 2809 + }, + { + "epoch": 1.3286052009456264, + "grad_norm": 2.8529131412506104, + "learning_rate": 4.449846779562168e-06, + "loss": 0.491, + "step": 2810 + }, + { + "epoch": 1.3290780141843972, + "grad_norm": 2.7360332012176514, + "learning_rate": 4.449456293069082e-06, + "loss": 0.5574, + "step": 2811 + }, + { + "epoch": 1.3295508274231678, + "grad_norm": 2.4656026363372803, + "learning_rate": 4.4490656851919015e-06, + "loss": 0.4678, + "step": 2812 + }, + { + "epoch": 1.3300236406619386, + "grad_norm": 2.602651357650757, + "learning_rate": 4.448674955954947e-06, + "loss": 0.5118, + "step": 2813 + }, + { + "epoch": 1.3304964539007091, + "grad_norm": 3.0129756927490234, + "learning_rate": 4.448284105382548e-06, + "loss": 0.6136, + "step": 2814 + }, + { + "epoch": 1.33096926713948, + "grad_norm": 2.8499927520751953, + "learning_rate": 4.447893133499039e-06, + "loss": 0.5286, + "step": 2815 + }, + { + "epoch": 1.3314420803782505, + "grad_norm": 2.8320744037628174, + "learning_rate": 4.447502040328767e-06, + "loss": 0.5186, + "step": 2816 + }, + { + "epoch": 1.3319148936170213, + "grad_norm": 2.499950885772705, + "learning_rate": 4.447110825896084e-06, + "loss": 0.5338, + "step": 2817 + }, + { + "epoch": 1.3323877068557919, + "grad_norm": 2.530895233154297, + "learning_rate": 4.446719490225346e-06, + "loss": 0.5151, + "step": 2818 + }, + { + "epoch": 1.3328605200945627, + "grad_norm": 2.5276098251342773, + "learning_rate": 4.446328033340921e-06, + "loss": 0.5424, + "step": 2819 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.90218186378479, + "learning_rate": 4.4459364552671845e-06, + "loss": 0.5747, + "step": 2820 + }, + { + "epoch": 1.333806146572104, + "grad_norm": 2.500943183898926, + "learning_rate": 4.445544756028518e-06, + "loss": 0.5459, + "step": 2821 + }, + { + "epoch": 1.3342789598108746, + "grad_norm": 2.960374355316162, + "learning_rate": 4.44515293564931e-06, + "loss": 0.6092, + "step": 2822 + }, + { + "epoch": 1.3347517730496454, + "grad_norm": 2.813671827316284, + "learning_rate": 4.444760994153958e-06, + "loss": 0.5536, + "step": 2823 + }, + { + "epoch": 1.335224586288416, + "grad_norm": 2.7147483825683594, + "learning_rate": 4.444368931566867e-06, + "loss": 0.5291, + "step": 2824 + }, + { + "epoch": 1.3356973995271868, + "grad_norm": 2.710101842880249, + "learning_rate": 4.443976747912447e-06, + "loss": 0.5138, + "step": 2825 + }, + { + "epoch": 1.3361702127659574, + "grad_norm": 2.711419105529785, + "learning_rate": 4.443584443215121e-06, + "loss": 0.5223, + "step": 2826 + }, + { + "epoch": 1.3366430260047282, + "grad_norm": 2.887472152709961, + "learning_rate": 
4.443192017499313e-06, + "loss": 0.5464, + "step": 2827 + }, + { + "epoch": 1.3371158392434987, + "grad_norm": 2.8867223262786865, + "learning_rate": 4.4427994707894585e-06, + "loss": 0.5748, + "step": 2828 + }, + { + "epoch": 1.3375886524822695, + "grad_norm": 2.407247543334961, + "learning_rate": 4.44240680311e-06, + "loss": 0.4727, + "step": 2829 + }, + { + "epoch": 1.3380614657210401, + "grad_norm": 2.578420877456665, + "learning_rate": 4.4420140144853865e-06, + "loss": 0.5129, + "step": 2830 + }, + { + "epoch": 1.338534278959811, + "grad_norm": 2.884373426437378, + "learning_rate": 4.441621104940077e-06, + "loss": 0.5366, + "step": 2831 + }, + { + "epoch": 1.3390070921985815, + "grad_norm": 2.8652374744415283, + "learning_rate": 4.441228074498534e-06, + "loss": 0.5045, + "step": 2832 + }, + { + "epoch": 1.3394799054373523, + "grad_norm": 2.5380210876464844, + "learning_rate": 4.440834923185231e-06, + "loss": 0.509, + "step": 2833 + }, + { + "epoch": 1.3399527186761229, + "grad_norm": 2.415734052658081, + "learning_rate": 4.440441651024648e-06, + "loss": 0.5066, + "step": 2834 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 2.503051996231079, + "learning_rate": 4.440048258041272e-06, + "loss": 0.5118, + "step": 2835 + }, + { + "epoch": 1.3408983451536642, + "grad_norm": 3.351001024246216, + "learning_rate": 4.439654744259598e-06, + "loss": 0.5758, + "step": 2836 + }, + { + "epoch": 1.341371158392435, + "grad_norm": 2.7368781566619873, + "learning_rate": 4.439261109704129e-06, + "loss": 0.5674, + "step": 2837 + }, + { + "epoch": 1.3418439716312056, + "grad_norm": 3.008199453353882, + "learning_rate": 4.438867354399372e-06, + "loss": 0.5891, + "step": 2838 + }, + { + "epoch": 1.3423167848699764, + "grad_norm": 2.538907766342163, + "learning_rate": 4.438473478369847e-06, + "loss": 0.5102, + "step": 2839 + }, + { + "epoch": 1.342789598108747, + "grad_norm": 2.7169063091278076, + "learning_rate": 4.438079481640079e-06, + "loss": 0.6131, + "step": 2840 + }, + { + "epoch": 1.3432624113475178, + "grad_norm": 2.7411608695983887, + "learning_rate": 4.437685364234601e-06, + "loss": 0.5337, + "step": 2841 + }, + { + "epoch": 1.3437352245862884, + "grad_norm": 3.2374939918518066, + "learning_rate": 4.43729112617795e-06, + "loss": 0.5401, + "step": 2842 + }, + { + "epoch": 1.3442080378250592, + "grad_norm": 2.4712226390838623, + "learning_rate": 4.436896767494676e-06, + "loss": 0.5365, + "step": 2843 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 2.661619186401367, + "learning_rate": 4.436502288209334e-06, + "loss": 0.4919, + "step": 2844 + }, + { + "epoch": 1.3451536643026005, + "grad_norm": 2.5943779945373535, + "learning_rate": 4.4361076883464845e-06, + "loss": 0.5253, + "step": 2845 + }, + { + "epoch": 1.345626477541371, + "grad_norm": 2.672297477722168, + "learning_rate": 4.4357129679307e-06, + "loss": 0.541, + "step": 2846 + }, + { + "epoch": 1.346099290780142, + "grad_norm": 2.6830925941467285, + "learning_rate": 4.435318126986557e-06, + "loss": 0.5641, + "step": 2847 + }, + { + "epoch": 1.3465721040189125, + "grad_norm": 2.7394626140594482, + "learning_rate": 4.434923165538639e-06, + "loss": 0.5591, + "step": 2848 + }, + { + "epoch": 1.3470449172576833, + "grad_norm": 2.9656317234039307, + "learning_rate": 4.434528083611541e-06, + "loss": 0.515, + "step": 2849 + }, + { + "epoch": 1.3475177304964538, + "grad_norm": 3.30155086517334, + "learning_rate": 4.434132881229861e-06, + "loss": 0.5871, + "step": 2850 + }, + { + "epoch": 1.3479905437352246, + "grad_norm": 
2.6222476959228516, + "learning_rate": 4.433737558418209e-06, + "loss": 0.5143, + "step": 2851 + }, + { + "epoch": 1.3484633569739952, + "grad_norm": 2.903158187866211, + "learning_rate": 4.4333421152011965e-06, + "loss": 0.4484, + "step": 2852 + }, + { + "epoch": 1.348936170212766, + "grad_norm": 2.863116979598999, + "learning_rate": 4.432946551603449e-06, + "loss": 0.5213, + "step": 2853 + }, + { + "epoch": 1.3494089834515366, + "grad_norm": 2.8253962993621826, + "learning_rate": 4.432550867649596e-06, + "loss": 0.5713, + "step": 2854 + }, + { + "epoch": 1.3498817966903074, + "grad_norm": 2.652493953704834, + "learning_rate": 4.432155063364273e-06, + "loss": 0.5559, + "step": 2855 + }, + { + "epoch": 1.350354609929078, + "grad_norm": 2.4289376735687256, + "learning_rate": 4.431759138772127e-06, + "loss": 0.5122, + "step": 2856 + }, + { + "epoch": 1.3508274231678488, + "grad_norm": 2.6329853534698486, + "learning_rate": 4.43136309389781e-06, + "loss": 0.5332, + "step": 2857 + }, + { + "epoch": 1.3513002364066193, + "grad_norm": 2.431103229522705, + "learning_rate": 4.430966928765982e-06, + "loss": 0.4863, + "step": 2858 + }, + { + "epoch": 1.3517730496453901, + "grad_norm": 2.7529025077819824, + "learning_rate": 4.4305706434013106e-06, + "loss": 0.5263, + "step": 2859 + }, + { + "epoch": 1.3522458628841607, + "grad_norm": 2.884605646133423, + "learning_rate": 4.43017423782847e-06, + "loss": 0.564, + "step": 2860 + }, + { + "epoch": 1.3527186761229315, + "grad_norm": 3.027771234512329, + "learning_rate": 4.4297777120721435e-06, + "loss": 0.5846, + "step": 2861 + }, + { + "epoch": 1.353191489361702, + "grad_norm": 3.0140626430511475, + "learning_rate": 4.4293810661570205e-06, + "loss": 0.6621, + "step": 2862 + }, + { + "epoch": 1.3536643026004729, + "grad_norm": 2.721799612045288, + "learning_rate": 4.428984300107799e-06, + "loss": 0.5566, + "step": 2863 + }, + { + "epoch": 1.3541371158392435, + "grad_norm": 3.0016496181488037, + "learning_rate": 4.428587413949183e-06, + "loss": 0.5525, + "step": 2864 + }, + { + "epoch": 1.3546099290780143, + "grad_norm": 2.77138614654541, + "learning_rate": 4.428190407705886e-06, + "loss": 0.6016, + "step": 2865 + }, + { + "epoch": 1.3550827423167848, + "grad_norm": 2.9783477783203125, + "learning_rate": 4.427793281402627e-06, + "loss": 0.5556, + "step": 2866 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 2.2490382194519043, + "learning_rate": 4.427396035064132e-06, + "loss": 0.5138, + "step": 2867 + }, + { + "epoch": 1.3560283687943262, + "grad_norm": 2.442225217819214, + "learning_rate": 4.426998668715139e-06, + "loss": 0.4843, + "step": 2868 + }, + { + "epoch": 1.356501182033097, + "grad_norm": 2.74040150642395, + "learning_rate": 4.426601182380388e-06, + "loss": 0.54, + "step": 2869 + }, + { + "epoch": 1.3569739952718676, + "grad_norm": 2.4434332847595215, + "learning_rate": 4.426203576084629e-06, + "loss": 0.5199, + "step": 2870 + }, + { + "epoch": 1.3574468085106384, + "grad_norm": 2.6380388736724854, + "learning_rate": 4.42580584985262e-06, + "loss": 0.5049, + "step": 2871 + }, + { + "epoch": 1.357919621749409, + "grad_norm": 2.7324254512786865, + "learning_rate": 4.425408003709125e-06, + "loss": 0.5036, + "step": 2872 + }, + { + "epoch": 1.3583924349881797, + "grad_norm": 2.661012649536133, + "learning_rate": 4.425010037678916e-06, + "loss": 0.4965, + "step": 2873 + }, + { + "epoch": 1.3588652482269503, + "grad_norm": 2.5380208492279053, + "learning_rate": 4.424611951786773e-06, + "loss": 0.4293, + "step": 2874 + }, + { + "epoch": 
1.3593380614657211, + "grad_norm": 2.6060714721679688, + "learning_rate": 4.424213746057483e-06, + "loss": 0.5335, + "step": 2875 + }, + { + "epoch": 1.3598108747044917, + "grad_norm": 2.98282527923584, + "learning_rate": 4.423815420515841e-06, + "loss": 0.5626, + "step": 2876 + }, + { + "epoch": 1.3602836879432625, + "grad_norm": 2.779371500015259, + "learning_rate": 4.423416975186647e-06, + "loss": 0.5353, + "step": 2877 + }, + { + "epoch": 1.360756501182033, + "grad_norm": 2.8033530712127686, + "learning_rate": 4.423018410094713e-06, + "loss": 0.538, + "step": 2878 + }, + { + "epoch": 1.3612293144208039, + "grad_norm": 3.225177764892578, + "learning_rate": 4.422619725264855e-06, + "loss": 0.5441, + "step": 2879 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 2.959135055541992, + "learning_rate": 4.422220920721896e-06, + "loss": 0.5293, + "step": 2880 + }, + { + "epoch": 1.3621749408983452, + "grad_norm": 2.5558884143829346, + "learning_rate": 4.4218219964906704e-06, + "loss": 0.442, + "step": 2881 + }, + { + "epoch": 1.3626477541371158, + "grad_norm": 2.694899797439575, + "learning_rate": 4.421422952596015e-06, + "loss": 0.5318, + "step": 2882 + }, + { + "epoch": 1.3631205673758866, + "grad_norm": 2.7909531593322754, + "learning_rate": 4.421023789062777e-06, + "loss": 0.6648, + "step": 2883 + }, + { + "epoch": 1.3635933806146572, + "grad_norm": 2.421995162963867, + "learning_rate": 4.420624505915813e-06, + "loss": 0.4644, + "step": 2884 + }, + { + "epoch": 1.364066193853428, + "grad_norm": 2.5876688957214355, + "learning_rate": 4.420225103179981e-06, + "loss": 0.5743, + "step": 2885 + }, + { + "epoch": 1.3645390070921986, + "grad_norm": 2.89341139793396, + "learning_rate": 4.419825580880152e-06, + "loss": 0.5454, + "step": 2886 + }, + { + "epoch": 1.3650118203309693, + "grad_norm": 2.534708261489868, + "learning_rate": 4.419425939041203e-06, + "loss": 0.5572, + "step": 2887 + }, + { + "epoch": 1.36548463356974, + "grad_norm": 2.6052141189575195, + "learning_rate": 4.419026177688017e-06, + "loss": 0.4763, + "step": 2888 + }, + { + "epoch": 1.3659574468085105, + "grad_norm": 2.723720073699951, + "learning_rate": 4.4186262968454854e-06, + "loss": 0.5659, + "step": 2889 + }, + { + "epoch": 1.3664302600472813, + "grad_norm": 2.8909599781036377, + "learning_rate": 4.418226296538507e-06, + "loss": 0.4996, + "step": 2890 + }, + { + "epoch": 1.366903073286052, + "grad_norm": 2.551375389099121, + "learning_rate": 4.417826176791988e-06, + "loss": 0.5259, + "step": 2891 + }, + { + "epoch": 1.3673758865248227, + "grad_norm": 3.360267162322998, + "learning_rate": 4.417425937630843e-06, + "loss": 0.5381, + "step": 2892 + }, + { + "epoch": 1.3678486997635932, + "grad_norm": 2.7611942291259766, + "learning_rate": 4.417025579079992e-06, + "loss": 0.6022, + "step": 2893 + }, + { + "epoch": 1.368321513002364, + "grad_norm": 2.5931224822998047, + "learning_rate": 4.416625101164365e-06, + "loss": 0.5102, + "step": 2894 + }, + { + "epoch": 1.3687943262411348, + "grad_norm": 2.5888102054595947, + "learning_rate": 4.416224503908897e-06, + "loss": 0.4955, + "step": 2895 + }, + { + "epoch": 1.3692671394799054, + "grad_norm": 2.6262896060943604, + "learning_rate": 4.41582378733853e-06, + "loss": 0.5101, + "step": 2896 + }, + { + "epoch": 1.369739952718676, + "grad_norm": 3.339170217514038, + "learning_rate": 4.415422951478218e-06, + "loss": 0.4939, + "step": 2897 + }, + { + "epoch": 1.3702127659574468, + "grad_norm": 2.940866708755493, + "learning_rate": 4.415021996352917e-06, + "loss": 0.5157, + "step": 
2898 + }, + { + "epoch": 1.3706855791962176, + "grad_norm": 2.7423818111419678, + "learning_rate": 4.414620921987594e-06, + "loss": 0.5308, + "step": 2899 + }, + { + "epoch": 1.3711583924349882, + "grad_norm": 2.7177040576934814, + "learning_rate": 4.414219728407221e-06, + "loss": 0.5429, + "step": 2900 + }, + { + "epoch": 1.3716312056737587, + "grad_norm": 2.560774087905884, + "learning_rate": 4.4138184156367794e-06, + "loss": 0.5266, + "step": 2901 + }, + { + "epoch": 1.3721040189125295, + "grad_norm": 2.5649116039276123, + "learning_rate": 4.413416983701256e-06, + "loss": 0.4718, + "step": 2902 + }, + { + "epoch": 1.3725768321513003, + "grad_norm": 2.8547167778015137, + "learning_rate": 4.413015432625648e-06, + "loss": 0.5129, + "step": 2903 + }, + { + "epoch": 1.373049645390071, + "grad_norm": 2.5413618087768555, + "learning_rate": 4.412613762434958e-06, + "loss": 0.5738, + "step": 2904 + }, + { + "epoch": 1.3735224586288415, + "grad_norm": 3.3252241611480713, + "learning_rate": 4.412211973154195e-06, + "loss": 0.5639, + "step": 2905 + }, + { + "epoch": 1.3739952718676123, + "grad_norm": 2.869102954864502, + "learning_rate": 4.411810064808376e-06, + "loss": 0.5384, + "step": 2906 + }, + { + "epoch": 1.374468085106383, + "grad_norm": 2.703199863433838, + "learning_rate": 4.411408037422529e-06, + "loss": 0.5742, + "step": 2907 + }, + { + "epoch": 1.3749408983451537, + "grad_norm": 2.685450792312622, + "learning_rate": 4.411005891021684e-06, + "loss": 0.5121, + "step": 2908 + }, + { + "epoch": 1.3754137115839242, + "grad_norm": 2.9572203159332275, + "learning_rate": 4.410603625630882e-06, + "loss": 0.5444, + "step": 2909 + }, + { + "epoch": 1.375886524822695, + "grad_norm": 2.707002878189087, + "learning_rate": 4.410201241275169e-06, + "loss": 0.5125, + "step": 2910 + }, + { + "epoch": 1.3763593380614658, + "grad_norm": 3.0158939361572266, + "learning_rate": 4.409798737979602e-06, + "loss": 0.5299, + "step": 2911 + }, + { + "epoch": 1.3768321513002364, + "grad_norm": 2.7932698726654053, + "learning_rate": 4.4093961157692415e-06, + "loss": 0.5437, + "step": 2912 + }, + { + "epoch": 1.377304964539007, + "grad_norm": 2.459510326385498, + "learning_rate": 4.408993374669156e-06, + "loss": 0.5548, + "step": 2913 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 2.7500696182250977, + "learning_rate": 4.408590514704425e-06, + "loss": 0.5186, + "step": 2914 + }, + { + "epoch": 1.3782505910165486, + "grad_norm": 2.7824268341064453, + "learning_rate": 4.4081875359001315e-06, + "loss": 0.4762, + "step": 2915 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 2.4202158451080322, + "learning_rate": 4.4077844382813675e-06, + "loss": 0.5005, + "step": 2916 + }, + { + "epoch": 1.3791962174940897, + "grad_norm": 2.5566670894622803, + "learning_rate": 4.4073812218732316e-06, + "loss": 0.5377, + "step": 2917 + }, + { + "epoch": 1.3796690307328605, + "grad_norm": 3.400874376296997, + "learning_rate": 4.406977886700831e-06, + "loss": 0.6637, + "step": 2918 + }, + { + "epoch": 1.3801418439716313, + "grad_norm": 2.8187878131866455, + "learning_rate": 4.406574432789278e-06, + "loss": 0.5033, + "step": 2919 + }, + { + "epoch": 1.3806146572104019, + "grad_norm": 2.5578041076660156, + "learning_rate": 4.406170860163697e-06, + "loss": 0.5293, + "step": 2920 + }, + { + "epoch": 1.3810874704491725, + "grad_norm": 2.6709718704223633, + "learning_rate": 4.405767168849213e-06, + "loss": 0.5144, + "step": 2921 + }, + { + "epoch": 1.3815602836879433, + "grad_norm": 3.049365997314453, + "learning_rate": 
4.405363358870965e-06, + "loss": 0.4894, + "step": 2922 + }, + { + "epoch": 1.382033096926714, + "grad_norm": 2.5569891929626465, + "learning_rate": 4.404959430254095e-06, + "loss": 0.4929, + "step": 2923 + }, + { + "epoch": 1.3825059101654846, + "grad_norm": 2.8288230895996094, + "learning_rate": 4.404555383023754e-06, + "loss": 0.5438, + "step": 2924 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 2.8363358974456787, + "learning_rate": 4.404151217205102e-06, + "loss": 0.545, + "step": 2925 + }, + { + "epoch": 1.383451536643026, + "grad_norm": 2.720972776412964, + "learning_rate": 4.403746932823302e-06, + "loss": 0.5732, + "step": 2926 + }, + { + "epoch": 1.3839243498817968, + "grad_norm": 2.728043794631958, + "learning_rate": 4.403342529903528e-06, + "loss": 0.4944, + "step": 2927 + }, + { + "epoch": 1.3843971631205674, + "grad_norm": 2.4366135597229004, + "learning_rate": 4.402938008470961e-06, + "loss": 0.4441, + "step": 2928 + }, + { + "epoch": 1.384869976359338, + "grad_norm": 2.858454704284668, + "learning_rate": 4.402533368550788e-06, + "loss": 0.5359, + "step": 2929 + }, + { + "epoch": 1.3853427895981087, + "grad_norm": 2.805795907974243, + "learning_rate": 4.402128610168205e-06, + "loss": 0.4954, + "step": 2930 + }, + { + "epoch": 1.3858156028368795, + "grad_norm": 3.3514177799224854, + "learning_rate": 4.401723733348413e-06, + "loss": 0.579, + "step": 2931 + }, + { + "epoch": 1.3862884160756501, + "grad_norm": 2.6255125999450684, + "learning_rate": 4.401318738116624e-06, + "loss": 0.5002, + "step": 2932 + }, + { + "epoch": 1.3867612293144207, + "grad_norm": 2.3480796813964844, + "learning_rate": 4.400913624498054e-06, + "loss": 0.4688, + "step": 2933 + }, + { + "epoch": 1.3872340425531915, + "grad_norm": 2.710165023803711, + "learning_rate": 4.400508392517927e-06, + "loss": 0.5099, + "step": 2934 + }, + { + "epoch": 1.3877068557919623, + "grad_norm": 2.5820295810699463, + "learning_rate": 4.400103042201477e-06, + "loss": 0.512, + "step": 2935 + }, + { + "epoch": 1.3881796690307329, + "grad_norm": 2.750596523284912, + "learning_rate": 4.399697573573942e-06, + "loss": 0.463, + "step": 2936 + }, + { + "epoch": 1.3886524822695034, + "grad_norm": 3.497537612915039, + "learning_rate": 4.399291986660569e-06, + "loss": 0.5676, + "step": 2937 + }, + { + "epoch": 1.3891252955082742, + "grad_norm": 2.4046003818511963, + "learning_rate": 4.398886281486612e-06, + "loss": 0.5408, + "step": 2938 + }, + { + "epoch": 1.389598108747045, + "grad_norm": 2.941606283187866, + "learning_rate": 4.398480458077332e-06, + "loss": 0.5734, + "step": 2939 + }, + { + "epoch": 1.3900709219858156, + "grad_norm": 3.030214309692383, + "learning_rate": 4.398074516458e-06, + "loss": 0.5353, + "step": 2940 + }, + { + "epoch": 1.3905437352245862, + "grad_norm": 2.9991626739501953, + "learning_rate": 4.397668456653889e-06, + "loss": 0.5989, + "step": 2941 + }, + { + "epoch": 1.391016548463357, + "grad_norm": 4.163141250610352, + "learning_rate": 4.397262278690285e-06, + "loss": 0.5436, + "step": 2942 + }, + { + "epoch": 1.3914893617021278, + "grad_norm": 2.6576037406921387, + "learning_rate": 4.396855982592478e-06, + "loss": 0.5206, + "step": 2943 + }, + { + "epoch": 1.3919621749408984, + "grad_norm": 2.7729203701019287, + "learning_rate": 4.396449568385768e-06, + "loss": 0.5403, + "step": 2944 + }, + { + "epoch": 1.392434988179669, + "grad_norm": 2.4560446739196777, + "learning_rate": 4.396043036095457e-06, + "loss": 0.4924, + "step": 2945 + }, + { + "epoch": 1.3929078014184397, + "grad_norm": 
2.6370556354522705, + "learning_rate": 4.39563638574686e-06, + "loss": 0.5543, + "step": 2946 + }, + { + "epoch": 1.3933806146572105, + "grad_norm": 2.593914270401001, + "learning_rate": 4.395229617365298e-06, + "loss": 0.5133, + "step": 2947 + }, + { + "epoch": 1.393853427895981, + "grad_norm": 2.3583998680114746, + "learning_rate": 4.394822730976099e-06, + "loss": 0.4436, + "step": 2948 + }, + { + "epoch": 1.3943262411347517, + "grad_norm": 3.2768537998199463, + "learning_rate": 4.394415726604596e-06, + "loss": 0.5489, + "step": 2949 + }, + { + "epoch": 1.3947990543735225, + "grad_norm": 2.88662052154541, + "learning_rate": 4.394008604276133e-06, + "loss": 0.5194, + "step": 2950 + }, + { + "epoch": 1.3952718676122933, + "grad_norm": 2.46610426902771, + "learning_rate": 4.393601364016059e-06, + "loss": 0.5255, + "step": 2951 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 3.122509241104126, + "learning_rate": 4.393194005849731e-06, + "loss": 0.6046, + "step": 2952 + }, + { + "epoch": 1.3962174940898344, + "grad_norm": 2.724926471710205, + "learning_rate": 4.392786529802513e-06, + "loss": 0.4958, + "step": 2953 + }, + { + "epoch": 1.3966903073286052, + "grad_norm": 2.491485595703125, + "learning_rate": 4.3923789358997785e-06, + "loss": 0.5209, + "step": 2954 + }, + { + "epoch": 1.397163120567376, + "grad_norm": 2.61110520362854, + "learning_rate": 4.3919712241669056e-06, + "loss": 0.5202, + "step": 2955 + }, + { + "epoch": 1.3976359338061466, + "grad_norm": 2.3814501762390137, + "learning_rate": 4.39156339462928e-06, + "loss": 0.4966, + "step": 2956 + }, + { + "epoch": 1.3981087470449172, + "grad_norm": 2.762498617172241, + "learning_rate": 4.391155447312296e-06, + "loss": 0.6025, + "step": 2957 + }, + { + "epoch": 1.398581560283688, + "grad_norm": 2.964975595474243, + "learning_rate": 4.390747382241355e-06, + "loss": 0.4845, + "step": 2958 + }, + { + "epoch": 1.3990543735224588, + "grad_norm": 3.0117249488830566, + "learning_rate": 4.3903391994418655e-06, + "loss": 0.5326, + "step": 2959 + }, + { + "epoch": 1.3995271867612293, + "grad_norm": 2.578626871109009, + "learning_rate": 4.389930898939243e-06, + "loss": 0.5271, + "step": 2960 + }, + { + "epoch": 1.4, + "grad_norm": 2.747441053390503, + "learning_rate": 4.38952248075891e-06, + "loss": 0.5553, + "step": 2961 + }, + { + "epoch": 1.4004728132387707, + "grad_norm": 2.8273086547851562, + "learning_rate": 4.389113944926297e-06, + "loss": 0.5475, + "step": 2962 + }, + { + "epoch": 1.4009456264775415, + "grad_norm": 2.55238676071167, + "learning_rate": 4.388705291466843e-06, + "loss": 0.4864, + "step": 2963 + }, + { + "epoch": 1.401418439716312, + "grad_norm": 2.597214460372925, + "learning_rate": 4.388296520405992e-06, + "loss": 0.4845, + "step": 2964 + }, + { + "epoch": 1.4018912529550827, + "grad_norm": 2.608962297439575, + "learning_rate": 4.387887631769196e-06, + "loss": 0.5544, + "step": 2965 + }, + { + "epoch": 1.4023640661938535, + "grad_norm": 2.2754876613616943, + "learning_rate": 4.3874786255819165e-06, + "loss": 0.5045, + "step": 2966 + }, + { + "epoch": 1.4028368794326243, + "grad_norm": 2.9900264739990234, + "learning_rate": 4.387069501869618e-06, + "loss": 0.562, + "step": 2967 + }, + { + "epoch": 1.4033096926713948, + "grad_norm": 2.8069417476654053, + "learning_rate": 4.386660260657778e-06, + "loss": 0.5284, + "step": 2968 + }, + { + "epoch": 1.4037825059101654, + "grad_norm": 2.68894624710083, + "learning_rate": 4.386250901971875e-06, + "loss": 0.5879, + "step": 2969 + }, + { + "epoch": 1.4042553191489362, + 
"grad_norm": 2.614485025405884, + "learning_rate": 4.385841425837399e-06, + "loss": 0.4771, + "step": 2970 + }, + { + "epoch": 1.4047281323877068, + "grad_norm": 2.487950325012207, + "learning_rate": 4.385431832279848e-06, + "loss": 0.5552, + "step": 2971 + }, + { + "epoch": 1.4052009456264776, + "grad_norm": 2.5098392963409424, + "learning_rate": 4.385022121324723e-06, + "loss": 0.5267, + "step": 2972 + }, + { + "epoch": 1.4056737588652481, + "grad_norm": 2.825838565826416, + "learning_rate": 4.384612292997537e-06, + "loss": 0.5336, + "step": 2973 + }, + { + "epoch": 1.406146572104019, + "grad_norm": 2.898188829421997, + "learning_rate": 4.384202347323806e-06, + "loss": 0.5685, + "step": 2974 + }, + { + "epoch": 1.4066193853427895, + "grad_norm": 2.8722569942474365, + "learning_rate": 4.383792284329057e-06, + "loss": 0.5977, + "step": 2975 + }, + { + "epoch": 1.4070921985815603, + "grad_norm": 2.832951307296753, + "learning_rate": 4.3833821040388235e-06, + "loss": 0.5766, + "step": 2976 + }, + { + "epoch": 1.407565011820331, + "grad_norm": 2.7353670597076416, + "learning_rate": 4.3829718064786446e-06, + "loss": 0.5461, + "step": 2977 + }, + { + "epoch": 1.4080378250591017, + "grad_norm": 2.6050429344177246, + "learning_rate": 4.3825613916740675e-06, + "loss": 0.5501, + "step": 2978 + }, + { + "epoch": 1.4085106382978723, + "grad_norm": 2.79719877243042, + "learning_rate": 4.382150859650647e-06, + "loss": 0.502, + "step": 2979 + }, + { + "epoch": 1.408983451536643, + "grad_norm": 2.5538079738616943, + "learning_rate": 4.381740210433946e-06, + "loss": 0.4762, + "step": 2980 + }, + { + "epoch": 1.4094562647754136, + "grad_norm": 2.7256062030792236, + "learning_rate": 4.381329444049533e-06, + "loss": 0.4692, + "step": 2981 + }, + { + "epoch": 1.4099290780141844, + "grad_norm": 2.7778146266937256, + "learning_rate": 4.3809185605229855e-06, + "loss": 0.5366, + "step": 2982 + }, + { + "epoch": 1.410401891252955, + "grad_norm": 2.6289451122283936, + "learning_rate": 4.380507559879887e-06, + "loss": 0.5412, + "step": 2983 + }, + { + "epoch": 1.4108747044917258, + "grad_norm": 2.697204828262329, + "learning_rate": 4.380096442145827e-06, + "loss": 0.5065, + "step": 2984 + }, + { + "epoch": 1.4113475177304964, + "grad_norm": 2.4709219932556152, + "learning_rate": 4.379685207346407e-06, + "loss": 0.568, + "step": 2985 + }, + { + "epoch": 1.4118203309692672, + "grad_norm": 2.9740655422210693, + "learning_rate": 4.379273855507231e-06, + "loss": 0.5512, + "step": 2986 + }, + { + "epoch": 1.4122931442080378, + "grad_norm": 3.0090627670288086, + "learning_rate": 4.378862386653911e-06, + "loss": 0.5459, + "step": 2987 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 2.8835368156433105, + "learning_rate": 4.378450800812071e-06, + "loss": 0.5357, + "step": 2988 + }, + { + "epoch": 1.4132387706855791, + "grad_norm": 2.558824062347412, + "learning_rate": 4.378039098007335e-06, + "loss": 0.536, + "step": 2989 + }, + { + "epoch": 1.41371158392435, + "grad_norm": 2.5572092533111572, + "learning_rate": 4.377627278265339e-06, + "loss": 0.5183, + "step": 2990 + }, + { + "epoch": 1.4141843971631205, + "grad_norm": 2.7356579303741455, + "learning_rate": 4.377215341611727e-06, + "loss": 0.5087, + "step": 2991 + }, + { + "epoch": 1.4146572104018913, + "grad_norm": 2.7541024684906006, + "learning_rate": 4.376803288072146e-06, + "loss": 0.4509, + "step": 2992 + }, + { + "epoch": 1.4151300236406619, + "grad_norm": 2.7548446655273438, + "learning_rate": 4.376391117672254e-06, + "loss": 0.5532, + "step": 2993 + }, + { + 
"epoch": 1.4156028368794327, + "grad_norm": 2.9107465744018555, + "learning_rate": 4.375978830437715e-06, + "loss": 0.5719, + "step": 2994 + }, + { + "epoch": 1.4160756501182032, + "grad_norm": 2.7077393531799316, + "learning_rate": 4.3755664263942e-06, + "loss": 0.5084, + "step": 2995 + }, + { + "epoch": 1.416548463356974, + "grad_norm": 2.764209270477295, + "learning_rate": 4.375153905567388e-06, + "loss": 0.5976, + "step": 2996 + }, + { + "epoch": 1.4170212765957446, + "grad_norm": 2.7792932987213135, + "learning_rate": 4.374741267982964e-06, + "loss": 0.5358, + "step": 2997 + }, + { + "epoch": 1.4174940898345154, + "grad_norm": 2.459212064743042, + "learning_rate": 4.374328513666622e-06, + "loss": 0.5181, + "step": 2998 + }, + { + "epoch": 1.417966903073286, + "grad_norm": 2.548546552658081, + "learning_rate": 4.373915642644062e-06, + "loss": 0.528, + "step": 2999 + }, + { + "epoch": 1.4184397163120568, + "grad_norm": 2.998138189315796, + "learning_rate": 4.373502654940992e-06, + "loss": 0.5233, + "step": 3000 + }, + { + "epoch": 1.4189125295508274, + "grad_norm": 2.604341983795166, + "learning_rate": 4.373089550583126e-06, + "loss": 0.5274, + "step": 3001 + }, + { + "epoch": 1.4193853427895982, + "grad_norm": 2.6792588233947754, + "learning_rate": 4.372676329596188e-06, + "loss": 0.5061, + "step": 3002 + }, + { + "epoch": 1.4198581560283687, + "grad_norm": 2.5182368755340576, + "learning_rate": 4.372262992005906e-06, + "loss": 0.541, + "step": 3003 + }, + { + "epoch": 1.4203309692671395, + "grad_norm": 2.690718173980713, + "learning_rate": 4.371849537838018e-06, + "loss": 0.5308, + "step": 3004 + }, + { + "epoch": 1.42080378250591, + "grad_norm": 2.6797590255737305, + "learning_rate": 4.371435967118266e-06, + "loss": 0.5728, + "step": 3005 + }, + { + "epoch": 1.421276595744681, + "grad_norm": 2.847900152206421, + "learning_rate": 4.371022279872403e-06, + "loss": 0.5053, + "step": 3006 + }, + { + "epoch": 1.4217494089834515, + "grad_norm": 2.497810125350952, + "learning_rate": 4.370608476126186e-06, + "loss": 0.5057, + "step": 3007 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 2.5259225368499756, + "learning_rate": 4.370194555905382e-06, + "loss": 0.5508, + "step": 3008 + }, + { + "epoch": 1.4226950354609929, + "grad_norm": 2.774118423461914, + "learning_rate": 4.369780519235763e-06, + "loss": 0.5419, + "step": 3009 + }, + { + "epoch": 1.4231678486997636, + "grad_norm": 2.2764663696289062, + "learning_rate": 4.369366366143111e-06, + "loss": 0.5032, + "step": 3010 + }, + { + "epoch": 1.4236406619385342, + "grad_norm": 2.736347198486328, + "learning_rate": 4.368952096653211e-06, + "loss": 0.5184, + "step": 3011 + }, + { + "epoch": 1.424113475177305, + "grad_norm": 2.476762056350708, + "learning_rate": 4.36853771079186e-06, + "loss": 0.5331, + "step": 3012 + }, + { + "epoch": 1.4245862884160756, + "grad_norm": 2.8006162643432617, + "learning_rate": 4.3681232085848585e-06, + "loss": 0.5331, + "step": 3013 + }, + { + "epoch": 1.4250591016548464, + "grad_norm": 2.509143590927124, + "learning_rate": 4.367708590058016e-06, + "loss": 0.5127, + "step": 3014 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 3.030137538909912, + "learning_rate": 4.3672938552371505e-06, + "loss": 0.5555, + "step": 3015 + }, + { + "epoch": 1.4260047281323878, + "grad_norm": 3.0536904335021973, + "learning_rate": 4.3668790041480835e-06, + "loss": 0.5241, + "step": 3016 + }, + { + "epoch": 1.4264775413711583, + "grad_norm": 2.6400439739227295, + "learning_rate": 4.366464036816647e-06, + "loss": 0.4946, + 
"step": 3017 + }, + { + "epoch": 1.4269503546099291, + "grad_norm": 2.7302589416503906, + "learning_rate": 4.366048953268679e-06, + "loss": 0.5105, + "step": 3018 + }, + { + "epoch": 1.4274231678486997, + "grad_norm": 2.504549264907837, + "learning_rate": 4.365633753530026e-06, + "loss": 0.4844, + "step": 3019 + }, + { + "epoch": 1.4278959810874705, + "grad_norm": 2.3872320652008057, + "learning_rate": 4.365218437626539e-06, + "loss": 0.4402, + "step": 3020 + }, + { + "epoch": 1.428368794326241, + "grad_norm": 2.531649351119995, + "learning_rate": 4.364803005584078e-06, + "loss": 0.4913, + "step": 3021 + }, + { + "epoch": 1.4288416075650119, + "grad_norm": 2.4683783054351807, + "learning_rate": 4.364387457428512e-06, + "loss": 0.515, + "step": 3022 + }, + { + "epoch": 1.4293144208037825, + "grad_norm": 2.632336378097534, + "learning_rate": 4.363971793185713e-06, + "loss": 0.5398, + "step": 3023 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 2.7456719875335693, + "learning_rate": 4.363556012881565e-06, + "loss": 0.5254, + "step": 3024 + }, + { + "epoch": 1.4302600472813238, + "grad_norm": 2.607177972793579, + "learning_rate": 4.363140116541955e-06, + "loss": 0.5266, + "step": 3025 + }, + { + "epoch": 1.4307328605200946, + "grad_norm": 2.640127420425415, + "learning_rate": 4.3627241041927796e-06, + "loss": 0.5157, + "step": 3026 + }, + { + "epoch": 1.4312056737588652, + "grad_norm": 2.4210736751556396, + "learning_rate": 4.362307975859941e-06, + "loss": 0.4599, + "step": 3027 + }, + { + "epoch": 1.431678486997636, + "grad_norm": 2.6007790565490723, + "learning_rate": 4.361891731569352e-06, + "loss": 0.5298, + "step": 3028 + }, + { + "epoch": 1.4321513002364066, + "grad_norm": 2.5352046489715576, + "learning_rate": 4.361475371346928e-06, + "loss": 0.5128, + "step": 3029 + }, + { + "epoch": 1.4326241134751774, + "grad_norm": 2.4204049110412598, + "learning_rate": 4.361058895218596e-06, + "loss": 0.4669, + "step": 3030 + }, + { + "epoch": 1.433096926713948, + "grad_norm": 2.525240182876587, + "learning_rate": 4.360642303210286e-06, + "loss": 0.4925, + "step": 3031 + }, + { + "epoch": 1.4335697399527187, + "grad_norm": 2.839646339416504, + "learning_rate": 4.360225595347939e-06, + "loss": 0.5868, + "step": 3032 + }, + { + "epoch": 1.4340425531914893, + "grad_norm": 2.5043296813964844, + "learning_rate": 4.359808771657501e-06, + "loss": 0.4951, + "step": 3033 + }, + { + "epoch": 1.4345153664302601, + "grad_norm": 2.9082300662994385, + "learning_rate": 4.359391832164927e-06, + "loss": 0.5259, + "step": 3034 + }, + { + "epoch": 1.4349881796690307, + "grad_norm": 2.6651999950408936, + "learning_rate": 4.3589747768961745e-06, + "loss": 0.537, + "step": 3035 + }, + { + "epoch": 1.4354609929078015, + "grad_norm": 2.577077865600586, + "learning_rate": 4.358557605877216e-06, + "loss": 0.5186, + "step": 3036 + }, + { + "epoch": 1.435933806146572, + "grad_norm": 2.7445287704467773, + "learning_rate": 4.3581403191340236e-06, + "loss": 0.5573, + "step": 3037 + }, + { + "epoch": 1.4364066193853429, + "grad_norm": 2.502086639404297, + "learning_rate": 4.357722916692582e-06, + "loss": 0.5039, + "step": 3038 + }, + { + "epoch": 1.4368794326241134, + "grad_norm": 2.4476163387298584, + "learning_rate": 4.357305398578879e-06, + "loss": 0.5638, + "step": 3039 + }, + { + "epoch": 1.4373522458628842, + "grad_norm": 2.7705588340759277, + "learning_rate": 4.356887764818915e-06, + "loss": 0.5485, + "step": 3040 + }, + { + "epoch": 1.4378250591016548, + "grad_norm": 2.498225450515747, + "learning_rate": 
4.356470015438691e-06, + "loss": 0.5486, + "step": 3041 + }, + { + "epoch": 1.4382978723404256, + "grad_norm": 2.394320011138916, + "learning_rate": 4.356052150464219e-06, + "loss": 0.512, + "step": 3042 + }, + { + "epoch": 1.4387706855791962, + "grad_norm": 2.8725767135620117, + "learning_rate": 4.3556341699215185e-06, + "loss": 0.5202, + "step": 3043 + }, + { + "epoch": 1.439243498817967, + "grad_norm": 3.1707918643951416, + "learning_rate": 4.355216073836615e-06, + "loss": 0.5229, + "step": 3044 + }, + { + "epoch": 1.4397163120567376, + "grad_norm": 2.532578468322754, + "learning_rate": 4.3547978622355415e-06, + "loss": 0.4569, + "step": 3045 + }, + { + "epoch": 1.4401891252955084, + "grad_norm": 3.0111029148101807, + "learning_rate": 4.354379535144338e-06, + "loss": 0.5801, + "step": 3046 + }, + { + "epoch": 1.440661938534279, + "grad_norm": 2.9554224014282227, + "learning_rate": 4.353961092589052e-06, + "loss": 0.5968, + "step": 3047 + }, + { + "epoch": 1.4411347517730497, + "grad_norm": 2.7562637329101562, + "learning_rate": 4.353542534595738e-06, + "loss": 0.5005, + "step": 3048 + }, + { + "epoch": 1.4416075650118203, + "grad_norm": 3.083254337310791, + "learning_rate": 4.3531238611904595e-06, + "loss": 0.5389, + "step": 3049 + }, + { + "epoch": 1.442080378250591, + "grad_norm": 2.7778005599975586, + "learning_rate": 4.352705072399282e-06, + "loss": 0.5342, + "step": 3050 + }, + { + "epoch": 1.4425531914893617, + "grad_norm": 2.6673996448516846, + "learning_rate": 4.3522861682482845e-06, + "loss": 0.5213, + "step": 3051 + }, + { + "epoch": 1.4430260047281322, + "grad_norm": 2.637605905532837, + "learning_rate": 4.351867148763548e-06, + "loss": 0.4893, + "step": 3052 + }, + { + "epoch": 1.443498817966903, + "grad_norm": 2.834469795227051, + "learning_rate": 4.351448013971166e-06, + "loss": 0.5391, + "step": 3053 + }, + { + "epoch": 1.4439716312056738, + "grad_norm": 2.824153184890747, + "learning_rate": 4.351028763897234e-06, + "loss": 0.6403, + "step": 3054 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 2.558966875076294, + "learning_rate": 4.350609398567857e-06, + "loss": 0.4912, + "step": 3055 + }, + { + "epoch": 1.444917257683215, + "grad_norm": 2.281726360321045, + "learning_rate": 4.3501899180091475e-06, + "loss": 0.4655, + "step": 3056 + }, + { + "epoch": 1.4453900709219858, + "grad_norm": 2.499472141265869, + "learning_rate": 4.349770322247225e-06, + "loss": 0.4878, + "step": 3057 + }, + { + "epoch": 1.4458628841607566, + "grad_norm": 2.578615188598633, + "learning_rate": 4.349350611308215e-06, + "loss": 0.4855, + "step": 3058 + }, + { + "epoch": 1.4463356973995272, + "grad_norm": 2.7111165523529053, + "learning_rate": 4.348930785218252e-06, + "loss": 0.5415, + "step": 3059 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 2.8081610202789307, + "learning_rate": 4.348510844003476e-06, + "loss": 0.4881, + "step": 3060 + }, + { + "epoch": 1.4472813238770685, + "grad_norm": 2.9439868927001953, + "learning_rate": 4.348090787690036e-06, + "loss": 0.5485, + "step": 3061 + }, + { + "epoch": 1.4477541371158393, + "grad_norm": 2.592532157897949, + "learning_rate": 4.347670616304085e-06, + "loss": 0.4912, + "step": 3062 + }, + { + "epoch": 1.44822695035461, + "grad_norm": 2.960592746734619, + "learning_rate": 4.347250329871787e-06, + "loss": 0.5473, + "step": 3063 + }, + { + "epoch": 1.4486997635933805, + "grad_norm": 2.5786688327789307, + "learning_rate": 4.3468299284193116e-06, + "loss": 0.5348, + "step": 3064 + }, + { + "epoch": 1.4491725768321513, + "grad_norm": 
2.6084046363830566, + "learning_rate": 4.346409411972834e-06, + "loss": 0.527, + "step": 3065 + }, + { + "epoch": 1.449645390070922, + "grad_norm": 2.489748239517212, + "learning_rate": 4.3459887805585385e-06, + "loss": 0.4943, + "step": 3066 + }, + { + "epoch": 1.4501182033096927, + "grad_norm": 2.452131986618042, + "learning_rate": 4.345568034202617e-06, + "loss": 0.4886, + "step": 3067 + }, + { + "epoch": 1.4505910165484632, + "grad_norm": 2.4034671783447266, + "learning_rate": 4.345147172931266e-06, + "loss": 0.4689, + "step": 3068 + }, + { + "epoch": 1.451063829787234, + "grad_norm": 2.6045448780059814, + "learning_rate": 4.344726196770691e-06, + "loss": 0.5842, + "step": 3069 + }, + { + "epoch": 1.4515366430260048, + "grad_norm": 2.697593927383423, + "learning_rate": 4.3443051057471045e-06, + "loss": 0.5358, + "step": 3070 + }, + { + "epoch": 1.4520094562647754, + "grad_norm": 2.6080820560455322, + "learning_rate": 4.343883899886727e-06, + "loss": 0.5361, + "step": 3071 + }, + { + "epoch": 1.452482269503546, + "grad_norm": 2.4605307579040527, + "learning_rate": 4.343462579215783e-06, + "loss": 0.4941, + "step": 3072 + }, + { + "epoch": 1.4529550827423168, + "grad_norm": 2.8025355339050293, + "learning_rate": 4.343041143760509e-06, + "loss": 0.5116, + "step": 3073 + }, + { + "epoch": 1.4534278959810876, + "grad_norm": 2.432515859603882, + "learning_rate": 4.3426195935471434e-06, + "loss": 0.4991, + "step": 3074 + }, + { + "epoch": 1.4539007092198581, + "grad_norm": 2.5838661193847656, + "learning_rate": 4.342197928601935e-06, + "loss": 0.4994, + "step": 3075 + }, + { + "epoch": 1.4543735224586287, + "grad_norm": 2.421692371368408, + "learning_rate": 4.341776148951141e-06, + "loss": 0.4945, + "step": 3076 + }, + { + "epoch": 1.4548463356973995, + "grad_norm": 2.5354676246643066, + "learning_rate": 4.341354254621021e-06, + "loss": 0.4859, + "step": 3077 + }, + { + "epoch": 1.4553191489361703, + "grad_norm": 2.7316789627075195, + "learning_rate": 4.340932245637846e-06, + "loss": 0.5136, + "step": 3078 + }, + { + "epoch": 1.455791962174941, + "grad_norm": 3.5903496742248535, + "learning_rate": 4.340510122027891e-06, + "loss": 0.6451, + "step": 3079 + }, + { + "epoch": 1.4562647754137115, + "grad_norm": 2.95190167427063, + "learning_rate": 4.340087883817442e-06, + "loss": 0.6354, + "step": 3080 + }, + { + "epoch": 1.4567375886524823, + "grad_norm": 2.8659214973449707, + "learning_rate": 4.339665531032789e-06, + "loss": 0.5514, + "step": 3081 + }, + { + "epoch": 1.457210401891253, + "grad_norm": 2.5681674480438232, + "learning_rate": 4.339243063700231e-06, + "loss": 0.5135, + "step": 3082 + }, + { + "epoch": 1.4576832151300236, + "grad_norm": 2.7353906631469727, + "learning_rate": 4.338820481846072e-06, + "loss": 0.4608, + "step": 3083 + }, + { + "epoch": 1.4581560283687942, + "grad_norm": 2.6116466522216797, + "learning_rate": 4.3383977854966245e-06, + "loss": 0.4924, + "step": 3084 + }, + { + "epoch": 1.458628841607565, + "grad_norm": 2.6676487922668457, + "learning_rate": 4.337974974678207e-06, + "loss": 0.5747, + "step": 3085 + }, + { + "epoch": 1.4591016548463358, + "grad_norm": 2.909031629562378, + "learning_rate": 4.337552049417147e-06, + "loss": 0.4618, + "step": 3086 + }, + { + "epoch": 1.4595744680851064, + "grad_norm": 2.7614190578460693, + "learning_rate": 4.33712900973978e-06, + "loss": 0.5154, + "step": 3087 + }, + { + "epoch": 1.460047281323877, + "grad_norm": 2.452188014984131, + "learning_rate": 4.336705855672444e-06, + "loss": 0.542, + "step": 3088 + }, + { + "epoch": 
1.4605200945626478, + "grad_norm": 3.0004117488861084, + "learning_rate": 4.336282587241488e-06, + "loss": 0.5857, + "step": 3089 + }, + { + "epoch": 1.4609929078014185, + "grad_norm": 2.870783567428589, + "learning_rate": 4.335859204473268e-06, + "loss": 0.5506, + "step": 3090 + }, + { + "epoch": 1.4614657210401891, + "grad_norm": 3.1078689098358154, + "learning_rate": 4.335435707394145e-06, + "loss": 0.5138, + "step": 3091 + }, + { + "epoch": 1.4619385342789597, + "grad_norm": 2.8516197204589844, + "learning_rate": 4.335012096030488e-06, + "loss": 0.5842, + "step": 3092 + }, + { + "epoch": 1.4624113475177305, + "grad_norm": 2.615922212600708, + "learning_rate": 4.334588370408675e-06, + "loss": 0.4896, + "step": 3093 + }, + { + "epoch": 1.4628841607565013, + "grad_norm": 3.1911802291870117, + "learning_rate": 4.334164530555088e-06, + "loss": 0.4974, + "step": 3094 + }, + { + "epoch": 1.4633569739952719, + "grad_norm": 3.075051784515381, + "learning_rate": 4.3337405764961186e-06, + "loss": 0.567, + "step": 3095 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 2.550625801086426, + "learning_rate": 4.333316508258163e-06, + "loss": 0.4887, + "step": 3096 + }, + { + "epoch": 1.4643026004728132, + "grad_norm": 2.3986475467681885, + "learning_rate": 4.332892325867629e-06, + "loss": 0.5047, + "step": 3097 + }, + { + "epoch": 1.464775413711584, + "grad_norm": 2.5045125484466553, + "learning_rate": 4.332468029350926e-06, + "loss": 0.4721, + "step": 3098 + }, + { + "epoch": 1.4652482269503546, + "grad_norm": 2.347365617752075, + "learning_rate": 4.332043618734474e-06, + "loss": 0.4913, + "step": 3099 + }, + { + "epoch": 1.4657210401891252, + "grad_norm": 2.459928512573242, + "learning_rate": 4.331619094044699e-06, + "loss": 0.523, + "step": 3100 + }, + { + "epoch": 1.466193853427896, + "grad_norm": 2.5771310329437256, + "learning_rate": 4.331194455308035e-06, + "loss": 0.593, + "step": 3101 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 3.1351823806762695, + "learning_rate": 4.330769702550921e-06, + "loss": 0.5852, + "step": 3102 + }, + { + "epoch": 1.4671394799054374, + "grad_norm": 2.589817523956299, + "learning_rate": 4.330344835799806e-06, + "loss": 0.508, + "step": 3103 + }, + { + "epoch": 1.467612293144208, + "grad_norm": 3.1140341758728027, + "learning_rate": 4.329919855081144e-06, + "loss": 0.469, + "step": 3104 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 2.8186635971069336, + "learning_rate": 4.329494760421396e-06, + "loss": 0.5088, + "step": 3105 + }, + { + "epoch": 1.4685579196217495, + "grad_norm": 2.676077365875244, + "learning_rate": 4.329069551847031e-06, + "loss": 0.52, + "step": 3106 + }, + { + "epoch": 1.46903073286052, + "grad_norm": 2.5543313026428223, + "learning_rate": 4.328644229384526e-06, + "loss": 0.5066, + "step": 3107 + }, + { + "epoch": 1.4695035460992907, + "grad_norm": 2.8176217079162598, + "learning_rate": 4.328218793060362e-06, + "loss": 0.6404, + "step": 3108 + }, + { + "epoch": 1.4699763593380615, + "grad_norm": 2.485217332839966, + "learning_rate": 4.3277932429010314e-06, + "loss": 0.4578, + "step": 3109 + }, + { + "epoch": 1.4704491725768323, + "grad_norm": 2.6741621494293213, + "learning_rate": 4.327367578933031e-06, + "loss": 0.5068, + "step": 3110 + }, + { + "epoch": 1.4709219858156029, + "grad_norm": 2.377242088317871, + "learning_rate": 4.326941801182863e-06, + "loss": 0.5249, + "step": 3111 + }, + { + "epoch": 1.4713947990543734, + "grad_norm": 2.790046215057373, + "learning_rate": 4.32651590967704e-06, + "loss": 0.5532, + "step": 
3112 + }, + { + "epoch": 1.4718676122931442, + "grad_norm": 2.78019642829895, + "learning_rate": 4.326089904442081e-06, + "loss": 0.5362, + "step": 3113 + }, + { + "epoch": 1.472340425531915, + "grad_norm": 2.5661380290985107, + "learning_rate": 4.32566378550451e-06, + "loss": 0.5041, + "step": 3114 + }, + { + "epoch": 1.4728132387706856, + "grad_norm": 2.522153615951538, + "learning_rate": 4.3252375528908605e-06, + "loss": 0.5074, + "step": 3115 + }, + { + "epoch": 1.4732860520094562, + "grad_norm": 2.874688148498535, + "learning_rate": 4.3248112066276725e-06, + "loss": 0.59, + "step": 3116 + }, + { + "epoch": 1.473758865248227, + "grad_norm": 3.067866802215576, + "learning_rate": 4.324384746741492e-06, + "loss": 0.5924, + "step": 3117 + }, + { + "epoch": 1.4742316784869978, + "grad_norm": 3.359463930130005, + "learning_rate": 4.323958173258873e-06, + "loss": 0.6346, + "step": 3118 + }, + { + "epoch": 1.4747044917257683, + "grad_norm": 2.193024158477783, + "learning_rate": 4.323531486206376e-06, + "loss": 0.4594, + "step": 3119 + }, + { + "epoch": 1.475177304964539, + "grad_norm": 2.886889934539795, + "learning_rate": 4.323104685610569e-06, + "loss": 0.523, + "step": 3120 + }, + { + "epoch": 1.4756501182033097, + "grad_norm": 2.7558681964874268, + "learning_rate": 4.322677771498028e-06, + "loss": 0.5387, + "step": 3121 + }, + { + "epoch": 1.4761229314420805, + "grad_norm": 2.639277935028076, + "learning_rate": 4.322250743895335e-06, + "loss": 0.5599, + "step": 3122 + }, + { + "epoch": 1.476595744680851, + "grad_norm": 2.786198616027832, + "learning_rate": 4.321823602829078e-06, + "loss": 0.5405, + "step": 3123 + }, + { + "epoch": 1.4770685579196217, + "grad_norm": 2.582315683364868, + "learning_rate": 4.321396348325853e-06, + "loss": 0.4452, + "step": 3124 + }, + { + "epoch": 1.4775413711583925, + "grad_norm": 2.8574297428131104, + "learning_rate": 4.320968980412265e-06, + "loss": 0.4846, + "step": 3125 + }, + { + "epoch": 1.4780141843971633, + "grad_norm": 2.705281972885132, + "learning_rate": 4.320541499114922e-06, + "loss": 0.5548, + "step": 3126 + }, + { + "epoch": 1.4784869976359338, + "grad_norm": 2.3152754306793213, + "learning_rate": 4.320113904460444e-06, + "loss": 0.5216, + "step": 3127 + }, + { + "epoch": 1.4789598108747044, + "grad_norm": 3.230764150619507, + "learning_rate": 4.319686196475453e-06, + "loss": 0.6192, + "step": 3128 + }, + { + "epoch": 1.4794326241134752, + "grad_norm": 2.463380813598633, + "learning_rate": 4.319258375186583e-06, + "loss": 0.4872, + "step": 3129 + }, + { + "epoch": 1.479905437352246, + "grad_norm": 2.8477656841278076, + "learning_rate": 4.31883044062047e-06, + "loss": 0.5371, + "step": 3130 + }, + { + "epoch": 1.4803782505910166, + "grad_norm": 2.393911123275757, + "learning_rate": 4.318402392803762e-06, + "loss": 0.5334, + "step": 3131 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 2.6113736629486084, + "learning_rate": 4.317974231763109e-06, + "loss": 0.5572, + "step": 3132 + }, + { + "epoch": 1.481323877068558, + "grad_norm": 2.3941731452941895, + "learning_rate": 4.317545957525173e-06, + "loss": 0.4849, + "step": 3133 + }, + { + "epoch": 1.4817966903073285, + "grad_norm": 2.9536755084991455, + "learning_rate": 4.317117570116619e-06, + "loss": 0.6058, + "step": 3134 + }, + { + "epoch": 1.4822695035460993, + "grad_norm": 2.595754623413086, + "learning_rate": 4.316689069564123e-06, + "loss": 0.5193, + "step": 3135 + }, + { + "epoch": 1.48274231678487, + "grad_norm": 2.569833993911743, + "learning_rate": 4.316260455894364e-06, + "loss": 
0.543, + "step": 3136 + }, + { + "epoch": 1.4832151300236407, + "grad_norm": 2.5137455463409424, + "learning_rate": 4.315831729134031e-06, + "loss": 0.5415, + "step": 3137 + }, + { + "epoch": 1.4836879432624113, + "grad_norm": 2.5582292079925537, + "learning_rate": 4.3154028893098176e-06, + "loss": 0.5338, + "step": 3138 + }, + { + "epoch": 1.484160756501182, + "grad_norm": 2.666426181793213, + "learning_rate": 4.3149739364484265e-06, + "loss": 0.5435, + "step": 3139 + }, + { + "epoch": 1.4846335697399526, + "grad_norm": 2.790851354598999, + "learning_rate": 4.314544870576568e-06, + "loss": 0.5746, + "step": 3140 + }, + { + "epoch": 1.4851063829787234, + "grad_norm": 2.620326042175293, + "learning_rate": 4.314115691720956e-06, + "loss": 0.5076, + "step": 3141 + }, + { + "epoch": 1.485579196217494, + "grad_norm": 3.075674533843994, + "learning_rate": 4.313686399908314e-06, + "loss": 0.5486, + "step": 3142 + }, + { + "epoch": 1.4860520094562648, + "grad_norm": 3.1347315311431885, + "learning_rate": 4.3132569951653745e-06, + "loss": 0.531, + "step": 3143 + }, + { + "epoch": 1.4865248226950354, + "grad_norm": 2.5783653259277344, + "learning_rate": 4.312827477518871e-06, + "loss": 0.5818, + "step": 3144 + }, + { + "epoch": 1.4869976359338062, + "grad_norm": 3.0247137546539307, + "learning_rate": 4.3123978469955505e-06, + "loss": 0.5347, + "step": 3145 + }, + { + "epoch": 1.4874704491725768, + "grad_norm": 2.4789345264434814, + "learning_rate": 4.311968103622163e-06, + "loss": 0.5, + "step": 3146 + }, + { + "epoch": 1.4879432624113476, + "grad_norm": 2.663341522216797, + "learning_rate": 4.311538247425466e-06, + "loss": 0.4825, + "step": 3147 + }, + { + "epoch": 1.4884160756501181, + "grad_norm": 2.633711099624634, + "learning_rate": 4.311108278432226e-06, + "loss": 0.5244, + "step": 3148 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 2.51312518119812, + "learning_rate": 4.310678196669216e-06, + "loss": 0.513, + "step": 3149 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 2.5263755321502686, + "learning_rate": 4.310248002163214e-06, + "loss": 0.5236, + "step": 3150 + }, + { + "epoch": 1.4898345153664303, + "grad_norm": 2.559216260910034, + "learning_rate": 4.309817694941007e-06, + "loss": 0.5107, + "step": 3151 + }, + { + "epoch": 1.4903073286052009, + "grad_norm": 2.5023303031921387, + "learning_rate": 4.309387275029386e-06, + "loss": 0.4685, + "step": 3152 + }, + { + "epoch": 1.4907801418439717, + "grad_norm": 3.0314254760742188, + "learning_rate": 4.308956742455155e-06, + "loss": 0.5462, + "step": 3153 + }, + { + "epoch": 1.4912529550827422, + "grad_norm": 2.675295114517212, + "learning_rate": 4.308526097245119e-06, + "loss": 0.5398, + "step": 3154 + }, + { + "epoch": 1.491725768321513, + "grad_norm": 2.6613399982452393, + "learning_rate": 4.308095339426094e-06, + "loss": 0.5376, + "step": 3155 + }, + { + "epoch": 1.4921985815602836, + "grad_norm": 2.58937668800354, + "learning_rate": 4.307664469024899e-06, + "loss": 0.5385, + "step": 3156 + }, + { + "epoch": 1.4926713947990544, + "grad_norm": 2.583631992340088, + "learning_rate": 4.3072334860683655e-06, + "loss": 0.4927, + "step": 3157 + }, + { + "epoch": 1.493144208037825, + "grad_norm": 2.5889222621917725, + "learning_rate": 4.306802390583327e-06, + "loss": 0.47, + "step": 3158 + }, + { + "epoch": 1.4936170212765958, + "grad_norm": 2.9362716674804688, + "learning_rate": 4.3063711825966244e-06, + "loss": 0.4902, + "step": 3159 + }, + { + "epoch": 1.4940898345153664, + "grad_norm": 2.5385425090789795, + "learning_rate": 
4.305939862135111e-06, + "loss": 0.5396, + "step": 3160 + }, + { + "epoch": 1.4945626477541372, + "grad_norm": 2.776326894760132, + "learning_rate": 4.305508429225641e-06, + "loss": 0.5169, + "step": 3161 + }, + { + "epoch": 1.4950354609929077, + "grad_norm": 2.575063467025757, + "learning_rate": 4.305076883895076e-06, + "loss": 0.4938, + "step": 3162 + }, + { + "epoch": 1.4955082742316785, + "grad_norm": 2.7552313804626465, + "learning_rate": 4.304645226170291e-06, + "loss": 0.6211, + "step": 3163 + }, + { + "epoch": 1.4959810874704491, + "grad_norm": 2.57149338722229, + "learning_rate": 4.30421345607816e-06, + "loss": 0.5241, + "step": 3164 + }, + { + "epoch": 1.49645390070922, + "grad_norm": 2.8142426013946533, + "learning_rate": 4.303781573645568e-06, + "loss": 0.5699, + "step": 3165 + }, + { + "epoch": 1.4969267139479905, + "grad_norm": 2.6344845294952393, + "learning_rate": 4.303349578899407e-06, + "loss": 0.5049, + "step": 3166 + }, + { + "epoch": 1.4973995271867613, + "grad_norm": 2.554410934448242, + "learning_rate": 4.302917471866575e-06, + "loss": 0.4404, + "step": 3167 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 2.896240711212158, + "learning_rate": 4.302485252573978e-06, + "loss": 0.602, + "step": 3168 + }, + { + "epoch": 1.4983451536643027, + "grad_norm": 2.4044477939605713, + "learning_rate": 4.302052921048527e-06, + "loss": 0.4857, + "step": 3169 + }, + { + "epoch": 1.4988179669030732, + "grad_norm": 2.7447879314422607, + "learning_rate": 4.301620477317144e-06, + "loss": 0.5438, + "step": 3170 + }, + { + "epoch": 1.499290780141844, + "grad_norm": 2.851820945739746, + "learning_rate": 4.301187921406752e-06, + "loss": 0.5245, + "step": 3171 + }, + { + "epoch": 1.4997635933806146, + "grad_norm": 3.247114419937134, + "learning_rate": 4.300755253344287e-06, + "loss": 0.504, + "step": 3172 + }, + { + "epoch": 1.5002364066193854, + "grad_norm": 3.117490291595459, + "learning_rate": 4.300322473156688e-06, + "loss": 0.4627, + "step": 3173 + }, + { + "epoch": 1.500709219858156, + "grad_norm": 2.558319330215454, + "learning_rate": 4.299889580870904e-06, + "loss": 0.5721, + "step": 3174 + }, + { + "epoch": 1.5011820330969265, + "grad_norm": 2.8983113765716553, + "learning_rate": 4.2994565765138865e-06, + "loss": 0.5257, + "step": 3175 + }, + { + "epoch": 1.5016548463356973, + "grad_norm": 2.744056463241577, + "learning_rate": 4.299023460112599e-06, + "loss": 0.4892, + "step": 3176 + }, + { + "epoch": 1.5021276595744681, + "grad_norm": 2.5506751537323, + "learning_rate": 4.29859023169401e-06, + "loss": 0.4933, + "step": 3177 + }, + { + "epoch": 1.5026004728132387, + "grad_norm": 2.842615842819214, + "learning_rate": 4.298156891285092e-06, + "loss": 0.6124, + "step": 3178 + }, + { + "epoch": 1.5030732860520093, + "grad_norm": 2.5355329513549805, + "learning_rate": 4.2977234389128305e-06, + "loss": 0.641, + "step": 3179 + }, + { + "epoch": 1.50354609929078, + "grad_norm": 2.674781084060669, + "learning_rate": 4.297289874604213e-06, + "loss": 0.475, + "step": 3180 + }, + { + "epoch": 1.5040189125295509, + "grad_norm": 2.6845548152923584, + "learning_rate": 4.296856198386235e-06, + "loss": 0.5328, + "step": 3181 + }, + { + "epoch": 1.5044917257683215, + "grad_norm": 2.9686241149902344, + "learning_rate": 4.296422410285902e-06, + "loss": 0.6216, + "step": 3182 + }, + { + "epoch": 1.504964539007092, + "grad_norm": 2.5095980167388916, + "learning_rate": 4.295988510330222e-06, + "loss": 0.4993, + "step": 3183 + }, + { + "epoch": 1.5054373522458628, + "grad_norm": 2.4906392097473145, 
+ "learning_rate": 4.2955544985462125e-06, + "loss": 0.4795, + "step": 3184 + }, + { + "epoch": 1.5059101654846336, + "grad_norm": 2.5593366622924805, + "learning_rate": 4.295120374960897e-06, + "loss": 0.5527, + "step": 3185 + }, + { + "epoch": 1.5063829787234042, + "grad_norm": 2.691495180130005, + "learning_rate": 4.294686139601308e-06, + "loss": 0.5646, + "step": 3186 + }, + { + "epoch": 1.5068557919621748, + "grad_norm": 2.74320387840271, + "learning_rate": 4.294251792494483e-06, + "loss": 0.6149, + "step": 3187 + }, + { + "epoch": 1.5073286052009456, + "grad_norm": 2.8827052116394043, + "learning_rate": 4.293817333667465e-06, + "loss": 0.5414, + "step": 3188 + }, + { + "epoch": 1.5078014184397164, + "grad_norm": 2.5652425289154053, + "learning_rate": 4.293382763147308e-06, + "loss": 0.5006, + "step": 3189 + }, + { + "epoch": 1.508274231678487, + "grad_norm": 2.729295253753662, + "learning_rate": 4.29294808096107e-06, + "loss": 0.522, + "step": 3190 + }, + { + "epoch": 1.5087470449172575, + "grad_norm": 2.348118305206299, + "learning_rate": 4.292513287135817e-06, + "loss": 0.4125, + "step": 3191 + }, + { + "epoch": 1.5092198581560283, + "grad_norm": 2.809551954269409, + "learning_rate": 4.292078381698621e-06, + "loss": 0.5577, + "step": 3192 + }, + { + "epoch": 1.5096926713947991, + "grad_norm": 2.6925361156463623, + "learning_rate": 4.291643364676563e-06, + "loss": 0.62, + "step": 3193 + }, + { + "epoch": 1.5101654846335697, + "grad_norm": 2.4200620651245117, + "learning_rate": 4.291208236096729e-06, + "loss": 0.5464, + "step": 3194 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 2.5659191608428955, + "learning_rate": 4.290772995986211e-06, + "loss": 0.5402, + "step": 3195 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 2.3877315521240234, + "learning_rate": 4.290337644372113e-06, + "loss": 0.463, + "step": 3196 + }, + { + "epoch": 1.5115839243498819, + "grad_norm": 2.7063233852386475, + "learning_rate": 4.289902181281538e-06, + "loss": 0.5253, + "step": 3197 + }, + { + "epoch": 1.5120567375886524, + "grad_norm": 2.56788969039917, + "learning_rate": 4.289466606741603e-06, + "loss": 0.5012, + "step": 3198 + }, + { + "epoch": 1.512529550827423, + "grad_norm": 2.637164831161499, + "learning_rate": 4.28903092077943e-06, + "loss": 0.5236, + "step": 3199 + }, + { + "epoch": 1.5130023640661938, + "grad_norm": 2.767526865005493, + "learning_rate": 4.288595123422146e-06, + "loss": 0.5832, + "step": 3200 + }, + { + "epoch": 1.5134751773049646, + "grad_norm": 2.33365535736084, + "learning_rate": 4.2881592146968866e-06, + "loss": 0.4548, + "step": 3201 + }, + { + "epoch": 1.5139479905437352, + "grad_norm": 2.544189453125, + "learning_rate": 4.287723194630793e-06, + "loss": 0.5115, + "step": 3202 + }, + { + "epoch": 1.5144208037825058, + "grad_norm": 2.588793992996216, + "learning_rate": 4.2872870632510155e-06, + "loss": 0.4766, + "step": 3203 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 2.5382184982299805, + "learning_rate": 4.286850820584709e-06, + "loss": 0.5401, + "step": 3204 + }, + { + "epoch": 1.5153664302600474, + "grad_norm": 2.597930669784546, + "learning_rate": 4.286414466659038e-06, + "loss": 0.5346, + "step": 3205 + }, + { + "epoch": 1.515839243498818, + "grad_norm": 2.8522393703460693, + "learning_rate": 4.28597800150117e-06, + "loss": 0.486, + "step": 3206 + }, + { + "epoch": 1.5163120567375885, + "grad_norm": 2.4801454544067383, + "learning_rate": 4.285541425138285e-06, + "loss": 0.5162, + "step": 3207 + }, + { + "epoch": 1.5167848699763593, + "grad_norm": 
2.353665351867676, + "learning_rate": 4.285104737597563e-06, + "loss": 0.5066, + "step": 3208 + }, + { + "epoch": 1.51725768321513, + "grad_norm": 2.767976760864258, + "learning_rate": 4.2846679389061975e-06, + "loss": 0.5331, + "step": 3209 + }, + { + "epoch": 1.5177304964539007, + "grad_norm": 2.9307682514190674, + "learning_rate": 4.284231029091385e-06, + "loss": 0.5291, + "step": 3210 + }, + { + "epoch": 1.5182033096926713, + "grad_norm": 2.39719820022583, + "learning_rate": 4.283794008180329e-06, + "loss": 0.4759, + "step": 3211 + }, + { + "epoch": 1.518676122931442, + "grad_norm": 2.452244758605957, + "learning_rate": 4.283356876200242e-06, + "loss": 0.4283, + "step": 3212 + }, + { + "epoch": 1.5191489361702128, + "grad_norm": 2.4911608695983887, + "learning_rate": 4.282919633178343e-06, + "loss": 0.4812, + "step": 3213 + }, + { + "epoch": 1.5196217494089834, + "grad_norm": 2.5813944339752197, + "learning_rate": 4.282482279141856e-06, + "loss": 0.4911, + "step": 3214 + }, + { + "epoch": 1.520094562647754, + "grad_norm": 2.503542184829712, + "learning_rate": 4.282044814118013e-06, + "loss": 0.4969, + "step": 3215 + }, + { + "epoch": 1.5205673758865248, + "grad_norm": 2.5090713500976562, + "learning_rate": 4.281607238134053e-06, + "loss": 0.5293, + "step": 3216 + }, + { + "epoch": 1.5210401891252956, + "grad_norm": 2.425994396209717, + "learning_rate": 4.281169551217223e-06, + "loss": 0.5365, + "step": 3217 + }, + { + "epoch": 1.5215130023640662, + "grad_norm": 2.637655258178711, + "learning_rate": 4.2807317533947765e-06, + "loss": 0.5589, + "step": 3218 + }, + { + "epoch": 1.5219858156028367, + "grad_norm": 2.9335296154022217, + "learning_rate": 4.28029384469397e-06, + "loss": 0.6071, + "step": 3219 + }, + { + "epoch": 1.5224586288416075, + "grad_norm": 2.898683547973633, + "learning_rate": 4.279855825142073e-06, + "loss": 0.5392, + "step": 3220 + }, + { + "epoch": 1.5229314420803783, + "grad_norm": 2.613914966583252, + "learning_rate": 4.279417694766359e-06, + "loss": 0.4968, + "step": 3221 + }, + { + "epoch": 1.523404255319149, + "grad_norm": 2.500682830810547, + "learning_rate": 4.278979453594106e-06, + "loss": 0.471, + "step": 3222 + }, + { + "epoch": 1.5238770685579195, + "grad_norm": 2.5269598960876465, + "learning_rate": 4.278541101652605e-06, + "loss": 0.471, + "step": 3223 + }, + { + "epoch": 1.5243498817966903, + "grad_norm": 2.8153114318847656, + "learning_rate": 4.2781026389691465e-06, + "loss": 0.5742, + "step": 3224 + }, + { + "epoch": 1.524822695035461, + "grad_norm": 2.5648019313812256, + "learning_rate": 4.277664065571034e-06, + "loss": 0.5315, + "step": 3225 + }, + { + "epoch": 1.5252955082742317, + "grad_norm": 2.778355836868286, + "learning_rate": 4.277225381485575e-06, + "loss": 0.5543, + "step": 3226 + }, + { + "epoch": 1.5257683215130022, + "grad_norm": 2.6736745834350586, + "learning_rate": 4.2767865867400846e-06, + "loss": 0.4947, + "step": 3227 + }, + { + "epoch": 1.526241134751773, + "grad_norm": 2.9560294151306152, + "learning_rate": 4.276347681361884e-06, + "loss": 0.5835, + "step": 3228 + }, + { + "epoch": 1.5267139479905438, + "grad_norm": 2.5580296516418457, + "learning_rate": 4.275908665378302e-06, + "loss": 0.4751, + "step": 3229 + }, + { + "epoch": 1.5271867612293144, + "grad_norm": 3.0705175399780273, + "learning_rate": 4.2754695388166755e-06, + "loss": 0.5327, + "step": 3230 + }, + { + "epoch": 1.527659574468085, + "grad_norm": 2.664652109146118, + "learning_rate": 4.275030301704346e-06, + "loss": 0.4934, + "step": 3231 + }, + { + "epoch": 
1.5281323877068558, + "grad_norm": 2.308499813079834, + "learning_rate": 4.274590954068663e-06, + "loss": 0.4412, + "step": 3232 + }, + { + "epoch": 1.5286052009456266, + "grad_norm": 2.871189594268799, + "learning_rate": 4.2741514959369815e-06, + "loss": 0.5001, + "step": 3233 + }, + { + "epoch": 1.5290780141843971, + "grad_norm": 2.5274453163146973, + "learning_rate": 4.273711927336666e-06, + "loss": 0.4938, + "step": 3234 + }, + { + "epoch": 1.5295508274231677, + "grad_norm": 2.8848133087158203, + "learning_rate": 4.273272248295087e-06, + "loss": 0.5397, + "step": 3235 + }, + { + "epoch": 1.5300236406619385, + "grad_norm": 2.3927090167999268, + "learning_rate": 4.27283245883962e-06, + "loss": 0.5497, + "step": 3236 + }, + { + "epoch": 1.5304964539007093, + "grad_norm": 2.5413873195648193, + "learning_rate": 4.27239255899765e-06, + "loss": 0.5108, + "step": 3237 + }, + { + "epoch": 1.53096926713948, + "grad_norm": 2.7692389488220215, + "learning_rate": 4.271952548796567e-06, + "loss": 0.5768, + "step": 3238 + }, + { + "epoch": 1.5314420803782505, + "grad_norm": 2.4621126651763916, + "learning_rate": 4.271512428263768e-06, + "loss": 0.4698, + "step": 3239 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 2.6423375606536865, + "learning_rate": 4.271072197426659e-06, + "loss": 0.4929, + "step": 3240 + }, + { + "epoch": 1.532387706855792, + "grad_norm": 2.7097692489624023, + "learning_rate": 4.270631856312649e-06, + "loss": 0.4836, + "step": 3241 + }, + { + "epoch": 1.5328605200945626, + "grad_norm": 2.545706272125244, + "learning_rate": 4.270191404949158e-06, + "loss": 0.4636, + "step": 3242 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 3.138781785964966, + "learning_rate": 4.26975084336361e-06, + "loss": 0.5988, + "step": 3243 + }, + { + "epoch": 1.533806146572104, + "grad_norm": 2.492715835571289, + "learning_rate": 4.269310171583438e-06, + "loss": 0.5095, + "step": 3244 + }, + { + "epoch": 1.5342789598108748, + "grad_norm": 2.5705838203430176, + "learning_rate": 4.268869389636077e-06, + "loss": 0.4818, + "step": 3245 + }, + { + "epoch": 1.5347517730496454, + "grad_norm": 2.7633554935455322, + "learning_rate": 4.268428497548979e-06, + "loss": 0.547, + "step": 3246 + }, + { + "epoch": 1.535224586288416, + "grad_norm": 2.654528856277466, + "learning_rate": 4.2679874953495905e-06, + "loss": 0.5261, + "step": 3247 + }, + { + "epoch": 1.5356973995271868, + "grad_norm": 2.5039751529693604, + "learning_rate": 4.2675463830653744e-06, + "loss": 0.4941, + "step": 3248 + }, + { + "epoch": 1.5361702127659576, + "grad_norm": 2.897268295288086, + "learning_rate": 4.267105160723794e-06, + "loss": 0.5404, + "step": 3249 + }, + { + "epoch": 1.5366430260047281, + "grad_norm": 2.500732421875, + "learning_rate": 4.266663828352324e-06, + "loss": 0.5375, + "step": 3250 + }, + { + "epoch": 1.5371158392434987, + "grad_norm": 2.6310064792633057, + "learning_rate": 4.266222385978444e-06, + "loss": 0.5217, + "step": 3251 + }, + { + "epoch": 1.5375886524822695, + "grad_norm": 2.7440476417541504, + "learning_rate": 4.265780833629642e-06, + "loss": 0.5419, + "step": 3252 + }, + { + "epoch": 1.5380614657210403, + "grad_norm": 2.7037577629089355, + "learning_rate": 4.2653391713334095e-06, + "loss": 0.5634, + "step": 3253 + }, + { + "epoch": 1.5385342789598109, + "grad_norm": 2.548525810241699, + "learning_rate": 4.264897399117248e-06, + "loss": 0.535, + "step": 3254 + }, + { + "epoch": 1.5390070921985815, + "grad_norm": 2.6127355098724365, + "learning_rate": 4.264455517008663e-06, + "loss": 0.4619, + 
"step": 3255 + }, + { + "epoch": 1.5394799054373522, + "grad_norm": 2.5597004890441895, + "learning_rate": 4.264013525035171e-06, + "loss": 0.4477, + "step": 3256 + }, + { + "epoch": 1.539952718676123, + "grad_norm": 2.642432689666748, + "learning_rate": 4.263571423224292e-06, + "loss": 0.4749, + "step": 3257 + }, + { + "epoch": 1.5404255319148936, + "grad_norm": 2.5121877193450928, + "learning_rate": 4.2631292116035526e-06, + "loss": 0.4693, + "step": 3258 + }, + { + "epoch": 1.5408983451536642, + "grad_norm": 2.390292167663574, + "learning_rate": 4.262686890200489e-06, + "loss": 0.4872, + "step": 3259 + }, + { + "epoch": 1.541371158392435, + "grad_norm": 2.5898337364196777, + "learning_rate": 4.2622444590426405e-06, + "loss": 0.5193, + "step": 3260 + }, + { + "epoch": 1.5418439716312058, + "grad_norm": 2.508821487426758, + "learning_rate": 4.261801918157558e-06, + "loss": 0.511, + "step": 3261 + }, + { + "epoch": 1.5423167848699764, + "grad_norm": 2.6992101669311523, + "learning_rate": 4.261359267572795e-06, + "loss": 0.5069, + "step": 3262 + }, + { + "epoch": 1.542789598108747, + "grad_norm": 2.6011030673980713, + "learning_rate": 4.2609165073159145e-06, + "loss": 0.5887, + "step": 3263 + }, + { + "epoch": 1.5432624113475177, + "grad_norm": 2.887053966522217, + "learning_rate": 4.260473637414483e-06, + "loss": 0.5556, + "step": 3264 + }, + { + "epoch": 1.5437352245862885, + "grad_norm": 2.6433887481689453, + "learning_rate": 4.260030657896079e-06, + "loss": 0.4728, + "step": 3265 + }, + { + "epoch": 1.544208037825059, + "grad_norm": 2.6134607791900635, + "learning_rate": 4.259587568788282e-06, + "loss": 0.483, + "step": 3266 + }, + { + "epoch": 1.5446808510638297, + "grad_norm": 2.5308640003204346, + "learning_rate": 4.259144370118684e-06, + "loss": 0.5115, + "step": 3267 + }, + { + "epoch": 1.5451536643026005, + "grad_norm": 2.8256733417510986, + "learning_rate": 4.258701061914879e-06, + "loss": 0.5414, + "step": 3268 + }, + { + "epoch": 1.5456264775413713, + "grad_norm": 2.8648319244384766, + "learning_rate": 4.258257644204471e-06, + "loss": 0.5695, + "step": 3269 + }, + { + "epoch": 1.5460992907801419, + "grad_norm": 2.8568081855773926, + "learning_rate": 4.257814117015069e-06, + "loss": 0.5264, + "step": 3270 + }, + { + "epoch": 1.5465721040189124, + "grad_norm": 2.6065011024475098, + "learning_rate": 4.257370480374289e-06, + "loss": 0.5646, + "step": 3271 + }, + { + "epoch": 1.5470449172576832, + "grad_norm": 2.7840216159820557, + "learning_rate": 4.256926734309756e-06, + "loss": 0.5191, + "step": 3272 + }, + { + "epoch": 1.547517730496454, + "grad_norm": 2.85906982421875, + "learning_rate": 4.256482878849099e-06, + "loss": 0.5911, + "step": 3273 + }, + { + "epoch": 1.5479905437352246, + "grad_norm": 2.916029930114746, + "learning_rate": 4.256038914019954e-06, + "loss": 0.5589, + "step": 3274 + }, + { + "epoch": 1.5484633569739952, + "grad_norm": 2.6748716831207275, + "learning_rate": 4.255594839849967e-06, + "loss": 0.5323, + "step": 3275 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 2.717212200164795, + "learning_rate": 4.255150656366787e-06, + "loss": 0.453, + "step": 3276 + }, + { + "epoch": 1.5494089834515368, + "grad_norm": 2.4974849224090576, + "learning_rate": 4.254706363598072e-06, + "loss": 0.4516, + "step": 3277 + }, + { + "epoch": 1.5498817966903073, + "grad_norm": 2.648151397705078, + "learning_rate": 4.254261961571485e-06, + "loss": 0.5452, + "step": 3278 + }, + { + "epoch": 1.550354609929078, + "grad_norm": 2.932905435562134, + "learning_rate": 
4.253817450314699e-06, + "loss": 0.4813, + "step": 3279 + }, + { + "epoch": 1.5508274231678487, + "grad_norm": 2.862912178039551, + "learning_rate": 4.25337282985539e-06, + "loss": 0.5689, + "step": 3280 + }, + { + "epoch": 1.5513002364066195, + "grad_norm": 2.532156467437744, + "learning_rate": 4.2529281002212436e-06, + "loss": 0.485, + "step": 3281 + }, + { + "epoch": 1.55177304964539, + "grad_norm": 2.583299160003662, + "learning_rate": 4.25248326143995e-06, + "loss": 0.4661, + "step": 3282 + }, + { + "epoch": 1.5522458628841607, + "grad_norm": 2.5790653228759766, + "learning_rate": 4.252038313539209e-06, + "loss": 0.5455, + "step": 3283 + }, + { + "epoch": 1.5527186761229315, + "grad_norm": 2.872864007949829, + "learning_rate": 4.251593256546724e-06, + "loss": 0.5317, + "step": 3284 + }, + { + "epoch": 1.5531914893617023, + "grad_norm": 3.0382463932037354, + "learning_rate": 4.251148090490208e-06, + "loss": 0.5131, + "step": 3285 + }, + { + "epoch": 1.5536643026004728, + "grad_norm": 2.574399709701538, + "learning_rate": 4.250702815397379e-06, + "loss": 0.5399, + "step": 3286 + }, + { + "epoch": 1.5541371158392434, + "grad_norm": 2.9784770011901855, + "learning_rate": 4.250257431295962e-06, + "loss": 0.5209, + "step": 3287 + }, + { + "epoch": 1.5546099290780142, + "grad_norm": 2.6482062339782715, + "learning_rate": 4.249811938213689e-06, + "loss": 0.5416, + "step": 3288 + }, + { + "epoch": 1.555082742316785, + "grad_norm": 2.82142972946167, + "learning_rate": 4.2493663361783e-06, + "loss": 0.594, + "step": 3289 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 2.815595865249634, + "learning_rate": 4.24892062521754e-06, + "loss": 0.5381, + "step": 3290 + }, + { + "epoch": 1.5560283687943262, + "grad_norm": 2.689764976501465, + "learning_rate": 4.248474805359161e-06, + "loss": 0.5141, + "step": 3291 + }, + { + "epoch": 1.556501182033097, + "grad_norm": 2.7718515396118164, + "learning_rate": 4.248028876630922e-06, + "loss": 0.5324, + "step": 3292 + }, + { + "epoch": 1.5569739952718678, + "grad_norm": 3.0196774005889893, + "learning_rate": 4.247582839060591e-06, + "loss": 0.4971, + "step": 3293 + }, + { + "epoch": 1.5574468085106383, + "grad_norm": 2.608475923538208, + "learning_rate": 4.247136692675939e-06, + "loss": 0.5795, + "step": 3294 + }, + { + "epoch": 1.557919621749409, + "grad_norm": 2.4912326335906982, + "learning_rate": 4.246690437504746e-06, + "loss": 0.5348, + "step": 3295 + }, + { + "epoch": 1.5583924349881797, + "grad_norm": 2.519303560256958, + "learning_rate": 4.246244073574799e-06, + "loss": 0.4953, + "step": 3296 + }, + { + "epoch": 1.5588652482269505, + "grad_norm": 2.5667171478271484, + "learning_rate": 4.24579760091389e-06, + "loss": 0.5353, + "step": 3297 + }, + { + "epoch": 1.559338061465721, + "grad_norm": 2.8835761547088623, + "learning_rate": 4.24535101954982e-06, + "loss": 0.578, + "step": 3298 + }, + { + "epoch": 1.5598108747044916, + "grad_norm": 3.0506930351257324, + "learning_rate": 4.244904329510395e-06, + "loss": 0.6418, + "step": 3299 + }, + { + "epoch": 1.5602836879432624, + "grad_norm": 2.579446315765381, + "learning_rate": 4.244457530823428e-06, + "loss": 0.5027, + "step": 3300 + }, + { + "epoch": 1.5607565011820332, + "grad_norm": 2.72012996673584, + "learning_rate": 4.24401062351674e-06, + "loss": 0.5438, + "step": 3301 + }, + { + "epoch": 1.5612293144208038, + "grad_norm": 2.527007818222046, + "learning_rate": 4.243563607618158e-06, + "loss": 0.5303, + "step": 3302 + }, + { + "epoch": 1.5617021276595744, + "grad_norm": 2.4415159225463867, + 
"learning_rate": 4.243116483155516e-06, + "loss": 0.4893, + "step": 3303 + }, + { + "epoch": 1.5621749408983452, + "grad_norm": 2.462256669998169, + "learning_rate": 4.242669250156653e-06, + "loss": 0.5671, + "step": 3304 + }, + { + "epoch": 1.562647754137116, + "grad_norm": 2.479865074157715, + "learning_rate": 4.242221908649418e-06, + "loss": 0.5038, + "step": 3305 + }, + { + "epoch": 1.5631205673758866, + "grad_norm": 2.74670672416687, + "learning_rate": 4.241774458661662e-06, + "loss": 0.5689, + "step": 3306 + }, + { + "epoch": 1.5635933806146571, + "grad_norm": 2.55938982963562, + "learning_rate": 4.24132690022125e-06, + "loss": 0.492, + "step": 3307 + }, + { + "epoch": 1.564066193853428, + "grad_norm": 2.634956121444702, + "learning_rate": 4.240879233356048e-06, + "loss": 0.503, + "step": 3308 + }, + { + "epoch": 1.5645390070921987, + "grad_norm": 2.381775140762329, + "learning_rate": 4.240431458093928e-06, + "loss": 0.4939, + "step": 3309 + }, + { + "epoch": 1.5650118203309693, + "grad_norm": 2.8176610469818115, + "learning_rate": 4.239983574462774e-06, + "loss": 0.5609, + "step": 3310 + }, + { + "epoch": 1.5654846335697399, + "grad_norm": 3.0268442630767822, + "learning_rate": 4.239535582490471e-06, + "loss": 0.5427, + "step": 3311 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 2.5881481170654297, + "learning_rate": 4.239087482204916e-06, + "loss": 0.5538, + "step": 3312 + }, + { + "epoch": 1.5664302600472815, + "grad_norm": 2.5317704677581787, + "learning_rate": 4.238639273634008e-06, + "loss": 0.4915, + "step": 3313 + }, + { + "epoch": 1.566903073286052, + "grad_norm": 2.9608731269836426, + "learning_rate": 4.238190956805658e-06, + "loss": 0.564, + "step": 3314 + }, + { + "epoch": 1.5673758865248226, + "grad_norm": 3.022686243057251, + "learning_rate": 4.237742531747777e-06, + "loss": 0.5503, + "step": 3315 + }, + { + "epoch": 1.5678486997635934, + "grad_norm": 2.763622283935547, + "learning_rate": 4.23729399848829e-06, + "loss": 0.5241, + "step": 3316 + }, + { + "epoch": 1.5683215130023642, + "grad_norm": 2.6112794876098633, + "learning_rate": 4.236845357055122e-06, + "loss": 0.4919, + "step": 3317 + }, + { + "epoch": 1.5687943262411348, + "grad_norm": 2.649829149246216, + "learning_rate": 4.23639660747621e-06, + "loss": 0.5472, + "step": 3318 + }, + { + "epoch": 1.5692671394799054, + "grad_norm": 2.8888115882873535, + "learning_rate": 4.2359477497794955e-06, + "loss": 0.5077, + "step": 3319 + }, + { + "epoch": 1.5697399527186762, + "grad_norm": 2.5666911602020264, + "learning_rate": 4.235498783992927e-06, + "loss": 0.5365, + "step": 3320 + }, + { + "epoch": 1.570212765957447, + "grad_norm": 2.448758363723755, + "learning_rate": 4.2350497101444575e-06, + "loss": 0.5043, + "step": 3321 + }, + { + "epoch": 1.5706855791962175, + "grad_norm": 2.595207691192627, + "learning_rate": 4.234600528262052e-06, + "loss": 0.5303, + "step": 3322 + }, + { + "epoch": 1.5711583924349881, + "grad_norm": 2.7814228534698486, + "learning_rate": 4.234151238373676e-06, + "loss": 0.4521, + "step": 3323 + }, + { + "epoch": 1.571631205673759, + "grad_norm": 2.781538724899292, + "learning_rate": 4.233701840507308e-06, + "loss": 0.5193, + "step": 3324 + }, + { + "epoch": 1.5721040189125297, + "grad_norm": 2.771907329559326, + "learning_rate": 4.233252334690928e-06, + "loss": 0.497, + "step": 3325 + }, + { + "epoch": 1.5725768321513003, + "grad_norm": 2.5557498931884766, + "learning_rate": 4.232802720952525e-06, + "loss": 0.4913, + "step": 3326 + }, + { + "epoch": 1.5730496453900709, + "grad_norm": 
2.478267192840576, + "learning_rate": 4.232352999320094e-06, + "loss": 0.4967, + "step": 3327 + }, + { + "epoch": 1.5735224586288417, + "grad_norm": 3.1548502445220947, + "learning_rate": 4.231903169821639e-06, + "loss": 0.5009, + "step": 3328 + }, + { + "epoch": 1.5739952718676125, + "grad_norm": 2.634824275970459, + "learning_rate": 4.231453232485168e-06, + "loss": 0.5223, + "step": 3329 + }, + { + "epoch": 1.574468085106383, + "grad_norm": 2.579102039337158, + "learning_rate": 4.231003187338695e-06, + "loss": 0.5513, + "step": 3330 + }, + { + "epoch": 1.5749408983451536, + "grad_norm": 2.8477070331573486, + "learning_rate": 4.230553034410245e-06, + "loss": 0.561, + "step": 3331 + }, + { + "epoch": 1.5754137115839244, + "grad_norm": 2.6714725494384766, + "learning_rate": 4.2301027737278446e-06, + "loss": 0.4687, + "step": 3332 + }, + { + "epoch": 1.5758865248226952, + "grad_norm": 2.6562764644622803, + "learning_rate": 4.229652405319532e-06, + "loss": 0.5925, + "step": 3333 + }, + { + "epoch": 1.5763593380614658, + "grad_norm": 2.750946283340454, + "learning_rate": 4.229201929213348e-06, + "loss": 0.4748, + "step": 3334 + }, + { + "epoch": 1.5768321513002364, + "grad_norm": 2.760470151901245, + "learning_rate": 4.228751345437342e-06, + "loss": 0.5989, + "step": 3335 + }, + { + "epoch": 1.5773049645390071, + "grad_norm": 3.1451845169067383, + "learning_rate": 4.2283006540195706e-06, + "loss": 0.562, + "step": 3336 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 2.563011407852173, + "learning_rate": 4.227849854988095e-06, + "loss": 0.5473, + "step": 3337 + }, + { + "epoch": 1.5782505910165483, + "grad_norm": 2.310469388961792, + "learning_rate": 4.2273989483709856e-06, + "loss": 0.5033, + "step": 3338 + }, + { + "epoch": 1.578723404255319, + "grad_norm": 2.677978754043579, + "learning_rate": 4.226947934196318e-06, + "loss": 0.5291, + "step": 3339 + }, + { + "epoch": 1.57919621749409, + "grad_norm": 3.0423545837402344, + "learning_rate": 4.226496812492176e-06, + "loss": 0.5201, + "step": 3340 + }, + { + "epoch": 1.5796690307328605, + "grad_norm": 2.357513904571533, + "learning_rate": 4.226045583286647e-06, + "loss": 0.4421, + "step": 3341 + }, + { + "epoch": 1.580141843971631, + "grad_norm": 2.719860315322876, + "learning_rate": 4.225594246607828e-06, + "loss": 0.4855, + "step": 3342 + }, + { + "epoch": 1.5806146572104018, + "grad_norm": 3.2645058631896973, + "learning_rate": 4.2251428024838215e-06, + "loss": 0.6654, + "step": 3343 + }, + { + "epoch": 1.5810874704491726, + "grad_norm": 2.2997004985809326, + "learning_rate": 4.224691250942737e-06, + "loss": 0.4565, + "step": 3344 + }, + { + "epoch": 1.5815602836879432, + "grad_norm": 2.8103034496307373, + "learning_rate": 4.2242395920126926e-06, + "loss": 0.5543, + "step": 3345 + }, + { + "epoch": 1.5820330969267138, + "grad_norm": 2.720254898071289, + "learning_rate": 4.223787825721808e-06, + "loss": 0.5028, + "step": 3346 + }, + { + "epoch": 1.5825059101654846, + "grad_norm": 2.735544204711914, + "learning_rate": 4.223335952098214e-06, + "loss": 0.5169, + "step": 3347 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 2.784254550933838, + "learning_rate": 4.222883971170047e-06, + "loss": 0.4989, + "step": 3348 + }, + { + "epoch": 1.583451536643026, + "grad_norm": 2.7192094326019287, + "learning_rate": 4.22243188296545e-06, + "loss": 0.502, + "step": 3349 + }, + { + "epoch": 1.5839243498817965, + "grad_norm": 2.716501474380493, + "learning_rate": 4.221979687512573e-06, + "loss": 0.5687, + "step": 3350 + }, + { + "epoch": 
1.5843971631205673, + "grad_norm": 2.8420114517211914, + "learning_rate": 4.22152738483957e-06, + "loss": 0.5903, + "step": 3351 + }, + { + "epoch": 1.5848699763593381, + "grad_norm": 2.734872579574585, + "learning_rate": 4.2210749749746065e-06, + "loss": 0.5397, + "step": 3352 + }, + { + "epoch": 1.5853427895981087, + "grad_norm": 2.4343836307525635, + "learning_rate": 4.220622457945851e-06, + "loss": 0.436, + "step": 3353 + }, + { + "epoch": 1.5858156028368793, + "grad_norm": 2.728177547454834, + "learning_rate": 4.2201698337814785e-06, + "loss": 0.5703, + "step": 3354 + }, + { + "epoch": 1.58628841607565, + "grad_norm": 2.502098560333252, + "learning_rate": 4.219717102509674e-06, + "loss": 0.5275, + "step": 3355 + }, + { + "epoch": 1.5867612293144209, + "grad_norm": 2.6595494747161865, + "learning_rate": 4.219264264158627e-06, + "loss": 0.4659, + "step": 3356 + }, + { + "epoch": 1.5872340425531914, + "grad_norm": 2.5307185649871826, + "learning_rate": 4.218811318756532e-06, + "loss": 0.5048, + "step": 3357 + }, + { + "epoch": 1.587706855791962, + "grad_norm": 2.9300129413604736, + "learning_rate": 4.218358266331593e-06, + "loss": 0.5137, + "step": 3358 + }, + { + "epoch": 1.5881796690307328, + "grad_norm": 2.686586618423462, + "learning_rate": 4.21790510691202e-06, + "loss": 0.4529, + "step": 3359 + }, + { + "epoch": 1.5886524822695036, + "grad_norm": 2.9981517791748047, + "learning_rate": 4.217451840526029e-06, + "loss": 0.6054, + "step": 3360 + }, + { + "epoch": 1.5891252955082742, + "grad_norm": 2.6943674087524414, + "learning_rate": 4.216998467201841e-06, + "loss": 0.5153, + "step": 3361 + }, + { + "epoch": 1.5895981087470448, + "grad_norm": 2.707084894180298, + "learning_rate": 4.216544986967689e-06, + "loss": 0.5235, + "step": 3362 + }, + { + "epoch": 1.5900709219858156, + "grad_norm": 2.6553728580474854, + "learning_rate": 4.216091399851808e-06, + "loss": 0.5275, + "step": 3363 + }, + { + "epoch": 1.5905437352245864, + "grad_norm": 2.9136953353881836, + "learning_rate": 4.215637705882439e-06, + "loss": 0.5834, + "step": 3364 + }, + { + "epoch": 1.591016548463357, + "grad_norm": 2.7647159099578857, + "learning_rate": 4.2151839050878325e-06, + "loss": 0.5641, + "step": 3365 + }, + { + "epoch": 1.5914893617021275, + "grad_norm": 2.4556827545166016, + "learning_rate": 4.214729997496246e-06, + "loss": 0.5636, + "step": 3366 + }, + { + "epoch": 1.5919621749408983, + "grad_norm": 2.6111652851104736, + "learning_rate": 4.2142759831359414e-06, + "loss": 0.5097, + "step": 3367 + }, + { + "epoch": 1.592434988179669, + "grad_norm": 2.4886903762817383, + "learning_rate": 4.213821862035189e-06, + "loss": 0.531, + "step": 3368 + }, + { + "epoch": 1.5929078014184397, + "grad_norm": 2.5245840549468994, + "learning_rate": 4.213367634222263e-06, + "loss": 0.5085, + "step": 3369 + }, + { + "epoch": 1.5933806146572103, + "grad_norm": 2.970214605331421, + "learning_rate": 4.212913299725447e-06, + "loss": 0.5851, + "step": 3370 + }, + { + "epoch": 1.593853427895981, + "grad_norm": 2.5433361530303955, + "learning_rate": 4.212458858573032e-06, + "loss": 0.48, + "step": 3371 + }, + { + "epoch": 1.5943262411347519, + "grad_norm": 2.3550102710723877, + "learning_rate": 4.212004310793312e-06, + "loss": 0.4405, + "step": 3372 + }, + { + "epoch": 1.5947990543735224, + "grad_norm": 2.4824719429016113, + "learning_rate": 4.2115496564145896e-06, + "loss": 0.4634, + "step": 3373 + }, + { + "epoch": 1.595271867612293, + "grad_norm": 2.4751930236816406, + "learning_rate": 4.211094895465176e-06, + "loss": 0.5662, + 
"step": 3374 + }, + { + "epoch": 1.5957446808510638, + "grad_norm": 2.4193356037139893, + "learning_rate": 4.210640027973386e-06, + "loss": 0.4441, + "step": 3375 + }, + { + "epoch": 1.5962174940898346, + "grad_norm": 2.4477498531341553, + "learning_rate": 4.210185053967543e-06, + "loss": 0.5205, + "step": 3376 + }, + { + "epoch": 1.5966903073286052, + "grad_norm": 2.7954161167144775, + "learning_rate": 4.209729973475976e-06, + "loss": 0.4951, + "step": 3377 + }, + { + "epoch": 1.5971631205673757, + "grad_norm": 3.1907570362091064, + "learning_rate": 4.209274786527019e-06, + "loss": 0.6024, + "step": 3378 + }, + { + "epoch": 1.5976359338061465, + "grad_norm": 2.485245704650879, + "learning_rate": 4.2088194931490165e-06, + "loss": 0.5652, + "step": 3379 + }, + { + "epoch": 1.5981087470449173, + "grad_norm": 2.589310884475708, + "learning_rate": 4.208364093370317e-06, + "loss": 0.5085, + "step": 3380 + }, + { + "epoch": 1.598581560283688, + "grad_norm": 2.8941214084625244, + "learning_rate": 4.207908587219276e-06, + "loss": 0.53, + "step": 3381 + }, + { + "epoch": 1.5990543735224585, + "grad_norm": 2.480509042739868, + "learning_rate": 4.207452974724258e-06, + "loss": 0.4543, + "step": 3382 + }, + { + "epoch": 1.5995271867612293, + "grad_norm": 2.7884905338287354, + "learning_rate": 4.206997255913629e-06, + "loss": 0.5483, + "step": 3383 + }, + { + "epoch": 1.6, + "grad_norm": 2.7976696491241455, + "learning_rate": 4.206541430815766e-06, + "loss": 0.4734, + "step": 3384 + }, + { + "epoch": 1.6004728132387707, + "grad_norm": 2.5463132858276367, + "learning_rate": 4.206085499459051e-06, + "loss": 0.4931, + "step": 3385 + }, + { + "epoch": 1.6009456264775412, + "grad_norm": 2.8384251594543457, + "learning_rate": 4.205629461871871e-06, + "loss": 0.5066, + "step": 3386 + }, + { + "epoch": 1.601418439716312, + "grad_norm": 2.8578574657440186, + "learning_rate": 4.205173318082626e-06, + "loss": 0.458, + "step": 3387 + }, + { + "epoch": 1.6018912529550828, + "grad_norm": 2.7779932022094727, + "learning_rate": 4.204717068119715e-06, + "loss": 0.5293, + "step": 3388 + }, + { + "epoch": 1.6023640661938534, + "grad_norm": 2.9123778343200684, + "learning_rate": 4.204260712011546e-06, + "loss": 0.4866, + "step": 3389 + }, + { + "epoch": 1.602836879432624, + "grad_norm": 2.757922887802124, + "learning_rate": 4.203804249786537e-06, + "loss": 0.4925, + "step": 3390 + }, + { + "epoch": 1.6033096926713948, + "grad_norm": 3.287733316421509, + "learning_rate": 4.203347681473107e-06, + "loss": 0.6694, + "step": 3391 + }, + { + "epoch": 1.6037825059101656, + "grad_norm": 3.2117912769317627, + "learning_rate": 4.202891007099687e-06, + "loss": 0.5269, + "step": 3392 + }, + { + "epoch": 1.6042553191489362, + "grad_norm": 2.8489456176757812, + "learning_rate": 4.20243422669471e-06, + "loss": 0.5073, + "step": 3393 + }, + { + "epoch": 1.6047281323877067, + "grad_norm": 2.7660224437713623, + "learning_rate": 4.201977340286619e-06, + "loss": 0.5014, + "step": 3394 + }, + { + "epoch": 1.6052009456264775, + "grad_norm": 2.68182110786438, + "learning_rate": 4.201520347903862e-06, + "loss": 0.4542, + "step": 3395 + }, + { + "epoch": 1.6056737588652483, + "grad_norm": 2.7546045780181885, + "learning_rate": 4.2010632495748934e-06, + "loss": 0.516, + "step": 3396 + }, + { + "epoch": 1.606146572104019, + "grad_norm": 2.744668483734131, + "learning_rate": 4.200606045328176e-06, + "loss": 0.5243, + "step": 3397 + }, + { + "epoch": 1.6066193853427895, + "grad_norm": 2.935343027114868, + "learning_rate": 4.200148735192177e-06, + 
"loss": 0.5624, + "step": 3398 + }, + { + "epoch": 1.6070921985815603, + "grad_norm": 2.7392852306365967, + "learning_rate": 4.19969131919537e-06, + "loss": 0.5796, + "step": 3399 + }, + { + "epoch": 1.607565011820331, + "grad_norm": 2.864750385284424, + "learning_rate": 4.199233797366239e-06, + "loss": 0.549, + "step": 3400 + }, + { + "epoch": 1.6080378250591016, + "grad_norm": 2.684157371520996, + "learning_rate": 4.198776169733269e-06, + "loss": 0.5532, + "step": 3401 + }, + { + "epoch": 1.6085106382978722, + "grad_norm": 2.4717135429382324, + "learning_rate": 4.198318436324957e-06, + "loss": 0.5174, + "step": 3402 + }, + { + "epoch": 1.608983451536643, + "grad_norm": 2.640242338180542, + "learning_rate": 4.197860597169802e-06, + "loss": 0.5117, + "step": 3403 + }, + { + "epoch": 1.6094562647754138, + "grad_norm": 2.4957473278045654, + "learning_rate": 4.197402652296313e-06, + "loss": 0.474, + "step": 3404 + }, + { + "epoch": 1.6099290780141844, + "grad_norm": 2.416138172149658, + "learning_rate": 4.196944601733004e-06, + "loss": 0.4858, + "step": 3405 + }, + { + "epoch": 1.610401891252955, + "grad_norm": 2.4498109817504883, + "learning_rate": 4.196486445508395e-06, + "loss": 0.5048, + "step": 3406 + }, + { + "epoch": 1.6108747044917258, + "grad_norm": 2.415895938873291, + "learning_rate": 4.196028183651014e-06, + "loss": 0.4745, + "step": 3407 + }, + { + "epoch": 1.6113475177304966, + "grad_norm": 2.843665838241577, + "learning_rate": 4.195569816189395e-06, + "loss": 0.5219, + "step": 3408 + }, + { + "epoch": 1.6118203309692671, + "grad_norm": 2.608579158782959, + "learning_rate": 4.195111343152079e-06, + "loss": 0.4941, + "step": 3409 + }, + { + "epoch": 1.6122931442080377, + "grad_norm": 2.643789529800415, + "learning_rate": 4.194652764567611e-06, + "loss": 0.515, + "step": 3410 + }, + { + "epoch": 1.6127659574468085, + "grad_norm": 2.8099429607391357, + "learning_rate": 4.194194080464547e-06, + "loss": 0.4935, + "step": 3411 + }, + { + "epoch": 1.6132387706855793, + "grad_norm": 2.595628261566162, + "learning_rate": 4.193735290871446e-06, + "loss": 0.5571, + "step": 3412 + }, + { + "epoch": 1.6137115839243499, + "grad_norm": 2.7903778553009033, + "learning_rate": 4.193276395816876e-06, + "loss": 0.5228, + "step": 3413 + }, + { + "epoch": 1.6141843971631205, + "grad_norm": 2.83910870552063, + "learning_rate": 4.192817395329409e-06, + "loss": 0.6124, + "step": 3414 + }, + { + "epoch": 1.6146572104018913, + "grad_norm": 2.6155734062194824, + "learning_rate": 4.192358289437626e-06, + "loss": 0.552, + "step": 3415 + }, + { + "epoch": 1.615130023640662, + "grad_norm": 2.795832872390747, + "learning_rate": 4.191899078170113e-06, + "loss": 0.5561, + "step": 3416 + }, + { + "epoch": 1.6156028368794326, + "grad_norm": 2.3402161598205566, + "learning_rate": 4.191439761555464e-06, + "loss": 0.4889, + "step": 3417 + }, + { + "epoch": 1.6160756501182032, + "grad_norm": 3.1183433532714844, + "learning_rate": 4.190980339622276e-06, + "loss": 0.5337, + "step": 3418 + }, + { + "epoch": 1.616548463356974, + "grad_norm": 2.6262872219085693, + "learning_rate": 4.190520812399158e-06, + "loss": 0.525, + "step": 3419 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 2.578340530395508, + "learning_rate": 4.190061179914722e-06, + "loss": 0.4975, + "step": 3420 + }, + { + "epoch": 1.6174940898345154, + "grad_norm": 3.19482159614563, + "learning_rate": 4.189601442197586e-06, + "loss": 0.5832, + "step": 3421 + }, + { + "epoch": 1.617966903073286, + "grad_norm": 2.6398792266845703, + "learning_rate": 
4.189141599276378e-06, + "loss": 0.4676, + "step": 3422 + }, + { + "epoch": 1.6184397163120567, + "grad_norm": 2.624865770339966, + "learning_rate": 4.1886816511797275e-06, + "loss": 0.4507, + "step": 3423 + }, + { + "epoch": 1.6189125295508275, + "grad_norm": 2.4136857986450195, + "learning_rate": 4.1882215979362775e-06, + "loss": 0.4616, + "step": 3424 + }, + { + "epoch": 1.6193853427895981, + "grad_norm": 2.6906614303588867, + "learning_rate": 4.18776143957467e-06, + "loss": 0.5142, + "step": 3425 + }, + { + "epoch": 1.6198581560283687, + "grad_norm": 2.5149154663085938, + "learning_rate": 4.187301176123558e-06, + "loss": 0.5252, + "step": 3426 + }, + { + "epoch": 1.6203309692671395, + "grad_norm": 2.677405834197998, + "learning_rate": 4.186840807611602e-06, + "loss": 0.4635, + "step": 3427 + }, + { + "epoch": 1.6208037825059103, + "grad_norm": 2.7164649963378906, + "learning_rate": 4.186380334067464e-06, + "loss": 0.5634, + "step": 3428 + }, + { + "epoch": 1.6212765957446809, + "grad_norm": 2.8299832344055176, + "learning_rate": 4.185919755519817e-06, + "loss": 0.5166, + "step": 3429 + }, + { + "epoch": 1.6217494089834514, + "grad_norm": 2.465848207473755, + "learning_rate": 4.18545907199734e-06, + "loss": 0.4696, + "step": 3430 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 2.407616376876831, + "learning_rate": 4.1849982835287175e-06, + "loss": 0.5111, + "step": 3431 + }, + { + "epoch": 1.622695035460993, + "grad_norm": 2.452146291732788, + "learning_rate": 4.184537390142639e-06, + "loss": 0.4574, + "step": 3432 + }, + { + "epoch": 1.6231678486997636, + "grad_norm": 2.653071165084839, + "learning_rate": 4.1840763918678055e-06, + "loss": 0.5611, + "step": 3433 + }, + { + "epoch": 1.6236406619385342, + "grad_norm": 2.5920350551605225, + "learning_rate": 4.183615288732919e-06, + "loss": 0.5437, + "step": 3434 + }, + { + "epoch": 1.624113475177305, + "grad_norm": 2.782900810241699, + "learning_rate": 4.18315408076669e-06, + "loss": 0.5824, + "step": 3435 + }, + { + "epoch": 1.6245862884160758, + "grad_norm": 2.8769774436950684, + "learning_rate": 4.1826927679978365e-06, + "loss": 0.5271, + "step": 3436 + }, + { + "epoch": 1.6250591016548463, + "grad_norm": 2.488598585128784, + "learning_rate": 4.182231350455084e-06, + "loss": 0.4684, + "step": 3437 + }, + { + "epoch": 1.625531914893617, + "grad_norm": 2.6472036838531494, + "learning_rate": 4.181769828167161e-06, + "loss": 0.5372, + "step": 3438 + }, + { + "epoch": 1.6260047281323877, + "grad_norm": 2.6498794555664062, + "learning_rate": 4.1813082011628045e-06, + "loss": 0.4805, + "step": 3439 + }, + { + "epoch": 1.6264775413711585, + "grad_norm": 2.5386533737182617, + "learning_rate": 4.1808464694707595e-06, + "loss": 0.5015, + "step": 3440 + }, + { + "epoch": 1.626950354609929, + "grad_norm": 2.8812551498413086, + "learning_rate": 4.180384633119775e-06, + "loss": 0.5225, + "step": 3441 + }, + { + "epoch": 1.6274231678486997, + "grad_norm": 2.870124578475952, + "learning_rate": 4.179922692138609e-06, + "loss": 0.537, + "step": 3442 + }, + { + "epoch": 1.6278959810874705, + "grad_norm": 2.5759785175323486, + "learning_rate": 4.179460646556021e-06, + "loss": 0.5142, + "step": 3443 + }, + { + "epoch": 1.6283687943262413, + "grad_norm": 2.629347324371338, + "learning_rate": 4.1789984964007836e-06, + "loss": 0.5007, + "step": 3444 + }, + { + "epoch": 1.6288416075650118, + "grad_norm": 2.751128673553467, + "learning_rate": 4.178536241701672e-06, + "loss": 0.5677, + "step": 3445 + }, + { + "epoch": 1.6293144208037824, + "grad_norm": 
2.7582364082336426, + "learning_rate": 4.178073882487469e-06, + "loss": 0.499, + "step": 3446 + }, + { + "epoch": 1.6297872340425532, + "grad_norm": 3.136711359024048, + "learning_rate": 4.177611418786963e-06, + "loss": 0.5294, + "step": 3447 + }, + { + "epoch": 1.630260047281324, + "grad_norm": 2.7363100051879883, + "learning_rate": 4.17714885062895e-06, + "loss": 0.5264, + "step": 3448 + }, + { + "epoch": 1.6307328605200946, + "grad_norm": 2.7305946350097656, + "learning_rate": 4.176686178042233e-06, + "loss": 0.5235, + "step": 3449 + }, + { + "epoch": 1.6312056737588652, + "grad_norm": 2.6500556468963623, + "learning_rate": 4.176223401055619e-06, + "loss": 0.5463, + "step": 3450 + }, + { + "epoch": 1.631678486997636, + "grad_norm": 2.756321907043457, + "learning_rate": 4.175760519697924e-06, + "loss": 0.545, + "step": 3451 + }, + { + "epoch": 1.6321513002364068, + "grad_norm": 2.6234960556030273, + "learning_rate": 4.17529753399797e-06, + "loss": 0.4927, + "step": 3452 + }, + { + "epoch": 1.6326241134751773, + "grad_norm": 2.6358842849731445, + "learning_rate": 4.174834443984584e-06, + "loss": 0.5445, + "step": 3453 + }, + { + "epoch": 1.633096926713948, + "grad_norm": 2.541147470474243, + "learning_rate": 4.174371249686601e-06, + "loss": 0.4691, + "step": 3454 + }, + { + "epoch": 1.6335697399527187, + "grad_norm": 2.566981077194214, + "learning_rate": 4.173907951132863e-06, + "loss": 0.4932, + "step": 3455 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 2.670940399169922, + "learning_rate": 4.173444548352216e-06, + "loss": 0.4979, + "step": 3456 + }, + { + "epoch": 1.63451536643026, + "grad_norm": 2.5440268516540527, + "learning_rate": 4.172981041373515e-06, + "loss": 0.4716, + "step": 3457 + }, + { + "epoch": 1.6349881796690307, + "grad_norm": 2.3801631927490234, + "learning_rate": 4.17251743022562e-06, + "loss": 0.5126, + "step": 3458 + }, + { + "epoch": 1.6354609929078014, + "grad_norm": 2.5051121711730957, + "learning_rate": 4.1720537149373985e-06, + "loss": 0.4964, + "step": 3459 + }, + { + "epoch": 1.6359338061465722, + "grad_norm": 3.5521697998046875, + "learning_rate": 4.171589895537724e-06, + "loss": 0.5447, + "step": 3460 + }, + { + "epoch": 1.6364066193853428, + "grad_norm": 2.6041572093963623, + "learning_rate": 4.171125972055477e-06, + "loss": 0.4637, + "step": 3461 + }, + { + "epoch": 1.6368794326241134, + "grad_norm": 2.2297258377075195, + "learning_rate": 4.170661944519543e-06, + "loss": 0.4702, + "step": 3462 + }, + { + "epoch": 1.6373522458628842, + "grad_norm": 2.6764535903930664, + "learning_rate": 4.170197812958815e-06, + "loss": 0.5111, + "step": 3463 + }, + { + "epoch": 1.637825059101655, + "grad_norm": 2.86892032623291, + "learning_rate": 4.169733577402193e-06, + "loss": 0.5437, + "step": 3464 + }, + { + "epoch": 1.6382978723404256, + "grad_norm": 2.9007070064544678, + "learning_rate": 4.1692692378785825e-06, + "loss": 0.5425, + "step": 3465 + }, + { + "epoch": 1.6387706855791961, + "grad_norm": 2.5902905464172363, + "learning_rate": 4.168804794416896e-06, + "loss": 0.5252, + "step": 3466 + }, + { + "epoch": 1.639243498817967, + "grad_norm": 2.821183681488037, + "learning_rate": 4.168340247046053e-06, + "loss": 0.5265, + "step": 3467 + }, + { + "epoch": 1.6397163120567377, + "grad_norm": 2.7928314208984375, + "learning_rate": 4.167875595794978e-06, + "loss": 0.5151, + "step": 3468 + }, + { + "epoch": 1.6401891252955083, + "grad_norm": 2.3130412101745605, + "learning_rate": 4.167410840692603e-06, + "loss": 0.4941, + "step": 3469 + }, + { + "epoch": 
1.6406619385342789, + "grad_norm": 2.6078619956970215, + "learning_rate": 4.1669459817678655e-06, + "loss": 0.493, + "step": 3470 + }, + { + "epoch": 1.6411347517730497, + "grad_norm": 2.5335731506347656, + "learning_rate": 4.166481019049712e-06, + "loss": 0.4969, + "step": 3471 + }, + { + "epoch": 1.6416075650118205, + "grad_norm": 2.8181469440460205, + "learning_rate": 4.166015952567093e-06, + "loss": 0.5062, + "step": 3472 + }, + { + "epoch": 1.642080378250591, + "grad_norm": 2.7256782054901123, + "learning_rate": 4.165550782348966e-06, + "loss": 0.5397, + "step": 3473 + }, + { + "epoch": 1.6425531914893616, + "grad_norm": 2.284345865249634, + "learning_rate": 4.1650855084242946e-06, + "loss": 0.4448, + "step": 3474 + }, + { + "epoch": 1.6430260047281324, + "grad_norm": 3.0383145809173584, + "learning_rate": 4.164620130822049e-06, + "loss": 0.5873, + "step": 3475 + }, + { + "epoch": 1.6434988179669032, + "grad_norm": 2.754448652267456, + "learning_rate": 4.1641546495712085e-06, + "loss": 0.4852, + "step": 3476 + }, + { + "epoch": 1.6439716312056738, + "grad_norm": 2.6820101737976074, + "learning_rate": 4.1636890647007535e-06, + "loss": 0.5325, + "step": 3477 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 2.6396398544311523, + "learning_rate": 4.163223376239676e-06, + "loss": 0.466, + "step": 3478 + }, + { + "epoch": 1.6449172576832152, + "grad_norm": 2.395049810409546, + "learning_rate": 4.162757584216972e-06, + "loss": 0.4531, + "step": 3479 + }, + { + "epoch": 1.645390070921986, + "grad_norm": 2.596670627593994, + "learning_rate": 4.162291688661645e-06, + "loss": 0.5207, + "step": 3480 + }, + { + "epoch": 1.6458628841607565, + "grad_norm": 2.4391872882843018, + "learning_rate": 4.161825689602703e-06, + "loss": 0.5133, + "step": 3481 + }, + { + "epoch": 1.6463356973995271, + "grad_norm": 2.6169841289520264, + "learning_rate": 4.161359587069162e-06, + "loss": 0.5096, + "step": 3482 + }, + { + "epoch": 1.646808510638298, + "grad_norm": 2.634089946746826, + "learning_rate": 4.1608933810900445e-06, + "loss": 0.4921, + "step": 3483 + }, + { + "epoch": 1.6472813238770687, + "grad_norm": 2.815877914428711, + "learning_rate": 4.160427071694379e-06, + "loss": 0.5045, + "step": 3484 + }, + { + "epoch": 1.6477541371158393, + "grad_norm": 2.417525053024292, + "learning_rate": 4.159960658911199e-06, + "loss": 0.4997, + "step": 3485 + }, + { + "epoch": 1.6482269503546099, + "grad_norm": 2.5713605880737305, + "learning_rate": 4.15949414276955e-06, + "loss": 0.5246, + "step": 3486 + }, + { + "epoch": 1.6486997635933807, + "grad_norm": 3.49833607673645, + "learning_rate": 4.159027523298475e-06, + "loss": 0.4901, + "step": 3487 + }, + { + "epoch": 1.6491725768321515, + "grad_norm": 2.985464334487915, + "learning_rate": 4.158560800527033e-06, + "loss": 0.5726, + "step": 3488 + }, + { + "epoch": 1.649645390070922, + "grad_norm": 2.72745680809021, + "learning_rate": 4.158093974484282e-06, + "loss": 0.5119, + "step": 3489 + }, + { + "epoch": 1.6501182033096926, + "grad_norm": 2.4885571002960205, + "learning_rate": 4.157627045199289e-06, + "loss": 0.4838, + "step": 3490 + }, + { + "epoch": 1.6505910165484634, + "grad_norm": 2.7622628211975098, + "learning_rate": 4.157160012701128e-06, + "loss": 0.5269, + "step": 3491 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 2.615122079849243, + "learning_rate": 4.156692877018879e-06, + "loss": 0.5501, + "step": 3492 + }, + { + "epoch": 1.6515366430260048, + "grad_norm": 2.827753782272339, + "learning_rate": 4.156225638181631e-06, + "loss": 0.5452, + 
"step": 3493 + }, + { + "epoch": 1.6520094562647754, + "grad_norm": 2.724820137023926, + "learning_rate": 4.155758296218474e-06, + "loss": 0.5155, + "step": 3494 + }, + { + "epoch": 1.6524822695035462, + "grad_norm": 2.5806174278259277, + "learning_rate": 4.155290851158508e-06, + "loss": 0.5292, + "step": 3495 + }, + { + "epoch": 1.652955082742317, + "grad_norm": 2.5655179023742676, + "learning_rate": 4.154823303030838e-06, + "loss": 0.4959, + "step": 3496 + }, + { + "epoch": 1.6534278959810875, + "grad_norm": 2.656548261642456, + "learning_rate": 4.154355651864579e-06, + "loss": 0.5703, + "step": 3497 + }, + { + "epoch": 1.653900709219858, + "grad_norm": 2.9085004329681396, + "learning_rate": 4.153887897688847e-06, + "loss": 0.5061, + "step": 3498 + }, + { + "epoch": 1.654373522458629, + "grad_norm": 2.608010768890381, + "learning_rate": 4.1534200405327665e-06, + "loss": 0.5165, + "step": 3499 + }, + { + "epoch": 1.6548463356973995, + "grad_norm": 2.600463628768921, + "learning_rate": 4.152952080425471e-06, + "loss": 0.4946, + "step": 3500 + }, + { + "epoch": 1.65531914893617, + "grad_norm": 2.5561563968658447, + "learning_rate": 4.152484017396098e-06, + "loss": 0.4804, + "step": 3501 + }, + { + "epoch": 1.6557919621749408, + "grad_norm": 2.788594961166382, + "learning_rate": 4.152015851473791e-06, + "loss": 0.5635, + "step": 3502 + }, + { + "epoch": 1.6562647754137116, + "grad_norm": 2.693302631378174, + "learning_rate": 4.151547582687699e-06, + "loss": 0.5139, + "step": 3503 + }, + { + "epoch": 1.6567375886524822, + "grad_norm": 2.7887485027313232, + "learning_rate": 4.1510792110669825e-06, + "loss": 0.4952, + "step": 3504 + }, + { + "epoch": 1.6572104018912528, + "grad_norm": 2.8982298374176025, + "learning_rate": 4.150610736640803e-06, + "loss": 0.4136, + "step": 3505 + }, + { + "epoch": 1.6576832151300236, + "grad_norm": 2.7569408416748047, + "learning_rate": 4.150142159438331e-06, + "loss": 0.5272, + "step": 3506 + }, + { + "epoch": 1.6581560283687944, + "grad_norm": 2.531648874282837, + "learning_rate": 4.149673479488742e-06, + "loss": 0.5016, + "step": 3507 + }, + { + "epoch": 1.658628841607565, + "grad_norm": 2.7706353664398193, + "learning_rate": 4.149204696821219e-06, + "loss": 0.5512, + "step": 3508 + }, + { + "epoch": 1.6591016548463355, + "grad_norm": 2.7307450771331787, + "learning_rate": 4.148735811464951e-06, + "loss": 0.4968, + "step": 3509 + }, + { + "epoch": 1.6595744680851063, + "grad_norm": 3.0097429752349854, + "learning_rate": 4.1482668234491335e-06, + "loss": 0.4797, + "step": 3510 + }, + { + "epoch": 1.6600472813238771, + "grad_norm": 2.6045308113098145, + "learning_rate": 4.147797732802969e-06, + "loss": 0.5496, + "step": 3511 + }, + { + "epoch": 1.6605200945626477, + "grad_norm": 2.702061176300049, + "learning_rate": 4.147328539555664e-06, + "loss": 0.5302, + "step": 3512 + }, + { + "epoch": 1.6609929078014183, + "grad_norm": 3.3724892139434814, + "learning_rate": 4.1468592437364356e-06, + "loss": 0.5124, + "step": 3513 + }, + { + "epoch": 1.661465721040189, + "grad_norm": 2.5117242336273193, + "learning_rate": 4.146389845374502e-06, + "loss": 0.4953, + "step": 3514 + }, + { + "epoch": 1.6619385342789599, + "grad_norm": 2.86547589302063, + "learning_rate": 4.145920344499092e-06, + "loss": 0.5337, + "step": 3515 + }, + { + "epoch": 1.6624113475177305, + "grad_norm": 2.745149850845337, + "learning_rate": 4.14545074113944e-06, + "loss": 0.5187, + "step": 3516 + }, + { + "epoch": 1.662884160756501, + "grad_norm": 2.5560994148254395, + "learning_rate": 
4.1449810353247855e-06, + "loss": 0.5183, + "step": 3517 + }, + { + "epoch": 1.6633569739952718, + "grad_norm": 2.2318122386932373, + "learning_rate": 4.144511227084374e-06, + "loss": 0.4452, + "step": 3518 + }, + { + "epoch": 1.6638297872340426, + "grad_norm": 2.6980903148651123, + "learning_rate": 4.14404131644746e-06, + "loss": 0.4974, + "step": 3519 + }, + { + "epoch": 1.6643026004728132, + "grad_norm": 2.6875357627868652, + "learning_rate": 4.1435713034433025e-06, + "loss": 0.4582, + "step": 3520 + }, + { + "epoch": 1.6647754137115838, + "grad_norm": 2.9430019855499268, + "learning_rate": 4.143101188101166e-06, + "loss": 0.5004, + "step": 3521 + }, + { + "epoch": 1.6652482269503546, + "grad_norm": 2.4447221755981445, + "learning_rate": 4.142630970450323e-06, + "loss": 0.5436, + "step": 3522 + }, + { + "epoch": 1.6657210401891254, + "grad_norm": 2.571023941040039, + "learning_rate": 4.142160650520053e-06, + "loss": 0.5307, + "step": 3523 + }, + { + "epoch": 1.666193853427896, + "grad_norm": 2.9725306034088135, + "learning_rate": 4.14169022833964e-06, + "loss": 0.5918, + "step": 3524 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.5958926677703857, + "learning_rate": 4.141219703938375e-06, + "loss": 0.5036, + "step": 3525 + }, + { + "epoch": 1.6671394799054373, + "grad_norm": 2.935788631439209, + "learning_rate": 4.140749077345556e-06, + "loss": 0.5773, + "step": 3526 + }, + { + "epoch": 1.6676122931442081, + "grad_norm": 2.5460526943206787, + "learning_rate": 4.140278348590485e-06, + "loss": 0.4762, + "step": 3527 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 2.5729143619537354, + "learning_rate": 4.139807517702475e-06, + "loss": 0.5515, + "step": 3528 + }, + { + "epoch": 1.6685579196217493, + "grad_norm": 2.4377381801605225, + "learning_rate": 4.13933658471084e-06, + "loss": 0.5383, + "step": 3529 + }, + { + "epoch": 1.66903073286052, + "grad_norm": 2.6284425258636475, + "learning_rate": 4.138865549644905e-06, + "loss": 0.5396, + "step": 3530 + }, + { + "epoch": 1.6695035460992909, + "grad_norm": 2.857250928878784, + "learning_rate": 4.138394412533998e-06, + "loss": 0.5861, + "step": 3531 + }, + { + "epoch": 1.6699763593380614, + "grad_norm": 2.9226012229919434, + "learning_rate": 4.137923173407456e-06, + "loss": 0.5262, + "step": 3532 + }, + { + "epoch": 1.670449172576832, + "grad_norm": 4.839131832122803, + "learning_rate": 4.137451832294619e-06, + "loss": 0.651, + "step": 3533 + }, + { + "epoch": 1.6709219858156028, + "grad_norm": 2.4727771282196045, + "learning_rate": 4.1369803892248375e-06, + "loss": 0.5149, + "step": 3534 + }, + { + "epoch": 1.6713947990543736, + "grad_norm": 2.5391688346862793, + "learning_rate": 4.1365088442274635e-06, + "loss": 0.4907, + "step": 3535 + }, + { + "epoch": 1.6718676122931442, + "grad_norm": 2.5168209075927734, + "learning_rate": 4.136037197331862e-06, + "loss": 0.5091, + "step": 3536 + }, + { + "epoch": 1.6723404255319148, + "grad_norm": 2.6278600692749023, + "learning_rate": 4.135565448567396e-06, + "loss": 0.4357, + "step": 3537 + }, + { + "epoch": 1.6728132387706856, + "grad_norm": 2.835184097290039, + "learning_rate": 4.135093597963441e-06, + "loss": 0.4786, + "step": 3538 + }, + { + "epoch": 1.6732860520094563, + "grad_norm": 2.385328531265259, + "learning_rate": 4.134621645549379e-06, + "loss": 0.4849, + "step": 3539 + }, + { + "epoch": 1.673758865248227, + "grad_norm": 2.6504149436950684, + "learning_rate": 4.134149591354593e-06, + "loss": 0.6037, + "step": 3540 + }, + { + "epoch": 1.6742316784869975, + "grad_norm": 
2.945634126663208, + "learning_rate": 4.1336774354084786e-06, + "loss": 0.532, + "step": 3541 + }, + { + "epoch": 1.6747044917257683, + "grad_norm": 2.8373215198516846, + "learning_rate": 4.133205177740434e-06, + "loss": 0.5138, + "step": 3542 + }, + { + "epoch": 1.675177304964539, + "grad_norm": 2.6616621017456055, + "learning_rate": 4.1327328183798634e-06, + "loss": 0.5543, + "step": 3543 + }, + { + "epoch": 1.6756501182033097, + "grad_norm": 3.0843071937561035, + "learning_rate": 4.13226035735618e-06, + "loss": 0.6585, + "step": 3544 + }, + { + "epoch": 1.6761229314420802, + "grad_norm": 2.2214272022247314, + "learning_rate": 4.131787794698802e-06, + "loss": 0.5413, + "step": 3545 + }, + { + "epoch": 1.676595744680851, + "grad_norm": 2.4515018463134766, + "learning_rate": 4.131315130437152e-06, + "loss": 0.4966, + "step": 3546 + }, + { + "epoch": 1.6770685579196218, + "grad_norm": 2.647414207458496, + "learning_rate": 4.130842364600663e-06, + "loss": 0.5401, + "step": 3547 + }, + { + "epoch": 1.6775413711583924, + "grad_norm": 2.648941993713379, + "learning_rate": 4.13036949721877e-06, + "loss": 0.4796, + "step": 3548 + }, + { + "epoch": 1.678014184397163, + "grad_norm": 2.7835679054260254, + "learning_rate": 4.129896528320919e-06, + "loss": 0.5653, + "step": 3549 + }, + { + "epoch": 1.6784869976359338, + "grad_norm": 2.995964288711548, + "learning_rate": 4.129423457936556e-06, + "loss": 0.4999, + "step": 3550 + }, + { + "epoch": 1.6789598108747046, + "grad_norm": 2.5980007648468018, + "learning_rate": 4.1289502860951405e-06, + "loss": 0.5177, + "step": 3551 + }, + { + "epoch": 1.6794326241134752, + "grad_norm": 2.442254066467285, + "learning_rate": 4.128477012826133e-06, + "loss": 0.5062, + "step": 3552 + }, + { + "epoch": 1.6799054373522457, + "grad_norm": 2.3007538318634033, + "learning_rate": 4.1280036381590025e-06, + "loss": 0.5029, + "step": 3553 + }, + { + "epoch": 1.6803782505910165, + "grad_norm": 2.4169347286224365, + "learning_rate": 4.1275301621232245e-06, + "loss": 0.515, + "step": 3554 + }, + { + "epoch": 1.6808510638297873, + "grad_norm": 2.6456379890441895, + "learning_rate": 4.127056584748279e-06, + "loss": 0.5343, + "step": 3555 + }, + { + "epoch": 1.681323877068558, + "grad_norm": 2.6406595706939697, + "learning_rate": 4.1265829060636546e-06, + "loss": 0.5047, + "step": 3556 + }, + { + "epoch": 1.6817966903073285, + "grad_norm": 2.9344475269317627, + "learning_rate": 4.126109126098846e-06, + "loss": 0.5501, + "step": 3557 + }, + { + "epoch": 1.6822695035460993, + "grad_norm": 2.3292455673217773, + "learning_rate": 4.125635244883351e-06, + "loss": 0.463, + "step": 3558 + }, + { + "epoch": 1.68274231678487, + "grad_norm": 2.4150657653808594, + "learning_rate": 4.125161262446677e-06, + "loss": 0.4802, + "step": 3559 + }, + { + "epoch": 1.6832151300236406, + "grad_norm": 2.604292392730713, + "learning_rate": 4.124687178818339e-06, + "loss": 0.5683, + "step": 3560 + }, + { + "epoch": 1.6836879432624112, + "grad_norm": 2.5676791667938232, + "learning_rate": 4.1242129940278544e-06, + "loss": 0.5519, + "step": 3561 + }, + { + "epoch": 1.684160756501182, + "grad_norm": 3.078514814376831, + "learning_rate": 4.123738708104748e-06, + "loss": 0.5194, + "step": 3562 + }, + { + "epoch": 1.6846335697399528, + "grad_norm": 2.893577814102173, + "learning_rate": 4.123264321078552e-06, + "loss": 0.5107, + "step": 3563 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 2.772413730621338, + "learning_rate": 4.122789832978804e-06, + "loss": 0.6147, + "step": 3564 + }, + { + "epoch": 
1.685579196217494, + "grad_norm": 2.5804643630981445, + "learning_rate": 4.12231524383505e-06, + "loss": 0.5057, + "step": 3565 + }, + { + "epoch": 1.6860520094562648, + "grad_norm": 2.599571466445923, + "learning_rate": 4.121840553676839e-06, + "loss": 0.5591, + "step": 3566 + }, + { + "epoch": 1.6865248226950356, + "grad_norm": 2.9124577045440674, + "learning_rate": 4.1213657625337275e-06, + "loss": 0.565, + "step": 3567 + }, + { + "epoch": 1.6869976359338061, + "grad_norm": 2.6582155227661133, + "learning_rate": 4.120890870435281e-06, + "loss": 0.4607, + "step": 3568 + }, + { + "epoch": 1.6874704491725767, + "grad_norm": 2.929227590560913, + "learning_rate": 4.120415877411066e-06, + "loss": 0.5705, + "step": 3569 + }, + { + "epoch": 1.6879432624113475, + "grad_norm": 2.4443247318267822, + "learning_rate": 4.11994078349066e-06, + "loss": 0.4592, + "step": 3570 + }, + { + "epoch": 1.6884160756501183, + "grad_norm": 2.4799163341522217, + "learning_rate": 4.119465588703645e-06, + "loss": 0.5361, + "step": 3571 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 2.9408936500549316, + "learning_rate": 4.1189902930796085e-06, + "loss": 0.5347, + "step": 3572 + }, + { + "epoch": 1.6893617021276595, + "grad_norm": 3.3348076343536377, + "learning_rate": 4.118514896648146e-06, + "loss": 0.5612, + "step": 3573 + }, + { + "epoch": 1.6898345153664303, + "grad_norm": 2.764889717102051, + "learning_rate": 4.118039399438857e-06, + "loss": 0.4745, + "step": 3574 + }, + { + "epoch": 1.690307328605201, + "grad_norm": 2.7023751735687256, + "learning_rate": 4.11756380148135e-06, + "loss": 0.5106, + "step": 3575 + }, + { + "epoch": 1.6907801418439716, + "grad_norm": 2.8816208839416504, + "learning_rate": 4.117088102805238e-06, + "loss": 0.6016, + "step": 3576 + }, + { + "epoch": 1.6912529550827422, + "grad_norm": 2.215733289718628, + "learning_rate": 4.11661230344014e-06, + "loss": 0.4404, + "step": 3577 + }, + { + "epoch": 1.691725768321513, + "grad_norm": 2.8190999031066895, + "learning_rate": 4.116136403415683e-06, + "loss": 0.5038, + "step": 3578 + }, + { + "epoch": 1.6921985815602838, + "grad_norm": 2.616424083709717, + "learning_rate": 4.115660402761499e-06, + "loss": 0.5493, + "step": 3579 + }, + { + "epoch": 1.6926713947990544, + "grad_norm": 2.7738113403320312, + "learning_rate": 4.115184301507226e-06, + "loss": 0.5416, + "step": 3580 + }, + { + "epoch": 1.693144208037825, + "grad_norm": 2.4793593883514404, + "learning_rate": 4.114708099682509e-06, + "loss": 0.4526, + "step": 3581 + }, + { + "epoch": 1.6936170212765957, + "grad_norm": 2.390652894973755, + "learning_rate": 4.114231797316999e-06, + "loss": 0.4908, + "step": 3582 + }, + { + "epoch": 1.6940898345153665, + "grad_norm": 2.513197660446167, + "learning_rate": 4.113755394440352e-06, + "loss": 0.4738, + "step": 3583 + }, + { + "epoch": 1.6945626477541371, + "grad_norm": 2.504497766494751, + "learning_rate": 4.113278891082234e-06, + "loss": 0.4661, + "step": 3584 + }, + { + "epoch": 1.6950354609929077, + "grad_norm": 2.4966917037963867, + "learning_rate": 4.112802287272314e-06, + "loss": 0.4979, + "step": 3585 + }, + { + "epoch": 1.6955082742316785, + "grad_norm": 2.3129689693450928, + "learning_rate": 4.112325583040265e-06, + "loss": 0.4933, + "step": 3586 + }, + { + "epoch": 1.6959810874704493, + "grad_norm": 2.822136878967285, + "learning_rate": 4.111848778415774e-06, + "loss": 0.5087, + "step": 3587 + }, + { + "epoch": 1.6964539007092199, + "grad_norm": 2.5181210041046143, + "learning_rate": 4.111371873428527e-06, + "loss": 0.4836, + 
"step": 3588 + }, + { + "epoch": 1.6969267139479904, + "grad_norm": 2.7564687728881836, + "learning_rate": 4.110894868108218e-06, + "loss": 0.5224, + "step": 3589 + }, + { + "epoch": 1.6973995271867612, + "grad_norm": 2.424421787261963, + "learning_rate": 4.11041776248455e-06, + "loss": 0.4552, + "step": 3590 + }, + { + "epoch": 1.697872340425532, + "grad_norm": 2.7013823986053467, + "learning_rate": 4.10994055658723e-06, + "loss": 0.5535, + "step": 3591 + }, + { + "epoch": 1.6983451536643026, + "grad_norm": 2.5660946369171143, + "learning_rate": 4.10946325044597e-06, + "loss": 0.5351, + "step": 3592 + }, + { + "epoch": 1.6988179669030732, + "grad_norm": 2.5598108768463135, + "learning_rate": 4.10898584409049e-06, + "loss": 0.5246, + "step": 3593 + }, + { + "epoch": 1.699290780141844, + "grad_norm": 2.6318907737731934, + "learning_rate": 4.108508337550518e-06, + "loss": 0.5002, + "step": 3594 + }, + { + "epoch": 1.6997635933806148, + "grad_norm": 2.527099132537842, + "learning_rate": 4.108030730855784e-06, + "loss": 0.5366, + "step": 3595 + }, + { + "epoch": 1.7002364066193854, + "grad_norm": 2.8629603385925293, + "learning_rate": 4.107553024036029e-06, + "loss": 0.5742, + "step": 3596 + }, + { + "epoch": 1.700709219858156, + "grad_norm": 2.8084018230438232, + "learning_rate": 4.107075217120994e-06, + "loss": 0.5618, + "step": 3597 + }, + { + "epoch": 1.7011820330969267, + "grad_norm": 3.6470065116882324, + "learning_rate": 4.1065973101404325e-06, + "loss": 0.508, + "step": 3598 + }, + { + "epoch": 1.7016548463356975, + "grad_norm": 3.0332422256469727, + "learning_rate": 4.106119303124102e-06, + "loss": 0.51, + "step": 3599 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 2.4887590408325195, + "learning_rate": 4.105641196101765e-06, + "loss": 0.5109, + "step": 3600 + }, + { + "epoch": 1.7026004728132387, + "grad_norm": 2.6102066040039062, + "learning_rate": 4.105162989103191e-06, + "loss": 0.5278, + "step": 3601 + }, + { + "epoch": 1.7030732860520095, + "grad_norm": 2.771578073501587, + "learning_rate": 4.104684682158156e-06, + "loss": 0.498, + "step": 3602 + }, + { + "epoch": 1.7035460992907803, + "grad_norm": 2.5452702045440674, + "learning_rate": 4.1042062752964425e-06, + "loss": 0.4939, + "step": 3603 + }, + { + "epoch": 1.7040189125295508, + "grad_norm": 2.4287021160125732, + "learning_rate": 4.103727768547838e-06, + "loss": 0.4819, + "step": 3604 + }, + { + "epoch": 1.7044917257683214, + "grad_norm": 2.412280321121216, + "learning_rate": 4.103249161942138e-06, + "loss": 0.5196, + "step": 3605 + }, + { + "epoch": 1.7049645390070922, + "grad_norm": 2.8850717544555664, + "learning_rate": 4.102770455509142e-06, + "loss": 0.5724, + "step": 3606 + }, + { + "epoch": 1.705437352245863, + "grad_norm": 2.7979609966278076, + "learning_rate": 4.102291649278659e-06, + "loss": 0.5295, + "step": 3607 + }, + { + "epoch": 1.7059101654846336, + "grad_norm": 2.762238025665283, + "learning_rate": 4.1018127432805e-06, + "loss": 0.5166, + "step": 3608 + }, + { + "epoch": 1.7063829787234042, + "grad_norm": 2.921586513519287, + "learning_rate": 4.101333737544485e-06, + "loss": 0.5607, + "step": 3609 + }, + { + "epoch": 1.706855791962175, + "grad_norm": 3.001929998397827, + "learning_rate": 4.100854632100439e-06, + "loss": 0.6255, + "step": 3610 + }, + { + "epoch": 1.7073286052009458, + "grad_norm": 2.752713918685913, + "learning_rate": 4.100375426978196e-06, + "loss": 0.5732, + "step": 3611 + }, + { + "epoch": 1.7078014184397163, + "grad_norm": 2.6496472358703613, + "learning_rate": 
4.099896122207593e-06, + "loss": 0.5138, + "step": 3612 + }, + { + "epoch": 1.708274231678487, + "grad_norm": 3.0079452991485596, + "learning_rate": 4.099416717818473e-06, + "loss": 0.5746, + "step": 3613 + }, + { + "epoch": 1.7087470449172577, + "grad_norm": 2.5762360095977783, + "learning_rate": 4.098937213840687e-06, + "loss": 0.5308, + "step": 3614 + }, + { + "epoch": 1.7092198581560285, + "grad_norm": 2.6026158332824707, + "learning_rate": 4.098457610304092e-06, + "loss": 0.4857, + "step": 3615 + }, + { + "epoch": 1.709692671394799, + "grad_norm": 2.587583541870117, + "learning_rate": 4.097977907238551e-06, + "loss": 0.4591, + "step": 3616 + }, + { + "epoch": 1.7101654846335697, + "grad_norm": 2.6996991634368896, + "learning_rate": 4.097498104673932e-06, + "loss": 0.5298, + "step": 3617 + }, + { + "epoch": 1.7106382978723405, + "grad_norm": 2.600029945373535, + "learning_rate": 4.097018202640111e-06, + "loss": 0.4726, + "step": 3618 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 2.8261220455169678, + "learning_rate": 4.096538201166969e-06, + "loss": 0.5242, + "step": 3619 + }, + { + "epoch": 1.7115839243498818, + "grad_norm": 3.053027629852295, + "learning_rate": 4.096058100284394e-06, + "loss": 0.5568, + "step": 3620 + }, + { + "epoch": 1.7120567375886524, + "grad_norm": 2.9638442993164062, + "learning_rate": 4.0955779000222805e-06, + "loss": 0.5325, + "step": 3621 + }, + { + "epoch": 1.7125295508274232, + "grad_norm": 2.731095790863037, + "learning_rate": 4.095097600410527e-06, + "loss": 0.4733, + "step": 3622 + }, + { + "epoch": 1.713002364066194, + "grad_norm": 2.632490873336792, + "learning_rate": 4.09461720147904e-06, + "loss": 0.5253, + "step": 3623 + }, + { + "epoch": 1.7134751773049646, + "grad_norm": 2.847689390182495, + "learning_rate": 4.094136703257732e-06, + "loss": 0.57, + "step": 3624 + }, + { + "epoch": 1.7139479905437351, + "grad_norm": 3.1078696250915527, + "learning_rate": 4.0936561057765215e-06, + "loss": 0.5368, + "step": 3625 + }, + { + "epoch": 1.714420803782506, + "grad_norm": 2.696349620819092, + "learning_rate": 4.0931754090653334e-06, + "loss": 0.491, + "step": 3626 + }, + { + "epoch": 1.7148936170212767, + "grad_norm": 2.712958812713623, + "learning_rate": 4.092694613154099e-06, + "loss": 0.5768, + "step": 3627 + }, + { + "epoch": 1.7153664302600473, + "grad_norm": 2.5421478748321533, + "learning_rate": 4.092213718072754e-06, + "loss": 0.4839, + "step": 3628 + }, + { + "epoch": 1.715839243498818, + "grad_norm": 2.5176162719726562, + "learning_rate": 4.091732723851243e-06, + "loss": 0.5049, + "step": 3629 + }, + { + "epoch": 1.7163120567375887, + "grad_norm": 2.642185926437378, + "learning_rate": 4.091251630519514e-06, + "loss": 0.589, + "step": 3630 + }, + { + "epoch": 1.7167848699763595, + "grad_norm": 2.587348461151123, + "learning_rate": 4.0907704381075245e-06, + "loss": 0.5281, + "step": 3631 + }, + { + "epoch": 1.71725768321513, + "grad_norm": 2.4628195762634277, + "learning_rate": 4.090289146645234e-06, + "loss": 0.5592, + "step": 3632 + }, + { + "epoch": 1.7177304964539006, + "grad_norm": 2.2751028537750244, + "learning_rate": 4.0898077561626125e-06, + "loss": 0.502, + "step": 3633 + }, + { + "epoch": 1.7182033096926714, + "grad_norm": 2.7712769508361816, + "learning_rate": 4.089326266689632e-06, + "loss": 0.5143, + "step": 3634 + }, + { + "epoch": 1.7186761229314422, + "grad_norm": 2.5297727584838867, + "learning_rate": 4.088844678256275e-06, + "loss": 0.5035, + "step": 3635 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 
2.739130735397339, + "learning_rate": 4.088362990892527e-06, + "loss": 0.5959, + "step": 3636 + }, + { + "epoch": 1.7196217494089834, + "grad_norm": 2.3708314895629883, + "learning_rate": 4.08788120462838e-06, + "loss": 0.4796, + "step": 3637 + }, + { + "epoch": 1.7200945626477542, + "grad_norm": 2.7664241790771484, + "learning_rate": 4.087399319493832e-06, + "loss": 0.6052, + "step": 3638 + }, + { + "epoch": 1.720567375886525, + "grad_norm": 2.5900204181671143, + "learning_rate": 4.0869173355188895e-06, + "loss": 0.4955, + "step": 3639 + }, + { + "epoch": 1.7210401891252955, + "grad_norm": 2.6771862506866455, + "learning_rate": 4.0864352527335635e-06, + "loss": 0.4889, + "step": 3640 + }, + { + "epoch": 1.7215130023640661, + "grad_norm": 2.888479471206665, + "learning_rate": 4.085953071167871e-06, + "loss": 0.5719, + "step": 3641 + }, + { + "epoch": 1.721985815602837, + "grad_norm": 2.5967187881469727, + "learning_rate": 4.085470790851833e-06, + "loss": 0.4959, + "step": 3642 + }, + { + "epoch": 1.7224586288416077, + "grad_norm": 2.5317695140838623, + "learning_rate": 4.084988411815483e-06, + "loss": 0.4596, + "step": 3643 + }, + { + "epoch": 1.7229314420803783, + "grad_norm": 2.6531455516815186, + "learning_rate": 4.084505934088853e-06, + "loss": 0.5346, + "step": 3644 + }, + { + "epoch": 1.7234042553191489, + "grad_norm": 2.6525208950042725, + "learning_rate": 4.084023357701987e-06, + "loss": 0.5178, + "step": 3645 + }, + { + "epoch": 1.7238770685579197, + "grad_norm": 2.461954116821289, + "learning_rate": 4.083540682684932e-06, + "loss": 0.4802, + "step": 3646 + }, + { + "epoch": 1.7243498817966905, + "grad_norm": 2.794696807861328, + "learning_rate": 4.083057909067743e-06, + "loss": 0.5148, + "step": 3647 + }, + { + "epoch": 1.724822695035461, + "grad_norm": 2.867572546005249, + "learning_rate": 4.082575036880479e-06, + "loss": 0.5352, + "step": 3648 + }, + { + "epoch": 1.7252955082742316, + "grad_norm": 2.642820358276367, + "learning_rate": 4.082092066153207e-06, + "loss": 0.4652, + "step": 3649 + }, + { + "epoch": 1.7257683215130024, + "grad_norm": 2.782142400741577, + "learning_rate": 4.081608996915999e-06, + "loss": 0.5591, + "step": 3650 + }, + { + "epoch": 1.7262411347517732, + "grad_norm": 2.327331304550171, + "learning_rate": 4.081125829198934e-06, + "loss": 0.4339, + "step": 3651 + }, + { + "epoch": 1.7267139479905438, + "grad_norm": 2.7959988117218018, + "learning_rate": 4.0806425630320965e-06, + "loss": 0.5783, + "step": 3652 + }, + { + "epoch": 1.7271867612293144, + "grad_norm": 2.595053195953369, + "learning_rate": 4.080159198445578e-06, + "loss": 0.4602, + "step": 3653 + }, + { + "epoch": 1.7276595744680852, + "grad_norm": 3.0968129634857178, + "learning_rate": 4.079675735469475e-06, + "loss": 0.5775, + "step": 3654 + }, + { + "epoch": 1.728132387706856, + "grad_norm": 2.628044605255127, + "learning_rate": 4.07919217413389e-06, + "loss": 0.486, + "step": 3655 + }, + { + "epoch": 1.7286052009456265, + "grad_norm": 2.782799005508423, + "learning_rate": 4.078708514468933e-06, + "loss": 0.5282, + "step": 3656 + }, + { + "epoch": 1.729078014184397, + "grad_norm": 2.655365467071533, + "learning_rate": 4.0782247565047205e-06, + "loss": 0.4873, + "step": 3657 + }, + { + "epoch": 1.729550827423168, + "grad_norm": 2.9461584091186523, + "learning_rate": 4.077740900271371e-06, + "loss": 0.548, + "step": 3658 + }, + { + "epoch": 1.7300236406619387, + "grad_norm": 2.5094761848449707, + "learning_rate": 4.077256945799015e-06, + "loss": 0.5437, + "step": 3659 + }, + { + "epoch": 
1.7304964539007093, + "grad_norm": 2.555793285369873, + "learning_rate": 4.0767728931177845e-06, + "loss": 0.5268, + "step": 3660 + }, + { + "epoch": 1.7309692671394799, + "grad_norm": 2.4433486461639404, + "learning_rate": 4.07628874225782e-06, + "loss": 0.5211, + "step": 3661 + }, + { + "epoch": 1.7314420803782506, + "grad_norm": 2.365206003189087, + "learning_rate": 4.075804493249267e-06, + "loss": 0.5084, + "step": 3662 + }, + { + "epoch": 1.7319148936170212, + "grad_norm": 2.514305830001831, + "learning_rate": 4.075320146122278e-06, + "loss": 0.4693, + "step": 3663 + }, + { + "epoch": 1.7323877068557918, + "grad_norm": 2.9270083904266357, + "learning_rate": 4.074835700907012e-06, + "loss": 0.5724, + "step": 3664 + }, + { + "epoch": 1.7328605200945626, + "grad_norm": 2.938692569732666, + "learning_rate": 4.0743511576336315e-06, + "loss": 0.5361, + "step": 3665 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 3.1978867053985596, + "learning_rate": 4.073866516332307e-06, + "loss": 0.6277, + "step": 3666 + }, + { + "epoch": 1.733806146572104, + "grad_norm": 2.3477370738983154, + "learning_rate": 4.073381777033217e-06, + "loss": 0.5139, + "step": 3667 + }, + { + "epoch": 1.7342789598108745, + "grad_norm": 2.5954184532165527, + "learning_rate": 4.072896939766543e-06, + "loss": 0.537, + "step": 3668 + }, + { + "epoch": 1.7347517730496453, + "grad_norm": 2.8999998569488525, + "learning_rate": 4.072412004562472e-06, + "loss": 0.5486, + "step": 3669 + }, + { + "epoch": 1.7352245862884161, + "grad_norm": 2.7320556640625, + "learning_rate": 4.071926971451201e-06, + "loss": 0.6025, + "step": 3670 + }, + { + "epoch": 1.7356973995271867, + "grad_norm": 2.499234676361084, + "learning_rate": 4.0714418404629304e-06, + "loss": 0.456, + "step": 3671 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 2.485924243927002, + "learning_rate": 4.070956611627867e-06, + "loss": 0.5097, + "step": 3672 + }, + { + "epoch": 1.736643026004728, + "grad_norm": 2.513723373413086, + "learning_rate": 4.070471284976225e-06, + "loss": 0.4744, + "step": 3673 + }, + { + "epoch": 1.7371158392434989, + "grad_norm": 2.281977653503418, + "learning_rate": 4.06998586053822e-06, + "loss": 0.5124, + "step": 3674 + }, + { + "epoch": 1.7375886524822695, + "grad_norm": 2.3683905601501465, + "learning_rate": 4.069500338344081e-06, + "loss": 0.4816, + "step": 3675 + }, + { + "epoch": 1.73806146572104, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.069014718424038e-06, + "loss": 0.5665, + "step": 3676 + }, + { + "epoch": 1.7385342789598108, + "grad_norm": 2.7308456897735596, + "learning_rate": 4.068529000808328e-06, + "loss": 0.534, + "step": 3677 + }, + { + "epoch": 1.7390070921985816, + "grad_norm": 2.788452625274658, + "learning_rate": 4.068043185527196e-06, + "loss": 0.5609, + "step": 3678 + }, + { + "epoch": 1.7394799054373522, + "grad_norm": 2.832368850708008, + "learning_rate": 4.067557272610889e-06, + "loss": 0.553, + "step": 3679 + }, + { + "epoch": 1.7399527186761228, + "grad_norm": 2.9987435340881348, + "learning_rate": 4.067071262089665e-06, + "loss": 0.5, + "step": 3680 + }, + { + "epoch": 1.7404255319148936, + "grad_norm": 3.04913067817688, + "learning_rate": 4.066585153993785e-06, + "loss": 0.5158, + "step": 3681 + }, + { + "epoch": 1.7408983451536644, + "grad_norm": 2.5177130699157715, + "learning_rate": 4.066098948353516e-06, + "loss": 0.4508, + "step": 3682 + }, + { + "epoch": 1.741371158392435, + "grad_norm": 2.8991222381591797, + "learning_rate": 4.065612645199133e-06, + "loss": 0.5268, + "step": 3683 
+ }, + { + "epoch": 1.7418439716312055, + "grad_norm": 2.4928159713745117, + "learning_rate": 4.0651262445609156e-06, + "loss": 0.5024, + "step": 3684 + }, + { + "epoch": 1.7423167848699763, + "grad_norm": 2.9737319946289062, + "learning_rate": 4.06463974646915e-06, + "loss": 0.5429, + "step": 3685 + }, + { + "epoch": 1.7427895981087471, + "grad_norm": 2.6485493183135986, + "learning_rate": 4.064153150954128e-06, + "loss": 0.5619, + "step": 3686 + }, + { + "epoch": 1.7432624113475177, + "grad_norm": 2.564861297607422, + "learning_rate": 4.063666458046148e-06, + "loss": 0.4878, + "step": 3687 + }, + { + "epoch": 1.7437352245862883, + "grad_norm": 2.6048383712768555, + "learning_rate": 4.063179667775514e-06, + "loss": 0.4836, + "step": 3688 + }, + { + "epoch": 1.744208037825059, + "grad_norm": 2.751638650894165, + "learning_rate": 4.062692780172536e-06, + "loss": 0.5558, + "step": 3689 + }, + { + "epoch": 1.7446808510638299, + "grad_norm": 3.3866634368896484, + "learning_rate": 4.062205795267531e-06, + "loss": 0.4825, + "step": 3690 + }, + { + "epoch": 1.7451536643026004, + "grad_norm": 3.0112249851226807, + "learning_rate": 4.061718713090822e-06, + "loss": 0.5732, + "step": 3691 + }, + { + "epoch": 1.745626477541371, + "grad_norm": 2.5889365673065186, + "learning_rate": 4.061231533672736e-06, + "loss": 0.483, + "step": 3692 + }, + { + "epoch": 1.7460992907801418, + "grad_norm": 2.624598979949951, + "learning_rate": 4.0607442570436085e-06, + "loss": 0.5706, + "step": 3693 + }, + { + "epoch": 1.7465721040189126, + "grad_norm": 2.9219250679016113, + "learning_rate": 4.060256883233779e-06, + "loss": 0.5153, + "step": 3694 + }, + { + "epoch": 1.7470449172576832, + "grad_norm": 3.2219252586364746, + "learning_rate": 4.059769412273595e-06, + "loss": 0.5184, + "step": 3695 + }, + { + "epoch": 1.7475177304964538, + "grad_norm": 2.890697956085205, + "learning_rate": 4.05928184419341e-06, + "loss": 0.5312, + "step": 3696 + }, + { + "epoch": 1.7479905437352246, + "grad_norm": 2.673809289932251, + "learning_rate": 4.0587941790235816e-06, + "loss": 0.4893, + "step": 3697 + }, + { + "epoch": 1.7484633569739954, + "grad_norm": 2.5339348316192627, + "learning_rate": 4.058306416794474e-06, + "loss": 0.5115, + "step": 3698 + }, + { + "epoch": 1.748936170212766, + "grad_norm": 2.6525840759277344, + "learning_rate": 4.05781855753646e-06, + "loss": 0.5256, + "step": 3699 + }, + { + "epoch": 1.7494089834515365, + "grad_norm": 2.7868754863739014, + "learning_rate": 4.057330601279914e-06, + "loss": 0.5227, + "step": 3700 + }, + { + "epoch": 1.7498817966903073, + "grad_norm": 3.1629884243011475, + "learning_rate": 4.056842548055221e-06, + "loss": 0.5617, + "step": 3701 + }, + { + "epoch": 1.750354609929078, + "grad_norm": 2.9350688457489014, + "learning_rate": 4.056354397892769e-06, + "loss": 0.4753, + "step": 3702 + }, + { + "epoch": 1.7508274231678487, + "grad_norm": 2.9688615798950195, + "learning_rate": 4.0558661508229525e-06, + "loss": 0.596, + "step": 3703 + }, + { + "epoch": 1.7513002364066192, + "grad_norm": 2.802205801010132, + "learning_rate": 4.055377806876174e-06, + "loss": 0.5793, + "step": 3704 + }, + { + "epoch": 1.75177304964539, + "grad_norm": 2.4933416843414307, + "learning_rate": 4.054889366082839e-06, + "loss": 0.4824, + "step": 3705 + }, + { + "epoch": 1.7522458628841608, + "grad_norm": 3.7904608249664307, + "learning_rate": 4.054400828473361e-06, + "loss": 0.5124, + "step": 3706 + }, + { + "epoch": 1.7527186761229314, + "grad_norm": 2.694838762283325, + "learning_rate": 4.053912194078159e-06, + 
"loss": 0.5604, + "step": 3707 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 2.3721256256103516, + "learning_rate": 4.053423462927659e-06, + "loss": 0.4978, + "step": 3708 + }, + { + "epoch": 1.7536643026004728, + "grad_norm": 2.718512773513794, + "learning_rate": 4.052934635052292e-06, + "loss": 0.5029, + "step": 3709 + }, + { + "epoch": 1.7541371158392436, + "grad_norm": 3.061558246612549, + "learning_rate": 4.052445710482493e-06, + "loss": 0.4886, + "step": 3710 + }, + { + "epoch": 1.7546099290780142, + "grad_norm": 3.0490729808807373, + "learning_rate": 4.051956689248709e-06, + "loss": 0.5363, + "step": 3711 + }, + { + "epoch": 1.7550827423167847, + "grad_norm": 2.611661672592163, + "learning_rate": 4.051467571381385e-06, + "loss": 0.5397, + "step": 3712 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 2.7829177379608154, + "learning_rate": 4.050978356910979e-06, + "loss": 0.4973, + "step": 3713 + }, + { + "epoch": 1.7560283687943263, + "grad_norm": 2.6228256225585938, + "learning_rate": 4.0504890458679525e-06, + "loss": 0.4551, + "step": 3714 + }, + { + "epoch": 1.756501182033097, + "grad_norm": 2.6801326274871826, + "learning_rate": 4.049999638282771e-06, + "loss": 0.5581, + "step": 3715 + }, + { + "epoch": 1.7569739952718675, + "grad_norm": 2.4476819038391113, + "learning_rate": 4.049510134185908e-06, + "loss": 0.5226, + "step": 3716 + }, + { + "epoch": 1.7574468085106383, + "grad_norm": 2.5661075115203857, + "learning_rate": 4.049020533607844e-06, + "loss": 0.5163, + "step": 3717 + }, + { + "epoch": 1.757919621749409, + "grad_norm": 2.3923349380493164, + "learning_rate": 4.048530836579065e-06, + "loss": 0.5076, + "step": 3718 + }, + { + "epoch": 1.7583924349881797, + "grad_norm": 2.8204405307769775, + "learning_rate": 4.0480410431300585e-06, + "loss": 0.5883, + "step": 3719 + }, + { + "epoch": 1.7588652482269502, + "grad_norm": 2.323107957839966, + "learning_rate": 4.047551153291325e-06, + "loss": 0.5116, + "step": 3720 + }, + { + "epoch": 1.759338061465721, + "grad_norm": 2.8306009769439697, + "learning_rate": 4.047061167093368e-06, + "loss": 0.5094, + "step": 3721 + }, + { + "epoch": 1.7598108747044918, + "grad_norm": 2.568765640258789, + "learning_rate": 4.046571084566695e-06, + "loss": 0.4725, + "step": 3722 + }, + { + "epoch": 1.7602836879432624, + "grad_norm": 2.7212061882019043, + "learning_rate": 4.046080905741822e-06, + "loss": 0.4741, + "step": 3723 + }, + { + "epoch": 1.760756501182033, + "grad_norm": 2.802917003631592, + "learning_rate": 4.04559063064927e-06, + "loss": 0.5691, + "step": 3724 + }, + { + "epoch": 1.7612293144208038, + "grad_norm": 3.1044139862060547, + "learning_rate": 4.0451002593195675e-06, + "loss": 0.5472, + "step": 3725 + }, + { + "epoch": 1.7617021276595746, + "grad_norm": 2.5855562686920166, + "learning_rate": 4.044609791783246e-06, + "loss": 0.4852, + "step": 3726 + }, + { + "epoch": 1.7621749408983451, + "grad_norm": 2.6235129833221436, + "learning_rate": 4.0441192280708465e-06, + "loss": 0.5269, + "step": 3727 + }, + { + "epoch": 1.7626477541371157, + "grad_norm": 3.535630464553833, + "learning_rate": 4.043628568212914e-06, + "loss": 0.5266, + "step": 3728 + }, + { + "epoch": 1.7631205673758865, + "grad_norm": 2.7783355712890625, + "learning_rate": 4.043137812239998e-06, + "loss": 0.5609, + "step": 3729 + }, + { + "epoch": 1.7635933806146573, + "grad_norm": 2.9344944953918457, + "learning_rate": 4.042646960182657e-06, + "loss": 0.5056, + "step": 3730 + }, + { + "epoch": 1.7640661938534279, + "grad_norm": 2.6205739974975586, + 
"learning_rate": 4.042156012071453e-06, + "loss": 0.4914, + "step": 3731 + }, + { + "epoch": 1.7645390070921985, + "grad_norm": 2.8004493713378906, + "learning_rate": 4.041664967936958e-06, + "loss": 0.4901, + "step": 3732 + }, + { + "epoch": 1.7650118203309693, + "grad_norm": 2.944589138031006, + "learning_rate": 4.041173827809745e-06, + "loss": 0.5572, + "step": 3733 + }, + { + "epoch": 1.76548463356974, + "grad_norm": 2.5021605491638184, + "learning_rate": 4.040682591720397e-06, + "loss": 0.4637, + "step": 3734 + }, + { + "epoch": 1.7659574468085106, + "grad_norm": 2.448030948638916, + "learning_rate": 4.040191259699497e-06, + "loss": 0.4785, + "step": 3735 + }, + { + "epoch": 1.7664302600472812, + "grad_norm": 2.7171032428741455, + "learning_rate": 4.039699831777643e-06, + "loss": 0.4919, + "step": 3736 + }, + { + "epoch": 1.766903073286052, + "grad_norm": 2.453118324279785, + "learning_rate": 4.03920830798543e-06, + "loss": 0.4326, + "step": 3737 + }, + { + "epoch": 1.7673758865248228, + "grad_norm": 3.112877368927002, + "learning_rate": 4.038716688353466e-06, + "loss": 0.5375, + "step": 3738 + }, + { + "epoch": 1.7678486997635934, + "grad_norm": 2.742239236831665, + "learning_rate": 4.038224972912361e-06, + "loss": 0.5267, + "step": 3739 + }, + { + "epoch": 1.768321513002364, + "grad_norm": 2.544785737991333, + "learning_rate": 4.037733161692731e-06, + "loss": 0.5032, + "step": 3740 + }, + { + "epoch": 1.7687943262411348, + "grad_norm": 2.4639062881469727, + "learning_rate": 4.037241254725201e-06, + "loss": 0.5532, + "step": 3741 + }, + { + "epoch": 1.7692671394799055, + "grad_norm": 2.866290330886841, + "learning_rate": 4.036749252040398e-06, + "loss": 0.5503, + "step": 3742 + }, + { + "epoch": 1.7697399527186761, + "grad_norm": 2.3466262817382812, + "learning_rate": 4.0362571536689575e-06, + "loss": 0.5286, + "step": 3743 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 2.246464967727661, + "learning_rate": 4.03576495964152e-06, + "loss": 0.4656, + "step": 3744 + }, + { + "epoch": 1.7706855791962175, + "grad_norm": 2.667558431625366, + "learning_rate": 4.035272669988733e-06, + "loss": 0.5205, + "step": 3745 + }, + { + "epoch": 1.7711583924349883, + "grad_norm": 2.974666118621826, + "learning_rate": 4.034780284741249e-06, + "loss": 0.6007, + "step": 3746 + }, + { + "epoch": 1.7716312056737589, + "grad_norm": 2.7164433002471924, + "learning_rate": 4.034287803929726e-06, + "loss": 0.4913, + "step": 3747 + }, + { + "epoch": 1.7721040189125294, + "grad_norm": 2.5923962593078613, + "learning_rate": 4.033795227584829e-06, + "loss": 0.5275, + "step": 3748 + }, + { + "epoch": 1.7725768321513002, + "grad_norm": 2.606027126312256, + "learning_rate": 4.033302555737229e-06, + "loss": 0.4869, + "step": 3749 + }, + { + "epoch": 1.773049645390071, + "grad_norm": 3.0110089778900146, + "learning_rate": 4.032809788417602e-06, + "loss": 0.4956, + "step": 3750 + }, + { + "epoch": 1.7735224586288416, + "grad_norm": 3.004598617553711, + "learning_rate": 4.032316925656632e-06, + "loss": 0.5159, + "step": 3751 + }, + { + "epoch": 1.7739952718676122, + "grad_norm": 2.731539249420166, + "learning_rate": 4.031823967485005e-06, + "loss": 0.5237, + "step": 3752 + }, + { + "epoch": 1.774468085106383, + "grad_norm": 2.7466373443603516, + "learning_rate": 4.0313309139334155e-06, + "loss": 0.4948, + "step": 3753 + }, + { + "epoch": 1.7749408983451538, + "grad_norm": 2.8596460819244385, + "learning_rate": 4.030837765032565e-06, + "loss": 0.5016, + "step": 3754 + }, + { + "epoch": 1.7754137115839244, + 
"grad_norm": 3.2886788845062256, + "learning_rate": 4.03034452081316e-06, + "loss": 0.5377, + "step": 3755 + }, + { + "epoch": 1.775886524822695, + "grad_norm": 2.5629258155822754, + "learning_rate": 4.029851181305912e-06, + "loss": 0.519, + "step": 3756 + }, + { + "epoch": 1.7763593380614657, + "grad_norm": 2.5988714694976807, + "learning_rate": 4.029357746541539e-06, + "loss": 0.5521, + "step": 3757 + }, + { + "epoch": 1.7768321513002365, + "grad_norm": 2.987884759902954, + "learning_rate": 4.028864216550765e-06, + "loss": 0.6225, + "step": 3758 + }, + { + "epoch": 1.777304964539007, + "grad_norm": 2.6875851154327393, + "learning_rate": 4.02837059136432e-06, + "loss": 0.5321, + "step": 3759 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.6414570808410645, + "learning_rate": 4.02787687101294e-06, + "loss": 0.4831, + "step": 3760 + }, + { + "epoch": 1.7782505910165485, + "grad_norm": 2.581475019454956, + "learning_rate": 4.027383055527368e-06, + "loss": 0.5204, + "step": 3761 + }, + { + "epoch": 1.7787234042553193, + "grad_norm": 2.811298131942749, + "learning_rate": 4.026889144938349e-06, + "loss": 0.5486, + "step": 3762 + }, + { + "epoch": 1.7791962174940898, + "grad_norm": 3.1589081287384033, + "learning_rate": 4.026395139276639e-06, + "loss": 0.4979, + "step": 3763 + }, + { + "epoch": 1.7796690307328604, + "grad_norm": 2.3773093223571777, + "learning_rate": 4.025901038572996e-06, + "loss": 0.503, + "step": 3764 + }, + { + "epoch": 1.7801418439716312, + "grad_norm": 2.962541341781616, + "learning_rate": 4.025406842858187e-06, + "loss": 0.4613, + "step": 3765 + }, + { + "epoch": 1.780614657210402, + "grad_norm": 2.603092908859253, + "learning_rate": 4.024912552162982e-06, + "loss": 0.5142, + "step": 3766 + }, + { + "epoch": 1.7810874704491726, + "grad_norm": 2.648927927017212, + "learning_rate": 4.024418166518159e-06, + "loss": 0.4491, + "step": 3767 + }, + { + "epoch": 1.7815602836879432, + "grad_norm": 3.3239917755126953, + "learning_rate": 4.023923685954502e-06, + "loss": 0.6272, + "step": 3768 + }, + { + "epoch": 1.782033096926714, + "grad_norm": 2.672821283340454, + "learning_rate": 4.023429110502798e-06, + "loss": 0.5171, + "step": 3769 + }, + { + "epoch": 1.7825059101654848, + "grad_norm": 2.364332437515259, + "learning_rate": 4.022934440193844e-06, + "loss": 0.4513, + "step": 3770 + }, + { + "epoch": 1.7829787234042553, + "grad_norm": 3.03108549118042, + "learning_rate": 4.022439675058441e-06, + "loss": 0.4324, + "step": 3771 + }, + { + "epoch": 1.783451536643026, + "grad_norm": 2.647557020187378, + "learning_rate": 4.021944815127393e-06, + "loss": 0.5162, + "step": 3772 + }, + { + "epoch": 1.7839243498817967, + "grad_norm": 2.4111907482147217, + "learning_rate": 4.021449860431517e-06, + "loss": 0.4712, + "step": 3773 + }, + { + "epoch": 1.7843971631205675, + "grad_norm": 2.796175718307495, + "learning_rate": 4.020954811001629e-06, + "loss": 0.5131, + "step": 3774 + }, + { + "epoch": 1.784869976359338, + "grad_norm": 2.4594924449920654, + "learning_rate": 4.020459666868553e-06, + "loss": 0.4739, + "step": 3775 + }, + { + "epoch": 1.7853427895981087, + "grad_norm": 2.5735671520233154, + "learning_rate": 4.0199644280631215e-06, + "loss": 0.4716, + "step": 3776 + }, + { + "epoch": 1.7858156028368795, + "grad_norm": 2.419990062713623, + "learning_rate": 4.01946909461617e-06, + "loss": 0.4866, + "step": 3777 + }, + { + "epoch": 1.7862884160756503, + "grad_norm": 2.5597951412200928, + "learning_rate": 4.01897366655854e-06, + "loss": 0.5569, + "step": 3778 + }, + { + "epoch": 
1.7867612293144208, + "grad_norm": 2.462383985519409, + "learning_rate": 4.018478143921081e-06, + "loss": 0.4588, + "step": 3779 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 2.536701202392578, + "learning_rate": 4.017982526734646e-06, + "loss": 0.5278, + "step": 3780 + }, + { + "epoch": 1.7877068557919622, + "grad_norm": 2.691077470779419, + "learning_rate": 4.017486815030095e-06, + "loss": 0.4815, + "step": 3781 + }, + { + "epoch": 1.788179669030733, + "grad_norm": 2.4277288913726807, + "learning_rate": 4.016991008838294e-06, + "loss": 0.4877, + "step": 3782 + }, + { + "epoch": 1.7886524822695036, + "grad_norm": 2.6740009784698486, + "learning_rate": 4.016495108190115e-06, + "loss": 0.572, + "step": 3783 + }, + { + "epoch": 1.7891252955082741, + "grad_norm": 3.179232120513916, + "learning_rate": 4.0159991131164355e-06, + "loss": 0.4821, + "step": 3784 + }, + { + "epoch": 1.789598108747045, + "grad_norm": 3.2747793197631836, + "learning_rate": 4.015503023648138e-06, + "loss": 0.5517, + "step": 3785 + }, + { + "epoch": 1.7900709219858157, + "grad_norm": 2.671367645263672, + "learning_rate": 4.015006839816113e-06, + "loss": 0.5158, + "step": 3786 + }, + { + "epoch": 1.7905437352245863, + "grad_norm": 2.6600193977355957, + "learning_rate": 4.014510561651256e-06, + "loss": 0.535, + "step": 3787 + }, + { + "epoch": 1.791016548463357, + "grad_norm": 2.481509208679199, + "learning_rate": 4.014014189184466e-06, + "loss": 0.5596, + "step": 3788 + }, + { + "epoch": 1.7914893617021277, + "grad_norm": 2.759816884994507, + "learning_rate": 4.013517722446652e-06, + "loss": 0.5201, + "step": 3789 + }, + { + "epoch": 1.7919621749408985, + "grad_norm": 2.6913561820983887, + "learning_rate": 4.013021161468724e-06, + "loss": 0.5758, + "step": 3790 + }, + { + "epoch": 1.792434988179669, + "grad_norm": 2.775087594985962, + "learning_rate": 4.0125245062816044e-06, + "loss": 0.499, + "step": 3791 + }, + { + "epoch": 1.7929078014184396, + "grad_norm": 2.6134777069091797, + "learning_rate": 4.012027756916216e-06, + "loss": 0.5659, + "step": 3792 + }, + { + "epoch": 1.7933806146572104, + "grad_norm": 2.7109756469726562, + "learning_rate": 4.0115309134034895e-06, + "loss": 0.5337, + "step": 3793 + }, + { + "epoch": 1.7938534278959812, + "grad_norm": 2.5389950275421143, + "learning_rate": 4.0110339757743595e-06, + "loss": 0.4501, + "step": 3794 + }, + { + "epoch": 1.7943262411347518, + "grad_norm": 2.634648561477661, + "learning_rate": 4.010536944059771e-06, + "loss": 0.4411, + "step": 3795 + }, + { + "epoch": 1.7947990543735224, + "grad_norm": 2.527070999145508, + "learning_rate": 4.0100398182906695e-06, + "loss": 0.5145, + "step": 3796 + }, + { + "epoch": 1.7952718676122932, + "grad_norm": 2.62988543510437, + "learning_rate": 4.0095425984980105e-06, + "loss": 0.4981, + "step": 3797 + }, + { + "epoch": 1.795744680851064, + "grad_norm": 2.6032519340515137, + "learning_rate": 4.009045284712752e-06, + "loss": 0.453, + "step": 3798 + }, + { + "epoch": 1.7962174940898346, + "grad_norm": 2.735173463821411, + "learning_rate": 4.008547876965863e-06, + "loss": 0.5925, + "step": 3799 + }, + { + "epoch": 1.7966903073286051, + "grad_norm": 2.6296730041503906, + "learning_rate": 4.00805037528831e-06, + "loss": 0.5651, + "step": 3800 + }, + { + "epoch": 1.797163120567376, + "grad_norm": 2.641214370727539, + "learning_rate": 4.0075527797110735e-06, + "loss": 0.4973, + "step": 3801 + }, + { + "epoch": 1.7976359338061467, + "grad_norm": 2.6104819774627686, + "learning_rate": 4.007055090265136e-06, + "loss": 0.4432, + 
"step": 3802 + }, + { + "epoch": 1.7981087470449173, + "grad_norm": 2.8200619220733643, + "learning_rate": 4.0065573069814865e-06, + "loss": 0.4899, + "step": 3803 + }, + { + "epoch": 1.7985815602836879, + "grad_norm": 2.982354164123535, + "learning_rate": 4.006059429891119e-06, + "loss": 0.5488, + "step": 3804 + }, + { + "epoch": 1.7990543735224587, + "grad_norm": 2.7561678886413574, + "learning_rate": 4.005561459025034e-06, + "loss": 0.5637, + "step": 3805 + }, + { + "epoch": 1.7995271867612295, + "grad_norm": 2.702212333679199, + "learning_rate": 4.005063394414241e-06, + "loss": 0.4804, + "step": 3806 + }, + { + "epoch": 1.8, + "grad_norm": 2.8655319213867188, + "learning_rate": 4.004565236089748e-06, + "loss": 0.5759, + "step": 3807 + }, + { + "epoch": 1.8004728132387706, + "grad_norm": 2.703676223754883, + "learning_rate": 4.0040669840825756e-06, + "loss": 0.4728, + "step": 3808 + }, + { + "epoch": 1.8009456264775414, + "grad_norm": 2.802645683288574, + "learning_rate": 4.003568638423747e-06, + "loss": 0.5421, + "step": 3809 + }, + { + "epoch": 1.8014184397163122, + "grad_norm": 2.4723124504089355, + "learning_rate": 4.003070199144292e-06, + "loss": 0.4944, + "step": 3810 + }, + { + "epoch": 1.8018912529550828, + "grad_norm": 2.4889068603515625, + "learning_rate": 4.0025716662752475e-06, + "loss": 0.4774, + "step": 3811 + }, + { + "epoch": 1.8023640661938534, + "grad_norm": 2.5408077239990234, + "learning_rate": 4.002073039847653e-06, + "loss": 0.5233, + "step": 3812 + }, + { + "epoch": 1.8028368794326242, + "grad_norm": 2.734602689743042, + "learning_rate": 4.001574319892557e-06, + "loss": 0.5403, + "step": 3813 + }, + { + "epoch": 1.803309692671395, + "grad_norm": 3.3786163330078125, + "learning_rate": 4.001075506441012e-06, + "loss": 0.6969, + "step": 3814 + }, + { + "epoch": 1.8037825059101655, + "grad_norm": 2.7375378608703613, + "learning_rate": 4.000576599524078e-06, + "loss": 0.4907, + "step": 3815 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 3.041804075241089, + "learning_rate": 4.000077599172818e-06, + "loss": 0.6021, + "step": 3816 + }, + { + "epoch": 1.804728132387707, + "grad_norm": 2.697599411010742, + "learning_rate": 3.999578505418305e-06, + "loss": 0.4743, + "step": 3817 + }, + { + "epoch": 1.8052009456264777, + "grad_norm": 2.276921272277832, + "learning_rate": 3.999079318291612e-06, + "loss": 0.4885, + "step": 3818 + }, + { + "epoch": 1.8056737588652483, + "grad_norm": 2.4896953105926514, + "learning_rate": 3.998580037823825e-06, + "loss": 0.503, + "step": 3819 + }, + { + "epoch": 1.8061465721040189, + "grad_norm": 2.6232175827026367, + "learning_rate": 3.998080664046029e-06, + "loss": 0.5058, + "step": 3820 + }, + { + "epoch": 1.8066193853427897, + "grad_norm": 2.695861339569092, + "learning_rate": 3.997581196989319e-06, + "loss": 0.4949, + "step": 3821 + }, + { + "epoch": 1.8070921985815604, + "grad_norm": 2.912886142730713, + "learning_rate": 3.997081636684795e-06, + "loss": 0.4971, + "step": 3822 + }, + { + "epoch": 1.807565011820331, + "grad_norm": 2.876500368118286, + "learning_rate": 3.996581983163561e-06, + "loss": 0.5584, + "step": 3823 + }, + { + "epoch": 1.8080378250591016, + "grad_norm": 2.857069730758667, + "learning_rate": 3.99608223645673e-06, + "loss": 0.5457, + "step": 3824 + }, + { + "epoch": 1.8085106382978724, + "grad_norm": 2.486743211746216, + "learning_rate": 3.995582396595419e-06, + "loss": 0.5291, + "step": 3825 + }, + { + "epoch": 1.808983451536643, + "grad_norm": 2.509441375732422, + "learning_rate": 3.9950824636107486e-06, + 
"loss": 0.4747, + "step": 3826 + }, + { + "epoch": 1.8094562647754135, + "grad_norm": 2.931394100189209, + "learning_rate": 3.99458243753385e-06, + "loss": 0.5116, + "step": 3827 + }, + { + "epoch": 1.8099290780141843, + "grad_norm": 2.4868650436401367, + "learning_rate": 3.994082318395856e-06, + "loss": 0.4671, + "step": 3828 + }, + { + "epoch": 1.8104018912529551, + "grad_norm": 2.5554752349853516, + "learning_rate": 3.993582106227907e-06, + "loss": 0.4969, + "step": 3829 + }, + { + "epoch": 1.8108747044917257, + "grad_norm": 2.8367133140563965, + "learning_rate": 3.99308180106115e-06, + "loss": 0.5507, + "step": 3830 + }, + { + "epoch": 1.8113475177304963, + "grad_norm": 2.68245792388916, + "learning_rate": 3.992581402926737e-06, + "loss": 0.5115, + "step": 3831 + }, + { + "epoch": 1.811820330969267, + "grad_norm": 2.406674385070801, + "learning_rate": 3.992080911855824e-06, + "loss": 0.545, + "step": 3832 + }, + { + "epoch": 1.8122931442080379, + "grad_norm": 2.5003464221954346, + "learning_rate": 3.991580327879575e-06, + "loss": 0.4331, + "step": 3833 + }, + { + "epoch": 1.8127659574468085, + "grad_norm": 2.49320912361145, + "learning_rate": 3.99107965102916e-06, + "loss": 0.5118, + "step": 3834 + }, + { + "epoch": 1.813238770685579, + "grad_norm": 2.6183295249938965, + "learning_rate": 3.990578881335752e-06, + "loss": 0.5286, + "step": 3835 + }, + { + "epoch": 1.8137115839243498, + "grad_norm": 3.1999518871307373, + "learning_rate": 3.990078018830534e-06, + "loss": 0.5048, + "step": 3836 + }, + { + "epoch": 1.8141843971631206, + "grad_norm": 2.4351117610931396, + "learning_rate": 3.9895770635446915e-06, + "loss": 0.514, + "step": 3837 + }, + { + "epoch": 1.8146572104018912, + "grad_norm": 2.6859259605407715, + "learning_rate": 3.989076015509416e-06, + "loss": 0.5575, + "step": 3838 + }, + { + "epoch": 1.8151300236406618, + "grad_norm": 2.790421962738037, + "learning_rate": 3.988574874755909e-06, + "loss": 0.5467, + "step": 3839 + }, + { + "epoch": 1.8156028368794326, + "grad_norm": 2.5202765464782715, + "learning_rate": 3.988073641315369e-06, + "loss": 0.5229, + "step": 3840 + }, + { + "epoch": 1.8160756501182034, + "grad_norm": 2.623652219772339, + "learning_rate": 3.987572315219009e-06, + "loss": 0.509, + "step": 3841 + }, + { + "epoch": 1.816548463356974, + "grad_norm": 2.6038360595703125, + "learning_rate": 3.987070896498044e-06, + "loss": 0.5304, + "step": 3842 + }, + { + "epoch": 1.8170212765957445, + "grad_norm": 2.9378011226654053, + "learning_rate": 3.9865693851836955e-06, + "loss": 0.5845, + "step": 3843 + }, + { + "epoch": 1.8174940898345153, + "grad_norm": 2.4061124324798584, + "learning_rate": 3.98606778130719e-06, + "loss": 0.4333, + "step": 3844 + }, + { + "epoch": 1.8179669030732861, + "grad_norm": 2.483489751815796, + "learning_rate": 3.985566084899759e-06, + "loss": 0.4827, + "step": 3845 + }, + { + "epoch": 1.8184397163120567, + "grad_norm": 2.7774932384490967, + "learning_rate": 3.985064295992642e-06, + "loss": 0.5016, + "step": 3846 + }, + { + "epoch": 1.8189125295508273, + "grad_norm": 2.5936765670776367, + "learning_rate": 3.984562414617083e-06, + "loss": 0.4448, + "step": 3847 + }, + { + "epoch": 1.819385342789598, + "grad_norm": 2.8608627319335938, + "learning_rate": 3.9840604408043325e-06, + "loss": 0.5735, + "step": 3848 + }, + { + "epoch": 1.8198581560283689, + "grad_norm": 2.6212472915649414, + "learning_rate": 3.983558374585646e-06, + "loss": 0.5091, + "step": 3849 + }, + { + "epoch": 1.8203309692671394, + "grad_norm": 2.832460641860962, + 
"learning_rate": 3.983056215992284e-06, + "loss": 0.5169, + "step": 3850 + }, + { + "epoch": 1.82080378250591, + "grad_norm": 2.5293610095977783, + "learning_rate": 3.982553965055514e-06, + "loss": 0.4708, + "step": 3851 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 2.9362871646881104, + "learning_rate": 3.982051621806611e-06, + "loss": 0.575, + "step": 3852 + }, + { + "epoch": 1.8217494089834516, + "grad_norm": 2.69073486328125, + "learning_rate": 3.98154918627685e-06, + "loss": 0.5278, + "step": 3853 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 2.6711034774780273, + "learning_rate": 3.98104665849752e-06, + "loss": 0.4918, + "step": 3854 + }, + { + "epoch": 1.8226950354609928, + "grad_norm": 2.571110963821411, + "learning_rate": 3.980544038499907e-06, + "loss": 0.5234, + "step": 3855 + }, + { + "epoch": 1.8231678486997636, + "grad_norm": 3.2603371143341064, + "learning_rate": 3.980041326315309e-06, + "loss": 0.5996, + "step": 3856 + }, + { + "epoch": 1.8236406619385344, + "grad_norm": 2.8472323417663574, + "learning_rate": 3.979538521975028e-06, + "loss": 0.4769, + "step": 3857 + }, + { + "epoch": 1.824113475177305, + "grad_norm": 2.6714751720428467, + "learning_rate": 3.979035625510371e-06, + "loss": 0.4826, + "step": 3858 + }, + { + "epoch": 1.8245862884160755, + "grad_norm": 2.6816468238830566, + "learning_rate": 3.97853263695265e-06, + "loss": 0.5127, + "step": 3859 + }, + { + "epoch": 1.8250591016548463, + "grad_norm": 2.6464123725891113, + "learning_rate": 3.978029556333185e-06, + "loss": 0.4925, + "step": 3860 + }, + { + "epoch": 1.825531914893617, + "grad_norm": 2.5317227840423584, + "learning_rate": 3.977526383683301e-06, + "loss": 0.4765, + "step": 3861 + }, + { + "epoch": 1.8260047281323877, + "grad_norm": 2.5052425861358643, + "learning_rate": 3.977023119034328e-06, + "loss": 0.4804, + "step": 3862 + }, + { + "epoch": 1.8264775413711583, + "grad_norm": 2.7022836208343506, + "learning_rate": 3.976519762417602e-06, + "loss": 0.4824, + "step": 3863 + }, + { + "epoch": 1.826950354609929, + "grad_norm": 2.7445900440216064, + "learning_rate": 3.976016313864464e-06, + "loss": 0.5698, + "step": 3864 + }, + { + "epoch": 1.8274231678486998, + "grad_norm": 2.442518711090088, + "learning_rate": 3.975512773406262e-06, + "loss": 0.5133, + "step": 3865 + }, + { + "epoch": 1.8278959810874704, + "grad_norm": 2.4100050926208496, + "learning_rate": 3.975009141074351e-06, + "loss": 0.5044, + "step": 3866 + }, + { + "epoch": 1.828368794326241, + "grad_norm": 2.9507648944854736, + "learning_rate": 3.974505416900088e-06, + "loss": 0.5367, + "step": 3867 + }, + { + "epoch": 1.8288416075650118, + "grad_norm": 2.5662600994110107, + "learning_rate": 3.974001600914837e-06, + "loss": 0.5878, + "step": 3868 + }, + { + "epoch": 1.8293144208037826, + "grad_norm": 2.4306657314300537, + "learning_rate": 3.973497693149971e-06, + "loss": 0.4647, + "step": 3869 + }, + { + "epoch": 1.8297872340425532, + "grad_norm": 2.974686622619629, + "learning_rate": 3.972993693636864e-06, + "loss": 0.4911, + "step": 3870 + }, + { + "epoch": 1.8302600472813237, + "grad_norm": 2.5711987018585205, + "learning_rate": 3.972489602406899e-06, + "loss": 0.5089, + "step": 3871 + }, + { + "epoch": 1.8307328605200945, + "grad_norm": 3.259617328643799, + "learning_rate": 3.971985419491463e-06, + "loss": 0.5966, + "step": 3872 + }, + { + "epoch": 1.8312056737588653, + "grad_norm": 2.7437000274658203, + "learning_rate": 3.971481144921949e-06, + "loss": 0.5097, + "step": 3873 + }, + { + "epoch": 1.831678486997636, + 
"grad_norm": 2.9597461223602295, + "learning_rate": 3.970976778729757e-06, + "loss": 0.5672, + "step": 3874 + }, + { + "epoch": 1.8321513002364065, + "grad_norm": 2.5775723457336426, + "learning_rate": 3.970472320946291e-06, + "loss": 0.4749, + "step": 3875 + }, + { + "epoch": 1.8326241134751773, + "grad_norm": 2.7381200790405273, + "learning_rate": 3.969967771602961e-06, + "loss": 0.5255, + "step": 3876 + }, + { + "epoch": 1.833096926713948, + "grad_norm": 2.651698350906372, + "learning_rate": 3.969463130731183e-06, + "loss": 0.5098, + "step": 3877 + }, + { + "epoch": 1.8335697399527187, + "grad_norm": 2.7277021408081055, + "learning_rate": 3.968958398362381e-06, + "loss": 0.5251, + "step": 3878 + }, + { + "epoch": 1.8340425531914892, + "grad_norm": 2.5184953212738037, + "learning_rate": 3.968453574527978e-06, + "loss": 0.5086, + "step": 3879 + }, + { + "epoch": 1.83451536643026, + "grad_norm": 2.8227882385253906, + "learning_rate": 3.967948659259412e-06, + "loss": 0.5742, + "step": 3880 + }, + { + "epoch": 1.8349881796690308, + "grad_norm": 2.547922134399414, + "learning_rate": 3.967443652588119e-06, + "loss": 0.5411, + "step": 3881 + }, + { + "epoch": 1.8354609929078014, + "grad_norm": 2.6572835445404053, + "learning_rate": 3.966938554545545e-06, + "loss": 0.4854, + "step": 3882 + }, + { + "epoch": 1.835933806146572, + "grad_norm": 2.9416658878326416, + "learning_rate": 3.966433365163139e-06, + "loss": 0.5236, + "step": 3883 + }, + { + "epoch": 1.8364066193853428, + "grad_norm": 2.344325304031372, + "learning_rate": 3.965928084472357e-06, + "loss": 0.4916, + "step": 3884 + }, + { + "epoch": 1.8368794326241136, + "grad_norm": 2.890418291091919, + "learning_rate": 3.965422712504662e-06, + "loss": 0.5287, + "step": 3885 + }, + { + "epoch": 1.8373522458628841, + "grad_norm": 2.6063363552093506, + "learning_rate": 3.96491724929152e-06, + "loss": 0.4842, + "step": 3886 + }, + { + "epoch": 1.8378250591016547, + "grad_norm": 2.5582427978515625, + "learning_rate": 3.964411694864404e-06, + "loss": 0.4768, + "step": 3887 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 2.84356951713562, + "learning_rate": 3.963906049254793e-06, + "loss": 0.5284, + "step": 3888 + }, + { + "epoch": 1.8387706855791963, + "grad_norm": 2.7048516273498535, + "learning_rate": 3.963400312494172e-06, + "loss": 0.5271, + "step": 3889 + }, + { + "epoch": 1.839243498817967, + "grad_norm": 2.5401699542999268, + "learning_rate": 3.962894484614031e-06, + "loss": 0.4734, + "step": 3890 + }, + { + "epoch": 1.8397163120567375, + "grad_norm": 2.208256244659424, + "learning_rate": 3.962388565645864e-06, + "loss": 0.4113, + "step": 3891 + }, + { + "epoch": 1.8401891252955083, + "grad_norm": 2.775139331817627, + "learning_rate": 3.961882555621173e-06, + "loss": 0.5172, + "step": 3892 + }, + { + "epoch": 1.840661938534279, + "grad_norm": 2.7540855407714844, + "learning_rate": 3.961376454571466e-06, + "loss": 0.5252, + "step": 3893 + }, + { + "epoch": 1.8411347517730496, + "grad_norm": 2.6731574535369873, + "learning_rate": 3.960870262528255e-06, + "loss": 0.4495, + "step": 3894 + }, + { + "epoch": 1.8416075650118202, + "grad_norm": 2.791492223739624, + "learning_rate": 3.960363979523058e-06, + "loss": 0.5457, + "step": 3895 + }, + { + "epoch": 1.842080378250591, + "grad_norm": 2.9280290603637695, + "learning_rate": 3.959857605587401e-06, + "loss": 0.5373, + "step": 3896 + }, + { + "epoch": 1.8425531914893618, + "grad_norm": 2.5652217864990234, + "learning_rate": 3.95935114075281e-06, + "loss": 0.5191, + "step": 3897 + }, + { + 
"epoch": 1.8430260047281324, + "grad_norm": 2.7297749519348145, + "learning_rate": 3.958844585050824e-06, + "loss": 0.5366, + "step": 3898 + }, + { + "epoch": 1.843498817966903, + "grad_norm": 2.5302982330322266, + "learning_rate": 3.958337938512983e-06, + "loss": 0.569, + "step": 3899 + }, + { + "epoch": 1.8439716312056738, + "grad_norm": 2.644777297973633, + "learning_rate": 3.957831201170832e-06, + "loss": 0.521, + "step": 3900 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 2.8375515937805176, + "learning_rate": 3.957324373055925e-06, + "loss": 0.573, + "step": 3901 + }, + { + "epoch": 1.8449172576832151, + "grad_norm": 2.512296676635742, + "learning_rate": 3.956817454199819e-06, + "loss": 0.5081, + "step": 3902 + }, + { + "epoch": 1.8453900709219857, + "grad_norm": 2.3662109375, + "learning_rate": 3.956310444634079e-06, + "loss": 0.4989, + "step": 3903 + }, + { + "epoch": 1.8458628841607565, + "grad_norm": 2.6849682331085205, + "learning_rate": 3.955803344390272e-06, + "loss": 0.5459, + "step": 3904 + }, + { + "epoch": 1.8463356973995273, + "grad_norm": 2.8364317417144775, + "learning_rate": 3.9552961534999756e-06, + "loss": 0.5704, + "step": 3905 + }, + { + "epoch": 1.8468085106382979, + "grad_norm": 2.6006948947906494, + "learning_rate": 3.954788871994768e-06, + "loss": 0.5696, + "step": 3906 + }, + { + "epoch": 1.8472813238770684, + "grad_norm": 2.558300018310547, + "learning_rate": 3.9542814999062375e-06, + "loss": 0.5047, + "step": 3907 + }, + { + "epoch": 1.8477541371158392, + "grad_norm": 2.6343321800231934, + "learning_rate": 3.953774037265974e-06, + "loss": 0.525, + "step": 3908 + }, + { + "epoch": 1.84822695035461, + "grad_norm": 2.5050008296966553, + "learning_rate": 3.953266484105576e-06, + "loss": 0.4867, + "step": 3909 + }, + { + "epoch": 1.8486997635933806, + "grad_norm": 2.3775103092193604, + "learning_rate": 3.952758840456647e-06, + "loss": 0.4349, + "step": 3910 + }, + { + "epoch": 1.8491725768321512, + "grad_norm": 2.508376359939575, + "learning_rate": 3.952251106350794e-06, + "loss": 0.539, + "step": 3911 + }, + { + "epoch": 1.849645390070922, + "grad_norm": 2.7403106689453125, + "learning_rate": 3.951743281819633e-06, + "loss": 0.4478, + "step": 3912 + }, + { + "epoch": 1.8501182033096928, + "grad_norm": 2.5332062244415283, + "learning_rate": 3.951235366894784e-06, + "loss": 0.4658, + "step": 3913 + }, + { + "epoch": 1.8505910165484634, + "grad_norm": 3.0137248039245605, + "learning_rate": 3.950727361607872e-06, + "loss": 0.5047, + "step": 3914 + }, + { + "epoch": 1.851063829787234, + "grad_norm": 2.5820653438568115, + "learning_rate": 3.950219265990528e-06, + "loss": 0.542, + "step": 3915 + }, + { + "epoch": 1.8515366430260047, + "grad_norm": 2.555133819580078, + "learning_rate": 3.949711080074389e-06, + "loss": 0.5253, + "step": 3916 + }, + { + "epoch": 1.8520094562647755, + "grad_norm": 2.876882791519165, + "learning_rate": 3.949202803891099e-06, + "loss": 0.5242, + "step": 3917 + }, + { + "epoch": 1.852482269503546, + "grad_norm": 2.5929203033447266, + "learning_rate": 3.948694437472305e-06, + "loss": 0.5358, + "step": 3918 + }, + { + "epoch": 1.8529550827423167, + "grad_norm": 2.468513250350952, + "learning_rate": 3.948185980849659e-06, + "loss": 0.5119, + "step": 3919 + }, + { + "epoch": 1.8534278959810875, + "grad_norm": 2.9259560108184814, + "learning_rate": 3.947677434054824e-06, + "loss": 0.4756, + "step": 3920 + }, + { + "epoch": 1.8539007092198583, + "grad_norm": 2.5247011184692383, + "learning_rate": 3.947168797119462e-06, + "loss": 0.4627, + 
"step": 3921 + }, + { + "epoch": 1.8543735224586289, + "grad_norm": 2.7396671772003174, + "learning_rate": 3.946660070075245e-06, + "loss": 0.5013, + "step": 3922 + }, + { + "epoch": 1.8548463356973994, + "grad_norm": 2.7059738636016846, + "learning_rate": 3.946151252953849e-06, + "loss": 0.5875, + "step": 3923 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 2.5638437271118164, + "learning_rate": 3.945642345786955e-06, + "loss": 0.5063, + "step": 3924 + }, + { + "epoch": 1.855791962174941, + "grad_norm": 2.6647839546203613, + "learning_rate": 3.945133348606251e-06, + "loss": 0.5421, + "step": 3925 + }, + { + "epoch": 1.8562647754137116, + "grad_norm": 3.7235286235809326, + "learning_rate": 3.944624261443431e-06, + "loss": 0.5958, + "step": 3926 + }, + { + "epoch": 1.8567375886524822, + "grad_norm": 2.769984245300293, + "learning_rate": 3.944115084330192e-06, + "loss": 0.5678, + "step": 3927 + }, + { + "epoch": 1.857210401891253, + "grad_norm": 2.567249059677124, + "learning_rate": 3.9436058172982395e-06, + "loss": 0.4767, + "step": 3928 + }, + { + "epoch": 1.8576832151300238, + "grad_norm": 2.6196048259735107, + "learning_rate": 3.943096460379283e-06, + "loss": 0.5345, + "step": 3929 + }, + { + "epoch": 1.8581560283687943, + "grad_norm": 2.5999555587768555, + "learning_rate": 3.942587013605037e-06, + "loss": 0.5482, + "step": 3930 + }, + { + "epoch": 1.858628841607565, + "grad_norm": 2.630387783050537, + "learning_rate": 3.942077477007224e-06, + "loss": 0.6023, + "step": 3931 + }, + { + "epoch": 1.8591016548463357, + "grad_norm": 2.543503761291504, + "learning_rate": 3.941567850617569e-06, + "loss": 0.5157, + "step": 3932 + }, + { + "epoch": 1.8595744680851065, + "grad_norm": 2.5109236240386963, + "learning_rate": 3.941058134467805e-06, + "loss": 0.4774, + "step": 3933 + }, + { + "epoch": 1.860047281323877, + "grad_norm": 2.5110230445861816, + "learning_rate": 3.94054832858967e-06, + "loss": 0.5064, + "step": 3934 + }, + { + "epoch": 1.8605200945626477, + "grad_norm": 2.4780776500701904, + "learning_rate": 3.940038433014908e-06, + "loss": 0.5216, + "step": 3935 + }, + { + "epoch": 1.8609929078014185, + "grad_norm": 2.4398856163024902, + "learning_rate": 3.939528447775266e-06, + "loss": 0.4958, + "step": 3936 + }, + { + "epoch": 1.8614657210401893, + "grad_norm": 2.449498176574707, + "learning_rate": 3.9390183729025e-06, + "loss": 0.5165, + "step": 3937 + }, + { + "epoch": 1.8619385342789598, + "grad_norm": 2.982544422149658, + "learning_rate": 3.938508208428371e-06, + "loss": 0.4803, + "step": 3938 + }, + { + "epoch": 1.8624113475177304, + "grad_norm": 2.6574015617370605, + "learning_rate": 3.937997954384641e-06, + "loss": 0.4797, + "step": 3939 + }, + { + "epoch": 1.8628841607565012, + "grad_norm": 2.7773542404174805, + "learning_rate": 3.937487610803086e-06, + "loss": 0.4843, + "step": 3940 + }, + { + "epoch": 1.863356973995272, + "grad_norm": 2.588937759399414, + "learning_rate": 3.9369771777154805e-06, + "loss": 0.5426, + "step": 3941 + }, + { + "epoch": 1.8638297872340426, + "grad_norm": 2.855442523956299, + "learning_rate": 3.936466655153607e-06, + "loss": 0.5443, + "step": 3942 + }, + { + "epoch": 1.8643026004728132, + "grad_norm": 2.554676055908203, + "learning_rate": 3.935956043149253e-06, + "loss": 0.5334, + "step": 3943 + }, + { + "epoch": 1.864775413711584, + "grad_norm": 2.901599884033203, + "learning_rate": 3.935445341734212e-06, + "loss": 0.5842, + "step": 3944 + }, + { + "epoch": 1.8652482269503547, + "grad_norm": 2.554485321044922, + "learning_rate": 
3.934934550940285e-06, + "loss": 0.4941, + "step": 3945 + }, + { + "epoch": 1.8657210401891253, + "grad_norm": 2.357203245162964, + "learning_rate": 3.934423670799275e-06, + "loss": 0.4402, + "step": 3946 + }, + { + "epoch": 1.866193853427896, + "grad_norm": 2.7036049365997314, + "learning_rate": 3.933912701342993e-06, + "loss": 0.4966, + "step": 3947 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 2.7817211151123047, + "learning_rate": 3.933401642603255e-06, + "loss": 0.4908, + "step": 3948 + }, + { + "epoch": 1.8671394799054375, + "grad_norm": 2.439490795135498, + "learning_rate": 3.932890494611882e-06, + "loss": 0.4322, + "step": 3949 + }, + { + "epoch": 1.867612293144208, + "grad_norm": 3.187152147293091, + "learning_rate": 3.9323792574007e-06, + "loss": 0.501, + "step": 3950 + }, + { + "epoch": 1.8680851063829786, + "grad_norm": 2.405773401260376, + "learning_rate": 3.931867931001543e-06, + "loss": 0.4477, + "step": 3951 + }, + { + "epoch": 1.8685579196217494, + "grad_norm": 2.4922525882720947, + "learning_rate": 3.931356515446248e-06, + "loss": 0.5098, + "step": 3952 + }, + { + "epoch": 1.8690307328605202, + "grad_norm": 2.7781267166137695, + "learning_rate": 3.93084501076666e-06, + "loss": 0.5815, + "step": 3953 + }, + { + "epoch": 1.8695035460992908, + "grad_norm": 2.74621844291687, + "learning_rate": 3.930333416994626e-06, + "loss": 0.5605, + "step": 3954 + }, + { + "epoch": 1.8699763593380614, + "grad_norm": 2.5527689456939697, + "learning_rate": 3.929821734162004e-06, + "loss": 0.5141, + "step": 3955 + }, + { + "epoch": 1.8704491725768322, + "grad_norm": 2.5730628967285156, + "learning_rate": 3.92930996230065e-06, + "loss": 0.5446, + "step": 3956 + }, + { + "epoch": 1.870921985815603, + "grad_norm": 2.7053353786468506, + "learning_rate": 3.9287981014424334e-06, + "loss": 0.4722, + "step": 3957 + }, + { + "epoch": 1.8713947990543736, + "grad_norm": 2.7591893672943115, + "learning_rate": 3.928286151619224e-06, + "loss": 0.509, + "step": 3958 + }, + { + "epoch": 1.8718676122931441, + "grad_norm": 2.6233739852905273, + "learning_rate": 3.927774112862898e-06, + "loss": 0.5266, + "step": 3959 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 2.7715370655059814, + "learning_rate": 3.9272619852053396e-06, + "loss": 0.5612, + "step": 3960 + }, + { + "epoch": 1.8728132387706857, + "grad_norm": 2.4815211296081543, + "learning_rate": 3.926749768678435e-06, + "loss": 0.5498, + "step": 3961 + }, + { + "epoch": 1.8732860520094563, + "grad_norm": 2.6819605827331543, + "learning_rate": 3.926237463314078e-06, + "loss": 0.5499, + "step": 3962 + }, + { + "epoch": 1.8737588652482269, + "grad_norm": 2.638664722442627, + "learning_rate": 3.925725069144168e-06, + "loss": 0.5429, + "step": 3963 + }, + { + "epoch": 1.8742316784869977, + "grad_norm": 2.527294874191284, + "learning_rate": 3.925212586200611e-06, + "loss": 0.5451, + "step": 3964 + }, + { + "epoch": 1.8747044917257685, + "grad_norm": 2.831638813018799, + "learning_rate": 3.924700014515315e-06, + "loss": 0.5276, + "step": 3965 + }, + { + "epoch": 1.875177304964539, + "grad_norm": 2.5906996726989746, + "learning_rate": 3.924187354120196e-06, + "loss": 0.5323, + "step": 3966 + }, + { + "epoch": 1.8756501182033096, + "grad_norm": 2.5482442378997803, + "learning_rate": 3.923674605047175e-06, + "loss": 0.4882, + "step": 3967 + }, + { + "epoch": 1.8761229314420804, + "grad_norm": 2.56402850151062, + "learning_rate": 3.923161767328179e-06, + "loss": 0.5111, + "step": 3968 + }, + { + "epoch": 1.8765957446808512, + "grad_norm": 
3.223782539367676, + "learning_rate": 3.9226488409951405e-06, + "loss": 0.5829, + "step": 3969 + }, + { + "epoch": 1.8770685579196218, + "grad_norm": 2.665964365005493, + "learning_rate": 3.922135826079997e-06, + "loss": 0.4739, + "step": 3970 + }, + { + "epoch": 1.8775413711583924, + "grad_norm": 2.602696418762207, + "learning_rate": 3.921622722614691e-06, + "loss": 0.5199, + "step": 3971 + }, + { + "epoch": 1.8780141843971632, + "grad_norm": 2.5384418964385986, + "learning_rate": 3.921109530631172e-06, + "loss": 0.5086, + "step": 3972 + }, + { + "epoch": 1.878486997635934, + "grad_norm": 2.7961080074310303, + "learning_rate": 3.920596250161394e-06, + "loss": 0.5454, + "step": 3973 + }, + { + "epoch": 1.8789598108747045, + "grad_norm": 3.022007465362549, + "learning_rate": 3.920082881237317e-06, + "loss": 0.5537, + "step": 3974 + }, + { + "epoch": 1.8794326241134751, + "grad_norm": 2.699885129928589, + "learning_rate": 3.9195694238909045e-06, + "loss": 0.5274, + "step": 3975 + }, + { + "epoch": 1.879905437352246, + "grad_norm": 2.3994593620300293, + "learning_rate": 3.919055878154129e-06, + "loss": 0.4134, + "step": 3976 + }, + { + "epoch": 1.8803782505910167, + "grad_norm": 4.093045711517334, + "learning_rate": 3.918542244058967e-06, + "loss": 0.5305, + "step": 3977 + }, + { + "epoch": 1.8808510638297873, + "grad_norm": 3.011643171310425, + "learning_rate": 3.9180285216374e-06, + "loss": 0.5481, + "step": 3978 + }, + { + "epoch": 1.8813238770685579, + "grad_norm": 2.6426854133605957, + "learning_rate": 3.917514710921414e-06, + "loss": 0.5415, + "step": 3979 + }, + { + "epoch": 1.8817966903073287, + "grad_norm": 2.4379019737243652, + "learning_rate": 3.917000811943002e-06, + "loss": 0.4566, + "step": 3980 + }, + { + "epoch": 1.8822695035460995, + "grad_norm": 3.18522047996521, + "learning_rate": 3.9164868247341634e-06, + "loss": 0.6079, + "step": 3981 + }, + { + "epoch": 1.88274231678487, + "grad_norm": 2.6451141834259033, + "learning_rate": 3.915972749326903e-06, + "loss": 0.515, + "step": 3982 + }, + { + "epoch": 1.8832151300236406, + "grad_norm": 2.565598726272583, + "learning_rate": 3.915458585753226e-06, + "loss": 0.4799, + "step": 3983 + }, + { + "epoch": 1.8836879432624114, + "grad_norm": 2.711651563644409, + "learning_rate": 3.91494433404515e-06, + "loss": 0.5595, + "step": 3984 + }, + { + "epoch": 1.8841607565011822, + "grad_norm": 2.749328851699829, + "learning_rate": 3.914429994234695e-06, + "loss": 0.495, + "step": 3985 + }, + { + "epoch": 1.8846335697399526, + "grad_norm": 2.9492287635803223, + "learning_rate": 3.913915566353886e-06, + "loss": 0.5683, + "step": 3986 + }, + { + "epoch": 1.8851063829787233, + "grad_norm": 3.07747745513916, + "learning_rate": 3.913401050434756e-06, + "loss": 0.4953, + "step": 3987 + }, + { + "epoch": 1.8855791962174941, + "grad_norm": 2.8746345043182373, + "learning_rate": 3.912886446509338e-06, + "loss": 0.4752, + "step": 3988 + }, + { + "epoch": 1.8860520094562647, + "grad_norm": 2.772954225540161, + "learning_rate": 3.912371754609677e-06, + "loss": 0.5473, + "step": 3989 + }, + { + "epoch": 1.8865248226950353, + "grad_norm": 2.8906044960021973, + "learning_rate": 3.911856974767821e-06, + "loss": 0.5285, + "step": 3990 + }, + { + "epoch": 1.886997635933806, + "grad_norm": 2.8992726802825928, + "learning_rate": 3.9113421070158206e-06, + "loss": 0.571, + "step": 3991 + }, + { + "epoch": 1.887470449172577, + "grad_norm": 2.624662160873413, + "learning_rate": 3.910827151385737e-06, + "loss": 0.5183, + "step": 3992 + }, + { + "epoch": 
1.8879432624113475, + "grad_norm": 2.4491732120513916, + "learning_rate": 3.910312107909632e-06, + "loss": 0.4205, + "step": 3993 + }, + { + "epoch": 1.888416075650118, + "grad_norm": 2.278259515762329, + "learning_rate": 3.909796976619575e-06, + "loss": 0.4464, + "step": 3994 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 2.6481523513793945, + "learning_rate": 3.909281757547644e-06, + "loss": 0.5023, + "step": 3995 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 2.6687493324279785, + "learning_rate": 3.908766450725917e-06, + "loss": 0.495, + "step": 3996 + }, + { + "epoch": 1.8898345153664302, + "grad_norm": 2.507525682449341, + "learning_rate": 3.908251056186481e-06, + "loss": 0.4155, + "step": 3997 + }, + { + "epoch": 1.8903073286052008, + "grad_norm": 2.7048323154449463, + "learning_rate": 3.907735573961426e-06, + "loss": 0.4601, + "step": 3998 + }, + { + "epoch": 1.8907801418439716, + "grad_norm": 2.6825389862060547, + "learning_rate": 3.907220004082848e-06, + "loss": 0.5067, + "step": 3999 + }, + { + "epoch": 1.8912529550827424, + "grad_norm": 2.775696039199829, + "learning_rate": 3.906704346582852e-06, + "loss": 0.5411, + "step": 4000 + }, + { + "epoch": 1.891725768321513, + "grad_norm": 2.4492077827453613, + "learning_rate": 3.906188601493545e-06, + "loss": 0.4931, + "step": 4001 + }, + { + "epoch": 1.8921985815602835, + "grad_norm": 2.320810556411743, + "learning_rate": 3.905672768847041e-06, + "loss": 0.4908, + "step": 4002 + }, + { + "epoch": 1.8926713947990543, + "grad_norm": 2.455162525177002, + "learning_rate": 3.905156848675455e-06, + "loss": 0.508, + "step": 4003 + }, + { + "epoch": 1.8931442080378251, + "grad_norm": 2.515921115875244, + "learning_rate": 3.904640841010915e-06, + "loss": 0.5318, + "step": 4004 + }, + { + "epoch": 1.8936170212765957, + "grad_norm": 2.7230770587921143, + "learning_rate": 3.904124745885548e-06, + "loss": 0.4793, + "step": 4005 + }, + { + "epoch": 1.8940898345153663, + "grad_norm": 2.519934892654419, + "learning_rate": 3.903608563331491e-06, + "loss": 0.5013, + "step": 4006 + }, + { + "epoch": 1.894562647754137, + "grad_norm": 2.719674587249756, + "learning_rate": 3.903092293380883e-06, + "loss": 0.516, + "step": 4007 + }, + { + "epoch": 1.8950354609929079, + "grad_norm": 3.2107343673706055, + "learning_rate": 3.902575936065869e-06, + "loss": 0.6297, + "step": 4008 + }, + { + "epoch": 1.8955082742316784, + "grad_norm": 2.9773149490356445, + "learning_rate": 3.902059491418603e-06, + "loss": 0.566, + "step": 4009 + }, + { + "epoch": 1.895981087470449, + "grad_norm": 2.6754770278930664, + "learning_rate": 3.90154295947124e-06, + "loss": 0.5187, + "step": 4010 + }, + { + "epoch": 1.8964539007092198, + "grad_norm": 2.457303762435913, + "learning_rate": 3.901026340255943e-06, + "loss": 0.5757, + "step": 4011 + }, + { + "epoch": 1.8969267139479906, + "grad_norm": 2.5944161415100098, + "learning_rate": 3.900509633804878e-06, + "loss": 0.5049, + "step": 4012 + }, + { + "epoch": 1.8973995271867612, + "grad_norm": 2.610445022583008, + "learning_rate": 3.89999284015022e-06, + "loss": 0.521, + "step": 4013 + }, + { + "epoch": 1.8978723404255318, + "grad_norm": 2.6949338912963867, + "learning_rate": 3.899475959324146e-06, + "loss": 0.5619, + "step": 4014 + }, + { + "epoch": 1.8983451536643026, + "grad_norm": 2.7889559268951416, + "learning_rate": 3.898958991358841e-06, + "loss": 0.5223, + "step": 4015 + }, + { + "epoch": 1.8988179669030734, + "grad_norm": 2.569265842437744, + "learning_rate": 3.898441936286493e-06, + "loss": 0.5724, + "step": 
4016 + }, + { + "epoch": 1.899290780141844, + "grad_norm": 2.3567774295806885, + "learning_rate": 3.897924794139299e-06, + "loss": 0.4784, + "step": 4017 + }, + { + "epoch": 1.8997635933806145, + "grad_norm": 2.9176526069641113, + "learning_rate": 3.897407564949457e-06, + "loss": 0.646, + "step": 4018 + }, + { + "epoch": 1.9002364066193853, + "grad_norm": 2.7870090007781982, + "learning_rate": 3.896890248749174e-06, + "loss": 0.4922, + "step": 4019 + }, + { + "epoch": 1.900709219858156, + "grad_norm": 2.8310980796813965, + "learning_rate": 3.89637284557066e-06, + "loss": 0.4746, + "step": 4020 + }, + { + "epoch": 1.9011820330969267, + "grad_norm": 2.434915542602539, + "learning_rate": 3.895855355446131e-06, + "loss": 0.4537, + "step": 4021 + }, + { + "epoch": 1.9016548463356973, + "grad_norm": 3.0547034740448, + "learning_rate": 3.89533777840781e-06, + "loss": 0.6161, + "step": 4022 + }, + { + "epoch": 1.902127659574468, + "grad_norm": 3.416774272918701, + "learning_rate": 3.894820114487925e-06, + "loss": 0.5448, + "step": 4023 + }, + { + "epoch": 1.9026004728132389, + "grad_norm": 2.606951951980591, + "learning_rate": 3.894302363718707e-06, + "loss": 0.5501, + "step": 4024 + }, + { + "epoch": 1.9030732860520094, + "grad_norm": 3.082165002822876, + "learning_rate": 3.8937845261323945e-06, + "loss": 0.6035, + "step": 4025 + }, + { + "epoch": 1.90354609929078, + "grad_norm": 2.616093397140503, + "learning_rate": 3.893266601761231e-06, + "loss": 0.5294, + "step": 4026 + }, + { + "epoch": 1.9040189125295508, + "grad_norm": 2.7141637802124023, + "learning_rate": 3.8927485906374654e-06, + "loss": 0.5481, + "step": 4027 + }, + { + "epoch": 1.9044917257683216, + "grad_norm": 2.5129404067993164, + "learning_rate": 3.892230492793352e-06, + "loss": 0.4958, + "step": 4028 + }, + { + "epoch": 1.9049645390070922, + "grad_norm": 2.703403949737549, + "learning_rate": 3.891712308261151e-06, + "loss": 0.4852, + "step": 4029 + }, + { + "epoch": 1.9054373522458627, + "grad_norm": 2.881058931350708, + "learning_rate": 3.891194037073127e-06, + "loss": 0.4662, + "step": 4030 + }, + { + "epoch": 1.9059101654846335, + "grad_norm": 3.216769218444824, + "learning_rate": 3.8906756792615505e-06, + "loss": 0.5076, + "step": 4031 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 2.442265748977661, + "learning_rate": 3.890157234858697e-06, + "loss": 0.4748, + "step": 4032 + }, + { + "epoch": 1.906855791962175, + "grad_norm": 3.088672399520874, + "learning_rate": 3.889638703896849e-06, + "loss": 0.5729, + "step": 4033 + }, + { + "epoch": 1.9073286052009455, + "grad_norm": 2.9304986000061035, + "learning_rate": 3.889120086408291e-06, + "loss": 0.603, + "step": 4034 + }, + { + "epoch": 1.9078014184397163, + "grad_norm": 2.686093807220459, + "learning_rate": 3.888601382425318e-06, + "loss": 0.4978, + "step": 4035 + }, + { + "epoch": 1.908274231678487, + "grad_norm": 2.5668389797210693, + "learning_rate": 3.888082591980225e-06, + "loss": 0.5086, + "step": 4036 + }, + { + "epoch": 1.9087470449172577, + "grad_norm": 2.530996561050415, + "learning_rate": 3.887563715105315e-06, + "loss": 0.4678, + "step": 4037 + }, + { + "epoch": 1.9092198581560282, + "grad_norm": 3.043342351913452, + "learning_rate": 3.887044751832897e-06, + "loss": 0.5452, + "step": 4038 + }, + { + "epoch": 1.909692671394799, + "grad_norm": 2.799734115600586, + "learning_rate": 3.886525702195284e-06, + "loss": 0.5265, + "step": 4039 + }, + { + "epoch": 1.9101654846335698, + "grad_norm": 2.890022039413452, + "learning_rate": 3.886006566224796e-06, + "loss": 
0.4634, + "step": 4040 + }, + { + "epoch": 1.9106382978723404, + "grad_norm": 2.6804237365722656, + "learning_rate": 3.8854873439537555e-06, + "loss": 0.5031, + "step": 4041 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 2.43038272857666, + "learning_rate": 3.884968035414495e-06, + "loss": 0.5098, + "step": 4042 + }, + { + "epoch": 1.9115839243498818, + "grad_norm": 2.589583396911621, + "learning_rate": 3.884448640639346e-06, + "loss": 0.498, + "step": 4043 + }, + { + "epoch": 1.9120567375886526, + "grad_norm": 2.4565231800079346, + "learning_rate": 3.8839291596606524e-06, + "loss": 0.4318, + "step": 4044 + }, + { + "epoch": 1.9125295508274232, + "grad_norm": 2.66762638092041, + "learning_rate": 3.8834095925107575e-06, + "loss": 0.5441, + "step": 4045 + }, + { + "epoch": 1.9130023640661937, + "grad_norm": 2.7334461212158203, + "learning_rate": 3.882889939222013e-06, + "loss": 0.5209, + "step": 4046 + }, + { + "epoch": 1.9134751773049645, + "grad_norm": 2.6398537158966064, + "learning_rate": 3.8823701998267765e-06, + "loss": 0.4874, + "step": 4047 + }, + { + "epoch": 1.9139479905437353, + "grad_norm": 2.82405161857605, + "learning_rate": 3.881850374357409e-06, + "loss": 0.4519, + "step": 4048 + }, + { + "epoch": 1.914420803782506, + "grad_norm": 2.7552523612976074, + "learning_rate": 3.8813304628462776e-06, + "loss": 0.547, + "step": 4049 + }, + { + "epoch": 1.9148936170212765, + "grad_norm": 2.5287928581237793, + "learning_rate": 3.880810465325755e-06, + "loss": 0.5226, + "step": 4050 + }, + { + "epoch": 1.9153664302600473, + "grad_norm": 2.7597358226776123, + "learning_rate": 3.88029038182822e-06, + "loss": 0.5171, + "step": 4051 + }, + { + "epoch": 1.915839243498818, + "grad_norm": 2.563899278640747, + "learning_rate": 3.879770212386055e-06, + "loss": 0.4911, + "step": 4052 + }, + { + "epoch": 1.9163120567375886, + "grad_norm": 2.499404191970825, + "learning_rate": 3.879249957031649e-06, + "loss": 0.5072, + "step": 4053 + }, + { + "epoch": 1.9167848699763592, + "grad_norm": 2.817713499069214, + "learning_rate": 3.878729615797396e-06, + "loss": 0.5452, + "step": 4054 + }, + { + "epoch": 1.91725768321513, + "grad_norm": 2.7152490615844727, + "learning_rate": 3.878209188715696e-06, + "loss": 0.4917, + "step": 4055 + }, + { + "epoch": 1.9177304964539008, + "grad_norm": 2.384265661239624, + "learning_rate": 3.877688675818953e-06, + "loss": 0.4823, + "step": 4056 + }, + { + "epoch": 1.9182033096926714, + "grad_norm": 2.61059308052063, + "learning_rate": 3.877168077139577e-06, + "loss": 0.478, + "step": 4057 + }, + { + "epoch": 1.918676122931442, + "grad_norm": 2.6107938289642334, + "learning_rate": 3.8766473927099824e-06, + "loss": 0.5202, + "step": 4058 + }, + { + "epoch": 1.9191489361702128, + "grad_norm": 2.2339766025543213, + "learning_rate": 3.876126622562592e-06, + "loss": 0.547, + "step": 4059 + }, + { + "epoch": 1.9196217494089836, + "grad_norm": 2.4324610233306885, + "learning_rate": 3.8756057667298304e-06, + "loss": 0.5333, + "step": 4060 + }, + { + "epoch": 1.9200945626477541, + "grad_norm": 2.5521230697631836, + "learning_rate": 3.875084825244131e-06, + "loss": 0.5503, + "step": 4061 + }, + { + "epoch": 1.9205673758865247, + "grad_norm": 2.6985747814178467, + "learning_rate": 3.874563798137928e-06, + "loss": 0.4944, + "step": 4062 + }, + { + "epoch": 1.9210401891252955, + "grad_norm": 2.422332525253296, + "learning_rate": 3.874042685443664e-06, + "loss": 0.4807, + "step": 4063 + }, + { + "epoch": 1.9215130023640663, + "grad_norm": 2.914553165435791, + "learning_rate": 
3.873521487193788e-06, + "loss": 0.4439, + "step": 4064 + }, + { + "epoch": 1.9219858156028369, + "grad_norm": 2.8098697662353516, + "learning_rate": 3.873000203420752e-06, + "loss": 0.5433, + "step": 4065 + }, + { + "epoch": 1.9224586288416075, + "grad_norm": 2.6124703884124756, + "learning_rate": 3.872478834157013e-06, + "loss": 0.4812, + "step": 4066 + }, + { + "epoch": 1.9229314420803783, + "grad_norm": 2.511059522628784, + "learning_rate": 3.871957379435035e-06, + "loss": 0.4666, + "step": 4067 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 2.950542688369751, + "learning_rate": 3.871435839287287e-06, + "loss": 0.5687, + "step": 4068 + }, + { + "epoch": 1.9238770685579196, + "grad_norm": 2.4969422817230225, + "learning_rate": 3.870914213746243e-06, + "loss": 0.5235, + "step": 4069 + }, + { + "epoch": 1.9243498817966902, + "grad_norm": 2.512152910232544, + "learning_rate": 3.870392502844382e-06, + "loss": 0.4524, + "step": 4070 + }, + { + "epoch": 1.924822695035461, + "grad_norm": 3.0212557315826416, + "learning_rate": 3.86987070661419e-06, + "loss": 0.4868, + "step": 4071 + }, + { + "epoch": 1.9252955082742318, + "grad_norm": 2.8949966430664062, + "learning_rate": 3.869348825088154e-06, + "loss": 0.5556, + "step": 4072 + }, + { + "epoch": 1.9257683215130024, + "grad_norm": 2.402043581008911, + "learning_rate": 3.868826858298772e-06, + "loss": 0.5307, + "step": 4073 + }, + { + "epoch": 1.926241134751773, + "grad_norm": 2.980992078781128, + "learning_rate": 3.868304806278543e-06, + "loss": 0.6313, + "step": 4074 + }, + { + "epoch": 1.9267139479905437, + "grad_norm": 2.7140514850616455, + "learning_rate": 3.867782669059975e-06, + "loss": 0.5359, + "step": 4075 + }, + { + "epoch": 1.9271867612293145, + "grad_norm": 2.499631643295288, + "learning_rate": 3.867260446675577e-06, + "loss": 0.4873, + "step": 4076 + }, + { + "epoch": 1.9276595744680851, + "grad_norm": 2.915583610534668, + "learning_rate": 3.866738139157866e-06, + "loss": 0.5736, + "step": 4077 + }, + { + "epoch": 1.9281323877068557, + "grad_norm": 2.4231131076812744, + "learning_rate": 3.866215746539363e-06, + "loss": 0.5096, + "step": 4078 + }, + { + "epoch": 1.9286052009456265, + "grad_norm": 2.360074996948242, + "learning_rate": 3.865693268852599e-06, + "loss": 0.4907, + "step": 4079 + }, + { + "epoch": 1.9290780141843973, + "grad_norm": 2.5410032272338867, + "learning_rate": 3.865170706130101e-06, + "loss": 0.473, + "step": 4080 + }, + { + "epoch": 1.9295508274231679, + "grad_norm": 2.780090808868408, + "learning_rate": 3.86464805840441e-06, + "loss": 0.5213, + "step": 4081 + }, + { + "epoch": 1.9300236406619384, + "grad_norm": 2.7318382263183594, + "learning_rate": 3.864125325708068e-06, + "loss": 0.5617, + "step": 4082 + }, + { + "epoch": 1.9304964539007092, + "grad_norm": 2.76509165763855, + "learning_rate": 3.863602508073623e-06, + "loss": 0.52, + "step": 4083 + }, + { + "epoch": 1.93096926713948, + "grad_norm": 2.8041110038757324, + "learning_rate": 3.863079605533631e-06, + "loss": 0.5343, + "step": 4084 + }, + { + "epoch": 1.9314420803782506, + "grad_norm": 2.4462404251098633, + "learning_rate": 3.862556618120647e-06, + "loss": 0.4657, + "step": 4085 + }, + { + "epoch": 1.9319148936170212, + "grad_norm": 2.460864305496216, + "learning_rate": 3.862033545867238e-06, + "loss": 0.517, + "step": 4086 + }, + { + "epoch": 1.932387706855792, + "grad_norm": 2.6480276584625244, + "learning_rate": 3.8615103888059715e-06, + "loss": 0.4702, + "step": 4087 + }, + { + "epoch": 1.9328605200945628, + "grad_norm": 
2.7175381183624268, + "learning_rate": 3.860987146969424e-06, + "loss": 0.5073, + "step": 4088 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 2.4963486194610596, + "learning_rate": 3.860463820390175e-06, + "loss": 0.4491, + "step": 4089 + }, + { + "epoch": 1.933806146572104, + "grad_norm": 2.548135757446289, + "learning_rate": 3.8599404091008075e-06, + "loss": 0.5134, + "step": 4090 + }, + { + "epoch": 1.9342789598108747, + "grad_norm": 2.8693668842315674, + "learning_rate": 3.859416913133916e-06, + "loss": 0.5467, + "step": 4091 + }, + { + "epoch": 1.9347517730496455, + "grad_norm": 2.711273670196533, + "learning_rate": 3.858893332522092e-06, + "loss": 0.6287, + "step": 4092 + }, + { + "epoch": 1.935224586288416, + "grad_norm": 2.8604533672332764, + "learning_rate": 3.858369667297941e-06, + "loss": 0.5661, + "step": 4093 + }, + { + "epoch": 1.9356973995271867, + "grad_norm": 2.936988353729248, + "learning_rate": 3.857845917494066e-06, + "loss": 0.5311, + "step": 4094 + }, + { + "epoch": 1.9361702127659575, + "grad_norm": 2.414093494415283, + "learning_rate": 3.857322083143079e-06, + "loss": 0.505, + "step": 4095 + }, + { + "epoch": 1.9366430260047283, + "grad_norm": 2.5528934001922607, + "learning_rate": 3.856798164277599e-06, + "loss": 0.4759, + "step": 4096 + }, + { + "epoch": 1.9371158392434988, + "grad_norm": 2.592893600463867, + "learning_rate": 3.8562741609302456e-06, + "loss": 0.4932, + "step": 4097 + }, + { + "epoch": 1.9375886524822694, + "grad_norm": 2.9619107246398926, + "learning_rate": 3.855750073133648e-06, + "loss": 0.5563, + "step": 4098 + }, + { + "epoch": 1.9380614657210402, + "grad_norm": 2.864889621734619, + "learning_rate": 3.855225900920438e-06, + "loss": 0.5069, + "step": 4099 + }, + { + "epoch": 1.938534278959811, + "grad_norm": 2.3951032161712646, + "learning_rate": 3.854701644323253e-06, + "loss": 0.4883, + "step": 4100 + }, + { + "epoch": 1.9390070921985816, + "grad_norm": 2.6339633464813232, + "learning_rate": 3.854177303374737e-06, + "loss": 0.5207, + "step": 4101 + }, + { + "epoch": 1.9394799054373522, + "grad_norm": 2.6435508728027344, + "learning_rate": 3.853652878107539e-06, + "loss": 0.4679, + "step": 4102 + }, + { + "epoch": 1.939952718676123, + "grad_norm": 2.4635629653930664, + "learning_rate": 3.853128368554311e-06, + "loss": 0.5639, + "step": 4103 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 2.664635419845581, + "learning_rate": 3.852603774747714e-06, + "loss": 0.5697, + "step": 4104 + }, + { + "epoch": 1.9408983451536643, + "grad_norm": 2.7020363807678223, + "learning_rate": 3.8520790967204095e-06, + "loss": 0.5462, + "step": 4105 + }, + { + "epoch": 1.941371158392435, + "grad_norm": 3.529282331466675, + "learning_rate": 3.851554334505069e-06, + "loss": 0.54, + "step": 4106 + }, + { + "epoch": 1.9418439716312057, + "grad_norm": 2.7125768661499023, + "learning_rate": 3.851029488134367e-06, + "loss": 0.5355, + "step": 4107 + }, + { + "epoch": 1.9423167848699765, + "grad_norm": 2.5226643085479736, + "learning_rate": 3.850504557640981e-06, + "loss": 0.5106, + "step": 4108 + }, + { + "epoch": 1.942789598108747, + "grad_norm": 2.834352731704712, + "learning_rate": 3.8499795430575995e-06, + "loss": 0.6069, + "step": 4109 + }, + { + "epoch": 1.9432624113475176, + "grad_norm": 2.8484177589416504, + "learning_rate": 3.849454444416911e-06, + "loss": 0.5542, + "step": 4110 + }, + { + "epoch": 1.9437352245862884, + "grad_norm": 2.402539014816284, + "learning_rate": 3.848929261751612e-06, + "loss": 0.47, + "step": 4111 + }, + { + "epoch": 
1.9442080378250592, + "grad_norm": 2.7010042667388916, + "learning_rate": 3.848403995094402e-06, + "loss": 0.5263, + "step": 4112 + }, + { + "epoch": 1.9446808510638298, + "grad_norm": 2.441689968109131, + "learning_rate": 3.847878644477988e-06, + "loss": 0.5607, + "step": 4113 + }, + { + "epoch": 1.9451536643026004, + "grad_norm": 2.5994722843170166, + "learning_rate": 3.847353209935081e-06, + "loss": 0.5103, + "step": 4114 + }, + { + "epoch": 1.9456264775413712, + "grad_norm": 2.452242136001587, + "learning_rate": 3.8468276914983975e-06, + "loss": 0.4409, + "step": 4115 + }, + { + "epoch": 1.946099290780142, + "grad_norm": 2.421023368835449, + "learning_rate": 3.84630208920066e-06, + "loss": 0.4429, + "step": 4116 + }, + { + "epoch": 1.9465721040189126, + "grad_norm": 2.696399688720703, + "learning_rate": 3.8457764030745945e-06, + "loss": 0.5352, + "step": 4117 + }, + { + "epoch": 1.9470449172576831, + "grad_norm": 2.3963489532470703, + "learning_rate": 3.845250633152933e-06, + "loss": 0.4505, + "step": 4118 + }, + { + "epoch": 1.947517730496454, + "grad_norm": 2.610649585723877, + "learning_rate": 3.8447247794684135e-06, + "loss": 0.501, + "step": 4119 + }, + { + "epoch": 1.9479905437352247, + "grad_norm": 2.740412712097168, + "learning_rate": 3.8441988420537775e-06, + "loss": 0.5362, + "step": 4120 + }, + { + "epoch": 1.9484633569739953, + "grad_norm": 2.2614004611968994, + "learning_rate": 3.8436728209417755e-06, + "loss": 0.4199, + "step": 4121 + }, + { + "epoch": 1.9489361702127659, + "grad_norm": 3.0683481693267822, + "learning_rate": 3.843146716165158e-06, + "loss": 0.5248, + "step": 4122 + }, + { + "epoch": 1.9494089834515367, + "grad_norm": 3.005174398422241, + "learning_rate": 3.842620527756684e-06, + "loss": 0.5246, + "step": 4123 + }, + { + "epoch": 1.9498817966903075, + "grad_norm": 2.672896385192871, + "learning_rate": 3.842094255749117e-06, + "loss": 0.5586, + "step": 4124 + }, + { + "epoch": 1.950354609929078, + "grad_norm": 2.5481197834014893, + "learning_rate": 3.8415679001752255e-06, + "loss": 0.5061, + "step": 4125 + }, + { + "epoch": 1.9508274231678486, + "grad_norm": 2.515789270401001, + "learning_rate": 3.8410414610677835e-06, + "loss": 0.4645, + "step": 4126 + }, + { + "epoch": 1.9513002364066194, + "grad_norm": 2.7236077785491943, + "learning_rate": 3.84051493845957e-06, + "loss": 0.5623, + "step": 4127 + }, + { + "epoch": 1.9517730496453902, + "grad_norm": 2.6252009868621826, + "learning_rate": 3.839988332383369e-06, + "loss": 0.5078, + "step": 4128 + }, + { + "epoch": 1.9522458628841608, + "grad_norm": 2.719196081161499, + "learning_rate": 3.83946164287197e-06, + "loss": 0.5481, + "step": 4129 + }, + { + "epoch": 1.9527186761229314, + "grad_norm": 2.484163284301758, + "learning_rate": 3.838934869958169e-06, + "loss": 0.5332, + "step": 4130 + }, + { + "epoch": 1.9531914893617022, + "grad_norm": 2.615382671356201, + "learning_rate": 3.838408013674764e-06, + "loss": 0.4742, + "step": 4131 + }, + { + "epoch": 1.953664302600473, + "grad_norm": 2.735321044921875, + "learning_rate": 3.83788107405456e-06, + "loss": 0.421, + "step": 4132 + }, + { + "epoch": 1.9541371158392435, + "grad_norm": 2.892652750015259, + "learning_rate": 3.837354051130369e-06, + "loss": 0.5326, + "step": 4133 + }, + { + "epoch": 1.9546099290780141, + "grad_norm": 2.6800546646118164, + "learning_rate": 3.8368269449350055e-06, + "loss": 0.5041, + "step": 4134 + }, + { + "epoch": 1.955082742316785, + "grad_norm": 2.362470865249634, + "learning_rate": 3.836299755501289e-06, + "loss": 0.4697, + 
"step": 4135 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 2.3855135440826416, + "learning_rate": 3.835772482862047e-06, + "loss": 0.5148, + "step": 4136 + }, + { + "epoch": 1.9560283687943263, + "grad_norm": 2.3338418006896973, + "learning_rate": 3.83524512705011e-06, + "loss": 0.4643, + "step": 4137 + }, + { + "epoch": 1.9565011820330969, + "grad_norm": 2.261355400085449, + "learning_rate": 3.834717688098313e-06, + "loss": 0.5573, + "step": 4138 + }, + { + "epoch": 1.9569739952718677, + "grad_norm": 2.8166391849517822, + "learning_rate": 3.834190166039498e-06, + "loss": 0.4868, + "step": 4139 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 2.4155869483947754, + "learning_rate": 3.833662560906512e-06, + "loss": 0.4923, + "step": 4140 + }, + { + "epoch": 1.957919621749409, + "grad_norm": 2.3977696895599365, + "learning_rate": 3.833134872732206e-06, + "loss": 0.5106, + "step": 4141 + }, + { + "epoch": 1.9583924349881796, + "grad_norm": 2.9541378021240234, + "learning_rate": 3.832607101549438e-06, + "loss": 0.4683, + "step": 4142 + }, + { + "epoch": 1.9588652482269504, + "grad_norm": 2.5862700939178467, + "learning_rate": 3.832079247391068e-06, + "loss": 0.4453, + "step": 4143 + }, + { + "epoch": 1.9593380614657212, + "grad_norm": 2.7459371089935303, + "learning_rate": 3.8315513102899644e-06, + "loss": 0.5511, + "step": 4144 + }, + { + "epoch": 1.9598108747044918, + "grad_norm": 2.904869556427002, + "learning_rate": 3.831023290279e-06, + "loss": 0.5348, + "step": 4145 + }, + { + "epoch": 1.9602836879432624, + "grad_norm": 3.092846632003784, + "learning_rate": 3.830495187391051e-06, + "loss": 0.5664, + "step": 4146 + }, + { + "epoch": 1.9607565011820332, + "grad_norm": 3.2838528156280518, + "learning_rate": 3.829967001659001e-06, + "loss": 0.5115, + "step": 4147 + }, + { + "epoch": 1.961229314420804, + "grad_norm": 2.7799549102783203, + "learning_rate": 3.829438733115738e-06, + "loss": 0.5145, + "step": 4148 + }, + { + "epoch": 1.9617021276595743, + "grad_norm": 2.436084270477295, + "learning_rate": 3.828910381794154e-06, + "loss": 0.4718, + "step": 4149 + }, + { + "epoch": 1.962174940898345, + "grad_norm": 2.6662371158599854, + "learning_rate": 3.828381947727148e-06, + "loss": 0.6129, + "step": 4150 + }, + { + "epoch": 1.962647754137116, + "grad_norm": 2.937000036239624, + "learning_rate": 3.827853430947622e-06, + "loss": 0.522, + "step": 4151 + }, + { + "epoch": 1.9631205673758865, + "grad_norm": 2.5737369060516357, + "learning_rate": 3.827324831488486e-06, + "loss": 0.4916, + "step": 4152 + }, + { + "epoch": 1.963593380614657, + "grad_norm": 2.70232892036438, + "learning_rate": 3.826796149382653e-06, + "loss": 0.4726, + "step": 4153 + }, + { + "epoch": 1.9640661938534278, + "grad_norm": 2.6899707317352295, + "learning_rate": 3.826267384663042e-06, + "loss": 0.529, + "step": 4154 + }, + { + "epoch": 1.9645390070921986, + "grad_norm": 2.6142728328704834, + "learning_rate": 3.825738537362575e-06, + "loss": 0.4999, + "step": 4155 + }, + { + "epoch": 1.9650118203309692, + "grad_norm": 2.43949818611145, + "learning_rate": 3.825209607514183e-06, + "loss": 0.5035, + "step": 4156 + }, + { + "epoch": 1.9654846335697398, + "grad_norm": 2.3735458850860596, + "learning_rate": 3.824680595150801e-06, + "loss": 0.4779, + "step": 4157 + }, + { + "epoch": 1.9659574468085106, + "grad_norm": 2.444307565689087, + "learning_rate": 3.824151500305365e-06, + "loss": 0.4825, + "step": 4158 + }, + { + "epoch": 1.9664302600472814, + "grad_norm": 2.8219668865203857, + "learning_rate": 
3.8236223230108224e-06, + "loss": 0.5354, + "step": 4159 + }, + { + "epoch": 1.966903073286052, + "grad_norm": 2.720721483230591, + "learning_rate": 3.823093063300121e-06, + "loss": 0.5064, + "step": 4160 + }, + { + "epoch": 1.9673758865248225, + "grad_norm": 2.324190616607666, + "learning_rate": 3.822563721206217e-06, + "loss": 0.5348, + "step": 4161 + }, + { + "epoch": 1.9678486997635933, + "grad_norm": 2.702155351638794, + "learning_rate": 3.8220342967620695e-06, + "loss": 0.5388, + "step": 4162 + }, + { + "epoch": 1.9683215130023641, + "grad_norm": 2.4956369400024414, + "learning_rate": 3.821504790000642e-06, + "loss": 0.5071, + "step": 4163 + }, + { + "epoch": 1.9687943262411347, + "grad_norm": 2.568039655685425, + "learning_rate": 3.820975200954906e-06, + "loss": 0.5133, + "step": 4164 + }, + { + "epoch": 1.9692671394799053, + "grad_norm": 2.810868978500366, + "learning_rate": 3.820445529657837e-06, + "loss": 0.4856, + "step": 4165 + }, + { + "epoch": 1.969739952718676, + "grad_norm": 2.66365647315979, + "learning_rate": 3.819915776142415e-06, + "loss": 0.5235, + "step": 4166 + }, + { + "epoch": 1.9702127659574469, + "grad_norm": 2.2982139587402344, + "learning_rate": 3.8193859404416265e-06, + "loss": 0.4361, + "step": 4167 + }, + { + "epoch": 1.9706855791962175, + "grad_norm": 2.585672378540039, + "learning_rate": 3.818856022588458e-06, + "loss": 0.4842, + "step": 4168 + }, + { + "epoch": 1.971158392434988, + "grad_norm": 2.57857346534729, + "learning_rate": 3.81832602261591e-06, + "loss": 0.5249, + "step": 4169 + }, + { + "epoch": 1.9716312056737588, + "grad_norm": 2.6947224140167236, + "learning_rate": 3.817795940556981e-06, + "loss": 0.5234, + "step": 4170 + }, + { + "epoch": 1.9721040189125296, + "grad_norm": 2.7453415393829346, + "learning_rate": 3.8172657764446764e-06, + "loss": 0.5219, + "step": 4171 + }, + { + "epoch": 1.9725768321513002, + "grad_norm": 8.424073219299316, + "learning_rate": 3.816735530312009e-06, + "loss": 0.5162, + "step": 4172 + }, + { + "epoch": 1.9730496453900708, + "grad_norm": 2.8229739665985107, + "learning_rate": 3.816205202191993e-06, + "loss": 0.4621, + "step": 4173 + }, + { + "epoch": 1.9735224586288416, + "grad_norm": 2.5969009399414062, + "learning_rate": 3.815674792117651e-06, + "loss": 0.5044, + "step": 4174 + }, + { + "epoch": 1.9739952718676124, + "grad_norm": 2.646024227142334, + "learning_rate": 3.815144300122009e-06, + "loss": 0.5094, + "step": 4175 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 2.4950616359710693, + "learning_rate": 3.814613726238097e-06, + "loss": 0.4827, + "step": 4176 + }, + { + "epoch": 1.9749408983451535, + "grad_norm": 2.5636119842529297, + "learning_rate": 3.8140830704989535e-06, + "loss": 0.5241, + "step": 4177 + }, + { + "epoch": 1.9754137115839243, + "grad_norm": 2.7936553955078125, + "learning_rate": 3.813552332937619e-06, + "loss": 0.5344, + "step": 4178 + }, + { + "epoch": 1.9758865248226951, + "grad_norm": 2.8085341453552246, + "learning_rate": 3.8130215135871405e-06, + "loss": 0.5647, + "step": 4179 + }, + { + "epoch": 1.9763593380614657, + "grad_norm": 2.4776322841644287, + "learning_rate": 3.8124906124805694e-06, + "loss": 0.542, + "step": 4180 + }, + { + "epoch": 1.9768321513002363, + "grad_norm": 2.3227856159210205, + "learning_rate": 3.8119596296509635e-06, + "loss": 0.4618, + "step": 4181 + }, + { + "epoch": 1.977304964539007, + "grad_norm": 2.5157814025878906, + "learning_rate": 3.8114285651313848e-06, + "loss": 0.538, + "step": 4182 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 
2.5630218982696533, + "learning_rate": 3.8108974189548987e-06, + "loss": 0.5254, + "step": 4183 + }, + { + "epoch": 1.9782505910165484, + "grad_norm": 2.703237533569336, + "learning_rate": 3.8103661911545787e-06, + "loss": 0.4859, + "step": 4184 + }, + { + "epoch": 1.978723404255319, + "grad_norm": 2.8808000087738037, + "learning_rate": 3.809834881763502e-06, + "loss": 0.5585, + "step": 4185 + }, + { + "epoch": 1.9791962174940898, + "grad_norm": 2.9047577381134033, + "learning_rate": 3.8093034908147507e-06, + "loss": 0.5022, + "step": 4186 + }, + { + "epoch": 1.9796690307328606, + "grad_norm": 2.7417640686035156, + "learning_rate": 3.8087720183414125e-06, + "loss": 0.5275, + "step": 4187 + }, + { + "epoch": 1.9801418439716312, + "grad_norm": 2.952012062072754, + "learning_rate": 3.8082404643765786e-06, + "loss": 0.543, + "step": 4188 + }, + { + "epoch": 1.9806146572104018, + "grad_norm": 2.538376569747925, + "learning_rate": 3.807708828953348e-06, + "loss": 0.4969, + "step": 4189 + }, + { + "epoch": 1.9810874704491725, + "grad_norm": 2.3476181030273438, + "learning_rate": 3.807177112104823e-06, + "loss": 0.4979, + "step": 4190 + }, + { + "epoch": 1.9815602836879433, + "grad_norm": 2.6480464935302734, + "learning_rate": 3.80664531386411e-06, + "loss": 0.4894, + "step": 4191 + }, + { + "epoch": 1.982033096926714, + "grad_norm": 2.792916774749756, + "learning_rate": 3.8061134342643235e-06, + "loss": 0.5468, + "step": 4192 + }, + { + "epoch": 1.9825059101654845, + "grad_norm": 2.368736743927002, + "learning_rate": 3.805581473338581e-06, + "loss": 0.4672, + "step": 4193 + }, + { + "epoch": 1.9829787234042553, + "grad_norm": 2.379084348678589, + "learning_rate": 3.8050494311200037e-06, + "loss": 0.4577, + "step": 4194 + }, + { + "epoch": 1.983451536643026, + "grad_norm": 2.722471237182617, + "learning_rate": 3.804517307641722e-06, + "loss": 0.4988, + "step": 4195 + }, + { + "epoch": 1.9839243498817967, + "grad_norm": 2.356649875640869, + "learning_rate": 3.8039851029368674e-06, + "loss": 0.4933, + "step": 4196 + }, + { + "epoch": 1.9843971631205672, + "grad_norm": 2.9182281494140625, + "learning_rate": 3.8034528170385776e-06, + "loss": 0.4873, + "step": 4197 + }, + { + "epoch": 1.984869976359338, + "grad_norm": 2.6232199668884277, + "learning_rate": 3.8029204499799976e-06, + "loss": 0.4425, + "step": 4198 + }, + { + "epoch": 1.9853427895981088, + "grad_norm": 2.667541980743408, + "learning_rate": 3.802388001794274e-06, + "loss": 0.5022, + "step": 4199 + }, + { + "epoch": 1.9858156028368794, + "grad_norm": 3.168470621109009, + "learning_rate": 3.8018554725145596e-06, + "loss": 0.5505, + "step": 4200 + }, + { + "epoch": 1.98628841607565, + "grad_norm": 2.716625452041626, + "learning_rate": 3.8013228621740132e-06, + "loss": 0.4937, + "step": 4201 + }, + { + "epoch": 1.9867612293144208, + "grad_norm": 2.3014442920684814, + "learning_rate": 3.800790170805799e-06, + "loss": 0.4734, + "step": 4202 + }, + { + "epoch": 1.9872340425531916, + "grad_norm": 2.9426841735839844, + "learning_rate": 3.8002573984430847e-06, + "loss": 0.4983, + "step": 4203 + }, + { + "epoch": 1.9877068557919622, + "grad_norm": 2.5598278045654297, + "learning_rate": 3.7997245451190435e-06, + "loss": 0.4834, + "step": 4204 + }, + { + "epoch": 1.9881796690307327, + "grad_norm": 2.86458420753479, + "learning_rate": 3.7991916108668538e-06, + "loss": 0.5613, + "step": 4205 + }, + { + "epoch": 1.9886524822695035, + "grad_norm": 2.842914342880249, + "learning_rate": 3.7986585957196997e-06, + "loss": 0.4951, + "step": 4206 + }, + { + 
"epoch": 1.9891252955082743, + "grad_norm": 3.1828150749206543, + "learning_rate": 3.7981254997107686e-06, + "loss": 0.5913, + "step": 4207 + }, + { + "epoch": 1.989598108747045, + "grad_norm": 2.5765931606292725, + "learning_rate": 3.7975923228732547e-06, + "loss": 0.5544, + "step": 4208 + }, + { + "epoch": 1.9900709219858155, + "grad_norm": 2.492234945297241, + "learning_rate": 3.797059065240357e-06, + "loss": 0.5046, + "step": 4209 + }, + { + "epoch": 1.9905437352245863, + "grad_norm": 2.870346784591675, + "learning_rate": 3.7965257268452795e-06, + "loss": 0.5354, + "step": 4210 + }, + { + "epoch": 1.991016548463357, + "grad_norm": 2.4989993572235107, + "learning_rate": 3.795992307721229e-06, + "loss": 0.4677, + "step": 4211 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 2.931114673614502, + "learning_rate": 3.7954588079014206e-06, + "loss": 0.5504, + "step": 4212 + }, + { + "epoch": 1.9919621749408982, + "grad_norm": 2.5247652530670166, + "learning_rate": 3.794925227419073e-06, + "loss": 0.4736, + "step": 4213 + }, + { + "epoch": 1.992434988179669, + "grad_norm": 2.6238436698913574, + "learning_rate": 3.794391566307409e-06, + "loss": 0.4591, + "step": 4214 + }, + { + "epoch": 1.9929078014184398, + "grad_norm": 2.654886245727539, + "learning_rate": 3.7938578245996584e-06, + "loss": 0.5149, + "step": 4215 + }, + { + "epoch": 1.9933806146572104, + "grad_norm": 2.509164810180664, + "learning_rate": 3.793324002329054e-06, + "loss": 0.4951, + "step": 4216 + }, + { + "epoch": 1.993853427895981, + "grad_norm": 2.909632921218872, + "learning_rate": 3.7927900995288345e-06, + "loss": 0.5131, + "step": 4217 + }, + { + "epoch": 1.9943262411347518, + "grad_norm": 2.4354615211486816, + "learning_rate": 3.7922561162322456e-06, + "loss": 0.4716, + "step": 4218 + }, + { + "epoch": 1.9947990543735226, + "grad_norm": 2.6514649391174316, + "learning_rate": 3.791722052472534e-06, + "loss": 0.5714, + "step": 4219 + }, + { + "epoch": 1.9952718676122931, + "grad_norm": 2.77089262008667, + "learning_rate": 3.791187908282954e-06, + "loss": 0.5736, + "step": 4220 + }, + { + "epoch": 1.9957446808510637, + "grad_norm": 2.7651021480560303, + "learning_rate": 3.7906536836967657e-06, + "loss": 0.4948, + "step": 4221 + }, + { + "epoch": 1.9962174940898345, + "grad_norm": 2.7536795139312744, + "learning_rate": 3.7901193787472306e-06, + "loss": 0.512, + "step": 4222 + }, + { + "epoch": 1.9966903073286053, + "grad_norm": 2.684893846511841, + "learning_rate": 3.78958499346762e-06, + "loss": 0.5118, + "step": 4223 + }, + { + "epoch": 1.9971631205673759, + "grad_norm": 2.7616753578186035, + "learning_rate": 3.7890505278912054e-06, + "loss": 0.4516, + "step": 4224 + }, + { + "epoch": 1.9976359338061465, + "grad_norm": 2.4731967449188232, + "learning_rate": 3.7885159820512666e-06, + "loss": 0.4736, + "step": 4225 + }, + { + "epoch": 1.9981087470449173, + "grad_norm": 2.366631031036377, + "learning_rate": 3.7879813559810884e-06, + "loss": 0.4999, + "step": 4226 + }, + { + "epoch": 1.998581560283688, + "grad_norm": 2.994624137878418, + "learning_rate": 3.7874466497139582e-06, + "loss": 0.5273, + "step": 4227 + }, + { + "epoch": 1.9990543735224586, + "grad_norm": 2.4499242305755615, + "learning_rate": 3.7869118632831712e-06, + "loss": 0.5761, + "step": 4228 + }, + { + "epoch": 1.9995271867612292, + "grad_norm": 2.3370113372802734, + "learning_rate": 3.7863769967220243e-06, + "loss": 0.4673, + "step": 4229 + }, + { + "epoch": 2.0, + "grad_norm": 3.1131203174591064, + "learning_rate": 3.7858420500638236e-06, + "loss": 
0.5118, + "step": 4230 + }, + { + "epoch": 2.000472813238771, + "grad_norm": 2.2747561931610107, + "learning_rate": 3.785307023341876e-06, + "loss": 0.4166, + "step": 4231 + }, + { + "epoch": 2.000945626477541, + "grad_norm": 2.4347424507141113, + "learning_rate": 3.7847719165894963e-06, + "loss": 0.4161, + "step": 4232 + }, + { + "epoch": 2.001418439716312, + "grad_norm": 2.398805618286133, + "learning_rate": 3.784236729840003e-06, + "loss": 0.4652, + "step": 4233 + }, + { + "epoch": 2.0018912529550827, + "grad_norm": 2.1904916763305664, + "learning_rate": 3.783701463126719e-06, + "loss": 0.4554, + "step": 4234 + }, + { + "epoch": 2.0023640661938535, + "grad_norm": 2.237330913543701, + "learning_rate": 3.7831661164829735e-06, + "loss": 0.4471, + "step": 4235 + }, + { + "epoch": 2.002836879432624, + "grad_norm": 2.3656628131866455, + "learning_rate": 3.7826306899421016e-06, + "loss": 0.4052, + "step": 4236 + }, + { + "epoch": 2.0033096926713947, + "grad_norm": 2.615489959716797, + "learning_rate": 3.7820951835374405e-06, + "loss": 0.4847, + "step": 4237 + }, + { + "epoch": 2.0037825059101655, + "grad_norm": 2.453036308288574, + "learning_rate": 3.7815595973023347e-06, + "loss": 0.4672, + "step": 4238 + }, + { + "epoch": 2.0042553191489363, + "grad_norm": 2.537468671798706, + "learning_rate": 3.7810239312701306e-06, + "loss": 0.467, + "step": 4239 + }, + { + "epoch": 2.0047281323877066, + "grad_norm": 2.3321666717529297, + "learning_rate": 3.780488185474184e-06, + "loss": 0.3557, + "step": 4240 + }, + { + "epoch": 2.0052009456264774, + "grad_norm": 2.9051828384399414, + "learning_rate": 3.779952359947854e-06, + "loss": 0.5474, + "step": 4241 + }, + { + "epoch": 2.0056737588652482, + "grad_norm": 2.7458817958831787, + "learning_rate": 3.7794164547245015e-06, + "loss": 0.4659, + "step": 4242 + }, + { + "epoch": 2.006146572104019, + "grad_norm": 2.627046585083008, + "learning_rate": 3.778880469837497e-06, + "loss": 0.4179, + "step": 4243 + }, + { + "epoch": 2.0066193853427894, + "grad_norm": 2.4186174869537354, + "learning_rate": 3.7783444053202135e-06, + "loss": 0.3976, + "step": 4244 + }, + { + "epoch": 2.00709219858156, + "grad_norm": 3.109376907348633, + "learning_rate": 3.7778082612060296e-06, + "loss": 0.4095, + "step": 4245 + }, + { + "epoch": 2.007565011820331, + "grad_norm": 2.583376169204712, + "learning_rate": 3.7772720375283282e-06, + "loss": 0.4325, + "step": 4246 + }, + { + "epoch": 2.0080378250591018, + "grad_norm": 2.6199896335601807, + "learning_rate": 3.776735734320497e-06, + "loss": 0.4207, + "step": 4247 + }, + { + "epoch": 2.008510638297872, + "grad_norm": 2.545353651046753, + "learning_rate": 3.77619935161593e-06, + "loss": 0.4483, + "step": 4248 + }, + { + "epoch": 2.008983451536643, + "grad_norm": 2.770266056060791, + "learning_rate": 3.7756628894480263e-06, + "loss": 0.457, + "step": 4249 + }, + { + "epoch": 2.0094562647754137, + "grad_norm": 2.903254985809326, + "learning_rate": 3.7751263478501878e-06, + "loss": 0.4171, + "step": 4250 + }, + { + "epoch": 2.0099290780141845, + "grad_norm": 2.5576963424682617, + "learning_rate": 3.774589726855822e-06, + "loss": 0.3631, + "step": 4251 + }, + { + "epoch": 2.010401891252955, + "grad_norm": 3.7584285736083984, + "learning_rate": 3.7740530264983434e-06, + "loss": 0.4827, + "step": 4252 + }, + { + "epoch": 2.0108747044917257, + "grad_norm": 3.3116581439971924, + "learning_rate": 3.77351624681117e-06, + "loss": 0.5071, + "step": 4253 + }, + { + "epoch": 2.0113475177304965, + "grad_norm": 3.1370885372161865, + "learning_rate": 
3.772979387827723e-06, + "loss": 0.4963, + "step": 4254 + }, + { + "epoch": 2.0118203309692673, + "grad_norm": 2.4832639694213867, + "learning_rate": 3.772442449581432e-06, + "loss": 0.4442, + "step": 4255 + }, + { + "epoch": 2.0122931442080376, + "grad_norm": 2.7645785808563232, + "learning_rate": 3.7719054321057293e-06, + "loss": 0.4572, + "step": 4256 + }, + { + "epoch": 2.0127659574468084, + "grad_norm": 2.7962236404418945, + "learning_rate": 3.7713683354340515e-06, + "loss": 0.4906, + "step": 4257 + }, + { + "epoch": 2.013238770685579, + "grad_norm": 2.647991895675659, + "learning_rate": 3.7708311595998425e-06, + "loss": 0.4027, + "step": 4258 + }, + { + "epoch": 2.01371158392435, + "grad_norm": 2.3780267238616943, + "learning_rate": 3.7702939046365504e-06, + "loss": 0.4285, + "step": 4259 + }, + { + "epoch": 2.0141843971631204, + "grad_norm": 2.5185933113098145, + "learning_rate": 3.7697565705776266e-06, + "loss": 0.4834, + "step": 4260 + }, + { + "epoch": 2.014657210401891, + "grad_norm": 2.432507276535034, + "learning_rate": 3.7692191574565294e-06, + "loss": 0.3695, + "step": 4261 + }, + { + "epoch": 2.015130023640662, + "grad_norm": 2.8010706901550293, + "learning_rate": 3.76868166530672e-06, + "loss": 0.478, + "step": 4262 + }, + { + "epoch": 2.0156028368794328, + "grad_norm": 2.32817006111145, + "learning_rate": 3.768144094161666e-06, + "loss": 0.4154, + "step": 4263 + }, + { + "epoch": 2.016075650118203, + "grad_norm": 3.062812328338623, + "learning_rate": 3.7676064440548405e-06, + "loss": 0.5015, + "step": 4264 + }, + { + "epoch": 2.016548463356974, + "grad_norm": 2.6129536628723145, + "learning_rate": 3.7670687150197194e-06, + "loss": 0.3843, + "step": 4265 + }, + { + "epoch": 2.0170212765957447, + "grad_norm": 2.838259696960449, + "learning_rate": 3.766530907089786e-06, + "loss": 0.4937, + "step": 4266 + }, + { + "epoch": 2.0174940898345155, + "grad_norm": 2.601203680038452, + "learning_rate": 3.7659930202985263e-06, + "loss": 0.4644, + "step": 4267 + }, + { + "epoch": 2.017966903073286, + "grad_norm": 2.5964133739471436, + "learning_rate": 3.7654550546794322e-06, + "loss": 0.4365, + "step": 4268 + }, + { + "epoch": 2.0184397163120567, + "grad_norm": 3.0028915405273438, + "learning_rate": 3.764917010266001e-06, + "loss": 0.434, + "step": 4269 + }, + { + "epoch": 2.0189125295508275, + "grad_norm": 2.719252586364746, + "learning_rate": 3.764378887091734e-06, + "loss": 0.4401, + "step": 4270 + }, + { + "epoch": 2.0193853427895982, + "grad_norm": 2.400254011154175, + "learning_rate": 3.7638406851901377e-06, + "loss": 0.4904, + "step": 4271 + }, + { + "epoch": 2.0198581560283686, + "grad_norm": 2.8015363216400146, + "learning_rate": 3.763302404594724e-06, + "loss": 0.4569, + "step": 4272 + }, + { + "epoch": 2.0203309692671394, + "grad_norm": 2.718416452407837, + "learning_rate": 3.762764045339009e-06, + "loss": 0.5124, + "step": 4273 + }, + { + "epoch": 2.02080378250591, + "grad_norm": 2.484049081802368, + "learning_rate": 3.762225607456514e-06, + "loss": 0.4255, + "step": 4274 + }, + { + "epoch": 2.021276595744681, + "grad_norm": 2.6377930641174316, + "learning_rate": 3.7616870909807645e-06, + "loss": 0.5044, + "step": 4275 + }, + { + "epoch": 2.0217494089834513, + "grad_norm": 2.8845038414001465, + "learning_rate": 3.7611484959452927e-06, + "loss": 0.4924, + "step": 4276 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 2.5939974784851074, + "learning_rate": 3.7606098223836342e-06, + "loss": 0.4873, + "step": 4277 + }, + { + "epoch": 2.022695035460993, + "grad_norm": 
2.499826431274414, + "learning_rate": 3.76007107032933e-06, + "loss": 0.4515, + "step": 4278 + }, + { + "epoch": 2.0231678486997637, + "grad_norm": 3.0318663120269775, + "learning_rate": 3.759532239815924e-06, + "loss": 0.4901, + "step": 4279 + }, + { + "epoch": 2.023640661938534, + "grad_norm": 2.857977867126465, + "learning_rate": 3.758993330876969e-06, + "loss": 0.4659, + "step": 4280 + }, + { + "epoch": 2.024113475177305, + "grad_norm": 2.47918438911438, + "learning_rate": 3.7584543435460196e-06, + "loss": 0.4323, + "step": 4281 + }, + { + "epoch": 2.0245862884160757, + "grad_norm": 2.6033785343170166, + "learning_rate": 3.757915277856637e-06, + "loss": 0.4437, + "step": 4282 + }, + { + "epoch": 2.0250591016548465, + "grad_norm": 2.799781322479248, + "learning_rate": 3.757376133842386e-06, + "loss": 0.4523, + "step": 4283 + }, + { + "epoch": 2.025531914893617, + "grad_norm": 2.6092529296875, + "learning_rate": 3.756836911536836e-06, + "loss": 0.3898, + "step": 4284 + }, + { + "epoch": 2.0260047281323876, + "grad_norm": 2.66229248046875, + "learning_rate": 3.7562976109735627e-06, + "loss": 0.4731, + "step": 4285 + }, + { + "epoch": 2.0264775413711584, + "grad_norm": 2.90142822265625, + "learning_rate": 3.7557582321861463e-06, + "loss": 0.4285, + "step": 4286 + }, + { + "epoch": 2.0269503546099292, + "grad_norm": 2.5138802528381348, + "learning_rate": 3.7552187752081707e-06, + "loss": 0.4467, + "step": 4287 + }, + { + "epoch": 2.0274231678486996, + "grad_norm": 3.0656235218048096, + "learning_rate": 3.754679240073226e-06, + "loss": 0.4718, + "step": 4288 + }, + { + "epoch": 2.0278959810874704, + "grad_norm": 2.9633383750915527, + "learning_rate": 3.754139626814907e-06, + "loss": 0.4741, + "step": 4289 + }, + { + "epoch": 2.028368794326241, + "grad_norm": 2.5925145149230957, + "learning_rate": 3.753599935466812e-06, + "loss": 0.4281, + "step": 4290 + }, + { + "epoch": 2.028841607565012, + "grad_norm": 2.837740659713745, + "learning_rate": 3.7530601660625456e-06, + "loss": 0.4757, + "step": 4291 + }, + { + "epoch": 2.0293144208037823, + "grad_norm": 2.3995790481567383, + "learning_rate": 3.752520318635718e-06, + "loss": 0.4148, + "step": 4292 + }, + { + "epoch": 2.029787234042553, + "grad_norm": 2.572601795196533, + "learning_rate": 3.7519803932199424e-06, + "loss": 0.4051, + "step": 4293 + }, + { + "epoch": 2.030260047281324, + "grad_norm": 2.6780295372009277, + "learning_rate": 3.751440389848837e-06, + "loss": 0.4626, + "step": 4294 + }, + { + "epoch": 2.0307328605200947, + "grad_norm": 2.8666839599609375, + "learning_rate": 3.7509003085560257e-06, + "loss": 0.4255, + "step": 4295 + }, + { + "epoch": 2.031205673758865, + "grad_norm": 2.4398207664489746, + "learning_rate": 3.750360149375138e-06, + "loss": 0.4235, + "step": 4296 + }, + { + "epoch": 2.031678486997636, + "grad_norm": 2.436840534210205, + "learning_rate": 3.7498199123398062e-06, + "loss": 0.3907, + "step": 4297 + }, + { + "epoch": 2.0321513002364067, + "grad_norm": 3.3945820331573486, + "learning_rate": 3.7492795974836683e-06, + "loss": 0.465, + "step": 4298 + }, + { + "epoch": 2.0326241134751775, + "grad_norm": 2.6693103313446045, + "learning_rate": 3.7487392048403678e-06, + "loss": 0.4948, + "step": 4299 + }, + { + "epoch": 2.033096926713948, + "grad_norm": 2.7642734050750732, + "learning_rate": 3.748198734443553e-06, + "loss": 0.4538, + "step": 4300 + }, + { + "epoch": 2.0335697399527186, + "grad_norm": 3.1436543464660645, + "learning_rate": 3.747658186326876e-06, + "loss": 0.5137, + "step": 4301 + }, + { + "epoch": 
2.0340425531914894, + "grad_norm": 3.482678174972534, + "learning_rate": 3.7471175605239947e-06, + "loss": 0.4982, + "step": 4302 + }, + { + "epoch": 2.03451536643026, + "grad_norm": 2.712557077407837, + "learning_rate": 3.746576857068571e-06, + "loss": 0.4459, + "step": 4303 + }, + { + "epoch": 2.0349881796690306, + "grad_norm": 3.147440195083618, + "learning_rate": 3.7460360759942726e-06, + "loss": 0.5063, + "step": 4304 + }, + { + "epoch": 2.0354609929078014, + "grad_norm": 2.840672492980957, + "learning_rate": 3.7454952173347714e-06, + "loss": 0.5041, + "step": 4305 + }, + { + "epoch": 2.035933806146572, + "grad_norm": 2.584122657775879, + "learning_rate": 3.744954281123745e-06, + "loss": 0.4487, + "step": 4306 + }, + { + "epoch": 2.036406619385343, + "grad_norm": 2.9869542121887207, + "learning_rate": 3.7444132673948737e-06, + "loss": 0.479, + "step": 4307 + }, + { + "epoch": 2.0368794326241133, + "grad_norm": 2.478459358215332, + "learning_rate": 3.7438721761818446e-06, + "loss": 0.4636, + "step": 4308 + }, + { + "epoch": 2.037352245862884, + "grad_norm": 2.5524215698242188, + "learning_rate": 3.7433310075183504e-06, + "loss": 0.4601, + "step": 4309 + }, + { + "epoch": 2.037825059101655, + "grad_norm": 2.3709988594055176, + "learning_rate": 3.742789761438086e-06, + "loss": 0.4163, + "step": 4310 + }, + { + "epoch": 2.0382978723404257, + "grad_norm": 3.140355348587036, + "learning_rate": 3.742248437974752e-06, + "loss": 0.4433, + "step": 4311 + }, + { + "epoch": 2.038770685579196, + "grad_norm": 2.940948486328125, + "learning_rate": 3.741707037162055e-06, + "loss": 0.4299, + "step": 4312 + }, + { + "epoch": 2.039243498817967, + "grad_norm": 3.009157419204712, + "learning_rate": 3.7411655590337055e-06, + "loss": 0.463, + "step": 4313 + }, + { + "epoch": 2.0397163120567376, + "grad_norm": 2.672945737838745, + "learning_rate": 3.7406240036234185e-06, + "loss": 0.4696, + "step": 4314 + }, + { + "epoch": 2.0401891252955084, + "grad_norm": 2.745962142944336, + "learning_rate": 3.740082370964916e-06, + "loss": 0.4931, + "step": 4315 + }, + { + "epoch": 2.040661938534279, + "grad_norm": 2.3939316272735596, + "learning_rate": 3.7395406610919217e-06, + "loss": 0.4396, + "step": 4316 + }, + { + "epoch": 2.0411347517730496, + "grad_norm": 2.4364447593688965, + "learning_rate": 3.738998874038165e-06, + "loss": 0.4807, + "step": 4317 + }, + { + "epoch": 2.0416075650118204, + "grad_norm": 2.360489845275879, + "learning_rate": 3.738457009837381e-06, + "loss": 0.4426, + "step": 4318 + }, + { + "epoch": 2.042080378250591, + "grad_norm": 2.5494935512542725, + "learning_rate": 3.7379150685233108e-06, + "loss": 0.4189, + "step": 4319 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 2.635472059249878, + "learning_rate": 3.7373730501296963e-06, + "loss": 0.5014, + "step": 4320 + }, + { + "epoch": 2.0430260047281323, + "grad_norm": 2.4982943534851074, + "learning_rate": 3.7368309546902876e-06, + "loss": 0.4658, + "step": 4321 + }, + { + "epoch": 2.043498817966903, + "grad_norm": 2.692742109298706, + "learning_rate": 3.736288782238839e-06, + "loss": 0.4454, + "step": 4322 + }, + { + "epoch": 2.043971631205674, + "grad_norm": 2.6774091720581055, + "learning_rate": 3.7357465328091086e-06, + "loss": 0.5002, + "step": 4323 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 2.695138692855835, + "learning_rate": 3.735204206434861e-06, + "loss": 0.448, + "step": 4324 + }, + { + "epoch": 2.044917257683215, + "grad_norm": 2.5383570194244385, + "learning_rate": 3.7346618031498635e-06, + "loss": 0.4352, + 
"step": 4325 + }, + { + "epoch": 2.045390070921986, + "grad_norm": 2.267277240753174, + "learning_rate": 3.7341193229878886e-06, + "loss": 0.4162, + "step": 4326 + }, + { + "epoch": 2.0458628841607567, + "grad_norm": 2.6037328243255615, + "learning_rate": 3.733576765982715e-06, + "loss": 0.4471, + "step": 4327 + }, + { + "epoch": 2.046335697399527, + "grad_norm": 3.261385440826416, + "learning_rate": 3.7330341321681253e-06, + "loss": 0.4618, + "step": 4328 + }, + { + "epoch": 2.046808510638298, + "grad_norm": 2.440650463104248, + "learning_rate": 3.7324914215779072e-06, + "loss": 0.4476, + "step": 4329 + }, + { + "epoch": 2.0472813238770686, + "grad_norm": 2.5940682888031006, + "learning_rate": 3.731948634245853e-06, + "loss": 0.4389, + "step": 4330 + }, + { + "epoch": 2.0477541371158394, + "grad_norm": 2.7428150177001953, + "learning_rate": 3.7314057702057582e-06, + "loss": 0.4477, + "step": 4331 + }, + { + "epoch": 2.0482269503546098, + "grad_norm": 2.3546223640441895, + "learning_rate": 3.730862829491427e-06, + "loss": 0.4047, + "step": 4332 + }, + { + "epoch": 2.0486997635933806, + "grad_norm": 2.552422523498535, + "learning_rate": 3.7303198121366637e-06, + "loss": 0.4438, + "step": 4333 + }, + { + "epoch": 2.0491725768321514, + "grad_norm": 2.99226713180542, + "learning_rate": 3.729776718175281e-06, + "loss": 0.491, + "step": 4334 + }, + { + "epoch": 2.049645390070922, + "grad_norm": 3.2003321647644043, + "learning_rate": 3.7292335476410935e-06, + "loss": 0.5458, + "step": 4335 + }, + { + "epoch": 2.0501182033096925, + "grad_norm": 2.739847183227539, + "learning_rate": 3.7286903005679237e-06, + "loss": 0.4499, + "step": 4336 + }, + { + "epoch": 2.0505910165484633, + "grad_norm": 2.5917470455169678, + "learning_rate": 3.7281469769895963e-06, + "loss": 0.4714, + "step": 4337 + }, + { + "epoch": 2.051063829787234, + "grad_norm": 2.8029327392578125, + "learning_rate": 3.7276035769399422e-06, + "loss": 0.42, + "step": 4338 + }, + { + "epoch": 2.051536643026005, + "grad_norm": 2.484879493713379, + "learning_rate": 3.727060100452796e-06, + "loss": 0.4163, + "step": 4339 + }, + { + "epoch": 2.0520094562647753, + "grad_norm": 2.7126030921936035, + "learning_rate": 3.7265165475619973e-06, + "loss": 0.4112, + "step": 4340 + }, + { + "epoch": 2.052482269503546, + "grad_norm": 2.618267774581909, + "learning_rate": 3.7259729183013927e-06, + "loss": 0.4281, + "step": 4341 + }, + { + "epoch": 2.052955082742317, + "grad_norm": 2.703270673751831, + "learning_rate": 3.7254292127048293e-06, + "loss": 0.4437, + "step": 4342 + }, + { + "epoch": 2.0534278959810877, + "grad_norm": 2.429150104522705, + "learning_rate": 3.7248854308061623e-06, + "loss": 0.3971, + "step": 4343 + }, + { + "epoch": 2.053900709219858, + "grad_norm": 2.54354190826416, + "learning_rate": 3.7243415726392508e-06, + "loss": 0.4485, + "step": 4344 + }, + { + "epoch": 2.054373522458629, + "grad_norm": 2.9515016078948975, + "learning_rate": 3.723797638237957e-06, + "loss": 0.4386, + "step": 4345 + }, + { + "epoch": 2.0548463356973996, + "grad_norm": 2.9129958152770996, + "learning_rate": 3.7232536276361514e-06, + "loss": 0.4595, + "step": 4346 + }, + { + "epoch": 2.0553191489361704, + "grad_norm": 2.5397512912750244, + "learning_rate": 3.722709540867706e-06, + "loss": 0.3681, + "step": 4347 + }, + { + "epoch": 2.0557919621749408, + "grad_norm": 2.79884672164917, + "learning_rate": 3.722165377966499e-06, + "loss": 0.4576, + "step": 4348 + }, + { + "epoch": 2.0562647754137116, + "grad_norm": 2.669936180114746, + "learning_rate": 
3.7216211389664137e-06, + "loss": 0.3692, + "step": 4349 + }, + { + "epoch": 2.0567375886524824, + "grad_norm": 2.512326240539551, + "learning_rate": 3.7210768239013355e-06, + "loss": 0.4554, + "step": 4350 + }, + { + "epoch": 2.057210401891253, + "grad_norm": 2.913693904876709, + "learning_rate": 3.7205324328051583e-06, + "loss": 0.5282, + "step": 4351 + }, + { + "epoch": 2.0576832151300235, + "grad_norm": 3.040891170501709, + "learning_rate": 3.719987965711778e-06, + "loss": 0.4778, + "step": 4352 + }, + { + "epoch": 2.0581560283687943, + "grad_norm": 2.7504117488861084, + "learning_rate": 3.7194434226550966e-06, + "loss": 0.4217, + "step": 4353 + }, + { + "epoch": 2.058628841607565, + "grad_norm": 2.5522971153259277, + "learning_rate": 3.718898803669021e-06, + "loss": 0.437, + "step": 4354 + }, + { + "epoch": 2.059101654846336, + "grad_norm": 2.8531908988952637, + "learning_rate": 3.718354108787461e-06, + "loss": 0.4251, + "step": 4355 + }, + { + "epoch": 2.0595744680851062, + "grad_norm": 2.5812065601348877, + "learning_rate": 3.7178093380443337e-06, + "loss": 0.4374, + "step": 4356 + }, + { + "epoch": 2.060047281323877, + "grad_norm": 2.627871513366699, + "learning_rate": 3.7172644914735583e-06, + "loss": 0.436, + "step": 4357 + }, + { + "epoch": 2.060520094562648, + "grad_norm": 2.7146239280700684, + "learning_rate": 3.7167195691090607e-06, + "loss": 0.4204, + "step": 4358 + }, + { + "epoch": 2.0609929078014186, + "grad_norm": 2.486483573913574, + "learning_rate": 3.7161745709847706e-06, + "loss": 0.4015, + "step": 4359 + }, + { + "epoch": 2.061465721040189, + "grad_norm": 2.866049289703369, + "learning_rate": 3.7156294971346226e-06, + "loss": 0.4087, + "step": 4360 + }, + { + "epoch": 2.06193853427896, + "grad_norm": 2.9345552921295166, + "learning_rate": 3.715084347592556e-06, + "loss": 0.5074, + "step": 4361 + }, + { + "epoch": 2.0624113475177306, + "grad_norm": 2.502455711364746, + "learning_rate": 3.7145391223925155e-06, + "loss": 0.469, + "step": 4362 + }, + { + "epoch": 2.0628841607565014, + "grad_norm": 2.6419875621795654, + "learning_rate": 3.713993821568449e-06, + "loss": 0.4493, + "step": 4363 + }, + { + "epoch": 2.0633569739952717, + "grad_norm": 3.812079429626465, + "learning_rate": 3.7134484451543114e-06, + "loss": 0.4764, + "step": 4364 + }, + { + "epoch": 2.0638297872340425, + "grad_norm": 2.581780195236206, + "learning_rate": 3.712902993184059e-06, + "loss": 0.3994, + "step": 4365 + }, + { + "epoch": 2.0643026004728133, + "grad_norm": 2.282508134841919, + "learning_rate": 3.712357465691656e-06, + "loss": 0.4252, + "step": 4366 + }, + { + "epoch": 2.064775413711584, + "grad_norm": 2.4727818965911865, + "learning_rate": 3.71181186271107e-06, + "loss": 0.4558, + "step": 4367 + }, + { + "epoch": 2.0652482269503545, + "grad_norm": 2.7661173343658447, + "learning_rate": 3.711266184276272e-06, + "loss": 0.505, + "step": 4368 + }, + { + "epoch": 2.0657210401891253, + "grad_norm": 2.6264543533325195, + "learning_rate": 3.71072043042124e-06, + "loss": 0.4297, + "step": 4369 + }, + { + "epoch": 2.066193853427896, + "grad_norm": 2.773699998855591, + "learning_rate": 3.7101746011799565e-06, + "loss": 0.4267, + "step": 4370 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 2.686955213546753, + "learning_rate": 3.709628696586407e-06, + "loss": 0.4099, + "step": 4371 + }, + { + "epoch": 2.0671394799054372, + "grad_norm": 2.6066620349884033, + "learning_rate": 3.709082716674582e-06, + "loss": 0.4146, + "step": 4372 + }, + { + "epoch": 2.067612293144208, + "grad_norm": 
2.7769250869750977, + "learning_rate": 3.7085366614784784e-06, + "loss": 0.4047, + "step": 4373 + }, + { + "epoch": 2.068085106382979, + "grad_norm": 2.4986939430236816, + "learning_rate": 3.7079905310320957e-06, + "loss": 0.4021, + "step": 4374 + }, + { + "epoch": 2.0685579196217496, + "grad_norm": 2.5456206798553467, + "learning_rate": 3.7074443253694402e-06, + "loss": 0.3569, + "step": 4375 + }, + { + "epoch": 2.06903073286052, + "grad_norm": 2.4079296588897705, + "learning_rate": 3.70689804452452e-06, + "loss": 0.4308, + "step": 4376 + }, + { + "epoch": 2.0695035460992908, + "grad_norm": 2.86014723777771, + "learning_rate": 3.7063516885313513e-06, + "loss": 0.4577, + "step": 4377 + }, + { + "epoch": 2.0699763593380616, + "grad_norm": 2.8025779724121094, + "learning_rate": 3.7058052574239523e-06, + "loss": 0.4615, + "step": 4378 + }, + { + "epoch": 2.0704491725768324, + "grad_norm": 2.902676820755005, + "learning_rate": 3.7052587512363475e-06, + "loss": 0.4765, + "step": 4379 + }, + { + "epoch": 2.0709219858156027, + "grad_norm": 2.814509391784668, + "learning_rate": 3.704712170002566e-06, + "loss": 0.434, + "step": 4380 + }, + { + "epoch": 2.0713947990543735, + "grad_norm": 2.7923502922058105, + "learning_rate": 3.704165513756639e-06, + "loss": 0.4626, + "step": 4381 + }, + { + "epoch": 2.0718676122931443, + "grad_norm": 2.6802031993865967, + "learning_rate": 3.703618782532606e-06, + "loss": 0.4835, + "step": 4382 + }, + { + "epoch": 2.072340425531915, + "grad_norm": 3.0963687896728516, + "learning_rate": 3.7030719763645085e-06, + "loss": 0.4813, + "step": 4383 + }, + { + "epoch": 2.0728132387706855, + "grad_norm": 2.5658695697784424, + "learning_rate": 3.7025250952863956e-06, + "loss": 0.4428, + "step": 4384 + }, + { + "epoch": 2.0732860520094563, + "grad_norm": 2.7738289833068848, + "learning_rate": 3.7019781393323167e-06, + "loss": 0.4376, + "step": 4385 + }, + { + "epoch": 2.073758865248227, + "grad_norm": 2.6446938514709473, + "learning_rate": 3.7014311085363303e-06, + "loss": 0.4208, + "step": 4386 + }, + { + "epoch": 2.0742316784869974, + "grad_norm": 2.7556118965148926, + "learning_rate": 3.7008840029324967e-06, + "loss": 0.3831, + "step": 4387 + }, + { + "epoch": 2.074704491725768, + "grad_norm": 2.573141574859619, + "learning_rate": 3.700336822554882e-06, + "loss": 0.4396, + "step": 4388 + }, + { + "epoch": 2.075177304964539, + "grad_norm": 2.762319803237915, + "learning_rate": 3.6997895674375566e-06, + "loss": 0.4579, + "step": 4389 + }, + { + "epoch": 2.07565011820331, + "grad_norm": 2.729780435562134, + "learning_rate": 3.699242237614596e-06, + "loss": 0.4262, + "step": 4390 + }, + { + "epoch": 2.0761229314420806, + "grad_norm": 2.657480001449585, + "learning_rate": 3.698694833120079e-06, + "loss": 0.4176, + "step": 4391 + }, + { + "epoch": 2.076595744680851, + "grad_norm": 2.8433303833007812, + "learning_rate": 3.6981473539880914e-06, + "loss": 0.457, + "step": 4392 + }, + { + "epoch": 2.0770685579196217, + "grad_norm": 2.819047212600708, + "learning_rate": 3.6975998002527225e-06, + "loss": 0.4244, + "step": 4393 + }, + { + "epoch": 2.0775413711583925, + "grad_norm": 2.6565003395080566, + "learning_rate": 3.697052171948064e-06, + "loss": 0.4384, + "step": 4394 + }, + { + "epoch": 2.078014184397163, + "grad_norm": 2.5795063972473145, + "learning_rate": 3.696504469108216e-06, + "loss": 0.4958, + "step": 4395 + }, + { + "epoch": 2.0784869976359337, + "grad_norm": 2.455730676651001, + "learning_rate": 3.6959566917672822e-06, + "loss": 0.4191, + "step": 4396 + }, + { + 
"epoch": 2.0789598108747045, + "grad_norm": 2.6706607341766357, + "learning_rate": 3.6954088399593684e-06, + "loss": 0.4709, + "step": 4397 + }, + { + "epoch": 2.0794326241134753, + "grad_norm": 2.3758466243743896, + "learning_rate": 3.694860913718589e-06, + "loss": 0.4231, + "step": 4398 + }, + { + "epoch": 2.079905437352246, + "grad_norm": 2.3488340377807617, + "learning_rate": 3.6943129130790583e-06, + "loss": 0.4321, + "step": 4399 + }, + { + "epoch": 2.0803782505910164, + "grad_norm": 2.6438148021698, + "learning_rate": 3.6937648380748996e-06, + "loss": 0.4907, + "step": 4400 + }, + { + "epoch": 2.0808510638297872, + "grad_norm": 2.9826784133911133, + "learning_rate": 3.6932166887402395e-06, + "loss": 0.4404, + "step": 4401 + }, + { + "epoch": 2.081323877068558, + "grad_norm": 2.5203495025634766, + "learning_rate": 3.6926684651092076e-06, + "loss": 0.4337, + "step": 4402 + }, + { + "epoch": 2.0817966903073284, + "grad_norm": 2.7704148292541504, + "learning_rate": 3.692120167215941e-06, + "loss": 0.4195, + "step": 4403 + }, + { + "epoch": 2.082269503546099, + "grad_norm": 2.879430055618286, + "learning_rate": 3.6915717950945782e-06, + "loss": 0.4498, + "step": 4404 + }, + { + "epoch": 2.08274231678487, + "grad_norm": 2.7659497261047363, + "learning_rate": 3.6910233487792655e-06, + "loss": 0.4017, + "step": 4405 + }, + { + "epoch": 2.083215130023641, + "grad_norm": 3.4017205238342285, + "learning_rate": 3.6904748283041503e-06, + "loss": 0.4733, + "step": 4406 + }, + { + "epoch": 2.083687943262411, + "grad_norm": 2.706223249435425, + "learning_rate": 3.6899262337033887e-06, + "loss": 0.4926, + "step": 4407 + }, + { + "epoch": 2.084160756501182, + "grad_norm": 2.644932508468628, + "learning_rate": 3.6893775650111372e-06, + "loss": 0.3904, + "step": 4408 + }, + { + "epoch": 2.0846335697399527, + "grad_norm": 2.666585683822632, + "learning_rate": 3.6888288222615603e-06, + "loss": 0.4698, + "step": 4409 + }, + { + "epoch": 2.0851063829787235, + "grad_norm": 3.0058486461639404, + "learning_rate": 3.688280005488826e-06, + "loss": 0.5291, + "step": 4410 + }, + { + "epoch": 2.085579196217494, + "grad_norm": 2.533088445663452, + "learning_rate": 3.687731114727105e-06, + "loss": 0.393, + "step": 4411 + }, + { + "epoch": 2.0860520094562647, + "grad_norm": 2.921687364578247, + "learning_rate": 3.6871821500105763e-06, + "loss": 0.4719, + "step": 4412 + }, + { + "epoch": 2.0865248226950355, + "grad_norm": 2.291804313659668, + "learning_rate": 3.686633111373421e-06, + "loss": 0.4105, + "step": 4413 + }, + { + "epoch": 2.0869976359338063, + "grad_norm": 2.496333122253418, + "learning_rate": 3.6860839988498255e-06, + "loss": 0.4704, + "step": 4414 + }, + { + "epoch": 2.0874704491725766, + "grad_norm": 2.8059427738189697, + "learning_rate": 3.6855348124739787e-06, + "loss": 0.4961, + "step": 4415 + }, + { + "epoch": 2.0879432624113474, + "grad_norm": 2.683922290802002, + "learning_rate": 3.6849855522800795e-06, + "loss": 0.4838, + "step": 4416 + }, + { + "epoch": 2.088416075650118, + "grad_norm": 2.694148540496826, + "learning_rate": 3.684436218302324e-06, + "loss": 0.4812, + "step": 4417 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 2.724531888961792, + "learning_rate": 3.683886810574919e-06, + "loss": 0.4495, + "step": 4418 + }, + { + "epoch": 2.0893617021276594, + "grad_norm": 2.6176564693450928, + "learning_rate": 3.6833373291320746e-06, + "loss": 0.4698, + "step": 4419 + }, + { + "epoch": 2.08983451536643, + "grad_norm": 2.534116268157959, + "learning_rate": 3.6827877740080032e-06, + "loss": 
0.3912, + "step": 4420 + }, + { + "epoch": 2.090307328605201, + "grad_norm": 2.5747432708740234, + "learning_rate": 3.682238145236924e-06, + "loss": 0.4072, + "step": 4421 + }, + { + "epoch": 2.0907801418439718, + "grad_norm": 2.5947659015655518, + "learning_rate": 3.6816884428530588e-06, + "loss": 0.4638, + "step": 4422 + }, + { + "epoch": 2.091252955082742, + "grad_norm": 2.811992883682251, + "learning_rate": 3.6811386668906353e-06, + "loss": 0.4345, + "step": 4423 + }, + { + "epoch": 2.091725768321513, + "grad_norm": 2.7482287883758545, + "learning_rate": 3.680588817383886e-06, + "loss": 0.4541, + "step": 4424 + }, + { + "epoch": 2.0921985815602837, + "grad_norm": 2.987131357192993, + "learning_rate": 3.6800388943670484e-06, + "loss": 0.4571, + "step": 4425 + }, + { + "epoch": 2.0926713947990545, + "grad_norm": 3.1918671131134033, + "learning_rate": 3.6794888978743637e-06, + "loss": 0.5722, + "step": 4426 + }, + { + "epoch": 2.093144208037825, + "grad_norm": 2.5654571056365967, + "learning_rate": 3.678938827940076e-06, + "loss": 0.4686, + "step": 4427 + }, + { + "epoch": 2.0936170212765957, + "grad_norm": 2.942084789276123, + "learning_rate": 3.6783886845984383e-06, + "loss": 0.4512, + "step": 4428 + }, + { + "epoch": 2.0940898345153665, + "grad_norm": 2.74847674369812, + "learning_rate": 3.677838467883703e-06, + "loss": 0.4506, + "step": 4429 + }, + { + "epoch": 2.0945626477541373, + "grad_norm": 2.7569334506988525, + "learning_rate": 3.6772881778301322e-06, + "loss": 0.502, + "step": 4430 + }, + { + "epoch": 2.0950354609929076, + "grad_norm": 2.969966173171997, + "learning_rate": 3.6767378144719884e-06, + "loss": 0.4772, + "step": 4431 + }, + { + "epoch": 2.0955082742316784, + "grad_norm": 2.773524522781372, + "learning_rate": 3.67618737784354e-06, + "loss": 0.5183, + "step": 4432 + }, + { + "epoch": 2.095981087470449, + "grad_norm": 2.6760106086730957, + "learning_rate": 3.6756368679790617e-06, + "loss": 0.4787, + "step": 4433 + }, + { + "epoch": 2.09645390070922, + "grad_norm": 2.8758978843688965, + "learning_rate": 3.6750862849128304e-06, + "loss": 0.4275, + "step": 4434 + }, + { + "epoch": 2.0969267139479904, + "grad_norm": 2.670509099960327, + "learning_rate": 3.6745356286791288e-06, + "loss": 0.4401, + "step": 4435 + }, + { + "epoch": 2.097399527186761, + "grad_norm": 2.8453969955444336, + "learning_rate": 3.673984899312244e-06, + "loss": 0.4303, + "step": 4436 + }, + { + "epoch": 2.097872340425532, + "grad_norm": 2.6212339401245117, + "learning_rate": 3.673434096846468e-06, + "loss": 0.4675, + "step": 4437 + }, + { + "epoch": 2.0983451536643027, + "grad_norm": 2.8211941719055176, + "learning_rate": 3.672883221316095e-06, + "loss": 0.4678, + "step": 4438 + }, + { + "epoch": 2.098817966903073, + "grad_norm": 2.4838058948516846, + "learning_rate": 3.672332272755427e-06, + "loss": 0.4128, + "step": 4439 + }, + { + "epoch": 2.099290780141844, + "grad_norm": 2.596660852432251, + "learning_rate": 3.671781251198769e-06, + "loss": 0.423, + "step": 4440 + }, + { + "epoch": 2.0997635933806147, + "grad_norm": 2.9979989528656006, + "learning_rate": 3.67123015668043e-06, + "loss": 0.4493, + "step": 4441 + }, + { + "epoch": 2.1002364066193855, + "grad_norm": 2.6232850551605225, + "learning_rate": 3.670678989234725e-06, + "loss": 0.4237, + "step": 4442 + }, + { + "epoch": 2.100709219858156, + "grad_norm": 2.575039863586426, + "learning_rate": 3.670127748895973e-06, + "loss": 0.4464, + "step": 4443 + }, + { + "epoch": 2.1011820330969266, + "grad_norm": 2.3381190299987793, + "learning_rate": 
3.669576435698497e-06, + "loss": 0.4208, + "step": 4444 + }, + { + "epoch": 2.1016548463356974, + "grad_norm": 2.9645180702209473, + "learning_rate": 3.669025049676625e-06, + "loss": 0.5272, + "step": 4445 + }, + { + "epoch": 2.1021276595744682, + "grad_norm": 2.719320297241211, + "learning_rate": 3.668473590864689e-06, + "loss": 0.4485, + "step": 4446 + }, + { + "epoch": 2.1026004728132386, + "grad_norm": 2.8665547370910645, + "learning_rate": 3.6679220592970254e-06, + "loss": 0.4433, + "step": 4447 + }, + { + "epoch": 2.1030732860520094, + "grad_norm": 2.6922879219055176, + "learning_rate": 3.667370455007977e-06, + "loss": 0.502, + "step": 4448 + }, + { + "epoch": 2.10354609929078, + "grad_norm": 3.018228530883789, + "learning_rate": 3.6668187780318894e-06, + "loss": 0.4939, + "step": 4449 + }, + { + "epoch": 2.104018912529551, + "grad_norm": 3.187901735305786, + "learning_rate": 3.666267028403112e-06, + "loss": 0.4151, + "step": 4450 + }, + { + "epoch": 2.1044917257683213, + "grad_norm": 2.9521446228027344, + "learning_rate": 3.6657152061560012e-06, + "loss": 0.4343, + "step": 4451 + }, + { + "epoch": 2.104964539007092, + "grad_norm": 2.5125739574432373, + "learning_rate": 3.6651633113249164e-06, + "loss": 0.4071, + "step": 4452 + }, + { + "epoch": 2.105437352245863, + "grad_norm": 2.9164133071899414, + "learning_rate": 3.664611343944221e-06, + "loss": 0.4173, + "step": 4453 + }, + { + "epoch": 2.1059101654846337, + "grad_norm": 2.680893898010254, + "learning_rate": 3.6640593040482834e-06, + "loss": 0.4917, + "step": 4454 + }, + { + "epoch": 2.106382978723404, + "grad_norm": 2.6823534965515137, + "learning_rate": 3.6635071916714774e-06, + "loss": 0.4668, + "step": 4455 + }, + { + "epoch": 2.106855791962175, + "grad_norm": 2.6221907138824463, + "learning_rate": 3.6629550068481806e-06, + "loss": 0.4956, + "step": 4456 + }, + { + "epoch": 2.1073286052009457, + "grad_norm": 3.096370220184326, + "learning_rate": 3.6624027496127745e-06, + "loss": 0.3995, + "step": 4457 + }, + { + "epoch": 2.1078014184397165, + "grad_norm": 2.752885341644287, + "learning_rate": 3.661850419999647e-06, + "loss": 0.4838, + "step": 4458 + }, + { + "epoch": 2.108274231678487, + "grad_norm": 2.6806766986846924, + "learning_rate": 3.661298018043188e-06, + "loss": 0.4817, + "step": 4459 + }, + { + "epoch": 2.1087470449172576, + "grad_norm": 2.6317873001098633, + "learning_rate": 3.660745543777794e-06, + "loss": 0.4777, + "step": 4460 + }, + { + "epoch": 2.1092198581560284, + "grad_norm": 2.4939377307891846, + "learning_rate": 3.6601929972378634e-06, + "loss": 0.4525, + "step": 4461 + }, + { + "epoch": 2.109692671394799, + "grad_norm": 2.4902873039245605, + "learning_rate": 3.659640378457803e-06, + "loss": 0.4392, + "step": 4462 + }, + { + "epoch": 2.1101654846335696, + "grad_norm": 2.5082345008850098, + "learning_rate": 3.6590876874720216e-06, + "loss": 0.4224, + "step": 4463 + }, + { + "epoch": 2.1106382978723404, + "grad_norm": 2.658407211303711, + "learning_rate": 3.6585349243149313e-06, + "loss": 0.4316, + "step": 4464 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 2.562883138656616, + "learning_rate": 3.6579820890209515e-06, + "loss": 0.4491, + "step": 4465 + }, + { + "epoch": 2.111583924349882, + "grad_norm": 2.5719261169433594, + "learning_rate": 3.657429181624505e-06, + "loss": 0.4406, + "step": 4466 + }, + { + "epoch": 2.1120567375886523, + "grad_norm": 2.8840596675872803, + "learning_rate": 3.6568762021600184e-06, + "loss": 0.4267, + "step": 4467 + }, + { + "epoch": 2.112529550827423, + "grad_norm": 
2.660304546356201, + "learning_rate": 3.656323150661924e-06, + "loss": 0.4502, + "step": 4468 + }, + { + "epoch": 2.113002364066194, + "grad_norm": 2.610996961593628, + "learning_rate": 3.655770027164657e-06, + "loss": 0.3934, + "step": 4469 + }, + { + "epoch": 2.1134751773049647, + "grad_norm": 2.6000053882598877, + "learning_rate": 3.655216831702658e-06, + "loss": 0.4582, + "step": 4470 + }, + { + "epoch": 2.113947990543735, + "grad_norm": 2.73124098777771, + "learning_rate": 3.654663564310372e-06, + "loss": 0.4748, + "step": 4471 + }, + { + "epoch": 2.114420803782506, + "grad_norm": 2.711091995239258, + "learning_rate": 3.6541102250222495e-06, + "loss": 0.4145, + "step": 4472 + }, + { + "epoch": 2.1148936170212767, + "grad_norm": 2.655996561050415, + "learning_rate": 3.6535568138727438e-06, + "loss": 0.4407, + "step": 4473 + }, + { + "epoch": 2.1153664302600474, + "grad_norm": 2.7630865573883057, + "learning_rate": 3.653003330896313e-06, + "loss": 0.4298, + "step": 4474 + }, + { + "epoch": 2.115839243498818, + "grad_norm": 2.554415464401245, + "learning_rate": 3.6524497761274214e-06, + "loss": 0.44, + "step": 4475 + }, + { + "epoch": 2.1163120567375886, + "grad_norm": 2.790328025817871, + "learning_rate": 3.651896149600535e-06, + "loss": 0.5061, + "step": 4476 + }, + { + "epoch": 2.1167848699763594, + "grad_norm": 2.755267381668091, + "learning_rate": 3.651342451350127e-06, + "loss": 0.4588, + "step": 4477 + }, + { + "epoch": 2.11725768321513, + "grad_norm": 2.8936638832092285, + "learning_rate": 3.6507886814106722e-06, + "loss": 0.468, + "step": 4478 + }, + { + "epoch": 2.1177304964539005, + "grad_norm": 2.7394332885742188, + "learning_rate": 3.6502348398166525e-06, + "loss": 0.383, + "step": 4479 + }, + { + "epoch": 2.1182033096926713, + "grad_norm": 2.3359546661376953, + "learning_rate": 3.649680926602553e-06, + "loss": 0.3903, + "step": 4480 + }, + { + "epoch": 2.118676122931442, + "grad_norm": 3.102202892303467, + "learning_rate": 3.6491269418028637e-06, + "loss": 0.4525, + "step": 4481 + }, + { + "epoch": 2.119148936170213, + "grad_norm": 2.467970848083496, + "learning_rate": 3.648572885452078e-06, + "loss": 0.414, + "step": 4482 + }, + { + "epoch": 2.1196217494089833, + "grad_norm": 2.8984131813049316, + "learning_rate": 3.6480187575846952e-06, + "loss": 0.4571, + "step": 4483 + }, + { + "epoch": 2.120094562647754, + "grad_norm": 2.674834966659546, + "learning_rate": 3.6474645582352187e-06, + "loss": 0.455, + "step": 4484 + }, + { + "epoch": 2.120567375886525, + "grad_norm": 2.8713369369506836, + "learning_rate": 3.6469102874381552e-06, + "loss": 0.4567, + "step": 4485 + }, + { + "epoch": 2.1210401891252957, + "grad_norm": 3.174814462661743, + "learning_rate": 3.646355945228017e-06, + "loss": 0.5295, + "step": 4486 + }, + { + "epoch": 2.121513002364066, + "grad_norm": 2.6409823894500732, + "learning_rate": 3.6458015316393215e-06, + "loss": 0.4308, + "step": 4487 + }, + { + "epoch": 2.121985815602837, + "grad_norm": 2.4228954315185547, + "learning_rate": 3.645247046706588e-06, + "loss": 0.4042, + "step": 4488 + }, + { + "epoch": 2.1224586288416076, + "grad_norm": 2.553551435470581, + "learning_rate": 3.6446924904643427e-06, + "loss": 0.3925, + "step": 4489 + }, + { + "epoch": 2.1229314420803784, + "grad_norm": 2.8019237518310547, + "learning_rate": 3.6441378629471157e-06, + "loss": 0.4079, + "step": 4490 + }, + { + "epoch": 2.123404255319149, + "grad_norm": 2.993251085281372, + "learning_rate": 3.643583164189441e-06, + "loss": 0.4558, + "step": 4491 + }, + { + "epoch": 
2.1238770685579196, + "grad_norm": 2.4531471729278564, + "learning_rate": 3.643028394225857e-06, + "loss": 0.4167, + "step": 4492 + }, + { + "epoch": 2.1243498817966904, + "grad_norm": 2.6827852725982666, + "learning_rate": 3.6424735530909065e-06, + "loss": 0.4311, + "step": 4493 + }, + { + "epoch": 2.124822695035461, + "grad_norm": 3.1232128143310547, + "learning_rate": 3.6419186408191377e-06, + "loss": 0.4537, + "step": 4494 + }, + { + "epoch": 2.1252955082742315, + "grad_norm": 2.816348075866699, + "learning_rate": 3.641363657445103e-06, + "loss": 0.4869, + "step": 4495 + }, + { + "epoch": 2.1257683215130023, + "grad_norm": 2.6269683837890625, + "learning_rate": 3.6408086030033575e-06, + "loss": 0.4066, + "step": 4496 + }, + { + "epoch": 2.126241134751773, + "grad_norm": 4.6375956535339355, + "learning_rate": 3.640253477528462e-06, + "loss": 0.4488, + "step": 4497 + }, + { + "epoch": 2.126713947990544, + "grad_norm": 3.020970582962036, + "learning_rate": 3.639698281054983e-06, + "loss": 0.4197, + "step": 4498 + }, + { + "epoch": 2.1271867612293143, + "grad_norm": 2.87904691696167, + "learning_rate": 3.6391430136174892e-06, + "loss": 0.4743, + "step": 4499 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 2.719892978668213, + "learning_rate": 3.6385876752505554e-06, + "loss": 0.388, + "step": 4500 + }, + { + "epoch": 2.128132387706856, + "grad_norm": 2.7321808338165283, + "learning_rate": 3.638032265988759e-06, + "loss": 0.4857, + "step": 4501 + }, + { + "epoch": 2.1286052009456267, + "grad_norm": 2.700814723968506, + "learning_rate": 3.6374767858666836e-06, + "loss": 0.4819, + "step": 4502 + }, + { + "epoch": 2.129078014184397, + "grad_norm": 2.658423662185669, + "learning_rate": 3.6369212349189164e-06, + "loss": 0.4113, + "step": 4503 + }, + { + "epoch": 2.129550827423168, + "grad_norm": 2.673877716064453, + "learning_rate": 3.63636561318005e-06, + "loss": 0.3745, + "step": 4504 + }, + { + "epoch": 2.1300236406619386, + "grad_norm": 2.607758045196533, + "learning_rate": 3.6358099206846787e-06, + "loss": 0.4409, + "step": 4505 + }, + { + "epoch": 2.1304964539007094, + "grad_norm": 2.8117682933807373, + "learning_rate": 3.6352541574674044e-06, + "loss": 0.426, + "step": 4506 + }, + { + "epoch": 2.1309692671394798, + "grad_norm": 2.6970250606536865, + "learning_rate": 3.634698323562832e-06, + "loss": 0.4295, + "step": 4507 + }, + { + "epoch": 2.1314420803782506, + "grad_norm": 2.7133560180664062, + "learning_rate": 3.6341424190055696e-06, + "loss": 0.4443, + "step": 4508 + }, + { + "epoch": 2.1319148936170214, + "grad_norm": 2.57181715965271, + "learning_rate": 3.6335864438302328e-06, + "loss": 0.3995, + "step": 4509 + }, + { + "epoch": 2.132387706855792, + "grad_norm": 2.8618004322052, + "learning_rate": 3.633030398071438e-06, + "loss": 0.5075, + "step": 4510 + }, + { + "epoch": 2.1328605200945625, + "grad_norm": 2.7586729526519775, + "learning_rate": 3.6324742817638087e-06, + "loss": 0.4322, + "step": 4511 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 2.913256883621216, + "learning_rate": 3.631918094941972e-06, + "loss": 0.4708, + "step": 4512 + }, + { + "epoch": 2.133806146572104, + "grad_norm": 2.7715728282928467, + "learning_rate": 3.6313618376405585e-06, + "loss": 0.5194, + "step": 4513 + }, + { + "epoch": 2.134278959810875, + "grad_norm": 2.7986366748809814, + "learning_rate": 3.6308055098942042e-06, + "loss": 0.4419, + "step": 4514 + }, + { + "epoch": 2.1347517730496453, + "grad_norm": 3.043549060821533, + "learning_rate": 3.6302491117375492e-06, + "loss": 0.4441, + 
"step": 4515 + }, + { + "epoch": 2.135224586288416, + "grad_norm": 2.771761417388916, + "learning_rate": 3.629692643205238e-06, + "loss": 0.4752, + "step": 4516 + }, + { + "epoch": 2.135697399527187, + "grad_norm": 2.804941415786743, + "learning_rate": 3.6291361043319202e-06, + "loss": 0.4089, + "step": 4517 + }, + { + "epoch": 2.1361702127659576, + "grad_norm": 2.9897940158843994, + "learning_rate": 3.628579495152248e-06, + "loss": 0.4829, + "step": 4518 + }, + { + "epoch": 2.136643026004728, + "grad_norm": 2.9273486137390137, + "learning_rate": 3.6280228157008784e-06, + "loss": 0.4469, + "step": 4519 + }, + { + "epoch": 2.137115839243499, + "grad_norm": 2.584373950958252, + "learning_rate": 3.627466066012475e-06, + "loss": 0.4277, + "step": 4520 + }, + { + "epoch": 2.1375886524822696, + "grad_norm": 3.009333848953247, + "learning_rate": 3.626909246121703e-06, + "loss": 0.4025, + "step": 4521 + }, + { + "epoch": 2.1380614657210404, + "grad_norm": 2.634615659713745, + "learning_rate": 3.626352356063234e-06, + "loss": 0.4046, + "step": 4522 + }, + { + "epoch": 2.1385342789598107, + "grad_norm": 2.87310528755188, + "learning_rate": 3.625795395871743e-06, + "loss": 0.4426, + "step": 4523 + }, + { + "epoch": 2.1390070921985815, + "grad_norm": 2.94985032081604, + "learning_rate": 3.625238365581909e-06, + "loss": 0.445, + "step": 4524 + }, + { + "epoch": 2.1394799054373523, + "grad_norm": 2.470189332962036, + "learning_rate": 3.624681265228416e-06, + "loss": 0.4082, + "step": 4525 + }, + { + "epoch": 2.139952718676123, + "grad_norm": 2.5304040908813477, + "learning_rate": 3.624124094845952e-06, + "loss": 0.403, + "step": 4526 + }, + { + "epoch": 2.1404255319148935, + "grad_norm": 2.6148900985717773, + "learning_rate": 3.62356685446921e-06, + "loss": 0.3867, + "step": 4527 + }, + { + "epoch": 2.1408983451536643, + "grad_norm": 2.885549783706665, + "learning_rate": 3.623009544132886e-06, + "loss": 0.4706, + "step": 4528 + }, + { + "epoch": 2.141371158392435, + "grad_norm": 3.00490665435791, + "learning_rate": 3.6224521638716827e-06, + "loss": 0.4733, + "step": 4529 + }, + { + "epoch": 2.141843971631206, + "grad_norm": 2.925879716873169, + "learning_rate": 3.6218947137203043e-06, + "loss": 0.4581, + "step": 4530 + }, + { + "epoch": 2.1423167848699762, + "grad_norm": 3.10861873626709, + "learning_rate": 3.621337193713462e-06, + "loss": 0.4579, + "step": 4531 + }, + { + "epoch": 2.142789598108747, + "grad_norm": 2.7386577129364014, + "learning_rate": 3.6207796038858693e-06, + "loss": 0.4248, + "step": 4532 + }, + { + "epoch": 2.143262411347518, + "grad_norm": 2.601836681365967, + "learning_rate": 3.6202219442722453e-06, + "loss": 0.4928, + "step": 4533 + }, + { + "epoch": 2.1437352245862886, + "grad_norm": 2.598778247833252, + "learning_rate": 3.6196642149073123e-06, + "loss": 0.4415, + "step": 4534 + }, + { + "epoch": 2.144208037825059, + "grad_norm": 2.443995714187622, + "learning_rate": 3.619106415825798e-06, + "loss": 0.3917, + "step": 4535 + }, + { + "epoch": 2.1446808510638298, + "grad_norm": 2.84643816947937, + "learning_rate": 3.6185485470624354e-06, + "loss": 0.4162, + "step": 4536 + }, + { + "epoch": 2.1451536643026006, + "grad_norm": 2.4568188190460205, + "learning_rate": 3.617990608651959e-06, + "loss": 0.4298, + "step": 4537 + }, + { + "epoch": 2.145626477541371, + "grad_norm": 2.968804359436035, + "learning_rate": 3.61743260062911e-06, + "loss": 0.4696, + "step": 4538 + }, + { + "epoch": 2.1460992907801417, + "grad_norm": 2.629075288772583, + "learning_rate": 3.6168745230286327e-06, + 
"loss": 0.4234, + "step": 4539 + }, + { + "epoch": 2.1465721040189125, + "grad_norm": 2.7680578231811523, + "learning_rate": 3.6163163758852754e-06, + "loss": 0.4669, + "step": 4540 + }, + { + "epoch": 2.1470449172576833, + "grad_norm": 2.782825469970703, + "learning_rate": 3.615758159233793e-06, + "loss": 0.4552, + "step": 4541 + }, + { + "epoch": 2.147517730496454, + "grad_norm": 2.653047561645508, + "learning_rate": 3.615199873108942e-06, + "loss": 0.4393, + "step": 4542 + }, + { + "epoch": 2.1479905437352245, + "grad_norm": 2.4175806045532227, + "learning_rate": 3.6146415175454852e-06, + "loss": 0.4114, + "step": 4543 + }, + { + "epoch": 2.1484633569739953, + "grad_norm": 2.627943515777588, + "learning_rate": 3.614083092578189e-06, + "loss": 0.4215, + "step": 4544 + }, + { + "epoch": 2.148936170212766, + "grad_norm": 2.8934123516082764, + "learning_rate": 3.6135245982418227e-06, + "loss": 0.4815, + "step": 4545 + }, + { + "epoch": 2.1494089834515364, + "grad_norm": 2.8535244464874268, + "learning_rate": 3.612966034571164e-06, + "loss": 0.4683, + "step": 4546 + }, + { + "epoch": 2.149881796690307, + "grad_norm": 2.7826647758483887, + "learning_rate": 3.6124074016009893e-06, + "loss": 0.4351, + "step": 4547 + }, + { + "epoch": 2.150354609929078, + "grad_norm": 2.6906018257141113, + "learning_rate": 3.6118486993660834e-06, + "loss": 0.4585, + "step": 4548 + }, + { + "epoch": 2.150827423167849, + "grad_norm": 2.726766586303711, + "learning_rate": 3.6112899279012346e-06, + "loss": 0.4753, + "step": 4549 + }, + { + "epoch": 2.1513002364066196, + "grad_norm": 3.0193991661071777, + "learning_rate": 3.6107310872412348e-06, + "loss": 0.4827, + "step": 4550 + }, + { + "epoch": 2.15177304964539, + "grad_norm": 2.6788697242736816, + "learning_rate": 3.610172177420881e-06, + "loss": 0.4333, + "step": 4551 + }, + { + "epoch": 2.1522458628841608, + "grad_norm": 2.865410327911377, + "learning_rate": 3.609613198474973e-06, + "loss": 0.4569, + "step": 4552 + }, + { + "epoch": 2.1527186761229316, + "grad_norm": 2.9199366569519043, + "learning_rate": 3.609054150438317e-06, + "loss": 0.5097, + "step": 4553 + }, + { + "epoch": 2.153191489361702, + "grad_norm": 2.761035203933716, + "learning_rate": 3.6084950333457215e-06, + "loss": 0.5002, + "step": 4554 + }, + { + "epoch": 2.1536643026004727, + "grad_norm": 2.514223337173462, + "learning_rate": 3.607935847232002e-06, + "loss": 0.4171, + "step": 4555 + }, + { + "epoch": 2.1541371158392435, + "grad_norm": 2.5167524814605713, + "learning_rate": 3.6073765921319747e-06, + "loss": 0.4494, + "step": 4556 + }, + { + "epoch": 2.1546099290780143, + "grad_norm": 2.7540643215179443, + "learning_rate": 3.606817268080463e-06, + "loss": 0.4472, + "step": 4557 + }, + { + "epoch": 2.155082742316785, + "grad_norm": 2.7728664875030518, + "learning_rate": 3.6062578751122936e-06, + "loss": 0.4669, + "step": 4558 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 2.7788400650024414, + "learning_rate": 3.605698413262296e-06, + "loss": 0.4613, + "step": 4559 + }, + { + "epoch": 2.1560283687943262, + "grad_norm": 2.7811810970306396, + "learning_rate": 3.605138882565308e-06, + "loss": 0.4242, + "step": 4560 + }, + { + "epoch": 2.156501182033097, + "grad_norm": 2.7819995880126953, + "learning_rate": 3.6045792830561664e-06, + "loss": 0.443, + "step": 4561 + }, + { + "epoch": 2.1569739952718674, + "grad_norm": 2.671259641647339, + "learning_rate": 3.6040196147697166e-06, + "loss": 0.4336, + "step": 4562 + }, + { + "epoch": 2.157446808510638, + "grad_norm": 2.9296300411224365, + 
"learning_rate": 3.603459877740807e-06, + "loss": 0.479, + "step": 4563 + }, + { + "epoch": 2.157919621749409, + "grad_norm": 2.834937334060669, + "learning_rate": 3.602900072004289e-06, + "loss": 0.4603, + "step": 4564 + }, + { + "epoch": 2.15839243498818, + "grad_norm": 2.8434760570526123, + "learning_rate": 3.602340197595019e-06, + "loss": 0.4288, + "step": 4565 + }, + { + "epoch": 2.1588652482269506, + "grad_norm": 2.7245426177978516, + "learning_rate": 3.6017802545478593e-06, + "loss": 0.4194, + "step": 4566 + }, + { + "epoch": 2.159338061465721, + "grad_norm": 2.7795023918151855, + "learning_rate": 3.6012202428976735e-06, + "loss": 0.4481, + "step": 4567 + }, + { + "epoch": 2.1598108747044917, + "grad_norm": 2.9482083320617676, + "learning_rate": 3.6006601626793325e-06, + "loss": 0.468, + "step": 4568 + }, + { + "epoch": 2.1602836879432625, + "grad_norm": 2.9563326835632324, + "learning_rate": 3.6001000139277094e-06, + "loss": 0.4427, + "step": 4569 + }, + { + "epoch": 2.160756501182033, + "grad_norm": 2.7755916118621826, + "learning_rate": 3.599539796677682e-06, + "loss": 0.4258, + "step": 4570 + }, + { + "epoch": 2.1612293144208037, + "grad_norm": 2.961045265197754, + "learning_rate": 3.5989795109641333e-06, + "loss": 0.4645, + "step": 4571 + }, + { + "epoch": 2.1617021276595745, + "grad_norm": 3.0184407234191895, + "learning_rate": 3.5984191568219482e-06, + "loss": 0.4192, + "step": 4572 + }, + { + "epoch": 2.1621749408983453, + "grad_norm": 2.9811131954193115, + "learning_rate": 3.5978587342860192e-06, + "loss": 0.408, + "step": 4573 + }, + { + "epoch": 2.162647754137116, + "grad_norm": 2.9172329902648926, + "learning_rate": 3.597298243391242e-06, + "loss": 0.4528, + "step": 4574 + }, + { + "epoch": 2.1631205673758864, + "grad_norm": 2.7798452377319336, + "learning_rate": 3.596737684172513e-06, + "loss": 0.391, + "step": 4575 + }, + { + "epoch": 2.1635933806146572, + "grad_norm": 2.526277542114258, + "learning_rate": 3.596177056664738e-06, + "loss": 0.3699, + "step": 4576 + }, + { + "epoch": 2.164066193853428, + "grad_norm": 2.856269121170044, + "learning_rate": 3.5956163609028244e-06, + "loss": 0.4082, + "step": 4577 + }, + { + "epoch": 2.1645390070921984, + "grad_norm": 2.7681572437286377, + "learning_rate": 3.5950555969216845e-06, + "loss": 0.4064, + "step": 4578 + }, + { + "epoch": 2.165011820330969, + "grad_norm": 2.2924954891204834, + "learning_rate": 3.5944947647562333e-06, + "loss": 0.416, + "step": 4579 + }, + { + "epoch": 2.16548463356974, + "grad_norm": 2.439929485321045, + "learning_rate": 3.5939338644413936e-06, + "loss": 0.4476, + "step": 4580 + }, + { + "epoch": 2.1659574468085108, + "grad_norm": 2.786442518234253, + "learning_rate": 3.5933728960120877e-06, + "loss": 0.4525, + "step": 4581 + }, + { + "epoch": 2.166430260047281, + "grad_norm": 2.5910253524780273, + "learning_rate": 3.5928118595032465e-06, + "loss": 0.4441, + "step": 4582 + }, + { + "epoch": 2.166903073286052, + "grad_norm": 2.8144876956939697, + "learning_rate": 3.5922507549498024e-06, + "loss": 0.497, + "step": 4583 + }, + { + "epoch": 2.1673758865248227, + "grad_norm": 2.5714170932769775, + "learning_rate": 3.591689582386694e-06, + "loss": 0.4625, + "step": 4584 + }, + { + "epoch": 2.1678486997635935, + "grad_norm": 2.878187894821167, + "learning_rate": 3.591128341848861e-06, + "loss": 0.4835, + "step": 4585 + }, + { + "epoch": 2.168321513002364, + "grad_norm": 2.4946508407592773, + "learning_rate": 3.5905670333712504e-06, + "loss": 0.4278, + "step": 4586 + }, + { + "epoch": 2.1687943262411347, + 
"grad_norm": 2.9186196327209473, + "learning_rate": 3.590005656988814e-06, + "loss": 0.465, + "step": 4587 + }, + { + "epoch": 2.1692671394799055, + "grad_norm": 3.136807441711426, + "learning_rate": 3.5894442127365046e-06, + "loss": 0.4146, + "step": 4588 + }, + { + "epoch": 2.1697399527186763, + "grad_norm": 2.8106343746185303, + "learning_rate": 3.5888827006492804e-06, + "loss": 0.4737, + "step": 4589 + }, + { + "epoch": 2.1702127659574466, + "grad_norm": 2.874553680419922, + "learning_rate": 3.5883211207621047e-06, + "loss": 0.3962, + "step": 4590 + }, + { + "epoch": 2.1706855791962174, + "grad_norm": 2.7914116382598877, + "learning_rate": 3.587759473109946e-06, + "loss": 0.4705, + "step": 4591 + }, + { + "epoch": 2.171158392434988, + "grad_norm": 2.7273290157318115, + "learning_rate": 3.5871977577277745e-06, + "loss": 0.4827, + "step": 4592 + }, + { + "epoch": 2.171631205673759, + "grad_norm": 2.4167256355285645, + "learning_rate": 3.5866359746505653e-06, + "loss": 0.4181, + "step": 4593 + }, + { + "epoch": 2.1721040189125294, + "grad_norm": 2.8929779529571533, + "learning_rate": 3.586074123913299e-06, + "loss": 0.4006, + "step": 4594 + }, + { + "epoch": 2.1725768321513, + "grad_norm": 2.6996190547943115, + "learning_rate": 3.5855122055509593e-06, + "loss": 0.4792, + "step": 4595 + }, + { + "epoch": 2.173049645390071, + "grad_norm": 2.9341464042663574, + "learning_rate": 3.584950219598534e-06, + "loss": 0.3903, + "step": 4596 + }, + { + "epoch": 2.1735224586288417, + "grad_norm": 2.799330234527588, + "learning_rate": 3.5843881660910166e-06, + "loss": 0.4717, + "step": 4597 + }, + { + "epoch": 2.173995271867612, + "grad_norm": 2.5028693675994873, + "learning_rate": 3.5838260450634028e-06, + "loss": 0.4462, + "step": 4598 + }, + { + "epoch": 2.174468085106383, + "grad_norm": 2.5845541954040527, + "learning_rate": 3.583263856550693e-06, + "loss": 0.4327, + "step": 4599 + }, + { + "epoch": 2.1749408983451537, + "grad_norm": 2.4804906845092773, + "learning_rate": 3.5827016005878933e-06, + "loss": 0.4555, + "step": 4600 + }, + { + "epoch": 2.1754137115839245, + "grad_norm": 2.625746011734009, + "learning_rate": 3.5821392772100125e-06, + "loss": 0.455, + "step": 4601 + }, + { + "epoch": 2.175886524822695, + "grad_norm": 2.6230757236480713, + "learning_rate": 3.581576886452064e-06, + "loss": 0.4422, + "step": 4602 + }, + { + "epoch": 2.1763593380614656, + "grad_norm": 3.3104100227355957, + "learning_rate": 3.5810144283490656e-06, + "loss": 0.4212, + "step": 4603 + }, + { + "epoch": 2.1768321513002364, + "grad_norm": 2.6799755096435547, + "learning_rate": 3.5804519029360384e-06, + "loss": 0.4575, + "step": 4604 + }, + { + "epoch": 2.1773049645390072, + "grad_norm": 2.462216854095459, + "learning_rate": 3.5798893102480085e-06, + "loss": 0.4096, + "step": 4605 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 2.8600878715515137, + "learning_rate": 3.5793266503200074e-06, + "loss": 0.4798, + "step": 4606 + }, + { + "epoch": 2.1782505910165484, + "grad_norm": 2.935746431350708, + "learning_rate": 3.5787639231870673e-06, + "loss": 0.4021, + "step": 4607 + }, + { + "epoch": 2.178723404255319, + "grad_norm": 2.8655526638031006, + "learning_rate": 3.578201128884229e-06, + "loss": 0.4553, + "step": 4608 + }, + { + "epoch": 2.17919621749409, + "grad_norm": 3.219498634338379, + "learning_rate": 3.577638267446533e-06, + "loss": 0.4692, + "step": 4609 + }, + { + "epoch": 2.1796690307328603, + "grad_norm": 3.0449860095977783, + "learning_rate": 3.5770753389090283e-06, + "loss": 0.4675, + "step": 4610 + 
}, + { + "epoch": 2.180141843971631, + "grad_norm": 2.7045507431030273, + "learning_rate": 3.576512343306765e-06, + "loss": 0.4773, + "step": 4611 + }, + { + "epoch": 2.180614657210402, + "grad_norm": 2.601499557495117, + "learning_rate": 3.5759492806747985e-06, + "loss": 0.4112, + "step": 4612 + }, + { + "epoch": 2.1810874704491727, + "grad_norm": 2.987741470336914, + "learning_rate": 3.575386151048188e-06, + "loss": 0.4651, + "step": 4613 + }, + { + "epoch": 2.181560283687943, + "grad_norm": 2.961228847503662, + "learning_rate": 3.5748229544619973e-06, + "loss": 0.5116, + "step": 4614 + }, + { + "epoch": 2.182033096926714, + "grad_norm": 2.8008430004119873, + "learning_rate": 3.574259690951295e-06, + "loss": 0.4152, + "step": 4615 + }, + { + "epoch": 2.1825059101654847, + "grad_norm": 2.5429348945617676, + "learning_rate": 3.573696360551151e-06, + "loss": 0.4188, + "step": 4616 + }, + { + "epoch": 2.1829787234042555, + "grad_norm": 2.9566478729248047, + "learning_rate": 3.5731329632966428e-06, + "loss": 0.5156, + "step": 4617 + }, + { + "epoch": 2.183451536643026, + "grad_norm": 2.5302467346191406, + "learning_rate": 3.572569499222851e-06, + "loss": 0.4361, + "step": 4618 + }, + { + "epoch": 2.1839243498817966, + "grad_norm": 3.206803560256958, + "learning_rate": 3.5720059683648593e-06, + "loss": 0.5149, + "step": 4619 + }, + { + "epoch": 2.1843971631205674, + "grad_norm": 2.9432034492492676, + "learning_rate": 3.5714423707577573e-06, + "loss": 0.4411, + "step": 4620 + }, + { + "epoch": 2.184869976359338, + "grad_norm": 2.9412078857421875, + "learning_rate": 3.5708787064366358e-06, + "loss": 0.4372, + "step": 4621 + }, + { + "epoch": 2.1853427895981086, + "grad_norm": 3.1702330112457275, + "learning_rate": 3.5703149754365935e-06, + "loss": 0.4761, + "step": 4622 + }, + { + "epoch": 2.1858156028368794, + "grad_norm": 3.1240456104278564, + "learning_rate": 3.569751177792731e-06, + "loss": 0.4854, + "step": 4623 + }, + { + "epoch": 2.18628841607565, + "grad_norm": 2.7221994400024414, + "learning_rate": 3.5691873135401534e-06, + "loss": 0.4048, + "step": 4624 + }, + { + "epoch": 2.186761229314421, + "grad_norm": 2.74397873878479, + "learning_rate": 3.5686233827139695e-06, + "loss": 0.4747, + "step": 4625 + }, + { + "epoch": 2.1872340425531913, + "grad_norm": 2.7379889488220215, + "learning_rate": 3.5680593853492932e-06, + "loss": 0.4963, + "step": 4626 + }, + { + "epoch": 2.187706855791962, + "grad_norm": 3.040205478668213, + "learning_rate": 3.5674953214812435e-06, + "loss": 0.4917, + "step": 4627 + }, + { + "epoch": 2.188179669030733, + "grad_norm": 2.95302677154541, + "learning_rate": 3.56693119114494e-06, + "loss": 0.4758, + "step": 4628 + }, + { + "epoch": 2.1886524822695037, + "grad_norm": 2.5488312244415283, + "learning_rate": 3.56636699437551e-06, + "loss": 0.4057, + "step": 4629 + }, + { + "epoch": 2.189125295508274, + "grad_norm": 2.8379666805267334, + "learning_rate": 3.565802731208083e-06, + "loss": 0.4755, + "step": 4630 + }, + { + "epoch": 2.189598108747045, + "grad_norm": 2.8765869140625, + "learning_rate": 3.565238401677793e-06, + "loss": 0.4232, + "step": 4631 + }, + { + "epoch": 2.1900709219858157, + "grad_norm": 2.9091262817382812, + "learning_rate": 3.5646740058197784e-06, + "loss": 0.3874, + "step": 4632 + }, + { + "epoch": 2.1905437352245865, + "grad_norm": 2.7067387104034424, + "learning_rate": 3.5641095436691826e-06, + "loss": 0.4771, + "step": 4633 + }, + { + "epoch": 2.191016548463357, + "grad_norm": 2.403043508529663, + "learning_rate": 3.563545015261151e-06, + 
"loss": 0.4062, + "step": 4634 + }, + { + "epoch": 2.1914893617021276, + "grad_norm": 2.8059732913970947, + "learning_rate": 3.562980420630836e-06, + "loss": 0.4635, + "step": 4635 + }, + { + "epoch": 2.1919621749408984, + "grad_norm": 2.5467724800109863, + "learning_rate": 3.56241575981339e-06, + "loss": 0.4552, + "step": 4636 + }, + { + "epoch": 2.192434988179669, + "grad_norm": 2.651024103164673, + "learning_rate": 3.561851032843973e-06, + "loss": 0.38, + "step": 4637 + }, + { + "epoch": 2.1929078014184396, + "grad_norm": 2.5529849529266357, + "learning_rate": 3.5612862397577496e-06, + "loss": 0.4106, + "step": 4638 + }, + { + "epoch": 2.1933806146572103, + "grad_norm": 3.069258451461792, + "learning_rate": 3.5607213805898844e-06, + "loss": 0.461, + "step": 4639 + }, + { + "epoch": 2.193853427895981, + "grad_norm": 2.5652637481689453, + "learning_rate": 3.56015645537555e-06, + "loss": 0.4497, + "step": 4640 + }, + { + "epoch": 2.194326241134752, + "grad_norm": 2.699101209640503, + "learning_rate": 3.5595914641499224e-06, + "loss": 0.4887, + "step": 4641 + }, + { + "epoch": 2.1947990543735223, + "grad_norm": 2.9292235374450684, + "learning_rate": 3.5590264069481805e-06, + "loss": 0.4462, + "step": 4642 + }, + { + "epoch": 2.195271867612293, + "grad_norm": 2.6151106357574463, + "learning_rate": 3.5584612838055077e-06, + "loss": 0.4334, + "step": 4643 + }, + { + "epoch": 2.195744680851064, + "grad_norm": 2.895798444747925, + "learning_rate": 3.5578960947570923e-06, + "loss": 0.4448, + "step": 4644 + }, + { + "epoch": 2.1962174940898347, + "grad_norm": 2.627631425857544, + "learning_rate": 3.557330839838125e-06, + "loss": 0.436, + "step": 4645 + }, + { + "epoch": 2.196690307328605, + "grad_norm": 2.8803584575653076, + "learning_rate": 3.556765519083803e-06, + "loss": 0.4698, + "step": 4646 + }, + { + "epoch": 2.197163120567376, + "grad_norm": 2.436609983444214, + "learning_rate": 3.5562001325293265e-06, + "loss": 0.4043, + "step": 4647 + }, + { + "epoch": 2.1976359338061466, + "grad_norm": 2.5090718269348145, + "learning_rate": 3.5556346802098985e-06, + "loss": 0.4505, + "step": 4648 + }, + { + "epoch": 2.1981087470449174, + "grad_norm": 2.792783737182617, + "learning_rate": 3.5550691621607277e-06, + "loss": 0.43, + "step": 4649 + }, + { + "epoch": 2.198581560283688, + "grad_norm": 2.74153470993042, + "learning_rate": 3.554503578417026e-06, + "loss": 0.4496, + "step": 4650 + }, + { + "epoch": 2.1990543735224586, + "grad_norm": 3.0262627601623535, + "learning_rate": 3.5539379290140114e-06, + "loss": 0.4503, + "step": 4651 + }, + { + "epoch": 2.1995271867612294, + "grad_norm": 2.783811330795288, + "learning_rate": 3.553372213986903e-06, + "loss": 0.432, + "step": 4652 + }, + { + "epoch": 2.2, + "grad_norm": 3.091191053390503, + "learning_rate": 3.5528064333709255e-06, + "loss": 0.4658, + "step": 4653 + }, + { + "epoch": 2.2004728132387705, + "grad_norm": 2.814634084701538, + "learning_rate": 3.5522405872013076e-06, + "loss": 0.4473, + "step": 4654 + }, + { + "epoch": 2.2009456264775413, + "grad_norm": 2.6918299198150635, + "learning_rate": 3.5516746755132824e-06, + "loss": 0.5323, + "step": 4655 + }, + { + "epoch": 2.201418439716312, + "grad_norm": 2.9902455806732178, + "learning_rate": 3.5511086983420867e-06, + "loss": 0.5166, + "step": 4656 + }, + { + "epoch": 2.201891252955083, + "grad_norm": 2.932699203491211, + "learning_rate": 3.5505426557229616e-06, + "loss": 0.5197, + "step": 4657 + }, + { + "epoch": 2.2023640661938533, + "grad_norm": 2.585712432861328, + "learning_rate": 
3.549976547691152e-06, + "loss": 0.425, + "step": 4658 + }, + { + "epoch": 2.202836879432624, + "grad_norm": 3.1019949913024902, + "learning_rate": 3.5494103742819065e-06, + "loss": 0.485, + "step": 4659 + }, + { + "epoch": 2.203309692671395, + "grad_norm": 2.3169195652008057, + "learning_rate": 3.548844135530478e-06, + "loss": 0.4064, + "step": 4660 + }, + { + "epoch": 2.2037825059101657, + "grad_norm": 2.779240846633911, + "learning_rate": 3.5482778314721257e-06, + "loss": 0.427, + "step": 4661 + }, + { + "epoch": 2.204255319148936, + "grad_norm": 2.765423059463501, + "learning_rate": 3.5477114621421078e-06, + "loss": 0.5125, + "step": 4662 + }, + { + "epoch": 2.204728132387707, + "grad_norm": 2.5590033531188965, + "learning_rate": 3.5471450275756913e-06, + "loss": 0.4009, + "step": 4663 + }, + { + "epoch": 2.2052009456264776, + "grad_norm": 2.706068515777588, + "learning_rate": 3.546578527808146e-06, + "loss": 0.4604, + "step": 4664 + }, + { + "epoch": 2.2056737588652484, + "grad_norm": 2.7995102405548096, + "learning_rate": 3.546011962874745e-06, + "loss": 0.4088, + "step": 4665 + }, + { + "epoch": 2.2061465721040188, + "grad_norm": 2.6369729042053223, + "learning_rate": 3.5454453328107656e-06, + "loss": 0.4634, + "step": 4666 + }, + { + "epoch": 2.2066193853427896, + "grad_norm": 3.1426475048065186, + "learning_rate": 3.54487863765149e-06, + "loss": 0.4761, + "step": 4667 + }, + { + "epoch": 2.2070921985815604, + "grad_norm": 2.7739460468292236, + "learning_rate": 3.5443118774322027e-06, + "loss": 0.467, + "step": 4668 + }, + { + "epoch": 2.207565011820331, + "grad_norm": 2.559105157852173, + "learning_rate": 3.5437450521881934e-06, + "loss": 0.4268, + "step": 4669 + }, + { + "epoch": 2.2080378250591015, + "grad_norm": 2.726593017578125, + "learning_rate": 3.543178161954758e-06, + "loss": 0.462, + "step": 4670 + }, + { + "epoch": 2.2085106382978723, + "grad_norm": 2.796109199523926, + "learning_rate": 3.5426112067671907e-06, + "loss": 0.4571, + "step": 4671 + }, + { + "epoch": 2.208983451536643, + "grad_norm": 2.7989072799682617, + "learning_rate": 3.5420441866607964e-06, + "loss": 0.4648, + "step": 4672 + }, + { + "epoch": 2.209456264775414, + "grad_norm": 2.6750967502593994, + "learning_rate": 3.5414771016708795e-06, + "loss": 0.4717, + "step": 4673 + }, + { + "epoch": 2.2099290780141843, + "grad_norm": 2.705659866333008, + "learning_rate": 3.5409099518327507e-06, + "loss": 0.4738, + "step": 4674 + }, + { + "epoch": 2.210401891252955, + "grad_norm": 2.79276442527771, + "learning_rate": 3.5403427371817234e-06, + "loss": 0.4625, + "step": 4675 + }, + { + "epoch": 2.210874704491726, + "grad_norm": 2.781339406967163, + "learning_rate": 3.539775457753115e-06, + "loss": 0.438, + "step": 4676 + }, + { + "epoch": 2.2113475177304966, + "grad_norm": 3.0088918209075928, + "learning_rate": 3.5392081135822488e-06, + "loss": 0.4776, + "step": 4677 + }, + { + "epoch": 2.211820330969267, + "grad_norm": 3.0291390419006348, + "learning_rate": 3.538640704704449e-06, + "loss": 0.4634, + "step": 4678 + }, + { + "epoch": 2.212293144208038, + "grad_norm": 2.967867374420166, + "learning_rate": 3.5380732311550477e-06, + "loss": 0.4776, + "step": 4679 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 2.6268832683563232, + "learning_rate": 3.5375056929693787e-06, + "loss": 0.4646, + "step": 4680 + }, + { + "epoch": 2.2132387706855794, + "grad_norm": 2.6688554286956787, + "learning_rate": 3.536938090182778e-06, + "loss": 0.3975, + "step": 4681 + }, + { + "epoch": 2.2137115839243497, + "grad_norm": 
3.0079736709594727, + "learning_rate": 3.5363704228305906e-06, + "loss": 0.4724, + "step": 4682 + }, + { + "epoch": 2.2141843971631205, + "grad_norm": 2.4287586212158203, + "learning_rate": 3.535802690948161e-06, + "loss": 0.4371, + "step": 4683 + }, + { + "epoch": 2.2146572104018913, + "grad_norm": 2.960679531097412, + "learning_rate": 3.53523489457084e-06, + "loss": 0.4347, + "step": 4684 + }, + { + "epoch": 2.215130023640662, + "grad_norm": 2.9646008014678955, + "learning_rate": 3.5346670337339807e-06, + "loss": 0.4803, + "step": 4685 + }, + { + "epoch": 2.2156028368794325, + "grad_norm": 3.0518898963928223, + "learning_rate": 3.534099108472942e-06, + "loss": 0.4712, + "step": 4686 + }, + { + "epoch": 2.2160756501182033, + "grad_norm": 2.776681900024414, + "learning_rate": 3.533531118823086e-06, + "loss": 0.4347, + "step": 4687 + }, + { + "epoch": 2.216548463356974, + "grad_norm": 2.18019437789917, + "learning_rate": 3.53296306481978e-06, + "loss": 0.3551, + "step": 4688 + }, + { + "epoch": 2.217021276595745, + "grad_norm": 2.9400811195373535, + "learning_rate": 3.5323949464983937e-06, + "loss": 0.4912, + "step": 4689 + }, + { + "epoch": 2.2174940898345152, + "grad_norm": 2.798386812210083, + "learning_rate": 3.5318267638943e-06, + "loss": 0.3967, + "step": 4690 + }, + { + "epoch": 2.217966903073286, + "grad_norm": 2.5452775955200195, + "learning_rate": 3.531258517042879e-06, + "loss": 0.3773, + "step": 4691 + }, + { + "epoch": 2.218439716312057, + "grad_norm": 2.711137294769287, + "learning_rate": 3.5306902059795113e-06, + "loss": 0.4123, + "step": 4692 + }, + { + "epoch": 2.2189125295508276, + "grad_norm": 3.0022387504577637, + "learning_rate": 3.530121830739584e-06, + "loss": 0.4898, + "step": 4693 + }, + { + "epoch": 2.219385342789598, + "grad_norm": 2.871814250946045, + "learning_rate": 3.5295533913584877e-06, + "loss": 0.4497, + "step": 4694 + }, + { + "epoch": 2.219858156028369, + "grad_norm": 2.9782521724700928, + "learning_rate": 3.528984887871616e-06, + "loss": 0.4797, + "step": 4695 + }, + { + "epoch": 2.2203309692671396, + "grad_norm": 2.6896398067474365, + "learning_rate": 3.5284163203143673e-06, + "loss": 0.439, + "step": 4696 + }, + { + "epoch": 2.2208037825059104, + "grad_norm": 2.7898833751678467, + "learning_rate": 3.5278476887221436e-06, + "loss": 0.4656, + "step": 4697 + }, + { + "epoch": 2.2212765957446807, + "grad_norm": 2.800416946411133, + "learning_rate": 3.527278993130352e-06, + "loss": 0.4452, + "step": 4698 + }, + { + "epoch": 2.2217494089834515, + "grad_norm": 3.653228998184204, + "learning_rate": 3.526710233574401e-06, + "loss": 0.4189, + "step": 4699 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 2.856956958770752, + "learning_rate": 3.5261414100897064e-06, + "loss": 0.4298, + "step": 4700 + }, + { + "epoch": 2.222695035460993, + "grad_norm": 2.8576223850250244, + "learning_rate": 3.5255725227116854e-06, + "loss": 0.4425, + "step": 4701 + }, + { + "epoch": 2.2231678486997635, + "grad_norm": 3.1161351203918457, + "learning_rate": 3.5250035714757603e-06, + "loss": 0.4609, + "step": 4702 + }, + { + "epoch": 2.2236406619385343, + "grad_norm": 2.843379259109497, + "learning_rate": 3.5244345564173578e-06, + "loss": 0.3589, + "step": 4703 + }, + { + "epoch": 2.224113475177305, + "grad_norm": 2.877157211303711, + "learning_rate": 3.5238654775719068e-06, + "loss": 0.4591, + "step": 4704 + }, + { + "epoch": 2.2245862884160754, + "grad_norm": 3.488954782485962, + "learning_rate": 3.5232963349748424e-06, + "loss": 0.4836, + "step": 4705 + }, + { + "epoch": 
2.225059101654846, + "grad_norm": 2.929037570953369, + "learning_rate": 3.5227271286616025e-06, + "loss": 0.5293, + "step": 4706 + }, + { + "epoch": 2.225531914893617, + "grad_norm": 2.6230576038360596, + "learning_rate": 3.5221578586676286e-06, + "loss": 0.4235, + "step": 4707 + }, + { + "epoch": 2.226004728132388, + "grad_norm": 2.529998302459717, + "learning_rate": 3.5215885250283664e-06, + "loss": 0.4369, + "step": 4708 + }, + { + "epoch": 2.2264775413711586, + "grad_norm": 2.817279577255249, + "learning_rate": 3.521019127779267e-06, + "loss": 0.481, + "step": 4709 + }, + { + "epoch": 2.226950354609929, + "grad_norm": 3.1513843536376953, + "learning_rate": 3.5204496669557833e-06, + "loss": 0.463, + "step": 4710 + }, + { + "epoch": 2.2274231678486998, + "grad_norm": 2.9403610229492188, + "learning_rate": 3.5198801425933725e-06, + "loss": 0.455, + "step": 4711 + }, + { + "epoch": 2.2278959810874706, + "grad_norm": 2.648346424102783, + "learning_rate": 3.5193105547274987e-06, + "loss": 0.4441, + "step": 4712 + }, + { + "epoch": 2.228368794326241, + "grad_norm": 2.791898727416992, + "learning_rate": 3.5187409033936252e-06, + "loss": 0.4682, + "step": 4713 + }, + { + "epoch": 2.2288416075650117, + "grad_norm": 2.8157432079315186, + "learning_rate": 3.5181711886272242e-06, + "loss": 0.4572, + "step": 4714 + }, + { + "epoch": 2.2293144208037825, + "grad_norm": 3.250319480895996, + "learning_rate": 3.5176014104637665e-06, + "loss": 0.4599, + "step": 4715 + }, + { + "epoch": 2.2297872340425533, + "grad_norm": 2.6747050285339355, + "learning_rate": 3.5170315689387307e-06, + "loss": 0.4328, + "step": 4716 + }, + { + "epoch": 2.230260047281324, + "grad_norm": 2.584094762802124, + "learning_rate": 3.5164616640875993e-06, + "loss": 0.4268, + "step": 4717 + }, + { + "epoch": 2.2307328605200945, + "grad_norm": 2.480710506439209, + "learning_rate": 3.5158916959458573e-06, + "loss": 0.438, + "step": 4718 + }, + { + "epoch": 2.2312056737588652, + "grad_norm": 2.9338483810424805, + "learning_rate": 3.515321664548993e-06, + "loss": 0.4937, + "step": 4719 + }, + { + "epoch": 2.231678486997636, + "grad_norm": 2.7880783081054688, + "learning_rate": 3.5147515699325013e-06, + "loss": 0.4624, + "step": 4720 + }, + { + "epoch": 2.2321513002364064, + "grad_norm": 2.740841865539551, + "learning_rate": 3.5141814121318797e-06, + "loss": 0.3689, + "step": 4721 + }, + { + "epoch": 2.232624113475177, + "grad_norm": 2.9541244506835938, + "learning_rate": 3.5136111911826277e-06, + "loss": 0.4092, + "step": 4722 + }, + { + "epoch": 2.233096926713948, + "grad_norm": 2.7205398082733154, + "learning_rate": 3.5130409071202515e-06, + "loss": 0.445, + "step": 4723 + }, + { + "epoch": 2.233569739952719, + "grad_norm": 2.563406229019165, + "learning_rate": 3.51247055998026e-06, + "loss": 0.4335, + "step": 4724 + }, + { + "epoch": 2.2340425531914896, + "grad_norm": 2.4249489307403564, + "learning_rate": 3.5119001497981666e-06, + "loss": 0.4671, + "step": 4725 + }, + { + "epoch": 2.23451536643026, + "grad_norm": 2.711630344390869, + "learning_rate": 3.5113296766094875e-06, + "loss": 0.4177, + "step": 4726 + }, + { + "epoch": 2.2349881796690307, + "grad_norm": 3.0257632732391357, + "learning_rate": 3.5107591404497443e-06, + "loss": 0.4976, + "step": 4727 + }, + { + "epoch": 2.2354609929078015, + "grad_norm": 2.717303991317749, + "learning_rate": 3.5101885413544614e-06, + "loss": 0.4621, + "step": 4728 + }, + { + "epoch": 2.235933806146572, + "grad_norm": 3.2846004962921143, + "learning_rate": 3.509617879359167e-06, + "loss": 0.4284, 
+ "step": 4729 + }, + { + "epoch": 2.2364066193853427, + "grad_norm": 2.7217819690704346, + "learning_rate": 3.5090471544993953e-06, + "loss": 0.4247, + "step": 4730 + }, + { + "epoch": 2.2368794326241135, + "grad_norm": 2.5003223419189453, + "learning_rate": 3.5084763668106812e-06, + "loss": 0.4096, + "step": 4731 + }, + { + "epoch": 2.2373522458628843, + "grad_norm": 2.7312731742858887, + "learning_rate": 3.5079055163285658e-06, + "loss": 0.4741, + "step": 4732 + }, + { + "epoch": 2.237825059101655, + "grad_norm": 2.84940767288208, + "learning_rate": 3.5073346030885934e-06, + "loss": 0.4887, + "step": 4733 + }, + { + "epoch": 2.2382978723404254, + "grad_norm": 3.1188511848449707, + "learning_rate": 3.506763627126313e-06, + "loss": 0.5335, + "step": 4734 + }, + { + "epoch": 2.2387706855791962, + "grad_norm": 2.6741397380828857, + "learning_rate": 3.5061925884772753e-06, + "loss": 0.4137, + "step": 4735 + }, + { + "epoch": 2.239243498817967, + "grad_norm": 3.1542465686798096, + "learning_rate": 3.505621487177037e-06, + "loss": 0.5303, + "step": 4736 + }, + { + "epoch": 2.2397163120567374, + "grad_norm": 5.448268890380859, + "learning_rate": 3.505050323261159e-06, + "loss": 0.4995, + "step": 4737 + }, + { + "epoch": 2.240189125295508, + "grad_norm": 2.7317898273468018, + "learning_rate": 3.5044790967652037e-06, + "loss": 0.4595, + "step": 4738 + }, + { + "epoch": 2.240661938534279, + "grad_norm": 2.8135695457458496, + "learning_rate": 3.50390780772474e-06, + "loss": 0.4593, + "step": 4739 + }, + { + "epoch": 2.2411347517730498, + "grad_norm": 3.1391844749450684, + "learning_rate": 3.5033364561753393e-06, + "loss": 0.4902, + "step": 4740 + }, + { + "epoch": 2.24160756501182, + "grad_norm": 2.6383132934570312, + "learning_rate": 3.5027650421525762e-06, + "loss": 0.3832, + "step": 4741 + }, + { + "epoch": 2.242080378250591, + "grad_norm": 2.742546558380127, + "learning_rate": 3.5021935656920314e-06, + "loss": 0.4012, + "step": 4742 + }, + { + "epoch": 2.2425531914893617, + "grad_norm": 3.1243674755096436, + "learning_rate": 3.5016220268292873e-06, + "loss": 0.4271, + "step": 4743 + }, + { + "epoch": 2.2430260047281325, + "grad_norm": 2.794717788696289, + "learning_rate": 3.501050425599932e-06, + "loss": 0.4604, + "step": 4744 + }, + { + "epoch": 2.243498817966903, + "grad_norm": 2.8481621742248535, + "learning_rate": 3.5004787620395565e-06, + "loss": 0.4814, + "step": 4745 + }, + { + "epoch": 2.2439716312056737, + "grad_norm": 2.8842051029205322, + "learning_rate": 3.499907036183755e-06, + "loss": 0.4987, + "step": 4746 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 3.074805974960327, + "learning_rate": 3.4993352480681265e-06, + "loss": 0.4966, + "step": 4747 + }, + { + "epoch": 2.2449172576832153, + "grad_norm": 2.7204246520996094, + "learning_rate": 3.4987633977282742e-06, + "loss": 0.4, + "step": 4748 + }, + { + "epoch": 2.2453900709219856, + "grad_norm": 2.685884952545166, + "learning_rate": 3.4981914851998055e-06, + "loss": 0.4285, + "step": 4749 + }, + { + "epoch": 2.2458628841607564, + "grad_norm": 2.1666336059570312, + "learning_rate": 3.4976195105183287e-06, + "loss": 0.3756, + "step": 4750 + }, + { + "epoch": 2.246335697399527, + "grad_norm": 2.863006353378296, + "learning_rate": 3.49704747371946e-06, + "loss": 0.4535, + "step": 4751 + }, + { + "epoch": 2.246808510638298, + "grad_norm": 2.5558736324310303, + "learning_rate": 3.496475374838817e-06, + "loss": 0.4129, + "step": 4752 + }, + { + "epoch": 2.2472813238770684, + "grad_norm": 2.9780309200286865, + "learning_rate": 
3.495903213912022e-06, + "loss": 0.4871, + "step": 4753 + }, + { + "epoch": 2.247754137115839, + "grad_norm": 2.951779365539551, + "learning_rate": 3.4953309909747e-06, + "loss": 0.5162, + "step": 4754 + }, + { + "epoch": 2.24822695035461, + "grad_norm": 2.7654693126678467, + "learning_rate": 3.4947587060624834e-06, + "loss": 0.4662, + "step": 4755 + }, + { + "epoch": 2.2486997635933808, + "grad_norm": 2.708247184753418, + "learning_rate": 3.494186359211002e-06, + "loss": 0.4279, + "step": 4756 + }, + { + "epoch": 2.249172576832151, + "grad_norm": 3.09916615486145, + "learning_rate": 3.4936139504558963e-06, + "loss": 0.4085, + "step": 4757 + }, + { + "epoch": 2.249645390070922, + "grad_norm": 2.913806200027466, + "learning_rate": 3.493041479832807e-06, + "loss": 0.4653, + "step": 4758 + }, + { + "epoch": 2.2501182033096927, + "grad_norm": 3.2903928756713867, + "learning_rate": 3.4924689473773787e-06, + "loss": 0.5167, + "step": 4759 + }, + { + "epoch": 2.2505910165484635, + "grad_norm": 3.1302902698516846, + "learning_rate": 3.4918963531252607e-06, + "loss": 0.5398, + "step": 4760 + }, + { + "epoch": 2.251063829787234, + "grad_norm": 2.8858273029327393, + "learning_rate": 3.4913236971121063e-06, + "loss": 0.4395, + "step": 4761 + }, + { + "epoch": 2.2515366430260046, + "grad_norm": 3.194521903991699, + "learning_rate": 3.4907509793735727e-06, + "loss": 0.5258, + "step": 4762 + }, + { + "epoch": 2.2520094562647754, + "grad_norm": 2.8640544414520264, + "learning_rate": 3.49017819994532e-06, + "loss": 0.4073, + "step": 4763 + }, + { + "epoch": 2.2524822695035462, + "grad_norm": 3.139995813369751, + "learning_rate": 3.489605358863011e-06, + "loss": 0.4653, + "step": 4764 + }, + { + "epoch": 2.2529550827423166, + "grad_norm": 2.6228537559509277, + "learning_rate": 3.489032456162317e-06, + "loss": 0.4546, + "step": 4765 + }, + { + "epoch": 2.2534278959810874, + "grad_norm": 2.8197672367095947, + "learning_rate": 3.4884594918789083e-06, + "loss": 0.479, + "step": 4766 + }, + { + "epoch": 2.253900709219858, + "grad_norm": 2.7839298248291016, + "learning_rate": 3.4878864660484612e-06, + "loss": 0.5081, + "step": 4767 + }, + { + "epoch": 2.254373522458629, + "grad_norm": 2.8630709648132324, + "learning_rate": 3.487313378706656e-06, + "loss": 0.4345, + "step": 4768 + }, + { + "epoch": 2.2548463356973993, + "grad_norm": 2.5661563873291016, + "learning_rate": 3.4867402298891755e-06, + "loss": 0.4266, + "step": 4769 + }, + { + "epoch": 2.25531914893617, + "grad_norm": 2.6274025440216064, + "learning_rate": 3.4861670196317084e-06, + "loss": 0.4645, + "step": 4770 + }, + { + "epoch": 2.255791962174941, + "grad_norm": 2.578702449798584, + "learning_rate": 3.485593747969944e-06, + "loss": 0.4242, + "step": 4771 + }, + { + "epoch": 2.2562647754137117, + "grad_norm": 2.322476625442505, + "learning_rate": 3.48502041493958e-06, + "loss": 0.3975, + "step": 4772 + }, + { + "epoch": 2.256737588652482, + "grad_norm": 2.8412630558013916, + "learning_rate": 3.484447020576313e-06, + "loss": 0.4276, + "step": 4773 + }, + { + "epoch": 2.257210401891253, + "grad_norm": 2.6090497970581055, + "learning_rate": 3.483873564915847e-06, + "loss": 0.429, + "step": 4774 + }, + { + "epoch": 2.2576832151300237, + "grad_norm": 2.692458152770996, + "learning_rate": 3.4833000479938877e-06, + "loss": 0.4211, + "step": 4775 + }, + { + "epoch": 2.2581560283687945, + "grad_norm": 2.5546815395355225, + "learning_rate": 3.482726469846146e-06, + "loss": 0.4751, + "step": 4776 + }, + { + "epoch": 2.258628841607565, + "grad_norm": 
2.8409626483917236, + "learning_rate": 3.4821528305083376e-06, + "loss": 0.4821, + "step": 4777 + }, + { + "epoch": 2.2591016548463356, + "grad_norm": 2.722966432571411, + "learning_rate": 3.4815791300161785e-06, + "loss": 0.5029, + "step": 4778 + }, + { + "epoch": 2.2595744680851064, + "grad_norm": 2.691603899002075, + "learning_rate": 3.48100536840539e-06, + "loss": 0.4242, + "step": 4779 + }, + { + "epoch": 2.260047281323877, + "grad_norm": 2.64035964012146, + "learning_rate": 3.4804315457116992e-06, + "loss": 0.4033, + "step": 4780 + }, + { + "epoch": 2.2605200945626476, + "grad_norm": 2.758819580078125, + "learning_rate": 3.4798576619708357e-06, + "loss": 0.4321, + "step": 4781 + }, + { + "epoch": 2.2609929078014184, + "grad_norm": 2.8204405307769775, + "learning_rate": 3.4792837172185324e-06, + "loss": 0.4309, + "step": 4782 + }, + { + "epoch": 2.261465721040189, + "grad_norm": 2.529771327972412, + "learning_rate": 3.478709711490525e-06, + "loss": 0.4398, + "step": 4783 + }, + { + "epoch": 2.26193853427896, + "grad_norm": 2.8156251907348633, + "learning_rate": 3.4781356448225557e-06, + "loss": 0.447, + "step": 4784 + }, + { + "epoch": 2.2624113475177303, + "grad_norm": 2.689528703689575, + "learning_rate": 3.477561517250369e-06, + "loss": 0.3907, + "step": 4785 + }, + { + "epoch": 2.262884160756501, + "grad_norm": 2.9148027896881104, + "learning_rate": 3.476987328809713e-06, + "loss": 0.4287, + "step": 4786 + }, + { + "epoch": 2.263356973995272, + "grad_norm": 2.933021306991577, + "learning_rate": 3.4764130795363404e-06, + "loss": 0.4847, + "step": 4787 + }, + { + "epoch": 2.2638297872340427, + "grad_norm": 2.8559257984161377, + "learning_rate": 3.4758387694660064e-06, + "loss": 0.4554, + "step": 4788 + }, + { + "epoch": 2.264302600472813, + "grad_norm": 3.0355522632598877, + "learning_rate": 3.4752643986344707e-06, + "loss": 0.4286, + "step": 4789 + }, + { + "epoch": 2.264775413711584, + "grad_norm": 2.9768362045288086, + "learning_rate": 3.474689967077498e-06, + "loss": 0.4917, + "step": 4790 + }, + { + "epoch": 2.2652482269503547, + "grad_norm": 2.827971935272217, + "learning_rate": 3.474115474830855e-06, + "loss": 0.4542, + "step": 4791 + }, + { + "epoch": 2.2657210401891255, + "grad_norm": 2.559659719467163, + "learning_rate": 3.4735409219303123e-06, + "loss": 0.4168, + "step": 4792 + }, + { + "epoch": 2.266193853427896, + "grad_norm": 2.3172824382781982, + "learning_rate": 3.472966308411645e-06, + "loss": 0.3535, + "step": 4793 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 2.6779656410217285, + "learning_rate": 3.4723916343106327e-06, + "loss": 0.4599, + "step": 4794 + }, + { + "epoch": 2.2671394799054374, + "grad_norm": 2.55780291557312, + "learning_rate": 3.4718168996630573e-06, + "loss": 0.4185, + "step": 4795 + }, + { + "epoch": 2.267612293144208, + "grad_norm": 2.4929800033569336, + "learning_rate": 3.471242104504704e-06, + "loss": 0.4008, + "step": 4796 + }, + { + "epoch": 2.2680851063829786, + "grad_norm": 2.849475145339966, + "learning_rate": 3.4706672488713642e-06, + "loss": 0.396, + "step": 4797 + }, + { + "epoch": 2.2685579196217494, + "grad_norm": 2.4830739498138428, + "learning_rate": 3.4700923327988306e-06, + "loss": 0.4087, + "step": 4798 + }, + { + "epoch": 2.26903073286052, + "grad_norm": 3.2748119831085205, + "learning_rate": 3.469517356322901e-06, + "loss": 0.4496, + "step": 4799 + }, + { + "epoch": 2.269503546099291, + "grad_norm": 3.0440170764923096, + "learning_rate": 3.468942319479378e-06, + "loss": 0.4903, + "step": 4800 + }, + { + "epoch": 
2.2699763593380613, + "grad_norm": 2.8200504779815674, + "learning_rate": 3.4683672223040645e-06, + "loss": 0.4588, + "step": 4801 + }, + { + "epoch": 2.270449172576832, + "grad_norm": 2.675206184387207, + "learning_rate": 3.4677920648327707e-06, + "loss": 0.4257, + "step": 4802 + }, + { + "epoch": 2.270921985815603, + "grad_norm": 2.862675905227661, + "learning_rate": 3.4672168471013084e-06, + "loss": 0.466, + "step": 4803 + }, + { + "epoch": 2.2713947990543737, + "grad_norm": 2.65663743019104, + "learning_rate": 3.4666415691454947e-06, + "loss": 0.4784, + "step": 4804 + }, + { + "epoch": 2.271867612293144, + "grad_norm": 2.5610506534576416, + "learning_rate": 3.4660662310011483e-06, + "loss": 0.4429, + "step": 4805 + }, + { + "epoch": 2.272340425531915, + "grad_norm": 2.6459643840789795, + "learning_rate": 3.465490832704094e-06, + "loss": 0.4345, + "step": 4806 + }, + { + "epoch": 2.2728132387706856, + "grad_norm": 2.426013469696045, + "learning_rate": 3.4649153742901585e-06, + "loss": 0.4533, + "step": 4807 + }, + { + "epoch": 2.2732860520094564, + "grad_norm": 2.6714842319488525, + "learning_rate": 3.4643398557951745e-06, + "loss": 0.4409, + "step": 4808 + }, + { + "epoch": 2.273758865248227, + "grad_norm": 2.703629493713379, + "learning_rate": 3.463764277254976e-06, + "loss": 0.3656, + "step": 4809 + }, + { + "epoch": 2.2742316784869976, + "grad_norm": 2.811753988265991, + "learning_rate": 3.4631886387054025e-06, + "loss": 0.4957, + "step": 4810 + }, + { + "epoch": 2.2747044917257684, + "grad_norm": 2.9469289779663086, + "learning_rate": 3.462612940182295e-06, + "loss": 0.4582, + "step": 4811 + }, + { + "epoch": 2.275177304964539, + "grad_norm": 2.6287801265716553, + "learning_rate": 3.462037181721501e-06, + "loss": 0.4072, + "step": 4812 + }, + { + "epoch": 2.2756501182033095, + "grad_norm": 2.7104952335357666, + "learning_rate": 3.46146136335887e-06, + "loss": 0.4998, + "step": 4813 + }, + { + "epoch": 2.2761229314420803, + "grad_norm": 3.170363187789917, + "learning_rate": 3.460885485130256e-06, + "loss": 0.4722, + "step": 4814 + }, + { + "epoch": 2.276595744680851, + "grad_norm": 2.7315151691436768, + "learning_rate": 3.460309547071516e-06, + "loss": 0.4482, + "step": 4815 + }, + { + "epoch": 2.277068557919622, + "grad_norm": 2.685988187789917, + "learning_rate": 3.4597335492185113e-06, + "loss": 0.4419, + "step": 4816 + }, + { + "epoch": 2.2775413711583923, + "grad_norm": 2.532790184020996, + "learning_rate": 3.459157491607107e-06, + "loss": 0.3961, + "step": 4817 + }, + { + "epoch": 2.278014184397163, + "grad_norm": 2.920729875564575, + "learning_rate": 3.458581374273171e-06, + "loss": 0.4767, + "step": 4818 + }, + { + "epoch": 2.278486997635934, + "grad_norm": 3.2481250762939453, + "learning_rate": 3.458005197252577e-06, + "loss": 0.4985, + "step": 4819 + }, + { + "epoch": 2.2789598108747047, + "grad_norm": 2.373809814453125, + "learning_rate": 3.4574289605811994e-06, + "loss": 0.4259, + "step": 4820 + }, + { + "epoch": 2.279432624113475, + "grad_norm": 2.7851033210754395, + "learning_rate": 3.4568526642949184e-06, + "loss": 0.4829, + "step": 4821 + }, + { + "epoch": 2.279905437352246, + "grad_norm": 2.9777133464813232, + "learning_rate": 3.456276308429618e-06, + "loss": 0.4896, + "step": 4822 + }, + { + "epoch": 2.2803782505910166, + "grad_norm": 2.7922022342681885, + "learning_rate": 3.4556998930211853e-06, + "loss": 0.4908, + "step": 4823 + }, + { + "epoch": 2.2808510638297874, + "grad_norm": 2.699180841445923, + "learning_rate": 3.4551234181055104e-06, + "loss": 0.4518, + 
"step": 4824 + }, + { + "epoch": 2.2813238770685578, + "grad_norm": 3.1200520992279053, + "learning_rate": 3.4545468837184885e-06, + "loss": 0.4877, + "step": 4825 + }, + { + "epoch": 2.2817966903073286, + "grad_norm": 2.56782603263855, + "learning_rate": 3.453970289896018e-06, + "loss": 0.4281, + "step": 4826 + }, + { + "epoch": 2.2822695035460994, + "grad_norm": 3.241356372833252, + "learning_rate": 3.4533936366740007e-06, + "loss": 0.4338, + "step": 4827 + }, + { + "epoch": 2.28274231678487, + "grad_norm": 3.560295343399048, + "learning_rate": 3.452816924088342e-06, + "loss": 0.4121, + "step": 4828 + }, + { + "epoch": 2.2832151300236405, + "grad_norm": 2.8512449264526367, + "learning_rate": 3.452240152174951e-06, + "loss": 0.4357, + "step": 4829 + }, + { + "epoch": 2.2836879432624113, + "grad_norm": 3.0332651138305664, + "learning_rate": 3.4516633209697408e-06, + "loss": 0.4985, + "step": 4830 + }, + { + "epoch": 2.284160756501182, + "grad_norm": 2.520930528640747, + "learning_rate": 3.451086430508629e-06, + "loss": 0.4021, + "step": 4831 + }, + { + "epoch": 2.284633569739953, + "grad_norm": 2.508227825164795, + "learning_rate": 3.4505094808275363e-06, + "loss": 0.3935, + "step": 4832 + }, + { + "epoch": 2.2851063829787233, + "grad_norm": 2.56752610206604, + "learning_rate": 3.449932471962385e-06, + "loss": 0.4689, + "step": 4833 + }, + { + "epoch": 2.285579196217494, + "grad_norm": 2.7757534980773926, + "learning_rate": 3.449355403949105e-06, + "loss": 0.4565, + "step": 4834 + }, + { + "epoch": 2.286052009456265, + "grad_norm": 3.364821195602417, + "learning_rate": 3.448778276823626e-06, + "loss": 0.4729, + "step": 4835 + }, + { + "epoch": 2.2865248226950357, + "grad_norm": 3.0045557022094727, + "learning_rate": 3.448201090621884e-06, + "loss": 0.4834, + "step": 4836 + }, + { + "epoch": 2.286997635933806, + "grad_norm": 2.9451794624328613, + "learning_rate": 3.4476238453798183e-06, + "loss": 0.489, + "step": 4837 + }, + { + "epoch": 2.287470449172577, + "grad_norm": 2.8307435512542725, + "learning_rate": 3.4470465411333708e-06, + "loss": 0.5079, + "step": 4838 + }, + { + "epoch": 2.2879432624113476, + "grad_norm": 2.7118136882781982, + "learning_rate": 3.4464691779184876e-06, + "loss": 0.4794, + "step": 4839 + }, + { + "epoch": 2.2884160756501184, + "grad_norm": 2.6724441051483154, + "learning_rate": 3.445891755771119e-06, + "loss": 0.4619, + "step": 4840 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 2.8161258697509766, + "learning_rate": 3.445314274727218e-06, + "loss": 0.4287, + "step": 4841 + }, + { + "epoch": 2.2893617021276595, + "grad_norm": 2.5681750774383545, + "learning_rate": 3.4447367348227433e-06, + "loss": 0.4167, + "step": 4842 + }, + { + "epoch": 2.2898345153664303, + "grad_norm": 2.8136284351348877, + "learning_rate": 3.444159136093654e-06, + "loss": 0.4195, + "step": 4843 + }, + { + "epoch": 2.290307328605201, + "grad_norm": 3.153651714324951, + "learning_rate": 3.443581478575915e-06, + "loss": 0.4821, + "step": 4844 + }, + { + "epoch": 2.2907801418439715, + "grad_norm": 2.980883836746216, + "learning_rate": 3.4430037623054953e-06, + "loss": 0.4627, + "step": 4845 + }, + { + "epoch": 2.2912529550827423, + "grad_norm": 2.786182403564453, + "learning_rate": 3.4424259873183664e-06, + "loss": 0.4342, + "step": 4846 + }, + { + "epoch": 2.291725768321513, + "grad_norm": 2.8938279151916504, + "learning_rate": 3.4418481536505026e-06, + "loss": 0.3997, + "step": 4847 + }, + { + "epoch": 2.2921985815602834, + "grad_norm": 2.5534510612487793, + "learning_rate": 
3.4412702613378844e-06, + "loss": 0.3982, + "step": 4848 + }, + { + "epoch": 2.2926713947990542, + "grad_norm": 2.7907063961029053, + "learning_rate": 3.4406923104164956e-06, + "loss": 0.4484, + "step": 4849 + }, + { + "epoch": 2.293144208037825, + "grad_norm": 3.162702798843384, + "learning_rate": 3.4401143009223203e-06, + "loss": 0.4528, + "step": 4850 + }, + { + "epoch": 2.293617021276596, + "grad_norm": 2.4647393226623535, + "learning_rate": 3.4395362328913505e-06, + "loss": 0.3759, + "step": 4851 + }, + { + "epoch": 2.2940898345153666, + "grad_norm": 2.8219876289367676, + "learning_rate": 3.438958106359579e-06, + "loss": 0.4903, + "step": 4852 + }, + { + "epoch": 2.294562647754137, + "grad_norm": 2.827073097229004, + "learning_rate": 3.438379921363003e-06, + "loss": 0.4315, + "step": 4853 + }, + { + "epoch": 2.295035460992908, + "grad_norm": 2.472470283508301, + "learning_rate": 3.4378016779376244e-06, + "loss": 0.4478, + "step": 4854 + }, + { + "epoch": 2.2955082742316786, + "grad_norm": 3.3994734287261963, + "learning_rate": 3.4372233761194473e-06, + "loss": 0.5086, + "step": 4855 + }, + { + "epoch": 2.295981087470449, + "grad_norm": 3.030465602874756, + "learning_rate": 3.4366450159444796e-06, + "loss": 0.4159, + "step": 4856 + }, + { + "epoch": 2.2964539007092197, + "grad_norm": 2.5460705757141113, + "learning_rate": 3.4360665974487346e-06, + "loss": 0.4097, + "step": 4857 + }, + { + "epoch": 2.2969267139479905, + "grad_norm": 2.884469509124756, + "learning_rate": 3.4354881206682273e-06, + "loss": 0.4478, + "step": 4858 + }, + { + "epoch": 2.2973995271867613, + "grad_norm": 2.5139710903167725, + "learning_rate": 3.4349095856389765e-06, + "loss": 0.4286, + "step": 4859 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 3.1628260612487793, + "learning_rate": 3.4343309923970053e-06, + "loss": 0.4617, + "step": 4860 + }, + { + "epoch": 2.2983451536643025, + "grad_norm": 2.6141695976257324, + "learning_rate": 3.4337523409783395e-06, + "loss": 0.3841, + "step": 4861 + }, + { + "epoch": 2.2988179669030733, + "grad_norm": 2.766834259033203, + "learning_rate": 3.43317363141901e-06, + "loss": 0.4484, + "step": 4862 + }, + { + "epoch": 2.299290780141844, + "grad_norm": 2.785491943359375, + "learning_rate": 3.4325948637550503e-06, + "loss": 0.4363, + "step": 4863 + }, + { + "epoch": 2.2997635933806144, + "grad_norm": 2.624929189682007, + "learning_rate": 3.4320160380224988e-06, + "loss": 0.4518, + "step": 4864 + }, + { + "epoch": 2.300236406619385, + "grad_norm": 2.895413398742676, + "learning_rate": 3.4314371542573944e-06, + "loss": 0.4745, + "step": 4865 + }, + { + "epoch": 2.300709219858156, + "grad_norm": 2.603816270828247, + "learning_rate": 3.430858212495783e-06, + "loss": 0.4444, + "step": 4866 + }, + { + "epoch": 2.301182033096927, + "grad_norm": 3.387360095977783, + "learning_rate": 3.4302792127737116e-06, + "loss": 0.4169, + "step": 4867 + }, + { + "epoch": 2.3016548463356976, + "grad_norm": 2.894054651260376, + "learning_rate": 3.4297001551272334e-06, + "loss": 0.4493, + "step": 4868 + }, + { + "epoch": 2.302127659574468, + "grad_norm": 3.0432028770446777, + "learning_rate": 3.4291210395924035e-06, + "loss": 0.4854, + "step": 4869 + }, + { + "epoch": 2.3026004728132388, + "grad_norm": 2.5144734382629395, + "learning_rate": 3.42854186620528e-06, + "loss": 0.4556, + "step": 4870 + }, + { + "epoch": 2.3030732860520096, + "grad_norm": 2.964812755584717, + "learning_rate": 3.427962635001926e-06, + "loss": 0.495, + "step": 4871 + }, + { + "epoch": 2.30354609929078, + "grad_norm": 
2.9991118907928467, + "learning_rate": 3.4273833460184077e-06, + "loss": 0.4787, + "step": 4872 + }, + { + "epoch": 2.3040189125295507, + "grad_norm": 2.9424328804016113, + "learning_rate": 3.4268039992907955e-06, + "loss": 0.5006, + "step": 4873 + }, + { + "epoch": 2.3044917257683215, + "grad_norm": 2.792880058288574, + "learning_rate": 3.426224594855162e-06, + "loss": 0.4399, + "step": 4874 + }, + { + "epoch": 2.3049645390070923, + "grad_norm": 2.5308053493499756, + "learning_rate": 3.4256451327475838e-06, + "loss": 0.4843, + "step": 4875 + }, + { + "epoch": 2.305437352245863, + "grad_norm": 2.7937564849853516, + "learning_rate": 3.425065613004142e-06, + "loss": 0.4428, + "step": 4876 + }, + { + "epoch": 2.3059101654846335, + "grad_norm": 2.4231557846069336, + "learning_rate": 3.424486035660921e-06, + "loss": 0.4054, + "step": 4877 + }, + { + "epoch": 2.3063829787234043, + "grad_norm": 3.0622596740722656, + "learning_rate": 3.423906400754009e-06, + "loss": 0.4623, + "step": 4878 + }, + { + "epoch": 2.306855791962175, + "grad_norm": 2.6532933712005615, + "learning_rate": 3.4233267083194955e-06, + "loss": 0.4387, + "step": 4879 + }, + { + "epoch": 2.3073286052009454, + "grad_norm": 2.793325185775757, + "learning_rate": 3.422746958393477e-06, + "loss": 0.4047, + "step": 4880 + }, + { + "epoch": 2.307801418439716, + "grad_norm": 2.9178314208984375, + "learning_rate": 3.422167151012052e-06, + "loss": 0.4397, + "step": 4881 + }, + { + "epoch": 2.308274231678487, + "grad_norm": 3.463913917541504, + "learning_rate": 3.4215872862113214e-06, + "loss": 0.4347, + "step": 4882 + }, + { + "epoch": 2.308747044917258, + "grad_norm": 3.228403091430664, + "learning_rate": 3.421007364027392e-06, + "loss": 0.4405, + "step": 4883 + }, + { + "epoch": 2.3092198581560286, + "grad_norm": 2.896933078765869, + "learning_rate": 3.420427384496372e-06, + "loss": 0.4429, + "step": 4884 + }, + { + "epoch": 2.309692671394799, + "grad_norm": 2.5559937953948975, + "learning_rate": 3.4198473476543755e-06, + "loss": 0.4281, + "step": 4885 + }, + { + "epoch": 2.3101654846335697, + "grad_norm": 3.457918167114258, + "learning_rate": 3.419267253537517e-06, + "loss": 0.4495, + "step": 4886 + }, + { + "epoch": 2.3106382978723405, + "grad_norm": 2.6554839611053467, + "learning_rate": 3.418687102181918e-06, + "loss": 0.4682, + "step": 4887 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 2.8171639442443848, + "learning_rate": 3.4181068936237024e-06, + "loss": 0.4184, + "step": 4888 + }, + { + "epoch": 2.3115839243498817, + "grad_norm": 2.9272499084472656, + "learning_rate": 3.4175266278989955e-06, + "loss": 0.5445, + "step": 4889 + }, + { + "epoch": 2.3120567375886525, + "grad_norm": 2.5928499698638916, + "learning_rate": 3.4169463050439284e-06, + "loss": 0.3808, + "step": 4890 + }, + { + "epoch": 2.3125295508274233, + "grad_norm": 2.6624577045440674, + "learning_rate": 3.4163659250946356e-06, + "loss": 0.4678, + "step": 4891 + }, + { + "epoch": 2.313002364066194, + "grad_norm": 2.666555643081665, + "learning_rate": 3.4157854880872553e-06, + "loss": 0.457, + "step": 4892 + }, + { + "epoch": 2.3134751773049644, + "grad_norm": 3.2987406253814697, + "learning_rate": 3.4152049940579278e-06, + "loss": 0.551, + "step": 4893 + }, + { + "epoch": 2.3139479905437352, + "grad_norm": 2.728119134902954, + "learning_rate": 3.414624443042799e-06, + "loss": 0.3935, + "step": 4894 + }, + { + "epoch": 2.314420803782506, + "grad_norm": 3.133005380630493, + "learning_rate": 3.4140438350780157e-06, + "loss": 0.4981, + "step": 4895 + }, + { + 
"epoch": 2.3148936170212764, + "grad_norm": 2.591252565383911, + "learning_rate": 3.4134631701997312e-06, + "loss": 0.4251, + "step": 4896 + }, + { + "epoch": 2.315366430260047, + "grad_norm": 3.007136344909668, + "learning_rate": 3.412882448444101e-06, + "loss": 0.4492, + "step": 4897 + }, + { + "epoch": 2.315839243498818, + "grad_norm": 2.6391026973724365, + "learning_rate": 3.412301669847284e-06, + "loss": 0.5151, + "step": 4898 + }, + { + "epoch": 2.3163120567375888, + "grad_norm": 7.453699111938477, + "learning_rate": 3.411720834445441e-06, + "loss": 0.4983, + "step": 4899 + }, + { + "epoch": 2.3167848699763596, + "grad_norm": 2.667712688446045, + "learning_rate": 3.41113994227474e-06, + "loss": 0.4581, + "step": 4900 + }, + { + "epoch": 2.31725768321513, + "grad_norm": 2.7727627754211426, + "learning_rate": 3.41055899337135e-06, + "loss": 0.4731, + "step": 4901 + }, + { + "epoch": 2.3177304964539007, + "grad_norm": 3.0096890926361084, + "learning_rate": 3.409977987771444e-06, + "loss": 0.4996, + "step": 4902 + }, + { + "epoch": 2.3182033096926715, + "grad_norm": 2.725830078125, + "learning_rate": 3.4093969255111993e-06, + "loss": 0.4544, + "step": 4903 + }, + { + "epoch": 2.318676122931442, + "grad_norm": 2.7596993446350098, + "learning_rate": 3.4088158066267945e-06, + "loss": 0.4846, + "step": 4904 + }, + { + "epoch": 2.3191489361702127, + "grad_norm": 2.702620029449463, + "learning_rate": 3.4082346311544156e-06, + "loss": 0.4849, + "step": 4905 + }, + { + "epoch": 2.3196217494089835, + "grad_norm": 2.725374460220337, + "learning_rate": 3.407653399130249e-06, + "loss": 0.4116, + "step": 4906 + }, + { + "epoch": 2.3200945626477543, + "grad_norm": 2.6770219802856445, + "learning_rate": 3.4070721105904847e-06, + "loss": 0.4606, + "step": 4907 + }, + { + "epoch": 2.320567375886525, + "grad_norm": 2.9249117374420166, + "learning_rate": 3.406490765571317e-06, + "loss": 0.461, + "step": 4908 + }, + { + "epoch": 2.3210401891252954, + "grad_norm": 2.7568278312683105, + "learning_rate": 3.405909364108944e-06, + "loss": 0.4065, + "step": 4909 + }, + { + "epoch": 2.321513002364066, + "grad_norm": 2.7231340408325195, + "learning_rate": 3.4053279062395676e-06, + "loss": 0.4173, + "step": 4910 + }, + { + "epoch": 2.321985815602837, + "grad_norm": 3.1401100158691406, + "learning_rate": 3.404746391999393e-06, + "loss": 0.4287, + "step": 4911 + }, + { + "epoch": 2.3224586288416074, + "grad_norm": 2.714853525161743, + "learning_rate": 3.404164821424627e-06, + "loss": 0.4552, + "step": 4912 + }, + { + "epoch": 2.322931442080378, + "grad_norm": 3.1509978771209717, + "learning_rate": 3.4035831945514825e-06, + "loss": 0.5296, + "step": 4913 + }, + { + "epoch": 2.323404255319149, + "grad_norm": 2.567194938659668, + "learning_rate": 3.403001511416174e-06, + "loss": 0.4306, + "step": 4914 + }, + { + "epoch": 2.3238770685579198, + "grad_norm": 2.7473888397216797, + "learning_rate": 3.402419772054922e-06, + "loss": 0.4009, + "step": 4915 + }, + { + "epoch": 2.3243498817966906, + "grad_norm": 2.8617780208587646, + "learning_rate": 3.401837976503947e-06, + "loss": 0.4545, + "step": 4916 + }, + { + "epoch": 2.324822695035461, + "grad_norm": 2.3650572299957275, + "learning_rate": 3.401256124799475e-06, + "loss": 0.4046, + "step": 4917 + }, + { + "epoch": 2.3252955082742317, + "grad_norm": 2.418407678604126, + "learning_rate": 3.4006742169777364e-06, + "loss": 0.4222, + "step": 4918 + }, + { + "epoch": 2.3257683215130025, + "grad_norm": 2.7232494354248047, + "learning_rate": 3.400092253074964e-06, + "loss": 0.4373, 
+ "step": 4919 + }, + { + "epoch": 2.326241134751773, + "grad_norm": 2.702965497970581, + "learning_rate": 3.399510233127394e-06, + "loss": 0.437, + "step": 4920 + }, + { + "epoch": 2.3267139479905437, + "grad_norm": 2.8381760120391846, + "learning_rate": 3.3989281571712664e-06, + "loss": 0.4294, + "step": 4921 + }, + { + "epoch": 2.3271867612293144, + "grad_norm": 2.767131805419922, + "learning_rate": 3.398346025242823e-06, + "loss": 0.4673, + "step": 4922 + }, + { + "epoch": 2.3276595744680852, + "grad_norm": 2.5261805057525635, + "learning_rate": 3.3977638373783123e-06, + "loss": 0.4147, + "step": 4923 + }, + { + "epoch": 2.3281323877068556, + "grad_norm": 2.7176897525787354, + "learning_rate": 3.3971815936139836e-06, + "loss": 0.3885, + "step": 4924 + }, + { + "epoch": 2.3286052009456264, + "grad_norm": 2.849043130874634, + "learning_rate": 3.396599293986092e-06, + "loss": 0.4842, + "step": 4925 + }, + { + "epoch": 2.329078014184397, + "grad_norm": 2.550673484802246, + "learning_rate": 3.3960169385308927e-06, + "loss": 0.4049, + "step": 4926 + }, + { + "epoch": 2.329550827423168, + "grad_norm": 3.0821585655212402, + "learning_rate": 3.3954345272846477e-06, + "loss": 0.53, + "step": 4927 + }, + { + "epoch": 2.3300236406619383, + "grad_norm": 2.68658185005188, + "learning_rate": 3.3948520602836223e-06, + "loss": 0.4592, + "step": 4928 + }, + { + "epoch": 2.330496453900709, + "grad_norm": 2.7391903400421143, + "learning_rate": 3.394269537564082e-06, + "loss": 0.4773, + "step": 4929 + }, + { + "epoch": 2.33096926713948, + "grad_norm": 2.665114164352417, + "learning_rate": 3.393686959162299e-06, + "loss": 0.4671, + "step": 4930 + }, + { + "epoch": 2.3314420803782507, + "grad_norm": 2.6827399730682373, + "learning_rate": 3.3931043251145477e-06, + "loss": 0.4669, + "step": 4931 + }, + { + "epoch": 2.331914893617021, + "grad_norm": 3.1760666370391846, + "learning_rate": 3.392521635457106e-06, + "loss": 0.4729, + "step": 4932 + }, + { + "epoch": 2.332387706855792, + "grad_norm": 2.9686226844787598, + "learning_rate": 3.3919388902262555e-06, + "loss": 0.5017, + "step": 4933 + }, + { + "epoch": 2.3328605200945627, + "grad_norm": 2.471325397491455, + "learning_rate": 3.3913560894582818e-06, + "loss": 0.4195, + "step": 4934 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 2.4062955379486084, + "learning_rate": 3.3907732331894732e-06, + "loss": 0.3666, + "step": 4935 + }, + { + "epoch": 2.333806146572104, + "grad_norm": 2.6800320148468018, + "learning_rate": 3.3901903214561206e-06, + "loss": 0.4774, + "step": 4936 + }, + { + "epoch": 2.3342789598108746, + "grad_norm": 2.923741102218628, + "learning_rate": 3.389607354294521e-06, + "loss": 0.4546, + "step": 4937 + }, + { + "epoch": 2.3347517730496454, + "grad_norm": 3.0034096240997314, + "learning_rate": 3.3890243317409716e-06, + "loss": 0.5373, + "step": 4938 + }, + { + "epoch": 2.3352245862884162, + "grad_norm": 3.0757339000701904, + "learning_rate": 3.388441253831775e-06, + "loss": 0.4655, + "step": 4939 + }, + { + "epoch": 2.3356973995271866, + "grad_norm": 2.5352041721343994, + "learning_rate": 3.3878581206032373e-06, + "loss": 0.4391, + "step": 4940 + }, + { + "epoch": 2.3361702127659574, + "grad_norm": 2.9332237243652344, + "learning_rate": 3.3872749320916675e-06, + "loss": 0.4685, + "step": 4941 + }, + { + "epoch": 2.336643026004728, + "grad_norm": 2.4871222972869873, + "learning_rate": 3.386691688333379e-06, + "loss": 0.3952, + "step": 4942 + }, + { + "epoch": 2.337115839243499, + "grad_norm": 2.6384918689727783, + "learning_rate": 
3.386108389364687e-06, + "loss": 0.4044, + "step": 4943 + }, + { + "epoch": 2.3375886524822693, + "grad_norm": 2.3545165061950684, + "learning_rate": 3.3855250352219102e-06, + "loss": 0.426, + "step": 4944 + }, + { + "epoch": 2.33806146572104, + "grad_norm": 2.972242593765259, + "learning_rate": 3.3849416259413735e-06, + "loss": 0.5033, + "step": 4945 + }, + { + "epoch": 2.338534278959811, + "grad_norm": 3.117351770401001, + "learning_rate": 3.384358161559401e-06, + "loss": 0.4695, + "step": 4946 + }, + { + "epoch": 2.3390070921985817, + "grad_norm": 2.888916492462158, + "learning_rate": 3.383774642112324e-06, + "loss": 0.437, + "step": 4947 + }, + { + "epoch": 2.339479905437352, + "grad_norm": 3.0677435398101807, + "learning_rate": 3.3831910676364753e-06, + "loss": 0.4293, + "step": 4948 + }, + { + "epoch": 2.339952718676123, + "grad_norm": 2.8571784496307373, + "learning_rate": 3.3826074381681916e-06, + "loss": 0.4574, + "step": 4949 + }, + { + "epoch": 2.3404255319148937, + "grad_norm": 2.907276153564453, + "learning_rate": 3.3820237537438127e-06, + "loss": 0.4731, + "step": 4950 + }, + { + "epoch": 2.3408983451536645, + "grad_norm": 2.923762559890747, + "learning_rate": 3.3814400143996823e-06, + "loss": 0.4648, + "step": 4951 + }, + { + "epoch": 2.341371158392435, + "grad_norm": 2.6206982135772705, + "learning_rate": 3.3808562201721473e-06, + "loss": 0.436, + "step": 4952 + }, + { + "epoch": 2.3418439716312056, + "grad_norm": 6.279088973999023, + "learning_rate": 3.380272371097558e-06, + "loss": 0.4461, + "step": 4953 + }, + { + "epoch": 2.3423167848699764, + "grad_norm": 2.785297155380249, + "learning_rate": 3.3796884672122684e-06, + "loss": 0.4619, + "step": 4954 + }, + { + "epoch": 2.342789598108747, + "grad_norm": 2.6241793632507324, + "learning_rate": 3.379104508552634e-06, + "loss": 0.4323, + "step": 4955 + }, + { + "epoch": 2.3432624113475176, + "grad_norm": 2.6052167415618896, + "learning_rate": 3.378520495155017e-06, + "loss": 0.3943, + "step": 4956 + }, + { + "epoch": 2.3437352245862884, + "grad_norm": 2.8247411251068115, + "learning_rate": 3.3779364270557818e-06, + "loss": 0.4689, + "step": 4957 + }, + { + "epoch": 2.344208037825059, + "grad_norm": 2.5348927974700928, + "learning_rate": 3.377352304291294e-06, + "loss": 0.4619, + "step": 4958 + }, + { + "epoch": 2.34468085106383, + "grad_norm": 2.906648874282837, + "learning_rate": 3.376768126897926e-06, + "loss": 0.5191, + "step": 4959 + }, + { + "epoch": 2.3451536643026003, + "grad_norm": 2.796870470046997, + "learning_rate": 3.3761838949120514e-06, + "loss": 0.4227, + "step": 4960 + }, + { + "epoch": 2.345626477541371, + "grad_norm": 2.789635419845581, + "learning_rate": 3.3755996083700464e-06, + "loss": 0.3927, + "step": 4961 + }, + { + "epoch": 2.346099290780142, + "grad_norm": 2.86641263961792, + "learning_rate": 3.375015267308295e-06, + "loss": 0.4097, + "step": 4962 + }, + { + "epoch": 2.3465721040189127, + "grad_norm": 2.8374414443969727, + "learning_rate": 3.374430871763178e-06, + "loss": 0.4566, + "step": 4963 + }, + { + "epoch": 2.347044917257683, + "grad_norm": 2.71951961517334, + "learning_rate": 3.3738464217710854e-06, + "loss": 0.4748, + "step": 4964 + }, + { + "epoch": 2.347517730496454, + "grad_norm": 2.6939785480499268, + "learning_rate": 3.373261917368408e-06, + "loss": 0.4499, + "step": 4965 + }, + { + "epoch": 2.3479905437352246, + "grad_norm": 2.862661600112915, + "learning_rate": 3.37267735859154e-06, + "loss": 0.415, + "step": 4966 + }, + { + "epoch": 2.3484633569739954, + "grad_norm": 
2.3657119274139404, + "learning_rate": 3.3720927454768793e-06, + "loss": 0.4112, + "step": 4967 + }, + { + "epoch": 2.348936170212766, + "grad_norm": 3.701571464538574, + "learning_rate": 3.3715080780608277e-06, + "loss": 0.4735, + "step": 4968 + }, + { + "epoch": 2.3494089834515366, + "grad_norm": 2.894350528717041, + "learning_rate": 3.3709233563797895e-06, + "loss": 0.4278, + "step": 4969 + }, + { + "epoch": 2.3498817966903074, + "grad_norm": 3.0072877407073975, + "learning_rate": 3.3703385804701727e-06, + "loss": 0.4718, + "step": 4970 + }, + { + "epoch": 2.350354609929078, + "grad_norm": 2.9920408725738525, + "learning_rate": 3.369753750368389e-06, + "loss": 0.4636, + "step": 4971 + }, + { + "epoch": 2.3508274231678485, + "grad_norm": 2.381770372390747, + "learning_rate": 3.369168866110853e-06, + "loss": 0.3841, + "step": 4972 + }, + { + "epoch": 2.3513002364066193, + "grad_norm": 2.6195342540740967, + "learning_rate": 3.3685839277339825e-06, + "loss": 0.4422, + "step": 4973 + }, + { + "epoch": 2.35177304964539, + "grad_norm": 2.885852575302124, + "learning_rate": 3.3679989352741992e-06, + "loss": 0.4798, + "step": 4974 + }, + { + "epoch": 2.352245862884161, + "grad_norm": 2.820004940032959, + "learning_rate": 3.367413888767929e-06, + "loss": 0.4498, + "step": 4975 + }, + { + "epoch": 2.3527186761229313, + "grad_norm": 2.579680919647217, + "learning_rate": 3.366828788251599e-06, + "loss": 0.4894, + "step": 4976 + }, + { + "epoch": 2.353191489361702, + "grad_norm": 2.7509915828704834, + "learning_rate": 3.366243633761642e-06, + "loss": 0.4354, + "step": 4977 + }, + { + "epoch": 2.353664302600473, + "grad_norm": 3.061767339706421, + "learning_rate": 3.3656584253344917e-06, + "loss": 0.4651, + "step": 4978 + }, + { + "epoch": 2.3541371158392437, + "grad_norm": 2.6109485626220703, + "learning_rate": 3.365073163006587e-06, + "loss": 0.44, + "step": 4979 + }, + { + "epoch": 2.354609929078014, + "grad_norm": 3.4247376918792725, + "learning_rate": 3.36448784681437e-06, + "loss": 0.3993, + "step": 4980 + }, + { + "epoch": 2.355082742316785, + "grad_norm": 2.953695297241211, + "learning_rate": 3.363902476794285e-06, + "loss": 0.4763, + "step": 4981 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 2.836543083190918, + "learning_rate": 3.3633170529827806e-06, + "loss": 0.4755, + "step": 4982 + }, + { + "epoch": 2.3560283687943264, + "grad_norm": 2.944082021713257, + "learning_rate": 3.36273157541631e-06, + "loss": 0.472, + "step": 4983 + }, + { + "epoch": 2.3565011820330968, + "grad_norm": 2.891716957092285, + "learning_rate": 3.3621460441313262e-06, + "loss": 0.5259, + "step": 4984 + }, + { + "epoch": 2.3569739952718676, + "grad_norm": 2.8448829650878906, + "learning_rate": 3.3615604591642896e-06, + "loss": 0.4587, + "step": 4985 + }, + { + "epoch": 2.3574468085106384, + "grad_norm": 3.114393711090088, + "learning_rate": 3.36097482055166e-06, + "loss": 0.4352, + "step": 4986 + }, + { + "epoch": 2.357919621749409, + "grad_norm": 2.964851140975952, + "learning_rate": 3.360389128329904e-06, + "loss": 0.5015, + "step": 4987 + }, + { + "epoch": 2.3583924349881795, + "grad_norm": 2.4819815158843994, + "learning_rate": 3.3598033825354893e-06, + "loss": 0.3459, + "step": 4988 + }, + { + "epoch": 2.3588652482269503, + "grad_norm": 2.635754346847534, + "learning_rate": 3.359217583204889e-06, + "loss": 0.4367, + "step": 4989 + }, + { + "epoch": 2.359338061465721, + "grad_norm": 2.542482376098633, + "learning_rate": 3.358631730374576e-06, + "loss": 0.3978, + "step": 4990 + }, + { + "epoch": 
2.359810874704492, + "grad_norm": 2.614018678665161, + "learning_rate": 3.358045824081031e-06, + "loss": 0.424, + "step": 4991 + }, + { + "epoch": 2.3602836879432623, + "grad_norm": 2.775373697280884, + "learning_rate": 3.3574598643607354e-06, + "loss": 0.4901, + "step": 4992 + }, + { + "epoch": 2.360756501182033, + "grad_norm": 3.091381311416626, + "learning_rate": 3.356873851250173e-06, + "loss": 0.4954, + "step": 4993 + }, + { + "epoch": 2.361229314420804, + "grad_norm": 2.440023422241211, + "learning_rate": 3.3562877847858337e-06, + "loss": 0.4053, + "step": 4994 + }, + { + "epoch": 2.3617021276595747, + "grad_norm": 2.8879518508911133, + "learning_rate": 3.3557016650042084e-06, + "loss": 0.4766, + "step": 4995 + }, + { + "epoch": 2.362174940898345, + "grad_norm": 3.1298391819000244, + "learning_rate": 3.355115491941793e-06, + "loss": 0.4743, + "step": 4996 + }, + { + "epoch": 2.362647754137116, + "grad_norm": 3.3325259685516357, + "learning_rate": 3.3545292656350845e-06, + "loss": 0.4703, + "step": 4997 + }, + { + "epoch": 2.3631205673758866, + "grad_norm": 2.7935359477996826, + "learning_rate": 3.353942986120587e-06, + "loss": 0.432, + "step": 4998 + }, + { + "epoch": 2.3635933806146574, + "grad_norm": 2.623624324798584, + "learning_rate": 3.3533566534348033e-06, + "loss": 0.4302, + "step": 4999 + }, + { + "epoch": 2.3640661938534278, + "grad_norm": 3.1467108726501465, + "learning_rate": 3.3527702676142426e-06, + "loss": 0.4661, + "step": 5000 + }, + { + "epoch": 2.3645390070921986, + "grad_norm": 2.5364840030670166, + "learning_rate": 3.352183828695418e-06, + "loss": 0.4134, + "step": 5001 + }, + { + "epoch": 2.3650118203309693, + "grad_norm": 3.002777338027954, + "learning_rate": 3.3515973367148415e-06, + "loss": 0.3771, + "step": 5002 + }, + { + "epoch": 2.36548463356974, + "grad_norm": 2.660043954849243, + "learning_rate": 3.3510107917090335e-06, + "loss": 0.4254, + "step": 5003 + }, + { + "epoch": 2.3659574468085105, + "grad_norm": 2.7041075229644775, + "learning_rate": 3.3504241937145148e-06, + "loss": 0.4651, + "step": 5004 + }, + { + "epoch": 2.3664302600472813, + "grad_norm": 2.7387280464172363, + "learning_rate": 3.349837542767811e-06, + "loss": 0.3874, + "step": 5005 + }, + { + "epoch": 2.366903073286052, + "grad_norm": 3.012188196182251, + "learning_rate": 3.349250838905449e-06, + "loss": 0.4508, + "step": 5006 + }, + { + "epoch": 2.3673758865248224, + "grad_norm": 2.3108484745025635, + "learning_rate": 3.3486640821639616e-06, + "loss": 0.3783, + "step": 5007 + }, + { + "epoch": 2.3678486997635932, + "grad_norm": 3.2188332080841064, + "learning_rate": 3.3480772725798837e-06, + "loss": 0.4879, + "step": 5008 + }, + { + "epoch": 2.368321513002364, + "grad_norm": 2.566087484359741, + "learning_rate": 3.3474904101897526e-06, + "loss": 0.3847, + "step": 5009 + }, + { + "epoch": 2.368794326241135, + "grad_norm": 2.5581698417663574, + "learning_rate": 3.3469034950301092e-06, + "loss": 0.4201, + "step": 5010 + }, + { + "epoch": 2.3692671394799056, + "grad_norm": 2.900296926498413, + "learning_rate": 3.3463165271374992e-06, + "loss": 0.4568, + "step": 5011 + }, + { + "epoch": 2.369739952718676, + "grad_norm": 2.8239312171936035, + "learning_rate": 3.34572950654847e-06, + "loss": 0.4583, + "step": 5012 + }, + { + "epoch": 2.370212765957447, + "grad_norm": 3.219465970993042, + "learning_rate": 3.3451424332995723e-06, + "loss": 0.5435, + "step": 5013 + }, + { + "epoch": 2.3706855791962176, + "grad_norm": 3.3111915588378906, + "learning_rate": 3.344555307427362e-06, + "loss": 0.435, + 
"step": 5014 + }, + { + "epoch": 2.371158392434988, + "grad_norm": 3.296668529510498, + "learning_rate": 3.3439681289683946e-06, + "loss": 0.4738, + "step": 5015 + }, + { + "epoch": 2.3716312056737587, + "grad_norm": 3.005722761154175, + "learning_rate": 3.343380897959234e-06, + "loss": 0.4267, + "step": 5016 + }, + { + "epoch": 2.3721040189125295, + "grad_norm": 2.7844085693359375, + "learning_rate": 3.3427936144364425e-06, + "loss": 0.4558, + "step": 5017 + }, + { + "epoch": 2.3725768321513003, + "grad_norm": 2.7532076835632324, + "learning_rate": 3.3422062784365884e-06, + "loss": 0.4144, + "step": 5018 + }, + { + "epoch": 2.373049645390071, + "grad_norm": 2.835764169692993, + "learning_rate": 3.3416188899962413e-06, + "loss": 0.4945, + "step": 5019 + }, + { + "epoch": 2.3735224586288415, + "grad_norm": 3.1513726711273193, + "learning_rate": 3.3410314491519767e-06, + "loss": 0.4971, + "step": 5020 + }, + { + "epoch": 2.3739952718676123, + "grad_norm": 3.0162220001220703, + "learning_rate": 3.3404439559403723e-06, + "loss": 0.4477, + "step": 5021 + }, + { + "epoch": 2.374468085106383, + "grad_norm": 2.676391363143921, + "learning_rate": 3.3398564103980073e-06, + "loss": 0.432, + "step": 5022 + }, + { + "epoch": 2.3749408983451534, + "grad_norm": 2.7806248664855957, + "learning_rate": 3.3392688125614663e-06, + "loss": 0.4818, + "step": 5023 + }, + { + "epoch": 2.3754137115839242, + "grad_norm": 2.968806505203247, + "learning_rate": 3.3386811624673373e-06, + "loss": 0.4893, + "step": 5024 + }, + { + "epoch": 2.375886524822695, + "grad_norm": 2.992684841156006, + "learning_rate": 3.3380934601522087e-06, + "loss": 0.4423, + "step": 5025 + }, + { + "epoch": 2.376359338061466, + "grad_norm": 2.578420639038086, + "learning_rate": 3.3375057056526762e-06, + "loss": 0.3682, + "step": 5026 + }, + { + "epoch": 2.3768321513002366, + "grad_norm": 2.7683115005493164, + "learning_rate": 3.336917899005335e-06, + "loss": 0.4038, + "step": 5027 + }, + { + "epoch": 2.377304964539007, + "grad_norm": 2.838812828063965, + "learning_rate": 3.336330040246786e-06, + "loss": 0.442, + "step": 5028 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 2.766136646270752, + "learning_rate": 3.335742129413633e-06, + "loss": 0.4745, + "step": 5029 + }, + { + "epoch": 2.3782505910165486, + "grad_norm": 2.862656593322754, + "learning_rate": 3.3351541665424812e-06, + "loss": 0.4324, + "step": 5030 + }, + { + "epoch": 2.378723404255319, + "grad_norm": 2.71425199508667, + "learning_rate": 3.3345661516699433e-06, + "loss": 0.4013, + "step": 5031 + }, + { + "epoch": 2.3791962174940897, + "grad_norm": 2.8404030799865723, + "learning_rate": 3.333978084832629e-06, + "loss": 0.5038, + "step": 5032 + }, + { + "epoch": 2.3796690307328605, + "grad_norm": 2.965851068496704, + "learning_rate": 3.3333899660671574e-06, + "loss": 0.4668, + "step": 5033 + }, + { + "epoch": 2.3801418439716313, + "grad_norm": 2.686452627182007, + "learning_rate": 3.3328017954101464e-06, + "loss": 0.4167, + "step": 5034 + }, + { + "epoch": 2.380614657210402, + "grad_norm": 2.8676156997680664, + "learning_rate": 3.3322135728982197e-06, + "loss": 0.4531, + "step": 5035 + }, + { + "epoch": 2.3810874704491725, + "grad_norm": 2.4456300735473633, + "learning_rate": 3.3316252985680026e-06, + "loss": 0.4173, + "step": 5036 + }, + { + "epoch": 2.3815602836879433, + "grad_norm": 2.5472559928894043, + "learning_rate": 3.331036972456124e-06, + "loss": 0.3926, + "step": 5037 + }, + { + "epoch": 2.382033096926714, + "grad_norm": 2.81900954246521, + "learning_rate": 
3.330448594599218e-06, + "loss": 0.4785, + "step": 5038 + }, + { + "epoch": 2.3825059101654844, + "grad_norm": 3.0930590629577637, + "learning_rate": 3.329860165033919e-06, + "loss": 0.4587, + "step": 5039 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 3.0553040504455566, + "learning_rate": 3.3292716837968673e-06, + "loss": 0.5285, + "step": 5040 + }, + { + "epoch": 2.383451536643026, + "grad_norm": 2.577580690383911, + "learning_rate": 3.328683150924704e-06, + "loss": 0.4184, + "step": 5041 + }, + { + "epoch": 2.383924349881797, + "grad_norm": 2.6430366039276123, + "learning_rate": 3.3280945664540735e-06, + "loss": 0.4636, + "step": 5042 + }, + { + "epoch": 2.3843971631205676, + "grad_norm": 3.228360891342163, + "learning_rate": 3.3275059304216255e-06, + "loss": 0.455, + "step": 5043 + }, + { + "epoch": 2.384869976359338, + "grad_norm": 2.776142120361328, + "learning_rate": 3.3269172428640125e-06, + "loss": 0.4785, + "step": 5044 + }, + { + "epoch": 2.3853427895981087, + "grad_norm": 2.755671739578247, + "learning_rate": 3.3263285038178882e-06, + "loss": 0.4625, + "step": 5045 + }, + { + "epoch": 2.3858156028368795, + "grad_norm": 3.061004400253296, + "learning_rate": 3.3257397133199114e-06, + "loss": 0.4641, + "step": 5046 + }, + { + "epoch": 2.38628841607565, + "grad_norm": 2.8391458988189697, + "learning_rate": 3.3251508714067432e-06, + "loss": 0.5003, + "step": 5047 + }, + { + "epoch": 2.3867612293144207, + "grad_norm": 2.390810966491699, + "learning_rate": 3.324561978115049e-06, + "loss": 0.4446, + "step": 5048 + }, + { + "epoch": 2.3872340425531915, + "grad_norm": 2.7760825157165527, + "learning_rate": 3.323973033481496e-06, + "loss": 0.4443, + "step": 5049 + }, + { + "epoch": 2.3877068557919623, + "grad_norm": 3.157893419265747, + "learning_rate": 3.3233840375427552e-06, + "loss": 0.4934, + "step": 5050 + }, + { + "epoch": 2.388179669030733, + "grad_norm": 2.7245349884033203, + "learning_rate": 3.3227949903355e-06, + "loss": 0.4254, + "step": 5051 + }, + { + "epoch": 2.3886524822695034, + "grad_norm": 2.6674044132232666, + "learning_rate": 3.322205891896409e-06, + "loss": 0.4116, + "step": 5052 + }, + { + "epoch": 2.3891252955082742, + "grad_norm": 3.1490554809570312, + "learning_rate": 3.3216167422621627e-06, + "loss": 0.4604, + "step": 5053 + }, + { + "epoch": 2.389598108747045, + "grad_norm": 2.725731134414673, + "learning_rate": 3.321027541469444e-06, + "loss": 0.4836, + "step": 5054 + }, + { + "epoch": 2.3900709219858154, + "grad_norm": 2.5378828048706055, + "learning_rate": 3.3204382895549407e-06, + "loss": 0.4228, + "step": 5055 + }, + { + "epoch": 2.390543735224586, + "grad_norm": 2.8191192150115967, + "learning_rate": 3.3198489865553427e-06, + "loss": 0.4371, + "step": 5056 + }, + { + "epoch": 2.391016548463357, + "grad_norm": 2.5676498413085938, + "learning_rate": 3.3192596325073433e-06, + "loss": 0.4463, + "step": 5057 + }, + { + "epoch": 2.391489361702128, + "grad_norm": 3.0846121311187744, + "learning_rate": 3.3186702274476397e-06, + "loss": 0.5049, + "step": 5058 + }, + { + "epoch": 2.3919621749408986, + "grad_norm": 2.6085152626037598, + "learning_rate": 3.3180807714129293e-06, + "loss": 0.4376, + "step": 5059 + }, + { + "epoch": 2.392434988179669, + "grad_norm": 3.0218591690063477, + "learning_rate": 3.3174912644399172e-06, + "loss": 0.4734, + "step": 5060 + }, + { + "epoch": 2.3929078014184397, + "grad_norm": 2.5904781818389893, + "learning_rate": 3.316901706565308e-06, + "loss": 0.4924, + "step": 5061 + }, + { + "epoch": 2.3933806146572105, + "grad_norm": 
2.675478458404541, + "learning_rate": 3.3163120978258123e-06, + "loss": 0.4072, + "step": 5062 + }, + { + "epoch": 2.393853427895981, + "grad_norm": 2.7944445610046387, + "learning_rate": 3.3157224382581415e-06, + "loss": 0.4328, + "step": 5063 + }, + { + "epoch": 2.3943262411347517, + "grad_norm": 2.846224546432495, + "learning_rate": 3.315132727899012e-06, + "loss": 0.4447, + "step": 5064 + }, + { + "epoch": 2.3947990543735225, + "grad_norm": 2.6825828552246094, + "learning_rate": 3.3145429667851402e-06, + "loss": 0.4528, + "step": 5065 + }, + { + "epoch": 2.3952718676122933, + "grad_norm": 3.0305285453796387, + "learning_rate": 3.3139531549532505e-06, + "loss": 0.4538, + "step": 5066 + }, + { + "epoch": 2.395744680851064, + "grad_norm": 2.707540988922119, + "learning_rate": 3.313363292440067e-06, + "loss": 0.4412, + "step": 5067 + }, + { + "epoch": 2.3962174940898344, + "grad_norm": 3.0458385944366455, + "learning_rate": 3.3127733792823173e-06, + "loss": 0.4587, + "step": 5068 + }, + { + "epoch": 2.396690307328605, + "grad_norm": 2.7711992263793945, + "learning_rate": 3.312183415516733e-06, + "loss": 0.4157, + "step": 5069 + }, + { + "epoch": 2.397163120567376, + "grad_norm": 2.6953988075256348, + "learning_rate": 3.3115934011800494e-06, + "loss": 0.3828, + "step": 5070 + }, + { + "epoch": 2.3976359338061464, + "grad_norm": 3.033721923828125, + "learning_rate": 3.311003336309003e-06, + "loss": 0.5204, + "step": 5071 + }, + { + "epoch": 2.398108747044917, + "grad_norm": 2.6134517192840576, + "learning_rate": 3.3104132209403355e-06, + "loss": 0.4181, + "step": 5072 + }, + { + "epoch": 2.398581560283688, + "grad_norm": 2.8800251483917236, + "learning_rate": 3.30982305511079e-06, + "loss": 0.466, + "step": 5073 + }, + { + "epoch": 2.3990543735224588, + "grad_norm": 2.5043210983276367, + "learning_rate": 3.309232838857114e-06, + "loss": 0.4161, + "step": 5074 + }, + { + "epoch": 2.3995271867612296, + "grad_norm": 2.6577322483062744, + "learning_rate": 3.308642572216057e-06, + "loss": 0.465, + "step": 5075 + }, + { + "epoch": 2.4, + "grad_norm": 2.549098253250122, + "learning_rate": 3.3080522552243734e-06, + "loss": 0.4571, + "step": 5076 + }, + { + "epoch": 2.4004728132387707, + "grad_norm": 2.881958246231079, + "learning_rate": 3.3074618879188186e-06, + "loss": 0.4443, + "step": 5077 + }, + { + "epoch": 2.4009456264775415, + "grad_norm": 2.608397960662842, + "learning_rate": 3.3068714703361528e-06, + "loss": 0.3843, + "step": 5078 + }, + { + "epoch": 2.401418439716312, + "grad_norm": 2.8666789531707764, + "learning_rate": 3.306281002513139e-06, + "loss": 0.4857, + "step": 5079 + }, + { + "epoch": 2.4018912529550827, + "grad_norm": 2.9008588790893555, + "learning_rate": 3.3056904844865422e-06, + "loss": 0.4454, + "step": 5080 + }, + { + "epoch": 2.4023640661938535, + "grad_norm": 2.7446060180664062, + "learning_rate": 3.3050999162931315e-06, + "loss": 0.4522, + "step": 5081 + }, + { + "epoch": 2.4028368794326243, + "grad_norm": 2.787116765975952, + "learning_rate": 3.3045092979696804e-06, + "loss": 0.4714, + "step": 5082 + }, + { + "epoch": 2.403309692671395, + "grad_norm": 2.7494192123413086, + "learning_rate": 3.3039186295529613e-06, + "loss": 0.4107, + "step": 5083 + }, + { + "epoch": 2.4037825059101654, + "grad_norm": 2.733794927597046, + "learning_rate": 3.303327911079755e-06, + "loss": 0.4169, + "step": 5084 + }, + { + "epoch": 2.404255319148936, + "grad_norm": 2.7313334941864014, + "learning_rate": 3.3027371425868422e-06, + "loss": 0.4287, + "step": 5085 + }, + { + "epoch": 
2.404728132387707, + "grad_norm": 2.7832977771759033, + "learning_rate": 3.3021463241110075e-06, + "loss": 0.5307, + "step": 5086 + }, + { + "epoch": 2.4052009456264773, + "grad_norm": 2.6615281105041504, + "learning_rate": 3.301555455689038e-06, + "loss": 0.4519, + "step": 5087 + }, + { + "epoch": 2.405673758865248, + "grad_norm": 2.343921422958374, + "learning_rate": 3.3009645373577264e-06, + "loss": 0.46, + "step": 5088 + }, + { + "epoch": 2.406146572104019, + "grad_norm": 2.6115355491638184, + "learning_rate": 3.300373569153864e-06, + "loss": 0.4782, + "step": 5089 + }, + { + "epoch": 2.4066193853427897, + "grad_norm": 2.730625629425049, + "learning_rate": 3.299782551114249e-06, + "loss": 0.4632, + "step": 5090 + }, + { + "epoch": 2.40709219858156, + "grad_norm": 2.4495043754577637, + "learning_rate": 3.2991914832756824e-06, + "loss": 0.4243, + "step": 5091 + }, + { + "epoch": 2.407565011820331, + "grad_norm": 2.8731648921966553, + "learning_rate": 3.2986003656749654e-06, + "loss": 0.4262, + "step": 5092 + }, + { + "epoch": 2.4080378250591017, + "grad_norm": 2.870342969894409, + "learning_rate": 3.2980091983489053e-06, + "loss": 0.4735, + "step": 5093 + }, + { + "epoch": 2.4085106382978725, + "grad_norm": 2.500786542892456, + "learning_rate": 3.297417981334312e-06, + "loss": 0.4007, + "step": 5094 + }, + { + "epoch": 2.408983451536643, + "grad_norm": 2.7787322998046875, + "learning_rate": 3.2968267146679978e-06, + "loss": 0.493, + "step": 5095 + }, + { + "epoch": 2.4094562647754136, + "grad_norm": 2.5229599475860596, + "learning_rate": 3.2962353983867783e-06, + "loss": 0.3676, + "step": 5096 + }, + { + "epoch": 2.4099290780141844, + "grad_norm": 3.1955904960632324, + "learning_rate": 3.2956440325274715e-06, + "loss": 0.4888, + "step": 5097 + }, + { + "epoch": 2.4104018912529552, + "grad_norm": 2.8580288887023926, + "learning_rate": 3.2950526171268995e-06, + "loss": 0.4892, + "step": 5098 + }, + { + "epoch": 2.4108747044917256, + "grad_norm": 2.6321749687194824, + "learning_rate": 3.294461152221887e-06, + "loss": 0.3823, + "step": 5099 + }, + { + "epoch": 2.4113475177304964, + "grad_norm": 2.881127119064331, + "learning_rate": 3.293869637849263e-06, + "loss": 0.4569, + "step": 5100 + }, + { + "epoch": 2.411820330969267, + "grad_norm": 2.7742316722869873, + "learning_rate": 3.293278074045857e-06, + "loss": 0.4445, + "step": 5101 + }, + { + "epoch": 2.412293144208038, + "grad_norm": 2.546701431274414, + "learning_rate": 3.2926864608485037e-06, + "loss": 0.3995, + "step": 5102 + }, + { + "epoch": 2.4127659574468083, + "grad_norm": 2.588226318359375, + "learning_rate": 3.292094798294041e-06, + "loss": 0.4081, + "step": 5103 + }, + { + "epoch": 2.413238770685579, + "grad_norm": 2.968689441680908, + "learning_rate": 3.2915030864193077e-06, + "loss": 0.4475, + "step": 5104 + }, + { + "epoch": 2.41371158392435, + "grad_norm": 2.9249184131622314, + "learning_rate": 3.290911325261148e-06, + "loss": 0.4763, + "step": 5105 + }, + { + "epoch": 2.4141843971631207, + "grad_norm": 2.817596673965454, + "learning_rate": 3.2903195148564083e-06, + "loss": 0.4451, + "step": 5106 + }, + { + "epoch": 2.414657210401891, + "grad_norm": 2.6465954780578613, + "learning_rate": 3.2897276552419377e-06, + "loss": 0.4665, + "step": 5107 + }, + { + "epoch": 2.415130023640662, + "grad_norm": 2.8613853454589844, + "learning_rate": 3.2891357464545885e-06, + "loss": 0.4398, + "step": 5108 + }, + { + "epoch": 2.4156028368794327, + "grad_norm": 2.756321907043457, + "learning_rate": 3.2885437885312175e-06, + "loss": 0.4634, + 
"step": 5109 + }, + { + "epoch": 2.4160756501182035, + "grad_norm": 2.8965282440185547, + "learning_rate": 3.287951781508682e-06, + "loss": 0.4319, + "step": 5110 + }, + { + "epoch": 2.416548463356974, + "grad_norm": 2.896756172180176, + "learning_rate": 3.287359725423844e-06, + "loss": 0.4771, + "step": 5111 + }, + { + "epoch": 2.4170212765957446, + "grad_norm": 2.952911376953125, + "learning_rate": 3.286767620313569e-06, + "loss": 0.5026, + "step": 5112 + }, + { + "epoch": 2.4174940898345154, + "grad_norm": 3.850515604019165, + "learning_rate": 3.2861754662147234e-06, + "loss": 0.4387, + "step": 5113 + }, + { + "epoch": 2.417966903073286, + "grad_norm": 3.0072689056396484, + "learning_rate": 3.2855832631641794e-06, + "loss": 0.4586, + "step": 5114 + }, + { + "epoch": 2.4184397163120566, + "grad_norm": 3.166790246963501, + "learning_rate": 3.2849910111988092e-06, + "loss": 0.4842, + "step": 5115 + }, + { + "epoch": 2.4189125295508274, + "grad_norm": 3.5397679805755615, + "learning_rate": 3.284398710355492e-06, + "loss": 0.5138, + "step": 5116 + }, + { + "epoch": 2.419385342789598, + "grad_norm": 2.779609441757202, + "learning_rate": 3.283806360671106e-06, + "loss": 0.4049, + "step": 5117 + }, + { + "epoch": 2.419858156028369, + "grad_norm": 2.5924575328826904, + "learning_rate": 3.283213962182535e-06, + "loss": 0.433, + "step": 5118 + }, + { + "epoch": 2.4203309692671393, + "grad_norm": 2.7429699897766113, + "learning_rate": 3.282621514926665e-06, + "loss": 0.4674, + "step": 5119 + }, + { + "epoch": 2.42080378250591, + "grad_norm": 2.8113889694213867, + "learning_rate": 3.2820290189403846e-06, + "loss": 0.3898, + "step": 5120 + }, + { + "epoch": 2.421276595744681, + "grad_norm": 2.867105722427368, + "learning_rate": 3.2814364742605863e-06, + "loss": 0.4439, + "step": 5121 + }, + { + "epoch": 2.4217494089834517, + "grad_norm": 2.428597927093506, + "learning_rate": 3.2808438809241654e-06, + "loss": 0.4339, + "step": 5122 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 3.071735143661499, + "learning_rate": 3.2802512389680203e-06, + "loss": 0.4583, + "step": 5123 + }, + { + "epoch": 2.422695035460993, + "grad_norm": 3.046313762664795, + "learning_rate": 3.279658548429051e-06, + "loss": 0.5351, + "step": 5124 + }, + { + "epoch": 2.4231678486997636, + "grad_norm": 2.8412697315216064, + "learning_rate": 3.279065809344163e-06, + "loss": 0.5258, + "step": 5125 + }, + { + "epoch": 2.4236406619385344, + "grad_norm": 2.887169122695923, + "learning_rate": 3.278473021750263e-06, + "loss": 0.4568, + "step": 5126 + }, + { + "epoch": 2.424113475177305, + "grad_norm": 2.8316574096679688, + "learning_rate": 3.2778801856842624e-06, + "loss": 0.46, + "step": 5127 + }, + { + "epoch": 2.4245862884160756, + "grad_norm": 2.7660772800445557, + "learning_rate": 3.277287301183073e-06, + "loss": 0.4323, + "step": 5128 + }, + { + "epoch": 2.4250591016548464, + "grad_norm": 2.737682819366455, + "learning_rate": 3.276694368283611e-06, + "loss": 0.4296, + "step": 5129 + }, + { + "epoch": 2.425531914893617, + "grad_norm": 2.8807425498962402, + "learning_rate": 3.276101387022797e-06, + "loss": 0.4673, + "step": 5130 + }, + { + "epoch": 2.4260047281323875, + "grad_norm": 2.530526876449585, + "learning_rate": 3.275508357437552e-06, + "loss": 0.416, + "step": 5131 + }, + { + "epoch": 2.4264775413711583, + "grad_norm": 3.1189746856689453, + "learning_rate": 3.274915279564803e-06, + "loss": 0.4171, + "step": 5132 + }, + { + "epoch": 2.426950354609929, + "grad_norm": 2.6612462997436523, + "learning_rate": 
3.274322153441477e-06, + "loss": 0.4104, + "step": 5133 + }, + { + "epoch": 2.4274231678487, + "grad_norm": 2.717973470687866, + "learning_rate": 3.2737289791045064e-06, + "loss": 0.479, + "step": 5134 + }, + { + "epoch": 2.4278959810874703, + "grad_norm": 2.764216661453247, + "learning_rate": 3.2731357565908247e-06, + "loss": 0.481, + "step": 5135 + }, + { + "epoch": 2.428368794326241, + "grad_norm": 2.5081393718719482, + "learning_rate": 3.272542485937369e-06, + "loss": 0.4592, + "step": 5136 + }, + { + "epoch": 2.428841607565012, + "grad_norm": 3.1380364894866943, + "learning_rate": 3.271949167181081e-06, + "loss": 0.4179, + "step": 5137 + }, + { + "epoch": 2.4293144208037827, + "grad_norm": 2.9275963306427, + "learning_rate": 3.2713558003589026e-06, + "loss": 0.5196, + "step": 5138 + }, + { + "epoch": 2.429787234042553, + "grad_norm": 2.8215506076812744, + "learning_rate": 3.270762385507781e-06, + "loss": 0.4081, + "step": 5139 + }, + { + "epoch": 2.430260047281324, + "grad_norm": 2.9185614585876465, + "learning_rate": 3.270168922664665e-06, + "loss": 0.4936, + "step": 5140 + }, + { + "epoch": 2.4307328605200946, + "grad_norm": 2.6507248878479004, + "learning_rate": 3.269575411866507e-06, + "loss": 0.4834, + "step": 5141 + }, + { + "epoch": 2.4312056737588654, + "grad_norm": 2.864741563796997, + "learning_rate": 3.2689818531502637e-06, + "loss": 0.4562, + "step": 5142 + }, + { + "epoch": 2.431678486997636, + "grad_norm": 2.806919813156128, + "learning_rate": 3.2683882465528917e-06, + "loss": 0.4645, + "step": 5143 + }, + { + "epoch": 2.4321513002364066, + "grad_norm": 2.733372211456299, + "learning_rate": 3.267794592111353e-06, + "loss": 0.4123, + "step": 5144 + }, + { + "epoch": 2.4326241134751774, + "grad_norm": 2.8005833625793457, + "learning_rate": 3.2672008898626116e-06, + "loss": 0.4343, + "step": 5145 + }, + { + "epoch": 2.433096926713948, + "grad_norm": 3.2339670658111572, + "learning_rate": 3.2666071398436354e-06, + "loss": 0.4017, + "step": 5146 + }, + { + "epoch": 2.4335697399527185, + "grad_norm": 2.510251760482788, + "learning_rate": 3.2660133420913932e-06, + "loss": 0.3882, + "step": 5147 + }, + { + "epoch": 2.4340425531914893, + "grad_norm": 3.5633628368377686, + "learning_rate": 3.26541949664286e-06, + "loss": 0.4766, + "step": 5148 + }, + { + "epoch": 2.43451536643026, + "grad_norm": 2.8246724605560303, + "learning_rate": 3.26482560353501e-06, + "loss": 0.3728, + "step": 5149 + }, + { + "epoch": 2.434988179669031, + "grad_norm": 2.4923641681671143, + "learning_rate": 3.264231662804823e-06, + "loss": 0.4346, + "step": 5150 + }, + { + "epoch": 2.4354609929078013, + "grad_norm": 3.180874824523926, + "learning_rate": 3.2636376744892827e-06, + "loss": 0.4351, + "step": 5151 + }, + { + "epoch": 2.435933806146572, + "grad_norm": 2.6933515071868896, + "learning_rate": 3.263043638625373e-06, + "loss": 0.4293, + "step": 5152 + }, + { + "epoch": 2.436406619385343, + "grad_norm": 2.584132194519043, + "learning_rate": 3.262449555250081e-06, + "loss": 0.4589, + "step": 5153 + }, + { + "epoch": 2.4368794326241137, + "grad_norm": 2.8103036880493164, + "learning_rate": 3.2618554244003985e-06, + "loss": 0.463, + "step": 5154 + }, + { + "epoch": 2.437352245862884, + "grad_norm": 2.809070587158203, + "learning_rate": 3.2612612461133197e-06, + "loss": 0.4629, + "step": 5155 + }, + { + "epoch": 2.437825059101655, + "grad_norm": 2.98148512840271, + "learning_rate": 3.2606670204258405e-06, + "loss": 0.451, + "step": 5156 + }, + { + "epoch": 2.4382978723404256, + "grad_norm": 
2.691047191619873, + "learning_rate": 3.2600727473749614e-06, + "loss": 0.3878, + "step": 5157 + }, + { + "epoch": 2.4387706855791964, + "grad_norm": 2.900360345840454, + "learning_rate": 3.2594784269976856e-06, + "loss": 0.4216, + "step": 5158 + }, + { + "epoch": 2.4392434988179668, + "grad_norm": 2.8449952602386475, + "learning_rate": 3.258884059331019e-06, + "loss": 0.4268, + "step": 5159 + }, + { + "epoch": 2.4397163120567376, + "grad_norm": 2.7226388454437256, + "learning_rate": 3.258289644411969e-06, + "loss": 0.4381, + "step": 5160 + }, + { + "epoch": 2.4401891252955084, + "grad_norm": 2.513946056365967, + "learning_rate": 3.257695182277547e-06, + "loss": 0.4566, + "step": 5161 + }, + { + "epoch": 2.440661938534279, + "grad_norm": 2.9941394329071045, + "learning_rate": 3.2571006729647693e-06, + "loss": 0.4395, + "step": 5162 + }, + { + "epoch": 2.4411347517730495, + "grad_norm": 2.699094533920288, + "learning_rate": 3.2565061165106523e-06, + "loss": 0.4274, + "step": 5163 + }, + { + "epoch": 2.4416075650118203, + "grad_norm": 2.574193000793457, + "learning_rate": 3.255911512952216e-06, + "loss": 0.4187, + "step": 5164 + }, + { + "epoch": 2.442080378250591, + "grad_norm": 2.920766592025757, + "learning_rate": 3.2553168623264854e-06, + "loss": 0.4911, + "step": 5165 + }, + { + "epoch": 2.4425531914893615, + "grad_norm": 2.728421926498413, + "learning_rate": 3.2547221646704853e-06, + "loss": 0.4466, + "step": 5166 + }, + { + "epoch": 2.4430260047281322, + "grad_norm": 2.8171417713165283, + "learning_rate": 3.254127420021246e-06, + "loss": 0.4331, + "step": 5167 + }, + { + "epoch": 2.443498817966903, + "grad_norm": 2.4069135189056396, + "learning_rate": 3.2535326284157975e-06, + "loss": 0.389, + "step": 5168 + }, + { + "epoch": 2.443971631205674, + "grad_norm": 2.912405490875244, + "learning_rate": 3.2529377898911777e-06, + "loss": 0.4681, + "step": 5169 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 2.987558126449585, + "learning_rate": 3.2523429044844228e-06, + "loss": 0.4715, + "step": 5170 + }, + { + "epoch": 2.444917257683215, + "grad_norm": 2.5117199420928955, + "learning_rate": 3.251747972232574e-06, + "loss": 0.4531, + "step": 5171 + }, + { + "epoch": 2.445390070921986, + "grad_norm": 2.5405385494232178, + "learning_rate": 3.2511529931726752e-06, + "loss": 0.4323, + "step": 5172 + }, + { + "epoch": 2.4458628841607566, + "grad_norm": 2.989932060241699, + "learning_rate": 3.250557967341773e-06, + "loss": 0.4039, + "step": 5173 + }, + { + "epoch": 2.446335697399527, + "grad_norm": 2.6331627368927, + "learning_rate": 3.2499628947769186e-06, + "loss": 0.5147, + "step": 5174 + }, + { + "epoch": 2.4468085106382977, + "grad_norm": 2.71699857711792, + "learning_rate": 3.249367775515162e-06, + "loss": 0.3748, + "step": 5175 + }, + { + "epoch": 2.4472813238770685, + "grad_norm": 2.9508471488952637, + "learning_rate": 3.2487726095935606e-06, + "loss": 0.5145, + "step": 5176 + }, + { + "epoch": 2.4477541371158393, + "grad_norm": 2.8276431560516357, + "learning_rate": 3.2481773970491713e-06, + "loss": 0.4295, + "step": 5177 + }, + { + "epoch": 2.44822695035461, + "grad_norm": 2.5500540733337402, + "learning_rate": 3.2475821379190565e-06, + "loss": 0.4246, + "step": 5178 + }, + { + "epoch": 2.4486997635933805, + "grad_norm": 2.845641613006592, + "learning_rate": 3.246986832240281e-06, + "loss": 0.4211, + "step": 5179 + }, + { + "epoch": 2.4491725768321513, + "grad_norm": 3.1215856075286865, + "learning_rate": 3.2463914800499097e-06, + "loss": 0.4378, + "step": 5180 + }, + { + "epoch": 
2.449645390070922, + "grad_norm": 2.4685606956481934, + "learning_rate": 3.2457960813850137e-06, + "loss": 0.4836, + "step": 5181 + }, + { + "epoch": 2.4501182033096924, + "grad_norm": 2.508028268814087, + "learning_rate": 3.245200636282666e-06, + "loss": 0.4377, + "step": 5182 + }, + { + "epoch": 2.4505910165484632, + "grad_norm": 2.899949312210083, + "learning_rate": 3.244605144779943e-06, + "loss": 0.501, + "step": 5183 + }, + { + "epoch": 2.451063829787234, + "grad_norm": 2.6494483947753906, + "learning_rate": 3.244009606913923e-06, + "loss": 0.4255, + "step": 5184 + }, + { + "epoch": 2.451536643026005, + "grad_norm": 2.4363760948181152, + "learning_rate": 3.243414022721686e-06, + "loss": 0.4402, + "step": 5185 + }, + { + "epoch": 2.4520094562647756, + "grad_norm": 2.4725022315979004, + "learning_rate": 3.242818392240317e-06, + "loss": 0.4388, + "step": 5186 + }, + { + "epoch": 2.452482269503546, + "grad_norm": 2.7010514736175537, + "learning_rate": 3.242222715506905e-06, + "loss": 0.4388, + "step": 5187 + }, + { + "epoch": 2.4529550827423168, + "grad_norm": 2.811464548110962, + "learning_rate": 3.241626992558539e-06, + "loss": 0.4634, + "step": 5188 + }, + { + "epoch": 2.4534278959810876, + "grad_norm": 2.6473052501678467, + "learning_rate": 3.2410312234323123e-06, + "loss": 0.4752, + "step": 5189 + }, + { + "epoch": 2.453900709219858, + "grad_norm": 2.5587213039398193, + "learning_rate": 3.24043540816532e-06, + "loss": 0.4458, + "step": 5190 + }, + { + "epoch": 2.4543735224586287, + "grad_norm": 2.6306557655334473, + "learning_rate": 3.239839546794662e-06, + "loss": 0.4081, + "step": 5191 + }, + { + "epoch": 2.4548463356973995, + "grad_norm": 2.4613633155822754, + "learning_rate": 3.23924363935744e-06, + "loss": 0.4165, + "step": 5192 + }, + { + "epoch": 2.4553191489361703, + "grad_norm": 2.7189204692840576, + "learning_rate": 3.238647685890757e-06, + "loss": 0.4822, + "step": 5193 + }, + { + "epoch": 2.455791962174941, + "grad_norm": 3.015977382659912, + "learning_rate": 3.238051686431722e-06, + "loss": 0.4964, + "step": 5194 + }, + { + "epoch": 2.4562647754137115, + "grad_norm": 2.8868937492370605, + "learning_rate": 3.2374556410174445e-06, + "loss": 0.4514, + "step": 5195 + }, + { + "epoch": 2.4567375886524823, + "grad_norm": 2.7959537506103516, + "learning_rate": 3.2368595496850375e-06, + "loss": 0.475, + "step": 5196 + }, + { + "epoch": 2.457210401891253, + "grad_norm": 3.0086777210235596, + "learning_rate": 3.2362634124716187e-06, + "loss": 0.4913, + "step": 5197 + }, + { + "epoch": 2.4576832151300234, + "grad_norm": 2.621335506439209, + "learning_rate": 3.2356672294143044e-06, + "loss": 0.4259, + "step": 5198 + }, + { + "epoch": 2.458156028368794, + "grad_norm": 3.1620380878448486, + "learning_rate": 3.235071000550218e-06, + "loss": 0.451, + "step": 5199 + }, + { + "epoch": 2.458628841607565, + "grad_norm": 2.7663278579711914, + "learning_rate": 3.234474725916484e-06, + "loss": 0.3854, + "step": 5200 + }, + { + "epoch": 2.459101654846336, + "grad_norm": 2.5187132358551025, + "learning_rate": 3.2338784055502288e-06, + "loss": 0.4068, + "step": 5201 + }, + { + "epoch": 2.4595744680851066, + "grad_norm": 2.6022701263427734, + "learning_rate": 3.233282039488583e-06, + "loss": 0.4484, + "step": 5202 + }, + { + "epoch": 2.460047281323877, + "grad_norm": 2.874750852584839, + "learning_rate": 3.2326856277686807e-06, + "loss": 0.45, + "step": 5203 + }, + { + "epoch": 2.4605200945626478, + "grad_norm": 2.671008586883545, + "learning_rate": 3.232089170427656e-06, + "loss": 0.4446, + 
"step": 5204 + }, + { + "epoch": 2.4609929078014185, + "grad_norm": 2.7365503311157227, + "learning_rate": 3.2314926675026498e-06, + "loss": 0.4402, + "step": 5205 + }, + { + "epoch": 2.461465721040189, + "grad_norm": 2.8163657188415527, + "learning_rate": 3.230896119030803e-06, + "loss": 0.3881, + "step": 5206 + }, + { + "epoch": 2.4619385342789597, + "grad_norm": 2.812433958053589, + "learning_rate": 3.2302995250492584e-06, + "loss": 0.4897, + "step": 5207 + }, + { + "epoch": 2.4624113475177305, + "grad_norm": 2.786033868789673, + "learning_rate": 3.2297028855951664e-06, + "loss": 0.4069, + "step": 5208 + }, + { + "epoch": 2.4628841607565013, + "grad_norm": 3.0247974395751953, + "learning_rate": 3.229106200705674e-06, + "loss": 0.4048, + "step": 5209 + }, + { + "epoch": 2.463356973995272, + "grad_norm": 3.3280487060546875, + "learning_rate": 3.2285094704179353e-06, + "loss": 0.5613, + "step": 5210 + }, + { + "epoch": 2.4638297872340424, + "grad_norm": 2.603219985961914, + "learning_rate": 3.2279126947691073e-06, + "loss": 0.432, + "step": 5211 + }, + { + "epoch": 2.4643026004728132, + "grad_norm": 3.1532180309295654, + "learning_rate": 3.2273158737963472e-06, + "loss": 0.4602, + "step": 5212 + }, + { + "epoch": 2.464775413711584, + "grad_norm": 2.7512969970703125, + "learning_rate": 3.2267190075368164e-06, + "loss": 0.5064, + "step": 5213 + }, + { + "epoch": 2.4652482269503544, + "grad_norm": 2.926992177963257, + "learning_rate": 3.22612209602768e-06, + "loss": 0.4753, + "step": 5214 + }, + { + "epoch": 2.465721040189125, + "grad_norm": 4.052840709686279, + "learning_rate": 3.2255251393061047e-06, + "loss": 0.5235, + "step": 5215 + }, + { + "epoch": 2.466193853427896, + "grad_norm": 2.8266959190368652, + "learning_rate": 3.2249281374092606e-06, + "loss": 0.3931, + "step": 5216 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 2.564359426498413, + "learning_rate": 3.2243310903743196e-06, + "loss": 0.4146, + "step": 5217 + }, + { + "epoch": 2.4671394799054376, + "grad_norm": 2.387925148010254, + "learning_rate": 3.2237339982384576e-06, + "loss": 0.4142, + "step": 5218 + }, + { + "epoch": 2.467612293144208, + "grad_norm": 2.7045164108276367, + "learning_rate": 3.223136861038853e-06, + "loss": 0.4345, + "step": 5219 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 2.6963284015655518, + "learning_rate": 3.2225396788126872e-06, + "loss": 0.4243, + "step": 5220 + }, + { + "epoch": 2.4685579196217495, + "grad_norm": 2.8247268199920654, + "learning_rate": 3.221942451597144e-06, + "loss": 0.3919, + "step": 5221 + }, + { + "epoch": 2.46903073286052, + "grad_norm": 3.843836784362793, + "learning_rate": 3.2213451794294093e-06, + "loss": 0.4183, + "step": 5222 + }, + { + "epoch": 2.4695035460992907, + "grad_norm": 2.8579909801483154, + "learning_rate": 3.220747862346674e-06, + "loss": 0.4844, + "step": 5223 + }, + { + "epoch": 2.4699763593380615, + "grad_norm": 3.744027853012085, + "learning_rate": 3.2201505003861294e-06, + "loss": 0.4563, + "step": 5224 + }, + { + "epoch": 2.4704491725768323, + "grad_norm": 2.835108995437622, + "learning_rate": 3.219553093584971e-06, + "loss": 0.4394, + "step": 5225 + }, + { + "epoch": 2.470921985815603, + "grad_norm": 2.5681865215301514, + "learning_rate": 3.218955641980397e-06, + "loss": 0.3907, + "step": 5226 + }, + { + "epoch": 2.4713947990543734, + "grad_norm": 2.963172674179077, + "learning_rate": 3.2183581456096067e-06, + "loss": 0.5163, + "step": 5227 + }, + { + "epoch": 2.4718676122931442, + "grad_norm": 2.7840685844421387, + "learning_rate": 
3.2177606045098047e-06, + "loss": 0.411, + "step": 5228 + }, + { + "epoch": 2.472340425531915, + "grad_norm": 2.7849979400634766, + "learning_rate": 3.2171630187181977e-06, + "loss": 0.4671, + "step": 5229 + }, + { + "epoch": 2.4728132387706854, + "grad_norm": 2.736406087875366, + "learning_rate": 3.216565388271994e-06, + "loss": 0.5225, + "step": 5230 + }, + { + "epoch": 2.473286052009456, + "grad_norm": 2.978271007537842, + "learning_rate": 3.215967713208406e-06, + "loss": 0.4668, + "step": 5231 + }, + { + "epoch": 2.473758865248227, + "grad_norm": 2.687560796737671, + "learning_rate": 3.2153699935646475e-06, + "loss": 0.4683, + "step": 5232 + }, + { + "epoch": 2.4742316784869978, + "grad_norm": 2.7096521854400635, + "learning_rate": 3.214772229377936e-06, + "loss": 0.4999, + "step": 5233 + }, + { + "epoch": 2.4747044917257686, + "grad_norm": 3.1861157417297363, + "learning_rate": 3.214174420685493e-06, + "loss": 0.4365, + "step": 5234 + }, + { + "epoch": 2.475177304964539, + "grad_norm": 2.623061418533325, + "learning_rate": 3.2135765675245394e-06, + "loss": 0.3717, + "step": 5235 + }, + { + "epoch": 2.4756501182033097, + "grad_norm": 2.680921792984009, + "learning_rate": 3.2129786699323016e-06, + "loss": 0.4688, + "step": 5236 + }, + { + "epoch": 2.4761229314420805, + "grad_norm": 2.80426025390625, + "learning_rate": 3.2123807279460096e-06, + "loss": 0.5043, + "step": 5237 + }, + { + "epoch": 2.476595744680851, + "grad_norm": 2.676156997680664, + "learning_rate": 3.211782741602893e-06, + "loss": 0.4486, + "step": 5238 + }, + { + "epoch": 2.4770685579196217, + "grad_norm": 2.700822591781616, + "learning_rate": 3.2111847109401855e-06, + "loss": 0.4097, + "step": 5239 + }, + { + "epoch": 2.4775413711583925, + "grad_norm": 2.735387086868286, + "learning_rate": 3.2105866359951254e-06, + "loss": 0.4357, + "step": 5240 + }, + { + "epoch": 2.4780141843971633, + "grad_norm": 2.961874485015869, + "learning_rate": 3.2099885168049507e-06, + "loss": 0.4942, + "step": 5241 + }, + { + "epoch": 2.478486997635934, + "grad_norm": 2.546588659286499, + "learning_rate": 3.209390353406904e-06, + "loss": 0.3852, + "step": 5242 + }, + { + "epoch": 2.4789598108747044, + "grad_norm": 2.6269772052764893, + "learning_rate": 3.208792145838231e-06, + "loss": 0.3935, + "step": 5243 + }, + { + "epoch": 2.479432624113475, + "grad_norm": 2.9009883403778076, + "learning_rate": 3.208193894136179e-06, + "loss": 0.4003, + "step": 5244 + }, + { + "epoch": 2.479905437352246, + "grad_norm": 2.772834300994873, + "learning_rate": 3.2075955983379982e-06, + "loss": 0.4742, + "step": 5245 + }, + { + "epoch": 2.4803782505910164, + "grad_norm": 2.728703737258911, + "learning_rate": 3.2069972584809423e-06, + "loss": 0.4405, + "step": 5246 + }, + { + "epoch": 2.480851063829787, + "grad_norm": 2.72868275642395, + "learning_rate": 3.206398874602268e-06, + "loss": 0.4714, + "step": 5247 + }, + { + "epoch": 2.481323877068558, + "grad_norm": 2.6804213523864746, + "learning_rate": 3.2058004467392323e-06, + "loss": 0.4106, + "step": 5248 + }, + { + "epoch": 2.4817966903073287, + "grad_norm": 2.6740739345550537, + "learning_rate": 3.205201974929098e-06, + "loss": 0.3855, + "step": 5249 + }, + { + "epoch": 2.482269503546099, + "grad_norm": 2.8131754398345947, + "learning_rate": 3.204603459209129e-06, + "loss": 0.418, + "step": 5250 + }, + { + "epoch": 2.48274231678487, + "grad_norm": 2.5242888927459717, + "learning_rate": 3.204004899616592e-06, + "loss": 0.4914, + "step": 5251 + }, + { + "epoch": 2.4832151300236407, + "grad_norm": 
2.969191551208496, + "learning_rate": 3.2034062961887567e-06, + "loss": 0.4634, + "step": 5252 + }, + { + "epoch": 2.4836879432624115, + "grad_norm": 2.967968463897705, + "learning_rate": 3.2028076489628963e-06, + "loss": 0.456, + "step": 5253 + }, + { + "epoch": 2.484160756501182, + "grad_norm": 2.9006540775299072, + "learning_rate": 3.2022089579762845e-06, + "loss": 0.4203, + "step": 5254 + }, + { + "epoch": 2.4846335697399526, + "grad_norm": 2.6377336978912354, + "learning_rate": 3.2016102232662003e-06, + "loss": 0.4518, + "step": 5255 + }, + { + "epoch": 2.4851063829787234, + "grad_norm": 2.757749319076538, + "learning_rate": 3.201011444869925e-06, + "loss": 0.4314, + "step": 5256 + }, + { + "epoch": 2.4855791962174942, + "grad_norm": 2.571560859680176, + "learning_rate": 3.20041262282474e-06, + "loss": 0.427, + "step": 5257 + }, + { + "epoch": 2.4860520094562646, + "grad_norm": 3.1367194652557373, + "learning_rate": 3.1998137571679316e-06, + "loss": 0.4901, + "step": 5258 + }, + { + "epoch": 2.4865248226950354, + "grad_norm": 3.194042205810547, + "learning_rate": 3.1992148479367896e-06, + "loss": 0.466, + "step": 5259 + }, + { + "epoch": 2.486997635933806, + "grad_norm": 2.5546324253082275, + "learning_rate": 3.1986158951686052e-06, + "loss": 0.4182, + "step": 5260 + }, + { + "epoch": 2.487470449172577, + "grad_norm": 2.919783115386963, + "learning_rate": 3.198016898900672e-06, + "loss": 0.4234, + "step": 5261 + }, + { + "epoch": 2.4879432624113473, + "grad_norm": 2.865248918533325, + "learning_rate": 3.1974178591702877e-06, + "loss": 0.4291, + "step": 5262 + }, + { + "epoch": 2.488416075650118, + "grad_norm": 2.685737133026123, + "learning_rate": 3.196818776014752e-06, + "loss": 0.4548, + "step": 5263 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 2.826974630355835, + "learning_rate": 3.196219649471365e-06, + "loss": 0.4152, + "step": 5264 + }, + { + "epoch": 2.4893617021276597, + "grad_norm": 2.764975070953369, + "learning_rate": 3.1956204795774336e-06, + "loss": 0.5209, + "step": 5265 + }, + { + "epoch": 2.48983451536643, + "grad_norm": 2.4184255599975586, + "learning_rate": 3.1950212663702662e-06, + "loss": 0.3969, + "step": 5266 + }, + { + "epoch": 2.490307328605201, + "grad_norm": 2.9361133575439453, + "learning_rate": 3.1944220098871713e-06, + "loss": 0.4589, + "step": 5267 + }, + { + "epoch": 2.4907801418439717, + "grad_norm": 2.377051830291748, + "learning_rate": 3.193822710165463e-06, + "loss": 0.4328, + "step": 5268 + }, + { + "epoch": 2.4912529550827425, + "grad_norm": 3.1302497386932373, + "learning_rate": 3.1932233672424563e-06, + "loss": 0.3918, + "step": 5269 + }, + { + "epoch": 2.491725768321513, + "grad_norm": 2.89577579498291, + "learning_rate": 3.192623981155471e-06, + "loss": 0.5004, + "step": 5270 + }, + { + "epoch": 2.4921985815602836, + "grad_norm": 2.7735235691070557, + "learning_rate": 3.1920245519418273e-06, + "loss": 0.4206, + "step": 5271 + }, + { + "epoch": 2.4926713947990544, + "grad_norm": 2.5424516201019287, + "learning_rate": 3.1914250796388493e-06, + "loss": 0.4419, + "step": 5272 + }, + { + "epoch": 2.493144208037825, + "grad_norm": 3.1216981410980225, + "learning_rate": 3.1908255642838628e-06, + "loss": 0.4552, + "step": 5273 + }, + { + "epoch": 2.4936170212765956, + "grad_norm": 3.044045925140381, + "learning_rate": 3.1902260059141978e-06, + "loss": 0.4967, + "step": 5274 + }, + { + "epoch": 2.4940898345153664, + "grad_norm": 2.5630741119384766, + "learning_rate": 3.189626404567186e-06, + "loss": 0.3908, + "step": 5275 + }, + { + "epoch": 
2.494562647754137, + "grad_norm": 2.7177648544311523, + "learning_rate": 3.189026760280162e-06, + "loss": 0.4915, + "step": 5276 + }, + { + "epoch": 2.495035460992908, + "grad_norm": 2.653416395187378, + "learning_rate": 3.1884270730904632e-06, + "loss": 0.4633, + "step": 5277 + }, + { + "epoch": 2.4955082742316783, + "grad_norm": 3.7212321758270264, + "learning_rate": 3.1878273430354284e-06, + "loss": 0.4549, + "step": 5278 + }, + { + "epoch": 2.495981087470449, + "grad_norm": 2.4152729511260986, + "learning_rate": 3.187227570152402e-06, + "loss": 0.4674, + "step": 5279 + }, + { + "epoch": 2.49645390070922, + "grad_norm": 2.5354862213134766, + "learning_rate": 3.1866277544787284e-06, + "loss": 0.4135, + "step": 5280 + }, + { + "epoch": 2.4969267139479907, + "grad_norm": 3.1766583919525146, + "learning_rate": 3.186027896051754e-06, + "loss": 0.5656, + "step": 5281 + }, + { + "epoch": 2.497399527186761, + "grad_norm": 2.5636754035949707, + "learning_rate": 3.1854279949088313e-06, + "loss": 0.4138, + "step": 5282 + }, + { + "epoch": 2.497872340425532, + "grad_norm": 2.7615602016448975, + "learning_rate": 3.1848280510873124e-06, + "loss": 0.4936, + "step": 5283 + }, + { + "epoch": 2.4983451536643027, + "grad_norm": 2.964721918106079, + "learning_rate": 3.1842280646245543e-06, + "loss": 0.4865, + "step": 5284 + }, + { + "epoch": 2.4988179669030735, + "grad_norm": 2.6915178298950195, + "learning_rate": 3.1836280355579152e-06, + "loss": 0.4179, + "step": 5285 + }, + { + "epoch": 2.499290780141844, + "grad_norm": 2.820451259613037, + "learning_rate": 3.183027963924755e-06, + "loss": 0.4785, + "step": 5286 + }, + { + "epoch": 2.4997635933806146, + "grad_norm": 2.841719627380371, + "learning_rate": 3.1824278497624393e-06, + "loss": 0.4535, + "step": 5287 + }, + { + "epoch": 2.5002364066193854, + "grad_norm": 2.459167957305908, + "learning_rate": 3.181827693108333e-06, + "loss": 0.4353, + "step": 5288 + }, + { + "epoch": 2.500709219858156, + "grad_norm": 3.2538363933563232, + "learning_rate": 3.1812274939998066e-06, + "loss": 0.4037, + "step": 5289 + }, + { + "epoch": 2.5011820330969265, + "grad_norm": 2.6980504989624023, + "learning_rate": 3.180627252474231e-06, + "loss": 0.4181, + "step": 5290 + }, + { + "epoch": 2.5016548463356973, + "grad_norm": 2.9400012493133545, + "learning_rate": 3.1800269685689804e-06, + "loss": 0.4642, + "step": 5291 + }, + { + "epoch": 2.502127659574468, + "grad_norm": 2.7832958698272705, + "learning_rate": 3.1794266423214328e-06, + "loss": 0.3936, + "step": 5292 + }, + { + "epoch": 2.5026004728132385, + "grad_norm": 2.4017868041992188, + "learning_rate": 3.178826273768967e-06, + "loss": 0.3984, + "step": 5293 + }, + { + "epoch": 2.5030732860520093, + "grad_norm": 2.398120641708374, + "learning_rate": 3.1782258629489665e-06, + "loss": 0.4219, + "step": 5294 + }, + { + "epoch": 2.50354609929078, + "grad_norm": 2.973947763442993, + "learning_rate": 3.177625409898815e-06, + "loss": 0.4192, + "step": 5295 + }, + { + "epoch": 2.504018912529551, + "grad_norm": 3.1169888973236084, + "learning_rate": 3.1770249146559006e-06, + "loss": 0.5098, + "step": 5296 + }, + { + "epoch": 2.5044917257683217, + "grad_norm": 2.816964864730835, + "learning_rate": 3.1764243772576132e-06, + "loss": 0.4228, + "step": 5297 + }, + { + "epoch": 2.504964539007092, + "grad_norm": 2.5624163150787354, + "learning_rate": 3.1758237977413452e-06, + "loss": 0.4389, + "step": 5298 + }, + { + "epoch": 2.505437352245863, + "grad_norm": 2.7477777004241943, + "learning_rate": 3.175223176144494e-06, + "loss": 
0.4564, + "step": 5299 + }, + { + "epoch": 2.5059101654846336, + "grad_norm": 3.1478309631347656, + "learning_rate": 3.174622512504456e-06, + "loss": 0.4859, + "step": 5300 + }, + { + "epoch": 2.506382978723404, + "grad_norm": 2.8400418758392334, + "learning_rate": 3.1740218068586315e-06, + "loss": 0.4476, + "step": 5301 + }, + { + "epoch": 2.506855791962175, + "grad_norm": 2.7097036838531494, + "learning_rate": 3.173421059244426e-06, + "loss": 0.4559, + "step": 5302 + }, + { + "epoch": 2.5073286052009456, + "grad_norm": 2.864760637283325, + "learning_rate": 3.172820269699243e-06, + "loss": 0.5124, + "step": 5303 + }, + { + "epoch": 2.5078014184397164, + "grad_norm": 2.877110004425049, + "learning_rate": 3.1722194382604926e-06, + "loss": 0.5083, + "step": 5304 + }, + { + "epoch": 2.508274231678487, + "grad_norm": 3.2369656562805176, + "learning_rate": 3.1716185649655844e-06, + "loss": 0.4894, + "step": 5305 + }, + { + "epoch": 2.5087470449172575, + "grad_norm": 2.7377753257751465, + "learning_rate": 3.171017649851934e-06, + "loss": 0.4324, + "step": 5306 + }, + { + "epoch": 2.5092198581560283, + "grad_norm": 2.883364200592041, + "learning_rate": 3.1704166929569564e-06, + "loss": 0.3731, + "step": 5307 + }, + { + "epoch": 2.509692671394799, + "grad_norm": 2.5724737644195557, + "learning_rate": 3.1698156943180716e-06, + "loss": 0.4768, + "step": 5308 + }, + { + "epoch": 2.5101654846335695, + "grad_norm": 2.7532460689544678, + "learning_rate": 3.1692146539727e-06, + "loss": 0.4385, + "step": 5309 + }, + { + "epoch": 2.5106382978723403, + "grad_norm": 2.786505699157715, + "learning_rate": 3.168613571958267e-06, + "loss": 0.4241, + "step": 5310 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 3.1674118041992188, + "learning_rate": 3.1680124483121975e-06, + "loss": 0.4445, + "step": 5311 + }, + { + "epoch": 2.511583924349882, + "grad_norm": 2.7861545085906982, + "learning_rate": 3.167411283071923e-06, + "loss": 0.4264, + "step": 5312 + }, + { + "epoch": 2.5120567375886527, + "grad_norm": 2.7412493228912354, + "learning_rate": 3.1668100762748745e-06, + "loss": 0.4725, + "step": 5313 + }, + { + "epoch": 2.512529550827423, + "grad_norm": 2.710019588470459, + "learning_rate": 3.1662088279584858e-06, + "loss": 0.5207, + "step": 5314 + }, + { + "epoch": 2.513002364066194, + "grad_norm": 2.694812297821045, + "learning_rate": 3.165607538160194e-06, + "loss": 0.3666, + "step": 5315 + }, + { + "epoch": 2.5134751773049646, + "grad_norm": 2.4390623569488525, + "learning_rate": 3.1650062069174405e-06, + "loss": 0.4025, + "step": 5316 + }, + { + "epoch": 2.513947990543735, + "grad_norm": 3.055738925933838, + "learning_rate": 3.1644048342676663e-06, + "loss": 0.4288, + "step": 5317 + }, + { + "epoch": 2.5144208037825058, + "grad_norm": 3.065824508666992, + "learning_rate": 3.163803420248316e-06, + "loss": 0.4592, + "step": 5318 + }, + { + "epoch": 2.5148936170212766, + "grad_norm": 2.6011085510253906, + "learning_rate": 3.163201964896838e-06, + "loss": 0.4081, + "step": 5319 + }, + { + "epoch": 2.5153664302600474, + "grad_norm": 2.4833033084869385, + "learning_rate": 3.162600468250681e-06, + "loss": 0.4343, + "step": 5320 + }, + { + "epoch": 2.515839243498818, + "grad_norm": 2.9035534858703613, + "learning_rate": 3.161998930347299e-06, + "loss": 0.4972, + "step": 5321 + }, + { + "epoch": 2.5163120567375885, + "grad_norm": 2.788752317428589, + "learning_rate": 3.161397351224146e-06, + "loss": 0.4597, + "step": 5322 + }, + { + "epoch": 2.5167848699763593, + "grad_norm": 2.4344491958618164, + 
"learning_rate": 3.16079573091868e-06, + "loss": 0.359, + "step": 5323 + }, + { + "epoch": 2.51725768321513, + "grad_norm": 2.750150680541992, + "learning_rate": 3.160194069468361e-06, + "loss": 0.4596, + "step": 5324 + }, + { + "epoch": 2.5177304964539005, + "grad_norm": 2.826902389526367, + "learning_rate": 3.1595923669106526e-06, + "loss": 0.4377, + "step": 5325 + }, + { + "epoch": 2.5182033096926713, + "grad_norm": 2.554439067840576, + "learning_rate": 3.15899062328302e-06, + "loss": 0.4517, + "step": 5326 + }, + { + "epoch": 2.518676122931442, + "grad_norm": 3.0882742404937744, + "learning_rate": 3.158388838622931e-06, + "loss": 0.47, + "step": 5327 + }, + { + "epoch": 2.519148936170213, + "grad_norm": 2.918947696685791, + "learning_rate": 3.157787012967856e-06, + "loss": 0.522, + "step": 5328 + }, + { + "epoch": 2.5196217494089836, + "grad_norm": 2.8057637214660645, + "learning_rate": 3.1571851463552674e-06, + "loss": 0.4837, + "step": 5329 + }, + { + "epoch": 2.520094562647754, + "grad_norm": 2.66241455078125, + "learning_rate": 3.156583238822641e-06, + "loss": 0.3988, + "step": 5330 + }, + { + "epoch": 2.520567375886525, + "grad_norm": 2.9793803691864014, + "learning_rate": 3.155981290407456e-06, + "loss": 0.4737, + "step": 5331 + }, + { + "epoch": 2.5210401891252956, + "grad_norm": 2.847522258758545, + "learning_rate": 3.1553793011471924e-06, + "loss": 0.4394, + "step": 5332 + }, + { + "epoch": 2.521513002364066, + "grad_norm": 2.9561474323272705, + "learning_rate": 3.154777271079333e-06, + "loss": 0.47, + "step": 5333 + }, + { + "epoch": 2.5219858156028367, + "grad_norm": 2.8353018760681152, + "learning_rate": 3.154175200241365e-06, + "loss": 0.4015, + "step": 5334 + }, + { + "epoch": 2.5224586288416075, + "grad_norm": 2.609049081802368, + "learning_rate": 3.153573088670775e-06, + "loss": 0.4723, + "step": 5335 + }, + { + "epoch": 2.5229314420803783, + "grad_norm": 2.8538455963134766, + "learning_rate": 3.1529709364050556e-06, + "loss": 0.4665, + "step": 5336 + }, + { + "epoch": 2.523404255319149, + "grad_norm": 2.768310785293579, + "learning_rate": 3.1523687434816978e-06, + "loss": 0.4933, + "step": 5337 + }, + { + "epoch": 2.5238770685579195, + "grad_norm": 2.9300906658172607, + "learning_rate": 3.1517665099382e-06, + "loss": 0.4651, + "step": 5338 + }, + { + "epoch": 2.5243498817966903, + "grad_norm": 2.6984703540802, + "learning_rate": 3.1511642358120585e-06, + "loss": 0.4442, + "step": 5339 + }, + { + "epoch": 2.524822695035461, + "grad_norm": 2.8148467540740967, + "learning_rate": 3.1505619211407762e-06, + "loss": 0.4611, + "step": 5340 + }, + { + "epoch": 2.5252955082742314, + "grad_norm": 2.816436290740967, + "learning_rate": 3.1499595659618556e-06, + "loss": 0.5291, + "step": 5341 + }, + { + "epoch": 2.5257683215130022, + "grad_norm": 2.902805805206299, + "learning_rate": 3.149357170312802e-06, + "loss": 0.4394, + "step": 5342 + }, + { + "epoch": 2.526241134751773, + "grad_norm": 2.6443474292755127, + "learning_rate": 3.148754734231126e-06, + "loss": 0.4444, + "step": 5343 + }, + { + "epoch": 2.526713947990544, + "grad_norm": 2.6818583011627197, + "learning_rate": 3.148152257754336e-06, + "loss": 0.4256, + "step": 5344 + }, + { + "epoch": 2.5271867612293146, + "grad_norm": 2.5266945362091064, + "learning_rate": 3.1475497409199485e-06, + "loss": 0.4087, + "step": 5345 + }, + { + "epoch": 2.527659574468085, + "grad_norm": 2.6326711177825928, + "learning_rate": 3.146947183765477e-06, + "loss": 0.3842, + "step": 5346 + }, + { + "epoch": 2.5281323877068558, + "grad_norm": 
3.122880697250366, + "learning_rate": 3.1463445863284413e-06, + "loss": 0.482, + "step": 5347 + }, + { + "epoch": 2.5286052009456266, + "grad_norm": 2.819258213043213, + "learning_rate": 3.145741948646362e-06, + "loss": 0.4628, + "step": 5348 + }, + { + "epoch": 2.529078014184397, + "grad_norm": 2.5842230319976807, + "learning_rate": 3.145139270756764e-06, + "loss": 0.4479, + "step": 5349 + }, + { + "epoch": 2.5295508274231677, + "grad_norm": 2.7257237434387207, + "learning_rate": 3.144536552697172e-06, + "loss": 0.473, + "step": 5350 + }, + { + "epoch": 2.5300236406619385, + "grad_norm": 2.6876981258392334, + "learning_rate": 3.143933794505115e-06, + "loss": 0.4615, + "step": 5351 + }, + { + "epoch": 2.5304964539007093, + "grad_norm": 2.7942895889282227, + "learning_rate": 3.143330996218124e-06, + "loss": 0.4982, + "step": 5352 + }, + { + "epoch": 2.53096926713948, + "grad_norm": 2.3150579929351807, + "learning_rate": 3.1427281578737327e-06, + "loss": 0.3905, + "step": 5353 + }, + { + "epoch": 2.5314420803782505, + "grad_norm": 2.7326138019561768, + "learning_rate": 3.142125279509478e-06, + "loss": 0.4076, + "step": 5354 + }, + { + "epoch": 2.5319148936170213, + "grad_norm": 2.46362566947937, + "learning_rate": 3.1415223611628976e-06, + "loss": 0.4043, + "step": 5355 + }, + { + "epoch": 2.532387706855792, + "grad_norm": 2.6670427322387695, + "learning_rate": 3.1409194028715323e-06, + "loss": 0.484, + "step": 5356 + }, + { + "epoch": 2.5328605200945624, + "grad_norm": 2.917771100997925, + "learning_rate": 3.140316404672926e-06, + "loss": 0.4539, + "step": 5357 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 2.7964110374450684, + "learning_rate": 3.1397133666046254e-06, + "loss": 0.4706, + "step": 5358 + }, + { + "epoch": 2.533806146572104, + "grad_norm": 2.6481330394744873, + "learning_rate": 3.139110288704179e-06, + "loss": 0.4101, + "step": 5359 + }, + { + "epoch": 2.534278959810875, + "grad_norm": 2.859452962875366, + "learning_rate": 3.1385071710091365e-06, + "loss": 0.4842, + "step": 5360 + }, + { + "epoch": 2.5347517730496456, + "grad_norm": 2.686077356338501, + "learning_rate": 3.137904013557052e-06, + "loss": 0.4073, + "step": 5361 + }, + { + "epoch": 2.535224586288416, + "grad_norm": 3.7147045135498047, + "learning_rate": 3.137300816385482e-06, + "loss": 0.4536, + "step": 5362 + }, + { + "epoch": 2.5356973995271868, + "grad_norm": 2.51054048538208, + "learning_rate": 3.1366975795319856e-06, + "loss": 0.4171, + "step": 5363 + }, + { + "epoch": 2.5361702127659576, + "grad_norm": 3.043149471282959, + "learning_rate": 3.136094303034121e-06, + "loss": 0.5179, + "step": 5364 + }, + { + "epoch": 2.536643026004728, + "grad_norm": 2.398878812789917, + "learning_rate": 3.1354909869294548e-06, + "loss": 0.4144, + "step": 5365 + }, + { + "epoch": 2.5371158392434987, + "grad_norm": 2.969712257385254, + "learning_rate": 3.134887631255551e-06, + "loss": 0.3983, + "step": 5366 + }, + { + "epoch": 2.5375886524822695, + "grad_norm": 2.7707982063293457, + "learning_rate": 3.134284236049978e-06, + "loss": 0.4405, + "step": 5367 + }, + { + "epoch": 2.5380614657210403, + "grad_norm": 2.579742193222046, + "learning_rate": 3.1336808013503073e-06, + "loss": 0.4402, + "step": 5368 + }, + { + "epoch": 2.538534278959811, + "grad_norm": 2.6041927337646484, + "learning_rate": 3.1330773271941113e-06, + "loss": 0.396, + "step": 5369 + }, + { + "epoch": 2.5390070921985815, + "grad_norm": 2.7383856773376465, + "learning_rate": 3.1324738136189658e-06, + "loss": 0.4424, + "step": 5370 + }, + { + "epoch": 
2.5394799054373522, + "grad_norm": 3.053644895553589, + "learning_rate": 3.13187026066245e-06, + "loss": 0.473, + "step": 5371 + }, + { + "epoch": 2.539952718676123, + "grad_norm": 2.684244155883789, + "learning_rate": 3.1312666683621428e-06, + "loss": 0.3963, + "step": 5372 + }, + { + "epoch": 2.5404255319148934, + "grad_norm": 2.6505017280578613, + "learning_rate": 3.130663036755629e-06, + "loss": 0.4292, + "step": 5373 + }, + { + "epoch": 2.540898345153664, + "grad_norm": 3.025965929031372, + "learning_rate": 3.1300593658804935e-06, + "loss": 0.4539, + "step": 5374 + }, + { + "epoch": 2.541371158392435, + "grad_norm": 2.72106671333313, + "learning_rate": 3.1294556557743237e-06, + "loss": 0.4519, + "step": 5375 + }, + { + "epoch": 2.541843971631206, + "grad_norm": 2.759995222091675, + "learning_rate": 3.12885190647471e-06, + "loss": 0.451, + "step": 5376 + }, + { + "epoch": 2.5423167848699766, + "grad_norm": 2.697950601577759, + "learning_rate": 3.1282481180192457e-06, + "loss": 0.4328, + "step": 5377 + }, + { + "epoch": 2.542789598108747, + "grad_norm": 2.6970415115356445, + "learning_rate": 3.127644290445526e-06, + "loss": 0.4489, + "step": 5378 + }, + { + "epoch": 2.5432624113475177, + "grad_norm": 2.5856997966766357, + "learning_rate": 3.127040423791148e-06, + "loss": 0.3848, + "step": 5379 + }, + { + "epoch": 2.5437352245862885, + "grad_norm": 2.9798166751861572, + "learning_rate": 3.1264365180937127e-06, + "loss": 0.5038, + "step": 5380 + }, + { + "epoch": 2.544208037825059, + "grad_norm": 3.413175106048584, + "learning_rate": 3.1258325733908224e-06, + "loss": 0.5247, + "step": 5381 + }, + { + "epoch": 2.5446808510638297, + "grad_norm": 2.838517904281616, + "learning_rate": 3.1252285897200818e-06, + "loss": 0.4652, + "step": 5382 + }, + { + "epoch": 2.5451536643026005, + "grad_norm": 2.8342528343200684, + "learning_rate": 3.1246245671190983e-06, + "loss": 0.4245, + "step": 5383 + }, + { + "epoch": 2.5456264775413713, + "grad_norm": 3.06026029586792, + "learning_rate": 3.124020505625482e-06, + "loss": 0.469, + "step": 5384 + }, + { + "epoch": 2.546099290780142, + "grad_norm": 2.633894681930542, + "learning_rate": 3.1234164052768452e-06, + "loss": 0.4509, + "step": 5385 + }, + { + "epoch": 2.5465721040189124, + "grad_norm": 2.634819984436035, + "learning_rate": 3.1228122661108023e-06, + "loss": 0.4879, + "step": 5386 + }, + { + "epoch": 2.5470449172576832, + "grad_norm": 3.9843504428863525, + "learning_rate": 3.1222080881649707e-06, + "loss": 0.4472, + "step": 5387 + }, + { + "epoch": 2.547517730496454, + "grad_norm": 2.5480258464813232, + "learning_rate": 3.1216038714769694e-06, + "loss": 0.4396, + "step": 5388 + }, + { + "epoch": 2.5479905437352244, + "grad_norm": 2.7461917400360107, + "learning_rate": 3.12099961608442e-06, + "loss": 0.4735, + "step": 5389 + }, + { + "epoch": 2.548463356973995, + "grad_norm": 3.167769193649292, + "learning_rate": 3.1203953220249493e-06, + "loss": 0.4196, + "step": 5390 + }, + { + "epoch": 2.548936170212766, + "grad_norm": 2.721696615219116, + "learning_rate": 3.1197909893361814e-06, + "loss": 0.4571, + "step": 5391 + }, + { + "epoch": 2.5494089834515368, + "grad_norm": 2.726668119430542, + "learning_rate": 3.1191866180557463e-06, + "loss": 0.4856, + "step": 5392 + }, + { + "epoch": 2.5498817966903076, + "grad_norm": 2.602205276489258, + "learning_rate": 3.1185822082212754e-06, + "loss": 0.4631, + "step": 5393 + }, + { + "epoch": 2.550354609929078, + "grad_norm": 2.7715859413146973, + "learning_rate": 3.1179777598704025e-06, + "loss": 0.4136, + 
"step": 5394 + }, + { + "epoch": 2.5508274231678487, + "grad_norm": 2.8081955909729004, + "learning_rate": 3.1173732730407647e-06, + "loss": 0.4963, + "step": 5395 + }, + { + "epoch": 2.5513002364066195, + "grad_norm": 2.946772336959839, + "learning_rate": 3.1167687477700006e-06, + "loss": 0.4443, + "step": 5396 + }, + { + "epoch": 2.55177304964539, + "grad_norm": 2.89345383644104, + "learning_rate": 3.1161641840957503e-06, + "loss": 0.4377, + "step": 5397 + }, + { + "epoch": 2.5522458628841607, + "grad_norm": 2.908317804336548, + "learning_rate": 3.115559582055659e-06, + "loss": 0.4702, + "step": 5398 + }, + { + "epoch": 2.5527186761229315, + "grad_norm": 2.554417848587036, + "learning_rate": 3.1149549416873704e-06, + "loss": 0.3738, + "step": 5399 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 2.3132457733154297, + "learning_rate": 3.1143502630285356e-06, + "loss": 0.4074, + "step": 5400 + }, + { + "epoch": 2.553664302600473, + "grad_norm": 2.751666784286499, + "learning_rate": 3.1137455461168026e-06, + "loss": 0.4697, + "step": 5401 + }, + { + "epoch": 2.5541371158392434, + "grad_norm": 2.7088871002197266, + "learning_rate": 3.113140790989826e-06, + "loss": 0.4754, + "step": 5402 + }, + { + "epoch": 2.554609929078014, + "grad_norm": 3.0633046627044678, + "learning_rate": 3.1125359976852605e-06, + "loss": 0.4874, + "step": 5403 + }, + { + "epoch": 2.555082742316785, + "grad_norm": 3.399456024169922, + "learning_rate": 3.111931166240764e-06, + "loss": 0.5529, + "step": 5404 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 2.7729690074920654, + "learning_rate": 3.1113262966939985e-06, + "loss": 0.4677, + "step": 5405 + }, + { + "epoch": 2.556028368794326, + "grad_norm": 2.81025767326355, + "learning_rate": 3.1107213890826244e-06, + "loss": 0.4954, + "step": 5406 + }, + { + "epoch": 2.556501182033097, + "grad_norm": 2.4837241172790527, + "learning_rate": 3.110116443444307e-06, + "loss": 0.3681, + "step": 5407 + }, + { + "epoch": 2.5569739952718678, + "grad_norm": 2.6406874656677246, + "learning_rate": 3.109511459816714e-06, + "loss": 0.4569, + "step": 5408 + }, + { + "epoch": 2.5574468085106385, + "grad_norm": 2.6093738079071045, + "learning_rate": 3.1089064382375155e-06, + "loss": 0.413, + "step": 5409 + }, + { + "epoch": 2.557919621749409, + "grad_norm": 2.6629011631011963, + "learning_rate": 3.108301378744383e-06, + "loss": 0.4286, + "step": 5410 + }, + { + "epoch": 2.5583924349881797, + "grad_norm": 2.694796323776245, + "learning_rate": 3.10769628137499e-06, + "loss": 0.4316, + "step": 5411 + }, + { + "epoch": 2.5588652482269505, + "grad_norm": 2.88023042678833, + "learning_rate": 3.107091146167015e-06, + "loss": 0.4378, + "step": 5412 + }, + { + "epoch": 2.559338061465721, + "grad_norm": 2.8804919719696045, + "learning_rate": 3.1064859731581365e-06, + "loss": 0.4971, + "step": 5413 + }, + { + "epoch": 2.5598108747044916, + "grad_norm": 2.850468397140503, + "learning_rate": 3.1058807623860353e-06, + "loss": 0.4686, + "step": 5414 + }, + { + "epoch": 2.5602836879432624, + "grad_norm": 3.0548019409179688, + "learning_rate": 3.1052755138883963e-06, + "loss": 0.4497, + "step": 5415 + }, + { + "epoch": 2.5607565011820332, + "grad_norm": 3.10168719291687, + "learning_rate": 3.1046702277029046e-06, + "loss": 0.569, + "step": 5416 + }, + { + "epoch": 2.561229314420804, + "grad_norm": 2.5887374877929688, + "learning_rate": 3.1040649038672494e-06, + "loss": 0.3812, + "step": 5417 + }, + { + "epoch": 2.5617021276595744, + "grad_norm": 2.9928438663482666, + "learning_rate": 
3.1034595424191212e-06, + "loss": 0.4308, + "step": 5418 + }, + { + "epoch": 2.562174940898345, + "grad_norm": 2.7003073692321777, + "learning_rate": 3.102854143396214e-06, + "loss": 0.4967, + "step": 5419 + }, + { + "epoch": 2.562647754137116, + "grad_norm": 3.172868490219116, + "learning_rate": 3.102248706836222e-06, + "loss": 0.5311, + "step": 5420 + }, + { + "epoch": 2.5631205673758863, + "grad_norm": 3.0146191120147705, + "learning_rate": 3.101643232776844e-06, + "loss": 0.4714, + "step": 5421 + }, + { + "epoch": 2.563593380614657, + "grad_norm": 3.0683791637420654, + "learning_rate": 3.1010377212557806e-06, + "loss": 0.4047, + "step": 5422 + }, + { + "epoch": 2.564066193853428, + "grad_norm": 2.8260676860809326, + "learning_rate": 3.1004321723107334e-06, + "loss": 0.5282, + "step": 5423 + }, + { + "epoch": 2.5645390070921987, + "grad_norm": 3.0792388916015625, + "learning_rate": 3.0998265859794074e-06, + "loss": 0.5323, + "step": 5424 + }, + { + "epoch": 2.5650118203309695, + "grad_norm": 2.7332866191864014, + "learning_rate": 3.09922096229951e-06, + "loss": 0.4401, + "step": 5425 + }, + { + "epoch": 2.56548463356974, + "grad_norm": 2.9366047382354736, + "learning_rate": 3.098615301308751e-06, + "loss": 0.4495, + "step": 5426 + }, + { + "epoch": 2.5659574468085107, + "grad_norm": 2.982088565826416, + "learning_rate": 3.098009603044842e-06, + "loss": 0.495, + "step": 5427 + }, + { + "epoch": 2.5664302600472815, + "grad_norm": 3.1204755306243896, + "learning_rate": 3.0974038675454976e-06, + "loss": 0.4354, + "step": 5428 + }, + { + "epoch": 2.566903073286052, + "grad_norm": 2.835238218307495, + "learning_rate": 3.0967980948484333e-06, + "loss": 0.4161, + "step": 5429 + }, + { + "epoch": 2.5673758865248226, + "grad_norm": 2.8104958534240723, + "learning_rate": 3.096192284991369e-06, + "loss": 0.5045, + "step": 5430 + }, + { + "epoch": 2.5678486997635934, + "grad_norm": 3.1636080741882324, + "learning_rate": 3.0955864380120247e-06, + "loss": 0.4533, + "step": 5431 + }, + { + "epoch": 2.568321513002364, + "grad_norm": 2.980112314224243, + "learning_rate": 3.0949805539481247e-06, + "loss": 0.3998, + "step": 5432 + }, + { + "epoch": 2.568794326241135, + "grad_norm": 2.6379945278167725, + "learning_rate": 3.0943746328373953e-06, + "loss": 0.3785, + "step": 5433 + }, + { + "epoch": 2.5692671394799054, + "grad_norm": 2.780930757522583, + "learning_rate": 3.0937686747175627e-06, + "loss": 0.4801, + "step": 5434 + }, + { + "epoch": 2.569739952718676, + "grad_norm": 2.6608550548553467, + "learning_rate": 3.0931626796263585e-06, + "loss": 0.4047, + "step": 5435 + }, + { + "epoch": 2.570212765957447, + "grad_norm": 3.130584716796875, + "learning_rate": 3.0925566476015156e-06, + "loss": 0.5049, + "step": 5436 + }, + { + "epoch": 2.5706855791962173, + "grad_norm": 2.9699313640594482, + "learning_rate": 3.0919505786807687e-06, + "loss": 0.3847, + "step": 5437 + }, + { + "epoch": 2.571158392434988, + "grad_norm": 2.919260025024414, + "learning_rate": 3.091344472901855e-06, + "loss": 0.4631, + "step": 5438 + }, + { + "epoch": 2.571631205673759, + "grad_norm": 2.956587553024292, + "learning_rate": 3.0907383303025134e-06, + "loss": 0.4974, + "step": 5439 + }, + { + "epoch": 2.5721040189125297, + "grad_norm": 2.758542776107788, + "learning_rate": 3.090132150920486e-06, + "loss": 0.4785, + "step": 5440 + }, + { + "epoch": 2.5725768321513005, + "grad_norm": 2.678469657897949, + "learning_rate": 3.0895259347935175e-06, + "loss": 0.4453, + "step": 5441 + }, + { + "epoch": 2.573049645390071, + "grad_norm": 
2.6508545875549316, + "learning_rate": 3.088919681959355e-06, + "loss": 0.4426, + "step": 5442 + }, + { + "epoch": 2.5735224586288417, + "grad_norm": 2.6156187057495117, + "learning_rate": 3.0883133924557453e-06, + "loss": 0.4445, + "step": 5443 + }, + { + "epoch": 2.5739952718676125, + "grad_norm": 2.484374761581421, + "learning_rate": 3.08770706632044e-06, + "loss": 0.4155, + "step": 5444 + }, + { + "epoch": 2.574468085106383, + "grad_norm": 2.7465295791625977, + "learning_rate": 3.087100703591193e-06, + "loss": 0.4085, + "step": 5445 + }, + { + "epoch": 2.5749408983451536, + "grad_norm": 2.771740198135376, + "learning_rate": 3.08649430430576e-06, + "loss": 0.4313, + "step": 5446 + }, + { + "epoch": 2.5754137115839244, + "grad_norm": 2.7480874061584473, + "learning_rate": 3.0858878685018984e-06, + "loss": 0.3471, + "step": 5447 + }, + { + "epoch": 2.575886524822695, + "grad_norm": 2.894913673400879, + "learning_rate": 3.085281396217368e-06, + "loss": 0.4888, + "step": 5448 + }, + { + "epoch": 2.576359338061466, + "grad_norm": 3.037628173828125, + "learning_rate": 3.0846748874899306e-06, + "loss": 0.3976, + "step": 5449 + }, + { + "epoch": 2.5768321513002364, + "grad_norm": 2.4811434745788574, + "learning_rate": 3.0840683423573526e-06, + "loss": 0.4822, + "step": 5450 + }, + { + "epoch": 2.577304964539007, + "grad_norm": 3.0078725814819336, + "learning_rate": 3.0834617608573998e-06, + "loss": 0.4999, + "step": 5451 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 3.174154043197632, + "learning_rate": 3.0828551430278413e-06, + "loss": 0.4626, + "step": 5452 + }, + { + "epoch": 2.5782505910165483, + "grad_norm": 2.8277535438537598, + "learning_rate": 3.082248488906449e-06, + "loss": 0.4633, + "step": 5453 + }, + { + "epoch": 2.578723404255319, + "grad_norm": 2.731767416000366, + "learning_rate": 3.0816417985309966e-06, + "loss": 0.4148, + "step": 5454 + }, + { + "epoch": 2.57919621749409, + "grad_norm": 2.5480549335479736, + "learning_rate": 3.0810350719392597e-06, + "loss": 0.4773, + "step": 5455 + }, + { + "epoch": 2.5796690307328607, + "grad_norm": 2.9755172729492188, + "learning_rate": 3.080428309169017e-06, + "loss": 0.5107, + "step": 5456 + }, + { + "epoch": 2.580141843971631, + "grad_norm": 2.6499290466308594, + "learning_rate": 3.079821510258048e-06, + "loss": 0.3982, + "step": 5457 + }, + { + "epoch": 2.580614657210402, + "grad_norm": 2.663214921951294, + "learning_rate": 3.079214675244136e-06, + "loss": 0.4419, + "step": 5458 + }, + { + "epoch": 2.5810874704491726, + "grad_norm": 2.595489263534546, + "learning_rate": 3.078607804165066e-06, + "loss": 0.3958, + "step": 5459 + }, + { + "epoch": 2.581560283687943, + "grad_norm": 3.031458854675293, + "learning_rate": 3.0780008970586255e-06, + "loss": 0.518, + "step": 5460 + }, + { + "epoch": 2.582033096926714, + "grad_norm": 2.827071189880371, + "learning_rate": 3.077393953962603e-06, + "loss": 0.4397, + "step": 5461 + }, + { + "epoch": 2.5825059101654846, + "grad_norm": 2.656111240386963, + "learning_rate": 3.0767869749147917e-06, + "loss": 0.4912, + "step": 5462 + }, + { + "epoch": 2.5829787234042554, + "grad_norm": 2.545365333557129, + "learning_rate": 3.076179959952984e-06, + "loss": 0.3991, + "step": 5463 + }, + { + "epoch": 2.583451536643026, + "grad_norm": 2.5794365406036377, + "learning_rate": 3.075572909114977e-06, + "loss": 0.4499, + "step": 5464 + }, + { + "epoch": 2.5839243498817965, + "grad_norm": 2.787140369415283, + "learning_rate": 3.074965822438568e-06, + "loss": 0.386, + "step": 5465 + }, + { + "epoch": 
2.5843971631205673, + "grad_norm": 2.6406853199005127, + "learning_rate": 3.0743586999615594e-06, + "loss": 0.4853, + "step": 5466 + }, + { + "epoch": 2.584869976359338, + "grad_norm": 2.8082082271575928, + "learning_rate": 3.073751541721752e-06, + "loss": 0.4669, + "step": 5467 + }, + { + "epoch": 2.5853427895981085, + "grad_norm": 2.8808975219726562, + "learning_rate": 3.073144347756952e-06, + "loss": 0.4193, + "step": 5468 + }, + { + "epoch": 2.5858156028368793, + "grad_norm": 2.823352813720703, + "learning_rate": 3.072537118104968e-06, + "loss": 0.482, + "step": 5469 + }, + { + "epoch": 2.58628841607565, + "grad_norm": 2.6454555988311768, + "learning_rate": 3.0719298528036073e-06, + "loss": 0.4667, + "step": 5470 + }, + { + "epoch": 2.586761229314421, + "grad_norm": 2.871145486831665, + "learning_rate": 3.0713225518906826e-06, + "loss": 0.5125, + "step": 5471 + }, + { + "epoch": 2.5872340425531917, + "grad_norm": 3.1301417350769043, + "learning_rate": 3.070715215404007e-06, + "loss": 0.4827, + "step": 5472 + }, + { + "epoch": 2.587706855791962, + "grad_norm": 2.31062912940979, + "learning_rate": 3.070107843381398e-06, + "loss": 0.3954, + "step": 5473 + }, + { + "epoch": 2.588179669030733, + "grad_norm": 2.8366353511810303, + "learning_rate": 3.069500435860674e-06, + "loss": 0.4597, + "step": 5474 + }, + { + "epoch": 2.5886524822695036, + "grad_norm": 2.900143623352051, + "learning_rate": 3.068892992879654e-06, + "loss": 0.4294, + "step": 5475 + }, + { + "epoch": 2.589125295508274, + "grad_norm": 2.923313617706299, + "learning_rate": 3.0682855144761626e-06, + "loss": 0.505, + "step": 5476 + }, + { + "epoch": 2.5895981087470448, + "grad_norm": 2.726475954055786, + "learning_rate": 3.0676780006880242e-06, + "loss": 0.4208, + "step": 5477 + }, + { + "epoch": 2.5900709219858156, + "grad_norm": 4.115052223205566, + "learning_rate": 3.0670704515530654e-06, + "loss": 0.466, + "step": 5478 + }, + { + "epoch": 2.5905437352245864, + "grad_norm": 2.6018717288970947, + "learning_rate": 3.0664628671091163e-06, + "loss": 0.4697, + "step": 5479 + }, + { + "epoch": 2.591016548463357, + "grad_norm": 2.7393722534179688, + "learning_rate": 3.0658552473940085e-06, + "loss": 0.4618, + "step": 5480 + }, + { + "epoch": 2.5914893617021275, + "grad_norm": 2.8406929969787598, + "learning_rate": 3.065247592445575e-06, + "loss": 0.4806, + "step": 5481 + }, + { + "epoch": 2.5919621749408983, + "grad_norm": 2.9773001670837402, + "learning_rate": 3.0646399023016525e-06, + "loss": 0.4764, + "step": 5482 + }, + { + "epoch": 2.592434988179669, + "grad_norm": 3.374643325805664, + "learning_rate": 3.0640321770000804e-06, + "loss": 0.4481, + "step": 5483 + }, + { + "epoch": 2.5929078014184395, + "grad_norm": 2.5742013454437256, + "learning_rate": 3.0634244165786965e-06, + "loss": 0.432, + "step": 5484 + }, + { + "epoch": 2.5933806146572103, + "grad_norm": 2.9390289783477783, + "learning_rate": 3.062816621075346e-06, + "loss": 0.3941, + "step": 5485 + }, + { + "epoch": 2.593853427895981, + "grad_norm": 2.683414936065674, + "learning_rate": 3.062208790527871e-06, + "loss": 0.4268, + "step": 5486 + }, + { + "epoch": 2.594326241134752, + "grad_norm": 2.689647674560547, + "learning_rate": 3.06160092497412e-06, + "loss": 0.4569, + "step": 5487 + }, + { + "epoch": 2.5947990543735227, + "grad_norm": 3.1170310974121094, + "learning_rate": 3.060993024451943e-06, + "loss": 0.4387, + "step": 5488 + }, + { + "epoch": 2.595271867612293, + "grad_norm": 2.8732447624206543, + "learning_rate": 3.0603850889991894e-06, + "loss": 0.451, + 
"step": 5489 + }, + { + "epoch": 2.595744680851064, + "grad_norm": 3.0444157123565674, + "learning_rate": 3.0597771186537135e-06, + "loss": 0.4691, + "step": 5490 + }, + { + "epoch": 2.5962174940898346, + "grad_norm": 2.3791720867156982, + "learning_rate": 3.0591691134533714e-06, + "loss": 0.4771, + "step": 5491 + }, + { + "epoch": 2.596690307328605, + "grad_norm": 3.0677225589752197, + "learning_rate": 3.05856107343602e-06, + "loss": 0.459, + "step": 5492 + }, + { + "epoch": 2.5971631205673757, + "grad_norm": 3.1702635288238525, + "learning_rate": 3.05795299863952e-06, + "loss": 0.4816, + "step": 5493 + }, + { + "epoch": 2.5976359338061465, + "grad_norm": 2.964869499206543, + "learning_rate": 3.057344889101734e-06, + "loss": 0.4369, + "step": 5494 + }, + { + "epoch": 2.5981087470449173, + "grad_norm": 3.1333882808685303, + "learning_rate": 3.056736744860525e-06, + "loss": 0.4178, + "step": 5495 + }, + { + "epoch": 2.598581560283688, + "grad_norm": 2.4340405464172363, + "learning_rate": 3.05612856595376e-06, + "loss": 0.4359, + "step": 5496 + }, + { + "epoch": 2.5990543735224585, + "grad_norm": 2.638620615005493, + "learning_rate": 3.0555203524193083e-06, + "loss": 0.3915, + "step": 5497 + }, + { + "epoch": 2.5995271867612293, + "grad_norm": 2.8218815326690674, + "learning_rate": 3.054912104295039e-06, + "loss": 0.4684, + "step": 5498 + }, + { + "epoch": 2.6, + "grad_norm": 2.6696009635925293, + "learning_rate": 3.054303821618827e-06, + "loss": 0.4073, + "step": 5499 + }, + { + "epoch": 2.6004728132387704, + "grad_norm": 2.3880512714385986, + "learning_rate": 3.0536955044285465e-06, + "loss": 0.3576, + "step": 5500 + }, + { + "epoch": 2.6009456264775412, + "grad_norm": 2.762890100479126, + "learning_rate": 3.053087152762075e-06, + "loss": 0.3857, + "step": 5501 + }, + { + "epoch": 2.601418439716312, + "grad_norm": 2.729033946990967, + "learning_rate": 3.052478766657292e-06, + "loss": 0.3935, + "step": 5502 + }, + { + "epoch": 2.601891252955083, + "grad_norm": 2.630490303039551, + "learning_rate": 3.051870346152078e-06, + "loss": 0.3932, + "step": 5503 + }, + { + "epoch": 2.6023640661938536, + "grad_norm": 3.0335981845855713, + "learning_rate": 3.051261891284318e-06, + "loss": 0.4313, + "step": 5504 + }, + { + "epoch": 2.602836879432624, + "grad_norm": 2.969888687133789, + "learning_rate": 3.0506534020918963e-06, + "loss": 0.4698, + "step": 5505 + }, + { + "epoch": 2.603309692671395, + "grad_norm": 3.093996524810791, + "learning_rate": 3.050044878612703e-06, + "loss": 0.5338, + "step": 5506 + }, + { + "epoch": 2.6037825059101656, + "grad_norm": 2.759993314743042, + "learning_rate": 3.049436320884626e-06, + "loss": 0.4429, + "step": 5507 + }, + { + "epoch": 2.604255319148936, + "grad_norm": 2.979422092437744, + "learning_rate": 3.0488277289455587e-06, + "loss": 0.4489, + "step": 5508 + }, + { + "epoch": 2.6047281323877067, + "grad_norm": 2.8266701698303223, + "learning_rate": 3.048219102833396e-06, + "loss": 0.489, + "step": 5509 + }, + { + "epoch": 2.6052009456264775, + "grad_norm": 2.2582461833953857, + "learning_rate": 3.047610442586033e-06, + "loss": 0.3759, + "step": 5510 + }, + { + "epoch": 2.6056737588652483, + "grad_norm": 3.078152894973755, + "learning_rate": 3.0470017482413694e-06, + "loss": 0.5059, + "step": 5511 + }, + { + "epoch": 2.606146572104019, + "grad_norm": 2.7895498275756836, + "learning_rate": 3.0463930198373047e-06, + "loss": 0.4752, + "step": 5512 + }, + { + "epoch": 2.6066193853427895, + "grad_norm": 3.2307958602905273, + "learning_rate": 3.045784257411743e-06, + 
"loss": 0.4847, + "step": 5513 + }, + { + "epoch": 2.6070921985815603, + "grad_norm": 2.793661594390869, + "learning_rate": 3.0451754610025884e-06, + "loss": 0.4492, + "step": 5514 + }, + { + "epoch": 2.607565011820331, + "grad_norm": 2.4443132877349854, + "learning_rate": 3.0445666306477484e-06, + "loss": 0.4174, + "step": 5515 + }, + { + "epoch": 2.6080378250591014, + "grad_norm": 2.628769636154175, + "learning_rate": 3.0439577663851326e-06, + "loss": 0.3889, + "step": 5516 + }, + { + "epoch": 2.608510638297872, + "grad_norm": 2.9367563724517822, + "learning_rate": 3.0433488682526525e-06, + "loss": 0.437, + "step": 5517 + }, + { + "epoch": 2.608983451536643, + "grad_norm": 3.171353340148926, + "learning_rate": 3.04273993628822e-06, + "loss": 0.47, + "step": 5518 + }, + { + "epoch": 2.609456264775414, + "grad_norm": 2.856576442718506, + "learning_rate": 3.0421309705297513e-06, + "loss": 0.4797, + "step": 5519 + }, + { + "epoch": 2.6099290780141846, + "grad_norm": 2.4926068782806396, + "learning_rate": 3.041521971015165e-06, + "loss": 0.4294, + "step": 5520 + }, + { + "epoch": 2.610401891252955, + "grad_norm": 2.7897613048553467, + "learning_rate": 3.040912937782379e-06, + "loss": 0.4388, + "step": 5521 + }, + { + "epoch": 2.6108747044917258, + "grad_norm": 3.588188886642456, + "learning_rate": 3.0403038708693173e-06, + "loss": 0.4027, + "step": 5522 + }, + { + "epoch": 2.6113475177304966, + "grad_norm": 3.5394980907440186, + "learning_rate": 3.0396947703139017e-06, + "loss": 0.4866, + "step": 5523 + }, + { + "epoch": 2.611820330969267, + "grad_norm": 3.086865186691284, + "learning_rate": 3.03908563615406e-06, + "loss": 0.4344, + "step": 5524 + }, + { + "epoch": 2.6122931442080377, + "grad_norm": 2.649564504623413, + "learning_rate": 3.0384764684277194e-06, + "loss": 0.4571, + "step": 5525 + }, + { + "epoch": 2.6127659574468085, + "grad_norm": 2.945234775543213, + "learning_rate": 3.0378672671728105e-06, + "loss": 0.4885, + "step": 5526 + }, + { + "epoch": 2.6132387706855793, + "grad_norm": 2.625424861907959, + "learning_rate": 3.037258032427265e-06, + "loss": 0.4095, + "step": 5527 + }, + { + "epoch": 2.61371158392435, + "grad_norm": 2.7597248554229736, + "learning_rate": 3.0366487642290175e-06, + "loss": 0.4393, + "step": 5528 + }, + { + "epoch": 2.6141843971631205, + "grad_norm": 2.721189260482788, + "learning_rate": 3.0360394626160043e-06, + "loss": 0.3865, + "step": 5529 + }, + { + "epoch": 2.6146572104018913, + "grad_norm": 2.624056339263916, + "learning_rate": 3.0354301276261656e-06, + "loss": 0.4273, + "step": 5530 + }, + { + "epoch": 2.615130023640662, + "grad_norm": 2.7764177322387695, + "learning_rate": 3.034820759297439e-06, + "loss": 0.4756, + "step": 5531 + }, + { + "epoch": 2.6156028368794324, + "grad_norm": 3.0841729640960693, + "learning_rate": 3.0342113576677696e-06, + "loss": 0.4907, + "step": 5532 + }, + { + "epoch": 2.616075650118203, + "grad_norm": 2.678715705871582, + "learning_rate": 3.0336019227751017e-06, + "loss": 0.4478, + "step": 5533 + }, + { + "epoch": 2.616548463356974, + "grad_norm": 2.378679037094116, + "learning_rate": 3.032992454657382e-06, + "loss": 0.3678, + "step": 5534 + }, + { + "epoch": 2.617021276595745, + "grad_norm": 2.792079210281372, + "learning_rate": 3.0323829533525583e-06, + "loss": 0.4115, + "step": 5535 + }, + { + "epoch": 2.6174940898345156, + "grad_norm": 2.738133192062378, + "learning_rate": 3.0317734188985832e-06, + "loss": 0.4152, + "step": 5536 + }, + { + "epoch": 2.617966903073286, + "grad_norm": 2.6963796615600586, + 
"learning_rate": 3.0311638513334084e-06, + "loss": 0.4096, + "step": 5537 + }, + { + "epoch": 2.6184397163120567, + "grad_norm": 2.694145679473877, + "learning_rate": 3.03055425069499e-06, + "loss": 0.3793, + "step": 5538 + }, + { + "epoch": 2.6189125295508275, + "grad_norm": 2.762403964996338, + "learning_rate": 3.0299446170212855e-06, + "loss": 0.459, + "step": 5539 + }, + { + "epoch": 2.619385342789598, + "grad_norm": 2.804382562637329, + "learning_rate": 3.0293349503502522e-06, + "loss": 0.4853, + "step": 5540 + }, + { + "epoch": 2.6198581560283687, + "grad_norm": 2.7768518924713135, + "learning_rate": 3.0287252507198537e-06, + "loss": 0.4496, + "step": 5541 + }, + { + "epoch": 2.6203309692671395, + "grad_norm": 2.9075138568878174, + "learning_rate": 3.028115518168052e-06, + "loss": 0.4498, + "step": 5542 + }, + { + "epoch": 2.6208037825059103, + "grad_norm": 2.8966822624206543, + "learning_rate": 3.0275057527328126e-06, + "loss": 0.4434, + "step": 5543 + }, + { + "epoch": 2.621276595744681, + "grad_norm": 2.8140156269073486, + "learning_rate": 3.0268959544521027e-06, + "loss": 0.3935, + "step": 5544 + }, + { + "epoch": 2.6217494089834514, + "grad_norm": 2.8606276512145996, + "learning_rate": 3.0262861233638924e-06, + "loss": 0.4222, + "step": 5545 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 3.003610134124756, + "learning_rate": 3.0256762595061522e-06, + "loss": 0.428, + "step": 5546 + }, + { + "epoch": 2.622695035460993, + "grad_norm": 2.725907802581787, + "learning_rate": 3.025066362916857e-06, + "loss": 0.3975, + "step": 5547 + }, + { + "epoch": 2.6231678486997634, + "grad_norm": 2.5247902870178223, + "learning_rate": 3.024456433633982e-06, + "loss": 0.4584, + "step": 5548 + }, + { + "epoch": 2.623640661938534, + "grad_norm": 2.932798147201538, + "learning_rate": 3.0238464716955045e-06, + "loss": 0.4991, + "step": 5549 + }, + { + "epoch": 2.624113475177305, + "grad_norm": 2.693547010421753, + "learning_rate": 3.023236477139404e-06, + "loss": 0.4405, + "step": 5550 + }, + { + "epoch": 2.6245862884160758, + "grad_norm": 3.2600035667419434, + "learning_rate": 3.022626450003662e-06, + "loss": 0.4904, + "step": 5551 + }, + { + "epoch": 2.6250591016548466, + "grad_norm": 2.9471960067749023, + "learning_rate": 3.0220163903262627e-06, + "loss": 0.4487, + "step": 5552 + }, + { + "epoch": 2.625531914893617, + "grad_norm": 2.583944082260132, + "learning_rate": 3.0214062981451926e-06, + "loss": 0.3552, + "step": 5553 + }, + { + "epoch": 2.6260047281323877, + "grad_norm": 2.675062656402588, + "learning_rate": 3.0207961734984377e-06, + "loss": 0.4524, + "step": 5554 + }, + { + "epoch": 2.6264775413711585, + "grad_norm": 3.0126802921295166, + "learning_rate": 3.0201860164239887e-06, + "loss": 0.4124, + "step": 5555 + }, + { + "epoch": 2.626950354609929, + "grad_norm": 2.490734577178955, + "learning_rate": 3.019575826959838e-06, + "loss": 0.4095, + "step": 5556 + }, + { + "epoch": 2.6274231678486997, + "grad_norm": 2.72817063331604, + "learning_rate": 3.018965605143978e-06, + "loss": 0.4298, + "step": 5557 + }, + { + "epoch": 2.6278959810874705, + "grad_norm": 3.1298327445983887, + "learning_rate": 3.0183553510144064e-06, + "loss": 0.4961, + "step": 5558 + }, + { + "epoch": 2.6283687943262413, + "grad_norm": 3.2379956245422363, + "learning_rate": 3.0177450646091195e-06, + "loss": 0.4943, + "step": 5559 + }, + { + "epoch": 2.628841607565012, + "grad_norm": 2.5040571689605713, + "learning_rate": 3.017134745966117e-06, + "loss": 0.3701, + "step": 5560 + }, + { + "epoch": 2.6293144208037824, 
+ "grad_norm": 3.047184944152832, + "learning_rate": 3.0165243951234025e-06, + "loss": 0.4587, + "step": 5561 + }, + { + "epoch": 2.629787234042553, + "grad_norm": 2.4926774501800537, + "learning_rate": 3.0159140121189783e-06, + "loss": 0.3723, + "step": 5562 + }, + { + "epoch": 2.630260047281324, + "grad_norm": 2.5434961318969727, + "learning_rate": 3.015303596990851e-06, + "loss": 0.4176, + "step": 5563 + }, + { + "epoch": 2.6307328605200944, + "grad_norm": 2.5117976665496826, + "learning_rate": 3.0146931497770284e-06, + "loss": 0.4218, + "step": 5564 + }, + { + "epoch": 2.631205673758865, + "grad_norm": 2.9408798217773438, + "learning_rate": 3.0140826705155196e-06, + "loss": 0.4473, + "step": 5565 + }, + { + "epoch": 2.631678486997636, + "grad_norm": 2.996422052383423, + "learning_rate": 3.0134721592443385e-06, + "loss": 0.4513, + "step": 5566 + }, + { + "epoch": 2.6321513002364068, + "grad_norm": 2.984356164932251, + "learning_rate": 3.0128616160014955e-06, + "loss": 0.4749, + "step": 5567 + }, + { + "epoch": 2.6326241134751776, + "grad_norm": 2.6075069904327393, + "learning_rate": 3.0122510408250095e-06, + "loss": 0.4707, + "step": 5568 + }, + { + "epoch": 2.633096926713948, + "grad_norm": 2.9463071823120117, + "learning_rate": 3.0116404337528972e-06, + "loss": 0.5125, + "step": 5569 + }, + { + "epoch": 2.6335697399527187, + "grad_norm": 2.98574161529541, + "learning_rate": 3.0110297948231787e-06, + "loss": 0.4487, + "step": 5570 + }, + { + "epoch": 2.6340425531914895, + "grad_norm": 2.6039397716522217, + "learning_rate": 3.010419124073876e-06, + "loss": 0.4516, + "step": 5571 + }, + { + "epoch": 2.63451536643026, + "grad_norm": 2.8480236530303955, + "learning_rate": 3.0098084215430124e-06, + "loss": 0.4962, + "step": 5572 + }, + { + "epoch": 2.6349881796690307, + "grad_norm": 2.527597427368164, + "learning_rate": 3.0091976872686133e-06, + "loss": 0.435, + "step": 5573 + }, + { + "epoch": 2.6354609929078014, + "grad_norm": 2.898303508758545, + "learning_rate": 3.0085869212887076e-06, + "loss": 0.4473, + "step": 5574 + }, + { + "epoch": 2.6359338061465722, + "grad_norm": 2.981414318084717, + "learning_rate": 3.007976123641324e-06, + "loss": 0.4203, + "step": 5575 + }, + { + "epoch": 2.636406619385343, + "grad_norm": 3.219064474105835, + "learning_rate": 3.0073652943644947e-06, + "loss": 0.4596, + "step": 5576 + }, + { + "epoch": 2.6368794326241134, + "grad_norm": 2.7287049293518066, + "learning_rate": 3.0067544334962532e-06, + "loss": 0.433, + "step": 5577 + }, + { + "epoch": 2.637352245862884, + "grad_norm": 2.6232664585113525, + "learning_rate": 3.0061435410746352e-06, + "loss": 0.4254, + "step": 5578 + }, + { + "epoch": 2.637825059101655, + "grad_norm": 2.908311605453491, + "learning_rate": 3.0055326171376788e-06, + "loss": 0.4349, + "step": 5579 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 2.8369064331054688, + "learning_rate": 3.0049216617234224e-06, + "loss": 0.4675, + "step": 5580 + }, + { + "epoch": 2.638770685579196, + "grad_norm": 2.659499406814575, + "learning_rate": 3.0043106748699085e-06, + "loss": 0.4073, + "step": 5581 + }, + { + "epoch": 2.639243498817967, + "grad_norm": 2.579765558242798, + "learning_rate": 3.00369965661518e-06, + "loss": 0.4536, + "step": 5582 + }, + { + "epoch": 2.6397163120567377, + "grad_norm": 3.572861909866333, + "learning_rate": 3.0030886069972827e-06, + "loss": 0.5227, + "step": 5583 + }, + { + "epoch": 2.6401891252955085, + "grad_norm": 2.6523196697235107, + "learning_rate": 3.002477526054263e-06, + "loss": 0.3846, + "step": 5584 + 
}, + { + "epoch": 2.640661938534279, + "grad_norm": 3.072181463241577, + "learning_rate": 3.001866413824173e-06, + "loss": 0.5399, + "step": 5585 + }, + { + "epoch": 2.6411347517730497, + "grad_norm": 2.7304325103759766, + "learning_rate": 3.0012552703450597e-06, + "loss": 0.4048, + "step": 5586 + }, + { + "epoch": 2.6416075650118205, + "grad_norm": 3.039491891860962, + "learning_rate": 3.0006440956549798e-06, + "loss": 0.5035, + "step": 5587 + }, + { + "epoch": 2.642080378250591, + "grad_norm": 2.7623798847198486, + "learning_rate": 3.000032889791988e-06, + "loss": 0.4369, + "step": 5588 + }, + { + "epoch": 2.6425531914893616, + "grad_norm": 3.391052722930908, + "learning_rate": 2.9994216527941394e-06, + "loss": 0.5308, + "step": 5589 + }, + { + "epoch": 2.6430260047281324, + "grad_norm": 3.0263915061950684, + "learning_rate": 2.9988103846994954e-06, + "loss": 0.4319, + "step": 5590 + }, + { + "epoch": 2.6434988179669032, + "grad_norm": 2.786607027053833, + "learning_rate": 2.998199085546115e-06, + "loss": 0.4695, + "step": 5591 + }, + { + "epoch": 2.643971631205674, + "grad_norm": 2.884674310684204, + "learning_rate": 2.9975877553720627e-06, + "loss": 0.4615, + "step": 5592 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 2.6100499629974365, + "learning_rate": 2.996976394215402e-06, + "loss": 0.4784, + "step": 5593 + }, + { + "epoch": 2.644917257683215, + "grad_norm": 2.6978676319122314, + "learning_rate": 2.9963650021142018e-06, + "loss": 0.3911, + "step": 5594 + }, + { + "epoch": 2.645390070921986, + "grad_norm": 2.8080835342407227, + "learning_rate": 2.9957535791065284e-06, + "loss": 0.4997, + "step": 5595 + }, + { + "epoch": 2.6458628841607563, + "grad_norm": 2.6639578342437744, + "learning_rate": 2.9951421252304537e-06, + "loss": 0.4066, + "step": 5596 + }, + { + "epoch": 2.646335697399527, + "grad_norm": 3.102456569671631, + "learning_rate": 2.9945306405240505e-06, + "loss": 0.5554, + "step": 5597 + }, + { + "epoch": 2.646808510638298, + "grad_norm": 2.6524150371551514, + "learning_rate": 2.993919125025392e-06, + "loss": 0.3881, + "step": 5598 + }, + { + "epoch": 2.6472813238770687, + "grad_norm": 2.926316499710083, + "learning_rate": 2.993307578772556e-06, + "loss": 0.4845, + "step": 5599 + }, + { + "epoch": 2.6477541371158395, + "grad_norm": 3.346550703048706, + "learning_rate": 2.9926960018036195e-06, + "loss": 0.4481, + "step": 5600 + }, + { + "epoch": 2.64822695035461, + "grad_norm": 2.6211020946502686, + "learning_rate": 2.9920843941566634e-06, + "loss": 0.4355, + "step": 5601 + }, + { + "epoch": 2.6486997635933807, + "grad_norm": 2.7479333877563477, + "learning_rate": 2.99147275586977e-06, + "loss": 0.4373, + "step": 5602 + }, + { + "epoch": 2.6491725768321515, + "grad_norm": 2.523385524749756, + "learning_rate": 2.9908610869810235e-06, + "loss": 0.4467, + "step": 5603 + }, + { + "epoch": 2.649645390070922, + "grad_norm": 2.93886137008667, + "learning_rate": 2.9902493875285086e-06, + "loss": 0.4956, + "step": 5604 + }, + { + "epoch": 2.6501182033096926, + "grad_norm": 2.7630443572998047, + "learning_rate": 2.989637657550315e-06, + "loss": 0.5012, + "step": 5605 + }, + { + "epoch": 2.6505910165484634, + "grad_norm": 2.6733906269073486, + "learning_rate": 2.989025897084531e-06, + "loss": 0.446, + "step": 5606 + }, + { + "epoch": 2.651063829787234, + "grad_norm": 2.8411107063293457, + "learning_rate": 2.9884141061692484e-06, + "loss": 0.4817, + "step": 5607 + }, + { + "epoch": 2.651536643026005, + "grad_norm": 2.8667192459106445, + "learning_rate": 2.987802284842562e-06, 
+ "loss": 0.3909, + "step": 5608 + }, + { + "epoch": 2.6520094562647754, + "grad_norm": 3.4640755653381348, + "learning_rate": 2.987190433142565e-06, + "loss": 0.4379, + "step": 5609 + }, + { + "epoch": 2.652482269503546, + "grad_norm": 2.675121307373047, + "learning_rate": 2.9865785511073565e-06, + "loss": 0.4833, + "step": 5610 + }, + { + "epoch": 2.652955082742317, + "grad_norm": 2.4375529289245605, + "learning_rate": 2.9859666387750353e-06, + "loss": 0.3949, + "step": 5611 + }, + { + "epoch": 2.6534278959810873, + "grad_norm": 2.7312581539154053, + "learning_rate": 2.9853546961837026e-06, + "loss": 0.4546, + "step": 5612 + }, + { + "epoch": 2.653900709219858, + "grad_norm": 2.7695999145507812, + "learning_rate": 2.9847427233714617e-06, + "loss": 0.4696, + "step": 5613 + }, + { + "epoch": 2.654373522458629, + "grad_norm": 2.6313109397888184, + "learning_rate": 2.984130720376416e-06, + "loss": 0.4733, + "step": 5614 + }, + { + "epoch": 2.6548463356973997, + "grad_norm": 2.656864881515503, + "learning_rate": 2.9835186872366733e-06, + "loss": 0.3806, + "step": 5615 + }, + { + "epoch": 2.65531914893617, + "grad_norm": 2.720075845718384, + "learning_rate": 2.982906623990342e-06, + "loss": 0.4041, + "step": 5616 + }, + { + "epoch": 2.655791962174941, + "grad_norm": 2.6684951782226562, + "learning_rate": 2.9822945306755334e-06, + "loss": 0.4552, + "step": 5617 + }, + { + "epoch": 2.6562647754137116, + "grad_norm": 2.567751884460449, + "learning_rate": 2.9816824073303585e-06, + "loss": 0.465, + "step": 5618 + }, + { + "epoch": 2.656737588652482, + "grad_norm": 2.7490367889404297, + "learning_rate": 2.981070253992933e-06, + "loss": 0.4647, + "step": 5619 + }, + { + "epoch": 2.657210401891253, + "grad_norm": 2.548656463623047, + "learning_rate": 2.9804580707013715e-06, + "loss": 0.4226, + "step": 5620 + }, + { + "epoch": 2.6576832151300236, + "grad_norm": 2.5484731197357178, + "learning_rate": 2.9798458574937927e-06, + "loss": 0.382, + "step": 5621 + }, + { + "epoch": 2.6581560283687944, + "grad_norm": 2.7293949127197266, + "learning_rate": 2.979233614408317e-06, + "loss": 0.4418, + "step": 5622 + }, + { + "epoch": 2.658628841607565, + "grad_norm": 2.645036458969116, + "learning_rate": 2.9786213414830646e-06, + "loss": 0.414, + "step": 5623 + }, + { + "epoch": 2.6591016548463355, + "grad_norm": 2.5287609100341797, + "learning_rate": 2.9780090387561604e-06, + "loss": 0.3914, + "step": 5624 + }, + { + "epoch": 2.6595744680851063, + "grad_norm": 2.5570411682128906, + "learning_rate": 2.9773967062657293e-06, + "loss": 0.4431, + "step": 5625 + }, + { + "epoch": 2.660047281323877, + "grad_norm": 2.681749105453491, + "learning_rate": 2.9767843440498983e-06, + "loss": 0.4245, + "step": 5626 + }, + { + "epoch": 2.6605200945626475, + "grad_norm": 2.8629777431488037, + "learning_rate": 2.976171952146798e-06, + "loss": 0.4643, + "step": 5627 + }, + { + "epoch": 2.6609929078014183, + "grad_norm": 2.577148199081421, + "learning_rate": 2.9755595305945573e-06, + "loss": 0.43, + "step": 5628 + }, + { + "epoch": 2.661465721040189, + "grad_norm": 2.747218370437622, + "learning_rate": 2.97494707943131e-06, + "loss": 0.5194, + "step": 5629 + }, + { + "epoch": 2.66193853427896, + "grad_norm": 2.535604953765869, + "learning_rate": 2.9743345986951904e-06, + "loss": 0.4401, + "step": 5630 + }, + { + "epoch": 2.6624113475177307, + "grad_norm": 3.3341166973114014, + "learning_rate": 2.973722088424336e-06, + "loss": 0.4925, + "step": 5631 + }, + { + "epoch": 2.662884160756501, + "grad_norm": 2.9264349937438965, + 
"learning_rate": 2.973109548656884e-06, + "loss": 0.4787, + "step": 5632 + }, + { + "epoch": 2.663356973995272, + "grad_norm": 2.7132506370544434, + "learning_rate": 2.9724969794309742e-06, + "loss": 0.4138, + "step": 5633 + }, + { + "epoch": 2.6638297872340426, + "grad_norm": 2.7970192432403564, + "learning_rate": 2.9718843807847497e-06, + "loss": 0.4896, + "step": 5634 + }, + { + "epoch": 2.664302600472813, + "grad_norm": 2.610208749771118, + "learning_rate": 2.9712717527563545e-06, + "loss": 0.3997, + "step": 5635 + }, + { + "epoch": 2.6647754137115838, + "grad_norm": 3.5483577251434326, + "learning_rate": 2.9706590953839335e-06, + "loss": 0.5109, + "step": 5636 + }, + { + "epoch": 2.6652482269503546, + "grad_norm": 2.746933698654175, + "learning_rate": 2.9700464087056345e-06, + "loss": 0.4672, + "step": 5637 + }, + { + "epoch": 2.6657210401891254, + "grad_norm": 2.704436779022217, + "learning_rate": 2.969433692759607e-06, + "loss": 0.4402, + "step": 5638 + }, + { + "epoch": 2.666193853427896, + "grad_norm": 2.859520196914673, + "learning_rate": 2.9688209475840005e-06, + "loss": 0.4679, + "step": 5639 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 2.518580436706543, + "learning_rate": 2.968208173216971e-06, + "loss": 0.3772, + "step": 5640 + }, + { + "epoch": 2.6671394799054373, + "grad_norm": 2.7624926567077637, + "learning_rate": 2.967595369696671e-06, + "loss": 0.4753, + "step": 5641 + }, + { + "epoch": 2.667612293144208, + "grad_norm": 2.654003620147705, + "learning_rate": 2.966982537061257e-06, + "loss": 0.4583, + "step": 5642 + }, + { + "epoch": 2.6680851063829785, + "grad_norm": 2.8473968505859375, + "learning_rate": 2.966369675348888e-06, + "loss": 0.4623, + "step": 5643 + }, + { + "epoch": 2.6685579196217493, + "grad_norm": 2.5587947368621826, + "learning_rate": 2.9657567845977253e-06, + "loss": 0.4014, + "step": 5644 + }, + { + "epoch": 2.66903073286052, + "grad_norm": 2.572220802307129, + "learning_rate": 2.96514386484593e-06, + "loss": 0.4249, + "step": 5645 + }, + { + "epoch": 2.669503546099291, + "grad_norm": 2.7995707988739014, + "learning_rate": 2.964530916131665e-06, + "loss": 0.4575, + "step": 5646 + }, + { + "epoch": 2.6699763593380617, + "grad_norm": 2.8712687492370605, + "learning_rate": 2.963917938493097e-06, + "loss": 0.4353, + "step": 5647 + }, + { + "epoch": 2.670449172576832, + "grad_norm": 2.856473207473755, + "learning_rate": 2.963304931968393e-06, + "loss": 0.4345, + "step": 5648 + }, + { + "epoch": 2.670921985815603, + "grad_norm": 2.709198474884033, + "learning_rate": 2.9626918965957224e-06, + "loss": 0.4116, + "step": 5649 + }, + { + "epoch": 2.6713947990543736, + "grad_norm": 2.8144607543945312, + "learning_rate": 2.962078832413257e-06, + "loss": 0.4575, + "step": 5650 + }, + { + "epoch": 2.671867612293144, + "grad_norm": 3.131911039352417, + "learning_rate": 2.961465739459168e-06, + "loss": 0.4743, + "step": 5651 + }, + { + "epoch": 2.6723404255319148, + "grad_norm": 2.8487515449523926, + "learning_rate": 2.9608526177716316e-06, + "loss": 0.4314, + "step": 5652 + }, + { + "epoch": 2.6728132387706856, + "grad_norm": 2.613229751586914, + "learning_rate": 2.960239467388823e-06, + "loss": 0.4807, + "step": 5653 + }, + { + "epoch": 2.6732860520094563, + "grad_norm": 2.5049116611480713, + "learning_rate": 2.9596262883489213e-06, + "loss": 0.4708, + "step": 5654 + }, + { + "epoch": 2.673758865248227, + "grad_norm": 2.6347460746765137, + "learning_rate": 2.9590130806901052e-06, + "loss": 0.3689, + "step": 5655 + }, + { + "epoch": 2.6742316784869975, + 
"grad_norm": 3.3290371894836426, + "learning_rate": 2.9583998444505578e-06, + "loss": 0.4674, + "step": 5656 + }, + { + "epoch": 2.6747044917257683, + "grad_norm": 2.748403549194336, + "learning_rate": 2.957786579668462e-06, + "loss": 0.3852, + "step": 5657 + }, + { + "epoch": 2.675177304964539, + "grad_norm": 2.837573766708374, + "learning_rate": 2.957173286382003e-06, + "loss": 0.4541, + "step": 5658 + }, + { + "epoch": 2.6756501182033094, + "grad_norm": 3.0976510047912598, + "learning_rate": 2.9565599646293686e-06, + "loss": 0.4669, + "step": 5659 + }, + { + "epoch": 2.6761229314420802, + "grad_norm": 2.7059597969055176, + "learning_rate": 2.955946614448747e-06, + "loss": 0.3935, + "step": 5660 + }, + { + "epoch": 2.676595744680851, + "grad_norm": 2.6700541973114014, + "learning_rate": 2.9553332358783294e-06, + "loss": 0.4322, + "step": 5661 + }, + { + "epoch": 2.677068557919622, + "grad_norm": 2.9782698154449463, + "learning_rate": 2.9547198289563068e-06, + "loss": 0.4338, + "step": 5662 + }, + { + "epoch": 2.6775413711583926, + "grad_norm": 2.637876510620117, + "learning_rate": 2.9541063937208755e-06, + "loss": 0.4289, + "step": 5663 + }, + { + "epoch": 2.678014184397163, + "grad_norm": 3.421949863433838, + "learning_rate": 2.953492930210229e-06, + "loss": 0.5458, + "step": 5664 + }, + { + "epoch": 2.678486997635934, + "grad_norm": 2.8273842334747314, + "learning_rate": 2.952879438462567e-06, + "loss": 0.4529, + "step": 5665 + }, + { + "epoch": 2.6789598108747046, + "grad_norm": 2.9090168476104736, + "learning_rate": 2.9522659185160873e-06, + "loss": 0.444, + "step": 5666 + }, + { + "epoch": 2.679432624113475, + "grad_norm": 2.646710157394409, + "learning_rate": 2.9516523704089927e-06, + "loss": 0.4226, + "step": 5667 + }, + { + "epoch": 2.6799054373522457, + "grad_norm": 2.65915584564209, + "learning_rate": 2.951038794179486e-06, + "loss": 0.4307, + "step": 5668 + }, + { + "epoch": 2.6803782505910165, + "grad_norm": 3.004507303237915, + "learning_rate": 2.950425189865771e-06, + "loss": 0.4799, + "step": 5669 + }, + { + "epoch": 2.6808510638297873, + "grad_norm": 2.5210134983062744, + "learning_rate": 2.949811557506054e-06, + "loss": 0.3842, + "step": 5670 + }, + { + "epoch": 2.681323877068558, + "grad_norm": 2.8072893619537354, + "learning_rate": 2.9491978971385436e-06, + "loss": 0.435, + "step": 5671 + }, + { + "epoch": 2.6817966903073285, + "grad_norm": 2.5701990127563477, + "learning_rate": 2.9485842088014498e-06, + "loss": 0.4932, + "step": 5672 + }, + { + "epoch": 2.6822695035460993, + "grad_norm": 2.9368457794189453, + "learning_rate": 2.9479704925329854e-06, + "loss": 0.455, + "step": 5673 + }, + { + "epoch": 2.68274231678487, + "grad_norm": 2.8576247692108154, + "learning_rate": 2.947356748371362e-06, + "loss": 0.4254, + "step": 5674 + }, + { + "epoch": 2.6832151300236404, + "grad_norm": 2.8999195098876953, + "learning_rate": 2.946742976354795e-06, + "loss": 0.4159, + "step": 5675 + }, + { + "epoch": 2.6836879432624112, + "grad_norm": 2.8439736366271973, + "learning_rate": 2.946129176521502e-06, + "loss": 0.4035, + "step": 5676 + }, + { + "epoch": 2.684160756501182, + "grad_norm": 2.8525729179382324, + "learning_rate": 2.945515348909702e-06, + "loss": 0.4137, + "step": 5677 + }, + { + "epoch": 2.684633569739953, + "grad_norm": 2.6573562622070312, + "learning_rate": 2.9449014935576147e-06, + "loss": 0.4203, + "step": 5678 + }, + { + "epoch": 2.6851063829787236, + "grad_norm": 2.765794277191162, + "learning_rate": 2.9442876105034616e-06, + "loss": 0.5184, + "step": 5679 + }, + 
{ + "epoch": 2.685579196217494, + "grad_norm": 2.694617748260498, + "learning_rate": 2.943673699785467e-06, + "loss": 0.417, + "step": 5680 + }, + { + "epoch": 2.6860520094562648, + "grad_norm": 2.740774393081665, + "learning_rate": 2.943059761441857e-06, + "loss": 0.4431, + "step": 5681 + }, + { + "epoch": 2.6865248226950356, + "grad_norm": 2.670642614364624, + "learning_rate": 2.942445795510859e-06, + "loss": 0.4298, + "step": 5682 + }, + { + "epoch": 2.686997635933806, + "grad_norm": 2.838907241821289, + "learning_rate": 2.9418318020307e-06, + "loss": 0.4529, + "step": 5683 + }, + { + "epoch": 2.6874704491725767, + "grad_norm": 2.562317371368408, + "learning_rate": 2.9412177810396135e-06, + "loss": 0.4251, + "step": 5684 + }, + { + "epoch": 2.6879432624113475, + "grad_norm": 2.5805928707122803, + "learning_rate": 2.9406037325758298e-06, + "loss": 0.4405, + "step": 5685 + }, + { + "epoch": 2.6884160756501183, + "grad_norm": 2.5701205730438232, + "learning_rate": 2.939989656677583e-06, + "loss": 0.4184, + "step": 5686 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 2.7990400791168213, + "learning_rate": 2.939375553383111e-06, + "loss": 0.4866, + "step": 5687 + }, + { + "epoch": 2.6893617021276595, + "grad_norm": 3.063319206237793, + "learning_rate": 2.9387614227306487e-06, + "loss": 0.4202, + "step": 5688 + }, + { + "epoch": 2.6898345153664303, + "grad_norm": 3.0891315937042236, + "learning_rate": 2.938147264758437e-06, + "loss": 0.4344, + "step": 5689 + }, + { + "epoch": 2.690307328605201, + "grad_norm": 2.8982670307159424, + "learning_rate": 2.9375330795047165e-06, + "loss": 0.4548, + "step": 5690 + }, + { + "epoch": 2.6907801418439714, + "grad_norm": 2.7947235107421875, + "learning_rate": 2.9369188670077293e-06, + "loss": 0.5028, + "step": 5691 + }, + { + "epoch": 2.691252955082742, + "grad_norm": 3.1615960597991943, + "learning_rate": 2.9363046273057206e-06, + "loss": 0.4855, + "step": 5692 + }, + { + "epoch": 2.691725768321513, + "grad_norm": 2.669516086578369, + "learning_rate": 2.935690360436935e-06, + "loss": 0.3813, + "step": 5693 + }, + { + "epoch": 2.692198581560284, + "grad_norm": 2.8743274211883545, + "learning_rate": 2.935076066439622e-06, + "loss": 0.4302, + "step": 5694 + }, + { + "epoch": 2.6926713947990546, + "grad_norm": 2.6829612255096436, + "learning_rate": 2.9344617453520295e-06, + "loss": 0.4063, + "step": 5695 + }, + { + "epoch": 2.693144208037825, + "grad_norm": 2.776447057723999, + "learning_rate": 2.9338473972124097e-06, + "loss": 0.4921, + "step": 5696 + }, + { + "epoch": 2.6936170212765957, + "grad_norm": 2.7865772247314453, + "learning_rate": 2.9332330220590143e-06, + "loss": 0.4939, + "step": 5697 + }, + { + "epoch": 2.6940898345153665, + "grad_norm": 3.020526170730591, + "learning_rate": 2.932618619930098e-06, + "loss": 0.4839, + "step": 5698 + }, + { + "epoch": 2.694562647754137, + "grad_norm": 2.637057065963745, + "learning_rate": 2.932004190863918e-06, + "loss": 0.4343, + "step": 5699 + }, + { + "epoch": 2.6950354609929077, + "grad_norm": 2.7426512241363525, + "learning_rate": 2.9313897348987314e-06, + "loss": 0.3609, + "step": 5700 + }, + { + "epoch": 2.6955082742316785, + "grad_norm": 2.767186164855957, + "learning_rate": 2.9307752520727974e-06, + "loss": 0.3793, + "step": 5701 + }, + { + "epoch": 2.6959810874704493, + "grad_norm": 2.4791622161865234, + "learning_rate": 2.930160742424377e-06, + "loss": 0.4192, + "step": 5702 + }, + { + "epoch": 2.69645390070922, + "grad_norm": 2.661461591720581, + "learning_rate": 2.9295462059917336e-06, + 
"loss": 0.4758, + "step": 5703 + }, + { + "epoch": 2.6969267139479904, + "grad_norm": 2.896242380142212, + "learning_rate": 2.928931642813131e-06, + "loss": 0.42, + "step": 5704 + }, + { + "epoch": 2.6973995271867612, + "grad_norm": 2.783813238143921, + "learning_rate": 2.9283170529268366e-06, + "loss": 0.4726, + "step": 5705 + }, + { + "epoch": 2.697872340425532, + "grad_norm": 2.4347333908081055, + "learning_rate": 2.927702436371117e-06, + "loss": 0.4199, + "step": 5706 + }, + { + "epoch": 2.6983451536643024, + "grad_norm": 2.4643805027008057, + "learning_rate": 2.927087793184242e-06, + "loss": 0.3578, + "step": 5707 + }, + { + "epoch": 2.698817966903073, + "grad_norm": 2.6396660804748535, + "learning_rate": 2.9264731234044835e-06, + "loss": 0.4509, + "step": 5708 + }, + { + "epoch": 2.699290780141844, + "grad_norm": 2.7341182231903076, + "learning_rate": 2.925858427070113e-06, + "loss": 0.4331, + "step": 5709 + }, + { + "epoch": 2.699763593380615, + "grad_norm": 2.7578938007354736, + "learning_rate": 2.9252437042194058e-06, + "loss": 0.4508, + "step": 5710 + }, + { + "epoch": 2.7002364066193856, + "grad_norm": 2.557788133621216, + "learning_rate": 2.9246289548906375e-06, + "loss": 0.3775, + "step": 5711 + }, + { + "epoch": 2.700709219858156, + "grad_norm": 2.802851676940918, + "learning_rate": 2.924014179122086e-06, + "loss": 0.4518, + "step": 5712 + }, + { + "epoch": 2.7011820330969267, + "grad_norm": 2.4773001670837402, + "learning_rate": 2.9233993769520313e-06, + "loss": 0.4019, + "step": 5713 + }, + { + "epoch": 2.7016548463356975, + "grad_norm": 3.108971357345581, + "learning_rate": 2.922784548418754e-06, + "loss": 0.4715, + "step": 5714 + }, + { + "epoch": 2.702127659574468, + "grad_norm": 2.8596770763397217, + "learning_rate": 2.9221696935605366e-06, + "loss": 0.4361, + "step": 5715 + }, + { + "epoch": 2.7026004728132387, + "grad_norm": 2.570604085922241, + "learning_rate": 2.9215548124156633e-06, + "loss": 0.3982, + "step": 5716 + }, + { + "epoch": 2.7030732860520095, + "grad_norm": 2.3157799243927, + "learning_rate": 2.9209399050224206e-06, + "loss": 0.456, + "step": 5717 + }, + { + "epoch": 2.7035460992907803, + "grad_norm": 2.6865758895874023, + "learning_rate": 2.9203249714190952e-06, + "loss": 0.4441, + "step": 5718 + }, + { + "epoch": 2.704018912529551, + "grad_norm": 2.76723313331604, + "learning_rate": 2.919710011643978e-06, + "loss": 0.464, + "step": 5719 + }, + { + "epoch": 2.7044917257683214, + "grad_norm": 2.648792028427124, + "learning_rate": 2.9190950257353578e-06, + "loss": 0.3426, + "step": 5720 + }, + { + "epoch": 2.704964539007092, + "grad_norm": 2.878739833831787, + "learning_rate": 2.9184800137315276e-06, + "loss": 0.4431, + "step": 5721 + }, + { + "epoch": 2.705437352245863, + "grad_norm": 2.670567274093628, + "learning_rate": 2.917864975670783e-06, + "loss": 0.4347, + "step": 5722 + }, + { + "epoch": 2.7059101654846334, + "grad_norm": 2.7031569480895996, + "learning_rate": 2.9172499115914184e-06, + "loss": 0.4557, + "step": 5723 + }, + { + "epoch": 2.706382978723404, + "grad_norm": 2.5225696563720703, + "learning_rate": 2.9166348215317314e-06, + "loss": 0.4159, + "step": 5724 + }, + { + "epoch": 2.706855791962175, + "grad_norm": 2.8676085472106934, + "learning_rate": 2.916019705530021e-06, + "loss": 0.5018, + "step": 5725 + }, + { + "epoch": 2.7073286052009458, + "grad_norm": 2.576463460922241, + "learning_rate": 2.915404563624587e-06, + "loss": 0.4317, + "step": 5726 + }, + { + "epoch": 2.7078014184397166, + "grad_norm": 3.155565023422241, + 
"learning_rate": 2.9147893958537328e-06, + "loss": 0.5029, + "step": 5727 + }, + { + "epoch": 2.708274231678487, + "grad_norm": 2.604079008102417, + "learning_rate": 2.9141742022557622e-06, + "loss": 0.4324, + "step": 5728 + }, + { + "epoch": 2.7087470449172577, + "grad_norm": 2.6597228050231934, + "learning_rate": 2.913558982868979e-06, + "loss": 0.4335, + "step": 5729 + }, + { + "epoch": 2.7092198581560285, + "grad_norm": 2.811384439468384, + "learning_rate": 2.9129437377316923e-06, + "loss": 0.4031, + "step": 5730 + }, + { + "epoch": 2.709692671394799, + "grad_norm": 3.1041207313537598, + "learning_rate": 2.91232846688221e-06, + "loss": 0.481, + "step": 5731 + }, + { + "epoch": 2.7101654846335697, + "grad_norm": 2.5992188453674316, + "learning_rate": 2.9117131703588414e-06, + "loss": 0.4266, + "step": 5732 + }, + { + "epoch": 2.7106382978723405, + "grad_norm": 2.7726242542266846, + "learning_rate": 2.911097848199899e-06, + "loss": 0.4464, + "step": 5733 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 2.8683483600616455, + "learning_rate": 2.9104825004436966e-06, + "loss": 0.4248, + "step": 5734 + }, + { + "epoch": 2.711583924349882, + "grad_norm": 2.776386022567749, + "learning_rate": 2.9098671271285484e-06, + "loss": 0.4556, + "step": 5735 + }, + { + "epoch": 2.7120567375886524, + "grad_norm": 2.7612528800964355, + "learning_rate": 2.909251728292771e-06, + "loss": 0.455, + "step": 5736 + }, + { + "epoch": 2.712529550827423, + "grad_norm": 2.9223551750183105, + "learning_rate": 2.908636303974684e-06, + "loss": 0.4302, + "step": 5737 + }, + { + "epoch": 2.713002364066194, + "grad_norm": 2.898226022720337, + "learning_rate": 2.908020854212606e-06, + "loss": 0.4827, + "step": 5738 + }, + { + "epoch": 2.7134751773049643, + "grad_norm": 2.706361770629883, + "learning_rate": 2.9074053790448576e-06, + "loss": 0.4444, + "step": 5739 + }, + { + "epoch": 2.713947990543735, + "grad_norm": 2.8227248191833496, + "learning_rate": 2.9067898785097637e-06, + "loss": 0.4661, + "step": 5740 + }, + { + "epoch": 2.714420803782506, + "grad_norm": 2.597837448120117, + "learning_rate": 2.9061743526456474e-06, + "loss": 0.4646, + "step": 5741 + }, + { + "epoch": 2.7148936170212767, + "grad_norm": 2.5525131225585938, + "learning_rate": 2.9055588014908354e-06, + "loss": 0.4172, + "step": 5742 + }, + { + "epoch": 2.7153664302600475, + "grad_norm": 2.713071823120117, + "learning_rate": 2.904943225083655e-06, + "loss": 0.4893, + "step": 5743 + }, + { + "epoch": 2.715839243498818, + "grad_norm": 2.538623571395874, + "learning_rate": 2.9043276234624353e-06, + "loss": 0.3905, + "step": 5744 + }, + { + "epoch": 2.7163120567375887, + "grad_norm": 2.5190389156341553, + "learning_rate": 2.9037119966655076e-06, + "loss": 0.4318, + "step": 5745 + }, + { + "epoch": 2.7167848699763595, + "grad_norm": 2.6587612628936768, + "learning_rate": 2.903096344731204e-06, + "loss": 0.4153, + "step": 5746 + }, + { + "epoch": 2.71725768321513, + "grad_norm": 2.836731433868408, + "learning_rate": 2.902480667697859e-06, + "loss": 0.4779, + "step": 5747 + }, + { + "epoch": 2.7177304964539006, + "grad_norm": 2.8076045513153076, + "learning_rate": 2.9018649656038074e-06, + "loss": 0.5126, + "step": 5748 + }, + { + "epoch": 2.7182033096926714, + "grad_norm": 2.8930516242980957, + "learning_rate": 2.9012492384873865e-06, + "loss": 0.4561, + "step": 5749 + }, + { + "epoch": 2.7186761229314422, + "grad_norm": 2.7000370025634766, + "learning_rate": 2.9006334863869343e-06, + "loss": 0.4659, + "step": 5750 + }, + { + "epoch": 2.719148936170213, 
+ "grad_norm": 2.927011251449585, + "learning_rate": 2.9000177093407926e-06, + "loss": 0.5123, + "step": 5751 + }, + { + "epoch": 2.7196217494089834, + "grad_norm": 3.0102779865264893, + "learning_rate": 2.8994019073873015e-06, + "loss": 0.3972, + "step": 5752 + }, + { + "epoch": 2.720094562647754, + "grad_norm": 2.778838634490967, + "learning_rate": 2.8987860805648054e-06, + "loss": 0.4922, + "step": 5753 + }, + { + "epoch": 2.720567375886525, + "grad_norm": 2.6150314807891846, + "learning_rate": 2.898170228911648e-06, + "loss": 0.4425, + "step": 5754 + }, + { + "epoch": 2.7210401891252953, + "grad_norm": 2.9329984188079834, + "learning_rate": 2.8975543524661777e-06, + "loss": 0.4872, + "step": 5755 + }, + { + "epoch": 2.721513002364066, + "grad_norm": 2.756803512573242, + "learning_rate": 2.8969384512667404e-06, + "loss": 0.4362, + "step": 5756 + }, + { + "epoch": 2.721985815602837, + "grad_norm": 2.600877285003662, + "learning_rate": 2.896322525351686e-06, + "loss": 0.4802, + "step": 5757 + }, + { + "epoch": 2.7224586288416077, + "grad_norm": 2.647069215774536, + "learning_rate": 2.8957065747593655e-06, + "loss": 0.4649, + "step": 5758 + }, + { + "epoch": 2.7229314420803785, + "grad_norm": 2.845388174057007, + "learning_rate": 2.895090599528132e-06, + "loss": 0.4533, + "step": 5759 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 2.973881721496582, + "learning_rate": 2.8944745996963397e-06, + "loss": 0.4959, + "step": 5760 + }, + { + "epoch": 2.7238770685579197, + "grad_norm": 2.8995487689971924, + "learning_rate": 2.8938585753023435e-06, + "loss": 0.4597, + "step": 5761 + }, + { + "epoch": 2.7243498817966905, + "grad_norm": 2.903693437576294, + "learning_rate": 2.8932425263845004e-06, + "loss": 0.4521, + "step": 5762 + }, + { + "epoch": 2.724822695035461, + "grad_norm": 2.7609009742736816, + "learning_rate": 2.8926264529811702e-06, + "loss": 0.4399, + "step": 5763 + }, + { + "epoch": 2.7252955082742316, + "grad_norm": 2.788787603378296, + "learning_rate": 2.892010355130712e-06, + "loss": 0.4614, + "step": 5764 + }, + { + "epoch": 2.7257683215130024, + "grad_norm": 2.786498785018921, + "learning_rate": 2.8913942328714887e-06, + "loss": 0.4798, + "step": 5765 + }, + { + "epoch": 2.726241134751773, + "grad_norm": 2.9809393882751465, + "learning_rate": 2.8907780862418616e-06, + "loss": 0.5108, + "step": 5766 + }, + { + "epoch": 2.726713947990544, + "grad_norm": 2.6621177196502686, + "learning_rate": 2.8901619152801967e-06, + "loss": 0.4031, + "step": 5767 + }, + { + "epoch": 2.7271867612293144, + "grad_norm": 3.3092098236083984, + "learning_rate": 2.8895457200248607e-06, + "loss": 0.4671, + "step": 5768 + }, + { + "epoch": 2.727659574468085, + "grad_norm": 2.866306781768799, + "learning_rate": 2.8889295005142204e-06, + "loss": 0.4434, + "step": 5769 + }, + { + "epoch": 2.728132387706856, + "grad_norm": 2.6861231327056885, + "learning_rate": 2.888313256786646e-06, + "loss": 0.429, + "step": 5770 + }, + { + "epoch": 2.7286052009456263, + "grad_norm": 2.873180389404297, + "learning_rate": 2.8876969888805072e-06, + "loss": 0.4412, + "step": 5771 + }, + { + "epoch": 2.729078014184397, + "grad_norm": 2.511678695678711, + "learning_rate": 2.887080696834178e-06, + "loss": 0.4024, + "step": 5772 + }, + { + "epoch": 2.729550827423168, + "grad_norm": 2.6502726078033447, + "learning_rate": 2.88646438068603e-06, + "loss": 0.4357, + "step": 5773 + }, + { + "epoch": 2.7300236406619387, + "grad_norm": 2.7156145572662354, + "learning_rate": 2.8858480404744403e-06, + "loss": 0.4511, + "step": 5774 + }, 
+ { + "epoch": 2.7304964539007095, + "grad_norm": 2.882582187652588, + "learning_rate": 2.8852316762377842e-06, + "loss": 0.4822, + "step": 5775 + }, + { + "epoch": 2.73096926713948, + "grad_norm": 2.7139666080474854, + "learning_rate": 2.8846152880144413e-06, + "loss": 0.4666, + "step": 5776 + }, + { + "epoch": 2.7314420803782506, + "grad_norm": 2.7453949451446533, + "learning_rate": 2.8839988758427907e-06, + "loss": 0.3927, + "step": 5777 + }, + { + "epoch": 2.731914893617021, + "grad_norm": 2.7859580516815186, + "learning_rate": 2.883382439761214e-06, + "loss": 0.4466, + "step": 5778 + }, + { + "epoch": 2.732387706855792, + "grad_norm": 2.695234537124634, + "learning_rate": 2.882765979808094e-06, + "loss": 0.4227, + "step": 5779 + }, + { + "epoch": 2.7328605200945626, + "grad_norm": 2.8081552982330322, + "learning_rate": 2.8821494960218148e-06, + "loss": 0.447, + "step": 5780 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 2.887643337249756, + "learning_rate": 2.881532988440762e-06, + "loss": 0.5018, + "step": 5781 + }, + { + "epoch": 2.733806146572104, + "grad_norm": 3.108212471008301, + "learning_rate": 2.8809164571033233e-06, + "loss": 0.4132, + "step": 5782 + }, + { + "epoch": 2.7342789598108745, + "grad_norm": 2.874328374862671, + "learning_rate": 2.880299902047886e-06, + "loss": 0.4618, + "step": 5783 + }, + { + "epoch": 2.7347517730496453, + "grad_norm": 3.089132308959961, + "learning_rate": 2.879683323312843e-06, + "loss": 0.4956, + "step": 5784 + }, + { + "epoch": 2.735224586288416, + "grad_norm": 2.5173206329345703, + "learning_rate": 2.879066720936583e-06, + "loss": 0.4087, + "step": 5785 + }, + { + "epoch": 2.7356973995271865, + "grad_norm": 2.6401286125183105, + "learning_rate": 2.8784500949575014e-06, + "loss": 0.3995, + "step": 5786 + }, + { + "epoch": 2.7361702127659573, + "grad_norm": 2.9371910095214844, + "learning_rate": 2.877833445413991e-06, + "loss": 0.5209, + "step": 5787 + }, + { + "epoch": 2.736643026004728, + "grad_norm": 3.218158006668091, + "learning_rate": 2.8772167723444498e-06, + "loss": 0.4275, + "step": 5788 + }, + { + "epoch": 2.737115839243499, + "grad_norm": 2.9072160720825195, + "learning_rate": 2.8766000757872736e-06, + "loss": 0.4244, + "step": 5789 + }, + { + "epoch": 2.7375886524822697, + "grad_norm": 3.0378096103668213, + "learning_rate": 2.8759833557808614e-06, + "loss": 0.507, + "step": 5790 + }, + { + "epoch": 2.73806146572104, + "grad_norm": 2.728353977203369, + "learning_rate": 2.8753666123636148e-06, + "loss": 0.413, + "step": 5791 + }, + { + "epoch": 2.738534278959811, + "grad_norm": 2.6869957447052, + "learning_rate": 2.874749845573935e-06, + "loss": 0.44, + "step": 5792 + }, + { + "epoch": 2.7390070921985816, + "grad_norm": 2.6381702423095703, + "learning_rate": 2.8741330554502263e-06, + "loss": 0.4708, + "step": 5793 + }, + { + "epoch": 2.739479905437352, + "grad_norm": 2.6944689750671387, + "learning_rate": 2.873516242030892e-06, + "loss": 0.4555, + "step": 5794 + }, + { + "epoch": 2.739952718676123, + "grad_norm": 3.168473243713379, + "learning_rate": 2.8728994053543396e-06, + "loss": 0.4538, + "step": 5795 + }, + { + "epoch": 2.7404255319148936, + "grad_norm": 2.7504515647888184, + "learning_rate": 2.872282545458976e-06, + "loss": 0.4628, + "step": 5796 + }, + { + "epoch": 2.7408983451536644, + "grad_norm": 2.896462917327881, + "learning_rate": 2.8716656623832114e-06, + "loss": 0.4946, + "step": 5797 + }, + { + "epoch": 2.741371158392435, + "grad_norm": 2.8053417205810547, + "learning_rate": 2.8710487561654547e-06, + "loss": 
0.4893, + "step": 5798 + }, + { + "epoch": 2.7418439716312055, + "grad_norm": 2.63171124458313, + "learning_rate": 2.870431826844119e-06, + "loss": 0.4257, + "step": 5799 + }, + { + "epoch": 2.7423167848699763, + "grad_norm": 3.0963807106018066, + "learning_rate": 2.869814874457618e-06, + "loss": 0.5404, + "step": 5800 + }, + { + "epoch": 2.742789598108747, + "grad_norm": 2.591132164001465, + "learning_rate": 2.8691978990443664e-06, + "loss": 0.4015, + "step": 5801 + }, + { + "epoch": 2.7432624113475175, + "grad_norm": 3.0319552421569824, + "learning_rate": 2.8685809006427812e-06, + "loss": 0.4411, + "step": 5802 + }, + { + "epoch": 2.7437352245862883, + "grad_norm": 2.7791874408721924, + "learning_rate": 2.8679638792912784e-06, + "loss": 0.43, + "step": 5803 + }, + { + "epoch": 2.744208037825059, + "grad_norm": 3.530632495880127, + "learning_rate": 2.867346835028279e-06, + "loss": 0.4581, + "step": 5804 + }, + { + "epoch": 2.74468085106383, + "grad_norm": 3.2043099403381348, + "learning_rate": 2.8667297678922024e-06, + "loss": 0.4375, + "step": 5805 + }, + { + "epoch": 2.7451536643026007, + "grad_norm": 2.8442344665527344, + "learning_rate": 2.8661126779214716e-06, + "loss": 0.4059, + "step": 5806 + }, + { + "epoch": 2.745626477541371, + "grad_norm": 2.7561380863189697, + "learning_rate": 2.86549556515451e-06, + "loss": 0.4391, + "step": 5807 + }, + { + "epoch": 2.746099290780142, + "grad_norm": 3.229663848876953, + "learning_rate": 2.8648784296297418e-06, + "loss": 0.4579, + "step": 5808 + }, + { + "epoch": 2.7465721040189126, + "grad_norm": 2.8375027179718018, + "learning_rate": 2.864261271385593e-06, + "loss": 0.4566, + "step": 5809 + }, + { + "epoch": 2.747044917257683, + "grad_norm": 2.392998695373535, + "learning_rate": 2.863644090460493e-06, + "loss": 0.4123, + "step": 5810 + }, + { + "epoch": 2.7475177304964538, + "grad_norm": 2.707610607147217, + "learning_rate": 2.86302688689287e-06, + "loss": 0.4299, + "step": 5811 + }, + { + "epoch": 2.7479905437352246, + "grad_norm": 2.824042797088623, + "learning_rate": 2.8624096607211547e-06, + "loss": 0.3799, + "step": 5812 + }, + { + "epoch": 2.7484633569739954, + "grad_norm": 2.933102607727051, + "learning_rate": 2.861792411983779e-06, + "loss": 0.4416, + "step": 5813 + }, + { + "epoch": 2.748936170212766, + "grad_norm": 2.7000277042388916, + "learning_rate": 2.8611751407191757e-06, + "loss": 0.4346, + "step": 5814 + }, + { + "epoch": 2.7494089834515365, + "grad_norm": 2.6757142543792725, + "learning_rate": 2.86055784696578e-06, + "loss": 0.4559, + "step": 5815 + }, + { + "epoch": 2.7498817966903073, + "grad_norm": 2.5791053771972656, + "learning_rate": 2.8599405307620287e-06, + "loss": 0.4619, + "step": 5816 + }, + { + "epoch": 2.750354609929078, + "grad_norm": 3.0327374935150146, + "learning_rate": 2.859323192146359e-06, + "loss": 0.4163, + "step": 5817 + }, + { + "epoch": 2.7508274231678485, + "grad_norm": 2.5580220222473145, + "learning_rate": 2.8587058311572084e-06, + "loss": 0.4005, + "step": 5818 + }, + { + "epoch": 2.7513002364066192, + "grad_norm": 2.592179536819458, + "learning_rate": 2.85808844783302e-06, + "loss": 0.4404, + "step": 5819 + }, + { + "epoch": 2.75177304964539, + "grad_norm": 3.2779927253723145, + "learning_rate": 2.8574710422122342e-06, + "loss": 0.54, + "step": 5820 + }, + { + "epoch": 2.752245862884161, + "grad_norm": 2.4804370403289795, + "learning_rate": 2.8568536143332933e-06, + "loss": 0.4476, + "step": 5821 + }, + { + "epoch": 2.7527186761229316, + "grad_norm": 2.649477481842041, + "learning_rate": 
2.8562361642346427e-06, + "loss": 0.4336, + "step": 5822 + }, + { + "epoch": 2.753191489361702, + "grad_norm": 3.138587474822998, + "learning_rate": 2.855618691954728e-06, + "loss": 0.5042, + "step": 5823 + }, + { + "epoch": 2.753664302600473, + "grad_norm": 2.75093412399292, + "learning_rate": 2.855001197531997e-06, + "loss": 0.4327, + "step": 5824 + }, + { + "epoch": 2.7541371158392436, + "grad_norm": 2.678809642791748, + "learning_rate": 2.854383681004898e-06, + "loss": 0.4409, + "step": 5825 + }, + { + "epoch": 2.754609929078014, + "grad_norm": 2.965386390686035, + "learning_rate": 2.853766142411881e-06, + "loss": 0.4716, + "step": 5826 + }, + { + "epoch": 2.7550827423167847, + "grad_norm": 2.6419436931610107, + "learning_rate": 2.853148581791398e-06, + "loss": 0.4367, + "step": 5827 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 3.205794095993042, + "learning_rate": 2.8525309991819004e-06, + "loss": 0.4869, + "step": 5828 + }, + { + "epoch": 2.7560283687943263, + "grad_norm": 3.041008472442627, + "learning_rate": 2.851913394621844e-06, + "loss": 0.5087, + "step": 5829 + }, + { + "epoch": 2.756501182033097, + "grad_norm": 2.6525566577911377, + "learning_rate": 2.851295768149684e-06, + "loss": 0.3951, + "step": 5830 + }, + { + "epoch": 2.7569739952718675, + "grad_norm": 2.732220411300659, + "learning_rate": 2.850678119803876e-06, + "loss": 0.4797, + "step": 5831 + }, + { + "epoch": 2.7574468085106383, + "grad_norm": 2.8965251445770264, + "learning_rate": 2.8500604496228797e-06, + "loss": 0.4938, + "step": 5832 + }, + { + "epoch": 2.757919621749409, + "grad_norm": 2.48020076751709, + "learning_rate": 2.849442757645154e-06, + "loss": 0.4172, + "step": 5833 + }, + { + "epoch": 2.7583924349881794, + "grad_norm": 2.4764912128448486, + "learning_rate": 2.8488250439091603e-06, + "loss": 0.4123, + "step": 5834 + }, + { + "epoch": 2.7588652482269502, + "grad_norm": 2.4547016620635986, + "learning_rate": 2.84820730845336e-06, + "loss": 0.4116, + "step": 5835 + }, + { + "epoch": 2.759338061465721, + "grad_norm": 2.55476975440979, + "learning_rate": 2.847589551316218e-06, + "loss": 0.4744, + "step": 5836 + }, + { + "epoch": 2.759810874704492, + "grad_norm": 2.3866238594055176, + "learning_rate": 2.846971772536199e-06, + "loss": 0.4406, + "step": 5837 + }, + { + "epoch": 2.7602836879432626, + "grad_norm": 2.855318784713745, + "learning_rate": 2.8463539721517687e-06, + "loss": 0.4517, + "step": 5838 + }, + { + "epoch": 2.760756501182033, + "grad_norm": 2.527198314666748, + "learning_rate": 2.8457361502013954e-06, + "loss": 0.3588, + "step": 5839 + }, + { + "epoch": 2.7612293144208038, + "grad_norm": 2.6761462688446045, + "learning_rate": 2.8451183067235476e-06, + "loss": 0.4192, + "step": 5840 + }, + { + "epoch": 2.7617021276595746, + "grad_norm": 2.5692319869995117, + "learning_rate": 2.8445004417566967e-06, + "loss": 0.4108, + "step": 5841 + }, + { + "epoch": 2.762174940898345, + "grad_norm": 2.5721096992492676, + "learning_rate": 2.8438825553393133e-06, + "loss": 0.3941, + "step": 5842 + }, + { + "epoch": 2.7626477541371157, + "grad_norm": 2.699430227279663, + "learning_rate": 2.843264647509872e-06, + "loss": 0.4418, + "step": 5843 + }, + { + "epoch": 2.7631205673758865, + "grad_norm": 2.6943318843841553, + "learning_rate": 2.842646718306846e-06, + "loss": 0.4505, + "step": 5844 + }, + { + "epoch": 2.7635933806146573, + "grad_norm": 2.661656379699707, + "learning_rate": 2.8420287677687107e-06, + "loss": 0.4413, + "step": 5845 + }, + { + "epoch": 2.764066193853428, + "grad_norm": 
2.830467939376831, + "learning_rate": 2.8414107959339444e-06, + "loss": 0.5095, + "step": 5846 + }, + { + "epoch": 2.7645390070921985, + "grad_norm": 2.598053455352783, + "learning_rate": 2.840792802841024e-06, + "loss": 0.4029, + "step": 5847 + }, + { + "epoch": 2.7650118203309693, + "grad_norm": 2.641700029373169, + "learning_rate": 2.8401747885284316e-06, + "loss": 0.4237, + "step": 5848 + }, + { + "epoch": 2.76548463356974, + "grad_norm": 2.6672768592834473, + "learning_rate": 2.8395567530346454e-06, + "loss": 0.4181, + "step": 5849 + }, + { + "epoch": 2.7659574468085104, + "grad_norm": 2.5851705074310303, + "learning_rate": 2.838938696398149e-06, + "loss": 0.4165, + "step": 5850 + }, + { + "epoch": 2.766430260047281, + "grad_norm": 2.318120002746582, + "learning_rate": 2.8383206186574276e-06, + "loss": 0.3578, + "step": 5851 + }, + { + "epoch": 2.766903073286052, + "grad_norm": 2.6199793815612793, + "learning_rate": 2.8377025198509635e-06, + "loss": 0.4719, + "step": 5852 + }, + { + "epoch": 2.767375886524823, + "grad_norm": 2.7186086177825928, + "learning_rate": 2.837084400017245e-06, + "loss": 0.41, + "step": 5853 + }, + { + "epoch": 2.7678486997635936, + "grad_norm": 2.702514886856079, + "learning_rate": 2.8364662591947583e-06, + "loss": 0.4659, + "step": 5854 + }, + { + "epoch": 2.768321513002364, + "grad_norm": 2.612375259399414, + "learning_rate": 2.835848097421993e-06, + "loss": 0.4252, + "step": 5855 + }, + { + "epoch": 2.7687943262411348, + "grad_norm": 3.0127978324890137, + "learning_rate": 2.8352299147374394e-06, + "loss": 0.4084, + "step": 5856 + }, + { + "epoch": 2.7692671394799055, + "grad_norm": 2.6460049152374268, + "learning_rate": 2.83461171117959e-06, + "loss": 0.4035, + "step": 5857 + }, + { + "epoch": 2.769739952718676, + "grad_norm": 2.9844725131988525, + "learning_rate": 2.8339934867869357e-06, + "loss": 0.4912, + "step": 5858 + }, + { + "epoch": 2.7702127659574467, + "grad_norm": 2.731217861175537, + "learning_rate": 2.833375241597972e-06, + "loss": 0.4112, + "step": 5859 + }, + { + "epoch": 2.7706855791962175, + "grad_norm": 2.731194496154785, + "learning_rate": 2.832756975651193e-06, + "loss": 0.4516, + "step": 5860 + }, + { + "epoch": 2.7711583924349883, + "grad_norm": 3.0532076358795166, + "learning_rate": 2.8321386889850965e-06, + "loss": 0.3959, + "step": 5861 + }, + { + "epoch": 2.771631205673759, + "grad_norm": 3.5437800884246826, + "learning_rate": 2.831520381638181e-06, + "loss": 0.6055, + "step": 5862 + }, + { + "epoch": 2.7721040189125294, + "grad_norm": 2.4297714233398438, + "learning_rate": 2.830902053648944e-06, + "loss": 0.4038, + "step": 5863 + }, + { + "epoch": 2.7725768321513002, + "grad_norm": 2.696768045425415, + "learning_rate": 2.8302837050558876e-06, + "loss": 0.3983, + "step": 5864 + }, + { + "epoch": 2.773049645390071, + "grad_norm": 2.6574649810791016, + "learning_rate": 2.8296653358975122e-06, + "loss": 0.4937, + "step": 5865 + }, + { + "epoch": 2.7735224586288414, + "grad_norm": 2.9393341541290283, + "learning_rate": 2.8290469462123234e-06, + "loss": 0.4603, + "step": 5866 + }, + { + "epoch": 2.773995271867612, + "grad_norm": 2.7630696296691895, + "learning_rate": 2.828428536038824e-06, + "loss": 0.4663, + "step": 5867 + }, + { + "epoch": 2.774468085106383, + "grad_norm": 2.7354233264923096, + "learning_rate": 2.8278101054155183e-06, + "loss": 0.4444, + "step": 5868 + }, + { + "epoch": 2.774940898345154, + "grad_norm": 3.0489425659179688, + "learning_rate": 2.827191654380915e-06, + "loss": 0.4684, + "step": 5869 + }, + { + "epoch": 
2.7754137115839246, + "grad_norm": 2.9602572917938232, + "learning_rate": 2.8265731829735226e-06, + "loss": 0.4571, + "step": 5870 + }, + { + "epoch": 2.775886524822695, + "grad_norm": 2.774132013320923, + "learning_rate": 2.825954691231851e-06, + "loss": 0.4458, + "step": 5871 + }, + { + "epoch": 2.7763593380614657, + "grad_norm": 2.696622133255005, + "learning_rate": 2.825336179194409e-06, + "loss": 0.4933, + "step": 5872 + }, + { + "epoch": 2.7768321513002365, + "grad_norm": 2.742184638977051, + "learning_rate": 2.8247176468997096e-06, + "loss": 0.4464, + "step": 5873 + }, + { + "epoch": 2.777304964539007, + "grad_norm": 2.7033183574676514, + "learning_rate": 2.824099094386266e-06, + "loss": 0.4369, + "step": 5874 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 2.7264044284820557, + "learning_rate": 2.8234805216925935e-06, + "loss": 0.4621, + "step": 5875 + }, + { + "epoch": 2.7782505910165485, + "grad_norm": 2.6417739391326904, + "learning_rate": 2.822861928857208e-06, + "loss": 0.4254, + "step": 5876 + }, + { + "epoch": 2.7787234042553193, + "grad_norm": 3.17209529876709, + "learning_rate": 2.8222433159186245e-06, + "loss": 0.5011, + "step": 5877 + }, + { + "epoch": 2.77919621749409, + "grad_norm": 3.1434381008148193, + "learning_rate": 2.8216246829153633e-06, + "loss": 0.4567, + "step": 5878 + }, + { + "epoch": 2.7796690307328604, + "grad_norm": 2.781608819961548, + "learning_rate": 2.821006029885943e-06, + "loss": 0.4723, + "step": 5879 + }, + { + "epoch": 2.780141843971631, + "grad_norm": 3.00079345703125, + "learning_rate": 2.820387356868885e-06, + "loss": 0.4796, + "step": 5880 + }, + { + "epoch": 2.780614657210402, + "grad_norm": 2.703555107116699, + "learning_rate": 2.819768663902712e-06, + "loss": 0.4577, + "step": 5881 + }, + { + "epoch": 2.7810874704491724, + "grad_norm": 2.5741801261901855, + "learning_rate": 2.8191499510259453e-06, + "loss": 0.4255, + "step": 5882 + }, + { + "epoch": 2.781560283687943, + "grad_norm": 2.9871208667755127, + "learning_rate": 2.8185312182771112e-06, + "loss": 0.4495, + "step": 5883 + }, + { + "epoch": 2.782033096926714, + "grad_norm": 2.525317668914795, + "learning_rate": 2.8179124656947343e-06, + "loss": 0.4428, + "step": 5884 + }, + { + "epoch": 2.7825059101654848, + "grad_norm": 2.525092840194702, + "learning_rate": 2.817293693317343e-06, + "loss": 0.4348, + "step": 5885 + }, + { + "epoch": 2.7829787234042556, + "grad_norm": 2.8485171794891357, + "learning_rate": 2.816674901183464e-06, + "loss": 0.4206, + "step": 5886 + }, + { + "epoch": 2.783451536643026, + "grad_norm": 2.6612746715545654, + "learning_rate": 2.8160560893316272e-06, + "loss": 0.396, + "step": 5887 + }, + { + "epoch": 2.7839243498817967, + "grad_norm": 2.7093865871429443, + "learning_rate": 2.815437257800364e-06, + "loss": 0.4468, + "step": 5888 + }, + { + "epoch": 2.7843971631205675, + "grad_norm": 2.6130900382995605, + "learning_rate": 2.814818406628206e-06, + "loss": 0.443, + "step": 5889 + }, + { + "epoch": 2.784869976359338, + "grad_norm": 2.8147552013397217, + "learning_rate": 2.8141995358536866e-06, + "loss": 0.4454, + "step": 5890 + }, + { + "epoch": 2.7853427895981087, + "grad_norm": 2.5621275901794434, + "learning_rate": 2.8135806455153395e-06, + "loss": 0.439, + "step": 5891 + }, + { + "epoch": 2.7858156028368795, + "grad_norm": 2.880228281021118, + "learning_rate": 2.812961735651701e-06, + "loss": 0.3895, + "step": 5892 + }, + { + "epoch": 2.7862884160756503, + "grad_norm": 2.5861377716064453, + "learning_rate": 2.8123428063013068e-06, + "loss": 0.4402, + 
"step": 5893 + }, + { + "epoch": 2.786761229314421, + "grad_norm": 2.9707765579223633, + "learning_rate": 2.811723857502696e-06, + "loss": 0.4461, + "step": 5894 + }, + { + "epoch": 2.7872340425531914, + "grad_norm": 2.923999309539795, + "learning_rate": 2.811104889294408e-06, + "loss": 0.4395, + "step": 5895 + }, + { + "epoch": 2.787706855791962, + "grad_norm": 2.846933603286743, + "learning_rate": 2.810485901714981e-06, + "loss": 0.5168, + "step": 5896 + }, + { + "epoch": 2.788179669030733, + "grad_norm": 4.1052350997924805, + "learning_rate": 2.8098668948029597e-06, + "loss": 0.5152, + "step": 5897 + }, + { + "epoch": 2.7886524822695034, + "grad_norm": 2.7391018867492676, + "learning_rate": 2.8092478685968856e-06, + "loss": 0.4515, + "step": 5898 + }, + { + "epoch": 2.789125295508274, + "grad_norm": 2.976088285446167, + "learning_rate": 2.8086288231353027e-06, + "loss": 0.5156, + "step": 5899 + }, + { + "epoch": 2.789598108747045, + "grad_norm": 2.6139633655548096, + "learning_rate": 2.8080097584567562e-06, + "loss": 0.4237, + "step": 5900 + }, + { + "epoch": 2.7900709219858157, + "grad_norm": 2.501654624938965, + "learning_rate": 2.807390674599792e-06, + "loss": 0.4349, + "step": 5901 + }, + { + "epoch": 2.7905437352245865, + "grad_norm": 2.8814525604248047, + "learning_rate": 2.8067715716029586e-06, + "loss": 0.4866, + "step": 5902 + }, + { + "epoch": 2.791016548463357, + "grad_norm": 2.7953200340270996, + "learning_rate": 2.8061524495048046e-06, + "loss": 0.3964, + "step": 5903 + }, + { + "epoch": 2.7914893617021277, + "grad_norm": 2.7362849712371826, + "learning_rate": 2.8055333083438808e-06, + "loss": 0.4181, + "step": 5904 + }, + { + "epoch": 2.7919621749408985, + "grad_norm": 2.9740512371063232, + "learning_rate": 2.8049141481587366e-06, + "loss": 0.4784, + "step": 5905 + }, + { + "epoch": 2.792434988179669, + "grad_norm": 2.595813274383545, + "learning_rate": 2.8042949689879262e-06, + "loss": 0.4421, + "step": 5906 + }, + { + "epoch": 2.7929078014184396, + "grad_norm": 2.886899948120117, + "learning_rate": 2.803675770870002e-06, + "loss": 0.4435, + "step": 5907 + }, + { + "epoch": 2.7933806146572104, + "grad_norm": 2.6057486534118652, + "learning_rate": 2.8030565538435196e-06, + "loss": 0.4472, + "step": 5908 + }, + { + "epoch": 2.7938534278959812, + "grad_norm": 2.7422802448272705, + "learning_rate": 2.802437317947034e-06, + "loss": 0.4799, + "step": 5909 + }, + { + "epoch": 2.794326241134752, + "grad_norm": 2.3904244899749756, + "learning_rate": 2.801818063219102e-06, + "loss": 0.4508, + "step": 5910 + }, + { + "epoch": 2.7947990543735224, + "grad_norm": 2.8434207439422607, + "learning_rate": 2.8011987896982835e-06, + "loss": 0.4473, + "step": 5911 + }, + { + "epoch": 2.795271867612293, + "grad_norm": 2.916088819503784, + "learning_rate": 2.8005794974231366e-06, + "loss": 0.464, + "step": 5912 + }, + { + "epoch": 2.795744680851064, + "grad_norm": 2.6483397483825684, + "learning_rate": 2.7999601864322236e-06, + "loss": 0.441, + "step": 5913 + }, + { + "epoch": 2.7962174940898343, + "grad_norm": 2.9287428855895996, + "learning_rate": 2.7993408567641033e-06, + "loss": 0.4551, + "step": 5914 + }, + { + "epoch": 2.796690307328605, + "grad_norm": 2.575024127960205, + "learning_rate": 2.798721508457342e-06, + "loss": 0.4494, + "step": 5915 + }, + { + "epoch": 2.797163120567376, + "grad_norm": 2.7156829833984375, + "learning_rate": 2.7981021415505015e-06, + "loss": 0.419, + "step": 5916 + }, + { + "epoch": 2.7976359338061467, + "grad_norm": 2.850553035736084, + "learning_rate": 
2.7974827560821482e-06, + "loss": 0.4709, + "step": 5917 + }, + { + "epoch": 2.7981087470449175, + "grad_norm": 2.673846483230591, + "learning_rate": 2.796863352090847e-06, + "loss": 0.4224, + "step": 5918 + }, + { + "epoch": 2.798581560283688, + "grad_norm": 2.9093217849731445, + "learning_rate": 2.796243929615168e-06, + "loss": 0.468, + "step": 5919 + }, + { + "epoch": 2.7990543735224587, + "grad_norm": 2.4853813648223877, + "learning_rate": 2.7956244886936775e-06, + "loss": 0.4723, + "step": 5920 + }, + { + "epoch": 2.7995271867612295, + "grad_norm": 3.026428461074829, + "learning_rate": 2.795005029364946e-06, + "loss": 0.4721, + "step": 5921 + }, + { + "epoch": 2.8, + "grad_norm": 2.886295795440674, + "learning_rate": 2.794385551667546e-06, + "loss": 0.456, + "step": 5922 + }, + { + "epoch": 2.8004728132387706, + "grad_norm": 3.2260656356811523, + "learning_rate": 2.7937660556400486e-06, + "loss": 0.4499, + "step": 5923 + }, + { + "epoch": 2.8009456264775414, + "grad_norm": 2.7971982955932617, + "learning_rate": 2.793146541321027e-06, + "loss": 0.3982, + "step": 5924 + }, + { + "epoch": 2.801418439716312, + "grad_norm": 2.85461163520813, + "learning_rate": 2.7925270087490546e-06, + "loss": 0.4841, + "step": 5925 + }, + { + "epoch": 2.801891252955083, + "grad_norm": 3.0642316341400146, + "learning_rate": 2.7919074579627086e-06, + "loss": 0.4538, + "step": 5926 + }, + { + "epoch": 2.8023640661938534, + "grad_norm": 2.9053616523742676, + "learning_rate": 2.7912878890005657e-06, + "loss": 0.434, + "step": 5927 + }, + { + "epoch": 2.802836879432624, + "grad_norm": 2.7649240493774414, + "learning_rate": 2.7906683019012027e-06, + "loss": 0.414, + "step": 5928 + }, + { + "epoch": 2.803309692671395, + "grad_norm": 2.8717660903930664, + "learning_rate": 2.7900486967031987e-06, + "loss": 0.4337, + "step": 5929 + }, + { + "epoch": 2.8037825059101653, + "grad_norm": 2.6860995292663574, + "learning_rate": 2.789429073445135e-06, + "loss": 0.447, + "step": 5930 + }, + { + "epoch": 2.804255319148936, + "grad_norm": 2.67509126663208, + "learning_rate": 2.7888094321655918e-06, + "loss": 0.4955, + "step": 5931 + }, + { + "epoch": 2.804728132387707, + "grad_norm": 2.7426326274871826, + "learning_rate": 2.7881897729031514e-06, + "loss": 0.4564, + "step": 5932 + }, + { + "epoch": 2.8052009456264777, + "grad_norm": 2.7087252140045166, + "learning_rate": 2.7875700956963973e-06, + "loss": 0.4571, + "step": 5933 + }, + { + "epoch": 2.8056737588652485, + "grad_norm": 2.513526439666748, + "learning_rate": 2.7869504005839147e-06, + "loss": 0.4361, + "step": 5934 + }, + { + "epoch": 2.806146572104019, + "grad_norm": 3.2246084213256836, + "learning_rate": 2.7863306876042885e-06, + "loss": 0.4612, + "step": 5935 + }, + { + "epoch": 2.8066193853427897, + "grad_norm": 3.226325511932373, + "learning_rate": 2.7857109567961066e-06, + "loss": 0.4528, + "step": 5936 + }, + { + "epoch": 2.8070921985815604, + "grad_norm": 2.8861422538757324, + "learning_rate": 2.785091208197956e-06, + "loss": 0.5049, + "step": 5937 + }, + { + "epoch": 2.807565011820331, + "grad_norm": 2.76279616355896, + "learning_rate": 2.7844714418484257e-06, + "loss": 0.4714, + "step": 5938 + }, + { + "epoch": 2.8080378250591016, + "grad_norm": 2.9591920375823975, + "learning_rate": 2.7838516577861063e-06, + "loss": 0.4633, + "step": 5939 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 2.536916971206665, + "learning_rate": 2.7832318560495885e-06, + "loss": 0.4108, + "step": 5940 + }, + { + "epoch": 2.808983451536643, + "grad_norm": 
3.2484991550445557, + "learning_rate": 2.7826120366774657e-06, + "loss": 0.4888, + "step": 5941 + }, + { + "epoch": 2.8094562647754135, + "grad_norm": 2.7129359245300293, + "learning_rate": 2.781992199708329e-06, + "loss": 0.4008, + "step": 5942 + }, + { + "epoch": 2.8099290780141843, + "grad_norm": 2.4176113605499268, + "learning_rate": 2.781372345180776e-06, + "loss": 0.3864, + "step": 5943 + }, + { + "epoch": 2.810401891252955, + "grad_norm": 2.6557252407073975, + "learning_rate": 2.7807524731334e-06, + "loss": 0.4295, + "step": 5944 + }, + { + "epoch": 2.8108747044917255, + "grad_norm": 2.9191324710845947, + "learning_rate": 2.7801325836047993e-06, + "loss": 0.4854, + "step": 5945 + }, + { + "epoch": 2.8113475177304963, + "grad_norm": 2.6325371265411377, + "learning_rate": 2.7795126766335705e-06, + "loss": 0.4332, + "step": 5946 + }, + { + "epoch": 2.811820330969267, + "grad_norm": 2.658337116241455, + "learning_rate": 2.778892752258314e-06, + "loss": 0.4276, + "step": 5947 + }, + { + "epoch": 2.812293144208038, + "grad_norm": 2.763782262802124, + "learning_rate": 2.778272810517627e-06, + "loss": 0.4246, + "step": 5948 + }, + { + "epoch": 2.8127659574468087, + "grad_norm": 2.407607078552246, + "learning_rate": 2.777652851450113e-06, + "loss": 0.3788, + "step": 5949 + }, + { + "epoch": 2.813238770685579, + "grad_norm": 3.0339951515197754, + "learning_rate": 2.7770328750943736e-06, + "loss": 0.477, + "step": 5950 + }, + { + "epoch": 2.81371158392435, + "grad_norm": 2.3475773334503174, + "learning_rate": 2.776412881489012e-06, + "loss": 0.4206, + "step": 5951 + }, + { + "epoch": 2.8141843971631206, + "grad_norm": 3.0455260276794434, + "learning_rate": 2.7757928706726318e-06, + "loss": 0.4301, + "step": 5952 + }, + { + "epoch": 2.814657210401891, + "grad_norm": 2.803920030593872, + "learning_rate": 2.7751728426838386e-06, + "loss": 0.3738, + "step": 5953 + }, + { + "epoch": 2.815130023640662, + "grad_norm": 3.1083319187164307, + "learning_rate": 2.77455279756124e-06, + "loss": 0.5365, + "step": 5954 + }, + { + "epoch": 2.8156028368794326, + "grad_norm": 3.180809497833252, + "learning_rate": 2.7739327353434427e-06, + "loss": 0.4789, + "step": 5955 + }, + { + "epoch": 2.8160756501182034, + "grad_norm": 2.975043773651123, + "learning_rate": 2.7733126560690543e-06, + "loss": 0.4798, + "step": 5956 + }, + { + "epoch": 2.816548463356974, + "grad_norm": 2.765475034713745, + "learning_rate": 2.772692559776685e-06, + "loss": 0.4206, + "step": 5957 + }, + { + "epoch": 2.8170212765957445, + "grad_norm": 2.48612380027771, + "learning_rate": 2.7720724465049463e-06, + "loss": 0.4234, + "step": 5958 + }, + { + "epoch": 2.8174940898345153, + "grad_norm": 2.7145729064941406, + "learning_rate": 2.77145231629245e-06, + "loss": 0.4713, + "step": 5959 + }, + { + "epoch": 2.817966903073286, + "grad_norm": 2.5993762016296387, + "learning_rate": 2.7708321691778074e-06, + "loss": 0.4144, + "step": 5960 + }, + { + "epoch": 2.8184397163120565, + "grad_norm": 3.0902538299560547, + "learning_rate": 2.770212005199633e-06, + "loss": 0.4822, + "step": 5961 + }, + { + "epoch": 2.8189125295508273, + "grad_norm": 2.849757671356201, + "learning_rate": 2.7695918243965424e-06, + "loss": 0.4449, + "step": 5962 + }, + { + "epoch": 2.819385342789598, + "grad_norm": 2.77148699760437, + "learning_rate": 2.768971626807151e-06, + "loss": 0.4448, + "step": 5963 + }, + { + "epoch": 2.819858156028369, + "grad_norm": 2.7865898609161377, + "learning_rate": 2.7683514124700757e-06, + "loss": 0.4944, + "step": 5964 + }, + { + "epoch": 
2.8203309692671397, + "grad_norm": 2.9057955741882324, + "learning_rate": 2.767731181423934e-06, + "loss": 0.5074, + "step": 5965 + }, + { + "epoch": 2.82080378250591, + "grad_norm": 2.725837469100952, + "learning_rate": 2.7671109337073465e-06, + "loss": 0.4207, + "step": 5966 + }, + { + "epoch": 2.821276595744681, + "grad_norm": 3.078531265258789, + "learning_rate": 2.7664906693589315e-06, + "loss": 0.4835, + "step": 5967 + }, + { + "epoch": 2.8217494089834516, + "grad_norm": 2.8692002296447754, + "learning_rate": 2.765870388417312e-06, + "loss": 0.4284, + "step": 5968 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 2.8519723415374756, + "learning_rate": 2.765250090921109e-06, + "loss": 0.541, + "step": 5969 + }, + { + "epoch": 2.8226950354609928, + "grad_norm": 3.2037532329559326, + "learning_rate": 2.7646297769089457e-06, + "loss": 0.4276, + "step": 5970 + }, + { + "epoch": 2.8231678486997636, + "grad_norm": 2.8637137413024902, + "learning_rate": 2.7640094464194468e-06, + "loss": 0.4904, + "step": 5971 + }, + { + "epoch": 2.8236406619385344, + "grad_norm": 2.681516408920288, + "learning_rate": 2.7633890994912372e-06, + "loss": 0.4942, + "step": 5972 + }, + { + "epoch": 2.824113475177305, + "grad_norm": 3.0035219192504883, + "learning_rate": 2.7627687361629434e-06, + "loss": 0.4556, + "step": 5973 + }, + { + "epoch": 2.8245862884160755, + "grad_norm": 2.8107759952545166, + "learning_rate": 2.7621483564731923e-06, + "loss": 0.4225, + "step": 5974 + }, + { + "epoch": 2.8250591016548463, + "grad_norm": 2.87276029586792, + "learning_rate": 2.7615279604606126e-06, + "loss": 0.5045, + "step": 5975 + }, + { + "epoch": 2.825531914893617, + "grad_norm": 2.687953233718872, + "learning_rate": 2.760907548163833e-06, + "loss": 0.4018, + "step": 5976 + }, + { + "epoch": 2.8260047281323875, + "grad_norm": 2.587979555130005, + "learning_rate": 2.760287119621486e-06, + "loss": 0.4407, + "step": 5977 + }, + { + "epoch": 2.8264775413711583, + "grad_norm": 2.805602550506592, + "learning_rate": 2.7596666748722e-06, + "loss": 0.4559, + "step": 5978 + }, + { + "epoch": 2.826950354609929, + "grad_norm": 2.320763111114502, + "learning_rate": 2.759046213954609e-06, + "loss": 0.3847, + "step": 5979 + }, + { + "epoch": 2.8274231678487, + "grad_norm": 2.6876401901245117, + "learning_rate": 2.758425736907347e-06, + "loss": 0.4528, + "step": 5980 + }, + { + "epoch": 2.8278959810874706, + "grad_norm": 2.6852915287017822, + "learning_rate": 2.757805243769046e-06, + "loss": 0.395, + "step": 5981 + }, + { + "epoch": 2.828368794326241, + "grad_norm": 2.808326005935669, + "learning_rate": 2.7571847345783447e-06, + "loss": 0.4647, + "step": 5982 + }, + { + "epoch": 2.828841607565012, + "grad_norm": 2.641479015350342, + "learning_rate": 2.7565642093738766e-06, + "loss": 0.3798, + "step": 5983 + }, + { + "epoch": 2.8293144208037826, + "grad_norm": 2.8066110610961914, + "learning_rate": 2.7559436681942803e-06, + "loss": 0.5072, + "step": 5984 + }, + { + "epoch": 2.829787234042553, + "grad_norm": 2.898375988006592, + "learning_rate": 2.7553231110781936e-06, + "loss": 0.5182, + "step": 5985 + }, + { + "epoch": 2.8302600472813237, + "grad_norm": 2.704890489578247, + "learning_rate": 2.7547025380642574e-06, + "loss": 0.3999, + "step": 5986 + }, + { + "epoch": 2.8307328605200945, + "grad_norm": 2.6024270057678223, + "learning_rate": 2.7540819491911106e-06, + "loss": 0.4302, + "step": 5987 + }, + { + "epoch": 2.8312056737588653, + "grad_norm": 2.8006081581115723, + "learning_rate": 2.7534613444973946e-06, + "loss": 0.4492, + 
"step": 5988 + }, + { + "epoch": 2.831678486997636, + "grad_norm": 2.9532058238983154, + "learning_rate": 2.752840724021752e-06, + "loss": 0.4552, + "step": 5989 + }, + { + "epoch": 2.8321513002364065, + "grad_norm": 3.1830217838287354, + "learning_rate": 2.7522200878028265e-06, + "loss": 0.5013, + "step": 5990 + }, + { + "epoch": 2.8326241134751773, + "grad_norm": 2.716176748275757, + "learning_rate": 2.7515994358792624e-06, + "loss": 0.4569, + "step": 5991 + }, + { + "epoch": 2.833096926713948, + "grad_norm": 2.6852715015411377, + "learning_rate": 2.7509787682897044e-06, + "loss": 0.4764, + "step": 5992 + }, + { + "epoch": 2.8335697399527184, + "grad_norm": 2.9383316040039062, + "learning_rate": 2.7503580850727985e-06, + "loss": 0.5205, + "step": 5993 + }, + { + "epoch": 2.8340425531914892, + "grad_norm": 2.703132152557373, + "learning_rate": 2.749737386267193e-06, + "loss": 0.4543, + "step": 5994 + }, + { + "epoch": 2.83451536643026, + "grad_norm": 2.4304885864257812, + "learning_rate": 2.7491166719115354e-06, + "loss": 0.4479, + "step": 5995 + }, + { + "epoch": 2.834988179669031, + "grad_norm": 2.975722551345825, + "learning_rate": 2.748495942044475e-06, + "loss": 0.4074, + "step": 5996 + }, + { + "epoch": 2.8354609929078016, + "grad_norm": 3.440208911895752, + "learning_rate": 2.7478751967046617e-06, + "loss": 0.4497, + "step": 5997 + }, + { + "epoch": 2.835933806146572, + "grad_norm": 2.734673261642456, + "learning_rate": 2.747254435930747e-06, + "loss": 0.437, + "step": 5998 + }, + { + "epoch": 2.8364066193853428, + "grad_norm": 3.1918959617614746, + "learning_rate": 2.7466336597613826e-06, + "loss": 0.4197, + "step": 5999 + }, + { + "epoch": 2.8368794326241136, + "grad_norm": 3.1440329551696777, + "learning_rate": 2.7460128682352216e-06, + "loss": 0.4425, + "step": 6000 + }, + { + "epoch": 2.837352245862884, + "grad_norm": 2.582993507385254, + "learning_rate": 2.7453920613909183e-06, + "loss": 0.4475, + "step": 6001 + }, + { + "epoch": 2.8378250591016547, + "grad_norm": 3.2682149410247803, + "learning_rate": 2.744771239267128e-06, + "loss": 0.4615, + "step": 6002 + }, + { + "epoch": 2.8382978723404255, + "grad_norm": 2.848477840423584, + "learning_rate": 2.7441504019025046e-06, + "loss": 0.4093, + "step": 6003 + }, + { + "epoch": 2.8387706855791963, + "grad_norm": 2.3582282066345215, + "learning_rate": 2.7435295493357067e-06, + "loss": 0.3911, + "step": 6004 + }, + { + "epoch": 2.839243498817967, + "grad_norm": 2.7707207202911377, + "learning_rate": 2.742908681605392e-06, + "loss": 0.4069, + "step": 6005 + }, + { + "epoch": 2.8397163120567375, + "grad_norm": 3.0763752460479736, + "learning_rate": 2.7422877987502183e-06, + "loss": 0.512, + "step": 6006 + }, + { + "epoch": 2.8401891252955083, + "grad_norm": 2.8027124404907227, + "learning_rate": 2.741666900808846e-06, + "loss": 0.4922, + "step": 6007 + }, + { + "epoch": 2.840661938534279, + "grad_norm": 2.487982988357544, + "learning_rate": 2.7410459878199353e-06, + "loss": 0.4368, + "step": 6008 + }, + { + "epoch": 2.8411347517730494, + "grad_norm": 2.8727993965148926, + "learning_rate": 2.7404250598221484e-06, + "loss": 0.4639, + "step": 6009 + }, + { + "epoch": 2.84160756501182, + "grad_norm": 2.5556678771972656, + "learning_rate": 2.739804116854147e-06, + "loss": 0.4217, + "step": 6010 + }, + { + "epoch": 2.842080378250591, + "grad_norm": 2.6306912899017334, + "learning_rate": 2.7391831589545948e-06, + "loss": 0.4816, + "step": 6011 + }, + { + "epoch": 2.842553191489362, + "grad_norm": 2.7340946197509766, + "learning_rate": 
2.7385621861621557e-06, + "loss": 0.4113, + "step": 6012 + }, + { + "epoch": 2.8430260047281326, + "grad_norm": 2.834190607070923, + "learning_rate": 2.737941198515495e-06, + "loss": 0.4691, + "step": 6013 + }, + { + "epoch": 2.843498817966903, + "grad_norm": 2.7139697074890137, + "learning_rate": 2.737320196053281e-06, + "loss": 0.3798, + "step": 6014 + }, + { + "epoch": 2.8439716312056738, + "grad_norm": 2.7934985160827637, + "learning_rate": 2.736699178814177e-06, + "loss": 0.446, + "step": 6015 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 2.6941518783569336, + "learning_rate": 2.7360781468368534e-06, + "loss": 0.4787, + "step": 6016 + }, + { + "epoch": 2.844917257683215, + "grad_norm": 3.1530468463897705, + "learning_rate": 2.7354571001599792e-06, + "loss": 0.474, + "step": 6017 + }, + { + "epoch": 2.8453900709219857, + "grad_norm": 2.613875389099121, + "learning_rate": 2.7348360388222243e-06, + "loss": 0.4297, + "step": 6018 + }, + { + "epoch": 2.8458628841607565, + "grad_norm": 2.5481486320495605, + "learning_rate": 2.7342149628622587e-06, + "loss": 0.3762, + "step": 6019 + }, + { + "epoch": 2.8463356973995273, + "grad_norm": 2.6425609588623047, + "learning_rate": 2.7335938723187544e-06, + "loss": 0.4077, + "step": 6020 + }, + { + "epoch": 2.846808510638298, + "grad_norm": 2.6281731128692627, + "learning_rate": 2.7329727672303836e-06, + "loss": 0.466, + "step": 6021 + }, + { + "epoch": 2.8472813238770684, + "grad_norm": 2.8862180709838867, + "learning_rate": 2.7323516476358197e-06, + "loss": 0.4191, + "step": 6022 + }, + { + "epoch": 2.8477541371158392, + "grad_norm": 2.907731533050537, + "learning_rate": 2.7317305135737383e-06, + "loss": 0.4867, + "step": 6023 + }, + { + "epoch": 2.84822695035461, + "grad_norm": 2.825593948364258, + "learning_rate": 2.731109365082814e-06, + "loss": 0.4888, + "step": 6024 + }, + { + "epoch": 2.8486997635933804, + "grad_norm": 2.478163003921509, + "learning_rate": 2.730488202201722e-06, + "loss": 0.4714, + "step": 6025 + }, + { + "epoch": 2.849172576832151, + "grad_norm": 2.928899049758911, + "learning_rate": 2.7298670249691418e-06, + "loss": 0.4671, + "step": 6026 + }, + { + "epoch": 2.849645390070922, + "grad_norm": 2.778256893157959, + "learning_rate": 2.7292458334237488e-06, + "loss": 0.429, + "step": 6027 + }, + { + "epoch": 2.850118203309693, + "grad_norm": 3.0689055919647217, + "learning_rate": 2.7286246276042234e-06, + "loss": 0.4727, + "step": 6028 + }, + { + "epoch": 2.8505910165484636, + "grad_norm": 2.582066774368286, + "learning_rate": 2.7280034075492447e-06, + "loss": 0.4025, + "step": 6029 + }, + { + "epoch": 2.851063829787234, + "grad_norm": 3.6679015159606934, + "learning_rate": 2.7273821732974936e-06, + "loss": 0.4856, + "step": 6030 + }, + { + "epoch": 2.8515366430260047, + "grad_norm": 2.7222588062286377, + "learning_rate": 2.7267609248876516e-06, + "loss": 0.4255, + "step": 6031 + }, + { + "epoch": 2.8520094562647755, + "grad_norm": 2.455038547515869, + "learning_rate": 2.726139662358401e-06, + "loss": 0.4234, + "step": 6032 + }, + { + "epoch": 2.852482269503546, + "grad_norm": 2.8277318477630615, + "learning_rate": 2.7255183857484253e-06, + "loss": 0.4146, + "step": 6033 + }, + { + "epoch": 2.8529550827423167, + "grad_norm": 2.523615837097168, + "learning_rate": 2.724897095096409e-06, + "loss": 0.4227, + "step": 6034 + }, + { + "epoch": 2.8534278959810875, + "grad_norm": 3.353646755218506, + "learning_rate": 2.724275790441036e-06, + "loss": 0.5041, + "step": 6035 + }, + { + "epoch": 2.8539007092198583, + "grad_norm": 
2.753981828689575, + "learning_rate": 2.7236544718209934e-06, + "loss": 0.4646, + "step": 6036 + }, + { + "epoch": 2.854373522458629, + "grad_norm": 2.954744577407837, + "learning_rate": 2.723033139274967e-06, + "loss": 0.5182, + "step": 6037 + }, + { + "epoch": 2.8548463356973994, + "grad_norm": 2.4814131259918213, + "learning_rate": 2.7224117928416462e-06, + "loss": 0.4626, + "step": 6038 + }, + { + "epoch": 2.8553191489361702, + "grad_norm": 2.7414886951446533, + "learning_rate": 2.721790432559717e-06, + "loss": 0.4111, + "step": 6039 + }, + { + "epoch": 2.855791962174941, + "grad_norm": 2.8743896484375, + "learning_rate": 2.7211690584678706e-06, + "loss": 0.4986, + "step": 6040 + }, + { + "epoch": 2.8562647754137114, + "grad_norm": 3.0691921710968018, + "learning_rate": 2.720547670604797e-06, + "loss": 0.4743, + "step": 6041 + }, + { + "epoch": 2.856737588652482, + "grad_norm": 2.7273411750793457, + "learning_rate": 2.7199262690091872e-06, + "loss": 0.4403, + "step": 6042 + }, + { + "epoch": 2.857210401891253, + "grad_norm": 2.8022944927215576, + "learning_rate": 2.7193048537197325e-06, + "loss": 0.4413, + "step": 6043 + }, + { + "epoch": 2.8576832151300238, + "grad_norm": 2.4883248805999756, + "learning_rate": 2.718683424775126e-06, + "loss": 0.4485, + "step": 6044 + }, + { + "epoch": 2.8581560283687946, + "grad_norm": 2.457249879837036, + "learning_rate": 2.718061982214062e-06, + "loss": 0.4167, + "step": 6045 + }, + { + "epoch": 2.858628841607565, + "grad_norm": 2.7210328578948975, + "learning_rate": 2.717440526075234e-06, + "loss": 0.4419, + "step": 6046 + }, + { + "epoch": 2.8591016548463357, + "grad_norm": 2.684483766555786, + "learning_rate": 2.7168190563973386e-06, + "loss": 0.4449, + "step": 6047 + }, + { + "epoch": 2.8595744680851065, + "grad_norm": 2.5305230617523193, + "learning_rate": 2.7161975732190706e-06, + "loss": 0.3829, + "step": 6048 + }, + { + "epoch": 2.860047281323877, + "grad_norm": 3.0284602642059326, + "learning_rate": 2.7155760765791278e-06, + "loss": 0.5164, + "step": 6049 + }, + { + "epoch": 2.8605200945626477, + "grad_norm": 3.154599189758301, + "learning_rate": 2.7149545665162085e-06, + "loss": 0.527, + "step": 6050 + }, + { + "epoch": 2.8609929078014185, + "grad_norm": 2.6798126697540283, + "learning_rate": 2.7143330430690113e-06, + "loss": 0.4379, + "step": 6051 + }, + { + "epoch": 2.8614657210401893, + "grad_norm": 2.9531302452087402, + "learning_rate": 2.7137115062762344e-06, + "loss": 0.4549, + "step": 6052 + }, + { + "epoch": 2.86193853427896, + "grad_norm": 2.779531240463257, + "learning_rate": 2.7130899561765787e-06, + "loss": 0.4037, + "step": 6053 + }, + { + "epoch": 2.8624113475177304, + "grad_norm": 2.786763906478882, + "learning_rate": 2.7124683928087466e-06, + "loss": 0.3986, + "step": 6054 + }, + { + "epoch": 2.862884160756501, + "grad_norm": 2.430415630340576, + "learning_rate": 2.7118468162114385e-06, + "loss": 0.4402, + "step": 6055 + }, + { + "epoch": 2.863356973995272, + "grad_norm": 3.027268409729004, + "learning_rate": 2.7112252264233596e-06, + "loss": 0.4737, + "step": 6056 + }, + { + "epoch": 2.8638297872340424, + "grad_norm": 3.024935483932495, + "learning_rate": 2.710603623483211e-06, + "loss": 0.3997, + "step": 6057 + }, + { + "epoch": 2.864302600472813, + "grad_norm": 2.8862195014953613, + "learning_rate": 2.7099820074296985e-06, + "loss": 0.4896, + "step": 6058 + }, + { + "epoch": 2.864775413711584, + "grad_norm": 2.595579147338867, + "learning_rate": 2.709360378301527e-06, + "loss": 0.4387, + "step": 6059 + }, + { + "epoch": 
2.8652482269503547, + "grad_norm": 2.8046188354492188, + "learning_rate": 2.708738736137403e-06, + "loss": 0.4726, + "step": 6060 + }, + { + "epoch": 2.8657210401891255, + "grad_norm": 3.040304660797119, + "learning_rate": 2.708117080976033e-06, + "loss": 0.4642, + "step": 6061 + }, + { + "epoch": 2.866193853427896, + "grad_norm": 2.618128538131714, + "learning_rate": 2.7074954128561248e-06, + "loss": 0.3171, + "step": 6062 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 2.7966055870056152, + "learning_rate": 2.706873731816387e-06, + "loss": 0.4893, + "step": 6063 + }, + { + "epoch": 2.8671394799054375, + "grad_norm": 2.9198038578033447, + "learning_rate": 2.706252037895529e-06, + "loss": 0.4428, + "step": 6064 + }, + { + "epoch": 2.867612293144208, + "grad_norm": 2.417705774307251, + "learning_rate": 2.7056303311322617e-06, + "loss": 0.3704, + "step": 6065 + }, + { + "epoch": 2.8680851063829786, + "grad_norm": 3.143918752670288, + "learning_rate": 2.7050086115652953e-06, + "loss": 0.5247, + "step": 6066 + }, + { + "epoch": 2.8685579196217494, + "grad_norm": 2.620781183242798, + "learning_rate": 2.704386879233341e-06, + "loss": 0.4131, + "step": 6067 + }, + { + "epoch": 2.8690307328605202, + "grad_norm": 2.6929845809936523, + "learning_rate": 2.703765134175112e-06, + "loss": 0.4833, + "step": 6068 + }, + { + "epoch": 2.869503546099291, + "grad_norm": 2.695920944213867, + "learning_rate": 2.7031433764293214e-06, + "loss": 0.435, + "step": 6069 + }, + { + "epoch": 2.8699763593380614, + "grad_norm": 2.6184475421905518, + "learning_rate": 2.702521606034684e-06, + "loss": 0.3898, + "step": 6070 + }, + { + "epoch": 2.870449172576832, + "grad_norm": 3.130624532699585, + "learning_rate": 2.7018998230299136e-06, + "loss": 0.4934, + "step": 6071 + }, + { + "epoch": 2.870921985815603, + "grad_norm": 2.947936534881592, + "learning_rate": 2.701278027453727e-06, + "loss": 0.4167, + "step": 6072 + }, + { + "epoch": 2.8713947990543733, + "grad_norm": 2.389263391494751, + "learning_rate": 2.7006562193448406e-06, + "loss": 0.3854, + "step": 6073 + }, + { + "epoch": 2.871867612293144, + "grad_norm": 2.9040684700012207, + "learning_rate": 2.700034398741971e-06, + "loss": 0.4656, + "step": 6074 + }, + { + "epoch": 2.872340425531915, + "grad_norm": 2.8671910762786865, + "learning_rate": 2.6994125656838365e-06, + "loss": 0.4642, + "step": 6075 + }, + { + "epoch": 2.8728132387706857, + "grad_norm": 2.6957180500030518, + "learning_rate": 2.698790720209156e-06, + "loss": 0.4894, + "step": 6076 + }, + { + "epoch": 2.8732860520094565, + "grad_norm": 2.748342514038086, + "learning_rate": 2.698168862356648e-06, + "loss": 0.4552, + "step": 6077 + }, + { + "epoch": 2.873758865248227, + "grad_norm": 2.7459912300109863, + "learning_rate": 2.6975469921650344e-06, + "loss": 0.4244, + "step": 6078 + }, + { + "epoch": 2.8742316784869977, + "grad_norm": 2.515650987625122, + "learning_rate": 2.6969251096730366e-06, + "loss": 0.4178, + "step": 6079 + }, + { + "epoch": 2.8747044917257685, + "grad_norm": 2.747373342514038, + "learning_rate": 2.696303214919375e-06, + "loss": 0.4623, + "step": 6080 + }, + { + "epoch": 2.875177304964539, + "grad_norm": 2.72092604637146, + "learning_rate": 2.695681307942773e-06, + "loss": 0.4227, + "step": 6081 + }, + { + "epoch": 2.8756501182033096, + "grad_norm": 2.6925108432769775, + "learning_rate": 2.695059388781955e-06, + "loss": 0.3807, + "step": 6082 + }, + { + "epoch": 2.8761229314420804, + "grad_norm": 2.673546314239502, + "learning_rate": 2.6944374574756427e-06, + "loss": 0.424, + 
"step": 6083 + }, + { + "epoch": 2.876595744680851, + "grad_norm": 2.7018187046051025, + "learning_rate": 2.6938155140625636e-06, + "loss": 0.4367, + "step": 6084 + }, + { + "epoch": 2.877068557919622, + "grad_norm": 2.9420957565307617, + "learning_rate": 2.6931935585814416e-06, + "loss": 0.4223, + "step": 6085 + }, + { + "epoch": 2.8775413711583924, + "grad_norm": 2.6523385047912598, + "learning_rate": 2.6925715910710036e-06, + "loss": 0.4074, + "step": 6086 + }, + { + "epoch": 2.878014184397163, + "grad_norm": 2.6104063987731934, + "learning_rate": 2.691949611569978e-06, + "loss": 0.423, + "step": 6087 + }, + { + "epoch": 2.878486997635934, + "grad_norm": 2.6463685035705566, + "learning_rate": 2.691327620117091e-06, + "loss": 0.4354, + "step": 6088 + }, + { + "epoch": 2.8789598108747043, + "grad_norm": 2.5863583087921143, + "learning_rate": 2.6907056167510725e-06, + "loss": 0.4177, + "step": 6089 + }, + { + "epoch": 2.879432624113475, + "grad_norm": 2.6946942806243896, + "learning_rate": 2.690083601510651e-06, + "loss": 0.4176, + "step": 6090 + }, + { + "epoch": 2.879905437352246, + "grad_norm": 3.0649454593658447, + "learning_rate": 2.6894615744345575e-06, + "loss": 0.4827, + "step": 6091 + }, + { + "epoch": 2.8803782505910167, + "grad_norm": 2.6454906463623047, + "learning_rate": 2.6888395355615226e-06, + "loss": 0.4757, + "step": 6092 + }, + { + "epoch": 2.8808510638297875, + "grad_norm": 3.251805067062378, + "learning_rate": 2.688217484930278e-06, + "loss": 0.5651, + "step": 6093 + }, + { + "epoch": 2.881323877068558, + "grad_norm": 2.543999433517456, + "learning_rate": 2.687595422579555e-06, + "loss": 0.4196, + "step": 6094 + }, + { + "epoch": 2.8817966903073287, + "grad_norm": 3.1502909660339355, + "learning_rate": 2.686973348548088e-06, + "loss": 0.4376, + "step": 6095 + }, + { + "epoch": 2.8822695035460995, + "grad_norm": 2.7800376415252686, + "learning_rate": 2.686351262874611e-06, + "loss": 0.444, + "step": 6096 + }, + { + "epoch": 2.88274231678487, + "grad_norm": 3.1529603004455566, + "learning_rate": 2.685729165597858e-06, + "loss": 0.5137, + "step": 6097 + }, + { + "epoch": 2.8832151300236406, + "grad_norm": 2.6079602241516113, + "learning_rate": 2.685107056756564e-06, + "loss": 0.4213, + "step": 6098 + }, + { + "epoch": 2.8836879432624114, + "grad_norm": 2.8969249725341797, + "learning_rate": 2.6844849363894648e-06, + "loss": 0.4679, + "step": 6099 + }, + { + "epoch": 2.884160756501182, + "grad_norm": 2.5882437229156494, + "learning_rate": 2.6838628045352977e-06, + "loss": 0.3891, + "step": 6100 + }, + { + "epoch": 2.8846335697399526, + "grad_norm": 2.9458062648773193, + "learning_rate": 2.6832406612328007e-06, + "loss": 0.4802, + "step": 6101 + }, + { + "epoch": 2.8851063829787233, + "grad_norm": 2.8463058471679688, + "learning_rate": 2.6826185065207105e-06, + "loss": 0.4332, + "step": 6102 + }, + { + "epoch": 2.885579196217494, + "grad_norm": 2.8799285888671875, + "learning_rate": 2.6819963404377667e-06, + "loss": 0.4474, + "step": 6103 + }, + { + "epoch": 2.8860520094562645, + "grad_norm": 2.846860408782959, + "learning_rate": 2.681374163022709e-06, + "loss": 0.4317, + "step": 6104 + }, + { + "epoch": 2.8865248226950353, + "grad_norm": 2.7918877601623535, + "learning_rate": 2.6807519743142775e-06, + "loss": 0.4243, + "step": 6105 + }, + { + "epoch": 2.886997635933806, + "grad_norm": 2.9351487159729004, + "learning_rate": 2.6801297743512127e-06, + "loss": 0.5253, + "step": 6106 + }, + { + "epoch": 2.887470449172577, + "grad_norm": 2.9422426223754883, + "learning_rate": 
2.6795075631722576e-06, + "loss": 0.4887, + "step": 6107 + }, + { + "epoch": 2.8879432624113477, + "grad_norm": 2.6837220191955566, + "learning_rate": 2.678885340816153e-06, + "loss": 0.4761, + "step": 6108 + }, + { + "epoch": 2.888416075650118, + "grad_norm": 2.6800777912139893, + "learning_rate": 2.6782631073216425e-06, + "loss": 0.4248, + "step": 6109 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 2.9654436111450195, + "learning_rate": 2.6776408627274702e-06, + "loss": 0.487, + "step": 6110 + }, + { + "epoch": 2.8893617021276596, + "grad_norm": 2.7725181579589844, + "learning_rate": 2.6770186070723804e-06, + "loss": 0.4166, + "step": 6111 + }, + { + "epoch": 2.88983451536643, + "grad_norm": 2.6547815799713135, + "learning_rate": 2.676396340395118e-06, + "loss": 0.4039, + "step": 6112 + }, + { + "epoch": 2.890307328605201, + "grad_norm": 2.690997838973999, + "learning_rate": 2.6757740627344292e-06, + "loss": 0.4639, + "step": 6113 + }, + { + "epoch": 2.8907801418439716, + "grad_norm": 2.4693069458007812, + "learning_rate": 2.67515177412906e-06, + "loss": 0.4052, + "step": 6114 + }, + { + "epoch": 2.8912529550827424, + "grad_norm": 2.7137033939361572, + "learning_rate": 2.6745294746177576e-06, + "loss": 0.4442, + "step": 6115 + }, + { + "epoch": 2.891725768321513, + "grad_norm": 3.7417004108428955, + "learning_rate": 2.6739071642392712e-06, + "loss": 0.4809, + "step": 6116 + }, + { + "epoch": 2.8921985815602835, + "grad_norm": 2.707094669342041, + "learning_rate": 2.673284843032347e-06, + "loss": 0.411, + "step": 6117 + }, + { + "epoch": 2.8926713947990543, + "grad_norm": 2.7864158153533936, + "learning_rate": 2.672662511035736e-06, + "loss": 0.4939, + "step": 6118 + }, + { + "epoch": 2.893144208037825, + "grad_norm": 2.8753504753112793, + "learning_rate": 2.672040168288187e-06, + "loss": 0.4396, + "step": 6119 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 2.7581071853637695, + "learning_rate": 2.6714178148284516e-06, + "loss": 0.427, + "step": 6120 + }, + { + "epoch": 2.8940898345153663, + "grad_norm": 2.9754791259765625, + "learning_rate": 2.6707954506952803e-06, + "loss": 0.4255, + "step": 6121 + }, + { + "epoch": 2.894562647754137, + "grad_norm": 2.876939296722412, + "learning_rate": 2.670173075927426e-06, + "loss": 0.4699, + "step": 6122 + }, + { + "epoch": 2.895035460992908, + "grad_norm": 2.4875400066375732, + "learning_rate": 2.6695506905636397e-06, + "loss": 0.3568, + "step": 6123 + }, + { + "epoch": 2.8955082742316787, + "grad_norm": 2.703606128692627, + "learning_rate": 2.668928294642675e-06, + "loss": 0.3646, + "step": 6124 + }, + { + "epoch": 2.895981087470449, + "grad_norm": 2.8618338108062744, + "learning_rate": 2.6683058882032868e-06, + "loss": 0.378, + "step": 6125 + }, + { + "epoch": 2.89645390070922, + "grad_norm": 2.9756760597229004, + "learning_rate": 2.667683471284229e-06, + "loss": 0.4348, + "step": 6126 + }, + { + "epoch": 2.8969267139479906, + "grad_norm": 2.7861104011535645, + "learning_rate": 2.667061043924256e-06, + "loss": 0.4435, + "step": 6127 + }, + { + "epoch": 2.897399527186761, + "grad_norm": 2.7932238578796387, + "learning_rate": 2.6664386061621243e-06, + "loss": 0.4824, + "step": 6128 + }, + { + "epoch": 2.8978723404255318, + "grad_norm": 2.85483455657959, + "learning_rate": 2.6658161580365917e-06, + "loss": 0.4925, + "step": 6129 + }, + { + "epoch": 2.8983451536643026, + "grad_norm": 2.4242141246795654, + "learning_rate": 2.6651936995864136e-06, + "loss": 0.3466, + "step": 6130 + }, + { + "epoch": 2.8988179669030734, + "grad_norm": 
3.385214328765869, + "learning_rate": 2.6645712308503473e-06, + "loss": 0.4751, + "step": 6131 + }, + { + "epoch": 2.899290780141844, + "grad_norm": 2.7109622955322266, + "learning_rate": 2.6639487518671525e-06, + "loss": 0.4469, + "step": 6132 + }, + { + "epoch": 2.8997635933806145, + "grad_norm": 2.6537814140319824, + "learning_rate": 2.6633262626755877e-06, + "loss": 0.4678, + "step": 6133 + }, + { + "epoch": 2.9002364066193853, + "grad_norm": 2.5992231369018555, + "learning_rate": 2.6627037633144124e-06, + "loss": 0.4206, + "step": 6134 + }, + { + "epoch": 2.900709219858156, + "grad_norm": 2.988940954208374, + "learning_rate": 2.6620812538223885e-06, + "loss": 0.4554, + "step": 6135 + }, + { + "epoch": 2.9011820330969265, + "grad_norm": 3.0678138732910156, + "learning_rate": 2.661458734238274e-06, + "loss": 0.4671, + "step": 6136 + }, + { + "epoch": 2.9016548463356973, + "grad_norm": 2.6902482509613037, + "learning_rate": 2.6608362046008335e-06, + "loss": 0.372, + "step": 6137 + }, + { + "epoch": 2.902127659574468, + "grad_norm": 3.031597375869751, + "learning_rate": 2.660213664948827e-06, + "loss": 0.4424, + "step": 6138 + }, + { + "epoch": 2.902600472813239, + "grad_norm": 2.8376755714416504, + "learning_rate": 2.6595911153210187e-06, + "loss": 0.4599, + "step": 6139 + }, + { + "epoch": 2.9030732860520096, + "grad_norm": 3.3164854049682617, + "learning_rate": 2.6589685557561707e-06, + "loss": 0.3897, + "step": 6140 + }, + { + "epoch": 2.90354609929078, + "grad_norm": 2.9535014629364014, + "learning_rate": 2.658345986293048e-06, + "loss": 0.4957, + "step": 6141 + }, + { + "epoch": 2.904018912529551, + "grad_norm": 2.821276903152466, + "learning_rate": 2.657723406970415e-06, + "loss": 0.4453, + "step": 6142 + }, + { + "epoch": 2.9044917257683216, + "grad_norm": 2.7314651012420654, + "learning_rate": 2.657100817827037e-06, + "loss": 0.4406, + "step": 6143 + }, + { + "epoch": 2.904964539007092, + "grad_norm": 2.9509520530700684, + "learning_rate": 2.6564782189016804e-06, + "loss": 0.4629, + "step": 6144 + }, + { + "epoch": 2.9054373522458627, + "grad_norm": 2.6234960556030273, + "learning_rate": 2.655855610233111e-06, + "loss": 0.4306, + "step": 6145 + }, + { + "epoch": 2.9059101654846335, + "grad_norm": 2.7209644317626953, + "learning_rate": 2.6552329918600962e-06, + "loss": 0.3643, + "step": 6146 + }, + { + "epoch": 2.9063829787234043, + "grad_norm": 2.9797747135162354, + "learning_rate": 2.654610363821404e-06, + "loss": 0.4616, + "step": 6147 + }, + { + "epoch": 2.906855791962175, + "grad_norm": 2.8179666996002197, + "learning_rate": 2.6539877261558016e-06, + "loss": 0.4526, + "step": 6148 + }, + { + "epoch": 2.9073286052009455, + "grad_norm": 2.7492244243621826, + "learning_rate": 2.653365078902059e-06, + "loss": 0.4862, + "step": 6149 + }, + { + "epoch": 2.9078014184397163, + "grad_norm": 3.0262451171875, + "learning_rate": 2.6527424220989457e-06, + "loss": 0.3728, + "step": 6150 + }, + { + "epoch": 2.908274231678487, + "grad_norm": 2.8092808723449707, + "learning_rate": 2.6521197557852315e-06, + "loss": 0.4668, + "step": 6151 + }, + { + "epoch": 2.9087470449172574, + "grad_norm": 2.915719985961914, + "learning_rate": 2.651497079999687e-06, + "loss": 0.5124, + "step": 6152 + }, + { + "epoch": 2.9092198581560282, + "grad_norm": 2.9794204235076904, + "learning_rate": 2.6508743947810834e-06, + "loss": 0.5207, + "step": 6153 + }, + { + "epoch": 2.909692671394799, + "grad_norm": 2.882453680038452, + "learning_rate": 2.650251700168193e-06, + "loss": 0.4382, + "step": 6154 + }, + { + 
"epoch": 2.91016548463357, + "grad_norm": 3.183680534362793, + "learning_rate": 2.6496289961997886e-06, + "loss": 0.5134, + "step": 6155 + }, + { + "epoch": 2.9106382978723406, + "grad_norm": 2.9374759197235107, + "learning_rate": 2.649006282914642e-06, + "loss": 0.4748, + "step": 6156 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 2.8096041679382324, + "learning_rate": 2.648383560351527e-06, + "loss": 0.4672, + "step": 6157 + }, + { + "epoch": 2.911583924349882, + "grad_norm": 2.8799238204956055, + "learning_rate": 2.6477608285492196e-06, + "loss": 0.4679, + "step": 6158 + }, + { + "epoch": 2.9120567375886526, + "grad_norm": 2.689310073852539, + "learning_rate": 2.6471380875464923e-06, + "loss": 0.4069, + "step": 6159 + }, + { + "epoch": 2.912529550827423, + "grad_norm": 2.909323215484619, + "learning_rate": 2.6465153373821216e-06, + "loss": 0.4463, + "step": 6160 + }, + { + "epoch": 2.9130023640661937, + "grad_norm": 2.797724962234497, + "learning_rate": 2.6458925780948845e-06, + "loss": 0.4269, + "step": 6161 + }, + { + "epoch": 2.9134751773049645, + "grad_norm": 2.7533204555511475, + "learning_rate": 2.645269809723556e-06, + "loss": 0.453, + "step": 6162 + }, + { + "epoch": 2.9139479905437353, + "grad_norm": 2.6615989208221436, + "learning_rate": 2.6446470323069122e-06, + "loss": 0.3921, + "step": 6163 + }, + { + "epoch": 2.914420803782506, + "grad_norm": 3.0493314266204834, + "learning_rate": 2.644024245883733e-06, + "loss": 0.4779, + "step": 6164 + }, + { + "epoch": 2.9148936170212765, + "grad_norm": 2.649845600128174, + "learning_rate": 2.643401450492795e-06, + "loss": 0.454, + "step": 6165 + }, + { + "epoch": 2.9153664302600473, + "grad_norm": 2.7931838035583496, + "learning_rate": 2.642778646172877e-06, + "loss": 0.504, + "step": 6166 + }, + { + "epoch": 2.915839243498818, + "grad_norm": 2.9518136978149414, + "learning_rate": 2.64215583296276e-06, + "loss": 0.4767, + "step": 6167 + }, + { + "epoch": 2.9163120567375884, + "grad_norm": 2.6047427654266357, + "learning_rate": 2.6415330109012216e-06, + "loss": 0.4316, + "step": 6168 + }, + { + "epoch": 2.916784869976359, + "grad_norm": 2.7732112407684326, + "learning_rate": 2.640910180027044e-06, + "loss": 0.4213, + "step": 6169 + }, + { + "epoch": 2.91725768321513, + "grad_norm": 3.1157236099243164, + "learning_rate": 2.6402873403790068e-06, + "loss": 0.4559, + "step": 6170 + }, + { + "epoch": 2.917730496453901, + "grad_norm": 2.68424129486084, + "learning_rate": 2.6396644919958917e-06, + "loss": 0.3456, + "step": 6171 + }, + { + "epoch": 2.9182033096926716, + "grad_norm": 3.1093270778656006, + "learning_rate": 2.639041634916482e-06, + "loss": 0.4172, + "step": 6172 + }, + { + "epoch": 2.918676122931442, + "grad_norm": 2.9844655990600586, + "learning_rate": 2.6384187691795594e-06, + "loss": 0.4844, + "step": 6173 + }, + { + "epoch": 2.9191489361702128, + "grad_norm": 2.907151222229004, + "learning_rate": 2.637795894823906e-06, + "loss": 0.5126, + "step": 6174 + }, + { + "epoch": 2.9196217494089836, + "grad_norm": 2.804105520248413, + "learning_rate": 2.637173011888307e-06, + "loss": 0.3919, + "step": 6175 + }, + { + "epoch": 2.920094562647754, + "grad_norm": 2.8809266090393066, + "learning_rate": 2.636550120411547e-06, + "loss": 0.4468, + "step": 6176 + }, + { + "epoch": 2.9205673758865247, + "grad_norm": 2.686290979385376, + "learning_rate": 2.6359272204324087e-06, + "loss": 0.4352, + "step": 6177 + }, + { + "epoch": 2.9210401891252955, + "grad_norm": 2.448101758956909, + "learning_rate": 2.635304311989678e-06, + "loss": 
0.4218, + "step": 6178 + }, + { + "epoch": 2.9215130023640663, + "grad_norm": 2.81024169921875, + "learning_rate": 2.6346813951221416e-06, + "loss": 0.5177, + "step": 6179 + }, + { + "epoch": 2.921985815602837, + "grad_norm": 2.7590086460113525, + "learning_rate": 2.6340584698685856e-06, + "loss": 0.3897, + "step": 6180 + }, + { + "epoch": 2.9224586288416075, + "grad_norm": 3.1226227283477783, + "learning_rate": 2.6334355362677965e-06, + "loss": 0.4595, + "step": 6181 + }, + { + "epoch": 2.9229314420803783, + "grad_norm": 2.673828125, + "learning_rate": 2.6328125943585607e-06, + "loss": 0.4932, + "step": 6182 + }, + { + "epoch": 2.923404255319149, + "grad_norm": 2.8297293186187744, + "learning_rate": 2.632189644179668e-06, + "loss": 0.3819, + "step": 6183 + }, + { + "epoch": 2.9238770685579194, + "grad_norm": 2.9661548137664795, + "learning_rate": 2.6315666857699056e-06, + "loss": 0.4419, + "step": 6184 + }, + { + "epoch": 2.92434988179669, + "grad_norm": 2.9745798110961914, + "learning_rate": 2.6309437191680627e-06, + "loss": 0.4423, + "step": 6185 + }, + { + "epoch": 2.924822695035461, + "grad_norm": 2.8351712226867676, + "learning_rate": 2.6303207444129285e-06, + "loss": 0.5043, + "step": 6186 + }, + { + "epoch": 2.925295508274232, + "grad_norm": 2.6442384719848633, + "learning_rate": 2.6296977615432927e-06, + "loss": 0.4431, + "step": 6187 + }, + { + "epoch": 2.9257683215130026, + "grad_norm": 2.4128029346466064, + "learning_rate": 2.6290747705979457e-06, + "loss": 0.3603, + "step": 6188 + }, + { + "epoch": 2.926241134751773, + "grad_norm": 2.730424642562866, + "learning_rate": 2.6284517716156786e-06, + "loss": 0.439, + "step": 6189 + }, + { + "epoch": 2.9267139479905437, + "grad_norm": 2.6215405464172363, + "learning_rate": 2.627828764635284e-06, + "loss": 0.4117, + "step": 6190 + }, + { + "epoch": 2.9271867612293145, + "grad_norm": 2.56585955619812, + "learning_rate": 2.627205749695552e-06, + "loss": 0.4404, + "step": 6191 + }, + { + "epoch": 2.927659574468085, + "grad_norm": 2.9587886333465576, + "learning_rate": 2.6265827268352763e-06, + "loss": 0.4295, + "step": 6192 + }, + { + "epoch": 2.9281323877068557, + "grad_norm": 2.6611828804016113, + "learning_rate": 2.625959696093249e-06, + "loss": 0.4441, + "step": 6193 + }, + { + "epoch": 2.9286052009456265, + "grad_norm": 2.4391369819641113, + "learning_rate": 2.6253366575082634e-06, + "loss": 0.4447, + "step": 6194 + }, + { + "epoch": 2.9290780141843973, + "grad_norm": 2.710763454437256, + "learning_rate": 2.6247136111191144e-06, + "loss": 0.4662, + "step": 6195 + }, + { + "epoch": 2.929550827423168, + "grad_norm": 2.770697593688965, + "learning_rate": 2.6240905569645952e-06, + "loss": 0.4263, + "step": 6196 + }, + { + "epoch": 2.9300236406619384, + "grad_norm": 2.5885732173919678, + "learning_rate": 2.623467495083501e-06, + "loss": 0.4303, + "step": 6197 + }, + { + "epoch": 2.9304964539007092, + "grad_norm": 2.5716748237609863, + "learning_rate": 2.6228444255146274e-06, + "loss": 0.3714, + "step": 6198 + }, + { + "epoch": 2.93096926713948, + "grad_norm": 3.0437910556793213, + "learning_rate": 2.6222213482967703e-06, + "loss": 0.4077, + "step": 6199 + }, + { + "epoch": 2.9314420803782504, + "grad_norm": 2.7861344814300537, + "learning_rate": 2.6215982634687253e-06, + "loss": 0.4157, + "step": 6200 + }, + { + "epoch": 2.931914893617021, + "grad_norm": 2.5265355110168457, + "learning_rate": 2.6209751710692905e-06, + "loss": 0.4586, + "step": 6201 + }, + { + "epoch": 2.932387706855792, + "grad_norm": 2.940112590789795, + 
"learning_rate": 2.6203520711372615e-06, + "loss": 0.4208, + "step": 6202 + }, + { + "epoch": 2.9328605200945628, + "grad_norm": 2.7124581336975098, + "learning_rate": 2.6197289637114363e-06, + "loss": 0.4173, + "step": 6203 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 2.818523406982422, + "learning_rate": 2.619105848830615e-06, + "loss": 0.4349, + "step": 6204 + }, + { + "epoch": 2.933806146572104, + "grad_norm": 2.7630393505096436, + "learning_rate": 2.6184827265335937e-06, + "loss": 0.5078, + "step": 6205 + }, + { + "epoch": 2.9342789598108747, + "grad_norm": 3.0554699897766113, + "learning_rate": 2.6178595968591726e-06, + "loss": 0.4712, + "step": 6206 + }, + { + "epoch": 2.9347517730496455, + "grad_norm": 2.721992254257202, + "learning_rate": 2.6172364598461507e-06, + "loss": 0.4847, + "step": 6207 + }, + { + "epoch": 2.935224586288416, + "grad_norm": 2.809663772583008, + "learning_rate": 2.6166133155333303e-06, + "loss": 0.4447, + "step": 6208 + }, + { + "epoch": 2.9356973995271867, + "grad_norm": 2.568394660949707, + "learning_rate": 2.6159901639595088e-06, + "loss": 0.4543, + "step": 6209 + }, + { + "epoch": 2.9361702127659575, + "grad_norm": 3.3670637607574463, + "learning_rate": 2.6153670051634884e-06, + "loss": 0.4901, + "step": 6210 + }, + { + "epoch": 2.9366430260047283, + "grad_norm": 3.082508087158203, + "learning_rate": 2.614743839184071e-06, + "loss": 0.4862, + "step": 6211 + }, + { + "epoch": 2.937115839243499, + "grad_norm": 2.692139148712158, + "learning_rate": 2.6141206660600566e-06, + "loss": 0.5199, + "step": 6212 + }, + { + "epoch": 2.9375886524822694, + "grad_norm": 3.231433391571045, + "learning_rate": 2.6134974858302504e-06, + "loss": 0.464, + "step": 6213 + }, + { + "epoch": 2.93806146572104, + "grad_norm": 3.224238157272339, + "learning_rate": 2.612874298533452e-06, + "loss": 0.4507, + "step": 6214 + }, + { + "epoch": 2.938534278959811, + "grad_norm": 2.812755584716797, + "learning_rate": 2.6122511042084663e-06, + "loss": 0.4527, + "step": 6215 + }, + { + "epoch": 2.9390070921985814, + "grad_norm": 2.837811231613159, + "learning_rate": 2.611627902894098e-06, + "loss": 0.4782, + "step": 6216 + }, + { + "epoch": 2.939479905437352, + "grad_norm": 3.093817710876465, + "learning_rate": 2.6110046946291476e-06, + "loss": 0.4933, + "step": 6217 + }, + { + "epoch": 2.939952718676123, + "grad_norm": 2.950119733810425, + "learning_rate": 2.6103814794524235e-06, + "loss": 0.4884, + "step": 6218 + }, + { + "epoch": 2.9404255319148938, + "grad_norm": 2.469681978225708, + "learning_rate": 2.6097582574027274e-06, + "loss": 0.4135, + "step": 6219 + }, + { + "epoch": 2.9408983451536646, + "grad_norm": 2.779238224029541, + "learning_rate": 2.609135028518866e-06, + "loss": 0.5165, + "step": 6220 + }, + { + "epoch": 2.941371158392435, + "grad_norm": 2.807705879211426, + "learning_rate": 2.608511792839645e-06, + "loss": 0.4046, + "step": 6221 + }, + { + "epoch": 2.9418439716312057, + "grad_norm": 2.6067750453948975, + "learning_rate": 2.607888550403871e-06, + "loss": 0.406, + "step": 6222 + }, + { + "epoch": 2.9423167848699765, + "grad_norm": 2.865766763687134, + "learning_rate": 2.607265301250349e-06, + "loss": 0.471, + "step": 6223 + }, + { + "epoch": 2.942789598108747, + "grad_norm": 2.977681875228882, + "learning_rate": 2.6066420454178876e-06, + "loss": 0.4666, + "step": 6224 + }, + { + "epoch": 2.9432624113475176, + "grad_norm": 2.870884418487549, + "learning_rate": 2.606018782945294e-06, + "loss": 0.4768, + "step": 6225 + }, + { + "epoch": 2.9437352245862884, + 
"grad_norm": 2.992851495742798, + "learning_rate": 2.6053955138713756e-06, + "loss": 0.4657, + "step": 6226 + }, + { + "epoch": 2.9442080378250592, + "grad_norm": 2.7279815673828125, + "learning_rate": 2.6047722382349406e-06, + "loss": 0.4087, + "step": 6227 + }, + { + "epoch": 2.94468085106383, + "grad_norm": 2.8587028980255127, + "learning_rate": 2.604148956074797e-06, + "loss": 0.4452, + "step": 6228 + }, + { + "epoch": 2.9451536643026004, + "grad_norm": 3.001694679260254, + "learning_rate": 2.6035256674297555e-06, + "loss": 0.4852, + "step": 6229 + }, + { + "epoch": 2.945626477541371, + "grad_norm": 2.858069896697998, + "learning_rate": 2.6029023723386237e-06, + "loss": 0.4281, + "step": 6230 + }, + { + "epoch": 2.946099290780142, + "grad_norm": 2.675856828689575, + "learning_rate": 2.602279070840213e-06, + "loss": 0.4545, + "step": 6231 + }, + { + "epoch": 2.9465721040189123, + "grad_norm": 2.530245065689087, + "learning_rate": 2.6016557629733334e-06, + "loss": 0.4619, + "step": 6232 + }, + { + "epoch": 2.947044917257683, + "grad_norm": 2.7533743381500244, + "learning_rate": 2.601032448776795e-06, + "loss": 0.4879, + "step": 6233 + }, + { + "epoch": 2.947517730496454, + "grad_norm": 3.130453109741211, + "learning_rate": 2.600409128289409e-06, + "loss": 0.4056, + "step": 6234 + }, + { + "epoch": 2.9479905437352247, + "grad_norm": 3.4736509323120117, + "learning_rate": 2.5997858015499867e-06, + "loss": 0.5063, + "step": 6235 + }, + { + "epoch": 2.9484633569739955, + "grad_norm": 2.871978282928467, + "learning_rate": 2.5991624685973406e-06, + "loss": 0.4562, + "step": 6236 + }, + { + "epoch": 2.948936170212766, + "grad_norm": 2.976503372192383, + "learning_rate": 2.5985391294702817e-06, + "loss": 0.5079, + "step": 6237 + }, + { + "epoch": 2.9494089834515367, + "grad_norm": 2.578122615814209, + "learning_rate": 2.597915784207623e-06, + "loss": 0.4069, + "step": 6238 + }, + { + "epoch": 2.9498817966903075, + "grad_norm": 2.885911226272583, + "learning_rate": 2.597292432848178e-06, + "loss": 0.4382, + "step": 6239 + }, + { + "epoch": 2.950354609929078, + "grad_norm": 2.9301681518554688, + "learning_rate": 2.5966690754307605e-06, + "loss": 0.4888, + "step": 6240 + }, + { + "epoch": 2.9508274231678486, + "grad_norm": 2.9912192821502686, + "learning_rate": 2.5960457119941834e-06, + "loss": 0.4699, + "step": 6241 + }, + { + "epoch": 2.9513002364066194, + "grad_norm": 2.6612601280212402, + "learning_rate": 2.5954223425772607e-06, + "loss": 0.3736, + "step": 6242 + }, + { + "epoch": 2.9517730496453902, + "grad_norm": 2.9325380325317383, + "learning_rate": 2.5947989672188067e-06, + "loss": 0.4771, + "step": 6243 + }, + { + "epoch": 2.952245862884161, + "grad_norm": 2.8143959045410156, + "learning_rate": 2.594175585957637e-06, + "loss": 0.5103, + "step": 6244 + }, + { + "epoch": 2.9527186761229314, + "grad_norm": 2.355078935623169, + "learning_rate": 2.5935521988325674e-06, + "loss": 0.44, + "step": 6245 + }, + { + "epoch": 2.953191489361702, + "grad_norm": 2.733156442642212, + "learning_rate": 2.5929288058824114e-06, + "loss": 0.4306, + "step": 6246 + }, + { + "epoch": 2.953664302600473, + "grad_norm": 3.182563304901123, + "learning_rate": 2.5923054071459865e-06, + "loss": 0.417, + "step": 6247 + }, + { + "epoch": 2.9541371158392433, + "grad_norm": 2.4162323474884033, + "learning_rate": 2.5916820026621094e-06, + "loss": 0.3802, + "step": 6248 + }, + { + "epoch": 2.954609929078014, + "grad_norm": 2.772706985473633, + "learning_rate": 2.591058592469595e-06, + "loss": 0.4654, + "step": 6249 + }, + { 
+ "epoch": 2.955082742316785, + "grad_norm": 2.6011102199554443, + "learning_rate": 2.5904351766072616e-06, + "loss": 0.4619, + "step": 6250 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 2.5700361728668213, + "learning_rate": 2.589811755113926e-06, + "loss": 0.3991, + "step": 6251 + }, + { + "epoch": 2.9560283687943265, + "grad_norm": 2.6444971561431885, + "learning_rate": 2.589188328028407e-06, + "loss": 0.4388, + "step": 6252 + }, + { + "epoch": 2.956501182033097, + "grad_norm": 2.739567279815674, + "learning_rate": 2.588564895389521e-06, + "loss": 0.4193, + "step": 6253 + }, + { + "epoch": 2.9569739952718677, + "grad_norm": 2.7070045471191406, + "learning_rate": 2.5879414572360877e-06, + "loss": 0.4347, + "step": 6254 + }, + { + "epoch": 2.9574468085106385, + "grad_norm": 2.7811532020568848, + "learning_rate": 2.587318013606926e-06, + "loss": 0.43, + "step": 6255 + }, + { + "epoch": 2.957919621749409, + "grad_norm": 3.0036091804504395, + "learning_rate": 2.5866945645408537e-06, + "loss": 0.4855, + "step": 6256 + }, + { + "epoch": 2.9583924349881796, + "grad_norm": 2.948573112487793, + "learning_rate": 2.5860711100766918e-06, + "loss": 0.4594, + "step": 6257 + }, + { + "epoch": 2.9588652482269504, + "grad_norm": 2.6371593475341797, + "learning_rate": 2.5854476502532583e-06, + "loss": 0.446, + "step": 6258 + }, + { + "epoch": 2.959338061465721, + "grad_norm": 2.668677806854248, + "learning_rate": 2.5848241851093754e-06, + "loss": 0.3991, + "step": 6259 + }, + { + "epoch": 2.959810874704492, + "grad_norm": 3.1640663146972656, + "learning_rate": 2.5842007146838614e-06, + "loss": 0.5146, + "step": 6260 + }, + { + "epoch": 2.9602836879432624, + "grad_norm": 2.9412102699279785, + "learning_rate": 2.5835772390155382e-06, + "loss": 0.4798, + "step": 6261 + }, + { + "epoch": 2.960756501182033, + "grad_norm": 2.7674343585968018, + "learning_rate": 2.582953758143227e-06, + "loss": 0.4262, + "step": 6262 + }, + { + "epoch": 2.961229314420804, + "grad_norm": 3.5219457149505615, + "learning_rate": 2.582330272105749e-06, + "loss": 0.4905, + "step": 6263 + }, + { + "epoch": 2.9617021276595743, + "grad_norm": 2.4274468421936035, + "learning_rate": 2.5817067809419267e-06, + "loss": 0.4048, + "step": 6264 + }, + { + "epoch": 2.962174940898345, + "grad_norm": 2.6907944679260254, + "learning_rate": 2.5810832846905814e-06, + "loss": 0.388, + "step": 6265 + }, + { + "epoch": 2.962647754137116, + "grad_norm": 2.603151321411133, + "learning_rate": 2.5804597833905347e-06, + "loss": 0.4377, + "step": 6266 + }, + { + "epoch": 2.9631205673758867, + "grad_norm": 2.685837507247925, + "learning_rate": 2.57983627708061e-06, + "loss": 0.4409, + "step": 6267 + }, + { + "epoch": 2.963593380614657, + "grad_norm": 2.8281500339508057, + "learning_rate": 2.579212765799631e-06, + "loss": 0.4567, + "step": 6268 + }, + { + "epoch": 2.964066193853428, + "grad_norm": 2.6387875080108643, + "learning_rate": 2.57858924958642e-06, + "loss": 0.4061, + "step": 6269 + }, + { + "epoch": 2.9645390070921986, + "grad_norm": 2.64139986038208, + "learning_rate": 2.5779657284798017e-06, + "loss": 0.4539, + "step": 6270 + }, + { + "epoch": 2.965011820330969, + "grad_norm": 2.7384836673736572, + "learning_rate": 2.5773422025185983e-06, + "loss": 0.408, + "step": 6271 + }, + { + "epoch": 2.96548463356974, + "grad_norm": 2.262514352798462, + "learning_rate": 2.576718671741636e-06, + "loss": 0.3726, + "step": 6272 + }, + { + "epoch": 2.9659574468085106, + "grad_norm": 2.53800106048584, + "learning_rate": 2.5760951361877384e-06, + "loss": 
0.4716, + "step": 6273 + }, + { + "epoch": 2.9664302600472814, + "grad_norm": 3.256701707839966, + "learning_rate": 2.57547159589573e-06, + "loss": 0.518, + "step": 6274 + }, + { + "epoch": 2.966903073286052, + "grad_norm": 2.9427342414855957, + "learning_rate": 2.574848050904436e-06, + "loss": 0.4255, + "step": 6275 + }, + { + "epoch": 2.9673758865248225, + "grad_norm": 2.5794098377227783, + "learning_rate": 2.574224501252682e-06, + "loss": 0.4412, + "step": 6276 + }, + { + "epoch": 2.9678486997635933, + "grad_norm": 2.5894877910614014, + "learning_rate": 2.573600946979294e-06, + "loss": 0.4356, + "step": 6277 + }, + { + "epoch": 2.968321513002364, + "grad_norm": 2.9597361087799072, + "learning_rate": 2.572977388123098e-06, + "loss": 0.4376, + "step": 6278 + }, + { + "epoch": 2.9687943262411345, + "grad_norm": 2.779303550720215, + "learning_rate": 2.5723538247229197e-06, + "loss": 0.3985, + "step": 6279 + }, + { + "epoch": 2.9692671394799053, + "grad_norm": 2.9173855781555176, + "learning_rate": 2.5717302568175866e-06, + "loss": 0.4581, + "step": 6280 + }, + { + "epoch": 2.969739952718676, + "grad_norm": 2.703721284866333, + "learning_rate": 2.5711066844459242e-06, + "loss": 0.3705, + "step": 6281 + }, + { + "epoch": 2.970212765957447, + "grad_norm": 2.5415029525756836, + "learning_rate": 2.5704831076467613e-06, + "loss": 0.4089, + "step": 6282 + }, + { + "epoch": 2.9706855791962177, + "grad_norm": 2.791780948638916, + "learning_rate": 2.5698595264589234e-06, + "loss": 0.4357, + "step": 6283 + }, + { + "epoch": 2.971158392434988, + "grad_norm": 2.887662887573242, + "learning_rate": 2.5692359409212392e-06, + "loss": 0.4093, + "step": 6284 + }, + { + "epoch": 2.971631205673759, + "grad_norm": 3.0309557914733887, + "learning_rate": 2.5686123510725364e-06, + "loss": 0.4461, + "step": 6285 + }, + { + "epoch": 2.9721040189125296, + "grad_norm": 2.6861515045166016, + "learning_rate": 2.5679887569516437e-06, + "loss": 0.4199, + "step": 6286 + }, + { + "epoch": 2.9725768321513, + "grad_norm": 2.7014012336730957, + "learning_rate": 2.5673651585973897e-06, + "loss": 0.4373, + "step": 6287 + }, + { + "epoch": 2.9730496453900708, + "grad_norm": 2.951265811920166, + "learning_rate": 2.5667415560486026e-06, + "loss": 0.4426, + "step": 6288 + }, + { + "epoch": 2.9735224586288416, + "grad_norm": 2.7664504051208496, + "learning_rate": 2.5661179493441106e-06, + "loss": 0.474, + "step": 6289 + }, + { + "epoch": 2.9739952718676124, + "grad_norm": 2.6081087589263916, + "learning_rate": 2.5654943385227445e-06, + "loss": 0.4058, + "step": 6290 + }, + { + "epoch": 2.974468085106383, + "grad_norm": 2.9416966438293457, + "learning_rate": 2.564870723623333e-06, + "loss": 0.506, + "step": 6291 + }, + { + "epoch": 2.9749408983451535, + "grad_norm": 2.9441659450531006, + "learning_rate": 2.564247104684706e-06, + "loss": 0.4505, + "step": 6292 + }, + { + "epoch": 2.9754137115839243, + "grad_norm": 2.7110862731933594, + "learning_rate": 2.563623481745693e-06, + "loss": 0.4493, + "step": 6293 + }, + { + "epoch": 2.975886524822695, + "grad_norm": 2.88459849357605, + "learning_rate": 2.562999854845125e-06, + "loss": 0.4462, + "step": 6294 + }, + { + "epoch": 2.9763593380614655, + "grad_norm": 3.0491793155670166, + "learning_rate": 2.5623762240218327e-06, + "loss": 0.4928, + "step": 6295 + }, + { + "epoch": 2.9768321513002363, + "grad_norm": 2.9475483894348145, + "learning_rate": 2.561752589314646e-06, + "loss": 0.4535, + "step": 6296 + }, + { + "epoch": 2.977304964539007, + "grad_norm": 2.879495859146118, + "learning_rate": 
2.561128950762397e-06, + "loss": 0.4393, + "step": 6297 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 2.8478336334228516, + "learning_rate": 2.560505308403916e-06, + "loss": 0.4363, + "step": 6298 + }, + { + "epoch": 2.9782505910165487, + "grad_norm": 2.5475094318389893, + "learning_rate": 2.5598816622780343e-06, + "loss": 0.3825, + "step": 6299 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 2.85430908203125, + "learning_rate": 2.5592580124235838e-06, + "loss": 0.4226, + "step": 6300 + }, + { + "epoch": 2.97919621749409, + "grad_norm": 2.569775104522705, + "learning_rate": 2.5586343588793975e-06, + "loss": 0.4045, + "step": 6301 + }, + { + "epoch": 2.9796690307328606, + "grad_norm": 2.4482202529907227, + "learning_rate": 2.558010701684307e-06, + "loss": 0.4625, + "step": 6302 + }, + { + "epoch": 2.980141843971631, + "grad_norm": 2.9301230907440186, + "learning_rate": 2.5573870408771436e-06, + "loss": 0.4358, + "step": 6303 + }, + { + "epoch": 2.9806146572104018, + "grad_norm": 2.9865870475769043, + "learning_rate": 2.5567633764967416e-06, + "loss": 0.497, + "step": 6304 + }, + { + "epoch": 2.9810874704491725, + "grad_norm": 2.523524522781372, + "learning_rate": 2.556139708581933e-06, + "loss": 0.4141, + "step": 6305 + }, + { + "epoch": 2.9815602836879433, + "grad_norm": 2.8489344120025635, + "learning_rate": 2.5555160371715504e-06, + "loss": 0.4205, + "step": 6306 + }, + { + "epoch": 2.982033096926714, + "grad_norm": 2.417759895324707, + "learning_rate": 2.5548923623044274e-06, + "loss": 0.44, + "step": 6307 + }, + { + "epoch": 2.9825059101654845, + "grad_norm": 2.7626900672912598, + "learning_rate": 2.554268684019398e-06, + "loss": 0.4646, + "step": 6308 + }, + { + "epoch": 2.9829787234042553, + "grad_norm": 3.0916266441345215, + "learning_rate": 2.5536450023552956e-06, + "loss": 0.4443, + "step": 6309 + }, + { + "epoch": 2.983451536643026, + "grad_norm": 2.721992015838623, + "learning_rate": 2.5530213173509542e-06, + "loss": 0.4008, + "step": 6310 + }, + { + "epoch": 2.9839243498817964, + "grad_norm": 2.825334072113037, + "learning_rate": 2.552397629045208e-06, + "loss": 0.4513, + "step": 6311 + }, + { + "epoch": 2.9843971631205672, + "grad_norm": 2.912050485610962, + "learning_rate": 2.5517739374768915e-06, + "loss": 0.4104, + "step": 6312 + }, + { + "epoch": 2.984869976359338, + "grad_norm": 2.760650634765625, + "learning_rate": 2.551150242684838e-06, + "loss": 0.4372, + "step": 6313 + }, + { + "epoch": 2.985342789598109, + "grad_norm": 2.8926033973693848, + "learning_rate": 2.5505265447078838e-06, + "loss": 0.475, + "step": 6314 + }, + { + "epoch": 2.9858156028368796, + "grad_norm": 2.6279892921447754, + "learning_rate": 2.5499028435848633e-06, + "loss": 0.4589, + "step": 6315 + }, + { + "epoch": 2.98628841607565, + "grad_norm": 3.2147316932678223, + "learning_rate": 2.549279139354611e-06, + "loss": 0.4968, + "step": 6316 + }, + { + "epoch": 2.986761229314421, + "grad_norm": 2.4510674476623535, + "learning_rate": 2.5486554320559626e-06, + "loss": 0.4291, + "step": 6317 + }, + { + "epoch": 2.9872340425531916, + "grad_norm": 2.6919643878936768, + "learning_rate": 2.5480317217277544e-06, + "loss": 0.4704, + "step": 6318 + }, + { + "epoch": 2.987706855791962, + "grad_norm": 2.9832234382629395, + "learning_rate": 2.5474080084088215e-06, + "loss": 0.4129, + "step": 6319 + }, + { + "epoch": 2.9881796690307327, + "grad_norm": 2.893209218978882, + "learning_rate": 2.5467842921380004e-06, + "loss": 0.5099, + "step": 6320 + }, + { + "epoch": 2.9886524822695035, + "grad_norm": 
2.6734580993652344, + "learning_rate": 2.5461605729541254e-06, + "loss": 0.4588, + "step": 6321 + }, + { + "epoch": 2.9891252955082743, + "grad_norm": 2.5591681003570557, + "learning_rate": 2.5455368508960343e-06, + "loss": 0.4162, + "step": 6322 + }, + { + "epoch": 2.989598108747045, + "grad_norm": 3.2619881629943848, + "learning_rate": 2.5449131260025626e-06, + "loss": 0.4412, + "step": 6323 + }, + { + "epoch": 2.9900709219858155, + "grad_norm": 2.897914409637451, + "learning_rate": 2.544289398312549e-06, + "loss": 0.5079, + "step": 6324 + }, + { + "epoch": 2.9905437352245863, + "grad_norm": 2.7891685962677, + "learning_rate": 2.5436656678648274e-06, + "loss": 0.42, + "step": 6325 + }, + { + "epoch": 2.991016548463357, + "grad_norm": 3.022341728210449, + "learning_rate": 2.5430419346982367e-06, + "loss": 0.4739, + "step": 6326 + }, + { + "epoch": 2.9914893617021274, + "grad_norm": 3.395775556564331, + "learning_rate": 2.542418198851614e-06, + "loss": 0.4822, + "step": 6327 + }, + { + "epoch": 2.9919621749408982, + "grad_norm": 3.0200490951538086, + "learning_rate": 2.541794460363795e-06, + "loss": 0.4755, + "step": 6328 + }, + { + "epoch": 2.992434988179669, + "grad_norm": 3.302020311355591, + "learning_rate": 2.541170719273619e-06, + "loss": 0.4603, + "step": 6329 + }, + { + "epoch": 2.99290780141844, + "grad_norm": 2.5985910892486572, + "learning_rate": 2.5405469756199226e-06, + "loss": 0.4475, + "step": 6330 + }, + { + "epoch": 2.9933806146572106, + "grad_norm": 2.9413928985595703, + "learning_rate": 2.5399232294415434e-06, + "loss": 0.4695, + "step": 6331 + }, + { + "epoch": 2.993853427895981, + "grad_norm": 2.942777156829834, + "learning_rate": 2.53929948077732e-06, + "loss": 0.4462, + "step": 6332 + }, + { + "epoch": 2.9943262411347518, + "grad_norm": 2.971120595932007, + "learning_rate": 2.53867572966609e-06, + "loss": 0.4546, + "step": 6333 + }, + { + "epoch": 2.9947990543735226, + "grad_norm": 2.8248138427734375, + "learning_rate": 2.5380519761466927e-06, + "loss": 0.453, + "step": 6334 + }, + { + "epoch": 2.995271867612293, + "grad_norm": 3.0819008350372314, + "learning_rate": 2.5374282202579647e-06, + "loss": 0.4774, + "step": 6335 + }, + { + "epoch": 2.9957446808510637, + "grad_norm": 2.742570161819458, + "learning_rate": 2.5368044620387466e-06, + "loss": 0.5059, + "step": 6336 + }, + { + "epoch": 2.9962174940898345, + "grad_norm": 2.9087419509887695, + "learning_rate": 2.5361807015278757e-06, + "loss": 0.3606, + "step": 6337 + }, + { + "epoch": 2.9966903073286053, + "grad_norm": 2.6887354850769043, + "learning_rate": 2.5355569387641908e-06, + "loss": 0.4247, + "step": 6338 + }, + { + "epoch": 2.997163120567376, + "grad_norm": 2.8516008853912354, + "learning_rate": 2.534933173786531e-06, + "loss": 0.4502, + "step": 6339 + }, + { + "epoch": 2.9976359338061465, + "grad_norm": 2.4463164806365967, + "learning_rate": 2.5343094066337366e-06, + "loss": 0.3883, + "step": 6340 + }, + { + "epoch": 2.9981087470449173, + "grad_norm": 2.87025785446167, + "learning_rate": 2.533685637344645e-06, + "loss": 0.4534, + "step": 6341 + }, + { + "epoch": 2.998581560283688, + "grad_norm": 3.0706169605255127, + "learning_rate": 2.5330618659580967e-06, + "loss": 0.5426, + "step": 6342 + }, + { + "epoch": 2.9990543735224584, + "grad_norm": 2.7185773849487305, + "learning_rate": 2.532438092512931e-06, + "loss": 0.497, + "step": 6343 + }, + { + "epoch": 2.999527186761229, + "grad_norm": 2.840207815170288, + "learning_rate": 2.531814317047988e-06, + "loss": 0.4073, + "step": 6344 + }, + { + "epoch": 3.0, 
+ "grad_norm": 3.1592655181884766, + "learning_rate": 2.5311905396021063e-06, + "loss": 0.4728, + "step": 6345 + } + ], + "logging_steps": 1, + "max_steps": 12690, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 2115, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.602707133177004e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6345/training_args.bin b/checkpoint-6345/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc --- /dev/null +++ b/checkpoint-6345/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6 +size 8056 diff --git a/checkpoint-6345/zero_to_fp32.py b/checkpoint-6345/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-6345/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + 
state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + 
frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * 
world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. 
i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-8460/README.md b/checkpoint-8460/README.md new file mode 100644 index 0000000000000000000000000000000000000000..049d467664ca6172b7ffbe6ba60b3eac7479cac4 --- /dev/null +++ b/checkpoint-8460/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
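+
+A minimal sketch, assuming the adapter weights in this checkpoint directory are loaded with `peft` on top of the `meta-llama/Llama-3.1-8B` base model; the local path below is a placeholder.
+
+```python
+# Minimal sketch (paths are placeholders): load the LoRA adapter with PEFT.
+from peft import AutoPeftModelForCausalLM
+from transformers import AutoTokenizer
+
+adapter_path = "path/to/checkpoint-8460"  # hypothetical local path to this checkpoint
+
+# Reads adapter_config.json, loads the base model it points to, then attaches the adapter.
+model = AutoPeftModelForCausalLM.from_pretrained(adapter_path)
+tokenizer = AutoTokenizer.from_pretrained(adapter_path)
+
+prompt = "Summarize the eligibility criteria of the trial."
+inputs = tokenizer(prompt, return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=64)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```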
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-8460/adapter_config.json b/checkpoint-8460/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38890ca21f7e1854f7350049a9113a9eec8a443a --- /dev/null +++ b/checkpoint-8460/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-8460/adapter_model.safetensors b/checkpoint-8460/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa4326d337a0367fb307452fbb18e0d6811353a3 --- /dev/null +++ b/checkpoint-8460/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cf66a7393d44ed5793e47a8d373bf1fe6cfc2b1ae0d9bb705e40621dae6ac59 +size 3443586272 diff --git a/checkpoint-8460/global_step8460/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 
b/checkpoint-8460/global_step8460/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bffc1cb862cdaaf2f5be13c9b935ea0e155d5e0c --- /dev/null +++ b/checkpoint-8460/global_step8460/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db80595bb7524b5c711427518049c54e03a91cd0eb192779cec37d5357e16f3d +size 20661195036 diff --git a/checkpoint-8460/global_step8460/mp_rank_00_model_states.pt b/checkpoint-8460/global_step8460/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d61389e9281218f5df2f414c63a04e7489340bc6 --- /dev/null +++ b/checkpoint-8460/global_step8460/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b51c058c0ea1479af01a9aa78fb7ce7e200463b1ac38fb017b8dd09bfada7c4 +size 3555326841 diff --git a/checkpoint-8460/latest b/checkpoint-8460/latest new file mode 100644 index 0000000000000000000000000000000000000000..54f6ed5439ebecfdabdb8ffb050bd657ed115c6c --- /dev/null +++ b/checkpoint-8460/latest @@ -0,0 +1 @@ +global_step8460 \ No newline at end of file diff --git a/checkpoint-8460/rng_state.pth b/checkpoint-8460/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..65526098b19e2db0331412cbe0cbbf7b88169aae --- /dev/null +++ b/checkpoint-8460/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e725b02912a27d6bc1153bf1e82bf617b65b24cf1135b112005175c489e7e308 +size 14244 diff --git a/checkpoint-8460/scheduler.pt b/checkpoint-8460/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dc69344bcc497bb307eafd8dedd38d029ef1020 --- /dev/null +++ b/checkpoint-8460/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00190c6fe6ba125384231887614f34f1cef1f3f4a39aba19b287c2903fc371e +size 1064 diff --git a/checkpoint-8460/special_tokens_map.json b/checkpoint-8460/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-8460/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-8460/tokenizer.json b/checkpoint-8460/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-8460/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-8460/tokenizer_config.json b/checkpoint-8460/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5 --- /dev/null +++ b/checkpoint-8460/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": 
"<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": 
"<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": 
"<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": 
"<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": 
"<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": 
"<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": 
"<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": 
"<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": 
"<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": 
"<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-8460/trainer_state.json b/checkpoint-8460/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..51dba668b24d65764f94a1d2c69d6bbcd2f54ae2 --- /dev/null +++ b/checkpoint-8460/trainer_state.json @@ -0,0 +1,59253 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 8460, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00047281323877068556, + "grad_norm": 5.163570880889893, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.4628, + "step": 1 + }, + { + "epoch": 0.0009456264775413711, + "grad_norm": 6.298020839691162, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.5003, + "step": 2 + }, + { + "epoch": 0.0014184397163120568, + "grad_norm": 5.853623390197754, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.4495, + "step": 3 + }, + { + "epoch": 0.0018912529550827422, + "grad_norm": 5.456025123596191, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.3798, + "step": 4 + }, + { + "epoch": 0.002364066193853428, + "grad_norm": 5.757407188415527, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.4515, + "step": 5 + }, + { + "epoch": 0.0028368794326241137, + "grad_norm": 5.872277736663818, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4424, + "step": 6 + }, + { + "epoch": 0.003309692671394799, + "grad_norm": 6.7816009521484375, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.4004, + "step": 7 + }, + { + "epoch": 0.0037825059101654845, + "grad_norm": 6.229667663574219, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.4494, + "step": 8 + }, + { + "epoch": 0.00425531914893617, + "grad_norm": 5.336202621459961, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.3916, + "step": 9 + }, + { + "epoch": 0.004728132387706856, + "grad_norm": 5.589445114135742, + "learning_rate": 5.000000000000001e-07, + "loss": 1.2318, + "step": 10 + }, + { + "epoch": 0.005200945626477541, + "grad_norm": 5.720539569854736, + "learning_rate": 5.5e-07, + "loss": 1.4367, + "step": 11 + }, + { + "epoch": 0.005673758865248227, + "grad_norm": 5.913913726806641, + "learning_rate": 6.000000000000001e-07, + "loss": 1.342, + "step": 12 + }, + { + "epoch": 0.006146572104018913, + "grad_norm": 5.899744987487793, + "learning_rate": 6.5e-07, + "loss": 1.4307, + "step": 13 + }, + { + "epoch": 0.006619385342789598, + "grad_norm": 5.571037292480469, + "learning_rate": 7.000000000000001e-07, + "loss": 1.3372, + "step": 14 + }, + { + "epoch": 0.0070921985815602835, + "grad_norm": 5.480010509490967, + "learning_rate": 7.5e-07, + "loss": 1.3923, + "step": 15 + }, + { + "epoch": 0.007565011820330969, + "grad_norm": 5.254702091217041, + "learning_rate": 8.000000000000001e-07, + "loss": 1.2928, + "step": 16 + }, + { + "epoch": 0.008037825059101654, + "grad_norm": 
6.090312480926514, + "learning_rate": 8.500000000000001e-07, + "loss": 1.4984, + "step": 17 + }, + { + "epoch": 0.00851063829787234, + "grad_norm": 5.689319610595703, + "learning_rate": 9.000000000000001e-07, + "loss": 1.4108, + "step": 18 + }, + { + "epoch": 0.008983451536643027, + "grad_norm": 5.386685848236084, + "learning_rate": 9.500000000000001e-07, + "loss": 1.425, + "step": 19 + }, + { + "epoch": 0.009456264775413711, + "grad_norm": 6.451584815979004, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5507, + "step": 20 + }, + { + "epoch": 0.009929078014184398, + "grad_norm": 5.37647008895874, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.4109, + "step": 21 + }, + { + "epoch": 0.010401891252955082, + "grad_norm": 4.716553211212158, + "learning_rate": 1.1e-06, + "loss": 1.2028, + "step": 22 + }, + { + "epoch": 0.010874704491725768, + "grad_norm": 4.950989723205566, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.3043, + "step": 23 + }, + { + "epoch": 0.011347517730496455, + "grad_norm": 4.688975811004639, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.2708, + "step": 24 + }, + { + "epoch": 0.01182033096926714, + "grad_norm": 4.905868053436279, + "learning_rate": 1.25e-06, + "loss": 1.3268, + "step": 25 + }, + { + "epoch": 0.012293144208037825, + "grad_norm": 4.503395080566406, + "learning_rate": 1.3e-06, + "loss": 1.1799, + "step": 26 + }, + { + "epoch": 0.01276595744680851, + "grad_norm": 4.77382230758667, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.3882, + "step": 27 + }, + { + "epoch": 0.013238770685579196, + "grad_norm": 4.734329700469971, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.3476, + "step": 28 + }, + { + "epoch": 0.013711583924349883, + "grad_norm": 4.775066375732422, + "learning_rate": 1.45e-06, + "loss": 1.2429, + "step": 29 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 4.978334426879883, + "learning_rate": 1.5e-06, + "loss": 1.2119, + "step": 30 + }, + { + "epoch": 0.014657210401891253, + "grad_norm": 4.506785869598389, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.3157, + "step": 31 + }, + { + "epoch": 0.015130023640661938, + "grad_norm": 4.007757186889648, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.1451, + "step": 32 + }, + { + "epoch": 0.015602836879432624, + "grad_norm": 3.6621618270874023, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.093, + "step": 33 + }, + { + "epoch": 0.01607565011820331, + "grad_norm": 3.8733766078948975, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.2289, + "step": 34 + }, + { + "epoch": 0.016548463356973995, + "grad_norm": 4.3391900062561035, + "learning_rate": 1.75e-06, + "loss": 1.1453, + "step": 35 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 3.287623643875122, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.0257, + "step": 36 + }, + { + "epoch": 0.017494089834515367, + "grad_norm": 3.591721773147583, + "learning_rate": 1.85e-06, + "loss": 0.9976, + "step": 37 + }, + { + "epoch": 0.017966903073286054, + "grad_norm": 4.028271675109863, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.0773, + "step": 38 + }, + { + "epoch": 0.018439716312056736, + "grad_norm": 3.3543951511383057, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.1677, + "step": 39 + }, + { + "epoch": 0.018912529550827423, + "grad_norm": 3.807624340057373, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1232, + "step": 40 + }, + { + "epoch": 0.01938534278959811, + "grad_norm": 4.242797374725342, + "learning_rate": 2.05e-06, + "loss": 
1.1819, + "step": 41 + }, + { + "epoch": 0.019858156028368795, + "grad_norm": 3.4574992656707764, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9878, + "step": 42 + }, + { + "epoch": 0.02033096926713948, + "grad_norm": 3.906695604324341, + "learning_rate": 2.15e-06, + "loss": 1.0592, + "step": 43 + }, + { + "epoch": 0.020803782505910164, + "grad_norm": 3.7543163299560547, + "learning_rate": 2.2e-06, + "loss": 1.0309, + "step": 44 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 3.3777148723602295, + "learning_rate": 2.25e-06, + "loss": 1.0664, + "step": 45 + }, + { + "epoch": 0.021749408983451537, + "grad_norm": 3.6003634929656982, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.0482, + "step": 46 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 3.3961377143859863, + "learning_rate": 2.35e-06, + "loss": 1.0252, + "step": 47 + }, + { + "epoch": 0.02269503546099291, + "grad_norm": 3.1601035594940186, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.0435, + "step": 48 + }, + { + "epoch": 0.023167848699763592, + "grad_norm": 3.4192967414855957, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.0935, + "step": 49 + }, + { + "epoch": 0.02364066193853428, + "grad_norm": 3.1225922107696533, + "learning_rate": 2.5e-06, + "loss": 0.8988, + "step": 50 + }, + { + "epoch": 0.024113475177304965, + "grad_norm": 3.1423380374908447, + "learning_rate": 2.55e-06, + "loss": 1.0159, + "step": 51 + }, + { + "epoch": 0.02458628841607565, + "grad_norm": 3.4782402515411377, + "learning_rate": 2.6e-06, + "loss": 1.0231, + "step": 52 + }, + { + "epoch": 0.025059101654846337, + "grad_norm": 3.8362693786621094, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.0725, + "step": 53 + }, + { + "epoch": 0.02553191489361702, + "grad_norm": 3.033294916152954, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.9377, + "step": 54 + }, + { + "epoch": 0.026004728132387706, + "grad_norm": 3.849741220474243, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.0046, + "step": 55 + }, + { + "epoch": 0.026477541371158392, + "grad_norm": 3.141876220703125, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9226, + "step": 56 + }, + { + "epoch": 0.02695035460992908, + "grad_norm": 2.773594856262207, + "learning_rate": 2.85e-06, + "loss": 0.8662, + "step": 57 + }, + { + "epoch": 0.027423167848699765, + "grad_norm": 3.1460225582122803, + "learning_rate": 2.9e-06, + "loss": 0.9304, + "step": 58 + }, + { + "epoch": 0.027895981087470448, + "grad_norm": 3.293583631515503, + "learning_rate": 2.95e-06, + "loss": 1.0374, + "step": 59 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 3.8190863132476807, + "learning_rate": 3e-06, + "loss": 0.971, + "step": 60 + }, + { + "epoch": 0.02884160756501182, + "grad_norm": 3.4566776752471924, + "learning_rate": 3.05e-06, + "loss": 0.9631, + "step": 61 + }, + { + "epoch": 0.029314420803782507, + "grad_norm": 3.355741500854492, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.0097, + "step": 62 + }, + { + "epoch": 0.029787234042553193, + "grad_norm": 3.29746675491333, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.9459, + "step": 63 + }, + { + "epoch": 0.030260047281323876, + "grad_norm": 3.3122968673706055, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.8594, + "step": 64 + }, + { + "epoch": 0.030732860520094562, + "grad_norm": 3.477701187133789, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.9197, + "step": 65 + }, + { + "epoch": 0.031205673758865248, + "grad_norm": 3.3363406658172607, + "learning_rate": 
3.3000000000000006e-06, + "loss": 0.9478, + "step": 66 + }, + { + "epoch": 0.03167848699763593, + "grad_norm": 4.143295764923096, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0534, + "step": 67 + }, + { + "epoch": 0.03215130023640662, + "grad_norm": 3.2363274097442627, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9454, + "step": 68 + }, + { + "epoch": 0.032624113475177303, + "grad_norm": 3.198746681213379, + "learning_rate": 3.45e-06, + "loss": 0.9388, + "step": 69 + }, + { + "epoch": 0.03309692671394799, + "grad_norm": 3.5751023292541504, + "learning_rate": 3.5e-06, + "loss": 0.9444, + "step": 70 + }, + { + "epoch": 0.033569739952718676, + "grad_norm": 3.1745729446411133, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8683, + "step": 71 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 3.3210883140563965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.8811, + "step": 72 + }, + { + "epoch": 0.03451536643026005, + "grad_norm": 3.2502429485321045, + "learning_rate": 3.65e-06, + "loss": 1.0012, + "step": 73 + }, + { + "epoch": 0.034988179669030735, + "grad_norm": 3.44598126411438, + "learning_rate": 3.7e-06, + "loss": 0.9217, + "step": 74 + }, + { + "epoch": 0.03546099290780142, + "grad_norm": 3.439117431640625, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8976, + "step": 75 + }, + { + "epoch": 0.03593380614657211, + "grad_norm": 3.523627758026123, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8996, + "step": 76 + }, + { + "epoch": 0.03640661938534279, + "grad_norm": 3.3716015815734863, + "learning_rate": 3.85e-06, + "loss": 0.9061, + "step": 77 + }, + { + "epoch": 0.03687943262411347, + "grad_norm": 3.33518385887146, + "learning_rate": 3.900000000000001e-06, + "loss": 0.9371, + "step": 78 + }, + { + "epoch": 0.03735224586288416, + "grad_norm": 3.833829879760742, + "learning_rate": 3.95e-06, + "loss": 0.9669, + "step": 79 + }, + { + "epoch": 0.037825059101654845, + "grad_norm": 3.260446786880493, + "learning_rate": 4.000000000000001e-06, + "loss": 0.9449, + "step": 80 + }, + { + "epoch": 0.03829787234042553, + "grad_norm": 3.532451629638672, + "learning_rate": 4.05e-06, + "loss": 0.897, + "step": 81 + }, + { + "epoch": 0.03877068557919622, + "grad_norm": 3.1156492233276367, + "learning_rate": 4.1e-06, + "loss": 0.8463, + "step": 82 + }, + { + "epoch": 0.039243498817966904, + "grad_norm": 2.8801751136779785, + "learning_rate": 4.15e-06, + "loss": 0.8616, + "step": 83 + }, + { + "epoch": 0.03971631205673759, + "grad_norm": 3.072476863861084, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.8387, + "step": 84 + }, + { + "epoch": 0.04018912529550828, + "grad_norm": 2.9601376056671143, + "learning_rate": 4.25e-06, + "loss": 0.8538, + "step": 85 + }, + { + "epoch": 0.04066193853427896, + "grad_norm": 3.521664619445801, + "learning_rate": 4.3e-06, + "loss": 0.8894, + "step": 86 + }, + { + "epoch": 0.04113475177304964, + "grad_norm": 3.2670981884002686, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8387, + "step": 87 + }, + { + "epoch": 0.04160756501182033, + "grad_norm": 3.422089099884033, + "learning_rate": 4.4e-06, + "loss": 0.7728, + "step": 88 + }, + { + "epoch": 0.042080378250591015, + "grad_norm": 3.414034128189087, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7968, + "step": 89 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 4.234285354614258, + "learning_rate": 4.5e-06, + "loss": 0.8502, + "step": 90 + }, + { + "epoch": 0.04302600472813239, + "grad_norm": 3.1446919441223145, + "learning_rate": 
4.5500000000000005e-06, + "loss": 0.8236, + "step": 91 + }, + { + "epoch": 0.043498817966903074, + "grad_norm": 3.683443307876587, + "learning_rate": 4.600000000000001e-06, + "loss": 0.9792, + "step": 92 + }, + { + "epoch": 0.04397163120567376, + "grad_norm": 3.664219617843628, + "learning_rate": 4.65e-06, + "loss": 0.8743, + "step": 93 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 3.369479179382324, + "learning_rate": 4.7e-06, + "loss": 0.8741, + "step": 94 + }, + { + "epoch": 0.04491725768321513, + "grad_norm": 3.694949150085449, + "learning_rate": 4.75e-06, + "loss": 0.7574, + "step": 95 + }, + { + "epoch": 0.04539007092198582, + "grad_norm": 3.5144498348236084, + "learning_rate": 4.800000000000001e-06, + "loss": 0.9934, + "step": 96 + }, + { + "epoch": 0.0458628841607565, + "grad_norm": 3.164451837539673, + "learning_rate": 4.85e-06, + "loss": 0.7463, + "step": 97 + }, + { + "epoch": 0.046335697399527184, + "grad_norm": 3.222785472869873, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7698, + "step": 98 + }, + { + "epoch": 0.04680851063829787, + "grad_norm": 2.9129555225372314, + "learning_rate": 4.95e-06, + "loss": 0.7856, + "step": 99 + }, + { + "epoch": 0.04728132387706856, + "grad_norm": 3.5061235427856445, + "learning_rate": 5e-06, + "loss": 0.8588, + "step": 100 + }, + { + "epoch": 0.04775413711583924, + "grad_norm": 3.2805044651031494, + "learning_rate": 4.999999922167982e-06, + "loss": 0.7643, + "step": 101 + }, + { + "epoch": 0.04822695035460993, + "grad_norm": 3.5461678504943848, + "learning_rate": 4.999999688671929e-06, + "loss": 0.8253, + "step": 102 + }, + { + "epoch": 0.048699763593380616, + "grad_norm": 3.2238264083862305, + "learning_rate": 4.99999929951186e-06, + "loss": 0.7622, + "step": 103 + }, + { + "epoch": 0.0491725768321513, + "grad_norm": 3.818955898284912, + "learning_rate": 4.999998754687795e-06, + "loss": 0.8471, + "step": 104 + }, + { + "epoch": 0.04964539007092199, + "grad_norm": 3.1252424716949463, + "learning_rate": 4.99999805419977e-06, + "loss": 0.8409, + "step": 105 + }, + { + "epoch": 0.050118203309692674, + "grad_norm": 3.604283571243286, + "learning_rate": 4.999997198047828e-06, + "loss": 0.9027, + "step": 106 + }, + { + "epoch": 0.050591016548463354, + "grad_norm": 3.6752424240112305, + "learning_rate": 4.999996186232023e-06, + "loss": 0.9336, + "step": 107 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 3.517557144165039, + "learning_rate": 4.9999950187524184e-06, + "loss": 0.8351, + "step": 108 + }, + { + "epoch": 0.051536643026004726, + "grad_norm": 3.427285671234131, + "learning_rate": 4.999993695609085e-06, + "loss": 0.8457, + "step": 109 + }, + { + "epoch": 0.05200945626477541, + "grad_norm": 3.2792510986328125, + "learning_rate": 4.999992216802107e-06, + "loss": 0.8391, + "step": 110 + }, + { + "epoch": 0.0524822695035461, + "grad_norm": 3.581094741821289, + "learning_rate": 4.999990582331576e-06, + "loss": 0.7533, + "step": 111 + }, + { + "epoch": 0.052955082742316785, + "grad_norm": 3.1667377948760986, + "learning_rate": 4.999988792197593e-06, + "loss": 0.9562, + "step": 112 + }, + { + "epoch": 0.05342789598108747, + "grad_norm": 3.3609890937805176, + "learning_rate": 4.99998684640027e-06, + "loss": 0.8181, + "step": 113 + }, + { + "epoch": 0.05390070921985816, + "grad_norm": 3.260627269744873, + "learning_rate": 4.999984744939729e-06, + "loss": 0.8012, + "step": 114 + }, + { + "epoch": 0.054373522458628844, + "grad_norm": 3.4535653591156006, + "learning_rate": 4.9999824878160985e-06, + "loss": 0.919, + "step": 
115 + }, + { + "epoch": 0.05484633569739953, + "grad_norm": 3.4880740642547607, + "learning_rate": 4.999980075029522e-06, + "loss": 0.8114, + "step": 116 + }, + { + "epoch": 0.05531914893617021, + "grad_norm": 3.2546932697296143, + "learning_rate": 4.999977506580147e-06, + "loss": 0.8274, + "step": 117 + }, + { + "epoch": 0.055791962174940896, + "grad_norm": 3.2762744426727295, + "learning_rate": 4.999974782468136e-06, + "loss": 0.9018, + "step": 118 + }, + { + "epoch": 0.05626477541371158, + "grad_norm": 3.42825984954834, + "learning_rate": 4.999971902693657e-06, + "loss": 0.8262, + "step": 119 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 3.082496404647827, + "learning_rate": 4.99996886725689e-06, + "loss": 0.8181, + "step": 120 + }, + { + "epoch": 0.057210401891252954, + "grad_norm": 3.322869300842285, + "learning_rate": 4.9999656761580225e-06, + "loss": 0.8382, + "step": 121 + }, + { + "epoch": 0.05768321513002364, + "grad_norm": 3.6365339756011963, + "learning_rate": 4.9999623293972555e-06, + "loss": 0.7489, + "step": 122 + }, + { + "epoch": 0.05815602836879433, + "grad_norm": 3.376352548599243, + "learning_rate": 4.999958826974796e-06, + "loss": 0.9012, + "step": 123 + }, + { + "epoch": 0.05862884160756501, + "grad_norm": 3.49088716506958, + "learning_rate": 4.999955168890862e-06, + "loss": 0.8999, + "step": 124 + }, + { + "epoch": 0.0591016548463357, + "grad_norm": 3.3265068531036377, + "learning_rate": 4.999951355145682e-06, + "loss": 0.8161, + "step": 125 + }, + { + "epoch": 0.059574468085106386, + "grad_norm": 3.697282314300537, + "learning_rate": 4.999947385739493e-06, + "loss": 0.9623, + "step": 126 + }, + { + "epoch": 0.06004728132387707, + "grad_norm": 2.7901928424835205, + "learning_rate": 4.999943260672542e-06, + "loss": 0.7371, + "step": 127 + }, + { + "epoch": 0.06052009456264775, + "grad_norm": 3.110319137573242, + "learning_rate": 4.999938979945086e-06, + "loss": 0.715, + "step": 128 + }, + { + "epoch": 0.06099290780141844, + "grad_norm": 3.2211520671844482, + "learning_rate": 4.999934543557392e-06, + "loss": 0.8888, + "step": 129 + }, + { + "epoch": 0.061465721040189124, + "grad_norm": 3.2466187477111816, + "learning_rate": 4.999929951509735e-06, + "loss": 0.9389, + "step": 130 + }, + { + "epoch": 0.06193853427895981, + "grad_norm": 3.3574399948120117, + "learning_rate": 4.999925203802403e-06, + "loss": 0.8263, + "step": 131 + }, + { + "epoch": 0.062411347517730496, + "grad_norm": 3.275601625442505, + "learning_rate": 4.99992030043569e-06, + "loss": 0.8338, + "step": 132 + }, + { + "epoch": 0.06288416075650118, + "grad_norm": 3.6011312007904053, + "learning_rate": 4.999915241409902e-06, + "loss": 0.8351, + "step": 133 + }, + { + "epoch": 0.06335697399527186, + "grad_norm": 2.969011068344116, + "learning_rate": 4.999910026725352e-06, + "loss": 0.79, + "step": 134 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 3.690784454345703, + "learning_rate": 4.999904656382369e-06, + "loss": 0.8209, + "step": 135 + }, + { + "epoch": 0.06430260047281323, + "grad_norm": 3.3363115787506104, + "learning_rate": 4.999899130381283e-06, + "loss": 0.858, + "step": 136 + }, + { + "epoch": 0.06477541371158392, + "grad_norm": 3.206881523132324, + "learning_rate": 4.9998934487224405e-06, + "loss": 0.834, + "step": 137 + }, + { + "epoch": 0.06524822695035461, + "grad_norm": 2.773146152496338, + "learning_rate": 4.999887611406195e-06, + "loss": 0.7576, + "step": 138 + }, + { + "epoch": 0.0657210401891253, + "grad_norm": 3.307725667953491, + "learning_rate": 
4.999881618432908e-06, + "loss": 0.7487, + "step": 139 + }, + { + "epoch": 0.06619385342789598, + "grad_norm": 4.273657321929932, + "learning_rate": 4.999875469802956e-06, + "loss": 0.8176, + "step": 140 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 3.0898005962371826, + "learning_rate": 4.999869165516719e-06, + "loss": 0.7578, + "step": 141 + }, + { + "epoch": 0.06713947990543735, + "grad_norm": 3.25150990486145, + "learning_rate": 4.9998627055745915e-06, + "loss": 0.7873, + "step": 142 + }, + { + "epoch": 0.06761229314420804, + "grad_norm": 2.9705755710601807, + "learning_rate": 4.999856089976974e-06, + "loss": 0.6473, + "step": 143 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 3.5658507347106934, + "learning_rate": 4.9998493187242804e-06, + "loss": 0.855, + "step": 144 + }, + { + "epoch": 0.06855791962174941, + "grad_norm": 3.3994076251983643, + "learning_rate": 4.99984239181693e-06, + "loss": 0.7926, + "step": 145 + }, + { + "epoch": 0.0690307328605201, + "grad_norm": 2.8266260623931885, + "learning_rate": 4.999835309255357e-06, + "loss": 0.7564, + "step": 146 + }, + { + "epoch": 0.06950354609929078, + "grad_norm": 3.1143875122070312, + "learning_rate": 4.999828071039999e-06, + "loss": 0.8398, + "step": 147 + }, + { + "epoch": 0.06997635933806147, + "grad_norm": 2.9364278316497803, + "learning_rate": 4.99982067717131e-06, + "loss": 0.7381, + "step": 148 + }, + { + "epoch": 0.07044917257683216, + "grad_norm": 3.4155616760253906, + "learning_rate": 4.999813127649748e-06, + "loss": 0.7933, + "step": 149 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 4.371236324310303, + "learning_rate": 4.999805422475784e-06, + "loss": 0.8292, + "step": 150 + }, + { + "epoch": 0.07139479905437353, + "grad_norm": 3.3967185020446777, + "learning_rate": 4.999797561649897e-06, + "loss": 0.8712, + "step": 151 + }, + { + "epoch": 0.07186761229314421, + "grad_norm": 3.343303680419922, + "learning_rate": 4.999789545172578e-06, + "loss": 0.8177, + "step": 152 + }, + { + "epoch": 0.07234042553191489, + "grad_norm": 3.040235757827759, + "learning_rate": 4.999781373044325e-06, + "loss": 0.7379, + "step": 153 + }, + { + "epoch": 0.07281323877068557, + "grad_norm": 3.4069204330444336, + "learning_rate": 4.999773045265647e-06, + "loss": 0.7939, + "step": 154 + }, + { + "epoch": 0.07328605200945626, + "grad_norm": 3.1939475536346436, + "learning_rate": 4.999764561837063e-06, + "loss": 0.8037, + "step": 155 + }, + { + "epoch": 0.07375886524822695, + "grad_norm": 4.452004909515381, + "learning_rate": 4.999755922759101e-06, + "loss": 0.8421, + "step": 156 + }, + { + "epoch": 0.07423167848699763, + "grad_norm": 3.2031240463256836, + "learning_rate": 4.999747128032298e-06, + "loss": 0.794, + "step": 157 + }, + { + "epoch": 0.07470449172576832, + "grad_norm": 3.175920009613037, + "learning_rate": 4.999738177657203e-06, + "loss": 0.759, + "step": 158 + }, + { + "epoch": 0.075177304964539, + "grad_norm": 3.7679688930511475, + "learning_rate": 4.9997290716343725e-06, + "loss": 0.8174, + "step": 159 + }, + { + "epoch": 0.07565011820330969, + "grad_norm": 3.7020037174224854, + "learning_rate": 4.999719809964373e-06, + "loss": 0.7116, + "step": 160 + }, + { + "epoch": 0.07612293144208038, + "grad_norm": 4.357471942901611, + "learning_rate": 4.999710392647783e-06, + "loss": 0.7649, + "step": 161 + }, + { + "epoch": 0.07659574468085106, + "grad_norm": 3.3439087867736816, + "learning_rate": 4.999700819685187e-06, + "loss": 0.7907, + "step": 162 + }, + { + "epoch": 0.07706855791962175, + "grad_norm": 
3.210815191268921, + "learning_rate": 4.999691091077182e-06, + "loss": 0.8446, + "step": 163 + }, + { + "epoch": 0.07754137115839244, + "grad_norm": 3.1029553413391113, + "learning_rate": 4.9996812068243735e-06, + "loss": 0.7232, + "step": 164 + }, + { + "epoch": 0.07801418439716312, + "grad_norm": 2.9389400482177734, + "learning_rate": 4.999671166927378e-06, + "loss": 0.7413, + "step": 165 + }, + { + "epoch": 0.07848699763593381, + "grad_norm": 3.7062697410583496, + "learning_rate": 4.9996609713868185e-06, + "loss": 0.8773, + "step": 166 + }, + { + "epoch": 0.0789598108747045, + "grad_norm": 3.2768924236297607, + "learning_rate": 4.999650620203332e-06, + "loss": 0.8046, + "step": 167 + }, + { + "epoch": 0.07943262411347518, + "grad_norm": 3.380373001098633, + "learning_rate": 4.999640113377561e-06, + "loss": 0.7529, + "step": 168 + }, + { + "epoch": 0.07990543735224587, + "grad_norm": 3.520022392272949, + "learning_rate": 4.999629450910162e-06, + "loss": 0.7352, + "step": 169 + }, + { + "epoch": 0.08037825059101655, + "grad_norm": 3.43269419670105, + "learning_rate": 4.999618632801796e-06, + "loss": 0.9371, + "step": 170 + }, + { + "epoch": 0.08085106382978724, + "grad_norm": 3.555877923965454, + "learning_rate": 4.99960765905314e-06, + "loss": 0.8276, + "step": 171 + }, + { + "epoch": 0.08132387706855793, + "grad_norm": 3.597050189971924, + "learning_rate": 4.999596529664874e-06, + "loss": 0.8164, + "step": 172 + }, + { + "epoch": 0.0817966903073286, + "grad_norm": 3.2002956867218018, + "learning_rate": 4.999585244637693e-06, + "loss": 0.7824, + "step": 173 + }, + { + "epoch": 0.08226950354609928, + "grad_norm": 3.527275562286377, + "learning_rate": 4.999573803972299e-06, + "loss": 0.8033, + "step": 174 + }, + { + "epoch": 0.08274231678486997, + "grad_norm": 3.5184452533721924, + "learning_rate": 4.999562207669405e-06, + "loss": 0.724, + "step": 175 + }, + { + "epoch": 0.08321513002364066, + "grad_norm": 3.6635067462921143, + "learning_rate": 4.999550455729732e-06, + "loss": 0.819, + "step": 176 + }, + { + "epoch": 0.08368794326241134, + "grad_norm": 3.192399740219116, + "learning_rate": 4.999538548154012e-06, + "loss": 0.7999, + "step": 177 + }, + { + "epoch": 0.08416075650118203, + "grad_norm": 3.0946953296661377, + "learning_rate": 4.999526484942988e-06, + "loss": 0.7367, + "step": 178 + }, + { + "epoch": 0.08463356973995272, + "grad_norm": 2.847198009490967, + "learning_rate": 4.99951426609741e-06, + "loss": 0.7536, + "step": 179 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.7674827575683594, + "learning_rate": 4.999501891618037e-06, + "loss": 0.701, + "step": 180 + }, + { + "epoch": 0.08557919621749409, + "grad_norm": 3.357933521270752, + "learning_rate": 4.999489361505643e-06, + "loss": 0.8331, + "step": 181 + }, + { + "epoch": 0.08605200945626477, + "grad_norm": 3.1464426517486572, + "learning_rate": 4.999476675761004e-06, + "loss": 0.7931, + "step": 182 + }, + { + "epoch": 0.08652482269503546, + "grad_norm": 3.310697078704834, + "learning_rate": 4.999463834384915e-06, + "loss": 0.753, + "step": 183 + }, + { + "epoch": 0.08699763593380615, + "grad_norm": 2.9794881343841553, + "learning_rate": 4.999450837378171e-06, + "loss": 0.7091, + "step": 184 + }, + { + "epoch": 0.08747044917257683, + "grad_norm": 3.0776889324188232, + "learning_rate": 4.999437684741584e-06, + "loss": 0.7226, + "step": 185 + }, + { + "epoch": 0.08794326241134752, + "grad_norm": 3.6657519340515137, + "learning_rate": 4.999424376475972e-06, + "loss": 0.845, + "step": 186 + }, + { + "epoch": 
0.0884160756501182, + "grad_norm": 3.872718572616577, + "learning_rate": 4.999410912582164e-06, + "loss": 0.812, + "step": 187 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.9184508323669434, + "learning_rate": 4.9993972930609976e-06, + "loss": 0.6823, + "step": 188 + }, + { + "epoch": 0.08936170212765958, + "grad_norm": 3.5567142963409424, + "learning_rate": 4.999383517913321e-06, + "loss": 0.7614, + "step": 189 + }, + { + "epoch": 0.08983451536643026, + "grad_norm": 3.3688533306121826, + "learning_rate": 4.999369587139992e-06, + "loss": 0.858, + "step": 190 + }, + { + "epoch": 0.09030732860520095, + "grad_norm": 2.893223524093628, + "learning_rate": 4.99935550074188e-06, + "loss": 0.6761, + "step": 191 + }, + { + "epoch": 0.09078014184397164, + "grad_norm": 3.400225877761841, + "learning_rate": 4.999341258719859e-06, + "loss": 0.7531, + "step": 192 + }, + { + "epoch": 0.09125295508274232, + "grad_norm": 3.6167714595794678, + "learning_rate": 4.999326861074817e-06, + "loss": 0.8164, + "step": 193 + }, + { + "epoch": 0.091725768321513, + "grad_norm": 4.325016498565674, + "learning_rate": 4.9993123078076506e-06, + "loss": 0.7069, + "step": 194 + }, + { + "epoch": 0.09219858156028368, + "grad_norm": 3.195317029953003, + "learning_rate": 4.999297598919266e-06, + "loss": 0.726, + "step": 195 + }, + { + "epoch": 0.09267139479905437, + "grad_norm": 3.146530866622925, + "learning_rate": 4.999282734410579e-06, + "loss": 0.7888, + "step": 196 + }, + { + "epoch": 0.09314420803782505, + "grad_norm": 3.5166752338409424, + "learning_rate": 4.999267714282515e-06, + "loss": 0.8473, + "step": 197 + }, + { + "epoch": 0.09361702127659574, + "grad_norm": 3.3140196800231934, + "learning_rate": 4.99925253853601e-06, + "loss": 0.7233, + "step": 198 + }, + { + "epoch": 0.09408983451536643, + "grad_norm": 3.0318164825439453, + "learning_rate": 4.999237207172008e-06, + "loss": 0.7543, + "step": 199 + }, + { + "epoch": 0.09456264775413711, + "grad_norm": 3.662214756011963, + "learning_rate": 4.999221720191464e-06, + "loss": 0.7783, + "step": 200 + }, + { + "epoch": 0.0950354609929078, + "grad_norm": 3.452078104019165, + "learning_rate": 4.9992060775953425e-06, + "loss": 0.7868, + "step": 201 + }, + { + "epoch": 0.09550827423167849, + "grad_norm": 3.4051287174224854, + "learning_rate": 4.999190279384617e-06, + "loss": 0.7849, + "step": 202 + }, + { + "epoch": 0.09598108747044917, + "grad_norm": 3.1377196311950684, + "learning_rate": 4.999174325560271e-06, + "loss": 0.8364, + "step": 203 + }, + { + "epoch": 0.09645390070921986, + "grad_norm": 3.129473924636841, + "learning_rate": 4.999158216123299e-06, + "loss": 0.7458, + "step": 204 + }, + { + "epoch": 0.09692671394799054, + "grad_norm": 3.169548749923706, + "learning_rate": 4.999141951074703e-06, + "loss": 0.7256, + "step": 205 + }, + { + "epoch": 0.09739952718676123, + "grad_norm": 3.186009168624878, + "learning_rate": 4.999125530415495e-06, + "loss": 0.783, + "step": 206 + }, + { + "epoch": 0.09787234042553192, + "grad_norm": 3.0995123386383057, + "learning_rate": 4.9991089541467e-06, + "loss": 0.7519, + "step": 207 + }, + { + "epoch": 0.0983451536643026, + "grad_norm": 3.1854088306427, + "learning_rate": 4.999092222269348e-06, + "loss": 0.7444, + "step": 208 + }, + { + "epoch": 0.09881796690307329, + "grad_norm": 3.1512246131896973, + "learning_rate": 4.999075334784482e-06, + "loss": 0.7882, + "step": 209 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 3.6199698448181152, + "learning_rate": 4.999058291693153e-06, + "loss": 0.8048, + "step": 
210 + }, + { + "epoch": 0.09976359338061466, + "grad_norm": 2.956907272338867, + "learning_rate": 4.999041092996422e-06, + "loss": 0.7663, + "step": 211 + }, + { + "epoch": 0.10023640661938535, + "grad_norm": 3.3493971824645996, + "learning_rate": 4.99902373869536e-06, + "loss": 0.7639, + "step": 212 + }, + { + "epoch": 0.10070921985815603, + "grad_norm": 3.144812822341919, + "learning_rate": 4.9990062287910475e-06, + "loss": 0.7953, + "step": 213 + }, + { + "epoch": 0.10118203309692671, + "grad_norm": 3.5986971855163574, + "learning_rate": 4.998988563284576e-06, + "loss": 0.8297, + "step": 214 + }, + { + "epoch": 0.1016548463356974, + "grad_norm": 3.447584867477417, + "learning_rate": 4.998970742177044e-06, + "loss": 0.808, + "step": 215 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 3.791353940963745, + "learning_rate": 4.998952765469562e-06, + "loss": 0.8005, + "step": 216 + }, + { + "epoch": 0.10260047281323877, + "grad_norm": 3.4490807056427, + "learning_rate": 4.998934633163247e-06, + "loss": 0.8135, + "step": 217 + }, + { + "epoch": 0.10307328605200945, + "grad_norm": 3.1053314208984375, + "learning_rate": 4.998916345259232e-06, + "loss": 0.7888, + "step": 218 + }, + { + "epoch": 0.10354609929078014, + "grad_norm": 3.407862663269043, + "learning_rate": 4.9988979017586514e-06, + "loss": 0.7099, + "step": 219 + }, + { + "epoch": 0.10401891252955082, + "grad_norm": 3.116656541824341, + "learning_rate": 4.998879302662658e-06, + "loss": 0.8344, + "step": 220 + }, + { + "epoch": 0.10449172576832151, + "grad_norm": 3.339264154434204, + "learning_rate": 4.998860547972406e-06, + "loss": 0.8496, + "step": 221 + }, + { + "epoch": 0.1049645390070922, + "grad_norm": 3.251892566680908, + "learning_rate": 4.998841637689066e-06, + "loss": 0.7455, + "step": 222 + }, + { + "epoch": 0.10543735224586288, + "grad_norm": 4.098135471343994, + "learning_rate": 4.998822571813814e-06, + "loss": 0.7772, + "step": 223 + }, + { + "epoch": 0.10591016548463357, + "grad_norm": 3.9871134757995605, + "learning_rate": 4.998803350347837e-06, + "loss": 0.8261, + "step": 224 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 3.2822303771972656, + "learning_rate": 4.998783973292333e-06, + "loss": 0.8623, + "step": 225 + }, + { + "epoch": 0.10685579196217494, + "grad_norm": 3.0356857776641846, + "learning_rate": 4.998764440648507e-06, + "loss": 0.7426, + "step": 226 + }, + { + "epoch": 0.10732860520094563, + "grad_norm": 2.8932785987854004, + "learning_rate": 4.998744752417576e-06, + "loss": 0.6741, + "step": 227 + }, + { + "epoch": 0.10780141843971631, + "grad_norm": 3.085820436477661, + "learning_rate": 4.998724908600767e-06, + "loss": 0.6549, + "step": 228 + }, + { + "epoch": 0.108274231678487, + "grad_norm": 3.135829210281372, + "learning_rate": 4.998704909199314e-06, + "loss": 0.6702, + "step": 229 + }, + { + "epoch": 0.10874704491725769, + "grad_norm": 5.016134262084961, + "learning_rate": 4.9986847542144625e-06, + "loss": 0.7852, + "step": 230 + }, + { + "epoch": 0.10921985815602837, + "grad_norm": 3.9056200981140137, + "learning_rate": 4.998664443647468e-06, + "loss": 0.9654, + "step": 231 + }, + { + "epoch": 0.10969267139479906, + "grad_norm": 3.0880749225616455, + "learning_rate": 4.998643977499595e-06, + "loss": 0.7579, + "step": 232 + }, + { + "epoch": 0.11016548463356975, + "grad_norm": 3.6893601417541504, + "learning_rate": 4.998623355772118e-06, + "loss": 0.713, + "step": 233 + }, + { + "epoch": 0.11063829787234042, + "grad_norm": 4.181536674499512, + "learning_rate": 4.998602578466319e-06, + 
"loss": 0.7331, + "step": 234 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 3.036386728286743, + "learning_rate": 4.998581645583496e-06, + "loss": 0.7115, + "step": 235 + }, + { + "epoch": 0.11158392434988179, + "grad_norm": 3.6333255767822266, + "learning_rate": 4.998560557124948e-06, + "loss": 0.7544, + "step": 236 + }, + { + "epoch": 0.11205673758865248, + "grad_norm": 2.926417827606201, + "learning_rate": 4.9985393130919915e-06, + "loss": 0.715, + "step": 237 + }, + { + "epoch": 0.11252955082742316, + "grad_norm": 2.969158172607422, + "learning_rate": 4.998517913485946e-06, + "loss": 0.7304, + "step": 238 + }, + { + "epoch": 0.11300236406619385, + "grad_norm": 3.5254971981048584, + "learning_rate": 4.9984963583081466e-06, + "loss": 0.7725, + "step": 239 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 3.7840335369110107, + "learning_rate": 4.998474647559936e-06, + "loss": 0.8685, + "step": 240 + }, + { + "epoch": 0.11394799054373522, + "grad_norm": 3.0333125591278076, + "learning_rate": 4.9984527812426625e-06, + "loss": 0.7793, + "step": 241 + }, + { + "epoch": 0.11442080378250591, + "grad_norm": 3.290159225463867, + "learning_rate": 4.99843075935769e-06, + "loss": 0.7158, + "step": 242 + }, + { + "epoch": 0.1148936170212766, + "grad_norm": 3.3935494422912598, + "learning_rate": 4.99840858190639e-06, + "loss": 0.7643, + "step": 243 + }, + { + "epoch": 0.11536643026004728, + "grad_norm": 3.333965539932251, + "learning_rate": 4.998386248890142e-06, + "loss": 0.7255, + "step": 244 + }, + { + "epoch": 0.11583924349881797, + "grad_norm": 2.8129613399505615, + "learning_rate": 4.998363760310339e-06, + "loss": 0.768, + "step": 245 + }, + { + "epoch": 0.11631205673758865, + "grad_norm": 2.8678107261657715, + "learning_rate": 4.998341116168378e-06, + "loss": 0.7403, + "step": 246 + }, + { + "epoch": 0.11678486997635934, + "grad_norm": 2.8898239135742188, + "learning_rate": 4.998318316465672e-06, + "loss": 0.6844, + "step": 247 + }, + { + "epoch": 0.11725768321513003, + "grad_norm": 3.139777898788452, + "learning_rate": 4.998295361203637e-06, + "loss": 0.7936, + "step": 248 + }, + { + "epoch": 0.11773049645390071, + "grad_norm": 3.393721103668213, + "learning_rate": 4.998272250383707e-06, + "loss": 0.8173, + "step": 249 + }, + { + "epoch": 0.1182033096926714, + "grad_norm": 3.240973949432373, + "learning_rate": 4.998248984007318e-06, + "loss": 0.8252, + "step": 250 + }, + { + "epoch": 0.11867612293144209, + "grad_norm": 3.384855031967163, + "learning_rate": 4.998225562075918e-06, + "loss": 0.7244, + "step": 251 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 3.1881816387176514, + "learning_rate": 4.9982019845909675e-06, + "loss": 0.6818, + "step": 252 + }, + { + "epoch": 0.11962174940898346, + "grad_norm": 2.888364553451538, + "learning_rate": 4.998178251553934e-06, + "loss": 0.6753, + "step": 253 + }, + { + "epoch": 0.12009456264775414, + "grad_norm": 3.630093812942505, + "learning_rate": 4.9981543629662944e-06, + "loss": 0.7995, + "step": 254 + }, + { + "epoch": 0.12056737588652482, + "grad_norm": 2.9820947647094727, + "learning_rate": 4.998130318829537e-06, + "loss": 0.7478, + "step": 255 + }, + { + "epoch": 0.1210401891252955, + "grad_norm": 2.7094738483428955, + "learning_rate": 4.998106119145159e-06, + "loss": 0.7237, + "step": 256 + }, + { + "epoch": 0.12151300236406619, + "grad_norm": 3.1808104515075684, + "learning_rate": 4.9980817639146665e-06, + "loss": 0.7915, + "step": 257 + }, + { + "epoch": 0.12198581560283688, + "grad_norm": 3.1661291122436523, + 
"learning_rate": 4.998057253139575e-06, + "loss": 0.8053, + "step": 258 + }, + { + "epoch": 0.12245862884160756, + "grad_norm": 3.528749942779541, + "learning_rate": 4.998032586821413e-06, + "loss": 0.7946, + "step": 259 + }, + { + "epoch": 0.12293144208037825, + "grad_norm": 3.125964879989624, + "learning_rate": 4.998007764961716e-06, + "loss": 0.7569, + "step": 260 + }, + { + "epoch": 0.12340425531914893, + "grad_norm": 3.0778942108154297, + "learning_rate": 4.997982787562029e-06, + "loss": 0.7184, + "step": 261 + }, + { + "epoch": 0.12387706855791962, + "grad_norm": 3.3531930446624756, + "learning_rate": 4.997957654623906e-06, + "loss": 0.7586, + "step": 262 + }, + { + "epoch": 0.1243498817966903, + "grad_norm": 3.229278564453125, + "learning_rate": 4.997932366148913e-06, + "loss": 0.6092, + "step": 263 + }, + { + "epoch": 0.12482269503546099, + "grad_norm": 3.7286155223846436, + "learning_rate": 4.997906922138626e-06, + "loss": 0.7965, + "step": 264 + }, + { + "epoch": 0.12529550827423167, + "grad_norm": 3.300311803817749, + "learning_rate": 4.997881322594628e-06, + "loss": 0.7665, + "step": 265 + }, + { + "epoch": 0.12576832151300235, + "grad_norm": 3.411482572555542, + "learning_rate": 4.9978555675185115e-06, + "loss": 0.7253, + "step": 266 + }, + { + "epoch": 0.12624113475177304, + "grad_norm": 3.0884511470794678, + "learning_rate": 4.9978296569118825e-06, + "loss": 0.659, + "step": 267 + }, + { + "epoch": 0.12671394799054372, + "grad_norm": 3.0652925968170166, + "learning_rate": 4.9978035907763535e-06, + "loss": 0.6739, + "step": 268 + }, + { + "epoch": 0.1271867612293144, + "grad_norm": 3.280555009841919, + "learning_rate": 4.997777369113547e-06, + "loss": 0.8003, + "step": 269 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 2.980860948562622, + "learning_rate": 4.997750991925096e-06, + "loss": 0.7097, + "step": 270 + }, + { + "epoch": 0.12813238770685578, + "grad_norm": 3.301760673522949, + "learning_rate": 4.997724459212644e-06, + "loss": 0.7894, + "step": 271 + }, + { + "epoch": 0.12860520094562647, + "grad_norm": 2.9584903717041016, + "learning_rate": 4.997697770977841e-06, + "loss": 0.733, + "step": 272 + }, + { + "epoch": 0.12907801418439716, + "grad_norm": 3.5632214546203613, + "learning_rate": 4.99767092722235e-06, + "loss": 0.7228, + "step": 273 + }, + { + "epoch": 0.12955082742316784, + "grad_norm": 3.5900983810424805, + "learning_rate": 4.997643927947843e-06, + "loss": 0.7634, + "step": 274 + }, + { + "epoch": 0.13002364066193853, + "grad_norm": 3.332650661468506, + "learning_rate": 4.997616773156e-06, + "loss": 0.797, + "step": 275 + }, + { + "epoch": 0.13049645390070921, + "grad_norm": 3.1094167232513428, + "learning_rate": 4.997589462848512e-06, + "loss": 0.7849, + "step": 276 + }, + { + "epoch": 0.1309692671394799, + "grad_norm": 3.5359463691711426, + "learning_rate": 4.99756199702708e-06, + "loss": 0.6871, + "step": 277 + }, + { + "epoch": 0.1314420803782506, + "grad_norm": 3.190441846847534, + "learning_rate": 4.997534375693414e-06, + "loss": 0.6883, + "step": 278 + }, + { + "epoch": 0.13191489361702127, + "grad_norm": 3.063518762588501, + "learning_rate": 4.997506598849234e-06, + "loss": 0.7586, + "step": 279 + }, + { + "epoch": 0.13238770685579196, + "grad_norm": 3.4112050533294678, + "learning_rate": 4.997478666496269e-06, + "loss": 0.796, + "step": 280 + }, + { + "epoch": 0.13286052009456265, + "grad_norm": 3.231886386871338, + "learning_rate": 4.997450578636259e-06, + "loss": 0.7714, + "step": 281 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 
3.279425621032715, + "learning_rate": 4.9974223352709515e-06, + "loss": 0.7793, + "step": 282 + }, + { + "epoch": 0.13380614657210402, + "grad_norm": 3.2154316902160645, + "learning_rate": 4.9973939364021075e-06, + "loss": 0.791, + "step": 283 + }, + { + "epoch": 0.1342789598108747, + "grad_norm": 3.2090768814086914, + "learning_rate": 4.9973653820314925e-06, + "loss": 0.6433, + "step": 284 + }, + { + "epoch": 0.1347517730496454, + "grad_norm": 3.1712026596069336, + "learning_rate": 4.997336672160886e-06, + "loss": 0.8128, + "step": 285 + }, + { + "epoch": 0.13522458628841608, + "grad_norm": 2.929229497909546, + "learning_rate": 4.997307806792076e-06, + "loss": 0.7594, + "step": 286 + }, + { + "epoch": 0.13569739952718676, + "grad_norm": 3.0363314151763916, + "learning_rate": 4.997278785926859e-06, + "loss": 0.7336, + "step": 287 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 3.1352357864379883, + "learning_rate": 4.997249609567042e-06, + "loss": 0.7225, + "step": 288 + }, + { + "epoch": 0.13664302600472814, + "grad_norm": 3.3171157836914062, + "learning_rate": 4.997220277714442e-06, + "loss": 0.7777, + "step": 289 + }, + { + "epoch": 0.13711583924349882, + "grad_norm": 3.050717353820801, + "learning_rate": 4.997190790370885e-06, + "loss": 0.6836, + "step": 290 + }, + { + "epoch": 0.1375886524822695, + "grad_norm": 3.0297694206237793, + "learning_rate": 4.997161147538208e-06, + "loss": 0.6883, + "step": 291 + }, + { + "epoch": 0.1380614657210402, + "grad_norm": 3.0566554069519043, + "learning_rate": 4.997131349218256e-06, + "loss": 0.6674, + "step": 292 + }, + { + "epoch": 0.13853427895981088, + "grad_norm": 3.799111843109131, + "learning_rate": 4.997101395412885e-06, + "loss": 0.8256, + "step": 293 + }, + { + "epoch": 0.13900709219858157, + "grad_norm": 3.1394248008728027, + "learning_rate": 4.9970712861239576e-06, + "loss": 0.7306, + "step": 294 + }, + { + "epoch": 0.13947990543735225, + "grad_norm": 3.0605666637420654, + "learning_rate": 4.997041021353352e-06, + "loss": 0.7212, + "step": 295 + }, + { + "epoch": 0.13995271867612294, + "grad_norm": 3.8813397884368896, + "learning_rate": 4.997010601102951e-06, + "loss": 0.769, + "step": 296 + }, + { + "epoch": 0.14042553191489363, + "grad_norm": 3.0514819622039795, + "learning_rate": 4.996980025374649e-06, + "loss": 0.7422, + "step": 297 + }, + { + "epoch": 0.1408983451536643, + "grad_norm": 2.9544146060943604, + "learning_rate": 4.99694929417035e-06, + "loss": 0.6912, + "step": 298 + }, + { + "epoch": 0.141371158392435, + "grad_norm": 3.2635602951049805, + "learning_rate": 4.996918407491966e-06, + "loss": 0.7395, + "step": 299 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 3.373882532119751, + "learning_rate": 4.996887365341423e-06, + "loss": 0.7799, + "step": 300 + }, + { + "epoch": 0.14231678486997637, + "grad_norm": 3.001128673553467, + "learning_rate": 4.996856167720652e-06, + "loss": 0.7168, + "step": 301 + }, + { + "epoch": 0.14278959810874706, + "grad_norm": 3.1026835441589355, + "learning_rate": 4.996824814631595e-06, + "loss": 0.7492, + "step": 302 + }, + { + "epoch": 0.14326241134751774, + "grad_norm": 3.41947603225708, + "learning_rate": 4.996793306076205e-06, + "loss": 0.6659, + "step": 303 + }, + { + "epoch": 0.14373522458628843, + "grad_norm": 3.2272400856018066, + "learning_rate": 4.996761642056444e-06, + "loss": 0.7184, + "step": 304 + }, + { + "epoch": 0.14420803782505912, + "grad_norm": 2.9488935470581055, + "learning_rate": 4.996729822574284e-06, + "loss": 0.7451, + "step": 305 + }, + { + "epoch": 
0.14468085106382977, + "grad_norm": 3.268231153488159, + "learning_rate": 4.9966978476317065e-06, + "loss": 0.7798, + "step": 306 + }, + { + "epoch": 0.14515366430260046, + "grad_norm": 3.9086556434631348, + "learning_rate": 4.996665717230701e-06, + "loss": 0.7871, + "step": 307 + }, + { + "epoch": 0.14562647754137115, + "grad_norm": 3.3483879566192627, + "learning_rate": 4.996633431373269e-06, + "loss": 0.7415, + "step": 308 + }, + { + "epoch": 0.14609929078014183, + "grad_norm": 2.839400053024292, + "learning_rate": 4.99660099006142e-06, + "loss": 0.7192, + "step": 309 + }, + { + "epoch": 0.14657210401891252, + "grad_norm": 3.177302598953247, + "learning_rate": 4.996568393297175e-06, + "loss": 0.755, + "step": 310 + }, + { + "epoch": 0.1470449172576832, + "grad_norm": 3.5477044582366943, + "learning_rate": 4.996535641082563e-06, + "loss": 0.7531, + "step": 311 + }, + { + "epoch": 0.1475177304964539, + "grad_norm": 3.418576717376709, + "learning_rate": 4.996502733419624e-06, + "loss": 0.8009, + "step": 312 + }, + { + "epoch": 0.14799054373522458, + "grad_norm": 3.711341619491577, + "learning_rate": 4.996469670310407e-06, + "loss": 0.7362, + "step": 313 + }, + { + "epoch": 0.14846335697399526, + "grad_norm": 3.2419373989105225, + "learning_rate": 4.99643645175697e-06, + "loss": 0.7761, + "step": 314 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 3.121858835220337, + "learning_rate": 4.996403077761381e-06, + "loss": 0.6495, + "step": 315 + }, + { + "epoch": 0.14940898345153664, + "grad_norm": 3.123054265975952, + "learning_rate": 4.996369548325719e-06, + "loss": 0.7444, + "step": 316 + }, + { + "epoch": 0.14988179669030732, + "grad_norm": 2.780880928039551, + "learning_rate": 4.996335863452072e-06, + "loss": 0.672, + "step": 317 + }, + { + "epoch": 0.150354609929078, + "grad_norm": 3.3738629817962646, + "learning_rate": 4.996302023142536e-06, + "loss": 0.7972, + "step": 318 + }, + { + "epoch": 0.1508274231678487, + "grad_norm": 3.4874777793884277, + "learning_rate": 4.99626802739922e-06, + "loss": 0.8252, + "step": 319 + }, + { + "epoch": 0.15130023640661938, + "grad_norm": 3.7074787616729736, + "learning_rate": 4.9962338762242395e-06, + "loss": 0.8216, + "step": 320 + }, + { + "epoch": 0.15177304964539007, + "grad_norm": 3.281912326812744, + "learning_rate": 4.996199569619721e-06, + "loss": 0.8175, + "step": 321 + }, + { + "epoch": 0.15224586288416075, + "grad_norm": 2.9485340118408203, + "learning_rate": 4.996165107587801e-06, + "loss": 0.707, + "step": 322 + }, + { + "epoch": 0.15271867612293144, + "grad_norm": 3.3757646083831787, + "learning_rate": 4.996130490130625e-06, + "loss": 0.7955, + "step": 323 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 2.962181568145752, + "learning_rate": 4.996095717250349e-06, + "loss": 0.7067, + "step": 324 + }, + { + "epoch": 0.1536643026004728, + "grad_norm": 3.114272356033325, + "learning_rate": 4.996060788949136e-06, + "loss": 0.7486, + "step": 325 + }, + { + "epoch": 0.1541371158392435, + "grad_norm": 3.0621590614318848, + "learning_rate": 4.996025705229165e-06, + "loss": 0.6547, + "step": 326 + }, + { + "epoch": 0.15460992907801419, + "grad_norm": 2.8745882511138916, + "learning_rate": 4.995990466092616e-06, + "loss": 0.6435, + "step": 327 + }, + { + "epoch": 0.15508274231678487, + "grad_norm": 2.90841007232666, + "learning_rate": 4.995955071541686e-06, + "loss": 0.7331, + "step": 328 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 2.694580316543579, + "learning_rate": 4.9959195215785784e-06, + "loss": 0.6731, + "step": 
329 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 3.158083438873291, + "learning_rate": 4.995883816205507e-06, + "loss": 0.7257, + "step": 330 + }, + { + "epoch": 0.15650118203309693, + "grad_norm": 3.3234715461730957, + "learning_rate": 4.995847955424694e-06, + "loss": 0.7389, + "step": 331 + }, + { + "epoch": 0.15697399527186762, + "grad_norm": 2.9406495094299316, + "learning_rate": 4.995811939238373e-06, + "loss": 0.643, + "step": 332 + }, + { + "epoch": 0.1574468085106383, + "grad_norm": 3.3191726207733154, + "learning_rate": 4.995775767648785e-06, + "loss": 0.7879, + "step": 333 + }, + { + "epoch": 0.157919621749409, + "grad_norm": 3.711925745010376, + "learning_rate": 4.995739440658185e-06, + "loss": 0.7586, + "step": 334 + }, + { + "epoch": 0.15839243498817968, + "grad_norm": 9.573421478271484, + "learning_rate": 4.995702958268833e-06, + "loss": 0.7842, + "step": 335 + }, + { + "epoch": 0.15886524822695036, + "grad_norm": 3.4154508113861084, + "learning_rate": 4.995666320483001e-06, + "loss": 0.6735, + "step": 336 + }, + { + "epoch": 0.15933806146572105, + "grad_norm": 3.4169859886169434, + "learning_rate": 4.995629527302971e-06, + "loss": 0.741, + "step": 337 + }, + { + "epoch": 0.15981087470449173, + "grad_norm": 3.287503242492676, + "learning_rate": 4.9955925787310335e-06, + "loss": 0.7139, + "step": 338 + }, + { + "epoch": 0.16028368794326242, + "grad_norm": 3.288409471511841, + "learning_rate": 4.995555474769488e-06, + "loss": 0.7636, + "step": 339 + }, + { + "epoch": 0.1607565011820331, + "grad_norm": 2.8021693229675293, + "learning_rate": 4.995518215420646e-06, + "loss": 0.5883, + "step": 340 + }, + { + "epoch": 0.1612293144208038, + "grad_norm": 2.7038564682006836, + "learning_rate": 4.995480800686827e-06, + "loss": 0.657, + "step": 341 + }, + { + "epoch": 0.16170212765957448, + "grad_norm": 3.2370235919952393, + "learning_rate": 4.9954432305703615e-06, + "loss": 0.6999, + "step": 342 + }, + { + "epoch": 0.16217494089834517, + "grad_norm": 2.8666412830352783, + "learning_rate": 4.995405505073588e-06, + "loss": 0.7199, + "step": 343 + }, + { + "epoch": 0.16264775413711585, + "grad_norm": 3.6467232704162598, + "learning_rate": 4.995367624198856e-06, + "loss": 0.7317, + "step": 344 + }, + { + "epoch": 0.16312056737588654, + "grad_norm": 2.7576327323913574, + "learning_rate": 4.9953295879485246e-06, + "loss": 0.647, + "step": 345 + }, + { + "epoch": 0.1635933806146572, + "grad_norm": 2.922232151031494, + "learning_rate": 4.995291396324959e-06, + "loss": 0.6686, + "step": 346 + }, + { + "epoch": 0.16406619385342788, + "grad_norm": 2.8693501949310303, + "learning_rate": 4.995253049330542e-06, + "loss": 0.6756, + "step": 347 + }, + { + "epoch": 0.16453900709219857, + "grad_norm": 3.671865701675415, + "learning_rate": 4.995214546967658e-06, + "loss": 0.7347, + "step": 348 + }, + { + "epoch": 0.16501182033096926, + "grad_norm": 3.024219274520874, + "learning_rate": 4.995175889238706e-06, + "loss": 0.7547, + "step": 349 + }, + { + "epoch": 0.16548463356973994, + "grad_norm": 2.8470778465270996, + "learning_rate": 4.995137076146091e-06, + "loss": 0.6764, + "step": 350 + }, + { + "epoch": 0.16595744680851063, + "grad_norm": 2.905057907104492, + "learning_rate": 4.9950981076922324e-06, + "loss": 0.6814, + "step": 351 + }, + { + "epoch": 0.16643026004728131, + "grad_norm": 3.504377841949463, + "learning_rate": 4.995058983879555e-06, + "loss": 0.7145, + "step": 352 + }, + { + "epoch": 0.166903073286052, + "grad_norm": 3.0029661655426025, + "learning_rate": 4.995019704710495e-06, 
+ "loss": 0.7114, + "step": 353 + }, + { + "epoch": 0.1673758865248227, + "grad_norm": 2.8666274547576904, + "learning_rate": 4.994980270187499e-06, + "loss": 0.7416, + "step": 354 + }, + { + "epoch": 0.16784869976359337, + "grad_norm": 3.1644718647003174, + "learning_rate": 4.994940680313021e-06, + "loss": 0.661, + "step": 355 + }, + { + "epoch": 0.16832151300236406, + "grad_norm": 3.050391674041748, + "learning_rate": 4.994900935089527e-06, + "loss": 0.7243, + "step": 356 + }, + { + "epoch": 0.16879432624113475, + "grad_norm": 2.985466480255127, + "learning_rate": 4.994861034519491e-06, + "loss": 0.6917, + "step": 357 + }, + { + "epoch": 0.16926713947990543, + "grad_norm": 2.909342050552368, + "learning_rate": 4.9948209786053995e-06, + "loss": 0.6636, + "step": 358 + }, + { + "epoch": 0.16973995271867612, + "grad_norm": 3.2214784622192383, + "learning_rate": 4.9947807673497435e-06, + "loss": 0.7903, + "step": 359 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 2.5654983520507812, + "learning_rate": 4.994740400755029e-06, + "loss": 0.6129, + "step": 360 + }, + { + "epoch": 0.1706855791962175, + "grad_norm": 3.775646448135376, + "learning_rate": 4.99469987882377e-06, + "loss": 0.7145, + "step": 361 + }, + { + "epoch": 0.17115839243498818, + "grad_norm": 2.8965413570404053, + "learning_rate": 4.994659201558487e-06, + "loss": 0.7177, + "step": 362 + }, + { + "epoch": 0.17163120567375886, + "grad_norm": 3.485597848892212, + "learning_rate": 4.9946183689617146e-06, + "loss": 0.8107, + "step": 363 + }, + { + "epoch": 0.17210401891252955, + "grad_norm": 3.277839183807373, + "learning_rate": 4.994577381035995e-06, + "loss": 0.691, + "step": 364 + }, + { + "epoch": 0.17257683215130024, + "grad_norm": 2.8807685375213623, + "learning_rate": 4.99453623778388e-06, + "loss": 0.7627, + "step": 365 + }, + { + "epoch": 0.17304964539007092, + "grad_norm": 3.0659940242767334, + "learning_rate": 4.994494939207932e-06, + "loss": 0.6858, + "step": 366 + }, + { + "epoch": 0.1735224586288416, + "grad_norm": 3.0881855487823486, + "learning_rate": 4.994453485310723e-06, + "loss": 0.8212, + "step": 367 + }, + { + "epoch": 0.1739952718676123, + "grad_norm": 2.7199201583862305, + "learning_rate": 4.994411876094832e-06, + "loss": 0.6516, + "step": 368 + }, + { + "epoch": 0.17446808510638298, + "grad_norm": 2.955889940261841, + "learning_rate": 4.994370111562851e-06, + "loss": 0.6579, + "step": 369 + }, + { + "epoch": 0.17494089834515367, + "grad_norm": 3.1321663856506348, + "learning_rate": 4.994328191717382e-06, + "loss": 0.6891, + "step": 370 + }, + { + "epoch": 0.17541371158392435, + "grad_norm": 3.0560388565063477, + "learning_rate": 4.994286116561034e-06, + "loss": 0.7243, + "step": 371 + }, + { + "epoch": 0.17588652482269504, + "grad_norm": 3.1560704708099365, + "learning_rate": 4.994243886096425e-06, + "loss": 0.7262, + "step": 372 + }, + { + "epoch": 0.17635933806146573, + "grad_norm": 2.913541316986084, + "learning_rate": 4.994201500326187e-06, + "loss": 0.7318, + "step": 373 + }, + { + "epoch": 0.1768321513002364, + "grad_norm": 3.098376512527466, + "learning_rate": 4.994158959252958e-06, + "loss": 0.6419, + "step": 374 + }, + { + "epoch": 0.1773049645390071, + "grad_norm": 2.977508544921875, + "learning_rate": 4.994116262879387e-06, + "loss": 0.6709, + "step": 375 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 3.168186902999878, + "learning_rate": 4.994073411208133e-06, + "loss": 0.6608, + "step": 376 + }, + { + "epoch": 0.17825059101654847, + "grad_norm": 3.436844825744629, + 
"learning_rate": 4.994030404241864e-06, + "loss": 0.7227, + "step": 377 + }, + { + "epoch": 0.17872340425531916, + "grad_norm": 2.8998289108276367, + "learning_rate": 4.993987241983258e-06, + "loss": 0.6512, + "step": 378 + }, + { + "epoch": 0.17919621749408984, + "grad_norm": 3.407191514968872, + "learning_rate": 4.993943924435002e-06, + "loss": 0.616, + "step": 379 + }, + { + "epoch": 0.17966903073286053, + "grad_norm": 3.744858741760254, + "learning_rate": 4.993900451599793e-06, + "loss": 0.8599, + "step": 380 + }, + { + "epoch": 0.18014184397163122, + "grad_norm": 3.486283779144287, + "learning_rate": 4.993856823480338e-06, + "loss": 0.6634, + "step": 381 + }, + { + "epoch": 0.1806146572104019, + "grad_norm": 2.895719051361084, + "learning_rate": 4.993813040079355e-06, + "loss": 0.6972, + "step": 382 + }, + { + "epoch": 0.1810874704491726, + "grad_norm": 2.814133882522583, + "learning_rate": 4.993769101399569e-06, + "loss": 0.6271, + "step": 383 + }, + { + "epoch": 0.18156028368794327, + "grad_norm": 2.8609800338745117, + "learning_rate": 4.993725007443715e-06, + "loss": 0.6481, + "step": 384 + }, + { + "epoch": 0.18203309692671396, + "grad_norm": 3.2829644680023193, + "learning_rate": 4.99368075821454e-06, + "loss": 0.7999, + "step": 385 + }, + { + "epoch": 0.18250591016548465, + "grad_norm": 3.1417458057403564, + "learning_rate": 4.993636353714798e-06, + "loss": 0.6972, + "step": 386 + }, + { + "epoch": 0.1829787234042553, + "grad_norm": 3.0679385662078857, + "learning_rate": 4.993591793947256e-06, + "loss": 0.667, + "step": 387 + }, + { + "epoch": 0.183451536643026, + "grad_norm": 3.1387410163879395, + "learning_rate": 4.993547078914686e-06, + "loss": 0.7618, + "step": 388 + }, + { + "epoch": 0.18392434988179668, + "grad_norm": 2.9181406497955322, + "learning_rate": 4.993502208619872e-06, + "loss": 0.7391, + "step": 389 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 2.8952157497406006, + "learning_rate": 4.993457183065611e-06, + "loss": 0.6988, + "step": 390 + }, + { + "epoch": 0.18486997635933805, + "grad_norm": 3.2274813652038574, + "learning_rate": 4.993412002254704e-06, + "loss": 0.688, + "step": 391 + }, + { + "epoch": 0.18534278959810874, + "grad_norm": 3.4693779945373535, + "learning_rate": 4.993366666189965e-06, + "loss": 0.6634, + "step": 392 + }, + { + "epoch": 0.18581560283687942, + "grad_norm": 3.5358526706695557, + "learning_rate": 4.993321174874217e-06, + "loss": 0.7343, + "step": 393 + }, + { + "epoch": 0.1862884160756501, + "grad_norm": 3.013338088989258, + "learning_rate": 4.993275528310292e-06, + "loss": 0.7579, + "step": 394 + }, + { + "epoch": 0.1867612293144208, + "grad_norm": 2.694772720336914, + "learning_rate": 4.993229726501033e-06, + "loss": 0.718, + "step": 395 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 3.070612907409668, + "learning_rate": 4.9931837694492915e-06, + "loss": 0.6438, + "step": 396 + }, + { + "epoch": 0.18770685579196217, + "grad_norm": 2.9193027019500732, + "learning_rate": 4.993137657157928e-06, + "loss": 0.6788, + "step": 397 + }, + { + "epoch": 0.18817966903073285, + "grad_norm": 3.047682046890259, + "learning_rate": 4.993091389629816e-06, + "loss": 0.6826, + "step": 398 + }, + { + "epoch": 0.18865248226950354, + "grad_norm": 2.9629905223846436, + "learning_rate": 4.993044966867834e-06, + "loss": 0.7196, + "step": 399 + }, + { + "epoch": 0.18912529550827423, + "grad_norm": 3.0692050457000732, + "learning_rate": 4.992998388874874e-06, + "loss": 0.7015, + "step": 400 + }, + { + "epoch": 0.1895981087470449, + 
"grad_norm": 3.5427212715148926, + "learning_rate": 4.992951655653836e-06, + "loss": 0.8292, + "step": 401 + }, + { + "epoch": 0.1900709219858156, + "grad_norm": 2.643526554107666, + "learning_rate": 4.992904767207629e-06, + "loss": 0.624, + "step": 402 + }, + { + "epoch": 0.19054373522458629, + "grad_norm": 3.1185996532440186, + "learning_rate": 4.992857723539173e-06, + "loss": 0.7354, + "step": 403 + }, + { + "epoch": 0.19101654846335697, + "grad_norm": 3.006856679916382, + "learning_rate": 4.992810524651398e-06, + "loss": 0.7752, + "step": 404 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 2.9913275241851807, + "learning_rate": 4.9927631705472425e-06, + "loss": 0.7306, + "step": 405 + }, + { + "epoch": 0.19196217494089834, + "grad_norm": 2.6794071197509766, + "learning_rate": 4.992715661229655e-06, + "loss": 0.6136, + "step": 406 + }, + { + "epoch": 0.19243498817966903, + "grad_norm": 3.5933966636657715, + "learning_rate": 4.992667996701593e-06, + "loss": 0.7024, + "step": 407 + }, + { + "epoch": 0.19290780141843972, + "grad_norm": 2.862187623977661, + "learning_rate": 4.992620176966025e-06, + "loss": 0.692, + "step": 408 + }, + { + "epoch": 0.1933806146572104, + "grad_norm": 3.076845407485962, + "learning_rate": 4.9925722020259286e-06, + "loss": 0.7475, + "step": 409 + }, + { + "epoch": 0.1938534278959811, + "grad_norm": 3.372919797897339, + "learning_rate": 4.9925240718842895e-06, + "loss": 0.6886, + "step": 410 + }, + { + "epoch": 0.19432624113475178, + "grad_norm": 2.922977924346924, + "learning_rate": 4.992475786544108e-06, + "loss": 0.7049, + "step": 411 + }, + { + "epoch": 0.19479905437352246, + "grad_norm": 2.908034324645996, + "learning_rate": 4.992427346008387e-06, + "loss": 0.6498, + "step": 412 + }, + { + "epoch": 0.19527186761229315, + "grad_norm": 3.096723794937134, + "learning_rate": 4.992378750280144e-06, + "loss": 0.7151, + "step": 413 + }, + { + "epoch": 0.19574468085106383, + "grad_norm": 2.895237684249878, + "learning_rate": 4.992329999362405e-06, + "loss": 0.7277, + "step": 414 + }, + { + "epoch": 0.19621749408983452, + "grad_norm": 2.718230724334717, + "learning_rate": 4.9922810932582065e-06, + "loss": 0.6375, + "step": 415 + }, + { + "epoch": 0.1966903073286052, + "grad_norm": 3.187743663787842, + "learning_rate": 4.992232031970592e-06, + "loss": 0.6528, + "step": 416 + }, + { + "epoch": 0.1971631205673759, + "grad_norm": 2.996406316757202, + "learning_rate": 4.992182815502616e-06, + "loss": 0.6552, + "step": 417 + }, + { + "epoch": 0.19763593380614658, + "grad_norm": 3.301084041595459, + "learning_rate": 4.992133443857345e-06, + "loss": 0.7061, + "step": 418 + }, + { + "epoch": 0.19810874704491727, + "grad_norm": 3.7874677181243896, + "learning_rate": 4.992083917037853e-06, + "loss": 0.7859, + "step": 419 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 3.124253511428833, + "learning_rate": 4.992034235047222e-06, + "loss": 0.7615, + "step": 420 + }, + { + "epoch": 0.19905437352245864, + "grad_norm": 3.0488970279693604, + "learning_rate": 4.991984397888546e-06, + "loss": 0.6916, + "step": 421 + }, + { + "epoch": 0.19952718676122932, + "grad_norm": 3.1241321563720703, + "learning_rate": 4.991934405564929e-06, + "loss": 0.7055, + "step": 422 + }, + { + "epoch": 0.2, + "grad_norm": 3.396632432937622, + "learning_rate": 4.991884258079484e-06, + "loss": 0.7675, + "step": 423 + }, + { + "epoch": 0.2004728132387707, + "grad_norm": 3.7776873111724854, + "learning_rate": 4.9918339554353316e-06, + "loss": 0.7371, + "step": 424 + }, + { + "epoch": 
0.20094562647754138, + "grad_norm": 3.3356032371520996, + "learning_rate": 4.991783497635606e-06, + "loss": 0.6778, + "step": 425 + }, + { + "epoch": 0.20141843971631207, + "grad_norm": 2.988856792449951, + "learning_rate": 4.9917328846834474e-06, + "loss": 0.6795, + "step": 426 + }, + { + "epoch": 0.20189125295508276, + "grad_norm": 3.264183282852173, + "learning_rate": 4.99168211658201e-06, + "loss": 0.7707, + "step": 427 + }, + { + "epoch": 0.20236406619385341, + "grad_norm": 3.878068208694458, + "learning_rate": 4.991631193334451e-06, + "loss": 0.857, + "step": 428 + }, + { + "epoch": 0.2028368794326241, + "grad_norm": 3.6377553939819336, + "learning_rate": 4.991580114943943e-06, + "loss": 0.8033, + "step": 429 + }, + { + "epoch": 0.2033096926713948, + "grad_norm": 2.95393967628479, + "learning_rate": 4.991528881413667e-06, + "loss": 0.6809, + "step": 430 + }, + { + "epoch": 0.20378250591016547, + "grad_norm": 3.058704376220703, + "learning_rate": 4.9914774927468125e-06, + "loss": 0.6664, + "step": 431 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 2.7783217430114746, + "learning_rate": 4.9914259489465795e-06, + "loss": 0.6478, + "step": 432 + }, + { + "epoch": 0.20472813238770685, + "grad_norm": 2.4825217723846436, + "learning_rate": 4.991374250016177e-06, + "loss": 0.6598, + "step": 433 + }, + { + "epoch": 0.20520094562647753, + "grad_norm": 2.8753600120544434, + "learning_rate": 4.991322395958824e-06, + "loss": 0.6947, + "step": 434 + }, + { + "epoch": 0.20567375886524822, + "grad_norm": 3.2339367866516113, + "learning_rate": 4.99127038677775e-06, + "loss": 0.8201, + "step": 435 + }, + { + "epoch": 0.2061465721040189, + "grad_norm": 2.9065537452697754, + "learning_rate": 4.991218222476193e-06, + "loss": 0.6679, + "step": 436 + }, + { + "epoch": 0.2066193853427896, + "grad_norm": 3.283228874206543, + "learning_rate": 4.991165903057401e-06, + "loss": 0.8039, + "step": 437 + }, + { + "epoch": 0.20709219858156028, + "grad_norm": 3.429872751235962, + "learning_rate": 4.991113428524631e-06, + "loss": 0.7392, + "step": 438 + }, + { + "epoch": 0.20756501182033096, + "grad_norm": 3.118943452835083, + "learning_rate": 4.991060798881152e-06, + "loss": 0.6794, + "step": 439 + }, + { + "epoch": 0.20803782505910165, + "grad_norm": 3.395970106124878, + "learning_rate": 4.99100801413024e-06, + "loss": 0.6862, + "step": 440 + }, + { + "epoch": 0.20851063829787234, + "grad_norm": 2.869191884994507, + "learning_rate": 4.99095507427518e-06, + "loss": 0.6076, + "step": 441 + }, + { + "epoch": 0.20898345153664302, + "grad_norm": 3.1934661865234375, + "learning_rate": 4.990901979319272e-06, + "loss": 0.6927, + "step": 442 + }, + { + "epoch": 0.2094562647754137, + "grad_norm": 2.9068603515625, + "learning_rate": 4.990848729265819e-06, + "loss": 0.6864, + "step": 443 + }, + { + "epoch": 0.2099290780141844, + "grad_norm": 3.0535948276519775, + "learning_rate": 4.9907953241181375e-06, + "loss": 0.6396, + "step": 444 + }, + { + "epoch": 0.21040189125295508, + "grad_norm": 2.871511459350586, + "learning_rate": 4.990741763879554e-06, + "loss": 0.6743, + "step": 445 + }, + { + "epoch": 0.21087470449172577, + "grad_norm": 2.9184393882751465, + "learning_rate": 4.9906880485534015e-06, + "loss": 0.6786, + "step": 446 + }, + { + "epoch": 0.21134751773049645, + "grad_norm": 3.0628271102905273, + "learning_rate": 4.990634178143026e-06, + "loss": 0.6326, + "step": 447 + }, + { + "epoch": 0.21182033096926714, + "grad_norm": 3.7878305912017822, + "learning_rate": 4.990580152651782e-06, + "loss": 0.7944, + "step": 
448 + }, + { + "epoch": 0.21229314420803783, + "grad_norm": 2.8577189445495605, + "learning_rate": 4.990525972083031e-06, + "loss": 0.71, + "step": 449 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 3.307769775390625, + "learning_rate": 4.99047163644015e-06, + "loss": 0.6893, + "step": 450 + }, + { + "epoch": 0.2132387706855792, + "grad_norm": 2.7391717433929443, + "learning_rate": 4.990417145726519e-06, + "loss": 0.712, + "step": 451 + }, + { + "epoch": 0.21371158392434988, + "grad_norm": 2.938044786453247, + "learning_rate": 4.990362499945534e-06, + "loss": 0.7516, + "step": 452 + }, + { + "epoch": 0.21418439716312057, + "grad_norm": 2.7831056118011475, + "learning_rate": 4.990307699100595e-06, + "loss": 0.6168, + "step": 453 + }, + { + "epoch": 0.21465721040189126, + "grad_norm": 2.907977342605591, + "learning_rate": 4.990252743195116e-06, + "loss": 0.6706, + "step": 454 + }, + { + "epoch": 0.21513002364066194, + "grad_norm": 3.7882161140441895, + "learning_rate": 4.990197632232517e-06, + "loss": 0.6847, + "step": 455 + }, + { + "epoch": 0.21560283687943263, + "grad_norm": 2.899716854095459, + "learning_rate": 4.990142366216232e-06, + "loss": 0.6699, + "step": 456 + }, + { + "epoch": 0.21607565011820332, + "grad_norm": 2.907003879547119, + "learning_rate": 4.990086945149701e-06, + "loss": 0.6864, + "step": 457 + }, + { + "epoch": 0.216548463356974, + "grad_norm": 3.2407333850860596, + "learning_rate": 4.9900313690363736e-06, + "loss": 0.692, + "step": 458 + }, + { + "epoch": 0.2170212765957447, + "grad_norm": 2.9055583477020264, + "learning_rate": 4.989975637879712e-06, + "loss": 0.7113, + "step": 459 + }, + { + "epoch": 0.21749408983451538, + "grad_norm": 2.9836206436157227, + "learning_rate": 4.989919751683184e-06, + "loss": 0.6673, + "step": 460 + }, + { + "epoch": 0.21796690307328606, + "grad_norm": 3.371035575866699, + "learning_rate": 4.989863710450273e-06, + "loss": 0.7181, + "step": 461 + }, + { + "epoch": 0.21843971631205675, + "grad_norm": 2.9636635780334473, + "learning_rate": 4.989807514184465e-06, + "loss": 0.6082, + "step": 462 + }, + { + "epoch": 0.21891252955082743, + "grad_norm": 2.9634664058685303, + "learning_rate": 4.9897511628892615e-06, + "loss": 0.7086, + "step": 463 + }, + { + "epoch": 0.21938534278959812, + "grad_norm": 3.154763698577881, + "learning_rate": 4.98969465656817e-06, + "loss": 0.7027, + "step": 464 + }, + { + "epoch": 0.2198581560283688, + "grad_norm": 2.9959890842437744, + "learning_rate": 4.98963799522471e-06, + "loss": 0.6498, + "step": 465 + }, + { + "epoch": 0.2203309692671395, + "grad_norm": 3.5470590591430664, + "learning_rate": 4.989581178862408e-06, + "loss": 0.7199, + "step": 466 + }, + { + "epoch": 0.22080378250591018, + "grad_norm": 7.1873369216918945, + "learning_rate": 4.989524207484802e-06, + "loss": 0.6676, + "step": 467 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 3.1099541187286377, + "learning_rate": 4.98946708109544e-06, + "loss": 0.6785, + "step": 468 + }, + { + "epoch": 0.22174940898345152, + "grad_norm": 2.830991506576538, + "learning_rate": 4.9894097996978795e-06, + "loss": 0.6456, + "step": 469 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.0212316513061523, + "learning_rate": 4.989352363295687e-06, + "loss": 0.6048, + "step": 470 + }, + { + "epoch": 0.2226950354609929, + "grad_norm": 3.18776798248291, + "learning_rate": 4.989294771892437e-06, + "loss": 0.7078, + "step": 471 + }, + { + "epoch": 0.22316784869976358, + "grad_norm": 2.9972598552703857, + "learning_rate": 4.989237025491717e-06, + 
"loss": 0.7082, + "step": 472 + }, + { + "epoch": 0.22364066193853427, + "grad_norm": 3.4935688972473145, + "learning_rate": 4.989179124097123e-06, + "loss": 0.8199, + "step": 473 + }, + { + "epoch": 0.22411347517730495, + "grad_norm": 2.6485543251037598, + "learning_rate": 4.9891210677122595e-06, + "loss": 0.6371, + "step": 474 + }, + { + "epoch": 0.22458628841607564, + "grad_norm": 2.969233512878418, + "learning_rate": 4.989062856340742e-06, + "loss": 0.6879, + "step": 475 + }, + { + "epoch": 0.22505910165484633, + "grad_norm": 2.881875514984131, + "learning_rate": 4.989004489986194e-06, + "loss": 0.7415, + "step": 476 + }, + { + "epoch": 0.225531914893617, + "grad_norm": 2.624540090560913, + "learning_rate": 4.98894596865225e-06, + "loss": 0.6522, + "step": 477 + }, + { + "epoch": 0.2260047281323877, + "grad_norm": 3.61075496673584, + "learning_rate": 4.988887292342555e-06, + "loss": 0.7109, + "step": 478 + }, + { + "epoch": 0.2264775413711584, + "grad_norm": 2.9368972778320312, + "learning_rate": 4.988828461060762e-06, + "loss": 0.6843, + "step": 479 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 3.0670197010040283, + "learning_rate": 4.988769474810533e-06, + "loss": 0.6807, + "step": 480 + }, + { + "epoch": 0.22742316784869976, + "grad_norm": 2.9662792682647705, + "learning_rate": 4.988710333595542e-06, + "loss": 0.6796, + "step": 481 + }, + { + "epoch": 0.22789598108747045, + "grad_norm": 2.971235752105713, + "learning_rate": 4.988651037419472e-06, + "loss": 0.696, + "step": 482 + }, + { + "epoch": 0.22836879432624113, + "grad_norm": 2.931884527206421, + "learning_rate": 4.988591586286013e-06, + "loss": 0.7323, + "step": 483 + }, + { + "epoch": 0.22884160756501182, + "grad_norm": 2.8114213943481445, + "learning_rate": 4.988531980198868e-06, + "loss": 0.6584, + "step": 484 + }, + { + "epoch": 0.2293144208037825, + "grad_norm": 3.2785916328430176, + "learning_rate": 4.98847221916175e-06, + "loss": 0.7514, + "step": 485 + }, + { + "epoch": 0.2297872340425532, + "grad_norm": 3.0520215034484863, + "learning_rate": 4.988412303178377e-06, + "loss": 0.7564, + "step": 486 + }, + { + "epoch": 0.23026004728132388, + "grad_norm": 3.181002616882324, + "learning_rate": 4.988352232252483e-06, + "loss": 0.6768, + "step": 487 + }, + { + "epoch": 0.23073286052009456, + "grad_norm": 3.4953625202178955, + "learning_rate": 4.988292006387805e-06, + "loss": 0.7143, + "step": 488 + }, + { + "epoch": 0.23120567375886525, + "grad_norm": 3.326571226119995, + "learning_rate": 4.988231625588096e-06, + "loss": 0.7318, + "step": 489 + }, + { + "epoch": 0.23167848699763594, + "grad_norm": 3.09614634513855, + "learning_rate": 4.988171089857113e-06, + "loss": 0.6574, + "step": 490 + }, + { + "epoch": 0.23215130023640662, + "grad_norm": 2.7439446449279785, + "learning_rate": 4.9881103991986265e-06, + "loss": 0.6637, + "step": 491 + }, + { + "epoch": 0.2326241134751773, + "grad_norm": 3.0681190490722656, + "learning_rate": 4.988049553616416e-06, + "loss": 0.6326, + "step": 492 + }, + { + "epoch": 0.233096926713948, + "grad_norm": 3.0757341384887695, + "learning_rate": 4.98798855311427e-06, + "loss": 0.695, + "step": 493 + }, + { + "epoch": 0.23356973995271868, + "grad_norm": 2.8637635707855225, + "learning_rate": 4.987927397695985e-06, + "loss": 0.6598, + "step": 494 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 3.3641068935394287, + "learning_rate": 4.9878660873653715e-06, + "loss": 0.7435, + "step": 495 + }, + { + "epoch": 0.23451536643026005, + "grad_norm": 3.5025596618652344, + "learning_rate": 
4.987804622126245e-06, + "loss": 0.735, + "step": 496 + }, + { + "epoch": 0.23498817966903074, + "grad_norm": 2.9298837184906006, + "learning_rate": 4.987743001982434e-06, + "loss": 0.7063, + "step": 497 + }, + { + "epoch": 0.23546099290780143, + "grad_norm": 2.70358943939209, + "learning_rate": 4.987681226937774e-06, + "loss": 0.6799, + "step": 498 + }, + { + "epoch": 0.2359338061465721, + "grad_norm": 3.027871608734131, + "learning_rate": 4.9876192969961125e-06, + "loss": 0.6881, + "step": 499 + }, + { + "epoch": 0.2364066193853428, + "grad_norm": 3.362306594848633, + "learning_rate": 4.987557212161304e-06, + "loss": 0.7906, + "step": 500 + }, + { + "epoch": 0.23687943262411348, + "grad_norm": 3.3136050701141357, + "learning_rate": 4.987494972437217e-06, + "loss": 0.6878, + "step": 501 + }, + { + "epoch": 0.23735224586288417, + "grad_norm": 3.017089605331421, + "learning_rate": 4.9874325778277255e-06, + "loss": 0.7279, + "step": 502 + }, + { + "epoch": 0.23782505910165486, + "grad_norm": 2.8300516605377197, + "learning_rate": 4.987370028336714e-06, + "loss": 0.6864, + "step": 503 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 3.201860189437866, + "learning_rate": 4.987307323968077e-06, + "loss": 0.7531, + "step": 504 + }, + { + "epoch": 0.23877068557919623, + "grad_norm": 2.685396194458008, + "learning_rate": 4.987244464725721e-06, + "loss": 0.5849, + "step": 505 + }, + { + "epoch": 0.23924349881796692, + "grad_norm": 2.8715312480926514, + "learning_rate": 4.987181450613557e-06, + "loss": 0.675, + "step": 506 + }, + { + "epoch": 0.2397163120567376, + "grad_norm": 2.813908815383911, + "learning_rate": 4.987118281635511e-06, + "loss": 0.6841, + "step": 507 + }, + { + "epoch": 0.2401891252955083, + "grad_norm": 3.2738473415374756, + "learning_rate": 4.987054957795514e-06, + "loss": 0.7158, + "step": 508 + }, + { + "epoch": 0.24066193853427895, + "grad_norm": 2.896134376525879, + "learning_rate": 4.986991479097511e-06, + "loss": 0.7542, + "step": 509 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 3.0390403270721436, + "learning_rate": 4.986927845545454e-06, + "loss": 0.6733, + "step": 510 + }, + { + "epoch": 0.24160756501182032, + "grad_norm": 3.0300254821777344, + "learning_rate": 4.9868640571433044e-06, + "loss": 0.722, + "step": 511 + }, + { + "epoch": 0.242080378250591, + "grad_norm": 3.3037352561950684, + "learning_rate": 4.986800113895035e-06, + "loss": 0.6811, + "step": 512 + }, + { + "epoch": 0.2425531914893617, + "grad_norm": 3.0358474254608154, + "learning_rate": 4.986736015804627e-06, + "loss": 0.7348, + "step": 513 + }, + { + "epoch": 0.24302600472813238, + "grad_norm": 3.108792304992676, + "learning_rate": 4.986671762876071e-06, + "loss": 0.6096, + "step": 514 + }, + { + "epoch": 0.24349881796690306, + "grad_norm": 3.1316237449645996, + "learning_rate": 4.986607355113367e-06, + "loss": 0.6357, + "step": 515 + }, + { + "epoch": 0.24397163120567375, + "grad_norm": 3.3095219135284424, + "learning_rate": 4.986542792520528e-06, + "loss": 0.7515, + "step": 516 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 3.4775984287261963, + "learning_rate": 4.986478075101572e-06, + "loss": 0.7104, + "step": 517 + }, + { + "epoch": 0.24491725768321512, + "grad_norm": 3.341708183288574, + "learning_rate": 4.986413202860528e-06, + "loss": 0.7339, + "step": 518 + }, + { + "epoch": 0.2453900709219858, + "grad_norm": 2.9646966457366943, + "learning_rate": 4.986348175801438e-06, + "loss": 0.6032, + "step": 519 + }, + { + "epoch": 0.2458628841607565, + "grad_norm": 
3.1853902339935303, + "learning_rate": 4.986282993928349e-06, + "loss": 0.6925, + "step": 520 + }, + { + "epoch": 0.24633569739952718, + "grad_norm": 3.286909818649292, + "learning_rate": 4.98621765724532e-06, + "loss": 0.7447, + "step": 521 + }, + { + "epoch": 0.24680851063829787, + "grad_norm": 3.2255051136016846, + "learning_rate": 4.986152165756419e-06, + "loss": 0.7747, + "step": 522 + }, + { + "epoch": 0.24728132387706855, + "grad_norm": 3.002352237701416, + "learning_rate": 4.986086519465724e-06, + "loss": 0.6472, + "step": 523 + }, + { + "epoch": 0.24775413711583924, + "grad_norm": 3.4738974571228027, + "learning_rate": 4.986020718377322e-06, + "loss": 0.7381, + "step": 524 + }, + { + "epoch": 0.24822695035460993, + "grad_norm": 3.4470200538635254, + "learning_rate": 4.985954762495312e-06, + "loss": 0.6878, + "step": 525 + }, + { + "epoch": 0.2486997635933806, + "grad_norm": 2.9219350814819336, + "learning_rate": 4.985888651823799e-06, + "loss": 0.6317, + "step": 526 + }, + { + "epoch": 0.2491725768321513, + "grad_norm": 3.061767101287842, + "learning_rate": 4.985822386366899e-06, + "loss": 0.6842, + "step": 527 + }, + { + "epoch": 0.24964539007092199, + "grad_norm": 3.0291247367858887, + "learning_rate": 4.985755966128742e-06, + "loss": 0.6852, + "step": 528 + }, + { + "epoch": 0.25011820330969264, + "grad_norm": 2.964280843734741, + "learning_rate": 4.985689391113457e-06, + "loss": 0.7738, + "step": 529 + }, + { + "epoch": 0.25059101654846333, + "grad_norm": 3.058302164077759, + "learning_rate": 4.9856226613251955e-06, + "loss": 0.6677, + "step": 530 + }, + { + "epoch": 0.251063829787234, + "grad_norm": 3.345141649246216, + "learning_rate": 4.985555776768109e-06, + "loss": 0.7837, + "step": 531 + }, + { + "epoch": 0.2515366430260047, + "grad_norm": 3.565031051635742, + "learning_rate": 4.9854887374463636e-06, + "loss": 0.7231, + "step": 532 + }, + { + "epoch": 0.2520094562647754, + "grad_norm": 2.7953789234161377, + "learning_rate": 4.985421543364132e-06, + "loss": 0.6102, + "step": 533 + }, + { + "epoch": 0.2524822695035461, + "grad_norm": 2.887606620788574, + "learning_rate": 4.9853541945256e-06, + "loss": 0.6289, + "step": 534 + }, + { + "epoch": 0.25295508274231676, + "grad_norm": 3.1480495929718018, + "learning_rate": 4.985286690934961e-06, + "loss": 0.6348, + "step": 535 + }, + { + "epoch": 0.25342789598108745, + "grad_norm": 2.8912761211395264, + "learning_rate": 4.985219032596416e-06, + "loss": 0.595, + "step": 536 + }, + { + "epoch": 0.25390070921985813, + "grad_norm": 2.947936534881592, + "learning_rate": 4.98515121951418e-06, + "loss": 0.6196, + "step": 537 + }, + { + "epoch": 0.2543735224586288, + "grad_norm": 3.1085827350616455, + "learning_rate": 4.985083251692474e-06, + "loss": 0.6387, + "step": 538 + }, + { + "epoch": 0.2548463356973995, + "grad_norm": 3.1688334941864014, + "learning_rate": 4.985015129135531e-06, + "loss": 0.7055, + "step": 539 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 3.075042963027954, + "learning_rate": 4.984946851847593e-06, + "loss": 0.7515, + "step": 540 + }, + { + "epoch": 0.2557919621749409, + "grad_norm": 3.1933093070983887, + "learning_rate": 4.98487841983291e-06, + "loss": 0.7054, + "step": 541 + }, + { + "epoch": 0.25626477541371157, + "grad_norm": 3.043473958969116, + "learning_rate": 4.984809833095744e-06, + "loss": 0.6281, + "step": 542 + }, + { + "epoch": 0.25673758865248225, + "grad_norm": 3.0532584190368652, + "learning_rate": 4.9847410916403645e-06, + "loss": 0.6155, + "step": 543 + }, + { + "epoch": 
0.25721040189125294, + "grad_norm": 3.608480215072632, + "learning_rate": 4.984672195471053e-06, + "loss": 0.7363, + "step": 544 + }, + { + "epoch": 0.2576832151300236, + "grad_norm": 2.7491862773895264, + "learning_rate": 4.9846031445921e-06, + "loss": 0.6594, + "step": 545 + }, + { + "epoch": 0.2581560283687943, + "grad_norm": 2.8602418899536133, + "learning_rate": 4.984533939007802e-06, + "loss": 0.6742, + "step": 546 + }, + { + "epoch": 0.258628841607565, + "grad_norm": 3.1782007217407227, + "learning_rate": 4.98446457872247e-06, + "loss": 0.731, + "step": 547 + }, + { + "epoch": 0.2591016548463357, + "grad_norm": 2.796147584915161, + "learning_rate": 4.984395063740423e-06, + "loss": 0.6617, + "step": 548 + }, + { + "epoch": 0.25957446808510637, + "grad_norm": 2.8392202854156494, + "learning_rate": 4.984325394065991e-06, + "loss": 0.6753, + "step": 549 + }, + { + "epoch": 0.26004728132387706, + "grad_norm": 3.134672164916992, + "learning_rate": 4.984255569703508e-06, + "loss": 0.7222, + "step": 550 + }, + { + "epoch": 0.26052009456264774, + "grad_norm": 2.734330177307129, + "learning_rate": 4.984185590657325e-06, + "loss": 0.6098, + "step": 551 + }, + { + "epoch": 0.26099290780141843, + "grad_norm": 3.739010810852051, + "learning_rate": 4.984115456931798e-06, + "loss": 0.7457, + "step": 552 + }, + { + "epoch": 0.2614657210401891, + "grad_norm": 2.8412528038024902, + "learning_rate": 4.9840451685312925e-06, + "loss": 0.6972, + "step": 553 + }, + { + "epoch": 0.2619385342789598, + "grad_norm": 3.017395496368408, + "learning_rate": 4.983974725460188e-06, + "loss": 0.6887, + "step": 554 + }, + { + "epoch": 0.2624113475177305, + "grad_norm": 3.2746949195861816, + "learning_rate": 4.98390412772287e-06, + "loss": 0.7047, + "step": 555 + }, + { + "epoch": 0.2628841607565012, + "grad_norm": 3.1561965942382812, + "learning_rate": 4.983833375323732e-06, + "loss": 0.7726, + "step": 556 + }, + { + "epoch": 0.26335697399527186, + "grad_norm": 3.2367217540740967, + "learning_rate": 4.9837624682671816e-06, + "loss": 0.6348, + "step": 557 + }, + { + "epoch": 0.26382978723404255, + "grad_norm": 2.8195858001708984, + "learning_rate": 4.983691406557633e-06, + "loss": 0.6387, + "step": 558 + }, + { + "epoch": 0.26430260047281323, + "grad_norm": 3.349820852279663, + "learning_rate": 4.983620190199511e-06, + "loss": 0.6776, + "step": 559 + }, + { + "epoch": 0.2647754137115839, + "grad_norm": 2.8025588989257812, + "learning_rate": 4.98354881919725e-06, + "loss": 0.6512, + "step": 560 + }, + { + "epoch": 0.2652482269503546, + "grad_norm": 2.9125499725341797, + "learning_rate": 4.983477293555295e-06, + "loss": 0.7024, + "step": 561 + }, + { + "epoch": 0.2657210401891253, + "grad_norm": 3.3479275703430176, + "learning_rate": 4.983405613278098e-06, + "loss": 0.688, + "step": 562 + }, + { + "epoch": 0.266193853427896, + "grad_norm": 3.123971462249756, + "learning_rate": 4.983333778370123e-06, + "loss": 0.6743, + "step": 563 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.891625165939331, + "learning_rate": 4.983261788835843e-06, + "loss": 0.5971, + "step": 564 + }, + { + "epoch": 0.26713947990543735, + "grad_norm": 3.5066864490509033, + "learning_rate": 4.98318964467974e-06, + "loss": 0.6958, + "step": 565 + }, + { + "epoch": 0.26761229314420804, + "grad_norm": 2.570547342300415, + "learning_rate": 4.983117345906306e-06, + "loss": 0.609, + "step": 566 + }, + { + "epoch": 0.2680851063829787, + "grad_norm": 3.005106210708618, + "learning_rate": 4.983044892520044e-06, + "loss": 0.6791, + "step": 567 + }, + 
{ + "epoch": 0.2685579196217494, + "grad_norm": 3.429675340652466, + "learning_rate": 4.982972284525463e-06, + "loss": 0.6625, + "step": 568 + }, + { + "epoch": 0.2690307328605201, + "grad_norm": 3.825657367706299, + "learning_rate": 4.982899521927086e-06, + "loss": 0.6368, + "step": 569 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 2.8699095249176025, + "learning_rate": 4.982826604729443e-06, + "loss": 0.6425, + "step": 570 + }, + { + "epoch": 0.26997635933806147, + "grad_norm": 3.1688714027404785, + "learning_rate": 4.982753532937074e-06, + "loss": 0.6904, + "step": 571 + }, + { + "epoch": 0.27044917257683215, + "grad_norm": 3.3889992237091064, + "learning_rate": 4.98268030655453e-06, + "loss": 0.7575, + "step": 572 + }, + { + "epoch": 0.27092198581560284, + "grad_norm": 3.108315944671631, + "learning_rate": 4.982606925586367e-06, + "loss": 0.6648, + "step": 573 + }, + { + "epoch": 0.2713947990543735, + "grad_norm": 3.209831953048706, + "learning_rate": 4.982533390037159e-06, + "loss": 0.657, + "step": 574 + }, + { + "epoch": 0.2718676122931442, + "grad_norm": 3.1740927696228027, + "learning_rate": 4.982459699911482e-06, + "loss": 0.7262, + "step": 575 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 3.0190417766571045, + "learning_rate": 4.982385855213924e-06, + "loss": 0.6368, + "step": 576 + }, + { + "epoch": 0.2728132387706856, + "grad_norm": 3.05049467086792, + "learning_rate": 4.982311855949084e-06, + "loss": 0.72, + "step": 577 + }, + { + "epoch": 0.27328605200945627, + "grad_norm": 2.984816551208496, + "learning_rate": 4.98223770212157e-06, + "loss": 0.6856, + "step": 578 + }, + { + "epoch": 0.27375886524822696, + "grad_norm": 2.744969606399536, + "learning_rate": 4.982163393735998e-06, + "loss": 0.6023, + "step": 579 + }, + { + "epoch": 0.27423167848699764, + "grad_norm": 3.170564889907837, + "learning_rate": 4.982088930796996e-06, + "loss": 0.6678, + "step": 580 + }, + { + "epoch": 0.27470449172576833, + "grad_norm": 2.8686118125915527, + "learning_rate": 4.982014313309199e-06, + "loss": 0.6157, + "step": 581 + }, + { + "epoch": 0.275177304964539, + "grad_norm": 2.8768694400787354, + "learning_rate": 4.981939541277254e-06, + "loss": 0.6566, + "step": 582 + }, + { + "epoch": 0.2756501182033097, + "grad_norm": 2.621481418609619, + "learning_rate": 4.981864614705818e-06, + "loss": 0.7372, + "step": 583 + }, + { + "epoch": 0.2761229314420804, + "grad_norm": 3.527374267578125, + "learning_rate": 4.981789533599554e-06, + "loss": 0.6485, + "step": 584 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 3.3141074180603027, + "learning_rate": 4.981714297963138e-06, + "loss": 0.6816, + "step": 585 + }, + { + "epoch": 0.27706855791962176, + "grad_norm": 2.9247069358825684, + "learning_rate": 4.981638907801255e-06, + "loss": 0.7217, + "step": 586 + }, + { + "epoch": 0.27754137115839245, + "grad_norm": 2.875236749649048, + "learning_rate": 4.981563363118599e-06, + "loss": 0.6662, + "step": 587 + }, + { + "epoch": 0.27801418439716313, + "grad_norm": 2.9540364742279053, + "learning_rate": 4.981487663919874e-06, + "loss": 0.7225, + "step": 588 + }, + { + "epoch": 0.2784869976359338, + "grad_norm": 2.90889310836792, + "learning_rate": 4.981411810209793e-06, + "loss": 0.6054, + "step": 589 + }, + { + "epoch": 0.2789598108747045, + "grad_norm": 2.8541409969329834, + "learning_rate": 4.981335801993078e-06, + "loss": 0.6539, + "step": 590 + }, + { + "epoch": 0.2794326241134752, + "grad_norm": 3.1600730419158936, + "learning_rate": 4.981259639274465e-06, + "loss": 0.6415, + "step": 
591 + }, + { + "epoch": 0.2799054373522459, + "grad_norm": 3.569376230239868, + "learning_rate": 4.981183322058693e-06, + "loss": 0.6944, + "step": 592 + }, + { + "epoch": 0.28037825059101656, + "grad_norm": 3.067667007446289, + "learning_rate": 4.981106850350515e-06, + "loss": 0.7378, + "step": 593 + }, + { + "epoch": 0.28085106382978725, + "grad_norm": 3.082073450088501, + "learning_rate": 4.981030224154693e-06, + "loss": 0.693, + "step": 594 + }, + { + "epoch": 0.28132387706855794, + "grad_norm": 2.902932643890381, + "learning_rate": 4.980953443475998e-06, + "loss": 0.6549, + "step": 595 + }, + { + "epoch": 0.2817966903073286, + "grad_norm": 2.6821181774139404, + "learning_rate": 4.980876508319211e-06, + "loss": 0.6231, + "step": 596 + }, + { + "epoch": 0.2822695035460993, + "grad_norm": 3.1747355461120605, + "learning_rate": 4.9807994186891215e-06, + "loss": 0.6826, + "step": 597 + }, + { + "epoch": 0.28274231678487, + "grad_norm": 2.6975860595703125, + "learning_rate": 4.980722174590531e-06, + "loss": 0.6669, + "step": 598 + }, + { + "epoch": 0.2832151300236407, + "grad_norm": 2.924285650253296, + "learning_rate": 4.9806447760282486e-06, + "loss": 0.689, + "step": 599 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 2.941417694091797, + "learning_rate": 4.980567223007093e-06, + "loss": 0.6672, + "step": 600 + }, + { + "epoch": 0.28416075650118205, + "grad_norm": 2.8582186698913574, + "learning_rate": 4.980489515531892e-06, + "loss": 0.6229, + "step": 601 + }, + { + "epoch": 0.28463356973995274, + "grad_norm": 2.6462013721466064, + "learning_rate": 4.9804116536074865e-06, + "loss": 0.606, + "step": 602 + }, + { + "epoch": 0.2851063829787234, + "grad_norm": 2.9029998779296875, + "learning_rate": 4.980333637238723e-06, + "loss": 0.5915, + "step": 603 + }, + { + "epoch": 0.2855791962174941, + "grad_norm": 3.9359042644500732, + "learning_rate": 4.980255466430462e-06, + "loss": 0.7035, + "step": 604 + }, + { + "epoch": 0.2860520094562648, + "grad_norm": 3.200524091720581, + "learning_rate": 4.980177141187566e-06, + "loss": 0.7156, + "step": 605 + }, + { + "epoch": 0.2865248226950355, + "grad_norm": 3.1708686351776123, + "learning_rate": 4.980098661514916e-06, + "loss": 0.746, + "step": 606 + }, + { + "epoch": 0.28699763593380617, + "grad_norm": 2.8926830291748047, + "learning_rate": 4.980020027417397e-06, + "loss": 0.6282, + "step": 607 + }, + { + "epoch": 0.28747044917257686, + "grad_norm": 3.0526294708251953, + "learning_rate": 4.979941238899906e-06, + "loss": 0.6594, + "step": 608 + }, + { + "epoch": 0.28794326241134754, + "grad_norm": 2.9869306087493896, + "learning_rate": 4.9798622959673486e-06, + "loss": 0.7771, + "step": 609 + }, + { + "epoch": 0.28841607565011823, + "grad_norm": 2.7894513607025146, + "learning_rate": 4.979783198624638e-06, + "loss": 0.6819, + "step": 610 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 2.958575963973999, + "learning_rate": 4.9797039468767025e-06, + "loss": 0.6474, + "step": 611 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 3.423748016357422, + "learning_rate": 4.979624540728475e-06, + "loss": 0.7389, + "step": 612 + }, + { + "epoch": 0.28983451536643023, + "grad_norm": 2.9641635417938232, + "learning_rate": 4.9795449801849e-06, + "loss": 0.6005, + "step": 613 + }, + { + "epoch": 0.2903073286052009, + "grad_norm": 3.02274227142334, + "learning_rate": 4.979465265250933e-06, + "loss": 0.6358, + "step": 614 + }, + { + "epoch": 0.2907801418439716, + "grad_norm": 3.0562758445739746, + "learning_rate": 4.979385395931534e-06, + 
"loss": 0.6313, + "step": 615 + }, + { + "epoch": 0.2912529550827423, + "grad_norm": 3.301816701889038, + "learning_rate": 4.97930537223168e-06, + "loss": 0.7264, + "step": 616 + }, + { + "epoch": 0.291725768321513, + "grad_norm": 2.975360870361328, + "learning_rate": 4.979225194156351e-06, + "loss": 0.613, + "step": 617 + }, + { + "epoch": 0.29219858156028367, + "grad_norm": 2.9245030879974365, + "learning_rate": 4.97914486171054e-06, + "loss": 0.6646, + "step": 618 + }, + { + "epoch": 0.29267139479905435, + "grad_norm": 3.1336188316345215, + "learning_rate": 4.979064374899249e-06, + "loss": 0.6421, + "step": 619 + }, + { + "epoch": 0.29314420803782504, + "grad_norm": 3.6298763751983643, + "learning_rate": 4.978983733727491e-06, + "loss": 0.6433, + "step": 620 + }, + { + "epoch": 0.2936170212765957, + "grad_norm": 2.919597625732422, + "learning_rate": 4.9789029382002845e-06, + "loss": 0.6288, + "step": 621 + }, + { + "epoch": 0.2940898345153664, + "grad_norm": 3.2206127643585205, + "learning_rate": 4.978821988322662e-06, + "loss": 0.7102, + "step": 622 + }, + { + "epoch": 0.2945626477541371, + "grad_norm": 3.1767101287841797, + "learning_rate": 4.978740884099664e-06, + "loss": 0.6722, + "step": 623 + }, + { + "epoch": 0.2950354609929078, + "grad_norm": 3.3425452709198, + "learning_rate": 4.97865962553634e-06, + "loss": 0.6492, + "step": 624 + }, + { + "epoch": 0.29550827423167847, + "grad_norm": 3.0408358573913574, + "learning_rate": 4.97857821263775e-06, + "loss": 0.6522, + "step": 625 + }, + { + "epoch": 0.29598108747044916, + "grad_norm": 2.8144783973693848, + "learning_rate": 4.978496645408963e-06, + "loss": 0.7237, + "step": 626 + }, + { + "epoch": 0.29645390070921984, + "grad_norm": 3.7010560035705566, + "learning_rate": 4.978414923855057e-06, + "loss": 0.7509, + "step": 627 + }, + { + "epoch": 0.29692671394799053, + "grad_norm": 2.9438371658325195, + "learning_rate": 4.978333047981122e-06, + "loss": 0.6244, + "step": 628 + }, + { + "epoch": 0.2973995271867612, + "grad_norm": 3.285982370376587, + "learning_rate": 4.978251017792255e-06, + "loss": 0.7553, + "step": 629 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 3.7021138668060303, + "learning_rate": 4.978168833293564e-06, + "loss": 0.7859, + "step": 630 + }, + { + "epoch": 0.2983451536643026, + "grad_norm": 3.481858730316162, + "learning_rate": 4.9780864944901654e-06, + "loss": 0.7146, + "step": 631 + }, + { + "epoch": 0.2988179669030733, + "grad_norm": 3.693824529647827, + "learning_rate": 4.978004001387188e-06, + "loss": 0.6608, + "step": 632 + }, + { + "epoch": 0.29929078014184396, + "grad_norm": 3.0069146156311035, + "learning_rate": 4.9779213539897665e-06, + "loss": 0.6506, + "step": 633 + }, + { + "epoch": 0.29976359338061465, + "grad_norm": 3.037644147872925, + "learning_rate": 4.977838552303048e-06, + "loss": 0.6487, + "step": 634 + }, + { + "epoch": 0.30023640661938533, + "grad_norm": 3.018554449081421, + "learning_rate": 4.977755596332188e-06, + "loss": 0.6128, + "step": 635 + }, + { + "epoch": 0.300709219858156, + "grad_norm": 3.000312089920044, + "learning_rate": 4.977672486082351e-06, + "loss": 0.6431, + "step": 636 + }, + { + "epoch": 0.3011820330969267, + "grad_norm": 2.836803913116455, + "learning_rate": 4.977589221558713e-06, + "loss": 0.5914, + "step": 637 + }, + { + "epoch": 0.3016548463356974, + "grad_norm": 3.080469846725464, + "learning_rate": 4.977505802766457e-06, + "loss": 0.7265, + "step": 638 + }, + { + "epoch": 0.3021276595744681, + "grad_norm": 3.2245471477508545, + "learning_rate": 
4.97742222971078e-06, + "loss": 0.6895, + "step": 639 + }, + { + "epoch": 0.30260047281323876, + "grad_norm": 3.559006452560425, + "learning_rate": 4.977338502396882e-06, + "loss": 0.7439, + "step": 640 + }, + { + "epoch": 0.30307328605200945, + "grad_norm": 2.9116289615631104, + "learning_rate": 4.9772546208299795e-06, + "loss": 0.6907, + "step": 641 + }, + { + "epoch": 0.30354609929078014, + "grad_norm": 3.3645524978637695, + "learning_rate": 4.977170585015295e-06, + "loss": 0.6983, + "step": 642 + }, + { + "epoch": 0.3040189125295508, + "grad_norm": 3.080148458480835, + "learning_rate": 4.977086394958058e-06, + "loss": 0.7016, + "step": 643 + }, + { + "epoch": 0.3044917257683215, + "grad_norm": 2.9276750087738037, + "learning_rate": 4.977002050663515e-06, + "loss": 0.6509, + "step": 644 + }, + { + "epoch": 0.3049645390070922, + "grad_norm": 3.183609962463379, + "learning_rate": 4.976917552136914e-06, + "loss": 0.6814, + "step": 645 + }, + { + "epoch": 0.3054373522458629, + "grad_norm": 3.0980000495910645, + "learning_rate": 4.976832899383519e-06, + "loss": 0.6319, + "step": 646 + }, + { + "epoch": 0.30591016548463357, + "grad_norm": 3.211376190185547, + "learning_rate": 4.9767480924086e-06, + "loss": 0.6365, + "step": 647 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 3.214430093765259, + "learning_rate": 4.976663131217437e-06, + "loss": 0.6006, + "step": 648 + }, + { + "epoch": 0.30685579196217494, + "grad_norm": 3.0914318561553955, + "learning_rate": 4.976578015815321e-06, + "loss": 0.7162, + "step": 649 + }, + { + "epoch": 0.3073286052009456, + "grad_norm": 2.7644500732421875, + "learning_rate": 4.976492746207551e-06, + "loss": 0.6045, + "step": 650 + }, + { + "epoch": 0.3078014184397163, + "grad_norm": 3.1913280487060547, + "learning_rate": 4.9764073223994374e-06, + "loss": 0.6796, + "step": 651 + }, + { + "epoch": 0.308274231678487, + "grad_norm": 2.8919692039489746, + "learning_rate": 4.976321744396299e-06, + "loss": 0.6683, + "step": 652 + }, + { + "epoch": 0.3087470449172577, + "grad_norm": 2.862234115600586, + "learning_rate": 4.976236012203463e-06, + "loss": 0.6631, + "step": 653 + }, + { + "epoch": 0.30921985815602837, + "grad_norm": 2.9708092212677, + "learning_rate": 4.976150125826268e-06, + "loss": 0.6326, + "step": 654 + }, + { + "epoch": 0.30969267139479906, + "grad_norm": 2.892465353012085, + "learning_rate": 4.976064085270063e-06, + "loss": 0.6574, + "step": 655 + }, + { + "epoch": 0.31016548463356974, + "grad_norm": 3.9215126037597656, + "learning_rate": 4.975977890540205e-06, + "loss": 0.7351, + "step": 656 + }, + { + "epoch": 0.31063829787234043, + "grad_norm": 2.9544081687927246, + "learning_rate": 4.975891541642059e-06, + "loss": 0.7264, + "step": 657 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 2.995035409927368, + "learning_rate": 4.975805038581005e-06, + "loss": 0.7405, + "step": 658 + }, + { + "epoch": 0.3115839243498818, + "grad_norm": 2.9653120040893555, + "learning_rate": 4.975718381362427e-06, + "loss": 0.679, + "step": 659 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 2.93976092338562, + "learning_rate": 4.9756315699917205e-06, + "loss": 0.627, + "step": 660 + }, + { + "epoch": 0.3125295508274232, + "grad_norm": 3.106522560119629, + "learning_rate": 4.9755446044742915e-06, + "loss": 0.6329, + "step": 661 + }, + { + "epoch": 0.31300236406619386, + "grad_norm": 3.0238280296325684, + "learning_rate": 4.975457484815554e-06, + "loss": 0.6643, + "step": 662 + }, + { + "epoch": 0.31347517730496455, + "grad_norm": 2.943528175354004, + 
"learning_rate": 4.9753702110209356e-06, + "loss": 0.668, + "step": 663 + }, + { + "epoch": 0.31394799054373523, + "grad_norm": 2.6840121746063232, + "learning_rate": 4.9752827830958676e-06, + "loss": 0.5482, + "step": 664 + }, + { + "epoch": 0.3144208037825059, + "grad_norm": 2.823875904083252, + "learning_rate": 4.975195201045794e-06, + "loss": 0.7017, + "step": 665 + }, + { + "epoch": 0.3148936170212766, + "grad_norm": 3.148181200027466, + "learning_rate": 4.975107464876168e-06, + "loss": 0.747, + "step": 666 + }, + { + "epoch": 0.3153664302600473, + "grad_norm": 2.630584478378296, + "learning_rate": 4.9750195745924545e-06, + "loss": 0.5987, + "step": 667 + }, + { + "epoch": 0.315839243498818, + "grad_norm": 3.075866460800171, + "learning_rate": 4.974931530200124e-06, + "loss": 0.664, + "step": 668 + }, + { + "epoch": 0.31631205673758866, + "grad_norm": 2.947197914123535, + "learning_rate": 4.974843331704659e-06, + "loss": 0.631, + "step": 669 + }, + { + "epoch": 0.31678486997635935, + "grad_norm": 3.519646644592285, + "learning_rate": 4.974754979111552e-06, + "loss": 0.7154, + "step": 670 + }, + { + "epoch": 0.31725768321513004, + "grad_norm": 2.8687186241149902, + "learning_rate": 4.974666472426305e-06, + "loss": 0.6366, + "step": 671 + }, + { + "epoch": 0.3177304964539007, + "grad_norm": 2.6966612339019775, + "learning_rate": 4.974577811654426e-06, + "loss": 0.7112, + "step": 672 + }, + { + "epoch": 0.3182033096926714, + "grad_norm": 3.1390228271484375, + "learning_rate": 4.974488996801439e-06, + "loss": 0.6882, + "step": 673 + }, + { + "epoch": 0.3186761229314421, + "grad_norm": 3.4667599201202393, + "learning_rate": 4.974400027872871e-06, + "loss": 0.7153, + "step": 674 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 2.9632184505462646, + "learning_rate": 4.974310904874265e-06, + "loss": 0.7081, + "step": 675 + }, + { + "epoch": 0.31962174940898347, + "grad_norm": 3.46150279045105, + "learning_rate": 4.9742216278111666e-06, + "loss": 0.6242, + "step": 676 + }, + { + "epoch": 0.32009456264775416, + "grad_norm": 3.380403757095337, + "learning_rate": 4.974132196689137e-06, + "loss": 0.6863, + "step": 677 + }, + { + "epoch": 0.32056737588652484, + "grad_norm": 3.4279606342315674, + "learning_rate": 4.974042611513746e-06, + "loss": 0.6388, + "step": 678 + }, + { + "epoch": 0.3210401891252955, + "grad_norm": 2.634523391723633, + "learning_rate": 4.973952872290568e-06, + "loss": 0.6038, + "step": 679 + }, + { + "epoch": 0.3215130023640662, + "grad_norm": 3.19693922996521, + "learning_rate": 4.973862979025194e-06, + "loss": 0.6383, + "step": 680 + }, + { + "epoch": 0.3219858156028369, + "grad_norm": 3.437692165374756, + "learning_rate": 4.973772931723218e-06, + "loss": 0.7288, + "step": 681 + }, + { + "epoch": 0.3224586288416076, + "grad_norm": 2.506301164627075, + "learning_rate": 4.97368273039025e-06, + "loss": 0.5707, + "step": 682 + }, + { + "epoch": 0.3229314420803783, + "grad_norm": 3.0942845344543457, + "learning_rate": 4.9735923750319044e-06, + "loss": 0.6348, + "step": 683 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 3.0889835357666016, + "learning_rate": 4.973501865653809e-06, + "loss": 0.6697, + "step": 684 + }, + { + "epoch": 0.32387706855791965, + "grad_norm": 3.0391931533813477, + "learning_rate": 4.973411202261598e-06, + "loss": 0.7091, + "step": 685 + }, + { + "epoch": 0.32434988179669033, + "grad_norm": 3.0333497524261475, + "learning_rate": 4.973320384860917e-06, + "loss": 0.6403, + "step": 686 + }, + { + "epoch": 0.324822695035461, + "grad_norm": 
2.9714622497558594, + "learning_rate": 4.973229413457421e-06, + "loss": 0.6977, + "step": 687 + }, + { + "epoch": 0.3252955082742317, + "grad_norm": 3.057558298110962, + "learning_rate": 4.973138288056774e-06, + "loss": 0.7236, + "step": 688 + }, + { + "epoch": 0.3257683215130024, + "grad_norm": 2.921093463897705, + "learning_rate": 4.97304700866465e-06, + "loss": 0.576, + "step": 689 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 3.0287256240844727, + "learning_rate": 4.972955575286732e-06, + "loss": 0.7077, + "step": 690 + }, + { + "epoch": 0.32671394799054376, + "grad_norm": 2.8621346950531006, + "learning_rate": 4.972863987928716e-06, + "loss": 0.6952, + "step": 691 + }, + { + "epoch": 0.3271867612293144, + "grad_norm": 2.631359100341797, + "learning_rate": 4.9727722465963006e-06, + "loss": 0.6931, + "step": 692 + }, + { + "epoch": 0.3276595744680851, + "grad_norm": 2.8484320640563965, + "learning_rate": 4.972680351295201e-06, + "loss": 0.6292, + "step": 693 + }, + { + "epoch": 0.32813238770685577, + "grad_norm": 2.593001365661621, + "learning_rate": 4.972588302031138e-06, + "loss": 0.5942, + "step": 694 + }, + { + "epoch": 0.32860520094562645, + "grad_norm": 2.6321065425872803, + "learning_rate": 4.972496098809844e-06, + "loss": 0.65, + "step": 695 + }, + { + "epoch": 0.32907801418439714, + "grad_norm": 3.2516732215881348, + "learning_rate": 4.972403741637059e-06, + "loss": 0.7385, + "step": 696 + }, + { + "epoch": 0.3295508274231678, + "grad_norm": 3.180854320526123, + "learning_rate": 4.972311230518535e-06, + "loss": 0.6569, + "step": 697 + }, + { + "epoch": 0.3300236406619385, + "grad_norm": 4.161016941070557, + "learning_rate": 4.972218565460031e-06, + "loss": 0.6416, + "step": 698 + }, + { + "epoch": 0.3304964539007092, + "grad_norm": 3.153897762298584, + "learning_rate": 4.972125746467317e-06, + "loss": 0.7196, + "step": 699 + }, + { + "epoch": 0.3309692671394799, + "grad_norm": 2.9595556259155273, + "learning_rate": 4.972032773546173e-06, + "loss": 0.7093, + "step": 700 + }, + { + "epoch": 0.33144208037825057, + "grad_norm": 3.1086833477020264, + "learning_rate": 4.9719396467023875e-06, + "loss": 0.6963, + "step": 701 + }, + { + "epoch": 0.33191489361702126, + "grad_norm": 2.958921432495117, + "learning_rate": 4.971846365941759e-06, + "loss": 0.6518, + "step": 702 + }, + { + "epoch": 0.33238770685579194, + "grad_norm": 2.8745479583740234, + "learning_rate": 4.971752931270096e-06, + "loss": 0.696, + "step": 703 + }, + { + "epoch": 0.33286052009456263, + "grad_norm": 3.224358558654785, + "learning_rate": 4.971659342693217e-06, + "loss": 0.6769, + "step": 704 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.696319580078125, + "learning_rate": 4.9715656002169486e-06, + "loss": 0.6833, + "step": 705 + }, + { + "epoch": 0.333806146572104, + "grad_norm": 2.9283502101898193, + "learning_rate": 4.971471703847127e-06, + "loss": 0.6784, + "step": 706 + }, + { + "epoch": 0.3342789598108747, + "grad_norm": 2.654914140701294, + "learning_rate": 4.9713776535896e-06, + "loss": 0.6337, + "step": 707 + }, + { + "epoch": 0.3347517730496454, + "grad_norm": 3.041555643081665, + "learning_rate": 4.971283449450224e-06, + "loss": 0.6227, + "step": 708 + }, + { + "epoch": 0.33522458628841606, + "grad_norm": 2.893008232116699, + "learning_rate": 4.971189091434863e-06, + "loss": 0.655, + "step": 709 + }, + { + "epoch": 0.33569739952718675, + "grad_norm": 2.8806653022766113, + "learning_rate": 4.971094579549393e-06, + "loss": 0.7077, + "step": 710 + }, + { + "epoch": 0.33617021276595743, + 
"grad_norm": 3.4830048084259033, + "learning_rate": 4.9709999137996986e-06, + "loss": 0.7461, + "step": 711 + }, + { + "epoch": 0.3366430260047281, + "grad_norm": 3.155444860458374, + "learning_rate": 4.970905094191674e-06, + "loss": 0.652, + "step": 712 + }, + { + "epoch": 0.3371158392434988, + "grad_norm": 2.7608706951141357, + "learning_rate": 4.970810120731225e-06, + "loss": 0.684, + "step": 713 + }, + { + "epoch": 0.3375886524822695, + "grad_norm": 2.8209474086761475, + "learning_rate": 4.970714993424265e-06, + "loss": 0.6009, + "step": 714 + }, + { + "epoch": 0.3380614657210402, + "grad_norm": 3.6532654762268066, + "learning_rate": 4.9706197122767145e-06, + "loss": 0.702, + "step": 715 + }, + { + "epoch": 0.33853427895981086, + "grad_norm": 2.6276566982269287, + "learning_rate": 4.970524277294508e-06, + "loss": 0.6338, + "step": 716 + }, + { + "epoch": 0.33900709219858155, + "grad_norm": 3.509871482849121, + "learning_rate": 4.970428688483589e-06, + "loss": 0.6853, + "step": 717 + }, + { + "epoch": 0.33947990543735224, + "grad_norm": 5.332682132720947, + "learning_rate": 4.970332945849906e-06, + "loss": 0.6684, + "step": 718 + }, + { + "epoch": 0.3399527186761229, + "grad_norm": 2.718801975250244, + "learning_rate": 4.970237049399424e-06, + "loss": 0.6676, + "step": 719 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 3.891003131866455, + "learning_rate": 4.970140999138112e-06, + "loss": 0.7043, + "step": 720 + }, + { + "epoch": 0.3408983451536643, + "grad_norm": 2.8863155841827393, + "learning_rate": 4.970044795071951e-06, + "loss": 0.6563, + "step": 721 + }, + { + "epoch": 0.341371158392435, + "grad_norm": 3.2527518272399902, + "learning_rate": 4.969948437206932e-06, + "loss": 0.7244, + "step": 722 + }, + { + "epoch": 0.34184397163120567, + "grad_norm": 2.9726758003234863, + "learning_rate": 4.969851925549054e-06, + "loss": 0.6548, + "step": 723 + }, + { + "epoch": 0.34231678486997635, + "grad_norm": 3.118309497833252, + "learning_rate": 4.969755260104327e-06, + "loss": 0.7293, + "step": 724 + }, + { + "epoch": 0.34278959810874704, + "grad_norm": 3.373068332672119, + "learning_rate": 4.969658440878769e-06, + "loss": 0.6444, + "step": 725 + }, + { + "epoch": 0.3432624113475177, + "grad_norm": 2.7157437801361084, + "learning_rate": 4.969561467878409e-06, + "loss": 0.642, + "step": 726 + }, + { + "epoch": 0.3437352245862884, + "grad_norm": 2.58929705619812, + "learning_rate": 4.969464341109285e-06, + "loss": 0.6165, + "step": 727 + }, + { + "epoch": 0.3442080378250591, + "grad_norm": 2.8811306953430176, + "learning_rate": 4.969367060577445e-06, + "loss": 0.7127, + "step": 728 + }, + { + "epoch": 0.3446808510638298, + "grad_norm": 3.494358539581299, + "learning_rate": 4.969269626288946e-06, + "loss": 0.7103, + "step": 729 + }, + { + "epoch": 0.34515366430260047, + "grad_norm": 2.9753928184509277, + "learning_rate": 4.969172038249855e-06, + "loss": 0.6911, + "step": 730 + }, + { + "epoch": 0.34562647754137116, + "grad_norm": 3.2885913848876953, + "learning_rate": 4.969074296466247e-06, + "loss": 0.6968, + "step": 731 + }, + { + "epoch": 0.34609929078014184, + "grad_norm": 2.7564568519592285, + "learning_rate": 4.968976400944211e-06, + "loss": 0.6843, + "step": 732 + }, + { + "epoch": 0.34657210401891253, + "grad_norm": 2.9255006313323975, + "learning_rate": 4.96887835168984e-06, + "loss": 0.6024, + "step": 733 + }, + { + "epoch": 0.3470449172576832, + "grad_norm": 3.1808290481567383, + "learning_rate": 4.968780148709239e-06, + "loss": 0.7377, + "step": 734 + }, + { + "epoch": 
0.3475177304964539, + "grad_norm": 2.956666946411133, + "learning_rate": 4.968681792008523e-06, + "loss": 0.65, + "step": 735 + }, + { + "epoch": 0.3479905437352246, + "grad_norm": 2.9631855487823486, + "learning_rate": 4.9685832815938175e-06, + "loss": 0.677, + "step": 736 + }, + { + "epoch": 0.3484633569739953, + "grad_norm": 2.501917600631714, + "learning_rate": 4.968484617471256e-06, + "loss": 0.6282, + "step": 737 + }, + { + "epoch": 0.34893617021276596, + "grad_norm": 2.750779628753662, + "learning_rate": 4.968385799646981e-06, + "loss": 0.6507, + "step": 738 + }, + { + "epoch": 0.34940898345153665, + "grad_norm": 2.872300624847412, + "learning_rate": 4.968286828127146e-06, + "loss": 0.5949, + "step": 739 + }, + { + "epoch": 0.34988179669030733, + "grad_norm": 2.6316142082214355, + "learning_rate": 4.9681877029179124e-06, + "loss": 0.6328, + "step": 740 + }, + { + "epoch": 0.350354609929078, + "grad_norm": 3.244364023208618, + "learning_rate": 4.968088424025454e-06, + "loss": 0.7393, + "step": 741 + }, + { + "epoch": 0.3508274231678487, + "grad_norm": 2.620465040206909, + "learning_rate": 4.967988991455951e-06, + "loss": 0.6797, + "step": 742 + }, + { + "epoch": 0.3513002364066194, + "grad_norm": 2.854513645172119, + "learning_rate": 4.967889405215596e-06, + "loss": 0.6368, + "step": 743 + }, + { + "epoch": 0.3517730496453901, + "grad_norm": 2.579854726791382, + "learning_rate": 4.9677896653105886e-06, + "loss": 0.6489, + "step": 744 + }, + { + "epoch": 0.35224586288416077, + "grad_norm": 3.0697381496429443, + "learning_rate": 4.96768977174714e-06, + "loss": 0.6313, + "step": 745 + }, + { + "epoch": 0.35271867612293145, + "grad_norm": 3.369338035583496, + "learning_rate": 4.96758972453147e-06, + "loss": 0.7416, + "step": 746 + }, + { + "epoch": 0.35319148936170214, + "grad_norm": 2.836221933364868, + "learning_rate": 4.967489523669807e-06, + "loss": 0.6422, + "step": 747 + }, + { + "epoch": 0.3536643026004728, + "grad_norm": 2.929579496383667, + "learning_rate": 4.967389169168392e-06, + "loss": 0.6482, + "step": 748 + }, + { + "epoch": 0.3541371158392435, + "grad_norm": 2.9243831634521484, + "learning_rate": 4.967288661033472e-06, + "loss": 0.5813, + "step": 749 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 3.7555336952209473, + "learning_rate": 4.967187999271306e-06, + "loss": 0.6501, + "step": 750 + }, + { + "epoch": 0.3550827423167849, + "grad_norm": 3.4279143810272217, + "learning_rate": 4.9670871838881615e-06, + "loss": 0.6326, + "step": 751 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 2.875066041946411, + "learning_rate": 4.9669862148903166e-06, + "loss": 0.664, + "step": 752 + }, + { + "epoch": 0.35602836879432626, + "grad_norm": 3.130394697189331, + "learning_rate": 4.966885092284057e-06, + "loss": 0.706, + "step": 753 + }, + { + "epoch": 0.35650118203309694, + "grad_norm": 2.9606287479400635, + "learning_rate": 4.96678381607568e-06, + "loss": 0.693, + "step": 754 + }, + { + "epoch": 0.35697399527186763, + "grad_norm": 3.0584909915924072, + "learning_rate": 4.966682386271491e-06, + "loss": 0.6034, + "step": 755 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 2.8215200901031494, + "learning_rate": 4.966580802877805e-06, + "loss": 0.6217, + "step": 756 + }, + { + "epoch": 0.357919621749409, + "grad_norm": 2.7348055839538574, + "learning_rate": 4.966479065900949e-06, + "loss": 0.6194, + "step": 757 + }, + { + "epoch": 0.3583924349881797, + "grad_norm": 3.2347466945648193, + "learning_rate": 4.966377175347257e-06, + "loss": 0.6377, + "step": 758 + }, + 
{ + "epoch": 0.3588652482269504, + "grad_norm": 3.311845302581787, + "learning_rate": 4.966275131223072e-06, + "loss": 0.6234, + "step": 759 + }, + { + "epoch": 0.35933806146572106, + "grad_norm": 3.0384368896484375, + "learning_rate": 4.96617293353475e-06, + "loss": 0.609, + "step": 760 + }, + { + "epoch": 0.35981087470449175, + "grad_norm": 3.516854763031006, + "learning_rate": 4.966070582288653e-06, + "loss": 0.6627, + "step": 761 + }, + { + "epoch": 0.36028368794326243, + "grad_norm": 3.2425215244293213, + "learning_rate": 4.9659680774911534e-06, + "loss": 0.7355, + "step": 762 + }, + { + "epoch": 0.3607565011820331, + "grad_norm": 3.2665750980377197, + "learning_rate": 4.965865419148636e-06, + "loss": 0.6787, + "step": 763 + }, + { + "epoch": 0.3612293144208038, + "grad_norm": 2.729428291320801, + "learning_rate": 4.96576260726749e-06, + "loss": 0.6272, + "step": 764 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 3.299969434738159, + "learning_rate": 4.965659641854119e-06, + "loss": 0.6552, + "step": 765 + }, + { + "epoch": 0.3621749408983452, + "grad_norm": 2.7090916633605957, + "learning_rate": 4.965556522914934e-06, + "loss": 0.6661, + "step": 766 + }, + { + "epoch": 0.36264775413711586, + "grad_norm": 2.488846778869629, + "learning_rate": 4.965453250456355e-06, + "loss": 0.5821, + "step": 767 + }, + { + "epoch": 0.36312056737588655, + "grad_norm": 2.5267233848571777, + "learning_rate": 4.965349824484813e-06, + "loss": 0.5593, + "step": 768 + }, + { + "epoch": 0.36359338061465724, + "grad_norm": 3.0646679401397705, + "learning_rate": 4.965246245006748e-06, + "loss": 0.6341, + "step": 769 + }, + { + "epoch": 0.3640661938534279, + "grad_norm": 2.9877712726593018, + "learning_rate": 4.965142512028609e-06, + "loss": 0.7202, + "step": 770 + }, + { + "epoch": 0.3645390070921986, + "grad_norm": 3.7494113445281982, + "learning_rate": 4.965038625556854e-06, + "loss": 0.7643, + "step": 771 + }, + { + "epoch": 0.3650118203309693, + "grad_norm": 2.8382890224456787, + "learning_rate": 4.964934585597954e-06, + "loss": 0.6522, + "step": 772 + }, + { + "epoch": 0.3654846335697399, + "grad_norm": 3.091655731201172, + "learning_rate": 4.9648303921583854e-06, + "loss": 0.7117, + "step": 773 + }, + { + "epoch": 0.3659574468085106, + "grad_norm": 3.0608325004577637, + "learning_rate": 4.964726045244635e-06, + "loss": 0.6538, + "step": 774 + }, + { + "epoch": 0.3664302600472813, + "grad_norm": 2.8492867946624756, + "learning_rate": 4.964621544863203e-06, + "loss": 0.6079, + "step": 775 + }, + { + "epoch": 0.366903073286052, + "grad_norm": 3.0669894218444824, + "learning_rate": 4.964516891020594e-06, + "loss": 0.6223, + "step": 776 + }, + { + "epoch": 0.36737588652482267, + "grad_norm": 3.089984893798828, + "learning_rate": 4.964412083723325e-06, + "loss": 0.671, + "step": 777 + }, + { + "epoch": 0.36784869976359336, + "grad_norm": 2.905242443084717, + "learning_rate": 4.964307122977921e-06, + "loss": 0.62, + "step": 778 + }, + { + "epoch": 0.36832151300236404, + "grad_norm": 3.954436779022217, + "learning_rate": 4.964202008790918e-06, + "loss": 0.6535, + "step": 779 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 2.6026058197021484, + "learning_rate": 4.9640967411688615e-06, + "loss": 0.5865, + "step": 780 + }, + { + "epoch": 0.3692671394799054, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.963991320118306e-06, + "loss": 0.6698, + "step": 781 + }, + { + "epoch": 0.3697399527186761, + "grad_norm": 2.9411263465881348, + "learning_rate": 4.963885745645815e-06, + "loss": 0.6173, + 
"step": 782 + }, + { + "epoch": 0.3702127659574468, + "grad_norm": 2.5679805278778076, + "learning_rate": 4.963780017757962e-06, + "loss": 0.6285, + "step": 783 + }, + { + "epoch": 0.3706855791962175, + "grad_norm": 3.3100640773773193, + "learning_rate": 4.963674136461332e-06, + "loss": 0.5968, + "step": 784 + }, + { + "epoch": 0.37115839243498816, + "grad_norm": 3.1293699741363525, + "learning_rate": 4.963568101762515e-06, + "loss": 0.697, + "step": 785 + }, + { + "epoch": 0.37163120567375885, + "grad_norm": 3.043853759765625, + "learning_rate": 4.963461913668115e-06, + "loss": 0.5881, + "step": 786 + }, + { + "epoch": 0.37210401891252953, + "grad_norm": 3.07351016998291, + "learning_rate": 4.963355572184744e-06, + "loss": 0.6307, + "step": 787 + }, + { + "epoch": 0.3725768321513002, + "grad_norm": 2.7381317615509033, + "learning_rate": 4.9632490773190225e-06, + "loss": 0.716, + "step": 788 + }, + { + "epoch": 0.3730496453900709, + "grad_norm": 2.892221450805664, + "learning_rate": 4.963142429077582e-06, + "loss": 0.6867, + "step": 789 + }, + { + "epoch": 0.3735224586288416, + "grad_norm": 3.133122205734253, + "learning_rate": 4.963035627467064e-06, + "loss": 0.659, + "step": 790 + }, + { + "epoch": 0.3739952718676123, + "grad_norm": 3.032599925994873, + "learning_rate": 4.962928672494116e-06, + "loss": 0.6848, + "step": 791 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 3.0076355934143066, + "learning_rate": 4.9628215641654e-06, + "loss": 0.6549, + "step": 792 + }, + { + "epoch": 0.37494089834515365, + "grad_norm": 2.8904454708099365, + "learning_rate": 4.962714302487585e-06, + "loss": 0.6484, + "step": 793 + }, + { + "epoch": 0.37541371158392434, + "grad_norm": 2.881364107131958, + "learning_rate": 4.9626068874673486e-06, + "loss": 0.721, + "step": 794 + }, + { + "epoch": 0.375886524822695, + "grad_norm": 3.11668062210083, + "learning_rate": 4.962499319111379e-06, + "loss": 0.7824, + "step": 795 + }, + { + "epoch": 0.3763593380614657, + "grad_norm": 2.9201436042785645, + "learning_rate": 4.962391597426374e-06, + "loss": 0.6911, + "step": 796 + }, + { + "epoch": 0.3768321513002364, + "grad_norm": 2.926598072052002, + "learning_rate": 4.962283722419043e-06, + "loss": 0.6715, + "step": 797 + }, + { + "epoch": 0.3773049645390071, + "grad_norm": 2.7267675399780273, + "learning_rate": 4.962175694096101e-06, + "loss": 0.6111, + "step": 798 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 3.194031000137329, + "learning_rate": 4.962067512464275e-06, + "loss": 0.6558, + "step": 799 + }, + { + "epoch": 0.37825059101654845, + "grad_norm": 2.6249136924743652, + "learning_rate": 4.9619591775303e-06, + "loss": 0.6166, + "step": 800 + }, + { + "epoch": 0.37872340425531914, + "grad_norm": 2.6356167793273926, + "learning_rate": 4.961850689300923e-06, + "loss": 0.6112, + "step": 801 + }, + { + "epoch": 0.3791962174940898, + "grad_norm": 3.030724287033081, + "learning_rate": 4.961742047782898e-06, + "loss": 0.6511, + "step": 802 + }, + { + "epoch": 0.3796690307328605, + "grad_norm": 3.4987757205963135, + "learning_rate": 4.96163325298299e-06, + "loss": 0.5888, + "step": 803 + }, + { + "epoch": 0.3801418439716312, + "grad_norm": 3.0371780395507812, + "learning_rate": 4.961524304907974e-06, + "loss": 0.6385, + "step": 804 + }, + { + "epoch": 0.3806146572104019, + "grad_norm": 3.302570104598999, + "learning_rate": 4.961415203564632e-06, + "loss": 0.6515, + "step": 805 + }, + { + "epoch": 0.38108747044917257, + "grad_norm": 2.7597038745880127, + "learning_rate": 4.961305948959759e-06, + "loss": 
0.6126, + "step": 806 + }, + { + "epoch": 0.38156028368794326, + "grad_norm": 2.789811849594116, + "learning_rate": 4.9611965411001575e-06, + "loss": 0.6601, + "step": 807 + }, + { + "epoch": 0.38203309692671394, + "grad_norm": 3.0403921604156494, + "learning_rate": 4.961086979992639e-06, + "loss": 0.6947, + "step": 808 + }, + { + "epoch": 0.38250591016548463, + "grad_norm": 3.2139980792999268, + "learning_rate": 4.960977265644026e-06, + "loss": 0.6876, + "step": 809 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.918515205383301, + "learning_rate": 4.960867398061149e-06, + "loss": 0.5997, + "step": 810 + }, + { + "epoch": 0.383451536643026, + "grad_norm": 3.197636604309082, + "learning_rate": 4.9607573772508495e-06, + "loss": 0.5754, + "step": 811 + }, + { + "epoch": 0.3839243498817967, + "grad_norm": 2.8848466873168945, + "learning_rate": 4.960647203219979e-06, + "loss": 0.6424, + "step": 812 + }, + { + "epoch": 0.3843971631205674, + "grad_norm": 3.4810187816619873, + "learning_rate": 4.960536875975397e-06, + "loss": 0.6851, + "step": 813 + }, + { + "epoch": 0.38486997635933806, + "grad_norm": 3.713934898376465, + "learning_rate": 4.960426395523972e-06, + "loss": 0.6122, + "step": 814 + }, + { + "epoch": 0.38534278959810875, + "grad_norm": 2.862600803375244, + "learning_rate": 4.960315761872585e-06, + "loss": 0.6493, + "step": 815 + }, + { + "epoch": 0.38581560283687943, + "grad_norm": 3.133882522583008, + "learning_rate": 4.960204975028123e-06, + "loss": 0.7535, + "step": 816 + }, + { + "epoch": 0.3862884160756501, + "grad_norm": 3.1526732444763184, + "learning_rate": 4.960094034997485e-06, + "loss": 0.6512, + "step": 817 + }, + { + "epoch": 0.3867612293144208, + "grad_norm": 2.7213544845581055, + "learning_rate": 4.959982941787579e-06, + "loss": 0.6121, + "step": 818 + }, + { + "epoch": 0.3872340425531915, + "grad_norm": 3.4935851097106934, + "learning_rate": 4.9598716954053214e-06, + "loss": 0.7852, + "step": 819 + }, + { + "epoch": 0.3877068557919622, + "grad_norm": 2.691016435623169, + "learning_rate": 4.9597602958576395e-06, + "loss": 0.6861, + "step": 820 + }, + { + "epoch": 0.38817966903073287, + "grad_norm": 2.8621015548706055, + "learning_rate": 4.959648743151469e-06, + "loss": 0.6262, + "step": 821 + }, + { + "epoch": 0.38865248226950355, + "grad_norm": 3.3887462615966797, + "learning_rate": 4.959537037293758e-06, + "loss": 0.7103, + "step": 822 + }, + { + "epoch": 0.38912529550827424, + "grad_norm": 2.7565438747406006, + "learning_rate": 4.95942517829146e-06, + "loss": 0.6471, + "step": 823 + }, + { + "epoch": 0.3895981087470449, + "grad_norm": 2.7920358180999756, + "learning_rate": 4.959313166151541e-06, + "loss": 0.6239, + "step": 824 + }, + { + "epoch": 0.3900709219858156, + "grad_norm": 3.18904185295105, + "learning_rate": 4.959201000880973e-06, + "loss": 0.7461, + "step": 825 + }, + { + "epoch": 0.3905437352245863, + "grad_norm": 2.727872371673584, + "learning_rate": 4.959088682486743e-06, + "loss": 0.6333, + "step": 826 + }, + { + "epoch": 0.391016548463357, + "grad_norm": 2.906378746032715, + "learning_rate": 4.958976210975844e-06, + "loss": 0.7547, + "step": 827 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 2.96482515335083, + "learning_rate": 4.958863586355278e-06, + "loss": 0.6312, + "step": 828 + }, + { + "epoch": 0.39196217494089836, + "grad_norm": 3.2890889644622803, + "learning_rate": 4.958750808632059e-06, + "loss": 0.6943, + "step": 829 + }, + { + "epoch": 0.39243498817966904, + "grad_norm": 2.7004311084747314, + "learning_rate": 
4.958637877813207e-06, + "loss": 0.5918, + "step": 830 + }, + { + "epoch": 0.39290780141843973, + "grad_norm": 2.7487950325012207, + "learning_rate": 4.9585247939057566e-06, + "loss": 0.6201, + "step": 831 + }, + { + "epoch": 0.3933806146572104, + "grad_norm": 2.7873897552490234, + "learning_rate": 4.958411556916747e-06, + "loss": 0.6268, + "step": 832 + }, + { + "epoch": 0.3938534278959811, + "grad_norm": 2.8501343727111816, + "learning_rate": 4.958298166853229e-06, + "loss": 0.7119, + "step": 833 + }, + { + "epoch": 0.3943262411347518, + "grad_norm": 3.0391547679901123, + "learning_rate": 4.958184623722265e-06, + "loss": 0.6375, + "step": 834 + }, + { + "epoch": 0.3947990543735225, + "grad_norm": 2.850520133972168, + "learning_rate": 4.958070927530922e-06, + "loss": 0.5962, + "step": 835 + }, + { + "epoch": 0.39527186761229316, + "grad_norm": 3.351914644241333, + "learning_rate": 4.957957078286281e-06, + "loss": 0.7247, + "step": 836 + }, + { + "epoch": 0.39574468085106385, + "grad_norm": 2.9559543132781982, + "learning_rate": 4.957843075995431e-06, + "loss": 0.6571, + "step": 837 + }, + { + "epoch": 0.39621749408983453, + "grad_norm": 3.225785255432129, + "learning_rate": 4.95772892066547e-06, + "loss": 0.7074, + "step": 838 + }, + { + "epoch": 0.3966903073286052, + "grad_norm": 2.7842373847961426, + "learning_rate": 4.957614612303505e-06, + "loss": 0.6469, + "step": 839 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 4.249724864959717, + "learning_rate": 4.957500150916655e-06, + "loss": 0.741, + "step": 840 + }, + { + "epoch": 0.3976359338061466, + "grad_norm": 3.138221263885498, + "learning_rate": 4.957385536512046e-06, + "loss": 0.6676, + "step": 841 + }, + { + "epoch": 0.3981087470449173, + "grad_norm": 3.456423759460449, + "learning_rate": 4.957270769096816e-06, + "loss": 0.6877, + "step": 842 + }, + { + "epoch": 0.39858156028368796, + "grad_norm": 2.8676278591156006, + "learning_rate": 4.957155848678109e-06, + "loss": 0.5986, + "step": 843 + }, + { + "epoch": 0.39905437352245865, + "grad_norm": 2.705324411392212, + "learning_rate": 4.957040775263082e-06, + "loss": 0.6356, + "step": 844 + }, + { + "epoch": 0.39952718676122934, + "grad_norm": 3.0767486095428467, + "learning_rate": 4.9569255488589e-06, + "loss": 0.6844, + "step": 845 + }, + { + "epoch": 0.4, + "grad_norm": 2.7787704467773438, + "learning_rate": 4.956810169472736e-06, + "loss": 0.6641, + "step": 846 + }, + { + "epoch": 0.4004728132387707, + "grad_norm": 2.584277868270874, + "learning_rate": 4.956694637111777e-06, + "loss": 0.6256, + "step": 847 + }, + { + "epoch": 0.4009456264775414, + "grad_norm": 2.751641273498535, + "learning_rate": 4.956578951783215e-06, + "loss": 0.5954, + "step": 848 + }, + { + "epoch": 0.4014184397163121, + "grad_norm": 3.0181658267974854, + "learning_rate": 4.956463113494253e-06, + "loss": 0.6569, + "step": 849 + }, + { + "epoch": 0.40189125295508277, + "grad_norm": 3.0933220386505127, + "learning_rate": 4.956347122252104e-06, + "loss": 0.6248, + "step": 850 + }, + { + "epoch": 0.40236406619385345, + "grad_norm": 3.3767428398132324, + "learning_rate": 4.956230978063991e-06, + "loss": 0.719, + "step": 851 + }, + { + "epoch": 0.40283687943262414, + "grad_norm": 3.7666573524475098, + "learning_rate": 4.956114680937145e-06, + "loss": 0.6467, + "step": 852 + }, + { + "epoch": 0.4033096926713948, + "grad_norm": 2.9836843013763428, + "learning_rate": 4.955998230878808e-06, + "loss": 0.6993, + "step": 853 + }, + { + "epoch": 0.4037825059101655, + "grad_norm": 2.981497049331665, + 
"learning_rate": 4.955881627896229e-06, + "loss": 0.6578, + "step": 854 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 3.1369056701660156, + "learning_rate": 4.955764871996672e-06, + "loss": 0.6763, + "step": 855 + }, + { + "epoch": 0.40472813238770683, + "grad_norm": 2.7675817012786865, + "learning_rate": 4.9556479631874036e-06, + "loss": 0.6488, + "step": 856 + }, + { + "epoch": 0.4052009456264775, + "grad_norm": 3.035334825515747, + "learning_rate": 4.9555309014757034e-06, + "loss": 0.7076, + "step": 857 + }, + { + "epoch": 0.4056737588652482, + "grad_norm": 3.493704319000244, + "learning_rate": 4.955413686868862e-06, + "loss": 0.6773, + "step": 858 + }, + { + "epoch": 0.4061465721040189, + "grad_norm": 3.245487928390503, + "learning_rate": 4.9552963193741765e-06, + "loss": 0.6915, + "step": 859 + }, + { + "epoch": 0.4066193853427896, + "grad_norm": 3.189969539642334, + "learning_rate": 4.955178798998956e-06, + "loss": 0.7318, + "step": 860 + }, + { + "epoch": 0.40709219858156026, + "grad_norm": 2.7987146377563477, + "learning_rate": 4.955061125750517e-06, + "loss": 0.6162, + "step": 861 + }, + { + "epoch": 0.40756501182033095, + "grad_norm": 3.020118474960327, + "learning_rate": 4.954943299636187e-06, + "loss": 0.6678, + "step": 862 + }, + { + "epoch": 0.40803782505910163, + "grad_norm": 2.715463876724243, + "learning_rate": 4.954825320663302e-06, + "loss": 0.668, + "step": 863 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 2.595050096511841, + "learning_rate": 4.9547071888392085e-06, + "loss": 0.6557, + "step": 864 + }, + { + "epoch": 0.408983451536643, + "grad_norm": 3.131596088409424, + "learning_rate": 4.954588904171261e-06, + "loss": 0.6548, + "step": 865 + }, + { + "epoch": 0.4094562647754137, + "grad_norm": 2.5742313861846924, + "learning_rate": 4.954470466666827e-06, + "loss": 0.6592, + "step": 866 + }, + { + "epoch": 0.4099290780141844, + "grad_norm": 2.8612802028656006, + "learning_rate": 4.9543518763332785e-06, + "loss": 0.5391, + "step": 867 + }, + { + "epoch": 0.41040189125295506, + "grad_norm": 2.8973186016082764, + "learning_rate": 4.954233133178001e-06, + "loss": 0.6649, + "step": 868 + }, + { + "epoch": 0.41087470449172575, + "grad_norm": 2.802525043487549, + "learning_rate": 4.954114237208388e-06, + "loss": 0.6212, + "step": 869 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 2.5919506549835205, + "learning_rate": 4.953995188431843e-06, + "loss": 0.6596, + "step": 870 + }, + { + "epoch": 0.4118203309692671, + "grad_norm": 3.139169454574585, + "learning_rate": 4.953875986855777e-06, + "loss": 0.6799, + "step": 871 + }, + { + "epoch": 0.4122931442080378, + "grad_norm": 3.99727725982666, + "learning_rate": 4.953756632487614e-06, + "loss": 0.6519, + "step": 872 + }, + { + "epoch": 0.4127659574468085, + "grad_norm": 3.238706350326538, + "learning_rate": 4.953637125334784e-06, + "loss": 0.7361, + "step": 873 + }, + { + "epoch": 0.4132387706855792, + "grad_norm": 2.780019998550415, + "learning_rate": 4.9535174654047295e-06, + "loss": 0.6406, + "step": 874 + }, + { + "epoch": 0.41371158392434987, + "grad_norm": 2.7629551887512207, + "learning_rate": 4.953397652704901e-06, + "loss": 0.6131, + "step": 875 + }, + { + "epoch": 0.41418439716312055, + "grad_norm": 2.8008246421813965, + "learning_rate": 4.9532776872427585e-06, + "loss": 0.6464, + "step": 876 + }, + { + "epoch": 0.41465721040189124, + "grad_norm": 3.0970115661621094, + "learning_rate": 4.953157569025772e-06, + "loss": 0.7066, + "step": 877 + }, + { + "epoch": 0.4151300236406619, + 
"grad_norm": 2.8375589847564697, + "learning_rate": 4.9530372980614195e-06, + "loss": 0.6551, + "step": 878 + }, + { + "epoch": 0.4156028368794326, + "grad_norm": 2.718843936920166, + "learning_rate": 4.952916874357191e-06, + "loss": 0.5947, + "step": 879 + }, + { + "epoch": 0.4160756501182033, + "grad_norm": 2.7104697227478027, + "learning_rate": 4.952796297920585e-06, + "loss": 0.6708, + "step": 880 + }, + { + "epoch": 0.416548463356974, + "grad_norm": 2.8223445415496826, + "learning_rate": 4.952675568759108e-06, + "loss": 0.6214, + "step": 881 + }, + { + "epoch": 0.41702127659574467, + "grad_norm": 2.6598153114318848, + "learning_rate": 4.952554686880279e-06, + "loss": 0.6116, + "step": 882 + }, + { + "epoch": 0.41749408983451536, + "grad_norm": 2.8639824390411377, + "learning_rate": 4.952433652291623e-06, + "loss": 0.5971, + "step": 883 + }, + { + "epoch": 0.41796690307328604, + "grad_norm": 2.9578304290771484, + "learning_rate": 4.952312465000677e-06, + "loss": 0.6785, + "step": 884 + }, + { + "epoch": 0.41843971631205673, + "grad_norm": 2.872144937515259, + "learning_rate": 4.952191125014987e-06, + "loss": 0.6772, + "step": 885 + }, + { + "epoch": 0.4189125295508274, + "grad_norm": 2.7513675689697266, + "learning_rate": 4.952069632342108e-06, + "loss": 0.702, + "step": 886 + }, + { + "epoch": 0.4193853427895981, + "grad_norm": 2.9275078773498535, + "learning_rate": 4.951947986989606e-06, + "loss": 0.589, + "step": 887 + }, + { + "epoch": 0.4198581560283688, + "grad_norm": 2.740549325942993, + "learning_rate": 4.951826188965053e-06, + "loss": 0.5942, + "step": 888 + }, + { + "epoch": 0.4203309692671395, + "grad_norm": 2.92452073097229, + "learning_rate": 4.951704238276035e-06, + "loss": 0.6819, + "step": 889 + }, + { + "epoch": 0.42080378250591016, + "grad_norm": 2.842491865158081, + "learning_rate": 4.951582134930144e-06, + "loss": 0.6304, + "step": 890 + }, + { + "epoch": 0.42127659574468085, + "grad_norm": 2.613478422164917, + "learning_rate": 4.951459878934983e-06, + "loss": 0.6912, + "step": 891 + }, + { + "epoch": 0.42174940898345153, + "grad_norm": 3.2408607006073, + "learning_rate": 4.951337470298165e-06, + "loss": 0.6755, + "step": 892 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 3.1022439002990723, + "learning_rate": 4.9512149090273125e-06, + "loss": 0.6138, + "step": 893 + }, + { + "epoch": 0.4226950354609929, + "grad_norm": 2.6418895721435547, + "learning_rate": 4.951092195130055e-06, + "loss": 0.639, + "step": 894 + }, + { + "epoch": 0.4231678486997636, + "grad_norm": 3.010744333267212, + "learning_rate": 4.950969328614035e-06, + "loss": 0.7102, + "step": 895 + }, + { + "epoch": 0.4236406619385343, + "grad_norm": 2.673292636871338, + "learning_rate": 4.950846309486901e-06, + "loss": 0.5676, + "step": 896 + }, + { + "epoch": 0.42411347517730497, + "grad_norm": 3.6974737644195557, + "learning_rate": 4.950723137756314e-06, + "loss": 0.5722, + "step": 897 + }, + { + "epoch": 0.42458628841607565, + "grad_norm": 3.69028902053833, + "learning_rate": 4.9505998134299435e-06, + "loss": 0.6337, + "step": 898 + }, + { + "epoch": 0.42505910165484634, + "grad_norm": 3.2136125564575195, + "learning_rate": 4.950476336515469e-06, + "loss": 0.6469, + "step": 899 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.7396016120910645, + "learning_rate": 4.950352707020577e-06, + "loss": 0.6656, + "step": 900 + }, + { + "epoch": 0.4260047281323877, + "grad_norm": 2.825416088104248, + "learning_rate": 4.950228924952967e-06, + "loss": 0.6298, + "step": 901 + }, + { + "epoch": 
0.4264775413711584, + "grad_norm": 3.401658535003662, + "learning_rate": 4.950104990320345e-06, + "loss": 0.778, + "step": 902 + }, + { + "epoch": 0.4269503546099291, + "grad_norm": 2.7002272605895996, + "learning_rate": 4.9499809031304294e-06, + "loss": 0.6536, + "step": 903 + }, + { + "epoch": 0.42742316784869977, + "grad_norm": 2.62386417388916, + "learning_rate": 4.949856663390945e-06, + "loss": 0.6629, + "step": 904 + }, + { + "epoch": 0.42789598108747046, + "grad_norm": 2.584247589111328, + "learning_rate": 4.94973227110963e-06, + "loss": 0.5813, + "step": 905 + }, + { + "epoch": 0.42836879432624114, + "grad_norm": 3.4365768432617188, + "learning_rate": 4.9496077262942265e-06, + "loss": 0.7648, + "step": 906 + }, + { + "epoch": 0.42884160756501183, + "grad_norm": 2.8993639945983887, + "learning_rate": 4.949483028952492e-06, + "loss": 0.6696, + "step": 907 + }, + { + "epoch": 0.4293144208037825, + "grad_norm": 2.922809362411499, + "learning_rate": 4.94935817909219e-06, + "loss": 0.6892, + "step": 908 + }, + { + "epoch": 0.4297872340425532, + "grad_norm": 2.85478138923645, + "learning_rate": 4.9492331767210944e-06, + "loss": 0.536, + "step": 909 + }, + { + "epoch": 0.4302600472813239, + "grad_norm": 2.8639259338378906, + "learning_rate": 4.949108021846988e-06, + "loss": 0.634, + "step": 910 + }, + { + "epoch": 0.4307328605200946, + "grad_norm": 3.0533697605133057, + "learning_rate": 4.948982714477664e-06, + "loss": 0.6318, + "step": 911 + }, + { + "epoch": 0.43120567375886526, + "grad_norm": 2.331674814224243, + "learning_rate": 4.9488572546209255e-06, + "loss": 0.6562, + "step": 912 + }, + { + "epoch": 0.43167848699763595, + "grad_norm": 3.0154623985290527, + "learning_rate": 4.9487316422845835e-06, + "loss": 0.6675, + "step": 913 + }, + { + "epoch": 0.43215130023640663, + "grad_norm": 2.7354514598846436, + "learning_rate": 4.948605877476459e-06, + "loss": 0.6012, + "step": 914 + }, + { + "epoch": 0.4326241134751773, + "grad_norm": 2.863736629486084, + "learning_rate": 4.948479960204383e-06, + "loss": 0.6062, + "step": 915 + }, + { + "epoch": 0.433096926713948, + "grad_norm": 3.01998233795166, + "learning_rate": 4.948353890476197e-06, + "loss": 0.6749, + "step": 916 + }, + { + "epoch": 0.4335697399527187, + "grad_norm": 2.7550456523895264, + "learning_rate": 4.94822766829975e-06, + "loss": 0.6507, + "step": 917 + }, + { + "epoch": 0.4340425531914894, + "grad_norm": 3.370572805404663, + "learning_rate": 4.948101293682901e-06, + "loss": 0.714, + "step": 918 + }, + { + "epoch": 0.43451536643026006, + "grad_norm": 2.9736790657043457, + "learning_rate": 4.947974766633519e-06, + "loss": 0.729, + "step": 919 + }, + { + "epoch": 0.43498817966903075, + "grad_norm": 3.1036548614501953, + "learning_rate": 4.947848087159483e-06, + "loss": 0.7547, + "step": 920 + }, + { + "epoch": 0.43546099290780144, + "grad_norm": 2.895094871520996, + "learning_rate": 4.947721255268679e-06, + "loss": 0.6089, + "step": 921 + }, + { + "epoch": 0.4359338061465721, + "grad_norm": 2.798476219177246, + "learning_rate": 4.947594270969005e-06, + "loss": 0.5432, + "step": 922 + }, + { + "epoch": 0.4364066193853428, + "grad_norm": 2.7675702571868896, + "learning_rate": 4.94746713426837e-06, + "loss": 0.5693, + "step": 923 + }, + { + "epoch": 0.4368794326241135, + "grad_norm": 2.6851553916931152, + "learning_rate": 4.947339845174687e-06, + "loss": 0.6503, + "step": 924 + }, + { + "epoch": 0.4373522458628842, + "grad_norm": 2.909635543823242, + "learning_rate": 4.947212403695883e-06, + "loss": 0.6494, + "step": 925 + }, + { 
+ "epoch": 0.43782505910165487, + "grad_norm": 2.604526996612549, + "learning_rate": 4.947084809839894e-06, + "loss": 0.6349, + "step": 926 + }, + { + "epoch": 0.43829787234042555, + "grad_norm": 3.118149518966675, + "learning_rate": 4.946957063614664e-06, + "loss": 0.6219, + "step": 927 + }, + { + "epoch": 0.43877068557919624, + "grad_norm": 2.7452616691589355, + "learning_rate": 4.9468291650281465e-06, + "loss": 0.6096, + "step": 928 + }, + { + "epoch": 0.4392434988179669, + "grad_norm": 3.30098819732666, + "learning_rate": 4.946701114088307e-06, + "loss": 0.6277, + "step": 929 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 2.789482593536377, + "learning_rate": 4.946572910803116e-06, + "loss": 0.7, + "step": 930 + }, + { + "epoch": 0.4401891252955083, + "grad_norm": 2.7283935546875, + "learning_rate": 4.946444555180559e-06, + "loss": 0.5375, + "step": 931 + }, + { + "epoch": 0.440661938534279, + "grad_norm": 3.101304054260254, + "learning_rate": 4.946316047228627e-06, + "loss": 0.6131, + "step": 932 + }, + { + "epoch": 0.44113475177304967, + "grad_norm": 3.573908805847168, + "learning_rate": 4.946187386955321e-06, + "loss": 0.7073, + "step": 933 + }, + { + "epoch": 0.44160756501182036, + "grad_norm": 3.214979648590088, + "learning_rate": 4.946058574368653e-06, + "loss": 0.6508, + "step": 934 + }, + { + "epoch": 0.44208037825059104, + "grad_norm": 3.145082712173462, + "learning_rate": 4.945929609476643e-06, + "loss": 0.64, + "step": 935 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 2.991780996322632, + "learning_rate": 4.945800492287321e-06, + "loss": 0.6315, + "step": 936 + }, + { + "epoch": 0.44302600472813236, + "grad_norm": 3.2441139221191406, + "learning_rate": 4.945671222808727e-06, + "loss": 0.7144, + "step": 937 + }, + { + "epoch": 0.44349881796690305, + "grad_norm": 2.9397029876708984, + "learning_rate": 4.94554180104891e-06, + "loss": 0.6818, + "step": 938 + }, + { + "epoch": 0.44397163120567373, + "grad_norm": 3.2471461296081543, + "learning_rate": 4.945412227015929e-06, + "loss": 0.6921, + "step": 939 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 3.0882487297058105, + "learning_rate": 4.945282500717851e-06, + "loss": 0.718, + "step": 940 + }, + { + "epoch": 0.4449172576832151, + "grad_norm": 2.6035783290863037, + "learning_rate": 4.945152622162753e-06, + "loss": 0.621, + "step": 941 + }, + { + "epoch": 0.4453900709219858, + "grad_norm": 2.83659029006958, + "learning_rate": 4.945022591358724e-06, + "loss": 0.6403, + "step": 942 + }, + { + "epoch": 0.4458628841607565, + "grad_norm": 2.824463129043579, + "learning_rate": 4.944892408313859e-06, + "loss": 0.6594, + "step": 943 + }, + { + "epoch": 0.44633569739952716, + "grad_norm": 2.753735065460205, + "learning_rate": 4.9447620730362645e-06, + "loss": 0.6116, + "step": 944 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 3.0659725666046143, + "learning_rate": 4.944631585534056e-06, + "loss": 0.5983, + "step": 945 + }, + { + "epoch": 0.44728132387706854, + "grad_norm": 2.969113349914551, + "learning_rate": 4.944500945815357e-06, + "loss": 0.6859, + "step": 946 + }, + { + "epoch": 0.4477541371158392, + "grad_norm": 2.810303211212158, + "learning_rate": 4.944370153888303e-06, + "loss": 0.7025, + "step": 947 + }, + { + "epoch": 0.4482269503546099, + "grad_norm": 3.027721643447876, + "learning_rate": 4.944239209761038e-06, + "loss": 0.7268, + "step": 948 + }, + { + "epoch": 0.4486997635933806, + "grad_norm": 2.661503314971924, + "learning_rate": 4.944108113441716e-06, + "loss": 0.6702, + "step": 949 + 
}, + { + "epoch": 0.4491725768321513, + "grad_norm": 2.738591432571411, + "learning_rate": 4.943976864938498e-06, + "loss": 0.6728, + "step": 950 + }, + { + "epoch": 0.44964539007092197, + "grad_norm": 3.447505474090576, + "learning_rate": 4.943845464259557e-06, + "loss": 0.6586, + "step": 951 + }, + { + "epoch": 0.45011820330969265, + "grad_norm": 3.0968854427337646, + "learning_rate": 4.943713911413075e-06, + "loss": 0.7666, + "step": 952 + }, + { + "epoch": 0.45059101654846334, + "grad_norm": 2.4113779067993164, + "learning_rate": 4.943582206407244e-06, + "loss": 0.6173, + "step": 953 + }, + { + "epoch": 0.451063829787234, + "grad_norm": 2.6357979774475098, + "learning_rate": 4.943450349250263e-06, + "loss": 0.5589, + "step": 954 + }, + { + "epoch": 0.4515366430260047, + "grad_norm": 2.9182233810424805, + "learning_rate": 4.9433183399503425e-06, + "loss": 0.6252, + "step": 955 + }, + { + "epoch": 0.4520094562647754, + "grad_norm": 2.832740306854248, + "learning_rate": 4.943186178515703e-06, + "loss": 0.6882, + "step": 956 + }, + { + "epoch": 0.4524822695035461, + "grad_norm": 2.9508981704711914, + "learning_rate": 4.943053864954574e-06, + "loss": 0.5722, + "step": 957 + }, + { + "epoch": 0.4529550827423168, + "grad_norm": 3.044729471206665, + "learning_rate": 4.9429213992751925e-06, + "loss": 0.6772, + "step": 958 + }, + { + "epoch": 0.45342789598108746, + "grad_norm": 2.606003522872925, + "learning_rate": 4.9427887814858075e-06, + "loss": 0.6445, + "step": 959 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 2.4634225368499756, + "learning_rate": 4.942656011594676e-06, + "loss": 0.6151, + "step": 960 + }, + { + "epoch": 0.45437352245862883, + "grad_norm": 2.8872334957122803, + "learning_rate": 4.942523089610066e-06, + "loss": 0.6255, + "step": 961 + }, + { + "epoch": 0.4548463356973995, + "grad_norm": 2.870605707168579, + "learning_rate": 4.942390015540253e-06, + "loss": 0.7481, + "step": 962 + }, + { + "epoch": 0.4553191489361702, + "grad_norm": 2.952680826187134, + "learning_rate": 4.942256789393524e-06, + "loss": 0.5556, + "step": 963 + }, + { + "epoch": 0.4557919621749409, + "grad_norm": 2.623680353164673, + "learning_rate": 4.9421234111781725e-06, + "loss": 0.6115, + "step": 964 + }, + { + "epoch": 0.4562647754137116, + "grad_norm": 2.6933600902557373, + "learning_rate": 4.941989880902505e-06, + "loss": 0.6102, + "step": 965 + }, + { + "epoch": 0.45673758865248226, + "grad_norm": 2.6047189235687256, + "learning_rate": 4.941856198574836e-06, + "loss": 0.612, + "step": 966 + }, + { + "epoch": 0.45721040189125295, + "grad_norm": 2.779186725616455, + "learning_rate": 4.9417223642034885e-06, + "loss": 0.5424, + "step": 967 + }, + { + "epoch": 0.45768321513002364, + "grad_norm": 2.6177165508270264, + "learning_rate": 4.941588377796795e-06, + "loss": 0.4661, + "step": 968 + }, + { + "epoch": 0.4581560283687943, + "grad_norm": 2.959676742553711, + "learning_rate": 4.941454239363101e-06, + "loss": 0.6966, + "step": 969 + }, + { + "epoch": 0.458628841607565, + "grad_norm": 2.9788379669189453, + "learning_rate": 4.941319948910756e-06, + "loss": 0.6181, + "step": 970 + }, + { + "epoch": 0.4591016548463357, + "grad_norm": 4.642750263214111, + "learning_rate": 4.941185506448122e-06, + "loss": 0.5602, + "step": 971 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 2.793002128601074, + "learning_rate": 4.941050911983572e-06, + "loss": 0.602, + "step": 972 + }, + { + "epoch": 0.46004728132387707, + "grad_norm": 2.6833035945892334, + "learning_rate": 4.9409161655254845e-06, + "loss": 
0.5549, + "step": 973 + }, + { + "epoch": 0.46052009456264775, + "grad_norm": 3.905032157897949, + "learning_rate": 4.94078126708225e-06, + "loss": 0.6335, + "step": 974 + }, + { + "epoch": 0.46099290780141844, + "grad_norm": 2.922609329223633, + "learning_rate": 4.94064621666227e-06, + "loss": 0.5839, + "step": 975 + }, + { + "epoch": 0.4614657210401891, + "grad_norm": 2.8277416229248047, + "learning_rate": 4.940511014273952e-06, + "loss": 0.629, + "step": 976 + }, + { + "epoch": 0.4619385342789598, + "grad_norm": 3.07511043548584, + "learning_rate": 4.940375659925714e-06, + "loss": 0.7058, + "step": 977 + }, + { + "epoch": 0.4624113475177305, + "grad_norm": 3.65043044090271, + "learning_rate": 4.940240153625984e-06, + "loss": 0.7174, + "step": 978 + }, + { + "epoch": 0.4628841607565012, + "grad_norm": 2.755167245864868, + "learning_rate": 4.9401044953832e-06, + "loss": 0.6548, + "step": 979 + }, + { + "epoch": 0.46335697399527187, + "grad_norm": 2.9881057739257812, + "learning_rate": 4.939968685205808e-06, + "loss": 0.6245, + "step": 980 + }, + { + "epoch": 0.46382978723404256, + "grad_norm": 2.9484212398529053, + "learning_rate": 4.939832723102266e-06, + "loss": 0.655, + "step": 981 + }, + { + "epoch": 0.46430260047281324, + "grad_norm": 2.898918628692627, + "learning_rate": 4.939696609081038e-06, + "loss": 0.6178, + "step": 982 + }, + { + "epoch": 0.46477541371158393, + "grad_norm": 2.7052435874938965, + "learning_rate": 4.9395603431506e-06, + "loss": 0.6393, + "step": 983 + }, + { + "epoch": 0.4652482269503546, + "grad_norm": 2.5610013008117676, + "learning_rate": 4.939423925319436e-06, + "loss": 0.4847, + "step": 984 + }, + { + "epoch": 0.4657210401891253, + "grad_norm": 3.229083299636841, + "learning_rate": 4.939287355596042e-06, + "loss": 0.6473, + "step": 985 + }, + { + "epoch": 0.466193853427896, + "grad_norm": 2.907097816467285, + "learning_rate": 4.9391506339889195e-06, + "loss": 0.652, + "step": 986 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 2.6929478645324707, + "learning_rate": 4.939013760506582e-06, + "loss": 0.6175, + "step": 987 + }, + { + "epoch": 0.46713947990543736, + "grad_norm": 3.414813280105591, + "learning_rate": 4.938876735157554e-06, + "loss": 0.7597, + "step": 988 + }, + { + "epoch": 0.46761229314420805, + "grad_norm": 3.297360420227051, + "learning_rate": 4.938739557950365e-06, + "loss": 0.6824, + "step": 989 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 3.083155393600464, + "learning_rate": 4.938602228893557e-06, + "loss": 0.6505, + "step": 990 + }, + { + "epoch": 0.4685579196217494, + "grad_norm": 2.9781153202056885, + "learning_rate": 4.938464747995681e-06, + "loss": 0.666, + "step": 991 + }, + { + "epoch": 0.4690307328605201, + "grad_norm": 3.1494534015655518, + "learning_rate": 4.9383271152652975e-06, + "loss": 0.6422, + "step": 992 + }, + { + "epoch": 0.4695035460992908, + "grad_norm": 2.547868490219116, + "learning_rate": 4.938189330710976e-06, + "loss": 0.5766, + "step": 993 + }, + { + "epoch": 0.4699763593380615, + "grad_norm": 2.684736967086792, + "learning_rate": 4.938051394341297e-06, + "loss": 0.6407, + "step": 994 + }, + { + "epoch": 0.47044917257683216, + "grad_norm": 2.9619693756103516, + "learning_rate": 4.937913306164847e-06, + "loss": 0.6936, + "step": 995 + }, + { + "epoch": 0.47092198581560285, + "grad_norm": 2.9698498249053955, + "learning_rate": 4.937775066190227e-06, + "loss": 0.6464, + "step": 996 + }, + { + "epoch": 0.47139479905437354, + "grad_norm": 3.121049642562866, + "learning_rate": 4.937636674426042e-06, + 
"loss": 0.6383, + "step": 997 + }, + { + "epoch": 0.4718676122931442, + "grad_norm": 3.113672971725464, + "learning_rate": 4.93749813088091e-06, + "loss": 0.6892, + "step": 998 + }, + { + "epoch": 0.4723404255319149, + "grad_norm": 3.126113176345825, + "learning_rate": 4.937359435563458e-06, + "loss": 0.6728, + "step": 999 + }, + { + "epoch": 0.4728132387706856, + "grad_norm": 3.353966236114502, + "learning_rate": 4.937220588482321e-06, + "loss": 0.6041, + "step": 1000 + }, + { + "epoch": 0.4732860520094563, + "grad_norm": 2.8860628604888916, + "learning_rate": 4.937081589646144e-06, + "loss": 0.6798, + "step": 1001 + }, + { + "epoch": 0.47375886524822697, + "grad_norm": 3.0510590076446533, + "learning_rate": 4.936942439063584e-06, + "loss": 0.5841, + "step": 1002 + }, + { + "epoch": 0.47423167848699765, + "grad_norm": 2.6998369693756104, + "learning_rate": 4.936803136743303e-06, + "loss": 0.6403, + "step": 1003 + }, + { + "epoch": 0.47470449172576834, + "grad_norm": 2.875347137451172, + "learning_rate": 4.9366636826939765e-06, + "loss": 0.5811, + "step": 1004 + }, + { + "epoch": 0.475177304964539, + "grad_norm": 2.9122262001037598, + "learning_rate": 4.936524076924287e-06, + "loss": 0.6852, + "step": 1005 + }, + { + "epoch": 0.4756501182033097, + "grad_norm": 2.5167057514190674, + "learning_rate": 4.9363843194429265e-06, + "loss": 0.5367, + "step": 1006 + }, + { + "epoch": 0.4761229314420804, + "grad_norm": 2.5745551586151123, + "learning_rate": 4.9362444102585985e-06, + "loss": 0.6241, + "step": 1007 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 2.5024216175079346, + "learning_rate": 4.9361043493800125e-06, + "loss": 0.6133, + "step": 1008 + }, + { + "epoch": 0.47706855791962177, + "grad_norm": 2.7281384468078613, + "learning_rate": 4.935964136815892e-06, + "loss": 0.6834, + "step": 1009 + }, + { + "epoch": 0.47754137115839246, + "grad_norm": 3.0118913650512695, + "learning_rate": 4.935823772574965e-06, + "loss": 0.6922, + "step": 1010 + }, + { + "epoch": 0.47801418439716314, + "grad_norm": 3.016216993331909, + "learning_rate": 4.935683256665973e-06, + "loss": 0.6653, + "step": 1011 + }, + { + "epoch": 0.47848699763593383, + "grad_norm": 2.9526784420013428, + "learning_rate": 4.9355425890976636e-06, + "loss": 0.6423, + "step": 1012 + }, + { + "epoch": 0.4789598108747045, + "grad_norm": 6.222797393798828, + "learning_rate": 4.9354017698787985e-06, + "loss": 0.5884, + "step": 1013 + }, + { + "epoch": 0.4794326241134752, + "grad_norm": 2.6553597450256348, + "learning_rate": 4.935260799018143e-06, + "loss": 0.6624, + "step": 1014 + }, + { + "epoch": 0.4799054373522459, + "grad_norm": 3.0942065715789795, + "learning_rate": 4.935119676524475e-06, + "loss": 0.6623, + "step": 1015 + }, + { + "epoch": 0.4803782505910166, + "grad_norm": 2.626359224319458, + "learning_rate": 4.934978402406585e-06, + "loss": 0.6195, + "step": 1016 + }, + { + "epoch": 0.4808510638297872, + "grad_norm": 2.7954699993133545, + "learning_rate": 4.934836976673265e-06, + "loss": 0.5545, + "step": 1017 + }, + { + "epoch": 0.4813238770685579, + "grad_norm": 2.913557291030884, + "learning_rate": 4.934695399333324e-06, + "loss": 0.6288, + "step": 1018 + }, + { + "epoch": 0.4817966903073286, + "grad_norm": 3.1043739318847656, + "learning_rate": 4.9345536703955746e-06, + "loss": 0.6771, + "step": 1019 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 2.789357900619507, + "learning_rate": 4.934411789868845e-06, + "loss": 0.6227, + "step": 1020 + }, + { + "epoch": 0.48274231678486995, + "grad_norm": 
2.480609655380249, + "learning_rate": 4.934269757761967e-06, + "loss": 0.5779, + "step": 1021 + }, + { + "epoch": 0.48321513002364064, + "grad_norm": 2.7946252822875977, + "learning_rate": 4.934127574083785e-06, + "loss": 0.6166, + "step": 1022 + }, + { + "epoch": 0.4836879432624113, + "grad_norm": 3.0670509338378906, + "learning_rate": 4.933985238843153e-06, + "loss": 0.7766, + "step": 1023 + }, + { + "epoch": 0.484160756501182, + "grad_norm": 2.8567559719085693, + "learning_rate": 4.933842752048932e-06, + "loss": 0.5088, + "step": 1024 + }, + { + "epoch": 0.4846335697399527, + "grad_norm": 2.5674657821655273, + "learning_rate": 4.933700113709996e-06, + "loss": 0.6036, + "step": 1025 + }, + { + "epoch": 0.4851063829787234, + "grad_norm": 2.782339096069336, + "learning_rate": 4.933557323835224e-06, + "loss": 0.5335, + "step": 1026 + }, + { + "epoch": 0.48557919621749407, + "grad_norm": 2.6334071159362793, + "learning_rate": 4.93341438243351e-06, + "loss": 0.6327, + "step": 1027 + }, + { + "epoch": 0.48605200945626476, + "grad_norm": 3.0853965282440186, + "learning_rate": 4.933271289513751e-06, + "loss": 0.7102, + "step": 1028 + }, + { + "epoch": 0.48652482269503544, + "grad_norm": 2.619997501373291, + "learning_rate": 4.933128045084859e-06, + "loss": 0.6138, + "step": 1029 + }, + { + "epoch": 0.48699763593380613, + "grad_norm": 2.8316116333007812, + "learning_rate": 4.932984649155753e-06, + "loss": 0.6346, + "step": 1030 + }, + { + "epoch": 0.4874704491725768, + "grad_norm": 3.153486490249634, + "learning_rate": 4.932841101735361e-06, + "loss": 0.7626, + "step": 1031 + }, + { + "epoch": 0.4879432624113475, + "grad_norm": 3.1831274032592773, + "learning_rate": 4.9326974028326214e-06, + "loss": 0.6607, + "step": 1032 + }, + { + "epoch": 0.4884160756501182, + "grad_norm": 2.791078567504883, + "learning_rate": 4.932553552456481e-06, + "loss": 0.6141, + "step": 1033 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 2.627263307571411, + "learning_rate": 4.932409550615898e-06, + "loss": 0.6777, + "step": 1034 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.8550007343292236, + "learning_rate": 4.932265397319838e-06, + "loss": 0.6379, + "step": 1035 + }, + { + "epoch": 0.48983451536643025, + "grad_norm": 4.505824089050293, + "learning_rate": 4.932121092577276e-06, + "loss": 0.5892, + "step": 1036 + }, + { + "epoch": 0.49030732860520093, + "grad_norm": 3.100191116333008, + "learning_rate": 4.931976636397199e-06, + "loss": 0.6443, + "step": 1037 + }, + { + "epoch": 0.4907801418439716, + "grad_norm": 2.921494245529175, + "learning_rate": 4.9318320287886e-06, + "loss": 0.6821, + "step": 1038 + }, + { + "epoch": 0.4912529550827423, + "grad_norm": 4.577807903289795, + "learning_rate": 4.931687269760485e-06, + "loss": 0.5946, + "step": 1039 + }, + { + "epoch": 0.491725768321513, + "grad_norm": 2.7347636222839355, + "learning_rate": 4.931542359321865e-06, + "loss": 0.5689, + "step": 1040 + }, + { + "epoch": 0.4921985815602837, + "grad_norm": 2.5289158821105957, + "learning_rate": 4.931397297481765e-06, + "loss": 0.5632, + "step": 1041 + }, + { + "epoch": 0.49267139479905436, + "grad_norm": 3.3518471717834473, + "learning_rate": 4.9312520842492165e-06, + "loss": 0.6349, + "step": 1042 + }, + { + "epoch": 0.49314420803782505, + "grad_norm": 3.0469748973846436, + "learning_rate": 4.931106719633261e-06, + "loss": 0.5734, + "step": 1043 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 3.104682445526123, + "learning_rate": 4.930961203642951e-06, + "loss": 0.6101, + "step": 1044 + }, + { + 
"epoch": 0.4940898345153664, + "grad_norm": 2.776705503463745, + "learning_rate": 4.930815536287346e-06, + "loss": 0.6397, + "step": 1045 + }, + { + "epoch": 0.4945626477541371, + "grad_norm": 2.760380983352661, + "learning_rate": 4.930669717575516e-06, + "loss": 0.668, + "step": 1046 + }, + { + "epoch": 0.4950354609929078, + "grad_norm": 2.70084547996521, + "learning_rate": 4.930523747516541e-06, + "loss": 0.5729, + "step": 1047 + }, + { + "epoch": 0.4955082742316785, + "grad_norm": 2.7319583892822266, + "learning_rate": 4.930377626119511e-06, + "loss": 0.6258, + "step": 1048 + }, + { + "epoch": 0.49598108747044917, + "grad_norm": 3.2515223026275635, + "learning_rate": 4.930231353393521e-06, + "loss": 0.7412, + "step": 1049 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 3.0646486282348633, + "learning_rate": 4.930084929347682e-06, + "loss": 0.5809, + "step": 1050 + }, + { + "epoch": 0.49692671394799054, + "grad_norm": 3.1621921062469482, + "learning_rate": 4.9299383539911096e-06, + "loss": 0.6282, + "step": 1051 + }, + { + "epoch": 0.4973995271867612, + "grad_norm": 2.864713191986084, + "learning_rate": 4.929791627332931e-06, + "loss": 0.6263, + "step": 1052 + }, + { + "epoch": 0.4978723404255319, + "grad_norm": 3.181016683578491, + "learning_rate": 4.929644749382283e-06, + "loss": 0.5697, + "step": 1053 + }, + { + "epoch": 0.4983451536643026, + "grad_norm": 2.9064836502075195, + "learning_rate": 4.929497720148309e-06, + "loss": 0.6161, + "step": 1054 + }, + { + "epoch": 0.4988179669030733, + "grad_norm": 3.058112859725952, + "learning_rate": 4.9293505396401655e-06, + "loss": 0.6477, + "step": 1055 + }, + { + "epoch": 0.49929078014184397, + "grad_norm": 2.5227596759796143, + "learning_rate": 4.929203207867016e-06, + "loss": 0.5819, + "step": 1056 + }, + { + "epoch": 0.49976359338061466, + "grad_norm": 3.386862277984619, + "learning_rate": 4.929055724838035e-06, + "loss": 0.7342, + "step": 1057 + }, + { + "epoch": 0.5002364066193853, + "grad_norm": 3.368346929550171, + "learning_rate": 4.928908090562404e-06, + "loss": 0.6622, + "step": 1058 + }, + { + "epoch": 0.500709219858156, + "grad_norm": 2.9108314514160156, + "learning_rate": 4.928760305049317e-06, + "loss": 0.6598, + "step": 1059 + }, + { + "epoch": 0.5011820330969267, + "grad_norm": 2.822305917739868, + "learning_rate": 4.928612368307977e-06, + "loss": 0.5841, + "step": 1060 + }, + { + "epoch": 0.5016548463356973, + "grad_norm": 2.689131259918213, + "learning_rate": 4.928464280347592e-06, + "loss": 0.6631, + "step": 1061 + }, + { + "epoch": 0.502127659574468, + "grad_norm": 3.337214946746826, + "learning_rate": 4.9283160411773864e-06, + "loss": 0.6105, + "step": 1062 + }, + { + "epoch": 0.5026004728132387, + "grad_norm": 3.035911798477173, + "learning_rate": 4.928167650806588e-06, + "loss": 0.6981, + "step": 1063 + }, + { + "epoch": 0.5030732860520094, + "grad_norm": 2.8820855617523193, + "learning_rate": 4.9280191092444375e-06, + "loss": 0.6408, + "step": 1064 + }, + { + "epoch": 0.5035460992907801, + "grad_norm": 3.080432415008545, + "learning_rate": 4.927870416500183e-06, + "loss": 0.6398, + "step": 1065 + }, + { + "epoch": 0.5040189125295508, + "grad_norm": 2.761612892150879, + "learning_rate": 4.927721572583084e-06, + "loss": 0.6126, + "step": 1066 + }, + { + "epoch": 0.5044917257683215, + "grad_norm": 2.8561882972717285, + "learning_rate": 4.927572577502408e-06, + "loss": 0.584, + "step": 1067 + }, + { + "epoch": 0.5049645390070922, + "grad_norm": 3.3386311531066895, + "learning_rate": 4.927423431267432e-06, + 
"loss": 0.6666, + "step": 1068 + }, + { + "epoch": 0.5054373522458628, + "grad_norm": 2.632906675338745, + "learning_rate": 4.927274133887443e-06, + "loss": 0.632, + "step": 1069 + }, + { + "epoch": 0.5059101654846335, + "grad_norm": 2.8737308979034424, + "learning_rate": 4.927124685371737e-06, + "loss": 0.6051, + "step": 1070 + }, + { + "epoch": 0.5063829787234042, + "grad_norm": 3.042222738265991, + "learning_rate": 4.926975085729619e-06, + "loss": 0.6954, + "step": 1071 + }, + { + "epoch": 0.5068557919621749, + "grad_norm": 3.3341481685638428, + "learning_rate": 4.926825334970404e-06, + "loss": 0.7148, + "step": 1072 + }, + { + "epoch": 0.5073286052009456, + "grad_norm": 2.7415387630462646, + "learning_rate": 4.926675433103418e-06, + "loss": 0.5456, + "step": 1073 + }, + { + "epoch": 0.5078014184397163, + "grad_norm": 2.7545325756073, + "learning_rate": 4.926525380137993e-06, + "loss": 0.6213, + "step": 1074 + }, + { + "epoch": 0.508274231678487, + "grad_norm": 2.9153690338134766, + "learning_rate": 4.926375176083472e-06, + "loss": 0.6466, + "step": 1075 + }, + { + "epoch": 0.5087470449172576, + "grad_norm": 4.210638523101807, + "learning_rate": 4.926224820949209e-06, + "loss": 0.6192, + "step": 1076 + }, + { + "epoch": 0.5092198581560283, + "grad_norm": 2.4357898235321045, + "learning_rate": 4.926074314744565e-06, + "loss": 0.594, + "step": 1077 + }, + { + "epoch": 0.509692671394799, + "grad_norm": 2.8004701137542725, + "learning_rate": 4.92592365747891e-06, + "loss": 0.6276, + "step": 1078 + }, + { + "epoch": 0.5101654846335697, + "grad_norm": 2.920675039291382, + "learning_rate": 4.925772849161628e-06, + "loss": 0.6043, + "step": 1079 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.791555404663086, + "learning_rate": 4.9256218898021055e-06, + "loss": 0.6837, + "step": 1080 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 3.1702463626861572, + "learning_rate": 4.925470779409746e-06, + "loss": 0.668, + "step": 1081 + }, + { + "epoch": 0.5115839243498818, + "grad_norm": 2.7149479389190674, + "learning_rate": 4.925319517993955e-06, + "loss": 0.5842, + "step": 1082 + }, + { + "epoch": 0.5120567375886524, + "grad_norm": 2.916311025619507, + "learning_rate": 4.925168105564153e-06, + "loss": 0.6893, + "step": 1083 + }, + { + "epoch": 0.5125295508274231, + "grad_norm": 2.917654514312744, + "learning_rate": 4.925016542129767e-06, + "loss": 0.6513, + "step": 1084 + }, + { + "epoch": 0.5130023640661938, + "grad_norm": 2.5568928718566895, + "learning_rate": 4.924864827700234e-06, + "loss": 0.6177, + "step": 1085 + }, + { + "epoch": 0.5134751773049645, + "grad_norm": 2.816720485687256, + "learning_rate": 4.924712962285001e-06, + "loss": 0.5833, + "step": 1086 + }, + { + "epoch": 0.5139479905437352, + "grad_norm": 2.6989188194274902, + "learning_rate": 4.9245609458935235e-06, + "loss": 0.6332, + "step": 1087 + }, + { + "epoch": 0.5144208037825059, + "grad_norm": 2.959599494934082, + "learning_rate": 4.924408778535268e-06, + "loss": 0.626, + "step": 1088 + }, + { + "epoch": 0.5148936170212766, + "grad_norm": 2.872814416885376, + "learning_rate": 4.924256460219708e-06, + "loss": 0.6407, + "step": 1089 + }, + { + "epoch": 0.5153664302600472, + "grad_norm": 2.6989097595214844, + "learning_rate": 4.924103990956329e-06, + "loss": 0.6391, + "step": 1090 + }, + { + "epoch": 0.5158392434988179, + "grad_norm": 2.986492156982422, + "learning_rate": 4.9239513707546235e-06, + "loss": 0.6911, + "step": 1091 + }, + { + "epoch": 0.5163120567375886, + "grad_norm": 3.069920301437378, + 
"learning_rate": 4.9237985996240954e-06, + "loss": 0.671, + "step": 1092 + }, + { + "epoch": 0.5167848699763593, + "grad_norm": 2.8214917182922363, + "learning_rate": 4.9236456775742555e-06, + "loss": 0.5885, + "step": 1093 + }, + { + "epoch": 0.51725768321513, + "grad_norm": 2.9416961669921875, + "learning_rate": 4.923492604614627e-06, + "loss": 0.6293, + "step": 1094 + }, + { + "epoch": 0.5177304964539007, + "grad_norm": 2.761780023574829, + "learning_rate": 4.923339380754741e-06, + "loss": 0.649, + "step": 1095 + }, + { + "epoch": 0.5182033096926714, + "grad_norm": 2.7648792266845703, + "learning_rate": 4.923186006004138e-06, + "loss": 0.5906, + "step": 1096 + }, + { + "epoch": 0.518676122931442, + "grad_norm": 3.5535428524017334, + "learning_rate": 4.923032480372367e-06, + "loss": 0.7138, + "step": 1097 + }, + { + "epoch": 0.5191489361702127, + "grad_norm": 2.6252479553222656, + "learning_rate": 4.922878803868988e-06, + "loss": 0.5499, + "step": 1098 + }, + { + "epoch": 0.5196217494089834, + "grad_norm": 2.901002883911133, + "learning_rate": 4.9227249765035715e-06, + "loss": 0.6991, + "step": 1099 + }, + { + "epoch": 0.5200945626477541, + "grad_norm": 2.621877431869507, + "learning_rate": 4.9225709982856925e-06, + "loss": 0.6269, + "step": 1100 + }, + { + "epoch": 0.5205673758865248, + "grad_norm": 2.872483015060425, + "learning_rate": 4.92241686922494e-06, + "loss": 0.6657, + "step": 1101 + }, + { + "epoch": 0.5210401891252955, + "grad_norm": 2.730447769165039, + "learning_rate": 4.922262589330912e-06, + "loss": 0.6061, + "step": 1102 + }, + { + "epoch": 0.5215130023640662, + "grad_norm": 2.646247386932373, + "learning_rate": 4.922108158613213e-06, + "loss": 0.5923, + "step": 1103 + }, + { + "epoch": 0.5219858156028369, + "grad_norm": 2.6488895416259766, + "learning_rate": 4.92195357708146e-06, + "loss": 0.6293, + "step": 1104 + }, + { + "epoch": 0.5224586288416075, + "grad_norm": 2.756338357925415, + "learning_rate": 4.921798844745278e-06, + "loss": 0.6374, + "step": 1105 + }, + { + "epoch": 0.5229314420803782, + "grad_norm": 3.1441280841827393, + "learning_rate": 4.921643961614301e-06, + "loss": 0.6652, + "step": 1106 + }, + { + "epoch": 0.5234042553191489, + "grad_norm": 3.050002098083496, + "learning_rate": 4.921488927698172e-06, + "loss": 0.6809, + "step": 1107 + }, + { + "epoch": 0.5238770685579196, + "grad_norm": 2.71750807762146, + "learning_rate": 4.921333743006547e-06, + "loss": 0.6266, + "step": 1108 + }, + { + "epoch": 0.5243498817966903, + "grad_norm": 2.8439245223999023, + "learning_rate": 4.921178407549086e-06, + "loss": 0.5663, + "step": 1109 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 3.0722241401672363, + "learning_rate": 4.921022921335464e-06, + "loss": 0.6791, + "step": 1110 + }, + { + "epoch": 0.5252955082742317, + "grad_norm": 3.4381656646728516, + "learning_rate": 4.920867284375358e-06, + "loss": 0.6687, + "step": 1111 + }, + { + "epoch": 0.5257683215130023, + "grad_norm": 2.819812774658203, + "learning_rate": 4.920711496678463e-06, + "loss": 0.6299, + "step": 1112 + }, + { + "epoch": 0.526241134751773, + "grad_norm": 3.6587414741516113, + "learning_rate": 4.9205555582544765e-06, + "loss": 0.7392, + "step": 1113 + }, + { + "epoch": 0.5267139479905437, + "grad_norm": 2.774296522140503, + "learning_rate": 4.920399469113109e-06, + "loss": 0.6652, + "step": 1114 + }, + { + "epoch": 0.5271867612293144, + "grad_norm": 2.7480580806732178, + "learning_rate": 4.920243229264081e-06, + "loss": 0.596, + "step": 1115 + }, + { + "epoch": 0.5276595744680851, + 
"grad_norm": 3.213057518005371, + "learning_rate": 4.920086838717119e-06, + "loss": 0.6986, + "step": 1116 + }, + { + "epoch": 0.5281323877068558, + "grad_norm": 2.940546989440918, + "learning_rate": 4.919930297481962e-06, + "loss": 0.6481, + "step": 1117 + }, + { + "epoch": 0.5286052009456265, + "grad_norm": 2.5970494747161865, + "learning_rate": 4.9197736055683555e-06, + "loss": 0.5658, + "step": 1118 + }, + { + "epoch": 0.5290780141843971, + "grad_norm": 4.49385404586792, + "learning_rate": 4.919616762986057e-06, + "loss": 0.605, + "step": 1119 + }, + { + "epoch": 0.5295508274231678, + "grad_norm": 2.971857786178589, + "learning_rate": 4.919459769744833e-06, + "loss": 0.6539, + "step": 1120 + }, + { + "epoch": 0.5300236406619385, + "grad_norm": 2.6192965507507324, + "learning_rate": 4.919302625854457e-06, + "loss": 0.6226, + "step": 1121 + }, + { + "epoch": 0.5304964539007092, + "grad_norm": 2.665088176727295, + "learning_rate": 4.919145331324716e-06, + "loss": 0.6647, + "step": 1122 + }, + { + "epoch": 0.5309692671394799, + "grad_norm": 2.612126111984253, + "learning_rate": 4.918987886165403e-06, + "loss": 0.6965, + "step": 1123 + }, + { + "epoch": 0.5314420803782506, + "grad_norm": 3.80017352104187, + "learning_rate": 4.9188302903863205e-06, + "loss": 0.7396, + "step": 1124 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 2.781752824783325, + "learning_rate": 4.918672543997282e-06, + "loss": 0.5985, + "step": 1125 + }, + { + "epoch": 0.532387706855792, + "grad_norm": 2.6067914962768555, + "learning_rate": 4.91851464700811e-06, + "loss": 0.6159, + "step": 1126 + }, + { + "epoch": 0.5328605200945626, + "grad_norm": 2.670807123184204, + "learning_rate": 4.918356599428636e-06, + "loss": 0.5958, + "step": 1127 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 2.608611822128296, + "learning_rate": 4.9181984012687e-06, + "loss": 0.5768, + "step": 1128 + }, + { + "epoch": 0.533806146572104, + "grad_norm": 2.586764097213745, + "learning_rate": 4.918040052538154e-06, + "loss": 0.661, + "step": 1129 + }, + { + "epoch": 0.5342789598108747, + "grad_norm": 3.1317451000213623, + "learning_rate": 4.917881553246856e-06, + "loss": 0.6626, + "step": 1130 + }, + { + "epoch": 0.5347517730496454, + "grad_norm": 2.7135281562805176, + "learning_rate": 4.917722903404676e-06, + "loss": 0.6572, + "step": 1131 + }, + { + "epoch": 0.5352245862884161, + "grad_norm": 3.4546358585357666, + "learning_rate": 4.917564103021493e-06, + "loss": 0.5597, + "step": 1132 + }, + { + "epoch": 0.5356973995271868, + "grad_norm": 3.0943493843078613, + "learning_rate": 4.917405152107193e-06, + "loss": 0.7258, + "step": 1133 + }, + { + "epoch": 0.5361702127659574, + "grad_norm": 2.6069352626800537, + "learning_rate": 4.917246050671674e-06, + "loss": 0.6209, + "step": 1134 + }, + { + "epoch": 0.5366430260047281, + "grad_norm": 2.584883689880371, + "learning_rate": 4.917086798724844e-06, + "loss": 0.658, + "step": 1135 + }, + { + "epoch": 0.5371158392434988, + "grad_norm": 3.001976490020752, + "learning_rate": 4.9169273962766166e-06, + "loss": 0.6306, + "step": 1136 + }, + { + "epoch": 0.5375886524822695, + "grad_norm": 2.5013928413391113, + "learning_rate": 4.916767843336918e-06, + "loss": 0.572, + "step": 1137 + }, + { + "epoch": 0.5380614657210402, + "grad_norm": 2.9114553928375244, + "learning_rate": 4.916608139915684e-06, + "loss": 0.5841, + "step": 1138 + }, + { + "epoch": 0.5385342789598109, + "grad_norm": 2.8878467082977295, + "learning_rate": 4.9164482860228564e-06, + "loss": 0.6654, + "step": 1139 + }, + { + 
"epoch": 0.5390070921985816, + "grad_norm": 2.9827866554260254, + "learning_rate": 4.91628828166839e-06, + "loss": 0.6674, + "step": 1140 + }, + { + "epoch": 0.5394799054373522, + "grad_norm": 3.8696281909942627, + "learning_rate": 4.916128126862248e-06, + "loss": 0.6241, + "step": 1141 + }, + { + "epoch": 0.5399527186761229, + "grad_norm": 2.9556291103363037, + "learning_rate": 4.915967821614402e-06, + "loss": 0.6478, + "step": 1142 + }, + { + "epoch": 0.5404255319148936, + "grad_norm": 2.392942428588867, + "learning_rate": 4.915807365934834e-06, + "loss": 0.6097, + "step": 1143 + }, + { + "epoch": 0.5408983451536643, + "grad_norm": 3.032235860824585, + "learning_rate": 4.915646759833534e-06, + "loss": 0.7193, + "step": 1144 + }, + { + "epoch": 0.541371158392435, + "grad_norm": 2.840416193008423, + "learning_rate": 4.915486003320501e-06, + "loss": 0.5506, + "step": 1145 + }, + { + "epoch": 0.5418439716312057, + "grad_norm": 2.5438895225524902, + "learning_rate": 4.915325096405747e-06, + "loss": 0.6487, + "step": 1146 + }, + { + "epoch": 0.5423167848699764, + "grad_norm": 2.544334650039673, + "learning_rate": 4.9151640390992905e-06, + "loss": 0.6168, + "step": 1147 + }, + { + "epoch": 0.542789598108747, + "grad_norm": 2.8535678386688232, + "learning_rate": 4.91500283141116e-06, + "loss": 0.678, + "step": 1148 + }, + { + "epoch": 0.5432624113475177, + "grad_norm": 2.8086955547332764, + "learning_rate": 4.9148414733513915e-06, + "loss": 0.6473, + "step": 1149 + }, + { + "epoch": 0.5437352245862884, + "grad_norm": 2.4709885120391846, + "learning_rate": 4.914679964930034e-06, + "loss": 0.6797, + "step": 1150 + }, + { + "epoch": 0.5442080378250591, + "grad_norm": 2.8546934127807617, + "learning_rate": 4.9145183061571435e-06, + "loss": 0.6247, + "step": 1151 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 2.991184711456299, + "learning_rate": 4.9143564970427844e-06, + "loss": 0.5977, + "step": 1152 + }, + { + "epoch": 0.5451536643026005, + "grad_norm": 3.011216402053833, + "learning_rate": 4.914194537597033e-06, + "loss": 0.7005, + "step": 1153 + }, + { + "epoch": 0.5456264775413712, + "grad_norm": 2.807521343231201, + "learning_rate": 4.9140324278299744e-06, + "loss": 0.5412, + "step": 1154 + }, + { + "epoch": 0.5460992907801419, + "grad_norm": 3.0401229858398438, + "learning_rate": 4.913870167751701e-06, + "loss": 0.6394, + "step": 1155 + }, + { + "epoch": 0.5465721040189125, + "grad_norm": 2.853914976119995, + "learning_rate": 4.913707757372317e-06, + "loss": 0.6745, + "step": 1156 + }, + { + "epoch": 0.5470449172576832, + "grad_norm": 4.505620956420898, + "learning_rate": 4.913545196701935e-06, + "loss": 0.6668, + "step": 1157 + }, + { + "epoch": 0.5475177304964539, + "grad_norm": 3.0505781173706055, + "learning_rate": 4.913382485750676e-06, + "loss": 0.6926, + "step": 1158 + }, + { + "epoch": 0.5479905437352246, + "grad_norm": 2.798435688018799, + "learning_rate": 4.913219624528672e-06, + "loss": 0.605, + "step": 1159 + }, + { + "epoch": 0.5484633569739953, + "grad_norm": 2.7814908027648926, + "learning_rate": 4.913056613046065e-06, + "loss": 0.6678, + "step": 1160 + }, + { + "epoch": 0.548936170212766, + "grad_norm": 3.2089321613311768, + "learning_rate": 4.9128934513130025e-06, + "loss": 0.5995, + "step": 1161 + }, + { + "epoch": 0.5494089834515367, + "grad_norm": 2.7699952125549316, + "learning_rate": 4.9127301393396455e-06, + "loss": 0.7062, + "step": 1162 + }, + { + "epoch": 0.5498817966903073, + "grad_norm": 2.859368324279785, + "learning_rate": 4.912566677136162e-06, + "loss": 
0.6063, + "step": 1163 + }, + { + "epoch": 0.550354609929078, + "grad_norm": 2.727334499359131, + "learning_rate": 4.91240306471273e-06, + "loss": 0.6848, + "step": 1164 + }, + { + "epoch": 0.5508274231678487, + "grad_norm": 2.6017510890960693, + "learning_rate": 4.912239302079537e-06, + "loss": 0.5808, + "step": 1165 + }, + { + "epoch": 0.5513002364066194, + "grad_norm": 3.539583206176758, + "learning_rate": 4.912075389246781e-06, + "loss": 0.7053, + "step": 1166 + }, + { + "epoch": 0.5517730496453901, + "grad_norm": 2.918280601501465, + "learning_rate": 4.911911326224666e-06, + "loss": 0.5904, + "step": 1167 + }, + { + "epoch": 0.5522458628841608, + "grad_norm": 3.0067362785339355, + "learning_rate": 4.9117471130234095e-06, + "loss": 0.6392, + "step": 1168 + }, + { + "epoch": 0.5527186761229315, + "grad_norm": 2.4374797344207764, + "learning_rate": 4.911582749653236e-06, + "loss": 0.5793, + "step": 1169 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 3.121182918548584, + "learning_rate": 4.911418236124378e-06, + "loss": 0.6636, + "step": 1170 + }, + { + "epoch": 0.5536643026004728, + "grad_norm": 3.1289851665496826, + "learning_rate": 4.91125357244708e-06, + "loss": 0.656, + "step": 1171 + }, + { + "epoch": 0.5541371158392435, + "grad_norm": 2.7034592628479004, + "learning_rate": 4.911088758631596e-06, + "loss": 0.6001, + "step": 1172 + }, + { + "epoch": 0.5546099290780142, + "grad_norm": 2.710146188735962, + "learning_rate": 4.910923794688187e-06, + "loss": 0.6007, + "step": 1173 + }, + { + "epoch": 0.5550827423167849, + "grad_norm": 2.5424487590789795, + "learning_rate": 4.910758680627124e-06, + "loss": 0.5193, + "step": 1174 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.615893602371216, + "learning_rate": 4.91059341645869e-06, + "loss": 0.5525, + "step": 1175 + }, + { + "epoch": 0.5560283687943263, + "grad_norm": 3.3179728984832764, + "learning_rate": 4.910428002193174e-06, + "loss": 0.7285, + "step": 1176 + }, + { + "epoch": 0.556501182033097, + "grad_norm": 2.7234175205230713, + "learning_rate": 4.910262437840875e-06, + "loss": 0.574, + "step": 1177 + }, + { + "epoch": 0.5569739952718676, + "grad_norm": 3.0416605472564697, + "learning_rate": 4.9100967234121034e-06, + "loss": 0.5623, + "step": 1178 + }, + { + "epoch": 0.5574468085106383, + "grad_norm": 3.067786455154419, + "learning_rate": 4.909930858917177e-06, + "loss": 0.6491, + "step": 1179 + }, + { + "epoch": 0.557919621749409, + "grad_norm": 3.0037379264831543, + "learning_rate": 4.909764844366422e-06, + "loss": 0.5696, + "step": 1180 + }, + { + "epoch": 0.5583924349881797, + "grad_norm": 2.966179609298706, + "learning_rate": 4.909598679770178e-06, + "loss": 0.6042, + "step": 1181 + }, + { + "epoch": 0.5588652482269504, + "grad_norm": 2.6000657081604004, + "learning_rate": 4.909432365138789e-06, + "loss": 0.5883, + "step": 1182 + }, + { + "epoch": 0.5593380614657211, + "grad_norm": 2.6794495582580566, + "learning_rate": 4.909265900482612e-06, + "loss": 0.6809, + "step": 1183 + }, + { + "epoch": 0.5598108747044918, + "grad_norm": 2.6765122413635254, + "learning_rate": 4.9090992858120115e-06, + "loss": 0.6601, + "step": 1184 + }, + { + "epoch": 0.5602836879432624, + "grad_norm": 2.6051928997039795, + "learning_rate": 4.908932521137363e-06, + "loss": 0.5946, + "step": 1185 + }, + { + "epoch": 0.5607565011820331, + "grad_norm": 3.0405542850494385, + "learning_rate": 4.908765606469048e-06, + "loss": 0.6998, + "step": 1186 + }, + { + "epoch": 0.5612293144208038, + "grad_norm": 2.7975668907165527, + "learning_rate": 
4.908598541817462e-06, + "loss": 0.6218, + "step": 1187 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 2.5367627143859863, + "learning_rate": 4.908431327193005e-06, + "loss": 0.6354, + "step": 1188 + }, + { + "epoch": 0.5621749408983452, + "grad_norm": 3.7939631938934326, + "learning_rate": 4.908263962606091e-06, + "loss": 0.6376, + "step": 1189 + }, + { + "epoch": 0.5626477541371159, + "grad_norm": 2.864079475402832, + "learning_rate": 4.908096448067139e-06, + "loss": 0.5485, + "step": 1190 + }, + { + "epoch": 0.5631205673758866, + "grad_norm": 2.7855563163757324, + "learning_rate": 4.9079287835865804e-06, + "loss": 0.6645, + "step": 1191 + }, + { + "epoch": 0.5635933806146572, + "grad_norm": 2.6156625747680664, + "learning_rate": 4.9077609691748556e-06, + "loss": 0.5751, + "step": 1192 + }, + { + "epoch": 0.5640661938534279, + "grad_norm": 3.0475659370422363, + "learning_rate": 4.907593004842412e-06, + "loss": 0.6739, + "step": 1193 + }, + { + "epoch": 0.5645390070921986, + "grad_norm": 2.9176738262176514, + "learning_rate": 4.9074248905997104e-06, + "loss": 0.6493, + "step": 1194 + }, + { + "epoch": 0.5650118203309693, + "grad_norm": 2.6168384552001953, + "learning_rate": 4.907256626457216e-06, + "loss": 0.6154, + "step": 1195 + }, + { + "epoch": 0.56548463356974, + "grad_norm": 2.893980026245117, + "learning_rate": 4.907088212425408e-06, + "loss": 0.5808, + "step": 1196 + }, + { + "epoch": 0.5659574468085107, + "grad_norm": 3.3832836151123047, + "learning_rate": 4.90691964851477e-06, + "loss": 0.7888, + "step": 1197 + }, + { + "epoch": 0.5664302600472814, + "grad_norm": 3.088932752609253, + "learning_rate": 4.906750934735801e-06, + "loss": 0.6516, + "step": 1198 + }, + { + "epoch": 0.566903073286052, + "grad_norm": 2.494471549987793, + "learning_rate": 4.906582071099004e-06, + "loss": 0.6286, + "step": 1199 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 2.716550588607788, + "learning_rate": 4.906413057614895e-06, + "loss": 0.5939, + "step": 1200 + }, + { + "epoch": 0.5678486997635934, + "grad_norm": 2.5821073055267334, + "learning_rate": 4.906243894293995e-06, + "loss": 0.6668, + "step": 1201 + }, + { + "epoch": 0.5683215130023641, + "grad_norm": 3.651787042617798, + "learning_rate": 4.90607458114684e-06, + "loss": 0.6124, + "step": 1202 + }, + { + "epoch": 0.5687943262411348, + "grad_norm": 2.7567858695983887, + "learning_rate": 4.9059051181839705e-06, + "loss": 0.6656, + "step": 1203 + }, + { + "epoch": 0.5692671394799055, + "grad_norm": 2.8067586421966553, + "learning_rate": 4.90573550541594e-06, + "loss": 0.6306, + "step": 1204 + }, + { + "epoch": 0.5697399527186762, + "grad_norm": 2.6136393547058105, + "learning_rate": 4.905565742853307e-06, + "loss": 0.5992, + "step": 1205 + }, + { + "epoch": 0.5702127659574469, + "grad_norm": 2.899049758911133, + "learning_rate": 4.905395830506644e-06, + "loss": 0.621, + "step": 1206 + }, + { + "epoch": 0.5706855791962175, + "grad_norm": 3.036583185195923, + "learning_rate": 4.9052257683865294e-06, + "loss": 0.652, + "step": 1207 + }, + { + "epoch": 0.5711583924349882, + "grad_norm": 2.7947216033935547, + "learning_rate": 4.905055556503553e-06, + "loss": 0.6636, + "step": 1208 + }, + { + "epoch": 0.5716312056737589, + "grad_norm": 3.1646955013275146, + "learning_rate": 4.9048851948683135e-06, + "loss": 0.6376, + "step": 1209 + }, + { + "epoch": 0.5721040189125296, + "grad_norm": 2.8175766468048096, + "learning_rate": 4.904714683491417e-06, + "loss": 0.5929, + "step": 1210 + }, + { + "epoch": 0.5725768321513003, + "grad_norm": 
2.923923969268799, + "learning_rate": 4.904544022383483e-06, + "loss": 0.6633, + "step": 1211 + }, + { + "epoch": 0.573049645390071, + "grad_norm": 2.7471134662628174, + "learning_rate": 4.9043732115551356e-06, + "loss": 0.6551, + "step": 1212 + }, + { + "epoch": 0.5735224586288417, + "grad_norm": 2.8660807609558105, + "learning_rate": 4.90420225101701e-06, + "loss": 0.6423, + "step": 1213 + }, + { + "epoch": 0.5739952718676123, + "grad_norm": 2.769247531890869, + "learning_rate": 4.904031140779754e-06, + "loss": 0.5982, + "step": 1214 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.9043145179748535, + "learning_rate": 4.90385988085402e-06, + "loss": 0.5843, + "step": 1215 + }, + { + "epoch": 0.5749408983451537, + "grad_norm": 2.6639609336853027, + "learning_rate": 4.903688471250471e-06, + "loss": 0.5858, + "step": 1216 + }, + { + "epoch": 0.5754137115839244, + "grad_norm": 2.6967573165893555, + "learning_rate": 4.903516911979781e-06, + "loss": 0.5755, + "step": 1217 + }, + { + "epoch": 0.5758865248226951, + "grad_norm": 2.8865857124328613, + "learning_rate": 4.903345203052633e-06, + "loss": 0.6051, + "step": 1218 + }, + { + "epoch": 0.5763593380614658, + "grad_norm": 2.381979465484619, + "learning_rate": 4.903173344479717e-06, + "loss": 0.5727, + "step": 1219 + }, + { + "epoch": 0.5768321513002365, + "grad_norm": 2.7717981338500977, + "learning_rate": 4.903001336271734e-06, + "loss": 0.6406, + "step": 1220 + }, + { + "epoch": 0.577304964539007, + "grad_norm": 2.6431570053100586, + "learning_rate": 4.902829178439395e-06, + "loss": 0.6226, + "step": 1221 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 2.8090415000915527, + "learning_rate": 4.902656870993419e-06, + "loss": 0.5761, + "step": 1222 + }, + { + "epoch": 0.5782505910165484, + "grad_norm": 2.4769368171691895, + "learning_rate": 4.902484413944535e-06, + "loss": 0.5602, + "step": 1223 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 2.693316698074341, + "learning_rate": 4.902311807303481e-06, + "loss": 0.5222, + "step": 1224 + }, + { + "epoch": 0.5791962174940898, + "grad_norm": 2.7623913288116455, + "learning_rate": 4.902139051081004e-06, + "loss": 0.6978, + "step": 1225 + }, + { + "epoch": 0.5796690307328605, + "grad_norm": 2.6133766174316406, + "learning_rate": 4.901966145287863e-06, + "loss": 0.5802, + "step": 1226 + }, + { + "epoch": 0.5801418439716312, + "grad_norm": 2.7345972061157227, + "learning_rate": 4.901793089934821e-06, + "loss": 0.6294, + "step": 1227 + }, + { + "epoch": 0.5806146572104018, + "grad_norm": 2.7545835971832275, + "learning_rate": 4.9016198850326555e-06, + "loss": 0.6085, + "step": 1228 + }, + { + "epoch": 0.5810874704491725, + "grad_norm": 2.6947758197784424, + "learning_rate": 4.90144653059215e-06, + "loss": 0.6025, + "step": 1229 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 2.692967414855957, + "learning_rate": 4.901273026624099e-06, + "loss": 0.5715, + "step": 1230 + }, + { + "epoch": 0.5820330969267139, + "grad_norm": 2.78347110748291, + "learning_rate": 4.901099373139307e-06, + "loss": 0.6063, + "step": 1231 + }, + { + "epoch": 0.5825059101654846, + "grad_norm": 2.346496343612671, + "learning_rate": 4.900925570148585e-06, + "loss": 0.5869, + "step": 1232 + }, + { + "epoch": 0.5829787234042553, + "grad_norm": 2.606639862060547, + "learning_rate": 4.900751617662755e-06, + "loss": 0.6197, + "step": 1233 + }, + { + "epoch": 0.583451536643026, + "grad_norm": 2.5825929641723633, + "learning_rate": 4.900577515692649e-06, + "loss": 0.6721, + "step": 1234 + }, + { + "epoch": 
0.5839243498817966, + "grad_norm": 2.731349468231201, + "learning_rate": 4.900403264249107e-06, + "loss": 0.6273, + "step": 1235 + }, + { + "epoch": 0.5843971631205673, + "grad_norm": 3.2133874893188477, + "learning_rate": 4.90022886334298e-06, + "loss": 0.6231, + "step": 1236 + }, + { + "epoch": 0.584869976359338, + "grad_norm": 2.9213852882385254, + "learning_rate": 4.900054312985127e-06, + "loss": 0.6677, + "step": 1237 + }, + { + "epoch": 0.5853427895981087, + "grad_norm": 2.815425157546997, + "learning_rate": 4.899879613186414e-06, + "loss": 0.6405, + "step": 1238 + }, + { + "epoch": 0.5858156028368794, + "grad_norm": 2.730782985687256, + "learning_rate": 4.899704763957721e-06, + "loss": 0.6233, + "step": 1239 + }, + { + "epoch": 0.5862884160756501, + "grad_norm": 2.6432766914367676, + "learning_rate": 4.899529765309936e-06, + "loss": 0.6267, + "step": 1240 + }, + { + "epoch": 0.5867612293144208, + "grad_norm": 2.616215229034424, + "learning_rate": 4.899354617253953e-06, + "loss": 0.6268, + "step": 1241 + }, + { + "epoch": 0.5872340425531914, + "grad_norm": 2.7630255222320557, + "learning_rate": 4.899179319800679e-06, + "loss": 0.6348, + "step": 1242 + }, + { + "epoch": 0.5877068557919621, + "grad_norm": 2.785095453262329, + "learning_rate": 4.899003872961029e-06, + "loss": 0.5839, + "step": 1243 + }, + { + "epoch": 0.5881796690307328, + "grad_norm": 2.9050328731536865, + "learning_rate": 4.898828276745927e-06, + "loss": 0.651, + "step": 1244 + }, + { + "epoch": 0.5886524822695035, + "grad_norm": 2.958092212677002, + "learning_rate": 4.8986525311663065e-06, + "loss": 0.6395, + "step": 1245 + }, + { + "epoch": 0.5891252955082742, + "grad_norm": 2.952310800552368, + "learning_rate": 4.898476636233111e-06, + "loss": 0.6731, + "step": 1246 + }, + { + "epoch": 0.5895981087470449, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.898300591957293e-06, + "loss": 0.7015, + "step": 1247 + }, + { + "epoch": 0.5900709219858156, + "grad_norm": 2.8941752910614014, + "learning_rate": 4.898124398349813e-06, + "loss": 0.6452, + "step": 1248 + }, + { + "epoch": 0.5905437352245863, + "grad_norm": 2.9809536933898926, + "learning_rate": 4.897948055421642e-06, + "loss": 0.5736, + "step": 1249 + }, + { + "epoch": 0.5910165484633569, + "grad_norm": 2.927046775817871, + "learning_rate": 4.897771563183761e-06, + "loss": 0.5918, + "step": 1250 + }, + { + "epoch": 0.5914893617021276, + "grad_norm": 2.865020275115967, + "learning_rate": 4.897594921647158e-06, + "loss": 0.6924, + "step": 1251 + }, + { + "epoch": 0.5919621749408983, + "grad_norm": 2.7406699657440186, + "learning_rate": 4.897418130822832e-06, + "loss": 0.509, + "step": 1252 + }, + { + "epoch": 0.592434988179669, + "grad_norm": 2.781606912612915, + "learning_rate": 4.897241190721791e-06, + "loss": 0.5555, + "step": 1253 + }, + { + "epoch": 0.5929078014184397, + "grad_norm": 2.79209303855896, + "learning_rate": 4.8970641013550535e-06, + "loss": 0.6722, + "step": 1254 + }, + { + "epoch": 0.5933806146572104, + "grad_norm": 3.0672268867492676, + "learning_rate": 4.896886862733645e-06, + "loss": 0.6366, + "step": 1255 + }, + { + "epoch": 0.5938534278959811, + "grad_norm": 2.7456953525543213, + "learning_rate": 4.896709474868602e-06, + "loss": 0.6246, + "step": 1256 + }, + { + "epoch": 0.5943262411347517, + "grad_norm": 3.6731202602386475, + "learning_rate": 4.896531937770968e-06, + "loss": 0.668, + "step": 1257 + }, + { + "epoch": 0.5947990543735224, + "grad_norm": 2.6056087017059326, + "learning_rate": 4.8963542514518e-06, + "loss": 0.5815, + 
"step": 1258 + }, + { + "epoch": 0.5952718676122931, + "grad_norm": 2.719698905944824, + "learning_rate": 4.89617641592216e-06, + "loss": 0.6058, + "step": 1259 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 2.625838279724121, + "learning_rate": 4.895998431193121e-06, + "loss": 0.6143, + "step": 1260 + }, + { + "epoch": 0.5962174940898345, + "grad_norm": 2.7166085243225098, + "learning_rate": 4.895820297275767e-06, + "loss": 0.5187, + "step": 1261 + }, + { + "epoch": 0.5966903073286052, + "grad_norm": 2.7544102668762207, + "learning_rate": 4.8956420141811875e-06, + "loss": 0.5928, + "step": 1262 + }, + { + "epoch": 0.5971631205673759, + "grad_norm": 2.6678333282470703, + "learning_rate": 4.895463581920484e-06, + "loss": 0.611, + "step": 1263 + }, + { + "epoch": 0.5976359338061465, + "grad_norm": 2.853384494781494, + "learning_rate": 4.895285000504768e-06, + "loss": 0.642, + "step": 1264 + }, + { + "epoch": 0.5981087470449172, + "grad_norm": 2.637852430343628, + "learning_rate": 4.895106269945158e-06, + "loss": 0.6308, + "step": 1265 + }, + { + "epoch": 0.5985815602836879, + "grad_norm": 2.9880387783050537, + "learning_rate": 4.8949273902527826e-06, + "loss": 0.5781, + "step": 1266 + }, + { + "epoch": 0.5990543735224586, + "grad_norm": 3.5984015464782715, + "learning_rate": 4.89474836143878e-06, + "loss": 0.5865, + "step": 1267 + }, + { + "epoch": 0.5995271867612293, + "grad_norm": 2.719855546951294, + "learning_rate": 4.8945691835142975e-06, + "loss": 0.6393, + "step": 1268 + }, + { + "epoch": 0.6, + "grad_norm": 2.7885141372680664, + "learning_rate": 4.894389856490492e-06, + "loss": 0.66, + "step": 1269 + }, + { + "epoch": 0.6004728132387707, + "grad_norm": 2.698819875717163, + "learning_rate": 4.894210380378529e-06, + "loss": 0.6144, + "step": 1270 + }, + { + "epoch": 0.6009456264775414, + "grad_norm": 2.278045654296875, + "learning_rate": 4.894030755189584e-06, + "loss": 0.5609, + "step": 1271 + }, + { + "epoch": 0.601418439716312, + "grad_norm": 2.8729357719421387, + "learning_rate": 4.893850980934841e-06, + "loss": 0.6715, + "step": 1272 + }, + { + "epoch": 0.6018912529550827, + "grad_norm": 2.8541221618652344, + "learning_rate": 4.893671057625495e-06, + "loss": 0.6787, + "step": 1273 + }, + { + "epoch": 0.6023640661938534, + "grad_norm": 2.4561476707458496, + "learning_rate": 4.893490985272748e-06, + "loss": 0.6331, + "step": 1274 + }, + { + "epoch": 0.6028368794326241, + "grad_norm": 2.565739154815674, + "learning_rate": 4.893310763887812e-06, + "loss": 0.587, + "step": 1275 + }, + { + "epoch": 0.6033096926713948, + "grad_norm": 2.384951591491699, + "learning_rate": 4.8931303934819095e-06, + "loss": 0.5358, + "step": 1276 + }, + { + "epoch": 0.6037825059101655, + "grad_norm": 2.380808115005493, + "learning_rate": 4.89294987406627e-06, + "loss": 0.5402, + "step": 1277 + }, + { + "epoch": 0.6042553191489362, + "grad_norm": 2.764815092086792, + "learning_rate": 4.892769205652136e-06, + "loss": 0.6103, + "step": 1278 + }, + { + "epoch": 0.6047281323877068, + "grad_norm": 2.463350296020508, + "learning_rate": 4.892588388250754e-06, + "loss": 0.5937, + "step": 1279 + }, + { + "epoch": 0.6052009456264775, + "grad_norm": 3.099689245223999, + "learning_rate": 4.8924074218733855e-06, + "loss": 0.6354, + "step": 1280 + }, + { + "epoch": 0.6056737588652482, + "grad_norm": 2.804450035095215, + "learning_rate": 4.892226306531297e-06, + "loss": 0.6595, + "step": 1281 + }, + { + "epoch": 0.6061465721040189, + "grad_norm": 3.1559767723083496, + "learning_rate": 4.892045042235765e-06, + 
"loss": 0.6664, + "step": 1282 + }, + { + "epoch": 0.6066193853427896, + "grad_norm": 2.844341993331909, + "learning_rate": 4.891863628998079e-06, + "loss": 0.7454, + "step": 1283 + }, + { + "epoch": 0.6070921985815603, + "grad_norm": 2.686602830886841, + "learning_rate": 4.891682066829532e-06, + "loss": 0.6755, + "step": 1284 + }, + { + "epoch": 0.607565011820331, + "grad_norm": 2.736457347869873, + "learning_rate": 4.8915003557414285e-06, + "loss": 0.6305, + "step": 1285 + }, + { + "epoch": 0.6080378250591016, + "grad_norm": 2.661362409591675, + "learning_rate": 4.891318495745086e-06, + "loss": 0.5958, + "step": 1286 + }, + { + "epoch": 0.6085106382978723, + "grad_norm": 2.707348108291626, + "learning_rate": 4.8911364868518255e-06, + "loss": 0.5824, + "step": 1287 + }, + { + "epoch": 0.608983451536643, + "grad_norm": 2.9798858165740967, + "learning_rate": 4.890954329072981e-06, + "loss": 0.5981, + "step": 1288 + }, + { + "epoch": 0.6094562647754137, + "grad_norm": 2.6285455226898193, + "learning_rate": 4.890772022419895e-06, + "loss": 0.6194, + "step": 1289 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 2.9254322052001953, + "learning_rate": 4.890589566903917e-06, + "loss": 0.6002, + "step": 1290 + }, + { + "epoch": 0.6104018912529551, + "grad_norm": 2.6458325386047363, + "learning_rate": 4.89040696253641e-06, + "loss": 0.5457, + "step": 1291 + }, + { + "epoch": 0.6108747044917258, + "grad_norm": 2.508242607116699, + "learning_rate": 4.890224209328743e-06, + "loss": 0.6168, + "step": 1292 + }, + { + "epoch": 0.6113475177304964, + "grad_norm": 3.034785509109497, + "learning_rate": 4.890041307292296e-06, + "loss": 0.664, + "step": 1293 + }, + { + "epoch": 0.6118203309692671, + "grad_norm": 3.52469539642334, + "learning_rate": 4.889858256438455e-06, + "loss": 0.7301, + "step": 1294 + }, + { + "epoch": 0.6122931442080378, + "grad_norm": 2.9145348072052, + "learning_rate": 4.889675056778622e-06, + "loss": 0.6494, + "step": 1295 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 2.831829071044922, + "learning_rate": 4.8894917083242e-06, + "loss": 0.6064, + "step": 1296 + }, + { + "epoch": 0.6132387706855792, + "grad_norm": 2.6883130073547363, + "learning_rate": 4.889308211086608e-06, + "loss": 0.5642, + "step": 1297 + }, + { + "epoch": 0.6137115839243499, + "grad_norm": 3.0605485439300537, + "learning_rate": 4.889124565077269e-06, + "loss": 0.6695, + "step": 1298 + }, + { + "epoch": 0.6141843971631206, + "grad_norm": 3.44062876701355, + "learning_rate": 4.88894077030762e-06, + "loss": 0.6415, + "step": 1299 + }, + { + "epoch": 0.6146572104018913, + "grad_norm": 2.5970818996429443, + "learning_rate": 4.888756826789105e-06, + "loss": 0.6518, + "step": 1300 + }, + { + "epoch": 0.6151300236406619, + "grad_norm": 4.2233567237854, + "learning_rate": 4.8885727345331755e-06, + "loss": 0.6555, + "step": 1301 + }, + { + "epoch": 0.6156028368794326, + "grad_norm": 2.645385503768921, + "learning_rate": 4.888388493551297e-06, + "loss": 0.6762, + "step": 1302 + }, + { + "epoch": 0.6160756501182033, + "grad_norm": 2.907954454421997, + "learning_rate": 4.8882041038549385e-06, + "loss": 0.6526, + "step": 1303 + }, + { + "epoch": 0.616548463356974, + "grad_norm": 2.482771873474121, + "learning_rate": 4.888019565455583e-06, + "loss": 0.628, + "step": 1304 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 2.7165915966033936, + "learning_rate": 4.88783487836472e-06, + "loss": 0.5743, + "step": 1305 + }, + { + "epoch": 0.6174940898345154, + "grad_norm": 3.095627546310425, + "learning_rate": 
4.88765004259385e-06, + "loss": 0.627, + "step": 1306 + }, + { + "epoch": 0.6179669030732861, + "grad_norm": 2.5018465518951416, + "learning_rate": 4.8874650581544805e-06, + "loss": 0.5215, + "step": 1307 + }, + { + "epoch": 0.6184397163120567, + "grad_norm": 3.094337224960327, + "learning_rate": 4.8872799250581316e-06, + "loss": 0.6979, + "step": 1308 + }, + { + "epoch": 0.6189125295508274, + "grad_norm": 3.1002209186553955, + "learning_rate": 4.887094643316329e-06, + "loss": 0.6565, + "step": 1309 + }, + { + "epoch": 0.6193853427895981, + "grad_norm": 2.551431894302368, + "learning_rate": 4.88690921294061e-06, + "loss": 0.5748, + "step": 1310 + }, + { + "epoch": 0.6198581560283688, + "grad_norm": 2.8282904624938965, + "learning_rate": 4.886723633942521e-06, + "loss": 0.676, + "step": 1311 + }, + { + "epoch": 0.6203309692671395, + "grad_norm": 2.8887810707092285, + "learning_rate": 4.886537906333617e-06, + "loss": 0.5971, + "step": 1312 + }, + { + "epoch": 0.6208037825059102, + "grad_norm": 2.9989118576049805, + "learning_rate": 4.886352030125462e-06, + "loss": 0.6341, + "step": 1313 + }, + { + "epoch": 0.6212765957446809, + "grad_norm": 2.8042776584625244, + "learning_rate": 4.886166005329629e-06, + "loss": 0.6578, + "step": 1314 + }, + { + "epoch": 0.6217494089834515, + "grad_norm": 2.4980967044830322, + "learning_rate": 4.8859798319577026e-06, + "loss": 0.6711, + "step": 1315 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 2.762369155883789, + "learning_rate": 4.885793510021274e-06, + "loss": 0.5747, + "step": 1316 + }, + { + "epoch": 0.6226950354609929, + "grad_norm": 3.136327028274536, + "learning_rate": 4.885607039531945e-06, + "loss": 0.7544, + "step": 1317 + }, + { + "epoch": 0.6231678486997636, + "grad_norm": 2.8736963272094727, + "learning_rate": 4.885420420501327e-06, + "loss": 0.6603, + "step": 1318 + }, + { + "epoch": 0.6236406619385343, + "grad_norm": 2.766237497329712, + "learning_rate": 4.885233652941039e-06, + "loss": 0.581, + "step": 1319 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 2.4740939140319824, + "learning_rate": 4.88504673686271e-06, + "loss": 0.6335, + "step": 1320 + }, + { + "epoch": 0.6245862884160757, + "grad_norm": 3.324795961380005, + "learning_rate": 4.884859672277978e-06, + "loss": 0.6019, + "step": 1321 + }, + { + "epoch": 0.6250591016548463, + "grad_norm": 3.521327257156372, + "learning_rate": 4.884672459198493e-06, + "loss": 0.6104, + "step": 1322 + }, + { + "epoch": 0.625531914893617, + "grad_norm": 2.7728071212768555, + "learning_rate": 4.884485097635909e-06, + "loss": 0.6714, + "step": 1323 + }, + { + "epoch": 0.6260047281323877, + "grad_norm": 3.0738155841827393, + "learning_rate": 4.884297587601895e-06, + "loss": 0.604, + "step": 1324 + }, + { + "epoch": 0.6264775413711584, + "grad_norm": 2.719240427017212, + "learning_rate": 4.884109929108124e-06, + "loss": 0.6795, + "step": 1325 + }, + { + "epoch": 0.6269503546099291, + "grad_norm": 2.4108200073242188, + "learning_rate": 4.883922122166282e-06, + "loss": 0.5846, + "step": 1326 + }, + { + "epoch": 0.6274231678486998, + "grad_norm": 2.393899917602539, + "learning_rate": 4.883734166788063e-06, + "loss": 0.6188, + "step": 1327 + }, + { + "epoch": 0.6278959810874705, + "grad_norm": 4.555255889892578, + "learning_rate": 4.883546062985169e-06, + "loss": 0.5962, + "step": 1328 + }, + { + "epoch": 0.6283687943262412, + "grad_norm": 2.571075439453125, + "learning_rate": 4.883357810769315e-06, + "loss": 0.6165, + "step": 1329 + }, + { + "epoch": 0.6288416075650118, + "grad_norm": 
2.553115129470825, + "learning_rate": 4.8831694101522185e-06, + "loss": 0.6787, + "step": 1330 + }, + { + "epoch": 0.6293144208037825, + "grad_norm": 3.2564642429351807, + "learning_rate": 4.882980861145614e-06, + "loss": 0.659, + "step": 1331 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 2.535216808319092, + "learning_rate": 4.882792163761241e-06, + "loss": 0.6176, + "step": 1332 + }, + { + "epoch": 0.6302600472813239, + "grad_norm": 3.097921848297119, + "learning_rate": 4.882603318010847e-06, + "loss": 0.6822, + "step": 1333 + }, + { + "epoch": 0.6307328605200946, + "grad_norm": 2.8135175704956055, + "learning_rate": 4.882414323906192e-06, + "loss": 0.6782, + "step": 1334 + }, + { + "epoch": 0.6312056737588653, + "grad_norm": 2.724634885787964, + "learning_rate": 4.882225181459044e-06, + "loss": 0.6545, + "step": 1335 + }, + { + "epoch": 0.631678486997636, + "grad_norm": 2.9585227966308594, + "learning_rate": 4.882035890681179e-06, + "loss": 0.6218, + "step": 1336 + }, + { + "epoch": 0.6321513002364066, + "grad_norm": 2.6952011585235596, + "learning_rate": 4.881846451584385e-06, + "loss": 0.6, + "step": 1337 + }, + { + "epoch": 0.6326241134751773, + "grad_norm": 3.1400704383850098, + "learning_rate": 4.881656864180455e-06, + "loss": 0.6687, + "step": 1338 + }, + { + "epoch": 0.633096926713948, + "grad_norm": 2.8382487297058105, + "learning_rate": 4.881467128481197e-06, + "loss": 0.574, + "step": 1339 + }, + { + "epoch": 0.6335697399527187, + "grad_norm": 2.8520095348358154, + "learning_rate": 4.881277244498422e-06, + "loss": 0.6582, + "step": 1340 + }, + { + "epoch": 0.6340425531914894, + "grad_norm": 2.703498363494873, + "learning_rate": 4.881087212243956e-06, + "loss": 0.7224, + "step": 1341 + }, + { + "epoch": 0.6345153664302601, + "grad_norm": 3.697205066680908, + "learning_rate": 4.880897031729629e-06, + "loss": 0.6582, + "step": 1342 + }, + { + "epoch": 0.6349881796690308, + "grad_norm": 2.7625808715820312, + "learning_rate": 4.880706702967284e-06, + "loss": 0.574, + "step": 1343 + }, + { + "epoch": 0.6354609929078014, + "grad_norm": 2.949984073638916, + "learning_rate": 4.880516225968771e-06, + "loss": 0.66, + "step": 1344 + }, + { + "epoch": 0.6359338061465721, + "grad_norm": 2.548269748687744, + "learning_rate": 4.8803256007459525e-06, + "loss": 0.642, + "step": 1345 + }, + { + "epoch": 0.6364066193853428, + "grad_norm": 2.5102174282073975, + "learning_rate": 4.8801348273106945e-06, + "loss": 0.6238, + "step": 1346 + }, + { + "epoch": 0.6368794326241135, + "grad_norm": 2.9847946166992188, + "learning_rate": 4.8799439056748786e-06, + "loss": 0.5416, + "step": 1347 + }, + { + "epoch": 0.6373522458628842, + "grad_norm": 2.8711049556732178, + "learning_rate": 4.879752835850391e-06, + "loss": 0.6427, + "step": 1348 + }, + { + "epoch": 0.6378250591016549, + "grad_norm": 2.7901716232299805, + "learning_rate": 4.879561617849129e-06, + "loss": 0.6026, + "step": 1349 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 2.659778356552124, + "learning_rate": 4.879370251682999e-06, + "loss": 0.6623, + "step": 1350 + }, + { + "epoch": 0.6387706855791963, + "grad_norm": 3.224386692047119, + "learning_rate": 4.879178737363917e-06, + "loss": 0.6485, + "step": 1351 + }, + { + "epoch": 0.6392434988179669, + "grad_norm": 2.6385605335235596, + "learning_rate": 4.8789870749038076e-06, + "loss": 0.5866, + "step": 1352 + }, + { + "epoch": 0.6397163120567376, + "grad_norm": 2.807713270187378, + "learning_rate": 4.8787952643146045e-06, + "loss": 0.6537, + "step": 1353 + }, + { + "epoch": 
0.6401891252955083, + "grad_norm": 2.5689280033111572, + "learning_rate": 4.878603305608251e-06, + "loss": 0.6216, + "step": 1354 + }, + { + "epoch": 0.640661938534279, + "grad_norm": 2.7347843647003174, + "learning_rate": 4.8784111987967e-06, + "loss": 0.6318, + "step": 1355 + }, + { + "epoch": 0.6411347517730497, + "grad_norm": 2.5210378170013428, + "learning_rate": 4.878218943891911e-06, + "loss": 0.5472, + "step": 1356 + }, + { + "epoch": 0.6416075650118204, + "grad_norm": 2.866785764694214, + "learning_rate": 4.878026540905858e-06, + "loss": 0.7108, + "step": 1357 + }, + { + "epoch": 0.642080378250591, + "grad_norm": 2.923314332962036, + "learning_rate": 4.877833989850519e-06, + "loss": 0.5557, + "step": 1358 + }, + { + "epoch": 0.6425531914893617, + "grad_norm": 2.925463914871216, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.6382, + "step": 1359 + }, + { + "epoch": 0.6430260047281324, + "grad_norm": 2.909644365310669, + "learning_rate": 4.877448443579952e-06, + "loss": 0.5603, + "step": 1360 + }, + { + "epoch": 0.6434988179669031, + "grad_norm": 3.501148223876953, + "learning_rate": 4.8772554483887306e-06, + "loss": 0.6722, + "step": 1361 + }, + { + "epoch": 0.6439716312056738, + "grad_norm": 2.823765516281128, + "learning_rate": 4.877062305176235e-06, + "loss": 0.6408, + "step": 1362 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 2.9807584285736084, + "learning_rate": 4.8768690139544935e-06, + "loss": 0.5984, + "step": 1363 + }, + { + "epoch": 0.6449172576832152, + "grad_norm": 2.8411378860473633, + "learning_rate": 4.8766755747355405e-06, + "loss": 0.6231, + "step": 1364 + }, + { + "epoch": 0.6453900709219859, + "grad_norm": 3.158952236175537, + "learning_rate": 4.8764819875314215e-06, + "loss": 0.6441, + "step": 1365 + }, + { + "epoch": 0.6458628841607565, + "grad_norm": 2.9614369869232178, + "learning_rate": 4.876288252354189e-06, + "loss": 0.6308, + "step": 1366 + }, + { + "epoch": 0.6463356973995272, + "grad_norm": 3.073805570602417, + "learning_rate": 4.876094369215907e-06, + "loss": 0.6046, + "step": 1367 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 2.719189405441284, + "learning_rate": 4.875900338128648e-06, + "loss": 0.6082, + "step": 1368 + }, + { + "epoch": 0.6472813238770686, + "grad_norm": 2.676726818084717, + "learning_rate": 4.8757061591044914e-06, + "loss": 0.6344, + "step": 1369 + }, + { + "epoch": 0.6477541371158393, + "grad_norm": 2.955256938934326, + "learning_rate": 4.87551183215553e-06, + "loss": 0.6506, + "step": 1370 + }, + { + "epoch": 0.64822695035461, + "grad_norm": 2.5672218799591064, + "learning_rate": 4.875317357293864e-06, + "loss": 0.5284, + "step": 1371 + }, + { + "epoch": 0.6486997635933807, + "grad_norm": 2.5860238075256348, + "learning_rate": 4.875122734531602e-06, + "loss": 0.667, + "step": 1372 + }, + { + "epoch": 0.6491725768321513, + "grad_norm": 3.1037003993988037, + "learning_rate": 4.8749279638808605e-06, + "loss": 0.6902, + "step": 1373 + }, + { + "epoch": 0.649645390070922, + "grad_norm": 2.7715282440185547, + "learning_rate": 4.874733045353769e-06, + "loss": 0.6291, + "step": 1374 + }, + { + "epoch": 0.6501182033096927, + "grad_norm": 2.527071475982666, + "learning_rate": 4.874537978962463e-06, + "loss": 0.5565, + "step": 1375 + }, + { + "epoch": 0.6505910165484634, + "grad_norm": 2.722092628479004, + "learning_rate": 4.874342764719091e-06, + "loss": 0.5724, + "step": 1376 + }, + { + "epoch": 0.6510638297872341, + "grad_norm": 2.6342411041259766, + "learning_rate": 4.874147402635805e-06, + "loss": 0.6308, + 
"step": 1377 + }, + { + "epoch": 0.6515366430260048, + "grad_norm": 2.3850719928741455, + "learning_rate": 4.8739518927247695e-06, + "loss": 0.5692, + "step": 1378 + }, + { + "epoch": 0.6520094562647755, + "grad_norm": 2.9787259101867676, + "learning_rate": 4.873756234998161e-06, + "loss": 0.6953, + "step": 1379 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 2.634141683578491, + "learning_rate": 4.873560429468159e-06, + "loss": 0.6077, + "step": 1380 + }, + { + "epoch": 0.6529550827423168, + "grad_norm": 2.803046941757202, + "learning_rate": 4.873364476146958e-06, + "loss": 0.6657, + "step": 1381 + }, + { + "epoch": 0.6534278959810875, + "grad_norm": 2.762827157974243, + "learning_rate": 4.8731683750467574e-06, + "loss": 0.6061, + "step": 1382 + }, + { + "epoch": 0.6539007092198581, + "grad_norm": 2.6654391288757324, + "learning_rate": 4.872972126179768e-06, + "loss": 0.6387, + "step": 1383 + }, + { + "epoch": 0.6543735224586288, + "grad_norm": 2.4363625049591064, + "learning_rate": 4.872775729558209e-06, + "loss": 0.5623, + "step": 1384 + }, + { + "epoch": 0.6548463356973995, + "grad_norm": 2.528959035873413, + "learning_rate": 4.87257918519431e-06, + "loss": 0.5609, + "step": 1385 + }, + { + "epoch": 0.6553191489361702, + "grad_norm": 2.718383312225342, + "learning_rate": 4.872382493100309e-06, + "loss": 0.5575, + "step": 1386 + }, + { + "epoch": 0.6557919621749408, + "grad_norm": 2.660841226577759, + "learning_rate": 4.872185653288453e-06, + "loss": 0.6106, + "step": 1387 + }, + { + "epoch": 0.6562647754137115, + "grad_norm": 2.508753538131714, + "learning_rate": 4.871988665770997e-06, + "loss": 0.5705, + "step": 1388 + }, + { + "epoch": 0.6567375886524822, + "grad_norm": 2.5134334564208984, + "learning_rate": 4.871791530560208e-06, + "loss": 0.5592, + "step": 1389 + }, + { + "epoch": 0.6572104018912529, + "grad_norm": 2.7475597858428955, + "learning_rate": 4.871594247668361e-06, + "loss": 0.6277, + "step": 1390 + }, + { + "epoch": 0.6576832151300236, + "grad_norm": 2.793616533279419, + "learning_rate": 4.871396817107739e-06, + "loss": 0.595, + "step": 1391 + }, + { + "epoch": 0.6581560283687943, + "grad_norm": 2.8285086154937744, + "learning_rate": 4.871199238890635e-06, + "loss": 0.6094, + "step": 1392 + }, + { + "epoch": 0.658628841607565, + "grad_norm": 2.74124813079834, + "learning_rate": 4.871001513029352e-06, + "loss": 0.6296, + "step": 1393 + }, + { + "epoch": 0.6591016548463356, + "grad_norm": 2.761237621307373, + "learning_rate": 4.870803639536202e-06, + "loss": 0.5702, + "step": 1394 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 2.761038064956665, + "learning_rate": 4.870605618423504e-06, + "loss": 0.6195, + "step": 1395 + }, + { + "epoch": 0.660047281323877, + "grad_norm": 2.8812482357025146, + "learning_rate": 4.870407449703589e-06, + "loss": 0.616, + "step": 1396 + }, + { + "epoch": 0.6605200945626477, + "grad_norm": 2.9966578483581543, + "learning_rate": 4.870209133388797e-06, + "loss": 0.6547, + "step": 1397 + }, + { + "epoch": 0.6609929078014184, + "grad_norm": 2.7969017028808594, + "learning_rate": 4.870010669491474e-06, + "loss": 0.5762, + "step": 1398 + }, + { + "epoch": 0.6614657210401891, + "grad_norm": 2.557783842086792, + "learning_rate": 4.86981205802398e-06, + "loss": 0.6184, + "step": 1399 + }, + { + "epoch": 0.6619385342789598, + "grad_norm": 2.5393927097320557, + "learning_rate": 4.86961329899868e-06, + "loss": 0.5953, + "step": 1400 + }, + { + "epoch": 0.6624113475177305, + "grad_norm": 2.7745981216430664, + "learning_rate": 
4.86941439242795e-06, + "loss": 0.5967, + "step": 1401 + }, + { + "epoch": 0.6628841607565011, + "grad_norm": 2.650381326675415, + "learning_rate": 4.869215338324176e-06, + "loss": 0.5667, + "step": 1402 + }, + { + "epoch": 0.6633569739952718, + "grad_norm": 2.583169937133789, + "learning_rate": 4.869016136699751e-06, + "loss": 0.549, + "step": 1403 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 2.984978437423706, + "learning_rate": 4.868816787567079e-06, + "loss": 0.5931, + "step": 1404 + }, + { + "epoch": 0.6643026004728132, + "grad_norm": 3.1947181224823, + "learning_rate": 4.868617290938573e-06, + "loss": 0.5473, + "step": 1405 + }, + { + "epoch": 0.6647754137115839, + "grad_norm": 2.562927007675171, + "learning_rate": 4.868417646826654e-06, + "loss": 0.6878, + "step": 1406 + }, + { + "epoch": 0.6652482269503546, + "grad_norm": 2.8741261959075928, + "learning_rate": 4.868217855243754e-06, + "loss": 0.6312, + "step": 1407 + }, + { + "epoch": 0.6657210401891253, + "grad_norm": 2.9834797382354736, + "learning_rate": 4.868017916202312e-06, + "loss": 0.5624, + "step": 1408 + }, + { + "epoch": 0.6661938534278959, + "grad_norm": 2.6935982704162598, + "learning_rate": 4.8678178297147785e-06, + "loss": 0.5857, + "step": 1409 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.8200576305389404, + "learning_rate": 4.86761759579361e-06, + "loss": 0.6153, + "step": 1410 + }, + { + "epoch": 0.6671394799054373, + "grad_norm": 2.831425189971924, + "learning_rate": 4.867417214451276e-06, + "loss": 0.6495, + "step": 1411 + }, + { + "epoch": 0.667612293144208, + "grad_norm": 2.733565092086792, + "learning_rate": 4.867216685700253e-06, + "loss": 0.6036, + "step": 1412 + }, + { + "epoch": 0.6680851063829787, + "grad_norm": 3.0609400272369385, + "learning_rate": 4.867016009553027e-06, + "loss": 0.6773, + "step": 1413 + }, + { + "epoch": 0.6685579196217494, + "grad_norm": 2.665452241897583, + "learning_rate": 4.866815186022093e-06, + "loss": 0.6256, + "step": 1414 + }, + { + "epoch": 0.6690307328605201, + "grad_norm": 2.9480721950531006, + "learning_rate": 4.866614215119956e-06, + "loss": 0.535, + "step": 1415 + }, + { + "epoch": 0.6695035460992907, + "grad_norm": 2.5514180660247803, + "learning_rate": 4.866413096859128e-06, + "loss": 0.6588, + "step": 1416 + }, + { + "epoch": 0.6699763593380614, + "grad_norm": 3.3442373275756836, + "learning_rate": 4.866211831252134e-06, + "loss": 0.5754, + "step": 1417 + }, + { + "epoch": 0.6704491725768321, + "grad_norm": 2.521467685699463, + "learning_rate": 4.866010418311504e-06, + "loss": 0.5546, + "step": 1418 + }, + { + "epoch": 0.6709219858156028, + "grad_norm": 2.930706262588501, + "learning_rate": 4.865808858049781e-06, + "loss": 0.589, + "step": 1419 + }, + { + "epoch": 0.6713947990543735, + "grad_norm": 2.6298375129699707, + "learning_rate": 4.865607150479513e-06, + "loss": 0.5915, + "step": 1420 + }, + { + "epoch": 0.6718676122931442, + "grad_norm": 2.9554293155670166, + "learning_rate": 4.8654052956132615e-06, + "loss": 0.6654, + "step": 1421 + }, + { + "epoch": 0.6723404255319149, + "grad_norm": 3.2706902027130127, + "learning_rate": 4.865203293463593e-06, + "loss": 0.7115, + "step": 1422 + }, + { + "epoch": 0.6728132387706856, + "grad_norm": 3.041539430618286, + "learning_rate": 4.865001144043088e-06, + "loss": 0.5818, + "step": 1423 + }, + { + "epoch": 0.6732860520094562, + "grad_norm": 3.1314544677734375, + "learning_rate": 4.864798847364331e-06, + "loss": 0.5822, + "step": 1424 + }, + { + "epoch": 0.6737588652482269, + "grad_norm": 
2.5301461219787598, + "learning_rate": 4.86459640343992e-06, + "loss": 0.5525, + "step": 1425 + }, + { + "epoch": 0.6742316784869976, + "grad_norm": 2.809295892715454, + "learning_rate": 4.864393812282458e-06, + "loss": 0.6768, + "step": 1426 + }, + { + "epoch": 0.6747044917257683, + "grad_norm": 2.794664144515991, + "learning_rate": 4.864191073904562e-06, + "loss": 0.5793, + "step": 1427 + }, + { + "epoch": 0.675177304964539, + "grad_norm": 2.7771105766296387, + "learning_rate": 4.863988188318854e-06, + "loss": 0.6453, + "step": 1428 + }, + { + "epoch": 0.6756501182033097, + "grad_norm": 2.6431946754455566, + "learning_rate": 4.863785155537967e-06, + "loss": 0.5877, + "step": 1429 + }, + { + "epoch": 0.6761229314420804, + "grad_norm": 2.951353073120117, + "learning_rate": 4.863581975574544e-06, + "loss": 0.6793, + "step": 1430 + }, + { + "epoch": 0.676595744680851, + "grad_norm": 3.1336071491241455, + "learning_rate": 4.863378648441235e-06, + "loss": 0.6695, + "step": 1431 + }, + { + "epoch": 0.6770685579196217, + "grad_norm": 2.735982656478882, + "learning_rate": 4.8631751741507e-06, + "loss": 0.5239, + "step": 1432 + }, + { + "epoch": 0.6775413711583924, + "grad_norm": 2.7085206508636475, + "learning_rate": 4.862971552715611e-06, + "loss": 0.6837, + "step": 1433 + }, + { + "epoch": 0.6780141843971631, + "grad_norm": 3.136528730392456, + "learning_rate": 4.8627677841486436e-06, + "loss": 0.683, + "step": 1434 + }, + { + "epoch": 0.6784869976359338, + "grad_norm": 2.7879369258880615, + "learning_rate": 4.862563868462486e-06, + "loss": 0.608, + "step": 1435 + }, + { + "epoch": 0.6789598108747045, + "grad_norm": 2.7937729358673096, + "learning_rate": 4.862359805669837e-06, + "loss": 0.6131, + "step": 1436 + }, + { + "epoch": 0.6794326241134752, + "grad_norm": 2.5988364219665527, + "learning_rate": 4.862155595783401e-06, + "loss": 0.6303, + "step": 1437 + }, + { + "epoch": 0.6799054373522458, + "grad_norm": 3.251070499420166, + "learning_rate": 4.861951238815894e-06, + "loss": 0.7246, + "step": 1438 + }, + { + "epoch": 0.6803782505910165, + "grad_norm": 2.646759271621704, + "learning_rate": 4.861746734780039e-06, + "loss": 0.6313, + "step": 1439 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 2.773866891860962, + "learning_rate": 4.861542083688573e-06, + "loss": 0.6463, + "step": 1440 + }, + { + "epoch": 0.6813238770685579, + "grad_norm": 2.759965658187866, + "learning_rate": 4.861337285554235e-06, + "loss": 0.5428, + "step": 1441 + }, + { + "epoch": 0.6817966903073286, + "grad_norm": 3.3250818252563477, + "learning_rate": 4.861132340389779e-06, + "loss": 0.6522, + "step": 1442 + }, + { + "epoch": 0.6822695035460993, + "grad_norm": 2.661797523498535, + "learning_rate": 4.860927248207965e-06, + "loss": 0.5871, + "step": 1443 + }, + { + "epoch": 0.68274231678487, + "grad_norm": 2.706289052963257, + "learning_rate": 4.860722009021563e-06, + "loss": 0.6651, + "step": 1444 + }, + { + "epoch": 0.6832151300236406, + "grad_norm": 2.8459298610687256, + "learning_rate": 4.860516622843354e-06, + "loss": 0.5827, + "step": 1445 + }, + { + "epoch": 0.6836879432624113, + "grad_norm": 3.1041831970214844, + "learning_rate": 4.860311089686125e-06, + "loss": 0.6727, + "step": 1446 + }, + { + "epoch": 0.684160756501182, + "grad_norm": 2.9382801055908203, + "learning_rate": 4.8601054095626746e-06, + "loss": 0.6002, + "step": 1447 + }, + { + "epoch": 0.6846335697399527, + "grad_norm": 2.782475471496582, + "learning_rate": 4.859899582485808e-06, + "loss": 0.6951, + "step": 1448 + }, + { + "epoch": 
0.6851063829787234, + "grad_norm": 3.313894510269165, + "learning_rate": 4.859693608468343e-06, + "loss": 0.6363, + "step": 1449 + }, + { + "epoch": 0.6855791962174941, + "grad_norm": 3.1639695167541504, + "learning_rate": 4.8594874875231045e-06, + "loss": 0.7002, + "step": 1450 + }, + { + "epoch": 0.6860520094562648, + "grad_norm": 2.6762218475341797, + "learning_rate": 4.859281219662926e-06, + "loss": 0.6246, + "step": 1451 + }, + { + "epoch": 0.6865248226950355, + "grad_norm": 2.8368663787841797, + "learning_rate": 4.85907480490065e-06, + "loss": 0.5906, + "step": 1452 + }, + { + "epoch": 0.6869976359338061, + "grad_norm": 2.887373208999634, + "learning_rate": 4.858868243249131e-06, + "loss": 0.5931, + "step": 1453 + }, + { + "epoch": 0.6874704491725768, + "grad_norm": 2.8115322589874268, + "learning_rate": 4.858661534721229e-06, + "loss": 0.6337, + "step": 1454 + }, + { + "epoch": 0.6879432624113475, + "grad_norm": 2.8470499515533447, + "learning_rate": 4.8584546793298174e-06, + "loss": 0.632, + "step": 1455 + }, + { + "epoch": 0.6884160756501182, + "grad_norm": 2.8229613304138184, + "learning_rate": 4.8582476770877725e-06, + "loss": 0.6494, + "step": 1456 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 2.4235479831695557, + "learning_rate": 4.858040528007987e-06, + "loss": 0.5709, + "step": 1457 + }, + { + "epoch": 0.6893617021276596, + "grad_norm": 2.9348199367523193, + "learning_rate": 4.857833232103356e-06, + "loss": 0.5404, + "step": 1458 + }, + { + "epoch": 0.6898345153664303, + "grad_norm": 2.8274219036102295, + "learning_rate": 4.857625789386789e-06, + "loss": 0.701, + "step": 1459 + }, + { + "epoch": 0.6903073286052009, + "grad_norm": 3.136929988861084, + "learning_rate": 4.857418199871203e-06, + "loss": 0.6971, + "step": 1460 + }, + { + "epoch": 0.6907801418439716, + "grad_norm": 2.8987185955047607, + "learning_rate": 4.8572104635695214e-06, + "loss": 0.6613, + "step": 1461 + }, + { + "epoch": 0.6912529550827423, + "grad_norm": 2.5073442459106445, + "learning_rate": 4.857002580494681e-06, + "loss": 0.6032, + "step": 1462 + }, + { + "epoch": 0.691725768321513, + "grad_norm": 2.7019522190093994, + "learning_rate": 4.856794550659625e-06, + "loss": 0.567, + "step": 1463 + }, + { + "epoch": 0.6921985815602837, + "grad_norm": 2.4795594215393066, + "learning_rate": 4.8565863740773054e-06, + "loss": 0.5777, + "step": 1464 + }, + { + "epoch": 0.6926713947990544, + "grad_norm": 3.032506227493286, + "learning_rate": 4.856378050760687e-06, + "loss": 0.607, + "step": 1465 + }, + { + "epoch": 0.6931442080378251, + "grad_norm": 3.052091121673584, + "learning_rate": 4.85616958072274e-06, + "loss": 0.591, + "step": 1466 + }, + { + "epoch": 0.6936170212765957, + "grad_norm": 2.704831838607788, + "learning_rate": 4.855960963976443e-06, + "loss": 0.6528, + "step": 1467 + }, + { + "epoch": 0.6940898345153664, + "grad_norm": 2.680995225906372, + "learning_rate": 4.855752200534788e-06, + "loss": 0.6294, + "step": 1468 + }, + { + "epoch": 0.6945626477541371, + "grad_norm": 2.3948659896850586, + "learning_rate": 4.855543290410774e-06, + "loss": 0.6091, + "step": 1469 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 2.6407411098480225, + "learning_rate": 4.855334233617407e-06, + "loss": 0.5572, + "step": 1470 + }, + { + "epoch": 0.6955082742316785, + "grad_norm": 2.5526835918426514, + "learning_rate": 4.8551250301677064e-06, + "loss": 0.5432, + "step": 1471 + }, + { + "epoch": 0.6959810874704492, + "grad_norm": 3.1237430572509766, + "learning_rate": 4.8549156800746965e-06, + "loss": 
0.5944, + "step": 1472 + }, + { + "epoch": 0.6964539007092199, + "grad_norm": 2.8112540245056152, + "learning_rate": 4.854706183351412e-06, + "loss": 0.604, + "step": 1473 + }, + { + "epoch": 0.6969267139479906, + "grad_norm": 2.664644479751587, + "learning_rate": 4.8544965400109e-06, + "loss": 0.5647, + "step": 1474 + }, + { + "epoch": 0.6973995271867612, + "grad_norm": 3.26310133934021, + "learning_rate": 4.854286750066212e-06, + "loss": 0.6999, + "step": 1475 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 2.9717442989349365, + "learning_rate": 4.8540768135304115e-06, + "loss": 0.6655, + "step": 1476 + }, + { + "epoch": 0.6983451536643026, + "grad_norm": 2.5302982330322266, + "learning_rate": 4.85386673041657e-06, + "loss": 0.6384, + "step": 1477 + }, + { + "epoch": 0.6988179669030733, + "grad_norm": 2.864877700805664, + "learning_rate": 4.853656500737769e-06, + "loss": 0.6834, + "step": 1478 + }, + { + "epoch": 0.699290780141844, + "grad_norm": 2.5522031784057617, + "learning_rate": 4.853446124507098e-06, + "loss": 0.5929, + "step": 1479 + }, + { + "epoch": 0.6997635933806147, + "grad_norm": 3.096477746963501, + "learning_rate": 4.853235601737656e-06, + "loss": 0.5737, + "step": 1480 + }, + { + "epoch": 0.7002364066193854, + "grad_norm": 2.884779214859009, + "learning_rate": 4.853024932442552e-06, + "loss": 0.6362, + "step": 1481 + }, + { + "epoch": 0.700709219858156, + "grad_norm": 3.368558406829834, + "learning_rate": 4.852814116634903e-06, + "loss": 0.6721, + "step": 1482 + }, + { + "epoch": 0.7011820330969267, + "grad_norm": 2.742414951324463, + "learning_rate": 4.852603154327837e-06, + "loss": 0.6212, + "step": 1483 + }, + { + "epoch": 0.7016548463356974, + "grad_norm": 2.53454852104187, + "learning_rate": 4.8523920455344864e-06, + "loss": 0.6675, + "step": 1484 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 2.9354238510131836, + "learning_rate": 4.852180790267999e-06, + "loss": 0.6692, + "step": 1485 + }, + { + "epoch": 0.7026004728132388, + "grad_norm": 2.585070848464966, + "learning_rate": 4.8519693885415274e-06, + "loss": 0.6215, + "step": 1486 + }, + { + "epoch": 0.7030732860520095, + "grad_norm": 2.9047999382019043, + "learning_rate": 4.851757840368235e-06, + "loss": 0.6231, + "step": 1487 + }, + { + "epoch": 0.7035460992907802, + "grad_norm": 3.0930933952331543, + "learning_rate": 4.851546145761295e-06, + "loss": 0.7267, + "step": 1488 + }, + { + "epoch": 0.7040189125295508, + "grad_norm": 3.0224719047546387, + "learning_rate": 4.8513343047338875e-06, + "loss": 0.6293, + "step": 1489 + }, + { + "epoch": 0.7044917257683215, + "grad_norm": 2.5758471488952637, + "learning_rate": 4.851122317299203e-06, + "loss": 0.5855, + "step": 1490 + }, + { + "epoch": 0.7049645390070922, + "grad_norm": 2.579272508621216, + "learning_rate": 4.850910183470441e-06, + "loss": 0.582, + "step": 1491 + }, + { + "epoch": 0.7054373522458629, + "grad_norm": 2.8148300647735596, + "learning_rate": 4.85069790326081e-06, + "loss": 0.6396, + "step": 1492 + }, + { + "epoch": 0.7059101654846336, + "grad_norm": 2.6380527019500732, + "learning_rate": 4.850485476683528e-06, + "loss": 0.6114, + "step": 1493 + }, + { + "epoch": 0.7063829787234043, + "grad_norm": 2.7736263275146484, + "learning_rate": 4.850272903751823e-06, + "loss": 0.6683, + "step": 1494 + }, + { + "epoch": 0.706855791962175, + "grad_norm": 3.1958179473876953, + "learning_rate": 4.8500601844789285e-06, + "loss": 0.6265, + "step": 1495 + }, + { + "epoch": 0.7073286052009456, + "grad_norm": 3.783212423324585, + "learning_rate": 
4.8498473188780916e-06, + "loss": 0.6078, + "step": 1496 + }, + { + "epoch": 0.7078014184397163, + "grad_norm": 2.6656646728515625, + "learning_rate": 4.849634306962566e-06, + "loss": 0.5756, + "step": 1497 + }, + { + "epoch": 0.708274231678487, + "grad_norm": 2.757141590118408, + "learning_rate": 4.849421148745615e-06, + "loss": 0.5596, + "step": 1498 + }, + { + "epoch": 0.7087470449172577, + "grad_norm": 3.0391886234283447, + "learning_rate": 4.849207844240511e-06, + "loss": 0.5293, + "step": 1499 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 2.981912851333618, + "learning_rate": 4.848994393460535e-06, + "loss": 0.598, + "step": 1500 + }, + { + "epoch": 0.7096926713947991, + "grad_norm": 2.5470798015594482, + "learning_rate": 4.848780796418978e-06, + "loss": 0.6266, + "step": 1501 + }, + { + "epoch": 0.7101654846335698, + "grad_norm": 2.8394415378570557, + "learning_rate": 4.8485670531291415e-06, + "loss": 0.6844, + "step": 1502 + }, + { + "epoch": 0.7106382978723405, + "grad_norm": 3.2023508548736572, + "learning_rate": 4.848353163604331e-06, + "loss": 0.6134, + "step": 1503 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 2.98245906829834, + "learning_rate": 4.848139127857867e-06, + "loss": 0.7084, + "step": 1504 + }, + { + "epoch": 0.7115839243498818, + "grad_norm": 2.5917441844940186, + "learning_rate": 4.847924945903076e-06, + "loss": 0.5676, + "step": 1505 + }, + { + "epoch": 0.7120567375886525, + "grad_norm": 2.8736681938171387, + "learning_rate": 4.847710617753294e-06, + "loss": 0.6304, + "step": 1506 + }, + { + "epoch": 0.7125295508274232, + "grad_norm": 2.7832682132720947, + "learning_rate": 4.847496143421866e-06, + "loss": 0.5705, + "step": 1507 + }, + { + "epoch": 0.7130023640661939, + "grad_norm": 2.480560779571533, + "learning_rate": 4.847281522922147e-06, + "loss": 0.5595, + "step": 1508 + }, + { + "epoch": 0.7134751773049646, + "grad_norm": 2.357675313949585, + "learning_rate": 4.847066756267499e-06, + "loss": 0.5065, + "step": 1509 + }, + { + "epoch": 0.7139479905437353, + "grad_norm": 2.632669448852539, + "learning_rate": 4.846851843471296e-06, + "loss": 0.6949, + "step": 1510 + }, + { + "epoch": 0.7144208037825059, + "grad_norm": 2.7691073417663574, + "learning_rate": 4.84663678454692e-06, + "loss": 0.6638, + "step": 1511 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 2.5647685527801514, + "learning_rate": 4.846421579507761e-06, + "loss": 0.6098, + "step": 1512 + }, + { + "epoch": 0.7153664302600473, + "grad_norm": 2.476701021194458, + "learning_rate": 4.846206228367218e-06, + "loss": 0.592, + "step": 1513 + }, + { + "epoch": 0.715839243498818, + "grad_norm": 2.805727958679199, + "learning_rate": 4.845990731138702e-06, + "loss": 0.5466, + "step": 1514 + }, + { + "epoch": 0.7163120567375887, + "grad_norm": 2.551392078399658, + "learning_rate": 4.84577508783563e-06, + "loss": 0.6039, + "step": 1515 + }, + { + "epoch": 0.7167848699763594, + "grad_norm": 2.6861350536346436, + "learning_rate": 4.845559298471429e-06, + "loss": 0.6427, + "step": 1516 + }, + { + "epoch": 0.7172576832151301, + "grad_norm": 3.1908371448516846, + "learning_rate": 4.845343363059535e-06, + "loss": 0.5447, + "step": 1517 + }, + { + "epoch": 0.7177304964539007, + "grad_norm": 2.9021761417388916, + "learning_rate": 4.845127281613394e-06, + "loss": 0.5836, + "step": 1518 + }, + { + "epoch": 0.7182033096926714, + "grad_norm": 2.476670742034912, + "learning_rate": 4.844911054146461e-06, + "loss": 0.5863, + "step": 1519 + }, + { + "epoch": 0.7186761229314421, + "grad_norm": 
2.662935495376587, + "learning_rate": 4.844694680672198e-06, + "loss": 0.5678, + "step": 1520 + }, + { + "epoch": 0.7191489361702128, + "grad_norm": 2.677896738052368, + "learning_rate": 4.844478161204079e-06, + "loss": 0.6195, + "step": 1521 + }, + { + "epoch": 0.7196217494089835, + "grad_norm": 2.781921863555908, + "learning_rate": 4.844261495755585e-06, + "loss": 0.643, + "step": 1522 + }, + { + "epoch": 0.7200945626477542, + "grad_norm": 3.0157392024993896, + "learning_rate": 4.844044684340206e-06, + "loss": 0.7559, + "step": 1523 + }, + { + "epoch": 0.7205673758865249, + "grad_norm": 2.8109354972839355, + "learning_rate": 4.843827726971444e-06, + "loss": 0.6264, + "step": 1524 + }, + { + "epoch": 0.7210401891252955, + "grad_norm": 3.0953569412231445, + "learning_rate": 4.8436106236628064e-06, + "loss": 0.6429, + "step": 1525 + }, + { + "epoch": 0.7215130023640662, + "grad_norm": 2.6850643157958984, + "learning_rate": 4.843393374427812e-06, + "loss": 0.6598, + "step": 1526 + }, + { + "epoch": 0.7219858156028369, + "grad_norm": 3.043480634689331, + "learning_rate": 4.8431759792799874e-06, + "loss": 0.6331, + "step": 1527 + }, + { + "epoch": 0.7224586288416076, + "grad_norm": 2.723870038986206, + "learning_rate": 4.842958438232868e-06, + "loss": 0.6259, + "step": 1528 + }, + { + "epoch": 0.7229314420803783, + "grad_norm": 2.822492837905884, + "learning_rate": 4.842740751300002e-06, + "loss": 0.6554, + "step": 1529 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.7866315841674805, + "learning_rate": 4.842522918494941e-06, + "loss": 0.6991, + "step": 1530 + }, + { + "epoch": 0.7238770685579197, + "grad_norm": 2.8881826400756836, + "learning_rate": 4.84230493983125e-06, + "loss": 0.5876, + "step": 1531 + }, + { + "epoch": 0.7243498817966904, + "grad_norm": 2.7456939220428467, + "learning_rate": 4.8420868153225e-06, + "loss": 0.6188, + "step": 1532 + }, + { + "epoch": 0.724822695035461, + "grad_norm": 3.0257532596588135, + "learning_rate": 4.841868544982274e-06, + "loss": 0.63, + "step": 1533 + }, + { + "epoch": 0.7252955082742317, + "grad_norm": 3.1581954956054688, + "learning_rate": 4.841650128824164e-06, + "loss": 0.7214, + "step": 1534 + }, + { + "epoch": 0.7257683215130024, + "grad_norm": 2.9174306392669678, + "learning_rate": 4.841431566861767e-06, + "loss": 0.704, + "step": 1535 + }, + { + "epoch": 0.7262411347517731, + "grad_norm": 2.5019054412841797, + "learning_rate": 4.8412128591086935e-06, + "loss": 0.6298, + "step": 1536 + }, + { + "epoch": 0.7267139479905438, + "grad_norm": 2.724285125732422, + "learning_rate": 4.840994005578562e-06, + "loss": 0.6289, + "step": 1537 + }, + { + "epoch": 0.7271867612293145, + "grad_norm": 2.5882341861724854, + "learning_rate": 4.840775006284998e-06, + "loss": 0.6355, + "step": 1538 + }, + { + "epoch": 0.7276595744680852, + "grad_norm": 3.1281991004943848, + "learning_rate": 4.840555861241638e-06, + "loss": 0.5551, + "step": 1539 + }, + { + "epoch": 0.7281323877068558, + "grad_norm": 2.6064817905426025, + "learning_rate": 4.840336570462127e-06, + "loss": 0.5543, + "step": 1540 + }, + { + "epoch": 0.7286052009456265, + "grad_norm": 2.67112398147583, + "learning_rate": 4.840117133960122e-06, + "loss": 0.6044, + "step": 1541 + }, + { + "epoch": 0.7290780141843972, + "grad_norm": 2.838022232055664, + "learning_rate": 4.839897551749282e-06, + "loss": 0.6814, + "step": 1542 + }, + { + "epoch": 0.7295508274231679, + "grad_norm": 2.8897151947021484, + "learning_rate": 4.839677823843283e-06, + "loss": 0.593, + "step": 1543 + }, + { + "epoch": 
0.7300236406619386, + "grad_norm": 2.9238014221191406, + "learning_rate": 4.839457950255805e-06, + "loss": 0.5544, + "step": 1544 + }, + { + "epoch": 0.7304964539007093, + "grad_norm": 3.016876459121704, + "learning_rate": 4.839237931000538e-06, + "loss": 0.6099, + "step": 1545 + }, + { + "epoch": 0.7309692671394799, + "grad_norm": 2.9415392875671387, + "learning_rate": 4.839017766091182e-06, + "loss": 0.6413, + "step": 1546 + }, + { + "epoch": 0.7314420803782505, + "grad_norm": 2.658067226409912, + "learning_rate": 4.838797455541446e-06, + "loss": 0.6534, + "step": 1547 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 2.460358142852783, + "learning_rate": 4.838576999365049e-06, + "loss": 0.5307, + "step": 1548 + }, + { + "epoch": 0.7323877068557919, + "grad_norm": 2.5818674564361572, + "learning_rate": 4.838356397575716e-06, + "loss": 0.6265, + "step": 1549 + }, + { + "epoch": 0.7328605200945626, + "grad_norm": 3.009197473526001, + "learning_rate": 4.838135650187183e-06, + "loss": 0.6957, + "step": 1550 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 2.738543748855591, + "learning_rate": 4.837914757213196e-06, + "loss": 0.646, + "step": 1551 + }, + { + "epoch": 0.733806146572104, + "grad_norm": 2.8208494186401367, + "learning_rate": 4.837693718667508e-06, + "loss": 0.5936, + "step": 1552 + }, + { + "epoch": 0.7342789598108747, + "grad_norm": 3.1574649810791016, + "learning_rate": 4.837472534563883e-06, + "loss": 0.6455, + "step": 1553 + }, + { + "epoch": 0.7347517730496453, + "grad_norm": 2.6737420558929443, + "learning_rate": 4.837251204916093e-06, + "loss": 0.5921, + "step": 1554 + }, + { + "epoch": 0.735224586288416, + "grad_norm": 2.424983024597168, + "learning_rate": 4.837029729737918e-06, + "loss": 0.6346, + "step": 1555 + }, + { + "epoch": 0.7356973995271867, + "grad_norm": 2.5163493156433105, + "learning_rate": 4.836808109043151e-06, + "loss": 0.6061, + "step": 1556 + }, + { + "epoch": 0.7361702127659574, + "grad_norm": 2.8377044200897217, + "learning_rate": 4.836586342845588e-06, + "loss": 0.611, + "step": 1557 + }, + { + "epoch": 0.7366430260047281, + "grad_norm": 2.5929181575775146, + "learning_rate": 4.83636443115904e-06, + "loss": 0.5496, + "step": 1558 + }, + { + "epoch": 0.7371158392434988, + "grad_norm": 2.5017223358154297, + "learning_rate": 4.836142373997323e-06, + "loss": 0.6235, + "step": 1559 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 2.822500228881836, + "learning_rate": 4.835920171374265e-06, + "loss": 0.6147, + "step": 1560 + }, + { + "epoch": 0.7380614657210401, + "grad_norm": 2.7234230041503906, + "learning_rate": 4.8356978233037e-06, + "loss": 0.6228, + "step": 1561 + }, + { + "epoch": 0.7385342789598108, + "grad_norm": 2.9565515518188477, + "learning_rate": 4.835475329799472e-06, + "loss": 0.5728, + "step": 1562 + }, + { + "epoch": 0.7390070921985815, + "grad_norm": 2.4356038570404053, + "learning_rate": 4.835252690875438e-06, + "loss": 0.6723, + "step": 1563 + }, + { + "epoch": 0.7394799054373522, + "grad_norm": 2.765913248062134, + "learning_rate": 4.835029906545458e-06, + "loss": 0.5805, + "step": 1564 + }, + { + "epoch": 0.7399527186761229, + "grad_norm": 2.4481914043426514, + "learning_rate": 4.834806976823405e-06, + "loss": 0.599, + "step": 1565 + }, + { + "epoch": 0.7404255319148936, + "grad_norm": 2.620779514312744, + "learning_rate": 4.834583901723158e-06, + "loss": 0.63, + "step": 1566 + }, + { + "epoch": 0.7408983451536643, + "grad_norm": 2.654426097869873, + "learning_rate": 4.83436068125861e-06, + "loss": 0.6544, + "step": 
1567 + }, + { + "epoch": 0.741371158392435, + "grad_norm": 2.589623212814331, + "learning_rate": 4.834137315443656e-06, + "loss": 0.5596, + "step": 1568 + }, + { + "epoch": 0.7418439716312056, + "grad_norm": 2.572883129119873, + "learning_rate": 4.833913804292209e-06, + "loss": 0.5974, + "step": 1569 + }, + { + "epoch": 0.7423167848699763, + "grad_norm": 2.8744914531707764, + "learning_rate": 4.833690147818181e-06, + "loss": 0.5364, + "step": 1570 + }, + { + "epoch": 0.742789598108747, + "grad_norm": 2.9800851345062256, + "learning_rate": 4.833466346035502e-06, + "loss": 0.6287, + "step": 1571 + }, + { + "epoch": 0.7432624113475177, + "grad_norm": 2.627784490585327, + "learning_rate": 4.833242398958105e-06, + "loss": 0.621, + "step": 1572 + }, + { + "epoch": 0.7437352245862884, + "grad_norm": 2.5187721252441406, + "learning_rate": 4.833018306599933e-06, + "loss": 0.5901, + "step": 1573 + }, + { + "epoch": 0.7442080378250591, + "grad_norm": 2.4843688011169434, + "learning_rate": 4.832794068974944e-06, + "loss": 0.6336, + "step": 1574 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 2.774911880493164, + "learning_rate": 4.832569686097096e-06, + "loss": 0.6091, + "step": 1575 + }, + { + "epoch": 0.7451536643026004, + "grad_norm": 3.2562527656555176, + "learning_rate": 4.8323451579803615e-06, + "loss": 0.7686, + "step": 1576 + }, + { + "epoch": 0.7456264775413711, + "grad_norm": 2.799570083618164, + "learning_rate": 4.832120484638721e-06, + "loss": 0.6233, + "step": 1577 + }, + { + "epoch": 0.7460992907801418, + "grad_norm": 2.661893367767334, + "learning_rate": 4.831895666086164e-06, + "loss": 0.5841, + "step": 1578 + }, + { + "epoch": 0.7465721040189125, + "grad_norm": 3.0382652282714844, + "learning_rate": 4.831670702336689e-06, + "loss": 0.5769, + "step": 1579 + }, + { + "epoch": 0.7470449172576832, + "grad_norm": 2.676398515701294, + "learning_rate": 4.831445593404304e-06, + "loss": 0.619, + "step": 1580 + }, + { + "epoch": 0.7475177304964539, + "grad_norm": 2.717916965484619, + "learning_rate": 4.831220339303024e-06, + "loss": 0.5787, + "step": 1581 + }, + { + "epoch": 0.7479905437352246, + "grad_norm": 2.3918066024780273, + "learning_rate": 4.830994940046876e-06, + "loss": 0.5108, + "step": 1582 + }, + { + "epoch": 0.7484633569739952, + "grad_norm": 2.709144115447998, + "learning_rate": 4.830769395649895e-06, + "loss": 0.6875, + "step": 1583 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 2.8711116313934326, + "learning_rate": 4.830543706126123e-06, + "loss": 0.6745, + "step": 1584 + }, + { + "epoch": 0.7494089834515366, + "grad_norm": 2.612339496612549, + "learning_rate": 4.830317871489614e-06, + "loss": 0.5738, + "step": 1585 + }, + { + "epoch": 0.7498817966903073, + "grad_norm": 2.4355857372283936, + "learning_rate": 4.830091891754429e-06, + "loss": 0.5907, + "step": 1586 + }, + { + "epoch": 0.750354609929078, + "grad_norm": 2.676051378250122, + "learning_rate": 4.829865766934638e-06, + "loss": 0.6628, + "step": 1587 + }, + { + "epoch": 0.7508274231678487, + "grad_norm": 2.66489839553833, + "learning_rate": 4.829639497044323e-06, + "loss": 0.5984, + "step": 1588 + }, + { + "epoch": 0.7513002364066194, + "grad_norm": 2.5358035564422607, + "learning_rate": 4.829413082097572e-06, + "loss": 0.5867, + "step": 1589 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 2.6530144214630127, + "learning_rate": 4.8291865221084815e-06, + "loss": 0.5917, + "step": 1590 + }, + { + "epoch": 0.7522458628841607, + "grad_norm": 2.5160958766937256, + "learning_rate": 4.82895981709116e-06, + 
"loss": 0.6347, + "step": 1591 + }, + { + "epoch": 0.7527186761229314, + "grad_norm": 2.61592698097229, + "learning_rate": 4.8287329670597225e-06, + "loss": 0.5472, + "step": 1592 + }, + { + "epoch": 0.7531914893617021, + "grad_norm": 2.7528622150421143, + "learning_rate": 4.828505972028296e-06, + "loss": 0.5842, + "step": 1593 + }, + { + "epoch": 0.7536643026004728, + "grad_norm": 2.8154072761535645, + "learning_rate": 4.828278832011011e-06, + "loss": 0.5757, + "step": 1594 + }, + { + "epoch": 0.7541371158392435, + "grad_norm": 3.118515729904175, + "learning_rate": 4.828051547022013e-06, + "loss": 0.6472, + "step": 1595 + }, + { + "epoch": 0.7546099290780142, + "grad_norm": 2.452033758163452, + "learning_rate": 4.827824117075453e-06, + "loss": 0.5571, + "step": 1596 + }, + { + "epoch": 0.7550827423167848, + "grad_norm": 2.984388828277588, + "learning_rate": 4.827596542185492e-06, + "loss": 0.6656, + "step": 1597 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 2.61356782913208, + "learning_rate": 4.8273688223663014e-06, + "loss": 0.6444, + "step": 1598 + }, + { + "epoch": 0.7560283687943262, + "grad_norm": 2.8967196941375732, + "learning_rate": 4.8271409576320595e-06, + "loss": 0.6457, + "step": 1599 + }, + { + "epoch": 0.7565011820330969, + "grad_norm": 2.852367639541626, + "learning_rate": 4.826912947996954e-06, + "loss": 0.5629, + "step": 1600 + }, + { + "epoch": 0.7569739952718676, + "grad_norm": 2.905280590057373, + "learning_rate": 4.826684793475182e-06, + "loss": 0.6245, + "step": 1601 + }, + { + "epoch": 0.7574468085106383, + "grad_norm": 2.6156530380249023, + "learning_rate": 4.826456494080951e-06, + "loss": 0.5869, + "step": 1602 + }, + { + "epoch": 0.757919621749409, + "grad_norm": 2.6490228176116943, + "learning_rate": 4.826228049828475e-06, + "loss": 0.5461, + "step": 1603 + }, + { + "epoch": 0.7583924349881797, + "grad_norm": 2.9626693725585938, + "learning_rate": 4.825999460731978e-06, + "loss": 0.6842, + "step": 1604 + }, + { + "epoch": 0.7588652482269503, + "grad_norm": 2.6866023540496826, + "learning_rate": 4.825770726805695e-06, + "loss": 0.5726, + "step": 1605 + }, + { + "epoch": 0.759338061465721, + "grad_norm": 2.5525858402252197, + "learning_rate": 4.825541848063866e-06, + "loss": 0.6061, + "step": 1606 + }, + { + "epoch": 0.7598108747044917, + "grad_norm": 2.703977584838867, + "learning_rate": 4.825312824520743e-06, + "loss": 0.6726, + "step": 1607 + }, + { + "epoch": 0.7602836879432624, + "grad_norm": 2.856534957885742, + "learning_rate": 4.825083656190588e-06, + "loss": 0.625, + "step": 1608 + }, + { + "epoch": 0.7607565011820331, + "grad_norm": 2.8564887046813965, + "learning_rate": 4.824854343087668e-06, + "loss": 0.7251, + "step": 1609 + }, + { + "epoch": 0.7612293144208038, + "grad_norm": 2.327650308609009, + "learning_rate": 4.824624885226262e-06, + "loss": 0.526, + "step": 1610 + }, + { + "epoch": 0.7617021276595745, + "grad_norm": 3.0025737285614014, + "learning_rate": 4.824395282620659e-06, + "loss": 0.6043, + "step": 1611 + }, + { + "epoch": 0.7621749408983451, + "grad_norm": 2.5441737174987793, + "learning_rate": 4.824165535285152e-06, + "loss": 0.6276, + "step": 1612 + }, + { + "epoch": 0.7626477541371158, + "grad_norm": 2.4177372455596924, + "learning_rate": 4.823935643234049e-06, + "loss": 0.6419, + "step": 1613 + }, + { + "epoch": 0.7631205673758865, + "grad_norm": 2.9210550785064697, + "learning_rate": 4.823705606481664e-06, + "loss": 0.5663, + "step": 1614 + }, + { + "epoch": 0.7635933806146572, + "grad_norm": 2.6353724002838135, + 
"learning_rate": 4.82347542504232e-06, + "loss": 0.5669, + "step": 1615 + }, + { + "epoch": 0.7640661938534279, + "grad_norm": 2.419081926345825, + "learning_rate": 4.823245098930349e-06, + "loss": 0.5777, + "step": 1616 + }, + { + "epoch": 0.7645390070921986, + "grad_norm": 2.5077571868896484, + "learning_rate": 4.823014628160093e-06, + "loss": 0.5924, + "step": 1617 + }, + { + "epoch": 0.7650118203309693, + "grad_norm": 2.816056251525879, + "learning_rate": 4.822784012745902e-06, + "loss": 0.7273, + "step": 1618 + }, + { + "epoch": 0.76548463356974, + "grad_norm": 2.7163147926330566, + "learning_rate": 4.8225532527021366e-06, + "loss": 0.5545, + "step": 1619 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 2.4784302711486816, + "learning_rate": 4.822322348043164e-06, + "loss": 0.556, + "step": 1620 + }, + { + "epoch": 0.7664302600472813, + "grad_norm": 2.712467670440674, + "learning_rate": 4.822091298783361e-06, + "loss": 0.6501, + "step": 1621 + }, + { + "epoch": 0.766903073286052, + "grad_norm": 2.7217724323272705, + "learning_rate": 4.821860104937115e-06, + "loss": 0.5989, + "step": 1622 + }, + { + "epoch": 0.7673758865248227, + "grad_norm": 2.5622854232788086, + "learning_rate": 4.821628766518821e-06, + "loss": 0.5263, + "step": 1623 + }, + { + "epoch": 0.7678486997635934, + "grad_norm": 3.230923891067505, + "learning_rate": 4.821397283542884e-06, + "loss": 0.6707, + "step": 1624 + }, + { + "epoch": 0.7683215130023641, + "grad_norm": 2.37929105758667, + "learning_rate": 4.821165656023718e-06, + "loss": 0.6124, + "step": 1625 + }, + { + "epoch": 0.7687943262411348, + "grad_norm": 2.9811325073242188, + "learning_rate": 4.820933883975745e-06, + "loss": 0.6435, + "step": 1626 + }, + { + "epoch": 0.7692671394799054, + "grad_norm": 2.887380838394165, + "learning_rate": 4.820701967413395e-06, + "loss": 0.621, + "step": 1627 + }, + { + "epoch": 0.7697399527186761, + "grad_norm": 2.6762876510620117, + "learning_rate": 4.820469906351109e-06, + "loss": 0.5713, + "step": 1628 + }, + { + "epoch": 0.7702127659574468, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.820237700803337e-06, + "loss": 0.6136, + "step": 1629 + }, + { + "epoch": 0.7706855791962175, + "grad_norm": 2.7244746685028076, + "learning_rate": 4.820005350784539e-06, + "loss": 0.5816, + "step": 1630 + }, + { + "epoch": 0.7711583924349882, + "grad_norm": 2.9293999671936035, + "learning_rate": 4.8197728563091795e-06, + "loss": 0.6649, + "step": 1631 + }, + { + "epoch": 0.7716312056737589, + "grad_norm": 2.4402127265930176, + "learning_rate": 4.819540217391736e-06, + "loss": 0.6481, + "step": 1632 + }, + { + "epoch": 0.7721040189125296, + "grad_norm": 3.083941698074341, + "learning_rate": 4.819307434046694e-06, + "loss": 0.6951, + "step": 1633 + }, + { + "epoch": 0.7725768321513002, + "grad_norm": 2.544952392578125, + "learning_rate": 4.819074506288548e-06, + "loss": 0.539, + "step": 1634 + }, + { + "epoch": 0.7730496453900709, + "grad_norm": 2.7791268825531006, + "learning_rate": 4.818841434131801e-06, + "loss": 0.5827, + "step": 1635 + }, + { + "epoch": 0.7735224586288416, + "grad_norm": 2.7349796295166016, + "learning_rate": 4.818608217590967e-06, + "loss": 0.5584, + "step": 1636 + }, + { + "epoch": 0.7739952718676123, + "grad_norm": 2.637652635574341, + "learning_rate": 4.818374856680565e-06, + "loss": 0.6386, + "step": 1637 + }, + { + "epoch": 0.774468085106383, + "grad_norm": 2.9821584224700928, + "learning_rate": 4.818141351415127e-06, + "loss": 0.6734, + "step": 1638 + }, + { + "epoch": 0.7749408983451537, + 
"grad_norm": 2.992938995361328, + "learning_rate": 4.817907701809192e-06, + "loss": 0.5899, + "step": 1639 + }, + { + "epoch": 0.7754137115839244, + "grad_norm": 4.35719633102417, + "learning_rate": 4.8176739078773076e-06, + "loss": 0.6281, + "step": 1640 + }, + { + "epoch": 0.775886524822695, + "grad_norm": 2.838146209716797, + "learning_rate": 4.8174399696340315e-06, + "loss": 0.5766, + "step": 1641 + }, + { + "epoch": 0.7763593380614657, + "grad_norm": 3.3116989135742188, + "learning_rate": 4.81720588709393e-06, + "loss": 0.6409, + "step": 1642 + }, + { + "epoch": 0.7768321513002364, + "grad_norm": 2.9843590259552, + "learning_rate": 4.816971660271579e-06, + "loss": 0.6108, + "step": 1643 + }, + { + "epoch": 0.7773049645390071, + "grad_norm": 2.843770742416382, + "learning_rate": 4.816737289181562e-06, + "loss": 0.6053, + "step": 1644 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 2.7608556747436523, + "learning_rate": 4.816502773838473e-06, + "loss": 0.5854, + "step": 1645 + }, + { + "epoch": 0.7782505910165485, + "grad_norm": 3.343682289123535, + "learning_rate": 4.816268114256914e-06, + "loss": 0.6329, + "step": 1646 + }, + { + "epoch": 0.7787234042553192, + "grad_norm": 2.769768476486206, + "learning_rate": 4.816033310451496e-06, + "loss": 0.6242, + "step": 1647 + }, + { + "epoch": 0.7791962174940898, + "grad_norm": 2.989851713180542, + "learning_rate": 4.815798362436838e-06, + "loss": 0.6493, + "step": 1648 + }, + { + "epoch": 0.7796690307328605, + "grad_norm": 3.170736312866211, + "learning_rate": 4.8155632702275716e-06, + "loss": 0.6341, + "step": 1649 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 2.7372522354125977, + "learning_rate": 4.815328033838334e-06, + "loss": 0.5445, + "step": 1650 + }, + { + "epoch": 0.7806146572104019, + "grad_norm": 2.6947238445281982, + "learning_rate": 4.8150926532837715e-06, + "loss": 0.6437, + "step": 1651 + }, + { + "epoch": 0.7810874704491726, + "grad_norm": 2.472323179244995, + "learning_rate": 4.81485712857854e-06, + "loss": 0.5751, + "step": 1652 + }, + { + "epoch": 0.7815602836879433, + "grad_norm": 2.791114091873169, + "learning_rate": 4.814621459737308e-06, + "loss": 0.5996, + "step": 1653 + }, + { + "epoch": 0.782033096926714, + "grad_norm": 3.1957521438598633, + "learning_rate": 4.814385646774745e-06, + "loss": 0.5803, + "step": 1654 + }, + { + "epoch": 0.7825059101654847, + "grad_norm": 2.4120798110961914, + "learning_rate": 4.8141496897055364e-06, + "loss": 0.5814, + "step": 1655 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 2.9262423515319824, + "learning_rate": 4.813913588544374e-06, + "loss": 0.6292, + "step": 1656 + }, + { + "epoch": 0.783451536643026, + "grad_norm": 2.8251047134399414, + "learning_rate": 4.813677343305959e-06, + "loss": 0.6787, + "step": 1657 + }, + { + "epoch": 0.7839243498817967, + "grad_norm": 2.931659698486328, + "learning_rate": 4.8134409540050005e-06, + "loss": 0.6163, + "step": 1658 + }, + { + "epoch": 0.7843971631205674, + "grad_norm": 2.7160706520080566, + "learning_rate": 4.813204420656219e-06, + "loss": 0.6831, + "step": 1659 + }, + { + "epoch": 0.7848699763593381, + "grad_norm": 3.2134454250335693, + "learning_rate": 4.81296774327434e-06, + "loss": 0.6002, + "step": 1660 + }, + { + "epoch": 0.7853427895981088, + "grad_norm": 2.4002513885498047, + "learning_rate": 4.812730921874103e-06, + "loss": 0.5488, + "step": 1661 + }, + { + "epoch": 0.7858156028368795, + "grad_norm": 2.5559282302856445, + "learning_rate": 4.812493956470251e-06, + "loss": 0.5802, + "step": 1662 + }, + { + 
"epoch": 0.7862884160756501, + "grad_norm": 2.57478404045105, + "learning_rate": 4.812256847077541e-06, + "loss": 0.646, + "step": 1663 + }, + { + "epoch": 0.7867612293144208, + "grad_norm": 2.811851978302002, + "learning_rate": 4.812019593710736e-06, + "loss": 0.6245, + "step": 1664 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 2.5228829383850098, + "learning_rate": 4.811782196384609e-06, + "loss": 0.5949, + "step": 1665 + }, + { + "epoch": 0.7877068557919622, + "grad_norm": 2.744096040725708, + "learning_rate": 4.8115446551139415e-06, + "loss": 0.6006, + "step": 1666 + }, + { + "epoch": 0.7881796690307329, + "grad_norm": 3.129242420196533, + "learning_rate": 4.811306969913524e-06, + "loss": 0.7251, + "step": 1667 + }, + { + "epoch": 0.7886524822695036, + "grad_norm": 2.7855660915374756, + "learning_rate": 4.811069140798156e-06, + "loss": 0.6534, + "step": 1668 + }, + { + "epoch": 0.7891252955082743, + "grad_norm": 2.836603879928589, + "learning_rate": 4.810831167782647e-06, + "loss": 0.6661, + "step": 1669 + }, + { + "epoch": 0.789598108747045, + "grad_norm": 2.5339887142181396, + "learning_rate": 4.810593050881813e-06, + "loss": 0.5354, + "step": 1670 + }, + { + "epoch": 0.7900709219858156, + "grad_norm": 2.9553709030151367, + "learning_rate": 4.810354790110482e-06, + "loss": 0.6001, + "step": 1671 + }, + { + "epoch": 0.7905437352245863, + "grad_norm": 2.6581788063049316, + "learning_rate": 4.8101163854834885e-06, + "loss": 0.6802, + "step": 1672 + }, + { + "epoch": 0.791016548463357, + "grad_norm": 3.2002551555633545, + "learning_rate": 4.809877837015677e-06, + "loss": 0.6641, + "step": 1673 + }, + { + "epoch": 0.7914893617021277, + "grad_norm": 2.918792963027954, + "learning_rate": 4.809639144721902e-06, + "loss": 0.6758, + "step": 1674 + }, + { + "epoch": 0.7919621749408984, + "grad_norm": 2.7993946075439453, + "learning_rate": 4.8094003086170245e-06, + "loss": 0.5889, + "step": 1675 + }, + { + "epoch": 0.7924349881796691, + "grad_norm": 2.3698952198028564, + "learning_rate": 4.809161328715916e-06, + "loss": 0.6244, + "step": 1676 + }, + { + "epoch": 0.7929078014184398, + "grad_norm": 2.8891594409942627, + "learning_rate": 4.808922205033458e-06, + "loss": 0.5835, + "step": 1677 + }, + { + "epoch": 0.7933806146572104, + "grad_norm": 2.838345766067505, + "learning_rate": 4.808682937584537e-06, + "loss": 0.6907, + "step": 1678 + }, + { + "epoch": 0.7938534278959811, + "grad_norm": 2.8443174362182617, + "learning_rate": 4.808443526384053e-06, + "loss": 0.6692, + "step": 1679 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 2.7355034351348877, + "learning_rate": 4.808203971446913e-06, + "loss": 0.5799, + "step": 1680 + }, + { + "epoch": 0.7947990543735225, + "grad_norm": 2.7108020782470703, + "learning_rate": 4.807964272788033e-06, + "loss": 0.652, + "step": 1681 + }, + { + "epoch": 0.7952718676122932, + "grad_norm": 2.397650957107544, + "learning_rate": 4.807724430422338e-06, + "loss": 0.5418, + "step": 1682 + }, + { + "epoch": 0.7957446808510639, + "grad_norm": 2.4981582164764404, + "learning_rate": 4.807484444364762e-06, + "loss": 0.5731, + "step": 1683 + }, + { + "epoch": 0.7962174940898346, + "grad_norm": 2.7943713665008545, + "learning_rate": 4.8072443146302475e-06, + "loss": 0.5913, + "step": 1684 + }, + { + "epoch": 0.7966903073286052, + "grad_norm": 2.5691423416137695, + "learning_rate": 4.807004041233746e-06, + "loss": 0.6475, + "step": 1685 + }, + { + "epoch": 0.7971631205673759, + "grad_norm": 3.2367498874664307, + "learning_rate": 4.8067636241902195e-06, + 
"loss": 0.675, + "step": 1686 + }, + { + "epoch": 0.7976359338061466, + "grad_norm": 3.000595808029175, + "learning_rate": 4.806523063514637e-06, + "loss": 0.5481, + "step": 1687 + }, + { + "epoch": 0.7981087470449173, + "grad_norm": 2.702014207839966, + "learning_rate": 4.806282359221976e-06, + "loss": 0.5993, + "step": 1688 + }, + { + "epoch": 0.798581560283688, + "grad_norm": 2.383671998977661, + "learning_rate": 4.806041511327226e-06, + "loss": 0.562, + "step": 1689 + }, + { + "epoch": 0.7990543735224587, + "grad_norm": 2.6965041160583496, + "learning_rate": 4.8058005198453834e-06, + "loss": 0.5955, + "step": 1690 + }, + { + "epoch": 0.7995271867612294, + "grad_norm": 2.5906765460968018, + "learning_rate": 4.805559384791453e-06, + "loss": 0.5151, + "step": 1691 + }, + { + "epoch": 0.8, + "grad_norm": 2.5454652309417725, + "learning_rate": 4.8053181061804475e-06, + "loss": 0.5843, + "step": 1692 + }, + { + "epoch": 0.8004728132387707, + "grad_norm": 2.661343812942505, + "learning_rate": 4.8050766840273935e-06, + "loss": 0.5995, + "step": 1693 + }, + { + "epoch": 0.8009456264775414, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.8048351183473215e-06, + "loss": 0.5676, + "step": 1694 + }, + { + "epoch": 0.8014184397163121, + "grad_norm": 2.5936667919158936, + "learning_rate": 4.804593409155274e-06, + "loss": 0.6291, + "step": 1695 + }, + { + "epoch": 0.8018912529550828, + "grad_norm": 2.6902432441711426, + "learning_rate": 4.804351556466299e-06, + "loss": 0.6114, + "step": 1696 + }, + { + "epoch": 0.8023640661938535, + "grad_norm": 2.7764673233032227, + "learning_rate": 4.804109560295457e-06, + "loss": 0.5768, + "step": 1697 + }, + { + "epoch": 0.8028368794326242, + "grad_norm": 2.9587221145629883, + "learning_rate": 4.803867420657816e-06, + "loss": 0.6048, + "step": 1698 + }, + { + "epoch": 0.8033096926713948, + "grad_norm": 2.9238998889923096, + "learning_rate": 4.803625137568453e-06, + "loss": 0.6329, + "step": 1699 + }, + { + "epoch": 0.8037825059101655, + "grad_norm": 2.70473313331604, + "learning_rate": 4.803382711042455e-06, + "loss": 0.5427, + "step": 1700 + }, + { + "epoch": 0.8042553191489362, + "grad_norm": 3.1604979038238525, + "learning_rate": 4.803140141094914e-06, + "loss": 0.626, + "step": 1701 + }, + { + "epoch": 0.8047281323877069, + "grad_norm": 2.9567699432373047, + "learning_rate": 4.802897427740936e-06, + "loss": 0.5319, + "step": 1702 + }, + { + "epoch": 0.8052009456264776, + "grad_norm": 2.90983247756958, + "learning_rate": 4.802654570995632e-06, + "loss": 0.586, + "step": 1703 + }, + { + "epoch": 0.8056737588652483, + "grad_norm": 2.783480167388916, + "learning_rate": 4.8024115708741255e-06, + "loss": 0.5773, + "step": 1704 + }, + { + "epoch": 0.806146572104019, + "grad_norm": 3.3307793140411377, + "learning_rate": 4.802168427391547e-06, + "loss": 0.6257, + "step": 1705 + }, + { + "epoch": 0.8066193853427897, + "grad_norm": 3.0475001335144043, + "learning_rate": 4.801925140563034e-06, + "loss": 0.6612, + "step": 1706 + }, + { + "epoch": 0.8070921985815603, + "grad_norm": 2.8278894424438477, + "learning_rate": 4.8016817104037375e-06, + "loss": 0.6449, + "step": 1707 + }, + { + "epoch": 0.807565011820331, + "grad_norm": 2.760244369506836, + "learning_rate": 4.801438136928812e-06, + "loss": 0.7007, + "step": 1708 + }, + { + "epoch": 0.8080378250591016, + "grad_norm": 2.827634572982788, + "learning_rate": 4.801194420153427e-06, + "loss": 0.6418, + "step": 1709 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.8655009269714355, + "learning_rate": 
4.800950560092754e-06, + "loss": 0.6231, + "step": 1710 + }, + { + "epoch": 0.808983451536643, + "grad_norm": 2.738112688064575, + "learning_rate": 4.800706556761981e-06, + "loss": 0.6463, + "step": 1711 + }, + { + "epoch": 0.8094562647754137, + "grad_norm": 2.4781179428100586, + "learning_rate": 4.800462410176296e-06, + "loss": 0.5365, + "step": 1712 + }, + { + "epoch": 0.8099290780141843, + "grad_norm": 2.6049838066101074, + "learning_rate": 4.800218120350906e-06, + "loss": 0.6035, + "step": 1713 + }, + { + "epoch": 0.810401891252955, + "grad_norm": 2.9089980125427246, + "learning_rate": 4.79997368730102e-06, + "loss": 0.5828, + "step": 1714 + }, + { + "epoch": 0.8108747044917257, + "grad_norm": 2.831871747970581, + "learning_rate": 4.799729111041857e-06, + "loss": 0.5953, + "step": 1715 + }, + { + "epoch": 0.8113475177304964, + "grad_norm": 2.5611300468444824, + "learning_rate": 4.799484391588647e-06, + "loss": 0.6302, + "step": 1716 + }, + { + "epoch": 0.8118203309692671, + "grad_norm": 2.744070053100586, + "learning_rate": 4.799239528956625e-06, + "loss": 0.5561, + "step": 1717 + }, + { + "epoch": 0.8122931442080378, + "grad_norm": 2.7344231605529785, + "learning_rate": 4.798994523161041e-06, + "loss": 0.6317, + "step": 1718 + }, + { + "epoch": 0.8127659574468085, + "grad_norm": 2.3420889377593994, + "learning_rate": 4.798749374217149e-06, + "loss": 0.5415, + "step": 1719 + }, + { + "epoch": 0.8132387706855791, + "grad_norm": 2.57384991645813, + "learning_rate": 4.798504082140212e-06, + "loss": 0.6383, + "step": 1720 + }, + { + "epoch": 0.8137115839243498, + "grad_norm": 2.8819844722747803, + "learning_rate": 4.798258646945505e-06, + "loss": 0.6355, + "step": 1721 + }, + { + "epoch": 0.8141843971631205, + "grad_norm": 2.908123254776001, + "learning_rate": 4.79801306864831e-06, + "loss": 0.701, + "step": 1722 + }, + { + "epoch": 0.8146572104018912, + "grad_norm": 2.6500701904296875, + "learning_rate": 4.797767347263917e-06, + "loss": 0.6152, + "step": 1723 + }, + { + "epoch": 0.8151300236406619, + "grad_norm": 2.5513017177581787, + "learning_rate": 4.797521482807628e-06, + "loss": 0.6241, + "step": 1724 + }, + { + "epoch": 0.8156028368794326, + "grad_norm": 2.6239185333251953, + "learning_rate": 4.7972754752947495e-06, + "loss": 0.6072, + "step": 1725 + }, + { + "epoch": 0.8160756501182033, + "grad_norm": 2.673436403274536, + "learning_rate": 4.797029324740601e-06, + "loss": 0.5802, + "step": 1726 + }, + { + "epoch": 0.816548463356974, + "grad_norm": 2.533831834793091, + "learning_rate": 4.796783031160508e-06, + "loss": 0.5566, + "step": 1727 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 2.9806582927703857, + "learning_rate": 4.796536594569807e-06, + "loss": 0.6945, + "step": 1728 + }, + { + "epoch": 0.8174940898345153, + "grad_norm": 2.7093560695648193, + "learning_rate": 4.796290014983842e-06, + "loss": 0.7143, + "step": 1729 + }, + { + "epoch": 0.817966903073286, + "grad_norm": 2.814507246017456, + "learning_rate": 4.796043292417967e-06, + "loss": 0.6122, + "step": 1730 + }, + { + "epoch": 0.8184397163120567, + "grad_norm": 2.537156820297241, + "learning_rate": 4.795796426887543e-06, + "loss": 0.6229, + "step": 1731 + }, + { + "epoch": 0.8189125295508274, + "grad_norm": 2.4878013134002686, + "learning_rate": 4.795549418407944e-06, + "loss": 0.5442, + "step": 1732 + }, + { + "epoch": 0.8193853427895981, + "grad_norm": 2.839383363723755, + "learning_rate": 4.795302266994548e-06, + "loss": 0.6717, + "step": 1733 + }, + { + "epoch": 0.8198581560283688, + "grad_norm": 
3.1981801986694336, + "learning_rate": 4.795054972662744e-06, + "loss": 0.6596, + "step": 1734 + }, + { + "epoch": 0.8203309692671394, + "grad_norm": 2.781730890274048, + "learning_rate": 4.79480753542793e-06, + "loss": 0.5845, + "step": 1735 + }, + { + "epoch": 0.8208037825059101, + "grad_norm": 2.689948558807373, + "learning_rate": 4.794559955305513e-06, + "loss": 0.5928, + "step": 1736 + }, + { + "epoch": 0.8212765957446808, + "grad_norm": 2.7267637252807617, + "learning_rate": 4.7943122323109105e-06, + "loss": 0.5224, + "step": 1737 + }, + { + "epoch": 0.8217494089834515, + "grad_norm": 2.4346601963043213, + "learning_rate": 4.794064366459544e-06, + "loss": 0.6431, + "step": 1738 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 2.7440176010131836, + "learning_rate": 4.793816357766849e-06, + "loss": 0.6083, + "step": 1739 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 2.6558027267456055, + "learning_rate": 4.793568206248268e-06, + "loss": 0.698, + "step": 1740 + }, + { + "epoch": 0.8231678486997636, + "grad_norm": 2.591658353805542, + "learning_rate": 4.793319911919251e-06, + "loss": 0.6601, + "step": 1741 + }, + { + "epoch": 0.8236406619385342, + "grad_norm": 2.5431172847747803, + "learning_rate": 4.79307147479526e-06, + "loss": 0.5917, + "step": 1742 + }, + { + "epoch": 0.8241134751773049, + "grad_norm": 2.7335588932037354, + "learning_rate": 4.792822894891762e-06, + "loss": 0.5925, + "step": 1743 + }, + { + "epoch": 0.8245862884160756, + "grad_norm": 2.2500839233398438, + "learning_rate": 4.792574172224237e-06, + "loss": 0.4984, + "step": 1744 + }, + { + "epoch": 0.8250591016548463, + "grad_norm": 2.691343069076538, + "learning_rate": 4.79232530680817e-06, + "loss": 0.6262, + "step": 1745 + }, + { + "epoch": 0.825531914893617, + "grad_norm": 2.612204074859619, + "learning_rate": 4.792076298659058e-06, + "loss": 0.5822, + "step": 1746 + }, + { + "epoch": 0.8260047281323877, + "grad_norm": 3.0163519382476807, + "learning_rate": 4.791827147792406e-06, + "loss": 0.6263, + "step": 1747 + }, + { + "epoch": 0.8264775413711584, + "grad_norm": 2.742183208465576, + "learning_rate": 4.791577854223727e-06, + "loss": 0.6628, + "step": 1748 + }, + { + "epoch": 0.826950354609929, + "grad_norm": 2.872213840484619, + "learning_rate": 4.791328417968542e-06, + "loss": 0.6332, + "step": 1749 + }, + { + "epoch": 0.8274231678486997, + "grad_norm": 2.725006580352783, + "learning_rate": 4.7910788390423844e-06, + "loss": 0.6266, + "step": 1750 + }, + { + "epoch": 0.8278959810874704, + "grad_norm": 3.0366697311401367, + "learning_rate": 4.790829117460793e-06, + "loss": 0.6403, + "step": 1751 + }, + { + "epoch": 0.8283687943262411, + "grad_norm": 2.594881772994995, + "learning_rate": 4.790579253239318e-06, + "loss": 0.521, + "step": 1752 + }, + { + "epoch": 0.8288416075650118, + "grad_norm": 2.4496347904205322, + "learning_rate": 4.790329246393517e-06, + "loss": 0.54, + "step": 1753 + }, + { + "epoch": 0.8293144208037825, + "grad_norm": 3.102278470993042, + "learning_rate": 4.790079096938956e-06, + "loss": 0.6142, + "step": 1754 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 2.4645912647247314, + "learning_rate": 4.789828804891212e-06, + "loss": 0.5212, + "step": 1755 + }, + { + "epoch": 0.8302600472813239, + "grad_norm": 2.7482516765594482, + "learning_rate": 4.789578370265868e-06, + "loss": 0.6712, + "step": 1756 + }, + { + "epoch": 0.8307328605200945, + "grad_norm": 2.61360502243042, + "learning_rate": 4.7893277930785195e-06, + "loss": 0.6367, + "step": 1757 + }, + { + "epoch": 
0.8312056737588652, + "grad_norm": 2.79028058052063, + "learning_rate": 4.789077073344767e-06, + "loss": 0.5099, + "step": 1758 + }, + { + "epoch": 0.8316784869976359, + "grad_norm": 2.647662401199341, + "learning_rate": 4.788826211080222e-06, + "loss": 0.6698, + "step": 1759 + }, + { + "epoch": 0.8321513002364066, + "grad_norm": 3.0214831829071045, + "learning_rate": 4.7885752063005055e-06, + "loss": 0.6121, + "step": 1760 + }, + { + "epoch": 0.8326241134751773, + "grad_norm": 2.8244032859802246, + "learning_rate": 4.788324059021247e-06, + "loss": 0.6921, + "step": 1761 + }, + { + "epoch": 0.833096926713948, + "grad_norm": 3.1501076221466064, + "learning_rate": 4.788072769258082e-06, + "loss": 0.6872, + "step": 1762 + }, + { + "epoch": 0.8335697399527187, + "grad_norm": 2.6989903450012207, + "learning_rate": 4.7878213370266594e-06, + "loss": 0.5884, + "step": 1763 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 2.6982665061950684, + "learning_rate": 4.787569762342633e-06, + "loss": 0.6112, + "step": 1764 + }, + { + "epoch": 0.83451536643026, + "grad_norm": 2.6918323040008545, + "learning_rate": 4.7873180452216685e-06, + "loss": 0.5315, + "step": 1765 + }, + { + "epoch": 0.8349881796690307, + "grad_norm": 2.5494401454925537, + "learning_rate": 4.78706618567944e-06, + "loss": 0.5909, + "step": 1766 + }, + { + "epoch": 0.8354609929078014, + "grad_norm": 2.7532095909118652, + "learning_rate": 4.786814183731627e-06, + "loss": 0.5566, + "step": 1767 + }, + { + "epoch": 0.8359338061465721, + "grad_norm": 2.550865888595581, + "learning_rate": 4.786562039393923e-06, + "loss": 0.555, + "step": 1768 + }, + { + "epoch": 0.8364066193853428, + "grad_norm": 2.4477791786193848, + "learning_rate": 4.786309752682028e-06, + "loss": 0.5844, + "step": 1769 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 2.6982262134552, + "learning_rate": 4.7860573236116485e-06, + "loss": 0.6136, + "step": 1770 + }, + { + "epoch": 0.8373522458628841, + "grad_norm": 2.456263542175293, + "learning_rate": 4.785804752198503e-06, + "loss": 0.5055, + "step": 1771 + }, + { + "epoch": 0.8378250591016548, + "grad_norm": 2.428544521331787, + "learning_rate": 4.78555203845832e-06, + "loss": 0.5859, + "step": 1772 + }, + { + "epoch": 0.8382978723404255, + "grad_norm": 2.1782307624816895, + "learning_rate": 4.785299182406833e-06, + "loss": 0.5325, + "step": 1773 + }, + { + "epoch": 0.8387706855791962, + "grad_norm": 3.137956142425537, + "learning_rate": 4.785046184059786e-06, + "loss": 0.6097, + "step": 1774 + }, + { + "epoch": 0.8392434988179669, + "grad_norm": 2.6269001960754395, + "learning_rate": 4.7847930434329336e-06, + "loss": 0.5972, + "step": 1775 + }, + { + "epoch": 0.8397163120567376, + "grad_norm": 2.732659339904785, + "learning_rate": 4.784539760542037e-06, + "loss": 0.6054, + "step": 1776 + }, + { + "epoch": 0.8401891252955083, + "grad_norm": 2.5346736907958984, + "learning_rate": 4.784286335402866e-06, + "loss": 0.5521, + "step": 1777 + }, + { + "epoch": 0.840661938534279, + "grad_norm": 3.1420228481292725, + "learning_rate": 4.784032768031202e-06, + "loss": 0.6165, + "step": 1778 + }, + { + "epoch": 0.8411347517730496, + "grad_norm": 3.073793411254883, + "learning_rate": 4.783779058442831e-06, + "loss": 0.6414, + "step": 1779 + }, + { + "epoch": 0.8416075650118203, + "grad_norm": 2.6621336936950684, + "learning_rate": 4.783525206653554e-06, + "loss": 0.5836, + "step": 1780 + }, + { + "epoch": 0.842080378250591, + "grad_norm": 2.7029049396514893, + "learning_rate": 4.7832712126791745e-06, + "loss": 0.5897, + 
"step": 1781 + }, + { + "epoch": 0.8425531914893617, + "grad_norm": 2.4733822345733643, + "learning_rate": 4.783017076535509e-06, + "loss": 0.5913, + "step": 1782 + }, + { + "epoch": 0.8430260047281324, + "grad_norm": 2.8119473457336426, + "learning_rate": 4.782762798238381e-06, + "loss": 0.6105, + "step": 1783 + }, + { + "epoch": 0.8434988179669031, + "grad_norm": 2.5290818214416504, + "learning_rate": 4.782508377803622e-06, + "loss": 0.6119, + "step": 1784 + }, + { + "epoch": 0.8439716312056738, + "grad_norm": 3.193472385406494, + "learning_rate": 4.782253815247076e-06, + "loss": 0.6665, + "step": 1785 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 3.206759452819824, + "learning_rate": 4.781999110584592e-06, + "loss": 0.6012, + "step": 1786 + }, + { + "epoch": 0.8449172576832151, + "grad_norm": 2.6227457523345947, + "learning_rate": 4.781744263832029e-06, + "loss": 0.5845, + "step": 1787 + }, + { + "epoch": 0.8453900709219858, + "grad_norm": 2.838365316390991, + "learning_rate": 4.781489275005257e-06, + "loss": 0.5695, + "step": 1788 + }, + { + "epoch": 0.8458628841607565, + "grad_norm": 2.8348326683044434, + "learning_rate": 4.78123414412015e-06, + "loss": 0.6136, + "step": 1789 + }, + { + "epoch": 0.8463356973995272, + "grad_norm": 2.5698344707489014, + "learning_rate": 4.780978871192597e-06, + "loss": 0.6576, + "step": 1790 + }, + { + "epoch": 0.8468085106382979, + "grad_norm": 2.5198330879211426, + "learning_rate": 4.780723456238492e-06, + "loss": 0.5521, + "step": 1791 + }, + { + "epoch": 0.8472813238770686, + "grad_norm": 3.001325845718384, + "learning_rate": 4.780467899273737e-06, + "loss": 0.6075, + "step": 1792 + }, + { + "epoch": 0.8477541371158392, + "grad_norm": 2.7732746601104736, + "learning_rate": 4.780212200314247e-06, + "loss": 0.6245, + "step": 1793 + }, + { + "epoch": 0.8482269503546099, + "grad_norm": 2.6950337886810303, + "learning_rate": 4.77995635937594e-06, + "loss": 0.5723, + "step": 1794 + }, + { + "epoch": 0.8486997635933806, + "grad_norm": 2.82051420211792, + "learning_rate": 4.779700376474749e-06, + "loss": 0.6184, + "step": 1795 + }, + { + "epoch": 0.8491725768321513, + "grad_norm": 2.757791757583618, + "learning_rate": 4.779444251626611e-06, + "loss": 0.608, + "step": 1796 + }, + { + "epoch": 0.849645390070922, + "grad_norm": 2.394108533859253, + "learning_rate": 4.779187984847475e-06, + "loss": 0.6174, + "step": 1797 + }, + { + "epoch": 0.8501182033096927, + "grad_norm": 2.427562713623047, + "learning_rate": 4.778931576153296e-06, + "loss": 0.5618, + "step": 1798 + }, + { + "epoch": 0.8505910165484634, + "grad_norm": 2.891268491744995, + "learning_rate": 4.778675025560042e-06, + "loss": 0.6865, + "step": 1799 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.665534257888794, + "learning_rate": 4.778418333083685e-06, + "loss": 0.5852, + "step": 1800 + }, + { + "epoch": 0.8515366430260047, + "grad_norm": 2.5492889881134033, + "learning_rate": 4.7781614987402095e-06, + "loss": 0.5161, + "step": 1801 + }, + { + "epoch": 0.8520094562647754, + "grad_norm": 2.400177001953125, + "learning_rate": 4.777904522545607e-06, + "loss": 0.5128, + "step": 1802 + }, + { + "epoch": 0.8524822695035461, + "grad_norm": 2.3949809074401855, + "learning_rate": 4.777647404515878e-06, + "loss": 0.571, + "step": 1803 + }, + { + "epoch": 0.8529550827423168, + "grad_norm": 2.3624472618103027, + "learning_rate": 4.7773901446670325e-06, + "loss": 0.5486, + "step": 1804 + }, + { + "epoch": 0.8534278959810875, + "grad_norm": 2.711366891860962, + "learning_rate": 
4.7771327430150885e-06, + "loss": 0.5667, + "step": 1805 + }, + { + "epoch": 0.8539007092198582, + "grad_norm": 2.7681493759155273, + "learning_rate": 4.776875199576073e-06, + "loss": 0.5686, + "step": 1806 + }, + { + "epoch": 0.8543735224586289, + "grad_norm": 3.0369436740875244, + "learning_rate": 4.776617514366023e-06, + "loss": 0.6635, + "step": 1807 + }, + { + "epoch": 0.8548463356973995, + "grad_norm": 2.919649600982666, + "learning_rate": 4.776359687400983e-06, + "loss": 0.5749, + "step": 1808 + }, + { + "epoch": 0.8553191489361702, + "grad_norm": 2.7986185550689697, + "learning_rate": 4.776101718697007e-06, + "loss": 0.559, + "step": 1809 + }, + { + "epoch": 0.8557919621749409, + "grad_norm": 2.5951223373413086, + "learning_rate": 4.775843608270158e-06, + "loss": 0.5654, + "step": 1810 + }, + { + "epoch": 0.8562647754137116, + "grad_norm": 2.674138069152832, + "learning_rate": 4.775585356136505e-06, + "loss": 0.5286, + "step": 1811 + }, + { + "epoch": 0.8567375886524823, + "grad_norm": 3.045437812805176, + "learning_rate": 4.775326962312131e-06, + "loss": 0.6185, + "step": 1812 + }, + { + "epoch": 0.857210401891253, + "grad_norm": 2.6145293712615967, + "learning_rate": 4.775068426813124e-06, + "loss": 0.6075, + "step": 1813 + }, + { + "epoch": 0.8576832151300237, + "grad_norm": 2.6320106983184814, + "learning_rate": 4.7748097496555824e-06, + "loss": 0.561, + "step": 1814 + }, + { + "epoch": 0.8581560283687943, + "grad_norm": 2.5038623809814453, + "learning_rate": 4.774550930855612e-06, + "loss": 0.593, + "step": 1815 + }, + { + "epoch": 0.858628841607565, + "grad_norm": 2.8168089389801025, + "learning_rate": 4.774291970429329e-06, + "loss": 0.5196, + "step": 1816 + }, + { + "epoch": 0.8591016548463357, + "grad_norm": 2.778130292892456, + "learning_rate": 4.774032868392858e-06, + "loss": 0.5984, + "step": 1817 + }, + { + "epoch": 0.8595744680851064, + "grad_norm": 2.536458730697632, + "learning_rate": 4.7737736247623305e-06, + "loss": 0.568, + "step": 1818 + }, + { + "epoch": 0.8600472813238771, + "grad_norm": 2.6669719219207764, + "learning_rate": 4.77351423955389e-06, + "loss": 0.6233, + "step": 1819 + }, + { + "epoch": 0.8605200945626478, + "grad_norm": 2.578242540359497, + "learning_rate": 4.773254712783687e-06, + "loss": 0.579, + "step": 1820 + }, + { + "epoch": 0.8609929078014185, + "grad_norm": 2.816664457321167, + "learning_rate": 4.772995044467881e-06, + "loss": 0.6635, + "step": 1821 + }, + { + "epoch": 0.8614657210401891, + "grad_norm": 3.1111979484558105, + "learning_rate": 4.77273523462264e-06, + "loss": 0.6372, + "step": 1822 + }, + { + "epoch": 0.8619385342789598, + "grad_norm": 2.764552354812622, + "learning_rate": 4.772475283264142e-06, + "loss": 0.6216, + "step": 1823 + }, + { + "epoch": 0.8624113475177305, + "grad_norm": 2.9126830101013184, + "learning_rate": 4.772215190408572e-06, + "loss": 0.6396, + "step": 1824 + }, + { + "epoch": 0.8628841607565012, + "grad_norm": 2.7502307891845703, + "learning_rate": 4.7719549560721264e-06, + "loss": 0.6186, + "step": 1825 + }, + { + "epoch": 0.8633569739952719, + "grad_norm": 2.6279006004333496, + "learning_rate": 4.771694580271007e-06, + "loss": 0.5557, + "step": 1826 + }, + { + "epoch": 0.8638297872340426, + "grad_norm": 2.996563196182251, + "learning_rate": 4.7714340630214276e-06, + "loss": 0.6259, + "step": 1827 + }, + { + "epoch": 0.8643026004728133, + "grad_norm": 3.231323480606079, + "learning_rate": 4.771173404339609e-06, + "loss": 0.5473, + "step": 1828 + }, + { + "epoch": 0.864775413711584, + "grad_norm": 
3.143519878387451, + "learning_rate": 4.770912604241781e-06, + "loss": 0.593, + "step": 1829 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 2.515484094619751, + "learning_rate": 4.770651662744184e-06, + "loss": 0.538, + "step": 1830 + }, + { + "epoch": 0.8657210401891253, + "grad_norm": 2.629058837890625, + "learning_rate": 4.770390579863064e-06, + "loss": 0.5745, + "step": 1831 + }, + { + "epoch": 0.866193853427896, + "grad_norm": 2.5826802253723145, + "learning_rate": 4.770129355614677e-06, + "loss": 0.6397, + "step": 1832 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 2.954623222351074, + "learning_rate": 4.769867990015289e-06, + "loss": 0.6106, + "step": 1833 + }, + { + "epoch": 0.8671394799054374, + "grad_norm": 2.742192268371582, + "learning_rate": 4.769606483081175e-06, + "loss": 0.6902, + "step": 1834 + }, + { + "epoch": 0.8676122931442081, + "grad_norm": 2.2619097232818604, + "learning_rate": 4.769344834828618e-06, + "loss": 0.5414, + "step": 1835 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 2.7384188175201416, + "learning_rate": 4.769083045273908e-06, + "loss": 0.5787, + "step": 1836 + }, + { + "epoch": 0.8685579196217494, + "grad_norm": 2.6734485626220703, + "learning_rate": 4.768821114433346e-06, + "loss": 0.5923, + "step": 1837 + }, + { + "epoch": 0.8690307328605201, + "grad_norm": 2.286140203475952, + "learning_rate": 4.768559042323243e-06, + "loss": 0.5822, + "step": 1838 + }, + { + "epoch": 0.8695035460992908, + "grad_norm": 3.0243725776672363, + "learning_rate": 4.768296828959915e-06, + "loss": 0.6623, + "step": 1839 + }, + { + "epoch": 0.8699763593380615, + "grad_norm": 2.4026312828063965, + "learning_rate": 4.768034474359689e-06, + "loss": 0.5554, + "step": 1840 + }, + { + "epoch": 0.8704491725768322, + "grad_norm": 2.7469029426574707, + "learning_rate": 4.767771978538903e-06, + "loss": 0.6316, + "step": 1841 + }, + { + "epoch": 0.8709219858156029, + "grad_norm": 2.729659080505371, + "learning_rate": 4.767509341513899e-06, + "loss": 0.5807, + "step": 1842 + }, + { + "epoch": 0.8713947990543736, + "grad_norm": 2.5336945056915283, + "learning_rate": 4.76724656330103e-06, + "loss": 0.6109, + "step": 1843 + }, + { + "epoch": 0.8718676122931442, + "grad_norm": 2.519880533218384, + "learning_rate": 4.76698364391666e-06, + "loss": 0.5313, + "step": 1844 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 2.698862075805664, + "learning_rate": 4.766720583377159e-06, + "loss": 0.5953, + "step": 1845 + }, + { + "epoch": 0.8728132387706856, + "grad_norm": 3.0195560455322266, + "learning_rate": 4.766457381698907e-06, + "loss": 0.5965, + "step": 1846 + }, + { + "epoch": 0.8732860520094563, + "grad_norm": 2.5972697734832764, + "learning_rate": 4.766194038898291e-06, + "loss": 0.6014, + "step": 1847 + }, + { + "epoch": 0.873758865248227, + "grad_norm": 2.7132294178009033, + "learning_rate": 4.76593055499171e-06, + "loss": 0.5638, + "step": 1848 + }, + { + "epoch": 0.8742316784869977, + "grad_norm": 2.7134575843811035, + "learning_rate": 4.765666929995568e-06, + "loss": 0.52, + "step": 1849 + }, + { + "epoch": 0.8747044917257684, + "grad_norm": 2.3804993629455566, + "learning_rate": 4.765403163926282e-06, + "loss": 0.5435, + "step": 1850 + }, + { + "epoch": 0.875177304964539, + "grad_norm": 2.8782761096954346, + "learning_rate": 4.765139256800274e-06, + "loss": 0.5843, + "step": 1851 + }, + { + "epoch": 0.8756501182033097, + "grad_norm": 2.836209774017334, + "learning_rate": 4.764875208633977e-06, + "loss": 0.6667, + "step": 1852 + }, + { + "epoch": 
0.8761229314420804, + "grad_norm": 2.608851194381714, + "learning_rate": 4.764611019443831e-06, + "loss": 0.5436, + "step": 1853 + }, + { + "epoch": 0.8765957446808511, + "grad_norm": 2.788738965988159, + "learning_rate": 4.764346689246288e-06, + "loss": 0.7331, + "step": 1854 + }, + { + "epoch": 0.8770685579196218, + "grad_norm": 2.524277687072754, + "learning_rate": 4.764082218057805e-06, + "loss": 0.5067, + "step": 1855 + }, + { + "epoch": 0.8775413711583925, + "grad_norm": 3.7559316158294678, + "learning_rate": 4.763817605894851e-06, + "loss": 0.6809, + "step": 1856 + }, + { + "epoch": 0.8780141843971632, + "grad_norm": 2.9070613384246826, + "learning_rate": 4.763552852773899e-06, + "loss": 0.5913, + "step": 1857 + }, + { + "epoch": 0.8784869976359339, + "grad_norm": 2.7050609588623047, + "learning_rate": 4.7632879587114386e-06, + "loss": 0.6074, + "step": 1858 + }, + { + "epoch": 0.8789598108747045, + "grad_norm": 2.891134262084961, + "learning_rate": 4.76302292372396e-06, + "loss": 0.5939, + "step": 1859 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 2.8581702709198, + "learning_rate": 4.762757747827968e-06, + "loss": 0.5972, + "step": 1860 + }, + { + "epoch": 0.8799054373522459, + "grad_norm": 2.8266196250915527, + "learning_rate": 4.762492431039971e-06, + "loss": 0.5993, + "step": 1861 + }, + { + "epoch": 0.8803782505910166, + "grad_norm": 2.4853954315185547, + "learning_rate": 4.762226973376493e-06, + "loss": 0.6388, + "step": 1862 + }, + { + "epoch": 0.8808510638297873, + "grad_norm": 3.2212886810302734, + "learning_rate": 4.761961374854059e-06, + "loss": 0.6698, + "step": 1863 + }, + { + "epoch": 0.881323877068558, + "grad_norm": 3.1254501342773438, + "learning_rate": 4.761695635489211e-06, + "loss": 0.5263, + "step": 1864 + }, + { + "epoch": 0.8817966903073287, + "grad_norm": 2.6891462802886963, + "learning_rate": 4.761429755298491e-06, + "loss": 0.5359, + "step": 1865 + }, + { + "epoch": 0.8822695035460993, + "grad_norm": 2.8557538986206055, + "learning_rate": 4.761163734298457e-06, + "loss": 0.5933, + "step": 1866 + }, + { + "epoch": 0.88274231678487, + "grad_norm": 2.53548264503479, + "learning_rate": 4.7608975725056724e-06, + "loss": 0.6397, + "step": 1867 + }, + { + "epoch": 0.8832151300236407, + "grad_norm": 3.0237956047058105, + "learning_rate": 4.76063126993671e-06, + "loss": 0.6845, + "step": 1868 + }, + { + "epoch": 0.8836879432624114, + "grad_norm": 3.222886800765991, + "learning_rate": 4.76036482660815e-06, + "loss": 0.6055, + "step": 1869 + }, + { + "epoch": 0.8841607565011821, + "grad_norm": 3.1867551803588867, + "learning_rate": 4.760098242536584e-06, + "loss": 0.6592, + "step": 1870 + }, + { + "epoch": 0.8846335697399527, + "grad_norm": 2.782209873199463, + "learning_rate": 4.7598315177386115e-06, + "loss": 0.5833, + "step": 1871 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 2.899871587753296, + "learning_rate": 4.759564652230838e-06, + "loss": 0.6129, + "step": 1872 + }, + { + "epoch": 0.885579196217494, + "grad_norm": 2.5690579414367676, + "learning_rate": 4.759297646029882e-06, + "loss": 0.5827, + "step": 1873 + }, + { + "epoch": 0.8860520094562647, + "grad_norm": 2.666130304336548, + "learning_rate": 4.759030499152368e-06, + "loss": 0.5272, + "step": 1874 + }, + { + "epoch": 0.8865248226950354, + "grad_norm": 2.7030911445617676, + "learning_rate": 4.758763211614932e-06, + "loss": 0.6415, + "step": 1875 + }, + { + "epoch": 0.8869976359338061, + "grad_norm": 2.717512845993042, + "learning_rate": 4.7584957834342135e-06, + "loss": 0.5827, + 
"step": 1876 + }, + { + "epoch": 0.8874704491725768, + "grad_norm": 2.665823459625244, + "learning_rate": 4.758228214626867e-06, + "loss": 0.6209, + "step": 1877 + }, + { + "epoch": 0.8879432624113475, + "grad_norm": 2.636653184890747, + "learning_rate": 4.75796050520955e-06, + "loss": 0.6413, + "step": 1878 + }, + { + "epoch": 0.8884160756501182, + "grad_norm": 2.585115671157837, + "learning_rate": 4.7576926551989345e-06, + "loss": 0.5518, + "step": 1879 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.808526039123535, + "learning_rate": 4.757424664611697e-06, + "loss": 0.5717, + "step": 1880 + }, + { + "epoch": 0.8893617021276595, + "grad_norm": 3.5957939624786377, + "learning_rate": 4.757156533464524e-06, + "loss": 0.6323, + "step": 1881 + }, + { + "epoch": 0.8898345153664302, + "grad_norm": 2.5003883838653564, + "learning_rate": 4.756888261774111e-06, + "loss": 0.5937, + "step": 1882 + }, + { + "epoch": 0.8903073286052009, + "grad_norm": 2.749061346054077, + "learning_rate": 4.756619849557161e-06, + "loss": 0.6642, + "step": 1883 + }, + { + "epoch": 0.8907801418439716, + "grad_norm": 2.6757891178131104, + "learning_rate": 4.756351296830389e-06, + "loss": 0.5887, + "step": 1884 + }, + { + "epoch": 0.8912529550827423, + "grad_norm": 2.811925172805786, + "learning_rate": 4.756082603610516e-06, + "loss": 0.6571, + "step": 1885 + }, + { + "epoch": 0.891725768321513, + "grad_norm": 2.5054616928100586, + "learning_rate": 4.755813769914271e-06, + "loss": 0.6312, + "step": 1886 + }, + { + "epoch": 0.8921985815602836, + "grad_norm": 2.7518467903137207, + "learning_rate": 4.755544795758395e-06, + "loss": 0.6685, + "step": 1887 + }, + { + "epoch": 0.8926713947990543, + "grad_norm": 2.7527287006378174, + "learning_rate": 4.755275681159634e-06, + "loss": 0.5886, + "step": 1888 + }, + { + "epoch": 0.893144208037825, + "grad_norm": 2.6162452697753906, + "learning_rate": 4.755006426134745e-06, + "loss": 0.546, + "step": 1889 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 2.4016737937927246, + "learning_rate": 4.754737030700495e-06, + "loss": 0.5726, + "step": 1890 + }, + { + "epoch": 0.8940898345153664, + "grad_norm": 2.528327703475952, + "learning_rate": 4.754467494873656e-06, + "loss": 0.5682, + "step": 1891 + }, + { + "epoch": 0.8945626477541371, + "grad_norm": 2.3139286041259766, + "learning_rate": 4.7541978186710115e-06, + "loss": 0.6108, + "step": 1892 + }, + { + "epoch": 0.8950354609929078, + "grad_norm": 2.7269136905670166, + "learning_rate": 4.753928002109354e-06, + "loss": 0.5875, + "step": 1893 + }, + { + "epoch": 0.8955082742316784, + "grad_norm": 4.425495147705078, + "learning_rate": 4.753658045205482e-06, + "loss": 0.5572, + "step": 1894 + }, + { + "epoch": 0.8959810874704491, + "grad_norm": 2.535409927368164, + "learning_rate": 4.753387947976206e-06, + "loss": 0.5868, + "step": 1895 + }, + { + "epoch": 0.8964539007092198, + "grad_norm": 2.722458600997925, + "learning_rate": 4.753117710438343e-06, + "loss": 0.5935, + "step": 1896 + }, + { + "epoch": 0.8969267139479905, + "grad_norm": 2.743861436843872, + "learning_rate": 4.75284733260872e-06, + "loss": 0.572, + "step": 1897 + }, + { + "epoch": 0.8973995271867612, + "grad_norm": 2.60640549659729, + "learning_rate": 4.752576814504173e-06, + "loss": 0.567, + "step": 1898 + }, + { + "epoch": 0.8978723404255319, + "grad_norm": 2.7486042976379395, + "learning_rate": 4.7523061561415435e-06, + "loss": 0.5768, + "step": 1899 + }, + { + "epoch": 0.8983451536643026, + "grad_norm": 3.8410251140594482, + "learning_rate": 
4.752035357537686e-06, + "loss": 0.6034, + "step": 1900 + }, + { + "epoch": 0.8988179669030733, + "grad_norm": 3.0935890674591064, + "learning_rate": 4.751764418709462e-06, + "loss": 0.5644, + "step": 1901 + }, + { + "epoch": 0.8992907801418439, + "grad_norm": 2.7989892959594727, + "learning_rate": 4.751493339673742e-06, + "loss": 0.656, + "step": 1902 + }, + { + "epoch": 0.8997635933806146, + "grad_norm": 3.6940557956695557, + "learning_rate": 4.751222120447403e-06, + "loss": 0.6632, + "step": 1903 + }, + { + "epoch": 0.9002364066193853, + "grad_norm": 2.3428797721862793, + "learning_rate": 4.750950761047335e-06, + "loss": 0.4485, + "step": 1904 + }, + { + "epoch": 0.900709219858156, + "grad_norm": 2.622544050216675, + "learning_rate": 4.750679261490432e-06, + "loss": 0.5857, + "step": 1905 + }, + { + "epoch": 0.9011820330969267, + "grad_norm": 2.4911322593688965, + "learning_rate": 4.750407621793601e-06, + "loss": 0.5618, + "step": 1906 + }, + { + "epoch": 0.9016548463356974, + "grad_norm": 2.6434662342071533, + "learning_rate": 4.750135841973755e-06, + "loss": 0.6057, + "step": 1907 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 3.115443706512451, + "learning_rate": 4.749863922047817e-06, + "loss": 0.6064, + "step": 1908 + }, + { + "epoch": 0.9026004728132387, + "grad_norm": 2.5671091079711914, + "learning_rate": 4.749591862032718e-06, + "loss": 0.5625, + "step": 1909 + }, + { + "epoch": 0.9030732860520094, + "grad_norm": 3.2008655071258545, + "learning_rate": 4.749319661945398e-06, + "loss": 0.5547, + "step": 1910 + }, + { + "epoch": 0.9035460992907801, + "grad_norm": 2.905987024307251, + "learning_rate": 4.749047321802805e-06, + "loss": 0.6033, + "step": 1911 + }, + { + "epoch": 0.9040189125295508, + "grad_norm": 3.1456053256988525, + "learning_rate": 4.748774841621897e-06, + "loss": 0.5651, + "step": 1912 + }, + { + "epoch": 0.9044917257683215, + "grad_norm": 2.8116416931152344, + "learning_rate": 4.748502221419641e-06, + "loss": 0.5853, + "step": 1913 + }, + { + "epoch": 0.9049645390070922, + "grad_norm": 3.123835325241089, + "learning_rate": 4.748229461213011e-06, + "loss": 0.5427, + "step": 1914 + }, + { + "epoch": 0.9054373522458629, + "grad_norm": 2.4750146865844727, + "learning_rate": 4.747956561018989e-06, + "loss": 0.6517, + "step": 1915 + }, + { + "epoch": 0.9059101654846335, + "grad_norm": 2.6174299716949463, + "learning_rate": 4.7476835208545705e-06, + "loss": 0.6119, + "step": 1916 + }, + { + "epoch": 0.9063829787234042, + "grad_norm": 2.7390382289886475, + "learning_rate": 4.747410340736755e-06, + "loss": 0.5664, + "step": 1917 + }, + { + "epoch": 0.9068557919621749, + "grad_norm": 2.7940444946289062, + "learning_rate": 4.747137020682552e-06, + "loss": 0.5628, + "step": 1918 + }, + { + "epoch": 0.9073286052009456, + "grad_norm": 2.477365016937256, + "learning_rate": 4.7468635607089795e-06, + "loss": 0.5261, + "step": 1919 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 2.7016685009002686, + "learning_rate": 4.746589960833066e-06, + "loss": 0.5576, + "step": 1920 + }, + { + "epoch": 0.908274231678487, + "grad_norm": 2.8806519508361816, + "learning_rate": 4.746316221071846e-06, + "loss": 0.5925, + "step": 1921 + }, + { + "epoch": 0.9087470449172577, + "grad_norm": 3.0315234661102295, + "learning_rate": 4.746042341442365e-06, + "loss": 0.6142, + "step": 1922 + }, + { + "epoch": 0.9092198581560283, + "grad_norm": 4.2446160316467285, + "learning_rate": 4.745768321961676e-06, + "loss": 0.5352, + "step": 1923 + }, + { + "epoch": 0.909692671394799, + "grad_norm": 
2.6517012119293213, + "learning_rate": 4.745494162646841e-06, + "loss": 0.6118, + "step": 1924 + }, + { + "epoch": 0.9101654846335697, + "grad_norm": 2.774900197982788, + "learning_rate": 4.7452198635149304e-06, + "loss": 0.572, + "step": 1925 + }, + { + "epoch": 0.9106382978723404, + "grad_norm": 3.0133683681488037, + "learning_rate": 4.744945424583024e-06, + "loss": 0.5897, + "step": 1926 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 2.7344839572906494, + "learning_rate": 4.744670845868211e-06, + "loss": 0.6207, + "step": 1927 + }, + { + "epoch": 0.9115839243498818, + "grad_norm": 2.636578321456909, + "learning_rate": 4.744396127387586e-06, + "loss": 0.6687, + "step": 1928 + }, + { + "epoch": 0.9120567375886525, + "grad_norm": 2.8663458824157715, + "learning_rate": 4.744121269158255e-06, + "loss": 0.5002, + "step": 1929 + }, + { + "epoch": 0.9125295508274232, + "grad_norm": 2.661079168319702, + "learning_rate": 4.743846271197333e-06, + "loss": 0.5848, + "step": 1930 + }, + { + "epoch": 0.9130023640661938, + "grad_norm": 2.881256341934204, + "learning_rate": 4.743571133521943e-06, + "loss": 0.5911, + "step": 1931 + }, + { + "epoch": 0.9134751773049645, + "grad_norm": 2.5540573596954346, + "learning_rate": 4.743295856149217e-06, + "loss": 0.5647, + "step": 1932 + }, + { + "epoch": 0.9139479905437352, + "grad_norm": 2.7060387134552, + "learning_rate": 4.743020439096293e-06, + "loss": 0.6267, + "step": 1933 + }, + { + "epoch": 0.9144208037825059, + "grad_norm": 2.694481372833252, + "learning_rate": 4.742744882380323e-06, + "loss": 0.6283, + "step": 1934 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 2.711555242538452, + "learning_rate": 4.7424691860184625e-06, + "loss": 0.5784, + "step": 1935 + }, + { + "epoch": 0.9153664302600473, + "grad_norm": 2.9077224731445312, + "learning_rate": 4.742193350027879e-06, + "loss": 0.5948, + "step": 1936 + }, + { + "epoch": 0.915839243498818, + "grad_norm": 2.9824187755584717, + "learning_rate": 4.7419173744257476e-06, + "loss": 0.6115, + "step": 1937 + }, + { + "epoch": 0.9163120567375886, + "grad_norm": 2.5127830505371094, + "learning_rate": 4.7416412592292515e-06, + "loss": 0.5803, + "step": 1938 + }, + { + "epoch": 0.9167848699763593, + "grad_norm": 3.1307175159454346, + "learning_rate": 4.741365004455583e-06, + "loss": 0.5657, + "step": 1939 + }, + { + "epoch": 0.91725768321513, + "grad_norm": 2.8205273151397705, + "learning_rate": 4.741088610121944e-06, + "loss": 0.6145, + "step": 1940 + }, + { + "epoch": 0.9177304964539007, + "grad_norm": 2.6119720935821533, + "learning_rate": 4.7408120762455444e-06, + "loss": 0.6058, + "step": 1941 + }, + { + "epoch": 0.9182033096926714, + "grad_norm": 2.421276092529297, + "learning_rate": 4.7405354028436025e-06, + "loss": 0.5973, + "step": 1942 + }, + { + "epoch": 0.9186761229314421, + "grad_norm": 2.9846808910369873, + "learning_rate": 4.740258589933346e-06, + "loss": 0.6892, + "step": 1943 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 2.6899871826171875, + "learning_rate": 4.739981637532009e-06, + "loss": 0.5705, + "step": 1944 + }, + { + "epoch": 0.9196217494089834, + "grad_norm": 2.8636131286621094, + "learning_rate": 4.739704545656839e-06, + "loss": 0.5775, + "step": 1945 + }, + { + "epoch": 0.9200945626477541, + "grad_norm": 2.7659449577331543, + "learning_rate": 4.739427314325087e-06, + "loss": 0.5823, + "step": 1946 + }, + { + "epoch": 0.9205673758865248, + "grad_norm": 4.71295166015625, + "learning_rate": 4.739149943554016e-06, + "loss": 0.5601, + "step": 1947 + }, + { + "epoch": 
0.9210401891252955, + "grad_norm": 2.642636775970459, + "learning_rate": 4.738872433360896e-06, + "loss": 0.5278, + "step": 1948 + }, + { + "epoch": 0.9215130023640662, + "grad_norm": 2.4658217430114746, + "learning_rate": 4.7385947837630065e-06, + "loss": 0.6392, + "step": 1949 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 2.851602792739868, + "learning_rate": 4.738316994777636e-06, + "loss": 0.6164, + "step": 1950 + }, + { + "epoch": 0.9224586288416076, + "grad_norm": 2.394226551055908, + "learning_rate": 4.738039066422081e-06, + "loss": 0.5556, + "step": 1951 + }, + { + "epoch": 0.9229314420803783, + "grad_norm": 2.7985100746154785, + "learning_rate": 4.737760998713647e-06, + "loss": 0.5799, + "step": 1952 + }, + { + "epoch": 0.9234042553191489, + "grad_norm": 2.5974674224853516, + "learning_rate": 4.737482791669648e-06, + "loss": 0.6984, + "step": 1953 + }, + { + "epoch": 0.9238770685579196, + "grad_norm": 2.707636594772339, + "learning_rate": 4.737204445307406e-06, + "loss": 0.5548, + "step": 1954 + }, + { + "epoch": 0.9243498817966903, + "grad_norm": 2.7882707118988037, + "learning_rate": 4.736925959644254e-06, + "loss": 0.6026, + "step": 1955 + }, + { + "epoch": 0.924822695035461, + "grad_norm": 2.474482774734497, + "learning_rate": 4.7366473346975304e-06, + "loss": 0.5832, + "step": 1956 + }, + { + "epoch": 0.9252955082742317, + "grad_norm": 2.6196324825286865, + "learning_rate": 4.736368570484585e-06, + "loss": 0.5861, + "step": 1957 + }, + { + "epoch": 0.9257683215130024, + "grad_norm": 2.826864004135132, + "learning_rate": 4.736089667022775e-06, + "loss": 0.6173, + "step": 1958 + }, + { + "epoch": 0.926241134751773, + "grad_norm": 2.414473056793213, + "learning_rate": 4.735810624329466e-06, + "loss": 0.5753, + "step": 1959 + }, + { + "epoch": 0.9267139479905437, + "grad_norm": 2.8037970066070557, + "learning_rate": 4.7355314424220335e-06, + "loss": 0.6207, + "step": 1960 + }, + { + "epoch": 0.9271867612293144, + "grad_norm": 2.645458698272705, + "learning_rate": 4.735252121317861e-06, + "loss": 0.5959, + "step": 1961 + }, + { + "epoch": 0.9276595744680851, + "grad_norm": 2.7983884811401367, + "learning_rate": 4.734972661034339e-06, + "loss": 0.5696, + "step": 1962 + }, + { + "epoch": 0.9281323877068558, + "grad_norm": 3.0568997859954834, + "learning_rate": 4.73469306158887e-06, + "loss": 0.6194, + "step": 1963 + }, + { + "epoch": 0.9286052009456265, + "grad_norm": 2.7205135822296143, + "learning_rate": 4.734413322998863e-06, + "loss": 0.5292, + "step": 1964 + }, + { + "epoch": 0.9290780141843972, + "grad_norm": 3.3168489933013916, + "learning_rate": 4.734133445281735e-06, + "loss": 0.5654, + "step": 1965 + }, + { + "epoch": 0.9295508274231679, + "grad_norm": 3.0095653533935547, + "learning_rate": 4.733853428454916e-06, + "loss": 0.6508, + "step": 1966 + }, + { + "epoch": 0.9300236406619385, + "grad_norm": 2.7726712226867676, + "learning_rate": 4.733573272535838e-06, + "loss": 0.644, + "step": 1967 + }, + { + "epoch": 0.9304964539007092, + "grad_norm": 2.474397659301758, + "learning_rate": 4.7332929775419456e-06, + "loss": 0.5479, + "step": 1968 + }, + { + "epoch": 0.9309692671394799, + "grad_norm": 2.4518635272979736, + "learning_rate": 4.733012543490693e-06, + "loss": 0.6, + "step": 1969 + }, + { + "epoch": 0.9314420803782506, + "grad_norm": 2.9292192459106445, + "learning_rate": 4.73273197039954e-06, + "loss": 0.6647, + "step": 1970 + }, + { + "epoch": 0.9319148936170213, + "grad_norm": 2.425004720687866, + "learning_rate": 4.732451258285958e-06, + "loss": 0.6338, + 
"step": 1971 + }, + { + "epoch": 0.932387706855792, + "grad_norm": 2.904479503631592, + "learning_rate": 4.7321704071674255e-06, + "loss": 0.5923, + "step": 1972 + }, + { + "epoch": 0.9328605200945627, + "grad_norm": 2.477085590362549, + "learning_rate": 4.731889417061428e-06, + "loss": 0.5984, + "step": 1973 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 2.585240364074707, + "learning_rate": 4.731608287985465e-06, + "loss": 0.558, + "step": 1974 + }, + { + "epoch": 0.933806146572104, + "grad_norm": 2.658714532852173, + "learning_rate": 4.731327019957039e-06, + "loss": 0.5567, + "step": 1975 + }, + { + "epoch": 0.9342789598108747, + "grad_norm": 2.7593026161193848, + "learning_rate": 4.731045612993662e-06, + "loss": 0.5772, + "step": 1976 + }, + { + "epoch": 0.9347517730496454, + "grad_norm": 2.4386026859283447, + "learning_rate": 4.7307640671128585e-06, + "loss": 0.6199, + "step": 1977 + }, + { + "epoch": 0.9352245862884161, + "grad_norm": 2.681910514831543, + "learning_rate": 4.730482382332158e-06, + "loss": 0.5971, + "step": 1978 + }, + { + "epoch": 0.9356973995271868, + "grad_norm": 3.7593860626220703, + "learning_rate": 4.7302005586691e-06, + "loss": 0.6346, + "step": 1979 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.5789096355438232, + "learning_rate": 4.729918596141232e-06, + "loss": 0.5684, + "step": 1980 + }, + { + "epoch": 0.9366430260047282, + "grad_norm": 3.0607335567474365, + "learning_rate": 4.729636494766111e-06, + "loss": 0.6223, + "step": 1981 + }, + { + "epoch": 0.9371158392434988, + "grad_norm": 2.906643867492676, + "learning_rate": 4.729354254561303e-06, + "loss": 0.6513, + "step": 1982 + }, + { + "epoch": 0.9375886524822695, + "grad_norm": 3.192430019378662, + "learning_rate": 4.7290718755443795e-06, + "loss": 0.5095, + "step": 1983 + }, + { + "epoch": 0.9380614657210402, + "grad_norm": 2.661536931991577, + "learning_rate": 4.7287893577329255e-06, + "loss": 0.5525, + "step": 1984 + }, + { + "epoch": 0.9385342789598109, + "grad_norm": 2.8436734676361084, + "learning_rate": 4.728506701144531e-06, + "loss": 0.6323, + "step": 1985 + }, + { + "epoch": 0.9390070921985816, + "grad_norm": 2.75544810295105, + "learning_rate": 4.728223905796796e-06, + "loss": 0.6018, + "step": 1986 + }, + { + "epoch": 0.9394799054373523, + "grad_norm": 3.0652759075164795, + "learning_rate": 4.727940971707329e-06, + "loss": 0.62, + "step": 1987 + }, + { + "epoch": 0.939952718676123, + "grad_norm": 2.802567720413208, + "learning_rate": 4.727657898893747e-06, + "loss": 0.5809, + "step": 1988 + }, + { + "epoch": 0.9404255319148936, + "grad_norm": 2.6208512783050537, + "learning_rate": 4.7273746873736745e-06, + "loss": 0.5762, + "step": 1989 + }, + { + "epoch": 0.9408983451536643, + "grad_norm": 2.5901873111724854, + "learning_rate": 4.727091337164748e-06, + "loss": 0.6111, + "step": 1990 + }, + { + "epoch": 0.941371158392435, + "grad_norm": 3.002347707748413, + "learning_rate": 4.726807848284609e-06, + "loss": 0.6419, + "step": 1991 + }, + { + "epoch": 0.9418439716312057, + "grad_norm": 2.522151470184326, + "learning_rate": 4.72652422075091e-06, + "loss": 0.642, + "step": 1992 + }, + { + "epoch": 0.9423167848699764, + "grad_norm": 2.5571532249450684, + "learning_rate": 4.726240454581311e-06, + "loss": 0.5729, + "step": 1993 + }, + { + "epoch": 0.9427895981087471, + "grad_norm": 2.7704918384552, + "learning_rate": 4.72595654979348e-06, + "loss": 0.6816, + "step": 1994 + }, + { + "epoch": 0.9432624113475178, + "grad_norm": 2.517040491104126, + "learning_rate": 
4.7256725064050955e-06, + "loss": 0.5782, + "step": 1995 + }, + { + "epoch": 0.9437352245862884, + "grad_norm": 2.613955020904541, + "learning_rate": 4.725388324433843e-06, + "loss": 0.6291, + "step": 1996 + }, + { + "epoch": 0.9442080378250591, + "grad_norm": 2.848891258239746, + "learning_rate": 4.725104003897418e-06, + "loss": 0.6544, + "step": 1997 + }, + { + "epoch": 0.9446808510638298, + "grad_norm": 3.0162429809570312, + "learning_rate": 4.724819544813523e-06, + "loss": 0.6301, + "step": 1998 + }, + { + "epoch": 0.9451536643026005, + "grad_norm": 2.613614559173584, + "learning_rate": 4.72453494719987e-06, + "loss": 0.5829, + "step": 1999 + }, + { + "epoch": 0.9456264775413712, + "grad_norm": 2.4838767051696777, + "learning_rate": 4.724250211074182e-06, + "loss": 0.6042, + "step": 2000 + }, + { + "epoch": 0.9460992907801419, + "grad_norm": 2.526470899581909, + "learning_rate": 4.723965336454185e-06, + "loss": 0.6167, + "step": 2001 + }, + { + "epoch": 0.9465721040189126, + "grad_norm": 2.504506826400757, + "learning_rate": 4.723680323357618e-06, + "loss": 0.6061, + "step": 2002 + }, + { + "epoch": 0.9470449172576832, + "grad_norm": 3.0547544956207275, + "learning_rate": 4.723395171802228e-06, + "loss": 0.6619, + "step": 2003 + }, + { + "epoch": 0.9475177304964539, + "grad_norm": 2.8692407608032227, + "learning_rate": 4.723109881805771e-06, + "loss": 0.5985, + "step": 2004 + }, + { + "epoch": 0.9479905437352246, + "grad_norm": 2.7929654121398926, + "learning_rate": 4.7228244533860094e-06, + "loss": 0.5869, + "step": 2005 + }, + { + "epoch": 0.9484633569739953, + "grad_norm": 2.764869451522827, + "learning_rate": 4.7225388865607146e-06, + "loss": 0.6288, + "step": 2006 + }, + { + "epoch": 0.948936170212766, + "grad_norm": 2.7656404972076416, + "learning_rate": 4.722253181347671e-06, + "loss": 0.5831, + "step": 2007 + }, + { + "epoch": 0.9494089834515367, + "grad_norm": 2.6698336601257324, + "learning_rate": 4.7219673377646635e-06, + "loss": 0.6087, + "step": 2008 + }, + { + "epoch": 0.9498817966903074, + "grad_norm": 2.524935722351074, + "learning_rate": 4.7216813558294946e-06, + "loss": 0.5675, + "step": 2009 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 2.5998785495758057, + "learning_rate": 4.721395235559969e-06, + "loss": 0.5667, + "step": 2010 + }, + { + "epoch": 0.9508274231678487, + "grad_norm": 2.758021354675293, + "learning_rate": 4.721108976973902e-06, + "loss": 0.4931, + "step": 2011 + }, + { + "epoch": 0.9513002364066194, + "grad_norm": 2.767695903778076, + "learning_rate": 4.72082258008912e-06, + "loss": 0.5778, + "step": 2012 + }, + { + "epoch": 0.9517730496453901, + "grad_norm": 2.982314348220825, + "learning_rate": 4.720536044923453e-06, + "loss": 0.6096, + "step": 2013 + }, + { + "epoch": 0.9522458628841608, + "grad_norm": 2.7608799934387207, + "learning_rate": 4.720249371494743e-06, + "loss": 0.6242, + "step": 2014 + }, + { + "epoch": 0.9527186761229315, + "grad_norm": 2.60054349899292, + "learning_rate": 4.71996255982084e-06, + "loss": 0.6249, + "step": 2015 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 2.654355764389038, + "learning_rate": 4.719675609919603e-06, + "loss": 0.6327, + "step": 2016 + }, + { + "epoch": 0.9536643026004729, + "grad_norm": 2.589404582977295, + "learning_rate": 4.719388521808899e-06, + "loss": 0.6357, + "step": 2017 + }, + { + "epoch": 0.9541371158392435, + "grad_norm": 2.8016581535339355, + "learning_rate": 4.719101295506603e-06, + "loss": 0.5901, + "step": 2018 + }, + { + "epoch": 0.9546099290780142, + "grad_norm": 
3.1408045291900635, + "learning_rate": 4.7188139310306e-06, + "loss": 0.598, + "step": 2019 + }, + { + "epoch": 0.9550827423167849, + "grad_norm": 2.7432665824890137, + "learning_rate": 4.718526428398783e-06, + "loss": 0.5508, + "step": 2020 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 2.947800874710083, + "learning_rate": 4.718238787629053e-06, + "loss": 0.6439, + "step": 2021 + }, + { + "epoch": 0.9560283687943263, + "grad_norm": 2.50828218460083, + "learning_rate": 4.71795100873932e-06, + "loss": 0.5441, + "step": 2022 + }, + { + "epoch": 0.956501182033097, + "grad_norm": 2.8558974266052246, + "learning_rate": 4.717663091747503e-06, + "loss": 0.5416, + "step": 2023 + }, + { + "epoch": 0.9569739952718677, + "grad_norm": 2.4803316593170166, + "learning_rate": 4.71737503667153e-06, + "loss": 0.5317, + "step": 2024 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 4.36754035949707, + "learning_rate": 4.717086843529336e-06, + "loss": 0.5808, + "step": 2025 + }, + { + "epoch": 0.957919621749409, + "grad_norm": 2.730185031890869, + "learning_rate": 4.7167985123388665e-06, + "loss": 0.5257, + "step": 2026 + }, + { + "epoch": 0.9583924349881797, + "grad_norm": 2.8136069774627686, + "learning_rate": 4.716510043118074e-06, + "loss": 0.5836, + "step": 2027 + }, + { + "epoch": 0.9588652482269504, + "grad_norm": 2.793975353240967, + "learning_rate": 4.71622143588492e-06, + "loss": 0.5706, + "step": 2028 + }, + { + "epoch": 0.9593380614657211, + "grad_norm": 2.3883821964263916, + "learning_rate": 4.7159326906573745e-06, + "loss": 0.5291, + "step": 2029 + }, + { + "epoch": 0.9598108747044918, + "grad_norm": 2.6135976314544678, + "learning_rate": 4.715643807453417e-06, + "loss": 0.6199, + "step": 2030 + }, + { + "epoch": 0.9602836879432625, + "grad_norm": 2.6245670318603516, + "learning_rate": 4.715354786291035e-06, + "loss": 0.5585, + "step": 2031 + }, + { + "epoch": 0.9607565011820332, + "grad_norm": 2.7870967388153076, + "learning_rate": 4.715065627188225e-06, + "loss": 0.6196, + "step": 2032 + }, + { + "epoch": 0.9612293144208038, + "grad_norm": 2.6983911991119385, + "learning_rate": 4.714776330162991e-06, + "loss": 0.6424, + "step": 2033 + }, + { + "epoch": 0.9617021276595744, + "grad_norm": 2.3221919536590576, + "learning_rate": 4.7144868952333465e-06, + "loss": 0.568, + "step": 2034 + }, + { + "epoch": 0.9621749408983451, + "grad_norm": 2.9408178329467773, + "learning_rate": 4.714197322417314e-06, + "loss": 0.6175, + "step": 2035 + }, + { + "epoch": 0.9626477541371158, + "grad_norm": 2.404057264328003, + "learning_rate": 4.713907611732921e-06, + "loss": 0.4943, + "step": 2036 + }, + { + "epoch": 0.9631205673758865, + "grad_norm": 3.547607660293579, + "learning_rate": 4.71361776319821e-06, + "loss": 0.5488, + "step": 2037 + }, + { + "epoch": 0.9635933806146572, + "grad_norm": 2.679614543914795, + "learning_rate": 4.713327776831227e-06, + "loss": 0.6234, + "step": 2038 + }, + { + "epoch": 0.9640661938534278, + "grad_norm": 2.526914119720459, + "learning_rate": 4.7130376526500286e-06, + "loss": 0.5891, + "step": 2039 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 2.6953470706939697, + "learning_rate": 4.71274739067268e-06, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 0.9650118203309692, + "grad_norm": 2.546660900115967, + "learning_rate": 4.712456990917254e-06, + "loss": 0.6185, + "step": 2041 + }, + { + "epoch": 0.9654846335697399, + "grad_norm": 3.3920490741729736, + "learning_rate": 4.712166453401832e-06, + "loss": 0.587, + "step": 2042 + }, + { + "epoch": 
0.9659574468085106, + "grad_norm": 2.5961573123931885, + "learning_rate": 4.711875778144504e-06, + "loss": 0.6105, + "step": 2043 + }, + { + "epoch": 0.9664302600472813, + "grad_norm": 2.5111498832702637, + "learning_rate": 4.711584965163372e-06, + "loss": 0.5533, + "step": 2044 + }, + { + "epoch": 0.966903073286052, + "grad_norm": 2.4878132343292236, + "learning_rate": 4.7112940144765405e-06, + "loss": 0.5604, + "step": 2045 + }, + { + "epoch": 0.9673758865248226, + "grad_norm": 2.5714077949523926, + "learning_rate": 4.711002926102128e-06, + "loss": 0.5794, + "step": 2046 + }, + { + "epoch": 0.9678486997635933, + "grad_norm": 2.7069091796875, + "learning_rate": 4.710711700058257e-06, + "loss": 0.594, + "step": 2047 + }, + { + "epoch": 0.968321513002364, + "grad_norm": 2.8104631900787354, + "learning_rate": 4.710420336363063e-06, + "loss": 0.6247, + "step": 2048 + }, + { + "epoch": 0.9687943262411347, + "grad_norm": 2.8464386463165283, + "learning_rate": 4.7101288350346865e-06, + "loss": 0.6162, + "step": 2049 + }, + { + "epoch": 0.9692671394799054, + "grad_norm": 2.7187976837158203, + "learning_rate": 4.709837196091279e-06, + "loss": 0.6109, + "step": 2050 + }, + { + "epoch": 0.9697399527186761, + "grad_norm": 2.556734085083008, + "learning_rate": 4.709545419550999e-06, + "loss": 0.6297, + "step": 2051 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 2.937195062637329, + "learning_rate": 4.709253505432014e-06, + "loss": 0.6862, + "step": 2052 + }, + { + "epoch": 0.9706855791962175, + "grad_norm": 2.792175531387329, + "learning_rate": 4.7089614537525015e-06, + "loss": 0.6105, + "step": 2053 + }, + { + "epoch": 0.9711583924349881, + "grad_norm": 2.625636100769043, + "learning_rate": 4.708669264530644e-06, + "loss": 0.5849, + "step": 2054 + }, + { + "epoch": 0.9716312056737588, + "grad_norm": 2.6752610206604004, + "learning_rate": 4.708376937784637e-06, + "loss": 0.5949, + "step": 2055 + }, + { + "epoch": 0.9721040189125295, + "grad_norm": 2.6072793006896973, + "learning_rate": 4.708084473532681e-06, + "loss": 0.5776, + "step": 2056 + }, + { + "epoch": 0.9725768321513002, + "grad_norm": 2.728632926940918, + "learning_rate": 4.707791871792988e-06, + "loss": 0.6352, + "step": 2057 + }, + { + "epoch": 0.9730496453900709, + "grad_norm": 2.5841758251190186, + "learning_rate": 4.707499132583775e-06, + "loss": 0.5488, + "step": 2058 + }, + { + "epoch": 0.9735224586288416, + "grad_norm": 2.8464293479919434, + "learning_rate": 4.707206255923271e-06, + "loss": 0.7051, + "step": 2059 + }, + { + "epoch": 0.9739952718676123, + "grad_norm": 2.547297239303589, + "learning_rate": 4.706913241829712e-06, + "loss": 0.5937, + "step": 2060 + }, + { + "epoch": 0.9744680851063829, + "grad_norm": 2.6572306156158447, + "learning_rate": 4.706620090321341e-06, + "loss": 0.6041, + "step": 2061 + }, + { + "epoch": 0.9749408983451536, + "grad_norm": 2.3262805938720703, + "learning_rate": 4.706326801416414e-06, + "loss": 0.5144, + "step": 2062 + }, + { + "epoch": 0.9754137115839243, + "grad_norm": 2.9693965911865234, + "learning_rate": 4.706033375133191e-06, + "loss": 0.551, + "step": 2063 + }, + { + "epoch": 0.975886524822695, + "grad_norm": 2.5993731021881104, + "learning_rate": 4.7057398114899435e-06, + "loss": 0.6143, + "step": 2064 + }, + { + "epoch": 0.9763593380614657, + "grad_norm": 2.453336477279663, + "learning_rate": 4.70544611050495e-06, + "loss": 0.6093, + "step": 2065 + }, + { + "epoch": 0.9768321513002364, + "grad_norm": 2.898629665374756, + "learning_rate": 4.705152272196497e-06, + "loss": 0.6007, + 
"step": 2066 + }, + { + "epoch": 0.9773049645390071, + "grad_norm": 2.7990612983703613, + "learning_rate": 4.7048582965828815e-06, + "loss": 0.6687, + "step": 2067 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 2.635284423828125, + "learning_rate": 4.704564183682408e-06, + "loss": 0.5564, + "step": 2068 + }, + { + "epoch": 0.9782505910165484, + "grad_norm": 3.014547109603882, + "learning_rate": 4.704269933513389e-06, + "loss": 0.6084, + "step": 2069 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 2.659357786178589, + "learning_rate": 4.703975546094147e-06, + "loss": 0.6031, + "step": 2070 + }, + { + "epoch": 0.9791962174940898, + "grad_norm": 2.326932668685913, + "learning_rate": 4.703681021443013e-06, + "loss": 0.5859, + "step": 2071 + }, + { + "epoch": 0.9796690307328605, + "grad_norm": 2.958803653717041, + "learning_rate": 4.7033863595783235e-06, + "loss": 0.5586, + "step": 2072 + }, + { + "epoch": 0.9801418439716312, + "grad_norm": 2.921386957168579, + "learning_rate": 4.703091560518427e-06, + "loss": 0.6126, + "step": 2073 + }, + { + "epoch": 0.9806146572104019, + "grad_norm": 2.6500775814056396, + "learning_rate": 4.702796624281679e-06, + "loss": 0.5678, + "step": 2074 + }, + { + "epoch": 0.9810874704491725, + "grad_norm": 2.7740228176116943, + "learning_rate": 4.702501550886445e-06, + "loss": 0.6067, + "step": 2075 + }, + { + "epoch": 0.9815602836879432, + "grad_norm": 2.3296213150024414, + "learning_rate": 4.702206340351096e-06, + "loss": 0.5247, + "step": 2076 + }, + { + "epoch": 0.9820330969267139, + "grad_norm": 2.748300790786743, + "learning_rate": 4.701910992694016e-06, + "loss": 0.5197, + "step": 2077 + }, + { + "epoch": 0.9825059101654846, + "grad_norm": 2.250985622406006, + "learning_rate": 4.7016155079335926e-06, + "loss": 0.5214, + "step": 2078 + }, + { + "epoch": 0.9829787234042553, + "grad_norm": 2.389845848083496, + "learning_rate": 4.701319886088226e-06, + "loss": 0.519, + "step": 2079 + }, + { + "epoch": 0.983451536643026, + "grad_norm": 2.818220853805542, + "learning_rate": 4.701024127176322e-06, + "loss": 0.607, + "step": 2080 + }, + { + "epoch": 0.9839243498817967, + "grad_norm": 3.4058034420013428, + "learning_rate": 4.700728231216297e-06, + "loss": 0.5711, + "step": 2081 + }, + { + "epoch": 0.9843971631205674, + "grad_norm": 2.5297787189483643, + "learning_rate": 4.700432198226575e-06, + "loss": 0.5979, + "step": 2082 + }, + { + "epoch": 0.984869976359338, + "grad_norm": 3.0548105239868164, + "learning_rate": 4.7001360282255885e-06, + "loss": 0.6041, + "step": 2083 + }, + { + "epoch": 0.9853427895981087, + "grad_norm": 2.8983733654022217, + "learning_rate": 4.699839721231779e-06, + "loss": 0.5926, + "step": 2084 + }, + { + "epoch": 0.9858156028368794, + "grad_norm": 3.2717764377593994, + "learning_rate": 4.699543277263596e-06, + "loss": 0.6477, + "step": 2085 + }, + { + "epoch": 0.9862884160756501, + "grad_norm": 3.03729248046875, + "learning_rate": 4.699246696339497e-06, + "loss": 0.6786, + "step": 2086 + }, + { + "epoch": 0.9867612293144208, + "grad_norm": 2.852301597595215, + "learning_rate": 4.698949978477951e-06, + "loss": 0.6565, + "step": 2087 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 2.843485116958618, + "learning_rate": 4.698653123697431e-06, + "loss": 0.6627, + "step": 2088 + }, + { + "epoch": 0.9877068557919622, + "grad_norm": 2.6315064430236816, + "learning_rate": 4.698356132016423e-06, + "loss": 0.6577, + "step": 2089 + }, + { + "epoch": 0.9881796690307328, + "grad_norm": 2.7482151985168457, + "learning_rate": 
4.698059003453417e-06, + "loss": 0.5514, + "step": 2090 + }, + { + "epoch": 0.9886524822695035, + "grad_norm": 2.826673746109009, + "learning_rate": 4.6977617380269145e-06, + "loss": 0.565, + "step": 2091 + }, + { + "epoch": 0.9891252955082742, + "grad_norm": 3.0273752212524414, + "learning_rate": 4.697464335755427e-06, + "loss": 0.6331, + "step": 2092 + }, + { + "epoch": 0.9895981087470449, + "grad_norm": 2.7551653385162354, + "learning_rate": 4.6971667966574695e-06, + "loss": 0.6486, + "step": 2093 + }, + { + "epoch": 0.9900709219858156, + "grad_norm": 2.656299114227295, + "learning_rate": 4.696869120751571e-06, + "loss": 0.6562, + "step": 2094 + }, + { + "epoch": 0.9905437352245863, + "grad_norm": 2.785322904586792, + "learning_rate": 4.696571308056265e-06, + "loss": 0.5892, + "step": 2095 + }, + { + "epoch": 0.991016548463357, + "grad_norm": 2.9334635734558105, + "learning_rate": 4.696273358590095e-06, + "loss": 0.6346, + "step": 2096 + }, + { + "epoch": 0.9914893617021276, + "grad_norm": 2.7944300174713135, + "learning_rate": 4.695975272371613e-06, + "loss": 0.5859, + "step": 2097 + }, + { + "epoch": 0.9919621749408983, + "grad_norm": 2.5416972637176514, + "learning_rate": 4.695677049419381e-06, + "loss": 0.5658, + "step": 2098 + }, + { + "epoch": 0.992434988179669, + "grad_norm": 2.4056856632232666, + "learning_rate": 4.695378689751966e-06, + "loss": 0.5121, + "step": 2099 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 2.614548683166504, + "learning_rate": 4.695080193387948e-06, + "loss": 0.5961, + "step": 2100 + }, + { + "epoch": 0.9933806146572104, + "grad_norm": 2.8966517448425293, + "learning_rate": 4.69478156034591e-06, + "loss": 0.5985, + "step": 2101 + }, + { + "epoch": 0.9938534278959811, + "grad_norm": 2.9514098167419434, + "learning_rate": 4.694482790644448e-06, + "loss": 0.5677, + "step": 2102 + }, + { + "epoch": 0.9943262411347518, + "grad_norm": 2.4326791763305664, + "learning_rate": 4.694183884302165e-06, + "loss": 0.5698, + "step": 2103 + }, + { + "epoch": 0.9947990543735225, + "grad_norm": 2.9242892265319824, + "learning_rate": 4.6938848413376735e-06, + "loss": 0.6245, + "step": 2104 + }, + { + "epoch": 0.9952718676122931, + "grad_norm": 2.9134104251861572, + "learning_rate": 4.693585661769593e-06, + "loss": 0.6164, + "step": 2105 + }, + { + "epoch": 0.9957446808510638, + "grad_norm": 2.472564458847046, + "learning_rate": 4.693286345616551e-06, + "loss": 0.5616, + "step": 2106 + }, + { + "epoch": 0.9962174940898345, + "grad_norm": 3.2456448078155518, + "learning_rate": 4.692986892897186e-06, + "loss": 0.6977, + "step": 2107 + }, + { + "epoch": 0.9966903073286052, + "grad_norm": 3.4032769203186035, + "learning_rate": 4.692687303630143e-06, + "loss": 0.643, + "step": 2108 + }, + { + "epoch": 0.9971631205673759, + "grad_norm": 2.722200870513916, + "learning_rate": 4.692387577834076e-06, + "loss": 0.5873, + "step": 2109 + }, + { + "epoch": 0.9976359338061466, + "grad_norm": 2.687532663345337, + "learning_rate": 4.692087715527648e-06, + "loss": 0.5423, + "step": 2110 + }, + { + "epoch": 0.9981087470449173, + "grad_norm": 2.578613042831421, + "learning_rate": 4.6917877167295305e-06, + "loss": 0.5689, + "step": 2111 + }, + { + "epoch": 0.9985815602836879, + "grad_norm": 3.1806094646453857, + "learning_rate": 4.691487581458402e-06, + "loss": 0.6133, + "step": 2112 + }, + { + "epoch": 0.9990543735224586, + "grad_norm": 2.4449520111083984, + "learning_rate": 4.691187309732952e-06, + "loss": 0.5841, + "step": 2113 + }, + { + "epoch": 0.9995271867612293, + "grad_norm": 
2.908749580383301, + "learning_rate": 4.690886901571875e-06, + "loss": 0.534, + "step": 2114 + }, + { + "epoch": 1.0, + "grad_norm": 4.019968032836914, + "learning_rate": 4.6905863569938785e-06, + "loss": 0.596, + "step": 2115 + }, + { + "epoch": 1.0004728132387706, + "grad_norm": 2.4319307804107666, + "learning_rate": 4.690285676017675e-06, + "loss": 0.4973, + "step": 2116 + }, + { + "epoch": 1.0009456264775414, + "grad_norm": 2.6366477012634277, + "learning_rate": 4.689984858661986e-06, + "loss": 0.5682, + "step": 2117 + }, + { + "epoch": 1.001418439716312, + "grad_norm": 2.815114974975586, + "learning_rate": 4.689683904945542e-06, + "loss": 0.5616, + "step": 2118 + }, + { + "epoch": 1.0018912529550827, + "grad_norm": 2.6680490970611572, + "learning_rate": 4.689382814887084e-06, + "loss": 0.5161, + "step": 2119 + }, + { + "epoch": 1.0023640661938533, + "grad_norm": 2.7406351566314697, + "learning_rate": 4.689081588505358e-06, + "loss": 0.4937, + "step": 2120 + }, + { + "epoch": 1.0028368794326241, + "grad_norm": 2.2832298278808594, + "learning_rate": 4.68878022581912e-06, + "loss": 0.4986, + "step": 2121 + }, + { + "epoch": 1.0033096926713947, + "grad_norm": 2.5525307655334473, + "learning_rate": 4.688478726847136e-06, + "loss": 0.4909, + "step": 2122 + }, + { + "epoch": 1.0037825059101655, + "grad_norm": 2.9843199253082275, + "learning_rate": 4.688177091608176e-06, + "loss": 0.6046, + "step": 2123 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 2.5231106281280518, + "learning_rate": 4.687875320121024e-06, + "loss": 0.5423, + "step": 2124 + }, + { + "epoch": 1.0047281323877069, + "grad_norm": 2.567599058151245, + "learning_rate": 4.68757341240447e-06, + "loss": 0.5092, + "step": 2125 + }, + { + "epoch": 1.0052009456264774, + "grad_norm": 2.768111228942871, + "learning_rate": 4.687271368477311e-06, + "loss": 0.5175, + "step": 2126 + }, + { + "epoch": 1.0056737588652482, + "grad_norm": 2.7223286628723145, + "learning_rate": 4.686969188358355e-06, + "loss": 0.5412, + "step": 2127 + }, + { + "epoch": 1.0061465721040188, + "grad_norm": 2.488299608230591, + "learning_rate": 4.686666872066418e-06, + "loss": 0.5288, + "step": 2128 + }, + { + "epoch": 1.0066193853427896, + "grad_norm": 2.882981777191162, + "learning_rate": 4.6863644196203215e-06, + "loss": 0.6117, + "step": 2129 + }, + { + "epoch": 1.0070921985815602, + "grad_norm": 3.0019447803497314, + "learning_rate": 4.686061831038901e-06, + "loss": 0.5308, + "step": 2130 + }, + { + "epoch": 1.007565011820331, + "grad_norm": 3.0056138038635254, + "learning_rate": 4.685759106340996e-06, + "loss": 0.5833, + "step": 2131 + }, + { + "epoch": 1.0080378250591016, + "grad_norm": 2.5709075927734375, + "learning_rate": 4.685456245545454e-06, + "loss": 0.5071, + "step": 2132 + }, + { + "epoch": 1.0085106382978724, + "grad_norm": 2.4641504287719727, + "learning_rate": 4.685153248671136e-06, + "loss": 0.4813, + "step": 2133 + }, + { + "epoch": 1.008983451536643, + "grad_norm": 2.374413013458252, + "learning_rate": 4.684850115736906e-06, + "loss": 0.5179, + "step": 2134 + }, + { + "epoch": 1.0094562647754137, + "grad_norm": 2.6504571437835693, + "learning_rate": 4.684546846761641e-06, + "loss": 0.437, + "step": 2135 + }, + { + "epoch": 1.0099290780141843, + "grad_norm": 2.5977871417999268, + "learning_rate": 4.684243441764221e-06, + "loss": 0.497, + "step": 2136 + }, + { + "epoch": 1.010401891252955, + "grad_norm": 2.4950785636901855, + "learning_rate": 4.683939900763541e-06, + "loss": 0.5624, + "step": 2137 + }, + { + "epoch": 1.0108747044917257, + 
"grad_norm": 3.065718412399292, + "learning_rate": 4.6836362237785e-06, + "loss": 0.512, + "step": 2138 + }, + { + "epoch": 1.0113475177304965, + "grad_norm": 2.7419207096099854, + "learning_rate": 4.6833324108280045e-06, + "loss": 0.5585, + "step": 2139 + }, + { + "epoch": 1.011820330969267, + "grad_norm": 2.623610496520996, + "learning_rate": 4.6830284619309744e-06, + "loss": 0.5163, + "step": 2140 + }, + { + "epoch": 1.0122931442080378, + "grad_norm": 2.774322986602783, + "learning_rate": 4.682724377106334e-06, + "loss": 0.527, + "step": 2141 + }, + { + "epoch": 1.0127659574468084, + "grad_norm": 2.959935188293457, + "learning_rate": 4.682420156373017e-06, + "loss": 0.6166, + "step": 2142 + }, + { + "epoch": 1.0132387706855792, + "grad_norm": 2.584026336669922, + "learning_rate": 4.682115799749968e-06, + "loss": 0.5086, + "step": 2143 + }, + { + "epoch": 1.0137115839243498, + "grad_norm": 2.6039700508117676, + "learning_rate": 4.6818113072561346e-06, + "loss": 0.49, + "step": 2144 + }, + { + "epoch": 1.0141843971631206, + "grad_norm": 2.466381072998047, + "learning_rate": 4.681506678910479e-06, + "loss": 0.4959, + "step": 2145 + }, + { + "epoch": 1.0146572104018912, + "grad_norm": 2.432636260986328, + "learning_rate": 4.681201914731969e-06, + "loss": 0.5057, + "step": 2146 + }, + { + "epoch": 1.015130023640662, + "grad_norm": 2.6134090423583984, + "learning_rate": 4.680897014739579e-06, + "loss": 0.4874, + "step": 2147 + }, + { + "epoch": 1.0156028368794325, + "grad_norm": 2.774481773376465, + "learning_rate": 4.680591978952295e-06, + "loss": 0.4967, + "step": 2148 + }, + { + "epoch": 1.0160756501182033, + "grad_norm": 2.66050124168396, + "learning_rate": 4.68028680738911e-06, + "loss": 0.4932, + "step": 2149 + }, + { + "epoch": 1.016548463356974, + "grad_norm": 3.020594835281372, + "learning_rate": 4.679981500069026e-06, + "loss": 0.5788, + "step": 2150 + }, + { + "epoch": 1.0170212765957447, + "grad_norm": 2.697758436203003, + "learning_rate": 4.679676057011053e-06, + "loss": 0.5441, + "step": 2151 + }, + { + "epoch": 1.0174940898345153, + "grad_norm": 6.986445903778076, + "learning_rate": 4.679370478234209e-06, + "loss": 0.6483, + "step": 2152 + }, + { + "epoch": 1.017966903073286, + "grad_norm": 2.6637115478515625, + "learning_rate": 4.679064763757522e-06, + "loss": 0.5859, + "step": 2153 + }, + { + "epoch": 1.0184397163120567, + "grad_norm": 2.7501862049102783, + "learning_rate": 4.678758913600027e-06, + "loss": 0.5745, + "step": 2154 + }, + { + "epoch": 1.0189125295508275, + "grad_norm": 2.7959372997283936, + "learning_rate": 4.678452927780768e-06, + "loss": 0.5076, + "step": 2155 + }, + { + "epoch": 1.019385342789598, + "grad_norm": 2.4377388954162598, + "learning_rate": 4.678146806318798e-06, + "loss": 0.5061, + "step": 2156 + }, + { + "epoch": 1.0198581560283688, + "grad_norm": 2.5478947162628174, + "learning_rate": 4.677840549233176e-06, + "loss": 0.4941, + "step": 2157 + }, + { + "epoch": 1.0203309692671394, + "grad_norm": 3.0956528186798096, + "learning_rate": 4.677534156542973e-06, + "loss": 0.5879, + "step": 2158 + }, + { + "epoch": 1.0208037825059102, + "grad_norm": 2.5247607231140137, + "learning_rate": 4.6772276282672666e-06, + "loss": 0.5532, + "step": 2159 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 3.1972787380218506, + "learning_rate": 4.676920964425143e-06, + "loss": 0.6081, + "step": 2160 + }, + { + "epoch": 1.0217494089834516, + "grad_norm": 2.6173388957977295, + "learning_rate": 4.6766141650356955e-06, + "loss": 0.5001, + "step": 2161 + }, + { + 
"epoch": 1.0222222222222221, + "grad_norm": 2.9914398193359375, + "learning_rate": 4.676307230118029e-06, + "loss": 0.5566, + "step": 2162 + }, + { + "epoch": 1.022695035460993, + "grad_norm": 2.8011834621429443, + "learning_rate": 4.676000159691254e-06, + "loss": 0.4909, + "step": 2163 + }, + { + "epoch": 1.0231678486997635, + "grad_norm": 2.6049559116363525, + "learning_rate": 4.67569295377449e-06, + "loss": 0.5018, + "step": 2164 + }, + { + "epoch": 1.0236406619385343, + "grad_norm": 2.8175013065338135, + "learning_rate": 4.675385612386866e-06, + "loss": 0.5309, + "step": 2165 + }, + { + "epoch": 1.0241134751773049, + "grad_norm": 2.854696750640869, + "learning_rate": 4.675078135547519e-06, + "loss": 0.5627, + "step": 2166 + }, + { + "epoch": 1.0245862884160757, + "grad_norm": 3.1856436729431152, + "learning_rate": 4.674770523275594e-06, + "loss": 0.5475, + "step": 2167 + }, + { + "epoch": 1.0250591016548463, + "grad_norm": 2.8289129734039307, + "learning_rate": 4.674462775590244e-06, + "loss": 0.5878, + "step": 2168 + }, + { + "epoch": 1.025531914893617, + "grad_norm": 2.8824517726898193, + "learning_rate": 4.6741548925106325e-06, + "loss": 0.4392, + "step": 2169 + }, + { + "epoch": 1.0260047281323876, + "grad_norm": 2.7044589519500732, + "learning_rate": 4.673846874055928e-06, + "loss": 0.5264, + "step": 2170 + }, + { + "epoch": 1.0264775413711584, + "grad_norm": 2.575035810470581, + "learning_rate": 4.673538720245312e-06, + "loss": 0.4615, + "step": 2171 + }, + { + "epoch": 1.026950354609929, + "grad_norm": 2.48168683052063, + "learning_rate": 4.67323043109797e-06, + "loss": 0.4404, + "step": 2172 + }, + { + "epoch": 1.0274231678486998, + "grad_norm": 2.926593065261841, + "learning_rate": 4.672922006633098e-06, + "loss": 0.54, + "step": 2173 + }, + { + "epoch": 1.0278959810874704, + "grad_norm": 2.4610698223114014, + "learning_rate": 4.672613446869901e-06, + "loss": 0.5555, + "step": 2174 + }, + { + "epoch": 1.0283687943262412, + "grad_norm": 3.026901960372925, + "learning_rate": 4.672304751827592e-06, + "loss": 0.62, + "step": 2175 + }, + { + "epoch": 1.0288416075650118, + "grad_norm": 2.3946213722229004, + "learning_rate": 4.671995921525391e-06, + "loss": 0.5228, + "step": 2176 + }, + { + "epoch": 1.0293144208037825, + "grad_norm": 2.985020399093628, + "learning_rate": 4.671686955982528e-06, + "loss": 0.6256, + "step": 2177 + }, + { + "epoch": 1.0297872340425531, + "grad_norm": 3.0910139083862305, + "learning_rate": 4.671377855218239e-06, + "loss": 0.5893, + "step": 2178 + }, + { + "epoch": 1.030260047281324, + "grad_norm": 2.507805109024048, + "learning_rate": 4.6710686192517744e-06, + "loss": 0.5329, + "step": 2179 + }, + { + "epoch": 1.0307328605200945, + "grad_norm": 2.4514641761779785, + "learning_rate": 4.670759248102386e-06, + "loss": 0.4585, + "step": 2180 + }, + { + "epoch": 1.0312056737588653, + "grad_norm": 2.742838144302368, + "learning_rate": 4.670449741789337e-06, + "loss": 0.6255, + "step": 2181 + }, + { + "epoch": 1.0316784869976359, + "grad_norm": 2.374349594116211, + "learning_rate": 4.670140100331901e-06, + "loss": 0.5049, + "step": 2182 + }, + { + "epoch": 1.0321513002364067, + "grad_norm": 2.78894305229187, + "learning_rate": 4.669830323749356e-06, + "loss": 0.6061, + "step": 2183 + }, + { + "epoch": 1.0326241134751772, + "grad_norm": 2.7195091247558594, + "learning_rate": 4.6695204120609905e-06, + "loss": 0.592, + "step": 2184 + }, + { + "epoch": 1.033096926713948, + "grad_norm": 2.824411630630493, + "learning_rate": 4.6692103652861035e-06, + "loss": 0.5666, + 
"step": 2185 + }, + { + "epoch": 1.0335697399527186, + "grad_norm": 2.4981014728546143, + "learning_rate": 4.6689001834439975e-06, + "loss": 0.5045, + "step": 2186 + }, + { + "epoch": 1.0340425531914894, + "grad_norm": 2.7375214099884033, + "learning_rate": 4.668589866553988e-06, + "loss": 0.5305, + "step": 2187 + }, + { + "epoch": 1.03451536643026, + "grad_norm": 2.625345468521118, + "learning_rate": 4.668279414635396e-06, + "loss": 0.4819, + "step": 2188 + }, + { + "epoch": 1.0349881796690308, + "grad_norm": 2.60479736328125, + "learning_rate": 4.667968827707553e-06, + "loss": 0.55, + "step": 2189 + }, + { + "epoch": 1.0354609929078014, + "grad_norm": 2.642014741897583, + "learning_rate": 4.667658105789797e-06, + "loss": 0.5264, + "step": 2190 + }, + { + "epoch": 1.0359338061465722, + "grad_norm": 2.5439083576202393, + "learning_rate": 4.667347248901476e-06, + "loss": 0.4657, + "step": 2191 + }, + { + "epoch": 1.0364066193853427, + "grad_norm": 2.5537586212158203, + "learning_rate": 4.667036257061945e-06, + "loss": 0.527, + "step": 2192 + }, + { + "epoch": 1.0368794326241135, + "grad_norm": 2.595466375350952, + "learning_rate": 4.666725130290569e-06, + "loss": 0.5336, + "step": 2193 + }, + { + "epoch": 1.037352245862884, + "grad_norm": 3.5106313228607178, + "learning_rate": 4.666413868606719e-06, + "loss": 0.5176, + "step": 2194 + }, + { + "epoch": 1.037825059101655, + "grad_norm": 2.931553363800049, + "learning_rate": 4.666102472029778e-06, + "loss": 0.549, + "step": 2195 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 2.4325125217437744, + "learning_rate": 4.665790940579133e-06, + "loss": 0.5095, + "step": 2196 + }, + { + "epoch": 1.0387706855791963, + "grad_norm": 2.708477258682251, + "learning_rate": 4.665479274274184e-06, + "loss": 0.5264, + "step": 2197 + }, + { + "epoch": 1.0392434988179668, + "grad_norm": 2.905977487564087, + "learning_rate": 4.665167473134335e-06, + "loss": 0.5575, + "step": 2198 + }, + { + "epoch": 1.0397163120567376, + "grad_norm": 2.428938865661621, + "learning_rate": 4.664855537179003e-06, + "loss": 0.5099, + "step": 2199 + }, + { + "epoch": 1.0401891252955082, + "grad_norm": 2.8432137966156006, + "learning_rate": 4.6645434664276075e-06, + "loss": 0.5331, + "step": 2200 + }, + { + "epoch": 1.040661938534279, + "grad_norm": 2.5185136795043945, + "learning_rate": 4.6642312608995825e-06, + "loss": 0.5217, + "step": 2201 + }, + { + "epoch": 1.0411347517730496, + "grad_norm": 2.556607723236084, + "learning_rate": 4.663918920614366e-06, + "loss": 0.4431, + "step": 2202 + }, + { + "epoch": 1.0416075650118204, + "grad_norm": 3.1271166801452637, + "learning_rate": 4.663606445591407e-06, + "loss": 0.5398, + "step": 2203 + }, + { + "epoch": 1.042080378250591, + "grad_norm": 2.573680877685547, + "learning_rate": 4.663293835850162e-06, + "loss": 0.4713, + "step": 2204 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 2.5230324268341064, + "learning_rate": 4.662981091410096e-06, + "loss": 0.5571, + "step": 2205 + }, + { + "epoch": 1.0430260047281323, + "grad_norm": 2.552182912826538, + "learning_rate": 4.662668212290681e-06, + "loss": 0.5173, + "step": 2206 + }, + { + "epoch": 1.0434988179669031, + "grad_norm": 2.832345724105835, + "learning_rate": 4.6623551985113995e-06, + "loss": 0.525, + "step": 2207 + }, + { + "epoch": 1.0439716312056737, + "grad_norm": 2.9729080200195312, + "learning_rate": 4.6620420500917416e-06, + "loss": 0.6308, + "step": 2208 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 2.618187665939331, + "learning_rate": 
4.661728767051206e-06, + "loss": 0.4942, + "step": 2209 + }, + { + "epoch": 1.044917257683215, + "grad_norm": 2.515566349029541, + "learning_rate": 4.661415349409299e-06, + "loss": 0.5229, + "step": 2210 + }, + { + "epoch": 1.0453900709219859, + "grad_norm": 2.8651459217071533, + "learning_rate": 4.6611017971855356e-06, + "loss": 0.5029, + "step": 2211 + }, + { + "epoch": 1.0458628841607565, + "grad_norm": 2.502405881881714, + "learning_rate": 4.660788110399439e-06, + "loss": 0.4732, + "step": 2212 + }, + { + "epoch": 1.0463356973995273, + "grad_norm": 2.540668249130249, + "learning_rate": 4.660474289070541e-06, + "loss": 0.547, + "step": 2213 + }, + { + "epoch": 1.0468085106382978, + "grad_norm": 2.803469181060791, + "learning_rate": 4.660160333218384e-06, + "loss": 0.5441, + "step": 2214 + }, + { + "epoch": 1.0472813238770686, + "grad_norm": 3.233325481414795, + "learning_rate": 4.659846242862514e-06, + "loss": 0.4457, + "step": 2215 + }, + { + "epoch": 1.0477541371158392, + "grad_norm": 2.549548387527466, + "learning_rate": 4.659532018022489e-06, + "loss": 0.5684, + "step": 2216 + }, + { + "epoch": 1.04822695035461, + "grad_norm": 2.6112852096557617, + "learning_rate": 4.659217658717875e-06, + "loss": 0.5323, + "step": 2217 + }, + { + "epoch": 1.0486997635933806, + "grad_norm": 2.347418785095215, + "learning_rate": 4.658903164968245e-06, + "loss": 0.5349, + "step": 2218 + }, + { + "epoch": 1.0491725768321514, + "grad_norm": 2.695502281188965, + "learning_rate": 4.658588536793182e-06, + "loss": 0.4883, + "step": 2219 + }, + { + "epoch": 1.049645390070922, + "grad_norm": 2.7575674057006836, + "learning_rate": 4.658273774212275e-06, + "loss": 0.5517, + "step": 2220 + }, + { + "epoch": 1.0501182033096927, + "grad_norm": 2.787855386734009, + "learning_rate": 4.6579588772451245e-06, + "loss": 0.5744, + "step": 2221 + }, + { + "epoch": 1.0505910165484633, + "grad_norm": 3.0699398517608643, + "learning_rate": 4.657643845911337e-06, + "loss": 0.5258, + "step": 2222 + }, + { + "epoch": 1.0510638297872341, + "grad_norm": 2.652040719985962, + "learning_rate": 4.657328680230527e-06, + "loss": 0.5141, + "step": 2223 + }, + { + "epoch": 1.0515366430260047, + "grad_norm": 2.6896369457244873, + "learning_rate": 4.657013380222322e-06, + "loss": 0.5139, + "step": 2224 + }, + { + "epoch": 1.0520094562647755, + "grad_norm": 2.551839590072632, + "learning_rate": 4.65669794590635e-06, + "loss": 0.5099, + "step": 2225 + }, + { + "epoch": 1.052482269503546, + "grad_norm": 2.8543262481689453, + "learning_rate": 4.656382377302255e-06, + "loss": 0.6085, + "step": 2226 + }, + { + "epoch": 1.0529550827423169, + "grad_norm": 2.871469259262085, + "learning_rate": 4.656066674429685e-06, + "loss": 0.6108, + "step": 2227 + }, + { + "epoch": 1.0534278959810874, + "grad_norm": 2.4840824604034424, + "learning_rate": 4.655750837308296e-06, + "loss": 0.4994, + "step": 2228 + }, + { + "epoch": 1.0539007092198582, + "grad_norm": 2.5203280448913574, + "learning_rate": 4.6554348659577555e-06, + "loss": 0.4928, + "step": 2229 + }, + { + "epoch": 1.0543735224586288, + "grad_norm": 2.9327683448791504, + "learning_rate": 4.655118760397737e-06, + "loss": 0.6324, + "step": 2230 + }, + { + "epoch": 1.0548463356973996, + "grad_norm": 2.6766855716705322, + "learning_rate": 4.654802520647924e-06, + "loss": 0.5178, + "step": 2231 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 2.8438873291015625, + "learning_rate": 4.654486146728006e-06, + "loss": 0.509, + "step": 2232 + }, + { + "epoch": 1.055791962174941, + "grad_norm": 
2.538661241531372, + "learning_rate": 4.6541696386576826e-06, + "loss": 0.5463, + "step": 2233 + }, + { + "epoch": 1.0562647754137116, + "grad_norm": 2.829030990600586, + "learning_rate": 4.653852996456662e-06, + "loss": 0.5404, + "step": 2234 + }, + { + "epoch": 1.0567375886524824, + "grad_norm": 2.5657269954681396, + "learning_rate": 4.653536220144659e-06, + "loss": 0.5479, + "step": 2235 + }, + { + "epoch": 1.057210401891253, + "grad_norm": 2.6641297340393066, + "learning_rate": 4.653219309741399e-06, + "loss": 0.5503, + "step": 2236 + }, + { + "epoch": 1.0576832151300237, + "grad_norm": 2.966350555419922, + "learning_rate": 4.652902265266615e-06, + "loss": 0.6404, + "step": 2237 + }, + { + "epoch": 1.0581560283687943, + "grad_norm": 2.462430000305176, + "learning_rate": 4.6525850867400455e-06, + "loss": 0.4885, + "step": 2238 + }, + { + "epoch": 1.058628841607565, + "grad_norm": 2.1791880130767822, + "learning_rate": 4.652267774181443e-06, + "loss": 0.4405, + "step": 2239 + }, + { + "epoch": 1.0591016548463357, + "grad_norm": 2.5473732948303223, + "learning_rate": 4.651950327610563e-06, + "loss": 0.5295, + "step": 2240 + }, + { + "epoch": 1.0595744680851065, + "grad_norm": 2.70904803276062, + "learning_rate": 4.651632747047172e-06, + "loss": 0.5169, + "step": 2241 + }, + { + "epoch": 1.060047281323877, + "grad_norm": 3.8442928791046143, + "learning_rate": 4.651315032511045e-06, + "loss": 0.5473, + "step": 2242 + }, + { + "epoch": 1.0605200945626478, + "grad_norm": 2.8613383769989014, + "learning_rate": 4.650997184021963e-06, + "loss": 0.5445, + "step": 2243 + }, + { + "epoch": 1.0609929078014184, + "grad_norm": 2.5995829105377197, + "learning_rate": 4.6506792015997184e-06, + "loss": 0.5525, + "step": 2244 + }, + { + "epoch": 1.0614657210401892, + "grad_norm": 2.5465996265411377, + "learning_rate": 4.650361085264111e-06, + "loss": 0.5093, + "step": 2245 + }, + { + "epoch": 1.0619385342789598, + "grad_norm": 2.46553111076355, + "learning_rate": 4.650042835034948e-06, + "loss": 0.5375, + "step": 2246 + }, + { + "epoch": 1.0624113475177306, + "grad_norm": 2.6907830238342285, + "learning_rate": 4.649724450932045e-06, + "loss": 0.572, + "step": 2247 + }, + { + "epoch": 1.0628841607565012, + "grad_norm": 3.0671346187591553, + "learning_rate": 4.649405932975226e-06, + "loss": 0.4974, + "step": 2248 + }, + { + "epoch": 1.063356973995272, + "grad_norm": 2.5392491817474365, + "learning_rate": 4.649087281184325e-06, + "loss": 0.524, + "step": 2249 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 2.7498562335968018, + "learning_rate": 4.648768495579183e-06, + "loss": 0.5801, + "step": 2250 + }, + { + "epoch": 1.0643026004728133, + "grad_norm": 2.8536248207092285, + "learning_rate": 4.648449576179649e-06, + "loss": 0.5384, + "step": 2251 + }, + { + "epoch": 1.064775413711584, + "grad_norm": 2.7062792778015137, + "learning_rate": 4.64813052300558e-06, + "loss": 0.5262, + "step": 2252 + }, + { + "epoch": 1.0652482269503547, + "grad_norm": 2.798650026321411, + "learning_rate": 4.647811336076841e-06, + "loss": 0.5719, + "step": 2253 + }, + { + "epoch": 1.0657210401891253, + "grad_norm": 2.9793951511383057, + "learning_rate": 4.647492015413311e-06, + "loss": 0.5377, + "step": 2254 + }, + { + "epoch": 1.066193853427896, + "grad_norm": 2.572129011154175, + "learning_rate": 4.647172561034868e-06, + "loss": 0.4791, + "step": 2255 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 3.7490930557250977, + "learning_rate": 4.646852972961405e-06, + "loss": 0.5423, + "step": 2256 + }, + { + "epoch": 
1.0671394799054374, + "grad_norm": 2.626255750656128, + "learning_rate": 4.646533251212821e-06, + "loss": 0.5558, + "step": 2257 + }, + { + "epoch": 1.067612293144208, + "grad_norm": 2.8408126831054688, + "learning_rate": 4.646213395809023e-06, + "loss": 0.55, + "step": 2258 + }, + { + "epoch": 1.0680851063829788, + "grad_norm": 3.255606174468994, + "learning_rate": 4.645893406769929e-06, + "loss": 0.547, + "step": 2259 + }, + { + "epoch": 1.0685579196217494, + "grad_norm": 2.4352102279663086, + "learning_rate": 4.645573284115461e-06, + "loss": 0.4898, + "step": 2260 + }, + { + "epoch": 1.0690307328605202, + "grad_norm": 2.408634662628174, + "learning_rate": 4.6452530278655535e-06, + "loss": 0.5264, + "step": 2261 + }, + { + "epoch": 1.0695035460992908, + "grad_norm": 2.4220449924468994, + "learning_rate": 4.644932638040146e-06, + "loss": 0.5166, + "step": 2262 + }, + { + "epoch": 1.0699763593380616, + "grad_norm": 2.9188082218170166, + "learning_rate": 4.644612114659188e-06, + "loss": 0.5611, + "step": 2263 + }, + { + "epoch": 1.0704491725768321, + "grad_norm": 2.906557083129883, + "learning_rate": 4.644291457742638e-06, + "loss": 0.5515, + "step": 2264 + }, + { + "epoch": 1.070921985815603, + "grad_norm": 2.9039015769958496, + "learning_rate": 4.643970667310462e-06, + "loss": 0.5732, + "step": 2265 + }, + { + "epoch": 1.0713947990543735, + "grad_norm": 2.9985480308532715, + "learning_rate": 4.643649743382632e-06, + "loss": 0.563, + "step": 2266 + }, + { + "epoch": 1.0718676122931443, + "grad_norm": 2.5780906677246094, + "learning_rate": 4.6433286859791335e-06, + "loss": 0.502, + "step": 2267 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 2.590209722518921, + "learning_rate": 4.643007495119955e-06, + "loss": 0.4995, + "step": 2268 + }, + { + "epoch": 1.0728132387706855, + "grad_norm": 2.378894805908203, + "learning_rate": 4.642686170825097e-06, + "loss": 0.4886, + "step": 2269 + }, + { + "epoch": 1.0732860520094563, + "grad_norm": 2.6826229095458984, + "learning_rate": 4.642364713114567e-06, + "loss": 0.465, + "step": 2270 + }, + { + "epoch": 1.073758865248227, + "grad_norm": 2.627819538116455, + "learning_rate": 4.64204312200838e-06, + "loss": 0.4954, + "step": 2271 + }, + { + "epoch": 1.0742316784869976, + "grad_norm": 2.993021249771118, + "learning_rate": 4.641721397526561e-06, + "loss": 0.5073, + "step": 2272 + }, + { + "epoch": 1.0747044917257682, + "grad_norm": 2.719052791595459, + "learning_rate": 4.64139953968914e-06, + "loss": 0.538, + "step": 2273 + }, + { + "epoch": 1.075177304964539, + "grad_norm": 2.729252576828003, + "learning_rate": 4.6410775485161605e-06, + "loss": 0.552, + "step": 2274 + }, + { + "epoch": 1.0756501182033098, + "grad_norm": 2.924142599105835, + "learning_rate": 4.640755424027671e-06, + "loss": 0.522, + "step": 2275 + }, + { + "epoch": 1.0761229314420804, + "grad_norm": 3.329162120819092, + "learning_rate": 4.640433166243728e-06, + "loss": 0.5965, + "step": 2276 + }, + { + "epoch": 1.076595744680851, + "grad_norm": 2.9810245037078857, + "learning_rate": 4.640110775184396e-06, + "loss": 0.5653, + "step": 2277 + }, + { + "epoch": 1.0770685579196217, + "grad_norm": 2.61772084236145, + "learning_rate": 4.639788250869751e-06, + "loss": 0.5382, + "step": 2278 + }, + { + "epoch": 1.0775413711583925, + "grad_norm": 2.741225004196167, + "learning_rate": 4.639465593319874e-06, + "loss": 0.4866, + "step": 2279 + }, + { + "epoch": 1.0780141843971631, + "grad_norm": 2.7945218086242676, + "learning_rate": 4.639142802554856e-06, + "loss": 0.4711, + "step": 2280 + 
}, + { + "epoch": 1.0784869976359337, + "grad_norm": 2.4282329082489014, + "learning_rate": 4.638819878594795e-06, + "loss": 0.4911, + "step": 2281 + }, + { + "epoch": 1.0789598108747045, + "grad_norm": 2.551741361618042, + "learning_rate": 4.638496821459799e-06, + "loss": 0.453, + "step": 2282 + }, + { + "epoch": 1.0794326241134753, + "grad_norm": 2.5622754096984863, + "learning_rate": 4.638173631169983e-06, + "loss": 0.5983, + "step": 2283 + }, + { + "epoch": 1.0799054373522459, + "grad_norm": 2.7748284339904785, + "learning_rate": 4.6378503077454715e-06, + "loss": 0.5143, + "step": 2284 + }, + { + "epoch": 1.0803782505910164, + "grad_norm": 2.7693238258361816, + "learning_rate": 4.637526851206394e-06, + "loss": 0.5929, + "step": 2285 + }, + { + "epoch": 1.0808510638297872, + "grad_norm": 2.705548048019409, + "learning_rate": 4.637203261572893e-06, + "loss": 0.5577, + "step": 2286 + }, + { + "epoch": 1.081323877068558, + "grad_norm": 2.739307165145874, + "learning_rate": 4.636879538865117e-06, + "loss": 0.5676, + "step": 2287 + }, + { + "epoch": 1.0817966903073286, + "grad_norm": 2.514059543609619, + "learning_rate": 4.636555683103221e-06, + "loss": 0.5001, + "step": 2288 + }, + { + "epoch": 1.0822695035460992, + "grad_norm": 2.7166874408721924, + "learning_rate": 4.636231694307372e-06, + "loss": 0.5411, + "step": 2289 + }, + { + "epoch": 1.08274231678487, + "grad_norm": 2.7661683559417725, + "learning_rate": 4.635907572497741e-06, + "loss": 0.6353, + "step": 2290 + }, + { + "epoch": 1.0832151300236406, + "grad_norm": 2.598381996154785, + "learning_rate": 4.635583317694512e-06, + "loss": 0.5213, + "step": 2291 + }, + { + "epoch": 1.0836879432624114, + "grad_norm": 2.821491003036499, + "learning_rate": 4.6352589299178744e-06, + "loss": 0.6172, + "step": 2292 + }, + { + "epoch": 1.084160756501182, + "grad_norm": 2.5422823429107666, + "learning_rate": 4.634934409188025e-06, + "loss": 0.5245, + "step": 2293 + }, + { + "epoch": 1.0846335697399527, + "grad_norm": 2.8264620304107666, + "learning_rate": 4.634609755525173e-06, + "loss": 0.5004, + "step": 2294 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 2.3286643028259277, + "learning_rate": 4.63428496894953e-06, + "loss": 0.4561, + "step": 2295 + }, + { + "epoch": 1.085579196217494, + "grad_norm": 2.462005376815796, + "learning_rate": 4.633960049481321e-06, + "loss": 0.4948, + "step": 2296 + }, + { + "epoch": 1.0860520094562647, + "grad_norm": 2.760258913040161, + "learning_rate": 4.633634997140777e-06, + "loss": 0.5407, + "step": 2297 + }, + { + "epoch": 1.0865248226950355, + "grad_norm": 3.0234217643737793, + "learning_rate": 4.633309811948138e-06, + "loss": 0.4914, + "step": 2298 + }, + { + "epoch": 1.086997635933806, + "grad_norm": 2.8380849361419678, + "learning_rate": 4.63298449392365e-06, + "loss": 0.5562, + "step": 2299 + }, + { + "epoch": 1.0874704491725768, + "grad_norm": 2.6201648712158203, + "learning_rate": 4.632659043087572e-06, + "loss": 0.5882, + "step": 2300 + }, + { + "epoch": 1.0879432624113474, + "grad_norm": 2.586339235305786, + "learning_rate": 4.632333459460165e-06, + "loss": 0.4991, + "step": 2301 + }, + { + "epoch": 1.0884160756501182, + "grad_norm": 2.500115394592285, + "learning_rate": 4.632007743061705e-06, + "loss": 0.552, + "step": 2302 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 2.816390037536621, + "learning_rate": 4.63168189391247e-06, + "loss": 0.5301, + "step": 2303 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 2.975400924682617, + "learning_rate": 4.631355912032753e-06, + "loss": 
0.6056, + "step": 2304 + }, + { + "epoch": 1.0898345153664302, + "grad_norm": 2.747985363006592, + "learning_rate": 4.631029797442846e-06, + "loss": 0.5335, + "step": 2305 + }, + { + "epoch": 1.090307328605201, + "grad_norm": 2.609281539916992, + "learning_rate": 4.630703550163059e-06, + "loss": 0.5189, + "step": 2306 + }, + { + "epoch": 1.0907801418439715, + "grad_norm": 2.624131202697754, + "learning_rate": 4.630377170213705e-06, + "loss": 0.5646, + "step": 2307 + }, + { + "epoch": 1.0912529550827423, + "grad_norm": 2.6186959743499756, + "learning_rate": 4.630050657615107e-06, + "loss": 0.5187, + "step": 2308 + }, + { + "epoch": 1.091725768321513, + "grad_norm": 2.9961764812469482, + "learning_rate": 4.629724012387594e-06, + "loss": 0.6207, + "step": 2309 + }, + { + "epoch": 1.0921985815602837, + "grad_norm": 2.665799140930176, + "learning_rate": 4.629397234551505e-06, + "loss": 0.5046, + "step": 2310 + }, + { + "epoch": 1.0926713947990543, + "grad_norm": 2.6154725551605225, + "learning_rate": 4.629070324127187e-06, + "loss": 0.5553, + "step": 2311 + }, + { + "epoch": 1.093144208037825, + "grad_norm": 2.702967643737793, + "learning_rate": 4.628743281134996e-06, + "loss": 0.5159, + "step": 2312 + }, + { + "epoch": 1.0936170212765957, + "grad_norm": 2.578080177307129, + "learning_rate": 4.628416105595295e-06, + "loss": 0.4934, + "step": 2313 + }, + { + "epoch": 1.0940898345153665, + "grad_norm": 2.8763060569763184, + "learning_rate": 4.628088797528456e-06, + "loss": 0.5404, + "step": 2314 + }, + { + "epoch": 1.094562647754137, + "grad_norm": 2.5301198959350586, + "learning_rate": 4.6277613569548585e-06, + "loss": 0.524, + "step": 2315 + }, + { + "epoch": 1.0950354609929078, + "grad_norm": 2.559903144836426, + "learning_rate": 4.627433783894892e-06, + "loss": 0.5177, + "step": 2316 + }, + { + "epoch": 1.0955082742316784, + "grad_norm": 2.430863380432129, + "learning_rate": 4.627106078368952e-06, + "loss": 0.5368, + "step": 2317 + }, + { + "epoch": 1.0959810874704492, + "grad_norm": 2.687567949295044, + "learning_rate": 4.626778240397444e-06, + "loss": 0.5385, + "step": 2318 + }, + { + "epoch": 1.0964539007092198, + "grad_norm": 3.053466558456421, + "learning_rate": 4.62645027000078e-06, + "loss": 0.5814, + "step": 2319 + }, + { + "epoch": 1.0969267139479906, + "grad_norm": 2.4612979888916016, + "learning_rate": 4.6261221671993815e-06, + "loss": 0.5069, + "step": 2320 + }, + { + "epoch": 1.0973995271867611, + "grad_norm": 2.6153628826141357, + "learning_rate": 4.625793932013679e-06, + "loss": 0.5422, + "step": 2321 + }, + { + "epoch": 1.097872340425532, + "grad_norm": 2.8918874263763428, + "learning_rate": 4.62546556446411e-06, + "loss": 0.5326, + "step": 2322 + }, + { + "epoch": 1.0983451536643025, + "grad_norm": 3.62565279006958, + "learning_rate": 4.625137064571119e-06, + "loss": 0.5164, + "step": 2323 + }, + { + "epoch": 1.0988179669030733, + "grad_norm": 2.4285085201263428, + "learning_rate": 4.624808432355164e-06, + "loss": 0.5084, + "step": 2324 + }, + { + "epoch": 1.099290780141844, + "grad_norm": 2.593979835510254, + "learning_rate": 4.624479667836702e-06, + "loss": 0.4986, + "step": 2325 + }, + { + "epoch": 1.0997635933806147, + "grad_norm": 2.490752935409546, + "learning_rate": 4.624150771036208e-06, + "loss": 0.5296, + "step": 2326 + }, + { + "epoch": 1.1002364066193853, + "grad_norm": 2.67694091796875, + "learning_rate": 4.6238217419741595e-06, + "loss": 0.5229, + "step": 2327 + }, + { + "epoch": 1.100709219858156, + "grad_norm": 2.594147205352783, + "learning_rate": 
4.623492580671044e-06, + "loss": 0.4916, + "step": 2328 + }, + { + "epoch": 1.1011820330969266, + "grad_norm": 2.943472385406494, + "learning_rate": 4.623163287147356e-06, + "loss": 0.5591, + "step": 2329 + }, + { + "epoch": 1.1016548463356974, + "grad_norm": 2.569410562515259, + "learning_rate": 4.622833861423601e-06, + "loss": 0.4648, + "step": 2330 + }, + { + "epoch": 1.102127659574468, + "grad_norm": 2.5490405559539795, + "learning_rate": 4.6225043035202886e-06, + "loss": 0.5493, + "step": 2331 + }, + { + "epoch": 1.1026004728132388, + "grad_norm": 2.5964598655700684, + "learning_rate": 4.622174613457941e-06, + "loss": 0.5358, + "step": 2332 + }, + { + "epoch": 1.1030732860520094, + "grad_norm": 2.6456820964813232, + "learning_rate": 4.621844791257085e-06, + "loss": 0.5864, + "step": 2333 + }, + { + "epoch": 1.1035460992907802, + "grad_norm": 2.861180067062378, + "learning_rate": 4.621514836938259e-06, + "loss": 0.6064, + "step": 2334 + }, + { + "epoch": 1.1040189125295508, + "grad_norm": 2.8199548721313477, + "learning_rate": 4.621184750522005e-06, + "loss": 0.5244, + "step": 2335 + }, + { + "epoch": 1.1044917257683216, + "grad_norm": 2.7398853302001953, + "learning_rate": 4.6208545320288795e-06, + "loss": 0.5496, + "step": 2336 + }, + { + "epoch": 1.1049645390070921, + "grad_norm": 2.7941031455993652, + "learning_rate": 4.620524181479441e-06, + "loss": 0.5496, + "step": 2337 + }, + { + "epoch": 1.105437352245863, + "grad_norm": 2.973785161972046, + "learning_rate": 4.620193698894259e-06, + "loss": 0.5492, + "step": 2338 + }, + { + "epoch": 1.1059101654846335, + "grad_norm": 2.650355815887451, + "learning_rate": 4.6198630842939144e-06, + "loss": 0.5392, + "step": 2339 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 2.9092214107513428, + "learning_rate": 4.61953233769899e-06, + "loss": 0.5305, + "step": 2340 + }, + { + "epoch": 1.1068557919621749, + "grad_norm": 2.6329731941223145, + "learning_rate": 4.61920145913008e-06, + "loss": 0.5031, + "step": 2341 + }, + { + "epoch": 1.1073286052009457, + "grad_norm": 2.7214207649230957, + "learning_rate": 4.618870448607788e-06, + "loss": 0.5536, + "step": 2342 + }, + { + "epoch": 1.1078014184397162, + "grad_norm": 2.873119592666626, + "learning_rate": 4.618539306152724e-06, + "loss": 0.4531, + "step": 2343 + }, + { + "epoch": 1.108274231678487, + "grad_norm": 2.701042413711548, + "learning_rate": 4.618208031785507e-06, + "loss": 0.5217, + "step": 2344 + }, + { + "epoch": 1.1087470449172576, + "grad_norm": 2.7189881801605225, + "learning_rate": 4.6178766255267635e-06, + "loss": 0.6205, + "step": 2345 + }, + { + "epoch": 1.1092198581560284, + "grad_norm": 2.546382188796997, + "learning_rate": 4.61754508739713e-06, + "loss": 0.5475, + "step": 2346 + }, + { + "epoch": 1.109692671394799, + "grad_norm": 2.8429276943206787, + "learning_rate": 4.617213417417249e-06, + "loss": 0.4809, + "step": 2347 + }, + { + "epoch": 1.1101654846335698, + "grad_norm": 2.9515812397003174, + "learning_rate": 4.616881615607772e-06, + "loss": 0.5067, + "step": 2348 + }, + { + "epoch": 1.1106382978723404, + "grad_norm": 2.5910723209381104, + "learning_rate": 4.616549681989358e-06, + "loss": 0.5368, + "step": 2349 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 2.80855655670166, + "learning_rate": 4.616217616582678e-06, + "loss": 0.5827, + "step": 2350 + }, + { + "epoch": 1.1115839243498817, + "grad_norm": 2.604383945465088, + "learning_rate": 4.6158854194084044e-06, + "loss": 0.5716, + "step": 2351 + }, + { + "epoch": 1.1120567375886525, + "grad_norm": 
3.0585904121398926, + "learning_rate": 4.6155530904872246e-06, + "loss": 0.4998, + "step": 2352 + }, + { + "epoch": 1.112529550827423, + "grad_norm": 2.660961627960205, + "learning_rate": 4.61522062983983e-06, + "loss": 0.4533, + "step": 2353 + }, + { + "epoch": 1.113002364066194, + "grad_norm": 2.8042070865631104, + "learning_rate": 4.614888037486923e-06, + "loss": 0.5592, + "step": 2354 + }, + { + "epoch": 1.1134751773049645, + "grad_norm": 2.681664228439331, + "learning_rate": 4.61455531344921e-06, + "loss": 0.5439, + "step": 2355 + }, + { + "epoch": 1.1139479905437353, + "grad_norm": 2.905054807662964, + "learning_rate": 4.61422245774741e-06, + "loss": 0.5497, + "step": 2356 + }, + { + "epoch": 1.1144208037825059, + "grad_norm": 2.7979753017425537, + "learning_rate": 4.6138894704022484e-06, + "loss": 0.5374, + "step": 2357 + }, + { + "epoch": 1.1148936170212767, + "grad_norm": 2.965611696243286, + "learning_rate": 4.613556351434458e-06, + "loss": 0.5145, + "step": 2358 + }, + { + "epoch": 1.1153664302600472, + "grad_norm": 2.583134889602661, + "learning_rate": 4.613223100864782e-06, + "loss": 0.535, + "step": 2359 + }, + { + "epoch": 1.115839243498818, + "grad_norm": 2.5979621410369873, + "learning_rate": 4.61288971871397e-06, + "loss": 0.5514, + "step": 2360 + }, + { + "epoch": 1.1163120567375886, + "grad_norm": 3.0117669105529785, + "learning_rate": 4.612556205002779e-06, + "loss": 0.5266, + "step": 2361 + }, + { + "epoch": 1.1167848699763594, + "grad_norm": 2.425133466720581, + "learning_rate": 4.612222559751976e-06, + "loss": 0.4838, + "step": 2362 + }, + { + "epoch": 1.11725768321513, + "grad_norm": 2.5102691650390625, + "learning_rate": 4.611888782982337e-06, + "loss": 0.3947, + "step": 2363 + }, + { + "epoch": 1.1177304964539008, + "grad_norm": 3.0327367782592773, + "learning_rate": 4.611554874714645e-06, + "loss": 0.5753, + "step": 2364 + }, + { + "epoch": 1.1182033096926713, + "grad_norm": 2.4561009407043457, + "learning_rate": 4.6112208349696875e-06, + "loss": 0.5054, + "step": 2365 + }, + { + "epoch": 1.1186761229314421, + "grad_norm": 3.3898050785064697, + "learning_rate": 4.610886663768267e-06, + "loss": 0.5946, + "step": 2366 + }, + { + "epoch": 1.1191489361702127, + "grad_norm": 2.8112242221832275, + "learning_rate": 4.61055236113119e-06, + "loss": 0.5475, + "step": 2367 + }, + { + "epoch": 1.1196217494089835, + "grad_norm": 3.152946710586548, + "learning_rate": 4.610217927079272e-06, + "loss": 0.5165, + "step": 2368 + }, + { + "epoch": 1.120094562647754, + "grad_norm": 2.7847867012023926, + "learning_rate": 4.609883361633336e-06, + "loss": 0.5533, + "step": 2369 + }, + { + "epoch": 1.1205673758865249, + "grad_norm": 2.6376686096191406, + "learning_rate": 4.6095486648142155e-06, + "loss": 0.4942, + "step": 2370 + }, + { + "epoch": 1.1210401891252955, + "grad_norm": 3.123072862625122, + "learning_rate": 4.609213836642749e-06, + "loss": 0.616, + "step": 2371 + }, + { + "epoch": 1.1215130023640663, + "grad_norm": 2.802694320678711, + "learning_rate": 4.608878877139786e-06, + "loss": 0.5323, + "step": 2372 + }, + { + "epoch": 1.1219858156028368, + "grad_norm": 2.3567938804626465, + "learning_rate": 4.6085437863261825e-06, + "loss": 0.4822, + "step": 2373 + }, + { + "epoch": 1.1224586288416076, + "grad_norm": 2.553112030029297, + "learning_rate": 4.608208564222804e-06, + "loss": 0.5447, + "step": 2374 + }, + { + "epoch": 1.1229314420803782, + "grad_norm": 3.0020132064819336, + "learning_rate": 4.607873210850521e-06, + "loss": 0.6486, + "step": 2375 + }, + { + "epoch": 
1.123404255319149, + "grad_norm": 2.832442045211792, + "learning_rate": 4.607537726230216e-06, + "loss": 0.5257, + "step": 2376 + }, + { + "epoch": 1.1238770685579196, + "grad_norm": 2.471527099609375, + "learning_rate": 4.607202110382778e-06, + "loss": 0.4816, + "step": 2377 + }, + { + "epoch": 1.1243498817966904, + "grad_norm": 2.4232118129730225, + "learning_rate": 4.606866363329105e-06, + "loss": 0.5533, + "step": 2378 + }, + { + "epoch": 1.124822695035461, + "grad_norm": 2.477506637573242, + "learning_rate": 4.6065304850901025e-06, + "loss": 0.5223, + "step": 2379 + }, + { + "epoch": 1.1252955082742317, + "grad_norm": 3.54127836227417, + "learning_rate": 4.6061944756866824e-06, + "loss": 0.6514, + "step": 2380 + }, + { + "epoch": 1.1257683215130023, + "grad_norm": 2.5148677825927734, + "learning_rate": 4.605858335139768e-06, + "loss": 0.4864, + "step": 2381 + }, + { + "epoch": 1.1262411347517731, + "grad_norm": 2.8363659381866455, + "learning_rate": 4.605522063470289e-06, + "loss": 0.5034, + "step": 2382 + }, + { + "epoch": 1.1267139479905437, + "grad_norm": 2.4996654987335205, + "learning_rate": 4.605185660699184e-06, + "loss": 0.4126, + "step": 2383 + }, + { + "epoch": 1.1271867612293145, + "grad_norm": 2.352543830871582, + "learning_rate": 4.604849126847398e-06, + "loss": 0.5224, + "step": 2384 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 2.60101056098938, + "learning_rate": 4.6045124619358875e-06, + "loss": 0.4867, + "step": 2385 + }, + { + "epoch": 1.1281323877068559, + "grad_norm": 2.9471068382263184, + "learning_rate": 4.604175665985613e-06, + "loss": 0.6474, + "step": 2386 + }, + { + "epoch": 1.1286052009456264, + "grad_norm": 2.5933351516723633, + "learning_rate": 4.603838739017546e-06, + "loss": 0.5081, + "step": 2387 + }, + { + "epoch": 1.1290780141843972, + "grad_norm": 2.3740346431732178, + "learning_rate": 4.6035016810526665e-06, + "loss": 0.4438, + "step": 2388 + }, + { + "epoch": 1.1295508274231678, + "grad_norm": 2.675020217895508, + "learning_rate": 4.6031644921119614e-06, + "loss": 0.4968, + "step": 2389 + }, + { + "epoch": 1.1300236406619386, + "grad_norm": 2.599472999572754, + "learning_rate": 4.602827172216424e-06, + "loss": 0.5131, + "step": 2390 + }, + { + "epoch": 1.1304964539007092, + "grad_norm": 2.8176097869873047, + "learning_rate": 4.602489721387061e-06, + "loss": 0.5549, + "step": 2391 + }, + { + "epoch": 1.13096926713948, + "grad_norm": 2.466914176940918, + "learning_rate": 4.602152139644881e-06, + "loss": 0.5052, + "step": 2392 + }, + { + "epoch": 1.1314420803782506, + "grad_norm": 2.8938796520233154, + "learning_rate": 4.601814427010905e-06, + "loss": 0.6181, + "step": 2393 + }, + { + "epoch": 1.1319148936170214, + "grad_norm": 2.7390825748443604, + "learning_rate": 4.601476583506161e-06, + "loss": 0.5178, + "step": 2394 + }, + { + "epoch": 1.132387706855792, + "grad_norm": 3.180112838745117, + "learning_rate": 4.601138609151685e-06, + "loss": 0.6071, + "step": 2395 + }, + { + "epoch": 1.1328605200945627, + "grad_norm": 2.9282350540161133, + "learning_rate": 4.600800503968521e-06, + "loss": 0.5557, + "step": 2396 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 2.6689717769622803, + "learning_rate": 4.6004622679777215e-06, + "loss": 0.4679, + "step": 2397 + }, + { + "epoch": 1.133806146572104, + "grad_norm": 2.651582956314087, + "learning_rate": 4.600123901200347e-06, + "loss": 0.4907, + "step": 2398 + }, + { + "epoch": 1.1342789598108747, + "grad_norm": 2.5702924728393555, + "learning_rate": 4.599785403657464e-06, + "loss": 0.4919, + 
"step": 2399 + }, + { + "epoch": 1.1347517730496455, + "grad_norm": 2.636812448501587, + "learning_rate": 4.599446775370153e-06, + "loss": 0.5091, + "step": 2400 + }, + { + "epoch": 1.135224586288416, + "grad_norm": 2.5965442657470703, + "learning_rate": 4.599108016359497e-06, + "loss": 0.5035, + "step": 2401 + }, + { + "epoch": 1.1356973995271868, + "grad_norm": 2.689732313156128, + "learning_rate": 4.5987691266465885e-06, + "loss": 0.5307, + "step": 2402 + }, + { + "epoch": 1.1361702127659574, + "grad_norm": 2.7256956100463867, + "learning_rate": 4.59843010625253e-06, + "loss": 0.5066, + "step": 2403 + }, + { + "epoch": 1.1366430260047282, + "grad_norm": 2.726020574569702, + "learning_rate": 4.59809095519843e-06, + "loss": 0.4805, + "step": 2404 + }, + { + "epoch": 1.1371158392434988, + "grad_norm": 2.703339099884033, + "learning_rate": 4.597751673505406e-06, + "loss": 0.4992, + "step": 2405 + }, + { + "epoch": 1.1375886524822696, + "grad_norm": 2.54455304145813, + "learning_rate": 4.5974122611945835e-06, + "loss": 0.5251, + "step": 2406 + }, + { + "epoch": 1.1380614657210402, + "grad_norm": 2.623507022857666, + "learning_rate": 4.597072718287096e-06, + "loss": 0.4831, + "step": 2407 + }, + { + "epoch": 1.138534278959811, + "grad_norm": 2.653590202331543, + "learning_rate": 4.596733044804086e-06, + "loss": 0.5646, + "step": 2408 + }, + { + "epoch": 1.1390070921985815, + "grad_norm": 2.8230600357055664, + "learning_rate": 4.5963932407667035e-06, + "loss": 0.514, + "step": 2409 + }, + { + "epoch": 1.1394799054373523, + "grad_norm": 2.6077451705932617, + "learning_rate": 4.5960533061961065e-06, + "loss": 0.4713, + "step": 2410 + }, + { + "epoch": 1.139952718676123, + "grad_norm": 2.3945798873901367, + "learning_rate": 4.595713241113461e-06, + "loss": 0.466, + "step": 2411 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 2.8100006580352783, + "learning_rate": 4.595373045539941e-06, + "loss": 0.5365, + "step": 2412 + }, + { + "epoch": 1.1408983451536643, + "grad_norm": 2.6825881004333496, + "learning_rate": 4.59503271949673e-06, + "loss": 0.4457, + "step": 2413 + }, + { + "epoch": 1.141371158392435, + "grad_norm": 2.969435691833496, + "learning_rate": 4.594692263005016e-06, + "loss": 0.5459, + "step": 2414 + }, + { + "epoch": 1.1418439716312057, + "grad_norm": 2.4103164672851562, + "learning_rate": 4.594351676086002e-06, + "loss": 0.4573, + "step": 2415 + }, + { + "epoch": 1.1423167848699765, + "grad_norm": 2.9450128078460693, + "learning_rate": 4.594010958760892e-06, + "loss": 0.5529, + "step": 2416 + }, + { + "epoch": 1.142789598108747, + "grad_norm": 2.6416335105895996, + "learning_rate": 4.593670111050901e-06, + "loss": 0.5153, + "step": 2417 + }, + { + "epoch": 1.1432624113475178, + "grad_norm": 2.473177194595337, + "learning_rate": 4.593329132977253e-06, + "loss": 0.4962, + "step": 2418 + }, + { + "epoch": 1.1437352245862884, + "grad_norm": 2.4494502544403076, + "learning_rate": 4.592988024561179e-06, + "loss": 0.5182, + "step": 2419 + }, + { + "epoch": 1.1442080378250592, + "grad_norm": 2.773930311203003, + "learning_rate": 4.592646785823918e-06, + "loss": 0.4442, + "step": 2420 + }, + { + "epoch": 1.1446808510638298, + "grad_norm": 2.4733314514160156, + "learning_rate": 4.592305416786718e-06, + "loss": 0.5106, + "step": 2421 + }, + { + "epoch": 1.1451536643026006, + "grad_norm": 2.6870038509368896, + "learning_rate": 4.591963917470834e-06, + "loss": 0.5316, + "step": 2422 + }, + { + "epoch": 1.1456264775413711, + "grad_norm": 2.8989531993865967, + "learning_rate": 
4.591622287897529e-06, + "loss": 0.5906, + "step": 2423 + }, + { + "epoch": 1.1460992907801417, + "grad_norm": 2.6349124908447266, + "learning_rate": 4.591280528088077e-06, + "loss": 0.6225, + "step": 2424 + }, + { + "epoch": 1.1465721040189125, + "grad_norm": 3.19022274017334, + "learning_rate": 4.5909386380637555e-06, + "loss": 0.555, + "step": 2425 + }, + { + "epoch": 1.1470449172576833, + "grad_norm": 3.1473541259765625, + "learning_rate": 4.5905966178458535e-06, + "loss": 0.537, + "step": 2426 + }, + { + "epoch": 1.147517730496454, + "grad_norm": 2.6996145248413086, + "learning_rate": 4.590254467455667e-06, + "loss": 0.565, + "step": 2427 + }, + { + "epoch": 1.1479905437352245, + "grad_norm": 2.830188274383545, + "learning_rate": 4.5899121869145015e-06, + "loss": 0.6773, + "step": 2428 + }, + { + "epoch": 1.1484633569739953, + "grad_norm": 2.4937260150909424, + "learning_rate": 4.589569776243667e-06, + "loss": 0.5484, + "step": 2429 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 2.54011869430542, + "learning_rate": 4.589227235464486e-06, + "loss": 0.5307, + "step": 2430 + }, + { + "epoch": 1.1494089834515366, + "grad_norm": 2.8764214515686035, + "learning_rate": 4.5888845645982845e-06, + "loss": 0.5296, + "step": 2431 + }, + { + "epoch": 1.1498817966903072, + "grad_norm": 2.637033462524414, + "learning_rate": 4.588541763666402e-06, + "loss": 0.5975, + "step": 2432 + }, + { + "epoch": 1.150354609929078, + "grad_norm": 2.8534255027770996, + "learning_rate": 4.5881988326901815e-06, + "loss": 0.5431, + "step": 2433 + }, + { + "epoch": 1.1508274231678488, + "grad_norm": 2.8546559810638428, + "learning_rate": 4.587855771690976e-06, + "loss": 0.469, + "step": 2434 + }, + { + "epoch": 1.1513002364066194, + "grad_norm": 2.9084973335266113, + "learning_rate": 4.587512580690146e-06, + "loss": 0.5566, + "step": 2435 + }, + { + "epoch": 1.15177304964539, + "grad_norm": 3.0993130207061768, + "learning_rate": 4.587169259709063e-06, + "loss": 0.5612, + "step": 2436 + }, + { + "epoch": 1.1522458628841608, + "grad_norm": 10.847400665283203, + "learning_rate": 4.5868258087691e-06, + "loss": 0.4678, + "step": 2437 + }, + { + "epoch": 1.1527186761229316, + "grad_norm": 2.6648571491241455, + "learning_rate": 4.586482227891645e-06, + "loss": 0.5951, + "step": 2438 + }, + { + "epoch": 1.1531914893617021, + "grad_norm": 2.529043197631836, + "learning_rate": 4.586138517098091e-06, + "loss": 0.5048, + "step": 2439 + }, + { + "epoch": 1.1536643026004727, + "grad_norm": 2.833904504776001, + "learning_rate": 4.585794676409839e-06, + "loss": 0.536, + "step": 2440 + }, + { + "epoch": 1.1541371158392435, + "grad_norm": 3.507657766342163, + "learning_rate": 4.585450705848298e-06, + "loss": 0.5954, + "step": 2441 + }, + { + "epoch": 1.1546099290780143, + "grad_norm": 2.6108388900756836, + "learning_rate": 4.585106605434887e-06, + "loss": 0.5684, + "step": 2442 + }, + { + "epoch": 1.1550827423167849, + "grad_norm": 2.490708589553833, + "learning_rate": 4.58476237519103e-06, + "loss": 0.4678, + "step": 2443 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 2.8192343711853027, + "learning_rate": 4.584418015138161e-06, + "loss": 0.5291, + "step": 2444 + }, + { + "epoch": 1.1560283687943262, + "grad_norm": 3.0878679752349854, + "learning_rate": 4.584073525297722e-06, + "loss": 0.5691, + "step": 2445 + }, + { + "epoch": 1.156501182033097, + "grad_norm": 3.1444318294525146, + "learning_rate": 4.583728905691163e-06, + "loss": 0.5643, + "step": 2446 + }, + { + "epoch": 1.1569739952718676, + "grad_norm": 
3.02382230758667, + "learning_rate": 4.583384156339942e-06, + "loss": 0.6008, + "step": 2447 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 2.5942490100860596, + "learning_rate": 4.583039277265525e-06, + "loss": 0.5105, + "step": 2448 + }, + { + "epoch": 1.157919621749409, + "grad_norm": 2.938608407974243, + "learning_rate": 4.582694268489386e-06, + "loss": 0.5123, + "step": 2449 + }, + { + "epoch": 1.1583924349881798, + "grad_norm": 2.4622268676757812, + "learning_rate": 4.5823491300330075e-06, + "loss": 0.4538, + "step": 2450 + }, + { + "epoch": 1.1588652482269504, + "grad_norm": 2.4380505084991455, + "learning_rate": 4.5820038619178795e-06, + "loss": 0.4682, + "step": 2451 + }, + { + "epoch": 1.159338061465721, + "grad_norm": 2.479896068572998, + "learning_rate": 4.581658464165501e-06, + "loss": 0.4877, + "step": 2452 + }, + { + "epoch": 1.1598108747044917, + "grad_norm": 2.3373546600341797, + "learning_rate": 4.5813129367973765e-06, + "loss": 0.445, + "step": 2453 + }, + { + "epoch": 1.1602836879432625, + "grad_norm": 2.8586013317108154, + "learning_rate": 4.5809672798350214e-06, + "loss": 0.5232, + "step": 2454 + }, + { + "epoch": 1.160756501182033, + "grad_norm": 3.2302439212799072, + "learning_rate": 4.5806214932999595e-06, + "loss": 0.5336, + "step": 2455 + }, + { + "epoch": 1.1612293144208037, + "grad_norm": 3.1005783081054688, + "learning_rate": 4.580275577213721e-06, + "loss": 0.5123, + "step": 2456 + }, + { + "epoch": 1.1617021276595745, + "grad_norm": 2.7131073474884033, + "learning_rate": 4.579929531597842e-06, + "loss": 0.5648, + "step": 2457 + }, + { + "epoch": 1.1621749408983453, + "grad_norm": 2.5067050457000732, + "learning_rate": 4.579583356473874e-06, + "loss": 0.5324, + "step": 2458 + }, + { + "epoch": 1.1626477541371159, + "grad_norm": 2.7870543003082275, + "learning_rate": 4.579237051863366e-06, + "loss": 0.5094, + "step": 2459 + }, + { + "epoch": 1.1631205673758864, + "grad_norm": 2.739196300506592, + "learning_rate": 4.578890617787887e-06, + "loss": 0.5103, + "step": 2460 + }, + { + "epoch": 1.1635933806146572, + "grad_norm": 2.7108185291290283, + "learning_rate": 4.578544054269003e-06, + "loss": 0.533, + "step": 2461 + }, + { + "epoch": 1.1640661938534278, + "grad_norm": 3.028005361557007, + "learning_rate": 4.578197361328295e-06, + "loss": 0.636, + "step": 2462 + }, + { + "epoch": 1.1645390070921986, + "grad_norm": 2.4855129718780518, + "learning_rate": 4.5778505389873505e-06, + "loss": 0.501, + "step": 2463 + }, + { + "epoch": 1.1650118203309692, + "grad_norm": 2.6314198970794678, + "learning_rate": 4.577503587267764e-06, + "loss": 0.5812, + "step": 2464 + }, + { + "epoch": 1.16548463356974, + "grad_norm": 2.4209671020507812, + "learning_rate": 4.5771565061911385e-06, + "loss": 0.5168, + "step": 2465 + }, + { + "epoch": 1.1659574468085105, + "grad_norm": 2.526388645172119, + "learning_rate": 4.576809295779085e-06, + "loss": 0.5047, + "step": 2466 + }, + { + "epoch": 1.1664302600472813, + "grad_norm": 2.8278191089630127, + "learning_rate": 4.576461956053224e-06, + "loss": 0.4759, + "step": 2467 + }, + { + "epoch": 1.166903073286052, + "grad_norm": 2.7862167358398438, + "learning_rate": 4.576114487035182e-06, + "loss": 0.5492, + "step": 2468 + }, + { + "epoch": 1.1673758865248227, + "grad_norm": 2.6303019523620605, + "learning_rate": 4.575766888746594e-06, + "loss": 0.5538, + "step": 2469 + }, + { + "epoch": 1.1678486997635933, + "grad_norm": 2.613104820251465, + "learning_rate": 4.5754191612091034e-06, + "loss": 0.5114, + "step": 2470 + }, + { + "epoch": 
1.168321513002364, + "grad_norm": 2.653958320617676, + "learning_rate": 4.5750713044443625e-06, + "loss": 0.5858, + "step": 2471 + }, + { + "epoch": 1.1687943262411347, + "grad_norm": 3.1143975257873535, + "learning_rate": 4.574723318474031e-06, + "loss": 0.5193, + "step": 2472 + }, + { + "epoch": 1.1692671394799055, + "grad_norm": 3.05454421043396, + "learning_rate": 4.574375203319775e-06, + "loss": 0.464, + "step": 2473 + }, + { + "epoch": 1.169739952718676, + "grad_norm": 2.66626238822937, + "learning_rate": 4.574026959003272e-06, + "loss": 0.4988, + "step": 2474 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 2.8871963024139404, + "learning_rate": 4.573678585546203e-06, + "loss": 0.5557, + "step": 2475 + }, + { + "epoch": 1.1706855791962174, + "grad_norm": 2.592949628829956, + "learning_rate": 4.573330082970262e-06, + "loss": 0.5178, + "step": 2476 + }, + { + "epoch": 1.1711583924349882, + "grad_norm": 2.9111456871032715, + "learning_rate": 4.572981451297148e-06, + "loss": 0.5712, + "step": 2477 + }, + { + "epoch": 1.1716312056737588, + "grad_norm": 2.8152248859405518, + "learning_rate": 4.57263269054857e-06, + "loss": 0.5548, + "step": 2478 + }, + { + "epoch": 1.1721040189125296, + "grad_norm": 3.0292418003082275, + "learning_rate": 4.572283800746241e-06, + "loss": 0.5937, + "step": 2479 + }, + { + "epoch": 1.1725768321513002, + "grad_norm": 3.454618215560913, + "learning_rate": 4.571934781911886e-06, + "loss": 0.5537, + "step": 2480 + }, + { + "epoch": 1.173049645390071, + "grad_norm": 2.7817866802215576, + "learning_rate": 4.571585634067239e-06, + "loss": 0.5649, + "step": 2481 + }, + { + "epoch": 1.1735224586288415, + "grad_norm": 2.7989349365234375, + "learning_rate": 4.571236357234037e-06, + "loss": 0.5448, + "step": 2482 + }, + { + "epoch": 1.1739952718676123, + "grad_norm": 2.8863933086395264, + "learning_rate": 4.57088695143403e-06, + "loss": 0.63, + "step": 2483 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 2.5738039016723633, + "learning_rate": 4.570537416688972e-06, + "loss": 0.4702, + "step": 2484 + }, + { + "epoch": 1.1749408983451537, + "grad_norm": 3.003643274307251, + "learning_rate": 4.570187753020629e-06, + "loss": 0.5918, + "step": 2485 + }, + { + "epoch": 1.1754137115839243, + "grad_norm": 2.8619167804718018, + "learning_rate": 4.569837960450772e-06, + "loss": 0.5268, + "step": 2486 + }, + { + "epoch": 1.175886524822695, + "grad_norm": 2.876077175140381, + "learning_rate": 4.569488039001181e-06, + "loss": 0.4915, + "step": 2487 + }, + { + "epoch": 1.1763593380614656, + "grad_norm": 3.407115936279297, + "learning_rate": 4.569137988693644e-06, + "loss": 0.5761, + "step": 2488 + }, + { + "epoch": 1.1768321513002364, + "grad_norm": 2.7292826175689697, + "learning_rate": 4.568787809549958e-06, + "loss": 0.541, + "step": 2489 + }, + { + "epoch": 1.177304964539007, + "grad_norm": 2.8805999755859375, + "learning_rate": 4.568437501591926e-06, + "loss": 0.6223, + "step": 2490 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 2.9264373779296875, + "learning_rate": 4.56808706484136e-06, + "loss": 0.6081, + "step": 2491 + }, + { + "epoch": 1.1782505910165484, + "grad_norm": 2.5167033672332764, + "learning_rate": 4.567736499320082e-06, + "loss": 0.5393, + "step": 2492 + }, + { + "epoch": 1.1787234042553192, + "grad_norm": 3.4647862911224365, + "learning_rate": 4.567385805049918e-06, + "loss": 0.4826, + "step": 2493 + }, + { + "epoch": 1.1791962174940898, + "grad_norm": 2.9824202060699463, + "learning_rate": 4.5670349820527055e-06, + "loss": 0.541, + "step": 
2494 + }, + { + "epoch": 1.1796690307328606, + "grad_norm": 2.997105836868286, + "learning_rate": 4.5666840303502885e-06, + "loss": 0.5771, + "step": 2495 + }, + { + "epoch": 1.1801418439716311, + "grad_norm": 2.8728017807006836, + "learning_rate": 4.56633294996452e-06, + "loss": 0.4877, + "step": 2496 + }, + { + "epoch": 1.180614657210402, + "grad_norm": 2.626498222351074, + "learning_rate": 4.5659817409172565e-06, + "loss": 0.5296, + "step": 2497 + }, + { + "epoch": 1.1810874704491725, + "grad_norm": 2.87037992477417, + "learning_rate": 4.565630403230371e-06, + "loss": 0.539, + "step": 2498 + }, + { + "epoch": 1.1815602836879433, + "grad_norm": 2.5719685554504395, + "learning_rate": 4.5652789369257375e-06, + "loss": 0.5653, + "step": 2499 + }, + { + "epoch": 1.1820330969267139, + "grad_norm": 2.4842135906219482, + "learning_rate": 4.56492734202524e-06, + "loss": 0.515, + "step": 2500 + }, + { + "epoch": 1.1825059101654847, + "grad_norm": 2.640951156616211, + "learning_rate": 4.564575618550773e-06, + "loss": 0.5601, + "step": 2501 + }, + { + "epoch": 1.1829787234042553, + "grad_norm": 2.624394655227661, + "learning_rate": 4.564223766524234e-06, + "loss": 0.5551, + "step": 2502 + }, + { + "epoch": 1.183451536643026, + "grad_norm": 3.014537811279297, + "learning_rate": 4.563871785967533e-06, + "loss": 0.5212, + "step": 2503 + }, + { + "epoch": 1.1839243498817966, + "grad_norm": 2.8756890296936035, + "learning_rate": 4.563519676902585e-06, + "loss": 0.5132, + "step": 2504 + }, + { + "epoch": 1.1843971631205674, + "grad_norm": 2.636781692504883, + "learning_rate": 4.5631674393513145e-06, + "loss": 0.5323, + "step": 2505 + }, + { + "epoch": 1.184869976359338, + "grad_norm": 2.7233786582946777, + "learning_rate": 4.562815073335655e-06, + "loss": 0.5608, + "step": 2506 + }, + { + "epoch": 1.1853427895981088, + "grad_norm": 2.7158713340759277, + "learning_rate": 4.562462578877546e-06, + "loss": 0.5373, + "step": 2507 + }, + { + "epoch": 1.1858156028368794, + "grad_norm": 2.9754762649536133, + "learning_rate": 4.562109955998936e-06, + "loss": 0.5712, + "step": 2508 + }, + { + "epoch": 1.1862884160756502, + "grad_norm": 2.8815054893493652, + "learning_rate": 4.561757204721781e-06, + "loss": 0.6126, + "step": 2509 + }, + { + "epoch": 1.1867612293144207, + "grad_norm": 2.866319417953491, + "learning_rate": 4.561404325068045e-06, + "loss": 0.506, + "step": 2510 + }, + { + "epoch": 1.1872340425531915, + "grad_norm": 2.6187376976013184, + "learning_rate": 4.561051317059701e-06, + "loss": 0.4674, + "step": 2511 + }, + { + "epoch": 1.1877068557919621, + "grad_norm": 2.642552137374878, + "learning_rate": 4.560698180718729e-06, + "loss": 0.4793, + "step": 2512 + }, + { + "epoch": 1.188179669030733, + "grad_norm": 2.7815041542053223, + "learning_rate": 4.560344916067117e-06, + "loss": 0.5034, + "step": 2513 + }, + { + "epoch": 1.1886524822695035, + "grad_norm": 2.70853590965271, + "learning_rate": 4.559991523126862e-06, + "loss": 0.4811, + "step": 2514 + }, + { + "epoch": 1.1891252955082743, + "grad_norm": 2.7049436569213867, + "learning_rate": 4.559638001919967e-06, + "loss": 0.547, + "step": 2515 + }, + { + "epoch": 1.1895981087470449, + "grad_norm": 2.766773223876953, + "learning_rate": 4.559284352468445e-06, + "loss": 0.5362, + "step": 2516 + }, + { + "epoch": 1.1900709219858157, + "grad_norm": 3.0064334869384766, + "learning_rate": 4.558930574794316e-06, + "loss": 0.5915, + "step": 2517 + }, + { + "epoch": 1.1905437352245862, + "grad_norm": 2.4899885654449463, + "learning_rate": 4.558576668919609e-06, + 
"loss": 0.4379, + "step": 2518 + }, + { + "epoch": 1.191016548463357, + "grad_norm": 2.925963878631592, + "learning_rate": 4.558222634866358e-06, + "loss": 0.5389, + "step": 2519 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 6.087667465209961, + "learning_rate": 4.55786847265661e-06, + "loss": 0.4777, + "step": 2520 + }, + { + "epoch": 1.1919621749408984, + "grad_norm": 2.4560582637786865, + "learning_rate": 4.5575141823124145e-06, + "loss": 0.5576, + "step": 2521 + }, + { + "epoch": 1.192434988179669, + "grad_norm": 3.184252977371216, + "learning_rate": 4.557159763855834e-06, + "loss": 0.5151, + "step": 2522 + }, + { + "epoch": 1.1929078014184398, + "grad_norm": 2.359722137451172, + "learning_rate": 4.556805217308935e-06, + "loss": 0.478, + "step": 2523 + }, + { + "epoch": 1.1933806146572103, + "grad_norm": 3.0821568965911865, + "learning_rate": 4.5564505426937935e-06, + "loss": 0.5784, + "step": 2524 + }, + { + "epoch": 1.1938534278959811, + "grad_norm": 2.9905128479003906, + "learning_rate": 4.5560957400324936e-06, + "loss": 0.6087, + "step": 2525 + }, + { + "epoch": 1.1943262411347517, + "grad_norm": 2.462102174758911, + "learning_rate": 4.555740809347128e-06, + "loss": 0.4739, + "step": 2526 + }, + { + "epoch": 1.1947990543735225, + "grad_norm": 2.7931067943573, + "learning_rate": 4.555385750659796e-06, + "loss": 0.4961, + "step": 2527 + }, + { + "epoch": 1.195271867612293, + "grad_norm": 2.660320997238159, + "learning_rate": 4.555030563992607e-06, + "loss": 0.487, + "step": 2528 + }, + { + "epoch": 1.195744680851064, + "grad_norm": 2.8135557174682617, + "learning_rate": 4.554675249367675e-06, + "loss": 0.5269, + "step": 2529 + }, + { + "epoch": 1.1962174940898345, + "grad_norm": 2.661933422088623, + "learning_rate": 4.554319806807126e-06, + "loss": 0.4723, + "step": 2530 + }, + { + "epoch": 1.1966903073286053, + "grad_norm": 2.568176507949829, + "learning_rate": 4.553964236333089e-06, + "loss": 0.5258, + "step": 2531 + }, + { + "epoch": 1.1971631205673758, + "grad_norm": 2.6890947818756104, + "learning_rate": 4.553608537967705e-06, + "loss": 0.4965, + "step": 2532 + }, + { + "epoch": 1.1976359338061466, + "grad_norm": 3.133470058441162, + "learning_rate": 4.553252711733124e-06, + "loss": 0.5423, + "step": 2533 + }, + { + "epoch": 1.1981087470449172, + "grad_norm": 2.7086687088012695, + "learning_rate": 4.552896757651498e-06, + "loss": 0.5326, + "step": 2534 + }, + { + "epoch": 1.198581560283688, + "grad_norm": 2.8411715030670166, + "learning_rate": 4.552540675744994e-06, + "loss": 0.5793, + "step": 2535 + }, + { + "epoch": 1.1990543735224586, + "grad_norm": 3.041077136993408, + "learning_rate": 4.552184466035782e-06, + "loss": 0.5068, + "step": 2536 + }, + { + "epoch": 1.1995271867612294, + "grad_norm": 2.5921192169189453, + "learning_rate": 4.551828128546041e-06, + "loss": 0.5189, + "step": 2537 + }, + { + "epoch": 1.2, + "grad_norm": 2.923305034637451, + "learning_rate": 4.5514716632979605e-06, + "loss": 0.516, + "step": 2538 + }, + { + "epoch": 1.2004728132387708, + "grad_norm": 2.7083024978637695, + "learning_rate": 4.551115070313734e-06, + "loss": 0.4825, + "step": 2539 + }, + { + "epoch": 1.2009456264775413, + "grad_norm": 2.746842384338379, + "learning_rate": 4.550758349615567e-06, + "loss": 0.5691, + "step": 2540 + }, + { + "epoch": 1.2014184397163121, + "grad_norm": 2.6596429347991943, + "learning_rate": 4.550401501225669e-06, + "loss": 0.5983, + "step": 2541 + }, + { + "epoch": 1.2018912529550827, + "grad_norm": 2.9057931900024414, + "learning_rate": 
4.550044525166261e-06, + "loss": 0.5069, + "step": 2542 + }, + { + "epoch": 1.2023640661938535, + "grad_norm": 2.6139039993286133, + "learning_rate": 4.5496874214595686e-06, + "loss": 0.5102, + "step": 2543 + }, + { + "epoch": 1.202836879432624, + "grad_norm": 2.630286455154419, + "learning_rate": 4.5493301901278285e-06, + "loss": 0.4902, + "step": 2544 + }, + { + "epoch": 1.2033096926713949, + "grad_norm": 2.639174222946167, + "learning_rate": 4.548972831193284e-06, + "loss": 0.4566, + "step": 2545 + }, + { + "epoch": 1.2037825059101654, + "grad_norm": 2.9569664001464844, + "learning_rate": 4.548615344678186e-06, + "loss": 0.5636, + "step": 2546 + }, + { + "epoch": 1.2042553191489362, + "grad_norm": 2.981734037399292, + "learning_rate": 4.5482577306047924e-06, + "loss": 0.4884, + "step": 2547 + }, + { + "epoch": 1.2047281323877068, + "grad_norm": 2.6760342121124268, + "learning_rate": 4.547899988995371e-06, + "loss": 0.5426, + "step": 2548 + }, + { + "epoch": 1.2052009456264776, + "grad_norm": 2.825805902481079, + "learning_rate": 4.547542119872198e-06, + "loss": 0.4989, + "step": 2549 + }, + { + "epoch": 1.2056737588652482, + "grad_norm": 2.856426954269409, + "learning_rate": 4.547184123257555e-06, + "loss": 0.5734, + "step": 2550 + }, + { + "epoch": 1.206146572104019, + "grad_norm": 2.555682420730591, + "learning_rate": 4.5468259991737334e-06, + "loss": 0.5299, + "step": 2551 + }, + { + "epoch": 1.2066193853427896, + "grad_norm": 2.6324024200439453, + "learning_rate": 4.546467747643032e-06, + "loss": 0.5906, + "step": 2552 + }, + { + "epoch": 1.2070921985815604, + "grad_norm": 3.4145350456237793, + "learning_rate": 4.546109368687757e-06, + "loss": 0.5153, + "step": 2553 + }, + { + "epoch": 1.207565011820331, + "grad_norm": 2.658691644668579, + "learning_rate": 4.545750862330225e-06, + "loss": 0.5759, + "step": 2554 + }, + { + "epoch": 1.2080378250591017, + "grad_norm": 3.162605047225952, + "learning_rate": 4.545392228592755e-06, + "loss": 0.5379, + "step": 2555 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 2.8631198406219482, + "learning_rate": 4.545033467497681e-06, + "loss": 0.5959, + "step": 2556 + }, + { + "epoch": 1.208983451536643, + "grad_norm": 2.457109212875366, + "learning_rate": 4.54467457906734e-06, + "loss": 0.4864, + "step": 2557 + }, + { + "epoch": 1.2094562647754137, + "grad_norm": 2.5307061672210693, + "learning_rate": 4.544315563324078e-06, + "loss": 0.5308, + "step": 2558 + }, + { + "epoch": 1.2099290780141845, + "grad_norm": 2.8482773303985596, + "learning_rate": 4.543956420290251e-06, + "loss": 0.5126, + "step": 2559 + }, + { + "epoch": 1.210401891252955, + "grad_norm": 2.4990832805633545, + "learning_rate": 4.5435971499882195e-06, + "loss": 0.4534, + "step": 2560 + }, + { + "epoch": 1.2108747044917259, + "grad_norm": 2.6292665004730225, + "learning_rate": 4.543237752440354e-06, + "loss": 0.4434, + "step": 2561 + }, + { + "epoch": 1.2113475177304964, + "grad_norm": 2.865983247756958, + "learning_rate": 4.542878227669033e-06, + "loss": 0.5667, + "step": 2562 + }, + { + "epoch": 1.2118203309692672, + "grad_norm": 2.745614528656006, + "learning_rate": 4.542518575696644e-06, + "loss": 0.4724, + "step": 2563 + }, + { + "epoch": 1.2122931442080378, + "grad_norm": 2.8562581539154053, + "learning_rate": 4.5421587965455785e-06, + "loss": 0.5405, + "step": 2564 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 2.6670095920562744, + "learning_rate": 4.5417988902382385e-06, + "loss": 0.5432, + "step": 2565 + }, + { + "epoch": 1.2132387706855792, + "grad_norm": 
2.9320743083953857, + "learning_rate": 4.541438856797036e-06, + "loss": 0.5862, + "step": 2566 + }, + { + "epoch": 1.21371158392435, + "grad_norm": 2.577505588531494, + "learning_rate": 4.541078696244386e-06, + "loss": 0.4742, + "step": 2567 + }, + { + "epoch": 1.2141843971631205, + "grad_norm": 3.4476120471954346, + "learning_rate": 4.540718408602717e-06, + "loss": 0.5903, + "step": 2568 + }, + { + "epoch": 1.2146572104018913, + "grad_norm": 2.816210985183716, + "learning_rate": 4.540357993894459e-06, + "loss": 0.5033, + "step": 2569 + }, + { + "epoch": 1.215130023640662, + "grad_norm": 3.0806639194488525, + "learning_rate": 4.539997452142058e-06, + "loss": 0.6064, + "step": 2570 + }, + { + "epoch": 1.2156028368794327, + "grad_norm": 2.563060760498047, + "learning_rate": 4.5396367833679586e-06, + "loss": 0.5597, + "step": 2571 + }, + { + "epoch": 1.2160756501182033, + "grad_norm": 3.1014397144317627, + "learning_rate": 4.5392759875946215e-06, + "loss": 0.54, + "step": 2572 + }, + { + "epoch": 1.216548463356974, + "grad_norm": 3.124190330505371, + "learning_rate": 4.53891506484451e-06, + "loss": 0.5122, + "step": 2573 + }, + { + "epoch": 1.2170212765957447, + "grad_norm": 2.6688716411590576, + "learning_rate": 4.538554015140097e-06, + "loss": 0.5615, + "step": 2574 + }, + { + "epoch": 1.2174940898345155, + "grad_norm": 2.775543689727783, + "learning_rate": 4.538192838503866e-06, + "loss": 0.496, + "step": 2575 + }, + { + "epoch": 1.217966903073286, + "grad_norm": 2.7877283096313477, + "learning_rate": 4.537831534958303e-06, + "loss": 0.4995, + "step": 2576 + }, + { + "epoch": 1.2184397163120568, + "grad_norm": 2.824810028076172, + "learning_rate": 4.537470104525906e-06, + "loss": 0.5481, + "step": 2577 + }, + { + "epoch": 1.2189125295508274, + "grad_norm": 2.801269292831421, + "learning_rate": 4.53710854722918e-06, + "loss": 0.5628, + "step": 2578 + }, + { + "epoch": 1.2193853427895982, + "grad_norm": 2.7780683040618896, + "learning_rate": 4.536746863090637e-06, + "loss": 0.4845, + "step": 2579 + }, + { + "epoch": 1.2198581560283688, + "grad_norm": 2.536010265350342, + "learning_rate": 4.536385052132798e-06, + "loss": 0.4771, + "step": 2580 + }, + { + "epoch": 1.2203309692671396, + "grad_norm": 2.768775701522827, + "learning_rate": 4.536023114378191e-06, + "loss": 0.5366, + "step": 2581 + }, + { + "epoch": 1.2208037825059102, + "grad_norm": 2.658125877380371, + "learning_rate": 4.535661049849352e-06, + "loss": 0.524, + "step": 2582 + }, + { + "epoch": 1.2212765957446807, + "grad_norm": 2.558696746826172, + "learning_rate": 4.535298858568825e-06, + "loss": 0.5482, + "step": 2583 + }, + { + "epoch": 1.2217494089834515, + "grad_norm": 2.5284535884857178, + "learning_rate": 4.534936540559164e-06, + "loss": 0.4454, + "step": 2584 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 7.617330074310303, + "learning_rate": 4.534574095842927e-06, + "loss": 0.5615, + "step": 2585 + }, + { + "epoch": 1.222695035460993, + "grad_norm": 2.9120311737060547, + "learning_rate": 4.534211524442682e-06, + "loss": 0.5624, + "step": 2586 + }, + { + "epoch": 1.2231678486997635, + "grad_norm": 2.5004289150238037, + "learning_rate": 4.533848826381005e-06, + "loss": 0.4743, + "step": 2587 + }, + { + "epoch": 1.2236406619385343, + "grad_norm": 2.8395533561706543, + "learning_rate": 4.53348600168048e-06, + "loss": 0.4457, + "step": 2588 + }, + { + "epoch": 1.224113475177305, + "grad_norm": 2.832211494445801, + "learning_rate": 4.533123050363699e-06, + "loss": 0.5559, + "step": 2589 + }, + { + "epoch": 
1.2245862884160756, + "grad_norm": 2.6318583488464355, + "learning_rate": 4.53275997245326e-06, + "loss": 0.5281, + "step": 2590 + }, + { + "epoch": 1.2250591016548462, + "grad_norm": 3.0509233474731445, + "learning_rate": 4.532396767971771e-06, + "loss": 0.6003, + "step": 2591 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 2.6863620281219482, + "learning_rate": 4.532033436941847e-06, + "loss": 0.5219, + "step": 2592 + }, + { + "epoch": 1.2260047281323878, + "grad_norm": 2.401463747024536, + "learning_rate": 4.5316699793861104e-06, + "loss": 0.5994, + "step": 2593 + }, + { + "epoch": 1.2264775413711584, + "grad_norm": 2.613517999649048, + "learning_rate": 4.531306395327194e-06, + "loss": 0.5785, + "step": 2594 + }, + { + "epoch": 1.226950354609929, + "grad_norm": 2.5016374588012695, + "learning_rate": 4.530942684787735e-06, + "loss": 0.5695, + "step": 2595 + }, + { + "epoch": 1.2274231678486998, + "grad_norm": 2.576464891433716, + "learning_rate": 4.53057884779038e-06, + "loss": 0.4427, + "step": 2596 + }, + { + "epoch": 1.2278959810874706, + "grad_norm": 2.5688700675964355, + "learning_rate": 4.530214884357785e-06, + "loss": 0.4966, + "step": 2597 + }, + { + "epoch": 1.2283687943262411, + "grad_norm": 3.179013729095459, + "learning_rate": 4.52985079451261e-06, + "loss": 0.5239, + "step": 2598 + }, + { + "epoch": 1.2288416075650117, + "grad_norm": 2.6015284061431885, + "learning_rate": 4.529486578277527e-06, + "loss": 0.5135, + "step": 2599 + }, + { + "epoch": 1.2293144208037825, + "grad_norm": 2.3029589653015137, + "learning_rate": 4.529122235675214e-06, + "loss": 0.4044, + "step": 2600 + }, + { + "epoch": 1.2297872340425533, + "grad_norm": 2.994093656539917, + "learning_rate": 4.528757766728357e-06, + "loss": 0.5419, + "step": 2601 + }, + { + "epoch": 1.2302600472813239, + "grad_norm": 2.6297390460968018, + "learning_rate": 4.52839317145965e-06, + "loss": 0.488, + "step": 2602 + }, + { + "epoch": 1.2307328605200945, + "grad_norm": 2.4814043045043945, + "learning_rate": 4.528028449891793e-06, + "loss": 0.4917, + "step": 2603 + }, + { + "epoch": 1.2312056737588652, + "grad_norm": 3.6052863597869873, + "learning_rate": 4.527663602047499e-06, + "loss": 0.5301, + "step": 2604 + }, + { + "epoch": 1.231678486997636, + "grad_norm": 2.6984751224517822, + "learning_rate": 4.5272986279494825e-06, + "loss": 0.5253, + "step": 2605 + }, + { + "epoch": 1.2321513002364066, + "grad_norm": 2.514000415802002, + "learning_rate": 4.526933527620469e-06, + "loss": 0.5661, + "step": 2606 + }, + { + "epoch": 1.2326241134751772, + "grad_norm": 2.890921115875244, + "learning_rate": 4.526568301083195e-06, + "loss": 0.5585, + "step": 2607 + }, + { + "epoch": 1.233096926713948, + "grad_norm": 2.6390011310577393, + "learning_rate": 4.526202948360397e-06, + "loss": 0.5168, + "step": 2608 + }, + { + "epoch": 1.2335697399527188, + "grad_norm": 2.7370636463165283, + "learning_rate": 4.5258374694748266e-06, + "loss": 0.5453, + "step": 2609 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 2.8203976154327393, + "learning_rate": 4.52547186444924e-06, + "loss": 0.5763, + "step": 2610 + }, + { + "epoch": 1.23451536643026, + "grad_norm": 2.7567849159240723, + "learning_rate": 4.5251061333064025e-06, + "loss": 0.5194, + "step": 2611 + }, + { + "epoch": 1.2349881796690307, + "grad_norm": 2.767519474029541, + "learning_rate": 4.524740276069085e-06, + "loss": 0.5355, + "step": 2612 + }, + { + "epoch": 1.2354609929078015, + "grad_norm": 3.072035312652588, + "learning_rate": 4.5243742927600695e-06, + "loss": 0.5391, + 
"step": 2613 + }, + { + "epoch": 1.2359338061465721, + "grad_norm": 2.5957462787628174, + "learning_rate": 4.524008183402143e-06, + "loss": 0.5645, + "step": 2614 + }, + { + "epoch": 1.2364066193853427, + "grad_norm": 2.774897575378418, + "learning_rate": 4.523641948018101e-06, + "loss": 0.5576, + "step": 2615 + }, + { + "epoch": 1.2368794326241135, + "grad_norm": 2.635887622833252, + "learning_rate": 4.5232755866307496e-06, + "loss": 0.5254, + "step": 2616 + }, + { + "epoch": 1.2373522458628843, + "grad_norm": 2.4860997200012207, + "learning_rate": 4.522909099262899e-06, + "loss": 0.4692, + "step": 2617 + }, + { + "epoch": 1.2378250591016549, + "grad_norm": 2.595513105392456, + "learning_rate": 4.522542485937369e-06, + "loss": 0.5166, + "step": 2618 + }, + { + "epoch": 1.2382978723404254, + "grad_norm": 2.961474895477295, + "learning_rate": 4.522175746676986e-06, + "loss": 0.5455, + "step": 2619 + }, + { + "epoch": 1.2387706855791962, + "grad_norm": 2.813889741897583, + "learning_rate": 4.521808881504588e-06, + "loss": 0.5249, + "step": 2620 + }, + { + "epoch": 1.239243498817967, + "grad_norm": 2.8434813022613525, + "learning_rate": 4.521441890443015e-06, + "loss": 0.472, + "step": 2621 + }, + { + "epoch": 1.2397163120567376, + "grad_norm": 2.4264845848083496, + "learning_rate": 4.521074773515119e-06, + "loss": 0.4783, + "step": 2622 + }, + { + "epoch": 1.2401891252955082, + "grad_norm": 2.615169048309326, + "learning_rate": 4.520707530743761e-06, + "loss": 0.5324, + "step": 2623 + }, + { + "epoch": 1.240661938534279, + "grad_norm": 2.6772537231445312, + "learning_rate": 4.520340162151803e-06, + "loss": 0.5224, + "step": 2624 + }, + { + "epoch": 1.2411347517730495, + "grad_norm": 2.683393955230713, + "learning_rate": 4.519972667762124e-06, + "loss": 0.4863, + "step": 2625 + }, + { + "epoch": 1.2416075650118203, + "grad_norm": 3.0335750579833984, + "learning_rate": 4.519605047597603e-06, + "loss": 0.544, + "step": 2626 + }, + { + "epoch": 1.242080378250591, + "grad_norm": 2.8694353103637695, + "learning_rate": 4.519237301681132e-06, + "loss": 0.5576, + "step": 2627 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 3.217808246612549, + "learning_rate": 4.518869430035609e-06, + "loss": 0.5459, + "step": 2628 + }, + { + "epoch": 1.2430260047281323, + "grad_norm": 2.7700083255767822, + "learning_rate": 4.518501432683937e-06, + "loss": 0.5579, + "step": 2629 + }, + { + "epoch": 1.243498817966903, + "grad_norm": 2.4759175777435303, + "learning_rate": 4.5181333096490335e-06, + "loss": 0.5049, + "step": 2630 + }, + { + "epoch": 1.2439716312056737, + "grad_norm": 2.8652584552764893, + "learning_rate": 4.517765060953818e-06, + "loss": 0.5366, + "step": 2631 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 2.776334524154663, + "learning_rate": 4.517396686621218e-06, + "loss": 0.5677, + "step": 2632 + }, + { + "epoch": 1.244917257683215, + "grad_norm": 2.676708221435547, + "learning_rate": 4.517028186674174e-06, + "loss": 0.5055, + "step": 2633 + }, + { + "epoch": 1.2453900709219858, + "grad_norm": 2.6851537227630615, + "learning_rate": 4.516659561135629e-06, + "loss": 0.5537, + "step": 2634 + }, + { + "epoch": 1.2458628841607564, + "grad_norm": 2.619971513748169, + "learning_rate": 4.516290810028536e-06, + "loss": 0.5765, + "step": 2635 + }, + { + "epoch": 1.2463356973995272, + "grad_norm": 2.7302334308624268, + "learning_rate": 4.515921933375855e-06, + "loss": 0.5611, + "step": 2636 + }, + { + "epoch": 1.2468085106382978, + "grad_norm": 2.5005829334259033, + "learning_rate": 
4.5155529312005554e-06, + "loss": 0.442, + "step": 2637 + }, + { + "epoch": 1.2472813238770686, + "grad_norm": 2.713587522506714, + "learning_rate": 4.515183803525612e-06, + "loss": 0.5023, + "step": 2638 + }, + { + "epoch": 1.2477541371158392, + "grad_norm": 2.5146236419677734, + "learning_rate": 4.514814550374009e-06, + "loss": 0.5195, + "step": 2639 + }, + { + "epoch": 1.24822695035461, + "grad_norm": 2.761060953140259, + "learning_rate": 4.51444517176874e-06, + "loss": 0.5138, + "step": 2640 + }, + { + "epoch": 1.2486997635933805, + "grad_norm": 3.082329273223877, + "learning_rate": 4.5140756677328026e-06, + "loss": 0.6105, + "step": 2641 + }, + { + "epoch": 1.2491725768321513, + "grad_norm": 2.6933493614196777, + "learning_rate": 4.513706038289205e-06, + "loss": 0.5185, + "step": 2642 + }, + { + "epoch": 1.249645390070922, + "grad_norm": 2.515856981277466, + "learning_rate": 4.513336283460962e-06, + "loss": 0.5375, + "step": 2643 + }, + { + "epoch": 1.2501182033096927, + "grad_norm": 2.8553731441497803, + "learning_rate": 4.512966403271096e-06, + "loss": 0.5582, + "step": 2644 + }, + { + "epoch": 1.2505910165484633, + "grad_norm": 2.640880823135376, + "learning_rate": 4.5125963977426405e-06, + "loss": 0.5125, + "step": 2645 + }, + { + "epoch": 1.251063829787234, + "grad_norm": 2.9845943450927734, + "learning_rate": 4.512226266898631e-06, + "loss": 0.4749, + "step": 2646 + }, + { + "epoch": 1.2515366430260046, + "grad_norm": 2.5131032466888428, + "learning_rate": 4.511856010762116e-06, + "loss": 0.4764, + "step": 2647 + }, + { + "epoch": 1.2520094562647754, + "grad_norm": 2.370638370513916, + "learning_rate": 4.511485629356148e-06, + "loss": 0.5153, + "step": 2648 + }, + { + "epoch": 1.252482269503546, + "grad_norm": 2.912461996078491, + "learning_rate": 4.511115122703791e-06, + "loss": 0.6117, + "step": 2649 + }, + { + "epoch": 1.2529550827423168, + "grad_norm": 2.7308082580566406, + "learning_rate": 4.510744490828113e-06, + "loss": 0.5076, + "step": 2650 + }, + { + "epoch": 1.2534278959810874, + "grad_norm": 2.8524296283721924, + "learning_rate": 4.510373733752193e-06, + "loss": 0.542, + "step": 2651 + }, + { + "epoch": 1.2539007092198582, + "grad_norm": 2.799377202987671, + "learning_rate": 4.5100028514991145e-06, + "loss": 0.486, + "step": 2652 + }, + { + "epoch": 1.2543735224586288, + "grad_norm": 2.7248027324676514, + "learning_rate": 4.509631844091973e-06, + "loss": 0.4972, + "step": 2653 + }, + { + "epoch": 1.2548463356973996, + "grad_norm": 2.8041458129882812, + "learning_rate": 4.5092607115538686e-06, + "loss": 0.588, + "step": 2654 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 2.679417133331299, + "learning_rate": 4.50888945390791e-06, + "loss": 0.4639, + "step": 2655 + }, + { + "epoch": 1.255791962174941, + "grad_norm": 3.1049270629882812, + "learning_rate": 4.508518071177214e-06, + "loss": 0.5857, + "step": 2656 + }, + { + "epoch": 1.2562647754137115, + "grad_norm": 2.8590362071990967, + "learning_rate": 4.508146563384904e-06, + "loss": 0.5451, + "step": 2657 + }, + { + "epoch": 1.2567375886524823, + "grad_norm": 2.9774081707000732, + "learning_rate": 4.507774930554114e-06, + "loss": 0.5493, + "step": 2658 + }, + { + "epoch": 1.2572104018912529, + "grad_norm": 2.617643356323242, + "learning_rate": 4.507403172707983e-06, + "loss": 0.5472, + "step": 2659 + }, + { + "epoch": 1.2576832151300237, + "grad_norm": 2.9195587635040283, + "learning_rate": 4.507031289869658e-06, + "loss": 0.5403, + "step": 2660 + }, + { + "epoch": 1.2581560283687943, + "grad_norm": 
2.706089496612549, + "learning_rate": 4.506659282062295e-06, + "loss": 0.4899, + "step": 2661 + }, + { + "epoch": 1.258628841607565, + "grad_norm": 2.8229358196258545, + "learning_rate": 4.506287149309057e-06, + "loss": 0.5336, + "step": 2662 + }, + { + "epoch": 1.2591016548463356, + "grad_norm": 2.5295674800872803, + "learning_rate": 4.505914891633117e-06, + "loss": 0.4806, + "step": 2663 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 3.098208427429199, + "learning_rate": 4.505542509057651e-06, + "loss": 0.6039, + "step": 2664 + }, + { + "epoch": 1.260047281323877, + "grad_norm": 2.5118041038513184, + "learning_rate": 4.5051700016058475e-06, + "loss": 0.5279, + "step": 2665 + }, + { + "epoch": 1.2605200945626478, + "grad_norm": 2.6901369094848633, + "learning_rate": 4.5047973693009005e-06, + "loss": 0.5515, + "step": 2666 + }, + { + "epoch": 1.2609929078014184, + "grad_norm": 2.5622377395629883, + "learning_rate": 4.504424612166012e-06, + "loss": 0.5405, + "step": 2667 + }, + { + "epoch": 1.2614657210401892, + "grad_norm": 2.685751438140869, + "learning_rate": 4.5040517302243915e-06, + "loss": 0.5797, + "step": 2668 + }, + { + "epoch": 1.2619385342789597, + "grad_norm": 2.8525350093841553, + "learning_rate": 4.503678723499259e-06, + "loss": 0.5561, + "step": 2669 + }, + { + "epoch": 1.2624113475177305, + "grad_norm": 2.803386926651001, + "learning_rate": 4.503305592013836e-06, + "loss": 0.5376, + "step": 2670 + }, + { + "epoch": 1.2628841607565011, + "grad_norm": 2.78633189201355, + "learning_rate": 4.502932335791359e-06, + "loss": 0.4739, + "step": 2671 + }, + { + "epoch": 1.263356973995272, + "grad_norm": 2.8337297439575195, + "learning_rate": 4.502558954855069e-06, + "loss": 0.5406, + "step": 2672 + }, + { + "epoch": 1.2638297872340425, + "grad_norm": 2.610275983810425, + "learning_rate": 4.502185449228213e-06, + "loss": 0.5343, + "step": 2673 + }, + { + "epoch": 1.2643026004728133, + "grad_norm": 2.7842252254486084, + "learning_rate": 4.501811818934048e-06, + "loss": 0.532, + "step": 2674 + }, + { + "epoch": 1.2647754137115839, + "grad_norm": 2.4472389221191406, + "learning_rate": 4.501438063995839e-06, + "loss": 0.4976, + "step": 2675 + }, + { + "epoch": 1.2652482269503547, + "grad_norm": 3.076580762863159, + "learning_rate": 4.501064184436858e-06, + "loss": 0.507, + "step": 2676 + }, + { + "epoch": 1.2657210401891252, + "grad_norm": 2.5952908992767334, + "learning_rate": 4.500690180280384e-06, + "loss": 0.5498, + "step": 2677 + }, + { + "epoch": 1.266193853427896, + "grad_norm": 2.476943016052246, + "learning_rate": 4.500316051549706e-06, + "loss": 0.557, + "step": 2678 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 2.730579376220703, + "learning_rate": 4.499941798268118e-06, + "loss": 0.4975, + "step": 2679 + }, + { + "epoch": 1.2671394799054374, + "grad_norm": 2.7916698455810547, + "learning_rate": 4.499567420458924e-06, + "loss": 0.5673, + "step": 2680 + }, + { + "epoch": 1.267612293144208, + "grad_norm": 2.4249091148376465, + "learning_rate": 4.4991929181454355e-06, + "loss": 0.4836, + "step": 2681 + }, + { + "epoch": 1.2680851063829788, + "grad_norm": 2.661911725997925, + "learning_rate": 4.498818291350969e-06, + "loss": 0.5332, + "step": 2682 + }, + { + "epoch": 1.2685579196217494, + "grad_norm": 2.693657875061035, + "learning_rate": 4.498443540098852e-06, + "loss": 0.5257, + "step": 2683 + }, + { + "epoch": 1.2690307328605201, + "grad_norm": 2.609386682510376, + "learning_rate": 4.4980686644124195e-06, + "loss": 0.4918, + "step": 2684 + }, + { + "epoch": 
1.2695035460992907, + "grad_norm": 3.2104930877685547, + "learning_rate": 4.4976936643150124e-06, + "loss": 0.6097, + "step": 2685 + }, + { + "epoch": 1.2699763593380615, + "grad_norm": 2.707860231399536, + "learning_rate": 4.49731853982998e-06, + "loss": 0.5109, + "step": 2686 + }, + { + "epoch": 1.270449172576832, + "grad_norm": 3.5046379566192627, + "learning_rate": 4.49694329098068e-06, + "loss": 0.5883, + "step": 2687 + }, + { + "epoch": 1.270921985815603, + "grad_norm": 2.5362324714660645, + "learning_rate": 4.496567917790477e-06, + "loss": 0.5301, + "step": 2688 + }, + { + "epoch": 1.2713947990543735, + "grad_norm": 2.7095518112182617, + "learning_rate": 4.496192420282746e-06, + "loss": 0.4772, + "step": 2689 + }, + { + "epoch": 1.2718676122931443, + "grad_norm": 2.416433095932007, + "learning_rate": 4.495816798480865e-06, + "loss": 0.5012, + "step": 2690 + }, + { + "epoch": 1.2723404255319148, + "grad_norm": 2.5362391471862793, + "learning_rate": 4.495441052408224e-06, + "loss": 0.5197, + "step": 2691 + }, + { + "epoch": 1.2728132387706856, + "grad_norm": 2.9093947410583496, + "learning_rate": 4.495065182088218e-06, + "loss": 0.4893, + "step": 2692 + }, + { + "epoch": 1.2732860520094562, + "grad_norm": 2.520470142364502, + "learning_rate": 4.494689187544251e-06, + "loss": 0.5072, + "step": 2693 + }, + { + "epoch": 1.273758865248227, + "grad_norm": 2.4385125637054443, + "learning_rate": 4.494313068799735e-06, + "loss": 0.4923, + "step": 2694 + }, + { + "epoch": 1.2742316784869976, + "grad_norm": 2.636852502822876, + "learning_rate": 4.493936825878089e-06, + "loss": 0.5409, + "step": 2695 + }, + { + "epoch": 1.2747044917257684, + "grad_norm": 2.7027053833007812, + "learning_rate": 4.493560458802741e-06, + "loss": 0.5906, + "step": 2696 + }, + { + "epoch": 1.275177304964539, + "grad_norm": 2.58752179145813, + "learning_rate": 4.493183967597123e-06, + "loss": 0.5292, + "step": 2697 + }, + { + "epoch": 1.2756501182033098, + "grad_norm": 2.7658379077911377, + "learning_rate": 4.49280735228468e-06, + "loss": 0.5613, + "step": 2698 + }, + { + "epoch": 1.2761229314420803, + "grad_norm": 3.272688388824463, + "learning_rate": 4.492430612888861e-06, + "loss": 0.5654, + "step": 2699 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 2.806819438934326, + "learning_rate": 4.492053749433125e-06, + "loss": 0.5388, + "step": 2700 + }, + { + "epoch": 1.2770685579196217, + "grad_norm": 2.879727602005005, + "learning_rate": 4.491676761940936e-06, + "loss": 0.5033, + "step": 2701 + }, + { + "epoch": 1.2775413711583925, + "grad_norm": 2.733347177505493, + "learning_rate": 4.4912996504357695e-06, + "loss": 0.5113, + "step": 2702 + }, + { + "epoch": 1.278014184397163, + "grad_norm": 2.7431252002716064, + "learning_rate": 4.490922414941104e-06, + "loss": 0.5417, + "step": 2703 + }, + { + "epoch": 1.2784869976359339, + "grad_norm": 2.9287240505218506, + "learning_rate": 4.490545055480431e-06, + "loss": 0.5875, + "step": 2704 + }, + { + "epoch": 1.2789598108747045, + "grad_norm": 2.576775550842285, + "learning_rate": 4.490167572077244e-06, + "loss": 0.5176, + "step": 2705 + }, + { + "epoch": 1.2794326241134752, + "grad_norm": 2.4335594177246094, + "learning_rate": 4.4897899647550505e-06, + "loss": 0.4749, + "step": 2706 + }, + { + "epoch": 1.2799054373522458, + "grad_norm": 2.6798062324523926, + "learning_rate": 4.489412233537361e-06, + "loss": 0.5439, + "step": 2707 + }, + { + "epoch": 1.2803782505910166, + "grad_norm": 2.8440675735473633, + "learning_rate": 4.489034378447693e-06, + "loss": 0.552, + 
"step": 2708 + }, + { + "epoch": 1.2808510638297872, + "grad_norm": 2.9059503078460693, + "learning_rate": 4.488656399509577e-06, + "loss": 0.5667, + "step": 2709 + }, + { + "epoch": 1.281323877068558, + "grad_norm": 2.7415006160736084, + "learning_rate": 4.488278296746548e-06, + "loss": 0.5676, + "step": 2710 + }, + { + "epoch": 1.2817966903073286, + "grad_norm": 2.4584875106811523, + "learning_rate": 4.487900070182147e-06, + "loss": 0.4787, + "step": 2711 + }, + { + "epoch": 1.2822695035460994, + "grad_norm": 2.990940809249878, + "learning_rate": 4.487521719839924e-06, + "loss": 0.5239, + "step": 2712 + }, + { + "epoch": 1.28274231678487, + "grad_norm": 3.075201988220215, + "learning_rate": 4.487143245743441e-06, + "loss": 0.5103, + "step": 2713 + }, + { + "epoch": 1.2832151300236407, + "grad_norm": 2.543341875076294, + "learning_rate": 4.486764647916259e-06, + "loss": 0.5475, + "step": 2714 + }, + { + "epoch": 1.2836879432624113, + "grad_norm": 2.9927213191986084, + "learning_rate": 4.486385926381957e-06, + "loss": 0.4923, + "step": 2715 + }, + { + "epoch": 1.284160756501182, + "grad_norm": 2.4220657348632812, + "learning_rate": 4.486007081164111e-06, + "loss": 0.543, + "step": 2716 + }, + { + "epoch": 1.2846335697399527, + "grad_norm": 2.468214988708496, + "learning_rate": 4.4856281122863134e-06, + "loss": 0.5248, + "step": 2717 + }, + { + "epoch": 1.2851063829787235, + "grad_norm": 2.633711099624634, + "learning_rate": 4.48524901977216e-06, + "loss": 0.4764, + "step": 2718 + }, + { + "epoch": 1.285579196217494, + "grad_norm": 2.8399546146392822, + "learning_rate": 4.484869803645254e-06, + "loss": 0.5503, + "step": 2719 + }, + { + "epoch": 1.2860520094562649, + "grad_norm": 2.769063949584961, + "learning_rate": 4.484490463929209e-06, + "loss": 0.5468, + "step": 2720 + }, + { + "epoch": 1.2865248226950354, + "grad_norm": 2.617863893508911, + "learning_rate": 4.4841110006476465e-06, + "loss": 0.5906, + "step": 2721 + }, + { + "epoch": 1.2869976359338062, + "grad_norm": 2.7639541625976562, + "learning_rate": 4.4837314138241905e-06, + "loss": 0.552, + "step": 2722 + }, + { + "epoch": 1.2874704491725768, + "grad_norm": 2.7711129188537598, + "learning_rate": 4.483351703482478e-06, + "loss": 0.5229, + "step": 2723 + }, + { + "epoch": 1.2879432624113476, + "grad_norm": 2.611205577850342, + "learning_rate": 4.482971869646152e-06, + "loss": 0.5055, + "step": 2724 + }, + { + "epoch": 1.2884160756501182, + "grad_norm": 2.8602211475372314, + "learning_rate": 4.482591912338862e-06, + "loss": 0.5561, + "step": 2725 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 2.5882298946380615, + "learning_rate": 4.4822118315842675e-06, + "loss": 0.5555, + "step": 2726 + }, + { + "epoch": 1.2893617021276595, + "grad_norm": 2.7533531188964844, + "learning_rate": 4.481831627406033e-06, + "loss": 0.5346, + "step": 2727 + }, + { + "epoch": 1.2898345153664303, + "grad_norm": 2.4296958446502686, + "learning_rate": 4.481451299827835e-06, + "loss": 0.4915, + "step": 2728 + }, + { + "epoch": 1.290307328605201, + "grad_norm": 2.4403445720672607, + "learning_rate": 4.481070848873352e-06, + "loss": 0.5648, + "step": 2729 + }, + { + "epoch": 1.2907801418439715, + "grad_norm": 2.473224401473999, + "learning_rate": 4.480690274566274e-06, + "loss": 0.4849, + "step": 2730 + }, + { + "epoch": 1.2912529550827423, + "grad_norm": 2.637899875640869, + "learning_rate": 4.480309576930297e-06, + "loss": 0.4968, + "step": 2731 + }, + { + "epoch": 1.291725768321513, + "grad_norm": 2.7156927585601807, + "learning_rate": 
4.479928755989127e-06, + "loss": 0.4759, + "step": 2732 + }, + { + "epoch": 1.2921985815602837, + "grad_norm": 2.632786989212036, + "learning_rate": 4.479547811766475e-06, + "loss": 0.5468, + "step": 2733 + }, + { + "epoch": 1.2926713947990542, + "grad_norm": 2.529218912124634, + "learning_rate": 4.479166744286061e-06, + "loss": 0.4852, + "step": 2734 + }, + { + "epoch": 1.293144208037825, + "grad_norm": 2.561978340148926, + "learning_rate": 4.4787855535716115e-06, + "loss": 0.546, + "step": 2735 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 2.3684909343719482, + "learning_rate": 4.478404239646862e-06, + "loss": 0.5369, + "step": 2736 + }, + { + "epoch": 1.2940898345153664, + "grad_norm": 2.8940367698669434, + "learning_rate": 4.4780228025355566e-06, + "loss": 0.568, + "step": 2737 + }, + { + "epoch": 1.294562647754137, + "grad_norm": 2.6950316429138184, + "learning_rate": 4.477641242261445e-06, + "loss": 0.4576, + "step": 2738 + }, + { + "epoch": 1.2950354609929078, + "grad_norm": 2.4211716651916504, + "learning_rate": 4.4772595588482835e-06, + "loss": 0.4341, + "step": 2739 + }, + { + "epoch": 1.2955082742316786, + "grad_norm": 3.141097068786621, + "learning_rate": 4.47687775231984e-06, + "loss": 0.5944, + "step": 2740 + }, + { + "epoch": 1.2959810874704492, + "grad_norm": 3.077522039413452, + "learning_rate": 4.476495822699887e-06, + "loss": 0.5786, + "step": 2741 + }, + { + "epoch": 1.2964539007092197, + "grad_norm": 2.708139419555664, + "learning_rate": 4.476113770012206e-06, + "loss": 0.5014, + "step": 2742 + }, + { + "epoch": 1.2969267139479905, + "grad_norm": 2.7572035789489746, + "learning_rate": 4.475731594280586e-06, + "loss": 0.594, + "step": 2743 + }, + { + "epoch": 1.2973995271867613, + "grad_norm": 2.673126459121704, + "learning_rate": 4.475349295528822e-06, + "loss": 0.5317, + "step": 2744 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 2.6757819652557373, + "learning_rate": 4.4749668737807195e-06, + "loss": 0.5614, + "step": 2745 + }, + { + "epoch": 1.2983451536643025, + "grad_norm": 2.7077620029449463, + "learning_rate": 4.47458432906009e-06, + "loss": 0.4916, + "step": 2746 + }, + { + "epoch": 1.2988179669030733, + "grad_norm": 2.446570873260498, + "learning_rate": 4.474201661390752e-06, + "loss": 0.5005, + "step": 2747 + }, + { + "epoch": 1.299290780141844, + "grad_norm": 2.642695665359497, + "learning_rate": 4.473818870796533e-06, + "loss": 0.5048, + "step": 2748 + }, + { + "epoch": 1.2997635933806146, + "grad_norm": 2.519824743270874, + "learning_rate": 4.4734359573012686e-06, + "loss": 0.5131, + "step": 2749 + }, + { + "epoch": 1.3002364066193852, + "grad_norm": 2.5901925563812256, + "learning_rate": 4.4730529209287995e-06, + "loss": 0.4582, + "step": 2750 + }, + { + "epoch": 1.300709219858156, + "grad_norm": 2.6789121627807617, + "learning_rate": 4.472669761702978e-06, + "loss": 0.5685, + "step": 2751 + }, + { + "epoch": 1.3011820330969268, + "grad_norm": 2.408003807067871, + "learning_rate": 4.472286479647659e-06, + "loss": 0.4329, + "step": 2752 + }, + { + "epoch": 1.3016548463356974, + "grad_norm": 2.681403398513794, + "learning_rate": 4.47190307478671e-06, + "loss": 0.4853, + "step": 2753 + }, + { + "epoch": 1.302127659574468, + "grad_norm": 2.9923183917999268, + "learning_rate": 4.4715195471440025e-06, + "loss": 0.5184, + "step": 2754 + }, + { + "epoch": 1.3026004728132388, + "grad_norm": 2.5100321769714355, + "learning_rate": 4.471135896743418e-06, + "loss": 0.5148, + "step": 2755 + }, + { + "epoch": 1.3030732860520096, + "grad_norm": 
2.267881393432617, + "learning_rate": 4.4707521236088444e-06, + "loss": 0.5028, + "step": 2756 + }, + { + "epoch": 1.3035460992907801, + "grad_norm": 2.7779829502105713, + "learning_rate": 4.4703682277641775e-06, + "loss": 0.5724, + "step": 2757 + }, + { + "epoch": 1.3040189125295507, + "grad_norm": 2.4262194633483887, + "learning_rate": 4.4699842092333205e-06, + "loss": 0.5341, + "step": 2758 + }, + { + "epoch": 1.3044917257683215, + "grad_norm": 2.8682050704956055, + "learning_rate": 4.469600068040185e-06, + "loss": 0.6114, + "step": 2759 + }, + { + "epoch": 1.3049645390070923, + "grad_norm": 2.647853374481201, + "learning_rate": 4.46921580420869e-06, + "loss": 0.5107, + "step": 2760 + }, + { + "epoch": 1.3054373522458629, + "grad_norm": 2.561998128890991, + "learning_rate": 4.468831417762762e-06, + "loss": 0.6019, + "step": 2761 + }, + { + "epoch": 1.3059101654846335, + "grad_norm": 2.763425350189209, + "learning_rate": 4.468446908726334e-06, + "loss": 0.572, + "step": 2762 + }, + { + "epoch": 1.3063829787234043, + "grad_norm": 2.7052934169769287, + "learning_rate": 4.468062277123348e-06, + "loss": 0.4876, + "step": 2763 + }, + { + "epoch": 1.306855791962175, + "grad_norm": 2.997845411300659, + "learning_rate": 4.467677522977755e-06, + "loss": 0.5683, + "step": 2764 + }, + { + "epoch": 1.3073286052009456, + "grad_norm": 2.503129005432129, + "learning_rate": 4.46729264631351e-06, + "loss": 0.4951, + "step": 2765 + }, + { + "epoch": 1.3078014184397162, + "grad_norm": 2.617492437362671, + "learning_rate": 4.466907647154578e-06, + "loss": 0.5054, + "step": 2766 + }, + { + "epoch": 1.308274231678487, + "grad_norm": 2.934967279434204, + "learning_rate": 4.4665225255249315e-06, + "loss": 0.5299, + "step": 2767 + }, + { + "epoch": 1.3087470449172578, + "grad_norm": 2.787252187728882, + "learning_rate": 4.46613728144855e-06, + "loss": 0.4652, + "step": 2768 + }, + { + "epoch": 1.3092198581560284, + "grad_norm": 2.567439556121826, + "learning_rate": 4.465751914949422e-06, + "loss": 0.538, + "step": 2769 + }, + { + "epoch": 1.309692671394799, + "grad_norm": 2.6386024951934814, + "learning_rate": 4.4653664260515416e-06, + "loss": 0.464, + "step": 2770 + }, + { + "epoch": 1.3101654846335697, + "grad_norm": 2.966848134994507, + "learning_rate": 4.464980814778912e-06, + "loss": 0.4889, + "step": 2771 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 2.571256637573242, + "learning_rate": 4.464595081155542e-06, + "loss": 0.4979, + "step": 2772 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 2.774203062057495, + "learning_rate": 4.4642092252054515e-06, + "loss": 0.5366, + "step": 2773 + }, + { + "epoch": 1.3115839243498817, + "grad_norm": 2.682969331741333, + "learning_rate": 4.463823246952666e-06, + "loss": 0.5118, + "step": 2774 + }, + { + "epoch": 1.3120567375886525, + "grad_norm": 2.4873905181884766, + "learning_rate": 4.463437146421217e-06, + "loss": 0.5548, + "step": 2775 + }, + { + "epoch": 1.3125295508274233, + "grad_norm": 2.6769661903381348, + "learning_rate": 4.463050923635147e-06, + "loss": 0.5023, + "step": 2776 + }, + { + "epoch": 1.3130023640661939, + "grad_norm": 2.7190892696380615, + "learning_rate": 4.462664578618503e-06, + "loss": 0.5546, + "step": 2777 + }, + { + "epoch": 1.3134751773049644, + "grad_norm": 2.8193624019622803, + "learning_rate": 4.462278111395343e-06, + "loss": 0.5265, + "step": 2778 + }, + { + "epoch": 1.3139479905437352, + "grad_norm": 2.7324538230895996, + "learning_rate": 4.461891521989728e-06, + "loss": 0.5449, + "step": 2779 + }, + { + "epoch": 
1.314420803782506, + "grad_norm": 2.87320876121521, + "learning_rate": 4.4615048104257305e-06, + "loss": 0.5367, + "step": 2780 + }, + { + "epoch": 1.3148936170212766, + "grad_norm": 2.6777031421661377, + "learning_rate": 4.4611179767274306e-06, + "loss": 0.5026, + "step": 2781 + }, + { + "epoch": 1.3153664302600472, + "grad_norm": 3.714524269104004, + "learning_rate": 4.460731020918913e-06, + "loss": 0.569, + "step": 2782 + }, + { + "epoch": 1.315839243498818, + "grad_norm": 2.7493600845336914, + "learning_rate": 4.460343943024273e-06, + "loss": 0.5826, + "step": 2783 + }, + { + "epoch": 1.3163120567375888, + "grad_norm": 2.6544079780578613, + "learning_rate": 4.459956743067609e-06, + "loss": 0.5399, + "step": 2784 + }, + { + "epoch": 1.3167848699763594, + "grad_norm": 2.4338037967681885, + "learning_rate": 4.459569421073036e-06, + "loss": 0.5186, + "step": 2785 + }, + { + "epoch": 1.31725768321513, + "grad_norm": 2.9312374591827393, + "learning_rate": 4.459181977064665e-06, + "loss": 0.5571, + "step": 2786 + }, + { + "epoch": 1.3177304964539007, + "grad_norm": 2.5988922119140625, + "learning_rate": 4.458794411066624e-06, + "loss": 0.5926, + "step": 2787 + }, + { + "epoch": 1.3182033096926715, + "grad_norm": 2.5193772315979004, + "learning_rate": 4.458406723103044e-06, + "loss": 0.5243, + "step": 2788 + }, + { + "epoch": 1.318676122931442, + "grad_norm": 2.8653743267059326, + "learning_rate": 4.458018913198066e-06, + "loss": 0.5421, + "step": 2789 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 2.486245632171631, + "learning_rate": 4.457630981375834e-06, + "loss": 0.4862, + "step": 2790 + }, + { + "epoch": 1.3196217494089835, + "grad_norm": 3.155435800552368, + "learning_rate": 4.457242927660506e-06, + "loss": 0.5386, + "step": 2791 + }, + { + "epoch": 1.3200945626477543, + "grad_norm": 3.102023124694824, + "learning_rate": 4.456854752076242e-06, + "loss": 0.5527, + "step": 2792 + }, + { + "epoch": 1.3205673758865248, + "grad_norm": 2.7995986938476562, + "learning_rate": 4.456466454647215e-06, + "loss": 0.4364, + "step": 2793 + }, + { + "epoch": 1.3210401891252954, + "grad_norm": 2.8328311443328857, + "learning_rate": 4.456078035397599e-06, + "loss": 0.5516, + "step": 2794 + }, + { + "epoch": 1.3215130023640662, + "grad_norm": 2.606161594390869, + "learning_rate": 4.455689494351581e-06, + "loss": 0.5042, + "step": 2795 + }, + { + "epoch": 1.321985815602837, + "grad_norm": 2.6344757080078125, + "learning_rate": 4.455300831533354e-06, + "loss": 0.4807, + "step": 2796 + }, + { + "epoch": 1.3224586288416076, + "grad_norm": 2.8539786338806152, + "learning_rate": 4.454912046967118e-06, + "loss": 0.4694, + "step": 2797 + }, + { + "epoch": 1.3229314420803782, + "grad_norm": 2.849066734313965, + "learning_rate": 4.454523140677081e-06, + "loss": 0.5037, + "step": 2798 + }, + { + "epoch": 1.323404255319149, + "grad_norm": 2.6803371906280518, + "learning_rate": 4.454134112687458e-06, + "loss": 0.4959, + "step": 2799 + }, + { + "epoch": 1.3238770685579198, + "grad_norm": 3.0546066761016846, + "learning_rate": 4.453744963022473e-06, + "loss": 0.5935, + "step": 2800 + }, + { + "epoch": 1.3243498817966903, + "grad_norm": 2.625602960586548, + "learning_rate": 4.453355691706356e-06, + "loss": 0.5349, + "step": 2801 + }, + { + "epoch": 1.324822695035461, + "grad_norm": 2.7568554878234863, + "learning_rate": 4.452966298763345e-06, + "loss": 0.5012, + "step": 2802 + }, + { + "epoch": 1.3252955082742317, + "grad_norm": 2.940427303314209, + "learning_rate": 4.452576784217686e-06, + "loss": 0.5246, + 
"step": 2803 + }, + { + "epoch": 1.3257683215130025, + "grad_norm": 2.5485289096832275, + "learning_rate": 4.452187148093633e-06, + "loss": 0.5282, + "step": 2804 + }, + { + "epoch": 1.326241134751773, + "grad_norm": 2.8152987957000732, + "learning_rate": 4.4517973904154455e-06, + "loss": 0.5468, + "step": 2805 + }, + { + "epoch": 1.3267139479905437, + "grad_norm": 2.9399688243865967, + "learning_rate": 4.451407511207393e-06, + "loss": 0.5586, + "step": 2806 + }, + { + "epoch": 1.3271867612293144, + "grad_norm": 2.3870036602020264, + "learning_rate": 4.451017510493751e-06, + "loss": 0.4807, + "step": 2807 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 3.4667887687683105, + "learning_rate": 4.450627388298805e-06, + "loss": 0.5571, + "step": 2808 + }, + { + "epoch": 1.3281323877068558, + "grad_norm": 2.685986042022705, + "learning_rate": 4.450237144646844e-06, + "loss": 0.5525, + "step": 2809 + }, + { + "epoch": 1.3286052009456264, + "grad_norm": 2.8529131412506104, + "learning_rate": 4.449846779562168e-06, + "loss": 0.491, + "step": 2810 + }, + { + "epoch": 1.3290780141843972, + "grad_norm": 2.7360332012176514, + "learning_rate": 4.449456293069082e-06, + "loss": 0.5574, + "step": 2811 + }, + { + "epoch": 1.3295508274231678, + "grad_norm": 2.4656026363372803, + "learning_rate": 4.4490656851919015e-06, + "loss": 0.4678, + "step": 2812 + }, + { + "epoch": 1.3300236406619386, + "grad_norm": 2.602651357650757, + "learning_rate": 4.448674955954947e-06, + "loss": 0.5118, + "step": 2813 + }, + { + "epoch": 1.3304964539007091, + "grad_norm": 3.0129756927490234, + "learning_rate": 4.448284105382548e-06, + "loss": 0.6136, + "step": 2814 + }, + { + "epoch": 1.33096926713948, + "grad_norm": 2.8499927520751953, + "learning_rate": 4.447893133499039e-06, + "loss": 0.5286, + "step": 2815 + }, + { + "epoch": 1.3314420803782505, + "grad_norm": 2.8320744037628174, + "learning_rate": 4.447502040328767e-06, + "loss": 0.5186, + "step": 2816 + }, + { + "epoch": 1.3319148936170213, + "grad_norm": 2.499950885772705, + "learning_rate": 4.447110825896084e-06, + "loss": 0.5338, + "step": 2817 + }, + { + "epoch": 1.3323877068557919, + "grad_norm": 2.530895233154297, + "learning_rate": 4.446719490225346e-06, + "loss": 0.5151, + "step": 2818 + }, + { + "epoch": 1.3328605200945627, + "grad_norm": 2.5276098251342773, + "learning_rate": 4.446328033340921e-06, + "loss": 0.5424, + "step": 2819 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.90218186378479, + "learning_rate": 4.4459364552671845e-06, + "loss": 0.5747, + "step": 2820 + }, + { + "epoch": 1.333806146572104, + "grad_norm": 2.500943183898926, + "learning_rate": 4.445544756028518e-06, + "loss": 0.5459, + "step": 2821 + }, + { + "epoch": 1.3342789598108746, + "grad_norm": 2.960374355316162, + "learning_rate": 4.44515293564931e-06, + "loss": 0.6092, + "step": 2822 + }, + { + "epoch": 1.3347517730496454, + "grad_norm": 2.813671827316284, + "learning_rate": 4.444760994153958e-06, + "loss": 0.5536, + "step": 2823 + }, + { + "epoch": 1.335224586288416, + "grad_norm": 2.7147483825683594, + "learning_rate": 4.444368931566867e-06, + "loss": 0.5291, + "step": 2824 + }, + { + "epoch": 1.3356973995271868, + "grad_norm": 2.710101842880249, + "learning_rate": 4.443976747912447e-06, + "loss": 0.5138, + "step": 2825 + }, + { + "epoch": 1.3361702127659574, + "grad_norm": 2.711419105529785, + "learning_rate": 4.443584443215121e-06, + "loss": 0.5223, + "step": 2826 + }, + { + "epoch": 1.3366430260047282, + "grad_norm": 2.887472152709961, + "learning_rate": 
4.443192017499313e-06, + "loss": 0.5464, + "step": 2827 + }, + { + "epoch": 1.3371158392434987, + "grad_norm": 2.8867223262786865, + "learning_rate": 4.4427994707894585e-06, + "loss": 0.5748, + "step": 2828 + }, + { + "epoch": 1.3375886524822695, + "grad_norm": 2.407247543334961, + "learning_rate": 4.44240680311e-06, + "loss": 0.4727, + "step": 2829 + }, + { + "epoch": 1.3380614657210401, + "grad_norm": 2.578420877456665, + "learning_rate": 4.4420140144853865e-06, + "loss": 0.5129, + "step": 2830 + }, + { + "epoch": 1.338534278959811, + "grad_norm": 2.884373426437378, + "learning_rate": 4.441621104940077e-06, + "loss": 0.5366, + "step": 2831 + }, + { + "epoch": 1.3390070921985815, + "grad_norm": 2.8652374744415283, + "learning_rate": 4.441228074498534e-06, + "loss": 0.5045, + "step": 2832 + }, + { + "epoch": 1.3394799054373523, + "grad_norm": 2.5380210876464844, + "learning_rate": 4.440834923185231e-06, + "loss": 0.509, + "step": 2833 + }, + { + "epoch": 1.3399527186761229, + "grad_norm": 2.415734052658081, + "learning_rate": 4.440441651024648e-06, + "loss": 0.5066, + "step": 2834 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 2.503051996231079, + "learning_rate": 4.440048258041272e-06, + "loss": 0.5118, + "step": 2835 + }, + { + "epoch": 1.3408983451536642, + "grad_norm": 3.351001024246216, + "learning_rate": 4.439654744259598e-06, + "loss": 0.5758, + "step": 2836 + }, + { + "epoch": 1.341371158392435, + "grad_norm": 2.7368781566619873, + "learning_rate": 4.439261109704129e-06, + "loss": 0.5674, + "step": 2837 + }, + { + "epoch": 1.3418439716312056, + "grad_norm": 3.008199453353882, + "learning_rate": 4.438867354399372e-06, + "loss": 0.5891, + "step": 2838 + }, + { + "epoch": 1.3423167848699764, + "grad_norm": 2.538907766342163, + "learning_rate": 4.438473478369847e-06, + "loss": 0.5102, + "step": 2839 + }, + { + "epoch": 1.342789598108747, + "grad_norm": 2.7169063091278076, + "learning_rate": 4.438079481640079e-06, + "loss": 0.6131, + "step": 2840 + }, + { + "epoch": 1.3432624113475178, + "grad_norm": 2.7411608695983887, + "learning_rate": 4.437685364234601e-06, + "loss": 0.5337, + "step": 2841 + }, + { + "epoch": 1.3437352245862884, + "grad_norm": 3.2374939918518066, + "learning_rate": 4.43729112617795e-06, + "loss": 0.5401, + "step": 2842 + }, + { + "epoch": 1.3442080378250592, + "grad_norm": 2.4712226390838623, + "learning_rate": 4.436896767494676e-06, + "loss": 0.5365, + "step": 2843 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 2.661619186401367, + "learning_rate": 4.436502288209334e-06, + "loss": 0.4919, + "step": 2844 + }, + { + "epoch": 1.3451536643026005, + "grad_norm": 2.5943779945373535, + "learning_rate": 4.4361076883464845e-06, + "loss": 0.5253, + "step": 2845 + }, + { + "epoch": 1.345626477541371, + "grad_norm": 2.672297477722168, + "learning_rate": 4.4357129679307e-06, + "loss": 0.541, + "step": 2846 + }, + { + "epoch": 1.346099290780142, + "grad_norm": 2.6830925941467285, + "learning_rate": 4.435318126986557e-06, + "loss": 0.5641, + "step": 2847 + }, + { + "epoch": 1.3465721040189125, + "grad_norm": 2.7394626140594482, + "learning_rate": 4.434923165538639e-06, + "loss": 0.5591, + "step": 2848 + }, + { + "epoch": 1.3470449172576833, + "grad_norm": 2.9656317234039307, + "learning_rate": 4.434528083611541e-06, + "loss": 0.515, + "step": 2849 + }, + { + "epoch": 1.3475177304964538, + "grad_norm": 3.30155086517334, + "learning_rate": 4.434132881229861e-06, + "loss": 0.5871, + "step": 2850 + }, + { + "epoch": 1.3479905437352246, + "grad_norm": 
2.6222476959228516, + "learning_rate": 4.433737558418209e-06, + "loss": 0.5143, + "step": 2851 + }, + { + "epoch": 1.3484633569739952, + "grad_norm": 2.903158187866211, + "learning_rate": 4.4333421152011965e-06, + "loss": 0.4484, + "step": 2852 + }, + { + "epoch": 1.348936170212766, + "grad_norm": 2.863116979598999, + "learning_rate": 4.432946551603449e-06, + "loss": 0.5213, + "step": 2853 + }, + { + "epoch": 1.3494089834515366, + "grad_norm": 2.8253962993621826, + "learning_rate": 4.432550867649596e-06, + "loss": 0.5713, + "step": 2854 + }, + { + "epoch": 1.3498817966903074, + "grad_norm": 2.652493953704834, + "learning_rate": 4.432155063364273e-06, + "loss": 0.5559, + "step": 2855 + }, + { + "epoch": 1.350354609929078, + "grad_norm": 2.4289376735687256, + "learning_rate": 4.431759138772127e-06, + "loss": 0.5122, + "step": 2856 + }, + { + "epoch": 1.3508274231678488, + "grad_norm": 2.6329853534698486, + "learning_rate": 4.43136309389781e-06, + "loss": 0.5332, + "step": 2857 + }, + { + "epoch": 1.3513002364066193, + "grad_norm": 2.431103229522705, + "learning_rate": 4.430966928765982e-06, + "loss": 0.4863, + "step": 2858 + }, + { + "epoch": 1.3517730496453901, + "grad_norm": 2.7529025077819824, + "learning_rate": 4.4305706434013106e-06, + "loss": 0.5263, + "step": 2859 + }, + { + "epoch": 1.3522458628841607, + "grad_norm": 2.884605646133423, + "learning_rate": 4.43017423782847e-06, + "loss": 0.564, + "step": 2860 + }, + { + "epoch": 1.3527186761229315, + "grad_norm": 3.027771234512329, + "learning_rate": 4.4297777120721435e-06, + "loss": 0.5846, + "step": 2861 + }, + { + "epoch": 1.353191489361702, + "grad_norm": 3.0140626430511475, + "learning_rate": 4.4293810661570205e-06, + "loss": 0.6621, + "step": 2862 + }, + { + "epoch": 1.3536643026004729, + "grad_norm": 2.721799612045288, + "learning_rate": 4.428984300107799e-06, + "loss": 0.5566, + "step": 2863 + }, + { + "epoch": 1.3541371158392435, + "grad_norm": 3.0016496181488037, + "learning_rate": 4.428587413949183e-06, + "loss": 0.5525, + "step": 2864 + }, + { + "epoch": 1.3546099290780143, + "grad_norm": 2.77138614654541, + "learning_rate": 4.428190407705886e-06, + "loss": 0.6016, + "step": 2865 + }, + { + "epoch": 1.3550827423167848, + "grad_norm": 2.9783477783203125, + "learning_rate": 4.427793281402627e-06, + "loss": 0.5556, + "step": 2866 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 2.2490382194519043, + "learning_rate": 4.427396035064132e-06, + "loss": 0.5138, + "step": 2867 + }, + { + "epoch": 1.3560283687943262, + "grad_norm": 2.442225217819214, + "learning_rate": 4.426998668715139e-06, + "loss": 0.4843, + "step": 2868 + }, + { + "epoch": 1.356501182033097, + "grad_norm": 2.74040150642395, + "learning_rate": 4.426601182380388e-06, + "loss": 0.54, + "step": 2869 + }, + { + "epoch": 1.3569739952718676, + "grad_norm": 2.4434332847595215, + "learning_rate": 4.426203576084629e-06, + "loss": 0.5199, + "step": 2870 + }, + { + "epoch": 1.3574468085106384, + "grad_norm": 2.6380388736724854, + "learning_rate": 4.42580584985262e-06, + "loss": 0.5049, + "step": 2871 + }, + { + "epoch": 1.357919621749409, + "grad_norm": 2.7324254512786865, + "learning_rate": 4.425408003709125e-06, + "loss": 0.5036, + "step": 2872 + }, + { + "epoch": 1.3583924349881797, + "grad_norm": 2.661012649536133, + "learning_rate": 4.425010037678916e-06, + "loss": 0.4965, + "step": 2873 + }, + { + "epoch": 1.3588652482269503, + "grad_norm": 2.5380208492279053, + "learning_rate": 4.424611951786773e-06, + "loss": 0.4293, + "step": 2874 + }, + { + "epoch": 
1.3593380614657211, + "grad_norm": 2.6060714721679688, + "learning_rate": 4.424213746057483e-06, + "loss": 0.5335, + "step": 2875 + }, + { + "epoch": 1.3598108747044917, + "grad_norm": 2.98282527923584, + "learning_rate": 4.423815420515841e-06, + "loss": 0.5626, + "step": 2876 + }, + { + "epoch": 1.3602836879432625, + "grad_norm": 2.779371500015259, + "learning_rate": 4.423416975186647e-06, + "loss": 0.5353, + "step": 2877 + }, + { + "epoch": 1.360756501182033, + "grad_norm": 2.8033530712127686, + "learning_rate": 4.423018410094713e-06, + "loss": 0.538, + "step": 2878 + }, + { + "epoch": 1.3612293144208039, + "grad_norm": 3.225177764892578, + "learning_rate": 4.422619725264855e-06, + "loss": 0.5441, + "step": 2879 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 2.959135055541992, + "learning_rate": 4.422220920721896e-06, + "loss": 0.5293, + "step": 2880 + }, + { + "epoch": 1.3621749408983452, + "grad_norm": 2.5558884143829346, + "learning_rate": 4.4218219964906704e-06, + "loss": 0.442, + "step": 2881 + }, + { + "epoch": 1.3626477541371158, + "grad_norm": 2.694899797439575, + "learning_rate": 4.421422952596015e-06, + "loss": 0.5318, + "step": 2882 + }, + { + "epoch": 1.3631205673758866, + "grad_norm": 2.7909531593322754, + "learning_rate": 4.421023789062777e-06, + "loss": 0.6648, + "step": 2883 + }, + { + "epoch": 1.3635933806146572, + "grad_norm": 2.421995162963867, + "learning_rate": 4.420624505915813e-06, + "loss": 0.4644, + "step": 2884 + }, + { + "epoch": 1.364066193853428, + "grad_norm": 2.5876688957214355, + "learning_rate": 4.420225103179981e-06, + "loss": 0.5743, + "step": 2885 + }, + { + "epoch": 1.3645390070921986, + "grad_norm": 2.89341139793396, + "learning_rate": 4.419825580880152e-06, + "loss": 0.5454, + "step": 2886 + }, + { + "epoch": 1.3650118203309693, + "grad_norm": 2.534708261489868, + "learning_rate": 4.419425939041203e-06, + "loss": 0.5572, + "step": 2887 + }, + { + "epoch": 1.36548463356974, + "grad_norm": 2.6052141189575195, + "learning_rate": 4.419026177688017e-06, + "loss": 0.4763, + "step": 2888 + }, + { + "epoch": 1.3659574468085105, + "grad_norm": 2.723720073699951, + "learning_rate": 4.4186262968454854e-06, + "loss": 0.5659, + "step": 2889 + }, + { + "epoch": 1.3664302600472813, + "grad_norm": 2.8909599781036377, + "learning_rate": 4.418226296538507e-06, + "loss": 0.4996, + "step": 2890 + }, + { + "epoch": 1.366903073286052, + "grad_norm": 2.551375389099121, + "learning_rate": 4.417826176791988e-06, + "loss": 0.5259, + "step": 2891 + }, + { + "epoch": 1.3673758865248227, + "grad_norm": 3.360267162322998, + "learning_rate": 4.417425937630843e-06, + "loss": 0.5381, + "step": 2892 + }, + { + "epoch": 1.3678486997635932, + "grad_norm": 2.7611942291259766, + "learning_rate": 4.417025579079992e-06, + "loss": 0.6022, + "step": 2893 + }, + { + "epoch": 1.368321513002364, + "grad_norm": 2.5931224822998047, + "learning_rate": 4.416625101164365e-06, + "loss": 0.5102, + "step": 2894 + }, + { + "epoch": 1.3687943262411348, + "grad_norm": 2.5888102054595947, + "learning_rate": 4.416224503908897e-06, + "loss": 0.4955, + "step": 2895 + }, + { + "epoch": 1.3692671394799054, + "grad_norm": 2.6262896060943604, + "learning_rate": 4.41582378733853e-06, + "loss": 0.5101, + "step": 2896 + }, + { + "epoch": 1.369739952718676, + "grad_norm": 3.339170217514038, + "learning_rate": 4.415422951478218e-06, + "loss": 0.4939, + "step": 2897 + }, + { + "epoch": 1.3702127659574468, + "grad_norm": 2.940866708755493, + "learning_rate": 4.415021996352917e-06, + "loss": 0.5157, + "step": 
2898 + }, + { + "epoch": 1.3706855791962176, + "grad_norm": 2.7423818111419678, + "learning_rate": 4.414620921987594e-06, + "loss": 0.5308, + "step": 2899 + }, + { + "epoch": 1.3711583924349882, + "grad_norm": 2.7177040576934814, + "learning_rate": 4.414219728407221e-06, + "loss": 0.5429, + "step": 2900 + }, + { + "epoch": 1.3716312056737587, + "grad_norm": 2.560774087905884, + "learning_rate": 4.4138184156367794e-06, + "loss": 0.5266, + "step": 2901 + }, + { + "epoch": 1.3721040189125295, + "grad_norm": 2.5649116039276123, + "learning_rate": 4.413416983701256e-06, + "loss": 0.4718, + "step": 2902 + }, + { + "epoch": 1.3725768321513003, + "grad_norm": 2.8547167778015137, + "learning_rate": 4.413015432625648e-06, + "loss": 0.5129, + "step": 2903 + }, + { + "epoch": 1.373049645390071, + "grad_norm": 2.5413618087768555, + "learning_rate": 4.412613762434958e-06, + "loss": 0.5738, + "step": 2904 + }, + { + "epoch": 1.3735224586288415, + "grad_norm": 3.3252241611480713, + "learning_rate": 4.412211973154195e-06, + "loss": 0.5639, + "step": 2905 + }, + { + "epoch": 1.3739952718676123, + "grad_norm": 2.869102954864502, + "learning_rate": 4.411810064808376e-06, + "loss": 0.5384, + "step": 2906 + }, + { + "epoch": 1.374468085106383, + "grad_norm": 2.703199863433838, + "learning_rate": 4.411408037422529e-06, + "loss": 0.5742, + "step": 2907 + }, + { + "epoch": 1.3749408983451537, + "grad_norm": 2.685450792312622, + "learning_rate": 4.411005891021684e-06, + "loss": 0.5121, + "step": 2908 + }, + { + "epoch": 1.3754137115839242, + "grad_norm": 2.9572203159332275, + "learning_rate": 4.410603625630882e-06, + "loss": 0.5444, + "step": 2909 + }, + { + "epoch": 1.375886524822695, + "grad_norm": 2.707002878189087, + "learning_rate": 4.410201241275169e-06, + "loss": 0.5125, + "step": 2910 + }, + { + "epoch": 1.3763593380614658, + "grad_norm": 3.0158939361572266, + "learning_rate": 4.409798737979602e-06, + "loss": 0.5299, + "step": 2911 + }, + { + "epoch": 1.3768321513002364, + "grad_norm": 2.7932698726654053, + "learning_rate": 4.4093961157692415e-06, + "loss": 0.5437, + "step": 2912 + }, + { + "epoch": 1.377304964539007, + "grad_norm": 2.459510326385498, + "learning_rate": 4.408993374669156e-06, + "loss": 0.5548, + "step": 2913 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 2.7500696182250977, + "learning_rate": 4.408590514704425e-06, + "loss": 0.5186, + "step": 2914 + }, + { + "epoch": 1.3782505910165486, + "grad_norm": 2.7824268341064453, + "learning_rate": 4.4081875359001315e-06, + "loss": 0.4762, + "step": 2915 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 2.4202158451080322, + "learning_rate": 4.4077844382813675e-06, + "loss": 0.5005, + "step": 2916 + }, + { + "epoch": 1.3791962174940897, + "grad_norm": 2.5566670894622803, + "learning_rate": 4.4073812218732316e-06, + "loss": 0.5377, + "step": 2917 + }, + { + "epoch": 1.3796690307328605, + "grad_norm": 3.400874376296997, + "learning_rate": 4.406977886700831e-06, + "loss": 0.6637, + "step": 2918 + }, + { + "epoch": 1.3801418439716313, + "grad_norm": 2.8187878131866455, + "learning_rate": 4.406574432789278e-06, + "loss": 0.5033, + "step": 2919 + }, + { + "epoch": 1.3806146572104019, + "grad_norm": 2.5578041076660156, + "learning_rate": 4.406170860163697e-06, + "loss": 0.5293, + "step": 2920 + }, + { + "epoch": 1.3810874704491725, + "grad_norm": 2.6709718704223633, + "learning_rate": 4.405767168849213e-06, + "loss": 0.5144, + "step": 2921 + }, + { + "epoch": 1.3815602836879433, + "grad_norm": 3.049365997314453, + "learning_rate": 
4.405363358870965e-06, + "loss": 0.4894, + "step": 2922 + }, + { + "epoch": 1.382033096926714, + "grad_norm": 2.5569891929626465, + "learning_rate": 4.404959430254095e-06, + "loss": 0.4929, + "step": 2923 + }, + { + "epoch": 1.3825059101654846, + "grad_norm": 2.8288230895996094, + "learning_rate": 4.404555383023754e-06, + "loss": 0.5438, + "step": 2924 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 2.8363358974456787, + "learning_rate": 4.404151217205102e-06, + "loss": 0.545, + "step": 2925 + }, + { + "epoch": 1.383451536643026, + "grad_norm": 2.720972776412964, + "learning_rate": 4.403746932823302e-06, + "loss": 0.5732, + "step": 2926 + }, + { + "epoch": 1.3839243498817968, + "grad_norm": 2.728043794631958, + "learning_rate": 4.403342529903528e-06, + "loss": 0.4944, + "step": 2927 + }, + { + "epoch": 1.3843971631205674, + "grad_norm": 2.4366135597229004, + "learning_rate": 4.402938008470961e-06, + "loss": 0.4441, + "step": 2928 + }, + { + "epoch": 1.384869976359338, + "grad_norm": 2.858454704284668, + "learning_rate": 4.402533368550788e-06, + "loss": 0.5359, + "step": 2929 + }, + { + "epoch": 1.3853427895981087, + "grad_norm": 2.805795907974243, + "learning_rate": 4.402128610168205e-06, + "loss": 0.4954, + "step": 2930 + }, + { + "epoch": 1.3858156028368795, + "grad_norm": 3.3514177799224854, + "learning_rate": 4.401723733348413e-06, + "loss": 0.579, + "step": 2931 + }, + { + "epoch": 1.3862884160756501, + "grad_norm": 2.6255125999450684, + "learning_rate": 4.401318738116624e-06, + "loss": 0.5002, + "step": 2932 + }, + { + "epoch": 1.3867612293144207, + "grad_norm": 2.3480796813964844, + "learning_rate": 4.400913624498054e-06, + "loss": 0.4688, + "step": 2933 + }, + { + "epoch": 1.3872340425531915, + "grad_norm": 2.710165023803711, + "learning_rate": 4.400508392517927e-06, + "loss": 0.5099, + "step": 2934 + }, + { + "epoch": 1.3877068557919623, + "grad_norm": 2.5820295810699463, + "learning_rate": 4.400103042201477e-06, + "loss": 0.512, + "step": 2935 + }, + { + "epoch": 1.3881796690307329, + "grad_norm": 2.750596523284912, + "learning_rate": 4.399697573573942e-06, + "loss": 0.463, + "step": 2936 + }, + { + "epoch": 1.3886524822695034, + "grad_norm": 3.497537612915039, + "learning_rate": 4.399291986660569e-06, + "loss": 0.5676, + "step": 2937 + }, + { + "epoch": 1.3891252955082742, + "grad_norm": 2.4046003818511963, + "learning_rate": 4.398886281486612e-06, + "loss": 0.5408, + "step": 2938 + }, + { + "epoch": 1.389598108747045, + "grad_norm": 2.941606283187866, + "learning_rate": 4.398480458077332e-06, + "loss": 0.5734, + "step": 2939 + }, + { + "epoch": 1.3900709219858156, + "grad_norm": 3.030214309692383, + "learning_rate": 4.398074516458e-06, + "loss": 0.5353, + "step": 2940 + }, + { + "epoch": 1.3905437352245862, + "grad_norm": 2.9991626739501953, + "learning_rate": 4.397668456653889e-06, + "loss": 0.5989, + "step": 2941 + }, + { + "epoch": 1.391016548463357, + "grad_norm": 4.163141250610352, + "learning_rate": 4.397262278690285e-06, + "loss": 0.5436, + "step": 2942 + }, + { + "epoch": 1.3914893617021278, + "grad_norm": 2.6576037406921387, + "learning_rate": 4.396855982592478e-06, + "loss": 0.5206, + "step": 2943 + }, + { + "epoch": 1.3919621749408984, + "grad_norm": 2.7729203701019287, + "learning_rate": 4.396449568385768e-06, + "loss": 0.5403, + "step": 2944 + }, + { + "epoch": 1.392434988179669, + "grad_norm": 2.4560446739196777, + "learning_rate": 4.396043036095457e-06, + "loss": 0.4924, + "step": 2945 + }, + { + "epoch": 1.3929078014184397, + "grad_norm": 
2.6370556354522705, + "learning_rate": 4.39563638574686e-06, + "loss": 0.5543, + "step": 2946 + }, + { + "epoch": 1.3933806146572105, + "grad_norm": 2.593914270401001, + "learning_rate": 4.395229617365298e-06, + "loss": 0.5133, + "step": 2947 + }, + { + "epoch": 1.393853427895981, + "grad_norm": 2.3583998680114746, + "learning_rate": 4.394822730976099e-06, + "loss": 0.4436, + "step": 2948 + }, + { + "epoch": 1.3943262411347517, + "grad_norm": 3.2768537998199463, + "learning_rate": 4.394415726604596e-06, + "loss": 0.5489, + "step": 2949 + }, + { + "epoch": 1.3947990543735225, + "grad_norm": 2.88662052154541, + "learning_rate": 4.394008604276133e-06, + "loss": 0.5194, + "step": 2950 + }, + { + "epoch": 1.3952718676122933, + "grad_norm": 2.46610426902771, + "learning_rate": 4.393601364016059e-06, + "loss": 0.5255, + "step": 2951 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 3.122509241104126, + "learning_rate": 4.393194005849731e-06, + "loss": 0.6046, + "step": 2952 + }, + { + "epoch": 1.3962174940898344, + "grad_norm": 2.724926471710205, + "learning_rate": 4.392786529802513e-06, + "loss": 0.4958, + "step": 2953 + }, + { + "epoch": 1.3966903073286052, + "grad_norm": 2.491485595703125, + "learning_rate": 4.3923789358997785e-06, + "loss": 0.5209, + "step": 2954 + }, + { + "epoch": 1.397163120567376, + "grad_norm": 2.61110520362854, + "learning_rate": 4.3919712241669056e-06, + "loss": 0.5202, + "step": 2955 + }, + { + "epoch": 1.3976359338061466, + "grad_norm": 2.3814501762390137, + "learning_rate": 4.39156339462928e-06, + "loss": 0.4966, + "step": 2956 + }, + { + "epoch": 1.3981087470449172, + "grad_norm": 2.762498617172241, + "learning_rate": 4.391155447312296e-06, + "loss": 0.6025, + "step": 2957 + }, + { + "epoch": 1.398581560283688, + "grad_norm": 2.964975595474243, + "learning_rate": 4.390747382241355e-06, + "loss": 0.4845, + "step": 2958 + }, + { + "epoch": 1.3990543735224588, + "grad_norm": 3.0117249488830566, + "learning_rate": 4.3903391994418655e-06, + "loss": 0.5326, + "step": 2959 + }, + { + "epoch": 1.3995271867612293, + "grad_norm": 2.578626871109009, + "learning_rate": 4.389930898939243e-06, + "loss": 0.5271, + "step": 2960 + }, + { + "epoch": 1.4, + "grad_norm": 2.747441053390503, + "learning_rate": 4.38952248075891e-06, + "loss": 0.5553, + "step": 2961 + }, + { + "epoch": 1.4004728132387707, + "grad_norm": 2.8273086547851562, + "learning_rate": 4.389113944926297e-06, + "loss": 0.5475, + "step": 2962 + }, + { + "epoch": 1.4009456264775415, + "grad_norm": 2.55238676071167, + "learning_rate": 4.388705291466843e-06, + "loss": 0.4864, + "step": 2963 + }, + { + "epoch": 1.401418439716312, + "grad_norm": 2.597214460372925, + "learning_rate": 4.388296520405992e-06, + "loss": 0.4845, + "step": 2964 + }, + { + "epoch": 1.4018912529550827, + "grad_norm": 2.608962297439575, + "learning_rate": 4.387887631769196e-06, + "loss": 0.5544, + "step": 2965 + }, + { + "epoch": 1.4023640661938535, + "grad_norm": 2.2754876613616943, + "learning_rate": 4.3874786255819165e-06, + "loss": 0.5045, + "step": 2966 + }, + { + "epoch": 1.4028368794326243, + "grad_norm": 2.9900264739990234, + "learning_rate": 4.387069501869618e-06, + "loss": 0.562, + "step": 2967 + }, + { + "epoch": 1.4033096926713948, + "grad_norm": 2.8069417476654053, + "learning_rate": 4.386660260657778e-06, + "loss": 0.5284, + "step": 2968 + }, + { + "epoch": 1.4037825059101654, + "grad_norm": 2.68894624710083, + "learning_rate": 4.386250901971875e-06, + "loss": 0.5879, + "step": 2969 + }, + { + "epoch": 1.4042553191489362, + 
"grad_norm": 2.614485025405884, + "learning_rate": 4.385841425837399e-06, + "loss": 0.4771, + "step": 2970 + }, + { + "epoch": 1.4047281323877068, + "grad_norm": 2.487950325012207, + "learning_rate": 4.385431832279848e-06, + "loss": 0.5552, + "step": 2971 + }, + { + "epoch": 1.4052009456264776, + "grad_norm": 2.5098392963409424, + "learning_rate": 4.385022121324723e-06, + "loss": 0.5267, + "step": 2972 + }, + { + "epoch": 1.4056737588652481, + "grad_norm": 2.825838565826416, + "learning_rate": 4.384612292997537e-06, + "loss": 0.5336, + "step": 2973 + }, + { + "epoch": 1.406146572104019, + "grad_norm": 2.898188829421997, + "learning_rate": 4.384202347323806e-06, + "loss": 0.5685, + "step": 2974 + }, + { + "epoch": 1.4066193853427895, + "grad_norm": 2.8722569942474365, + "learning_rate": 4.383792284329057e-06, + "loss": 0.5977, + "step": 2975 + }, + { + "epoch": 1.4070921985815603, + "grad_norm": 2.832951307296753, + "learning_rate": 4.3833821040388235e-06, + "loss": 0.5766, + "step": 2976 + }, + { + "epoch": 1.407565011820331, + "grad_norm": 2.7353670597076416, + "learning_rate": 4.3829718064786446e-06, + "loss": 0.5461, + "step": 2977 + }, + { + "epoch": 1.4080378250591017, + "grad_norm": 2.6050429344177246, + "learning_rate": 4.3825613916740675e-06, + "loss": 0.5501, + "step": 2978 + }, + { + "epoch": 1.4085106382978723, + "grad_norm": 2.79719877243042, + "learning_rate": 4.382150859650647e-06, + "loss": 0.502, + "step": 2979 + }, + { + "epoch": 1.408983451536643, + "grad_norm": 2.5538079738616943, + "learning_rate": 4.381740210433946e-06, + "loss": 0.4762, + "step": 2980 + }, + { + "epoch": 1.4094562647754136, + "grad_norm": 2.7256062030792236, + "learning_rate": 4.381329444049533e-06, + "loss": 0.4692, + "step": 2981 + }, + { + "epoch": 1.4099290780141844, + "grad_norm": 2.7778146266937256, + "learning_rate": 4.3809185605229855e-06, + "loss": 0.5366, + "step": 2982 + }, + { + "epoch": 1.410401891252955, + "grad_norm": 2.6289451122283936, + "learning_rate": 4.380507559879887e-06, + "loss": 0.5412, + "step": 2983 + }, + { + "epoch": 1.4108747044917258, + "grad_norm": 2.697204828262329, + "learning_rate": 4.380096442145827e-06, + "loss": 0.5065, + "step": 2984 + }, + { + "epoch": 1.4113475177304964, + "grad_norm": 2.4709219932556152, + "learning_rate": 4.379685207346407e-06, + "loss": 0.568, + "step": 2985 + }, + { + "epoch": 1.4118203309692672, + "grad_norm": 2.9740655422210693, + "learning_rate": 4.379273855507231e-06, + "loss": 0.5512, + "step": 2986 + }, + { + "epoch": 1.4122931442080378, + "grad_norm": 3.0090627670288086, + "learning_rate": 4.378862386653911e-06, + "loss": 0.5459, + "step": 2987 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 2.8835368156433105, + "learning_rate": 4.378450800812071e-06, + "loss": 0.5357, + "step": 2988 + }, + { + "epoch": 1.4132387706855791, + "grad_norm": 2.558824062347412, + "learning_rate": 4.378039098007335e-06, + "loss": 0.536, + "step": 2989 + }, + { + "epoch": 1.41371158392435, + "grad_norm": 2.5572092533111572, + "learning_rate": 4.377627278265339e-06, + "loss": 0.5183, + "step": 2990 + }, + { + "epoch": 1.4141843971631205, + "grad_norm": 2.7356579303741455, + "learning_rate": 4.377215341611727e-06, + "loss": 0.5087, + "step": 2991 + }, + { + "epoch": 1.4146572104018913, + "grad_norm": 2.7541024684906006, + "learning_rate": 4.376803288072146e-06, + "loss": 0.4509, + "step": 2992 + }, + { + "epoch": 1.4151300236406619, + "grad_norm": 2.7548446655273438, + "learning_rate": 4.376391117672254e-06, + "loss": 0.5532, + "step": 2993 + }, + { + 
"epoch": 1.4156028368794327, + "grad_norm": 2.9107465744018555, + "learning_rate": 4.375978830437715e-06, + "loss": 0.5719, + "step": 2994 + }, + { + "epoch": 1.4160756501182032, + "grad_norm": 2.7077393531799316, + "learning_rate": 4.3755664263942e-06, + "loss": 0.5084, + "step": 2995 + }, + { + "epoch": 1.416548463356974, + "grad_norm": 2.764209270477295, + "learning_rate": 4.375153905567388e-06, + "loss": 0.5976, + "step": 2996 + }, + { + "epoch": 1.4170212765957446, + "grad_norm": 2.7792932987213135, + "learning_rate": 4.374741267982964e-06, + "loss": 0.5358, + "step": 2997 + }, + { + "epoch": 1.4174940898345154, + "grad_norm": 2.459212064743042, + "learning_rate": 4.374328513666622e-06, + "loss": 0.5181, + "step": 2998 + }, + { + "epoch": 1.417966903073286, + "grad_norm": 2.548546552658081, + "learning_rate": 4.373915642644062e-06, + "loss": 0.528, + "step": 2999 + }, + { + "epoch": 1.4184397163120568, + "grad_norm": 2.998138189315796, + "learning_rate": 4.373502654940992e-06, + "loss": 0.5233, + "step": 3000 + }, + { + "epoch": 1.4189125295508274, + "grad_norm": 2.604341983795166, + "learning_rate": 4.373089550583126e-06, + "loss": 0.5274, + "step": 3001 + }, + { + "epoch": 1.4193853427895982, + "grad_norm": 2.6792588233947754, + "learning_rate": 4.372676329596188e-06, + "loss": 0.5061, + "step": 3002 + }, + { + "epoch": 1.4198581560283687, + "grad_norm": 2.5182368755340576, + "learning_rate": 4.372262992005906e-06, + "loss": 0.541, + "step": 3003 + }, + { + "epoch": 1.4203309692671395, + "grad_norm": 2.690718173980713, + "learning_rate": 4.371849537838018e-06, + "loss": 0.5308, + "step": 3004 + }, + { + "epoch": 1.42080378250591, + "grad_norm": 2.6797590255737305, + "learning_rate": 4.371435967118266e-06, + "loss": 0.5728, + "step": 3005 + }, + { + "epoch": 1.421276595744681, + "grad_norm": 2.847900152206421, + "learning_rate": 4.371022279872403e-06, + "loss": 0.5053, + "step": 3006 + }, + { + "epoch": 1.4217494089834515, + "grad_norm": 2.497810125350952, + "learning_rate": 4.370608476126186e-06, + "loss": 0.5057, + "step": 3007 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 2.5259225368499756, + "learning_rate": 4.370194555905382e-06, + "loss": 0.5508, + "step": 3008 + }, + { + "epoch": 1.4226950354609929, + "grad_norm": 2.774118423461914, + "learning_rate": 4.369780519235763e-06, + "loss": 0.5419, + "step": 3009 + }, + { + "epoch": 1.4231678486997636, + "grad_norm": 2.2764663696289062, + "learning_rate": 4.369366366143111e-06, + "loss": 0.5032, + "step": 3010 + }, + { + "epoch": 1.4236406619385342, + "grad_norm": 2.736347198486328, + "learning_rate": 4.368952096653211e-06, + "loss": 0.5184, + "step": 3011 + }, + { + "epoch": 1.424113475177305, + "grad_norm": 2.476762056350708, + "learning_rate": 4.36853771079186e-06, + "loss": 0.5331, + "step": 3012 + }, + { + "epoch": 1.4245862884160756, + "grad_norm": 2.8006162643432617, + "learning_rate": 4.3681232085848585e-06, + "loss": 0.5331, + "step": 3013 + }, + { + "epoch": 1.4250591016548464, + "grad_norm": 2.509143590927124, + "learning_rate": 4.367708590058016e-06, + "loss": 0.5127, + "step": 3014 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 3.030137538909912, + "learning_rate": 4.3672938552371505e-06, + "loss": 0.5555, + "step": 3015 + }, + { + "epoch": 1.4260047281323878, + "grad_norm": 3.0536904335021973, + "learning_rate": 4.3668790041480835e-06, + "loss": 0.5241, + "step": 3016 + }, + { + "epoch": 1.4264775413711583, + "grad_norm": 2.6400439739227295, + "learning_rate": 4.366464036816647e-06, + "loss": 0.4946, + 
"step": 3017 + }, + { + "epoch": 1.4269503546099291, + "grad_norm": 2.7302589416503906, + "learning_rate": 4.366048953268679e-06, + "loss": 0.5105, + "step": 3018 + }, + { + "epoch": 1.4274231678486997, + "grad_norm": 2.504549264907837, + "learning_rate": 4.365633753530026e-06, + "loss": 0.4844, + "step": 3019 + }, + { + "epoch": 1.4278959810874705, + "grad_norm": 2.3872320652008057, + "learning_rate": 4.365218437626539e-06, + "loss": 0.4402, + "step": 3020 + }, + { + "epoch": 1.428368794326241, + "grad_norm": 2.531649351119995, + "learning_rate": 4.364803005584078e-06, + "loss": 0.4913, + "step": 3021 + }, + { + "epoch": 1.4288416075650119, + "grad_norm": 2.4683783054351807, + "learning_rate": 4.364387457428512e-06, + "loss": 0.515, + "step": 3022 + }, + { + "epoch": 1.4293144208037825, + "grad_norm": 2.632336378097534, + "learning_rate": 4.363971793185713e-06, + "loss": 0.5398, + "step": 3023 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 2.7456719875335693, + "learning_rate": 4.363556012881565e-06, + "loss": 0.5254, + "step": 3024 + }, + { + "epoch": 1.4302600472813238, + "grad_norm": 2.607177972793579, + "learning_rate": 4.363140116541955e-06, + "loss": 0.5266, + "step": 3025 + }, + { + "epoch": 1.4307328605200946, + "grad_norm": 2.640127420425415, + "learning_rate": 4.3627241041927796e-06, + "loss": 0.5157, + "step": 3026 + }, + { + "epoch": 1.4312056737588652, + "grad_norm": 2.4210736751556396, + "learning_rate": 4.362307975859941e-06, + "loss": 0.4599, + "step": 3027 + }, + { + "epoch": 1.431678486997636, + "grad_norm": 2.6007790565490723, + "learning_rate": 4.361891731569352e-06, + "loss": 0.5298, + "step": 3028 + }, + { + "epoch": 1.4321513002364066, + "grad_norm": 2.5352046489715576, + "learning_rate": 4.361475371346928e-06, + "loss": 0.5128, + "step": 3029 + }, + { + "epoch": 1.4326241134751774, + "grad_norm": 2.4204049110412598, + "learning_rate": 4.361058895218596e-06, + "loss": 0.4669, + "step": 3030 + }, + { + "epoch": 1.433096926713948, + "grad_norm": 2.525240182876587, + "learning_rate": 4.360642303210286e-06, + "loss": 0.4925, + "step": 3031 + }, + { + "epoch": 1.4335697399527187, + "grad_norm": 2.839646339416504, + "learning_rate": 4.360225595347939e-06, + "loss": 0.5868, + "step": 3032 + }, + { + "epoch": 1.4340425531914893, + "grad_norm": 2.5043296813964844, + "learning_rate": 4.359808771657501e-06, + "loss": 0.4951, + "step": 3033 + }, + { + "epoch": 1.4345153664302601, + "grad_norm": 2.9082300662994385, + "learning_rate": 4.359391832164927e-06, + "loss": 0.5259, + "step": 3034 + }, + { + "epoch": 1.4349881796690307, + "grad_norm": 2.6651999950408936, + "learning_rate": 4.3589747768961745e-06, + "loss": 0.537, + "step": 3035 + }, + { + "epoch": 1.4354609929078015, + "grad_norm": 2.577077865600586, + "learning_rate": 4.358557605877216e-06, + "loss": 0.5186, + "step": 3036 + }, + { + "epoch": 1.435933806146572, + "grad_norm": 2.7445287704467773, + "learning_rate": 4.3581403191340236e-06, + "loss": 0.5573, + "step": 3037 + }, + { + "epoch": 1.4364066193853429, + "grad_norm": 2.502086639404297, + "learning_rate": 4.357722916692582e-06, + "loss": 0.5039, + "step": 3038 + }, + { + "epoch": 1.4368794326241134, + "grad_norm": 2.4476163387298584, + "learning_rate": 4.357305398578879e-06, + "loss": 0.5638, + "step": 3039 + }, + { + "epoch": 1.4373522458628842, + "grad_norm": 2.7705588340759277, + "learning_rate": 4.356887764818915e-06, + "loss": 0.5485, + "step": 3040 + }, + { + "epoch": 1.4378250591016548, + "grad_norm": 2.498225450515747, + "learning_rate": 
4.356470015438691e-06, + "loss": 0.5486, + "step": 3041 + }, + { + "epoch": 1.4382978723404256, + "grad_norm": 2.394320011138916, + "learning_rate": 4.356052150464219e-06, + "loss": 0.512, + "step": 3042 + }, + { + "epoch": 1.4387706855791962, + "grad_norm": 2.8725767135620117, + "learning_rate": 4.3556341699215185e-06, + "loss": 0.5202, + "step": 3043 + }, + { + "epoch": 1.439243498817967, + "grad_norm": 3.1707918643951416, + "learning_rate": 4.355216073836615e-06, + "loss": 0.5229, + "step": 3044 + }, + { + "epoch": 1.4397163120567376, + "grad_norm": 2.532578468322754, + "learning_rate": 4.3547978622355415e-06, + "loss": 0.4569, + "step": 3045 + }, + { + "epoch": 1.4401891252955084, + "grad_norm": 3.0111029148101807, + "learning_rate": 4.354379535144338e-06, + "loss": 0.5801, + "step": 3046 + }, + { + "epoch": 1.440661938534279, + "grad_norm": 2.9554224014282227, + "learning_rate": 4.353961092589052e-06, + "loss": 0.5968, + "step": 3047 + }, + { + "epoch": 1.4411347517730497, + "grad_norm": 2.7562637329101562, + "learning_rate": 4.353542534595738e-06, + "loss": 0.5005, + "step": 3048 + }, + { + "epoch": 1.4416075650118203, + "grad_norm": 3.083254337310791, + "learning_rate": 4.3531238611904595e-06, + "loss": 0.5389, + "step": 3049 + }, + { + "epoch": 1.442080378250591, + "grad_norm": 2.7778005599975586, + "learning_rate": 4.352705072399282e-06, + "loss": 0.5342, + "step": 3050 + }, + { + "epoch": 1.4425531914893617, + "grad_norm": 2.6673996448516846, + "learning_rate": 4.3522861682482845e-06, + "loss": 0.5213, + "step": 3051 + }, + { + "epoch": 1.4430260047281322, + "grad_norm": 2.637605905532837, + "learning_rate": 4.351867148763548e-06, + "loss": 0.4893, + "step": 3052 + }, + { + "epoch": 1.443498817966903, + "grad_norm": 2.834469795227051, + "learning_rate": 4.351448013971166e-06, + "loss": 0.5391, + "step": 3053 + }, + { + "epoch": 1.4439716312056738, + "grad_norm": 2.824153184890747, + "learning_rate": 4.351028763897234e-06, + "loss": 0.6403, + "step": 3054 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 2.558966875076294, + "learning_rate": 4.350609398567857e-06, + "loss": 0.4912, + "step": 3055 + }, + { + "epoch": 1.444917257683215, + "grad_norm": 2.281726360321045, + "learning_rate": 4.3501899180091475e-06, + "loss": 0.4655, + "step": 3056 + }, + { + "epoch": 1.4453900709219858, + "grad_norm": 2.499472141265869, + "learning_rate": 4.349770322247225e-06, + "loss": 0.4878, + "step": 3057 + }, + { + "epoch": 1.4458628841607566, + "grad_norm": 2.578615188598633, + "learning_rate": 4.349350611308215e-06, + "loss": 0.4855, + "step": 3058 + }, + { + "epoch": 1.4463356973995272, + "grad_norm": 2.7111165523529053, + "learning_rate": 4.348930785218252e-06, + "loss": 0.5415, + "step": 3059 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 2.8081610202789307, + "learning_rate": 4.348510844003476e-06, + "loss": 0.4881, + "step": 3060 + }, + { + "epoch": 1.4472813238770685, + "grad_norm": 2.9439868927001953, + "learning_rate": 4.348090787690036e-06, + "loss": 0.5485, + "step": 3061 + }, + { + "epoch": 1.4477541371158393, + "grad_norm": 2.592532157897949, + "learning_rate": 4.347670616304085e-06, + "loss": 0.4912, + "step": 3062 + }, + { + "epoch": 1.44822695035461, + "grad_norm": 2.960592746734619, + "learning_rate": 4.347250329871787e-06, + "loss": 0.5473, + "step": 3063 + }, + { + "epoch": 1.4486997635933805, + "grad_norm": 2.5786688327789307, + "learning_rate": 4.3468299284193116e-06, + "loss": 0.5348, + "step": 3064 + }, + { + "epoch": 1.4491725768321513, + "grad_norm": 
2.6084046363830566, + "learning_rate": 4.346409411972834e-06, + "loss": 0.527, + "step": 3065 + }, + { + "epoch": 1.449645390070922, + "grad_norm": 2.489748239517212, + "learning_rate": 4.3459887805585385e-06, + "loss": 0.4943, + "step": 3066 + }, + { + "epoch": 1.4501182033096927, + "grad_norm": 2.452131986618042, + "learning_rate": 4.345568034202617e-06, + "loss": 0.4886, + "step": 3067 + }, + { + "epoch": 1.4505910165484632, + "grad_norm": 2.4034671783447266, + "learning_rate": 4.345147172931266e-06, + "loss": 0.4689, + "step": 3068 + }, + { + "epoch": 1.451063829787234, + "grad_norm": 2.6045448780059814, + "learning_rate": 4.344726196770691e-06, + "loss": 0.5842, + "step": 3069 + }, + { + "epoch": 1.4515366430260048, + "grad_norm": 2.697593927383423, + "learning_rate": 4.3443051057471045e-06, + "loss": 0.5358, + "step": 3070 + }, + { + "epoch": 1.4520094562647754, + "grad_norm": 2.6080820560455322, + "learning_rate": 4.343883899886727e-06, + "loss": 0.5361, + "step": 3071 + }, + { + "epoch": 1.452482269503546, + "grad_norm": 2.4605307579040527, + "learning_rate": 4.343462579215783e-06, + "loss": 0.4941, + "step": 3072 + }, + { + "epoch": 1.4529550827423168, + "grad_norm": 2.8025355339050293, + "learning_rate": 4.343041143760509e-06, + "loss": 0.5116, + "step": 3073 + }, + { + "epoch": 1.4534278959810876, + "grad_norm": 2.432515859603882, + "learning_rate": 4.3426195935471434e-06, + "loss": 0.4991, + "step": 3074 + }, + { + "epoch": 1.4539007092198581, + "grad_norm": 2.5838661193847656, + "learning_rate": 4.342197928601935e-06, + "loss": 0.4994, + "step": 3075 + }, + { + "epoch": 1.4543735224586287, + "grad_norm": 2.421692371368408, + "learning_rate": 4.341776148951141e-06, + "loss": 0.4945, + "step": 3076 + }, + { + "epoch": 1.4548463356973995, + "grad_norm": 2.5354676246643066, + "learning_rate": 4.341354254621021e-06, + "loss": 0.4859, + "step": 3077 + }, + { + "epoch": 1.4553191489361703, + "grad_norm": 2.7316789627075195, + "learning_rate": 4.340932245637846e-06, + "loss": 0.5136, + "step": 3078 + }, + { + "epoch": 1.455791962174941, + "grad_norm": 3.5903496742248535, + "learning_rate": 4.340510122027891e-06, + "loss": 0.6451, + "step": 3079 + }, + { + "epoch": 1.4562647754137115, + "grad_norm": 2.95190167427063, + "learning_rate": 4.340087883817442e-06, + "loss": 0.6354, + "step": 3080 + }, + { + "epoch": 1.4567375886524823, + "grad_norm": 2.8659214973449707, + "learning_rate": 4.339665531032789e-06, + "loss": 0.5514, + "step": 3081 + }, + { + "epoch": 1.457210401891253, + "grad_norm": 2.5681674480438232, + "learning_rate": 4.339243063700231e-06, + "loss": 0.5135, + "step": 3082 + }, + { + "epoch": 1.4576832151300236, + "grad_norm": 2.7353906631469727, + "learning_rate": 4.338820481846072e-06, + "loss": 0.4608, + "step": 3083 + }, + { + "epoch": 1.4581560283687942, + "grad_norm": 2.6116466522216797, + "learning_rate": 4.3383977854966245e-06, + "loss": 0.4924, + "step": 3084 + }, + { + "epoch": 1.458628841607565, + "grad_norm": 2.6676487922668457, + "learning_rate": 4.337974974678207e-06, + "loss": 0.5747, + "step": 3085 + }, + { + "epoch": 1.4591016548463358, + "grad_norm": 2.909031629562378, + "learning_rate": 4.337552049417147e-06, + "loss": 0.4618, + "step": 3086 + }, + { + "epoch": 1.4595744680851064, + "grad_norm": 2.7614190578460693, + "learning_rate": 4.33712900973978e-06, + "loss": 0.5154, + "step": 3087 + }, + { + "epoch": 1.460047281323877, + "grad_norm": 2.452188014984131, + "learning_rate": 4.336705855672444e-06, + "loss": 0.542, + "step": 3088 + }, + { + "epoch": 
1.4605200945626478, + "grad_norm": 3.0004117488861084, + "learning_rate": 4.336282587241488e-06, + "loss": 0.5857, + "step": 3089 + }, + { + "epoch": 1.4609929078014185, + "grad_norm": 2.870783567428589, + "learning_rate": 4.335859204473268e-06, + "loss": 0.5506, + "step": 3090 + }, + { + "epoch": 1.4614657210401891, + "grad_norm": 3.1078689098358154, + "learning_rate": 4.335435707394145e-06, + "loss": 0.5138, + "step": 3091 + }, + { + "epoch": 1.4619385342789597, + "grad_norm": 2.8516197204589844, + "learning_rate": 4.335012096030488e-06, + "loss": 0.5842, + "step": 3092 + }, + { + "epoch": 1.4624113475177305, + "grad_norm": 2.615922212600708, + "learning_rate": 4.334588370408675e-06, + "loss": 0.4896, + "step": 3093 + }, + { + "epoch": 1.4628841607565013, + "grad_norm": 3.1911802291870117, + "learning_rate": 4.334164530555088e-06, + "loss": 0.4974, + "step": 3094 + }, + { + "epoch": 1.4633569739952719, + "grad_norm": 3.075051784515381, + "learning_rate": 4.3337405764961186e-06, + "loss": 0.567, + "step": 3095 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 2.550625801086426, + "learning_rate": 4.333316508258163e-06, + "loss": 0.4887, + "step": 3096 + }, + { + "epoch": 1.4643026004728132, + "grad_norm": 2.3986475467681885, + "learning_rate": 4.332892325867629e-06, + "loss": 0.5047, + "step": 3097 + }, + { + "epoch": 1.464775413711584, + "grad_norm": 2.5045125484466553, + "learning_rate": 4.332468029350926e-06, + "loss": 0.4721, + "step": 3098 + }, + { + "epoch": 1.4652482269503546, + "grad_norm": 2.347365617752075, + "learning_rate": 4.332043618734474e-06, + "loss": 0.4913, + "step": 3099 + }, + { + "epoch": 1.4657210401891252, + "grad_norm": 2.459928512573242, + "learning_rate": 4.331619094044699e-06, + "loss": 0.523, + "step": 3100 + }, + { + "epoch": 1.466193853427896, + "grad_norm": 2.5771310329437256, + "learning_rate": 4.331194455308035e-06, + "loss": 0.593, + "step": 3101 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 3.1351823806762695, + "learning_rate": 4.330769702550921e-06, + "loss": 0.5852, + "step": 3102 + }, + { + "epoch": 1.4671394799054374, + "grad_norm": 2.589817523956299, + "learning_rate": 4.330344835799806e-06, + "loss": 0.508, + "step": 3103 + }, + { + "epoch": 1.467612293144208, + "grad_norm": 3.1140341758728027, + "learning_rate": 4.329919855081144e-06, + "loss": 0.469, + "step": 3104 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 2.8186635971069336, + "learning_rate": 4.329494760421396e-06, + "loss": 0.5088, + "step": 3105 + }, + { + "epoch": 1.4685579196217495, + "grad_norm": 2.676077365875244, + "learning_rate": 4.329069551847031e-06, + "loss": 0.52, + "step": 3106 + }, + { + "epoch": 1.46903073286052, + "grad_norm": 2.5543313026428223, + "learning_rate": 4.328644229384526e-06, + "loss": 0.5066, + "step": 3107 + }, + { + "epoch": 1.4695035460992907, + "grad_norm": 2.8176217079162598, + "learning_rate": 4.328218793060362e-06, + "loss": 0.6404, + "step": 3108 + }, + { + "epoch": 1.4699763593380615, + "grad_norm": 2.485217332839966, + "learning_rate": 4.3277932429010314e-06, + "loss": 0.4578, + "step": 3109 + }, + { + "epoch": 1.4704491725768323, + "grad_norm": 2.6741621494293213, + "learning_rate": 4.327367578933031e-06, + "loss": 0.5068, + "step": 3110 + }, + { + "epoch": 1.4709219858156029, + "grad_norm": 2.377242088317871, + "learning_rate": 4.326941801182863e-06, + "loss": 0.5249, + "step": 3111 + }, + { + "epoch": 1.4713947990543734, + "grad_norm": 2.790046215057373, + "learning_rate": 4.32651590967704e-06, + "loss": 0.5532, + "step": 
3112 + }, + { + "epoch": 1.4718676122931442, + "grad_norm": 2.78019642829895, + "learning_rate": 4.326089904442081e-06, + "loss": 0.5362, + "step": 3113 + }, + { + "epoch": 1.472340425531915, + "grad_norm": 2.5661380290985107, + "learning_rate": 4.32566378550451e-06, + "loss": 0.5041, + "step": 3114 + }, + { + "epoch": 1.4728132387706856, + "grad_norm": 2.522153615951538, + "learning_rate": 4.3252375528908605e-06, + "loss": 0.5074, + "step": 3115 + }, + { + "epoch": 1.4732860520094562, + "grad_norm": 2.874688148498535, + "learning_rate": 4.3248112066276725e-06, + "loss": 0.59, + "step": 3116 + }, + { + "epoch": 1.473758865248227, + "grad_norm": 3.067866802215576, + "learning_rate": 4.324384746741492e-06, + "loss": 0.5924, + "step": 3117 + }, + { + "epoch": 1.4742316784869978, + "grad_norm": 3.359463930130005, + "learning_rate": 4.323958173258873e-06, + "loss": 0.6346, + "step": 3118 + }, + { + "epoch": 1.4747044917257683, + "grad_norm": 2.193024158477783, + "learning_rate": 4.323531486206376e-06, + "loss": 0.4594, + "step": 3119 + }, + { + "epoch": 1.475177304964539, + "grad_norm": 2.886889934539795, + "learning_rate": 4.323104685610569e-06, + "loss": 0.523, + "step": 3120 + }, + { + "epoch": 1.4756501182033097, + "grad_norm": 2.7558681964874268, + "learning_rate": 4.322677771498028e-06, + "loss": 0.5387, + "step": 3121 + }, + { + "epoch": 1.4761229314420805, + "grad_norm": 2.639277935028076, + "learning_rate": 4.322250743895335e-06, + "loss": 0.5599, + "step": 3122 + }, + { + "epoch": 1.476595744680851, + "grad_norm": 2.786198616027832, + "learning_rate": 4.321823602829078e-06, + "loss": 0.5405, + "step": 3123 + }, + { + "epoch": 1.4770685579196217, + "grad_norm": 2.582315683364868, + "learning_rate": 4.321396348325853e-06, + "loss": 0.4452, + "step": 3124 + }, + { + "epoch": 1.4775413711583925, + "grad_norm": 2.8574297428131104, + "learning_rate": 4.320968980412265e-06, + "loss": 0.4846, + "step": 3125 + }, + { + "epoch": 1.4780141843971633, + "grad_norm": 2.705281972885132, + "learning_rate": 4.320541499114922e-06, + "loss": 0.5548, + "step": 3126 + }, + { + "epoch": 1.4784869976359338, + "grad_norm": 2.3152754306793213, + "learning_rate": 4.320113904460444e-06, + "loss": 0.5216, + "step": 3127 + }, + { + "epoch": 1.4789598108747044, + "grad_norm": 3.230764150619507, + "learning_rate": 4.319686196475453e-06, + "loss": 0.6192, + "step": 3128 + }, + { + "epoch": 1.4794326241134752, + "grad_norm": 2.463380813598633, + "learning_rate": 4.319258375186583e-06, + "loss": 0.4872, + "step": 3129 + }, + { + "epoch": 1.479905437352246, + "grad_norm": 2.8477656841278076, + "learning_rate": 4.31883044062047e-06, + "loss": 0.5371, + "step": 3130 + }, + { + "epoch": 1.4803782505910166, + "grad_norm": 2.393911123275757, + "learning_rate": 4.318402392803762e-06, + "loss": 0.5334, + "step": 3131 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 2.6113736629486084, + "learning_rate": 4.317974231763109e-06, + "loss": 0.5572, + "step": 3132 + }, + { + "epoch": 1.481323877068558, + "grad_norm": 2.3941731452941895, + "learning_rate": 4.317545957525173e-06, + "loss": 0.4849, + "step": 3133 + }, + { + "epoch": 1.4817966903073285, + "grad_norm": 2.9536755084991455, + "learning_rate": 4.317117570116619e-06, + "loss": 0.6058, + "step": 3134 + }, + { + "epoch": 1.4822695035460993, + "grad_norm": 2.595754623413086, + "learning_rate": 4.316689069564123e-06, + "loss": 0.5193, + "step": 3135 + }, + { + "epoch": 1.48274231678487, + "grad_norm": 2.569833993911743, + "learning_rate": 4.316260455894364e-06, + "loss": 
0.543, + "step": 3136 + }, + { + "epoch": 1.4832151300236407, + "grad_norm": 2.5137455463409424, + "learning_rate": 4.315831729134031e-06, + "loss": 0.5415, + "step": 3137 + }, + { + "epoch": 1.4836879432624113, + "grad_norm": 2.5582292079925537, + "learning_rate": 4.3154028893098176e-06, + "loss": 0.5338, + "step": 3138 + }, + { + "epoch": 1.484160756501182, + "grad_norm": 2.666426181793213, + "learning_rate": 4.3149739364484265e-06, + "loss": 0.5435, + "step": 3139 + }, + { + "epoch": 1.4846335697399526, + "grad_norm": 2.790851354598999, + "learning_rate": 4.314544870576568e-06, + "loss": 0.5746, + "step": 3140 + }, + { + "epoch": 1.4851063829787234, + "grad_norm": 2.620326042175293, + "learning_rate": 4.314115691720956e-06, + "loss": 0.5076, + "step": 3141 + }, + { + "epoch": 1.485579196217494, + "grad_norm": 3.075674533843994, + "learning_rate": 4.313686399908314e-06, + "loss": 0.5486, + "step": 3142 + }, + { + "epoch": 1.4860520094562648, + "grad_norm": 3.1347315311431885, + "learning_rate": 4.3132569951653745e-06, + "loss": 0.531, + "step": 3143 + }, + { + "epoch": 1.4865248226950354, + "grad_norm": 2.5783653259277344, + "learning_rate": 4.312827477518871e-06, + "loss": 0.5818, + "step": 3144 + }, + { + "epoch": 1.4869976359338062, + "grad_norm": 3.0247137546539307, + "learning_rate": 4.3123978469955505e-06, + "loss": 0.5347, + "step": 3145 + }, + { + "epoch": 1.4874704491725768, + "grad_norm": 2.4789345264434814, + "learning_rate": 4.311968103622163e-06, + "loss": 0.5, + "step": 3146 + }, + { + "epoch": 1.4879432624113476, + "grad_norm": 2.663341522216797, + "learning_rate": 4.311538247425466e-06, + "loss": 0.4825, + "step": 3147 + }, + { + "epoch": 1.4884160756501181, + "grad_norm": 2.633711099624634, + "learning_rate": 4.311108278432226e-06, + "loss": 0.5244, + "step": 3148 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 2.51312518119812, + "learning_rate": 4.310678196669216e-06, + "loss": 0.513, + "step": 3149 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 2.5263755321502686, + "learning_rate": 4.310248002163214e-06, + "loss": 0.5236, + "step": 3150 + }, + { + "epoch": 1.4898345153664303, + "grad_norm": 2.559216260910034, + "learning_rate": 4.309817694941007e-06, + "loss": 0.5107, + "step": 3151 + }, + { + "epoch": 1.4903073286052009, + "grad_norm": 2.5023303031921387, + "learning_rate": 4.309387275029386e-06, + "loss": 0.4685, + "step": 3152 + }, + { + "epoch": 1.4907801418439717, + "grad_norm": 3.0314254760742188, + "learning_rate": 4.308956742455155e-06, + "loss": 0.5462, + "step": 3153 + }, + { + "epoch": 1.4912529550827422, + "grad_norm": 2.675295114517212, + "learning_rate": 4.308526097245119e-06, + "loss": 0.5398, + "step": 3154 + }, + { + "epoch": 1.491725768321513, + "grad_norm": 2.6613399982452393, + "learning_rate": 4.308095339426094e-06, + "loss": 0.5376, + "step": 3155 + }, + { + "epoch": 1.4921985815602836, + "grad_norm": 2.58937668800354, + "learning_rate": 4.307664469024899e-06, + "loss": 0.5385, + "step": 3156 + }, + { + "epoch": 1.4926713947990544, + "grad_norm": 2.583631992340088, + "learning_rate": 4.3072334860683655e-06, + "loss": 0.4927, + "step": 3157 + }, + { + "epoch": 1.493144208037825, + "grad_norm": 2.5889222621917725, + "learning_rate": 4.306802390583327e-06, + "loss": 0.47, + "step": 3158 + }, + { + "epoch": 1.4936170212765958, + "grad_norm": 2.9362716674804688, + "learning_rate": 4.3063711825966244e-06, + "loss": 0.4902, + "step": 3159 + }, + { + "epoch": 1.4940898345153664, + "grad_norm": 2.5385425090789795, + "learning_rate": 
4.305939862135111e-06, + "loss": 0.5396, + "step": 3160 + }, + { + "epoch": 1.4945626477541372, + "grad_norm": 2.776326894760132, + "learning_rate": 4.305508429225641e-06, + "loss": 0.5169, + "step": 3161 + }, + { + "epoch": 1.4950354609929077, + "grad_norm": 2.575063467025757, + "learning_rate": 4.305076883895076e-06, + "loss": 0.4938, + "step": 3162 + }, + { + "epoch": 1.4955082742316785, + "grad_norm": 2.7552313804626465, + "learning_rate": 4.304645226170291e-06, + "loss": 0.6211, + "step": 3163 + }, + { + "epoch": 1.4959810874704491, + "grad_norm": 2.57149338722229, + "learning_rate": 4.30421345607816e-06, + "loss": 0.5241, + "step": 3164 + }, + { + "epoch": 1.49645390070922, + "grad_norm": 2.8142426013946533, + "learning_rate": 4.303781573645568e-06, + "loss": 0.5699, + "step": 3165 + }, + { + "epoch": 1.4969267139479905, + "grad_norm": 2.6344845294952393, + "learning_rate": 4.303349578899407e-06, + "loss": 0.5049, + "step": 3166 + }, + { + "epoch": 1.4973995271867613, + "grad_norm": 2.554410934448242, + "learning_rate": 4.302917471866575e-06, + "loss": 0.4404, + "step": 3167 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 2.896240711212158, + "learning_rate": 4.302485252573978e-06, + "loss": 0.602, + "step": 3168 + }, + { + "epoch": 1.4983451536643027, + "grad_norm": 2.4044477939605713, + "learning_rate": 4.302052921048527e-06, + "loss": 0.4857, + "step": 3169 + }, + { + "epoch": 1.4988179669030732, + "grad_norm": 2.7447879314422607, + "learning_rate": 4.301620477317144e-06, + "loss": 0.5438, + "step": 3170 + }, + { + "epoch": 1.499290780141844, + "grad_norm": 2.851820945739746, + "learning_rate": 4.301187921406752e-06, + "loss": 0.5245, + "step": 3171 + }, + { + "epoch": 1.4997635933806146, + "grad_norm": 3.247114419937134, + "learning_rate": 4.300755253344287e-06, + "loss": 0.504, + "step": 3172 + }, + { + "epoch": 1.5002364066193854, + "grad_norm": 3.117490291595459, + "learning_rate": 4.300322473156688e-06, + "loss": 0.4627, + "step": 3173 + }, + { + "epoch": 1.500709219858156, + "grad_norm": 2.558319330215454, + "learning_rate": 4.299889580870904e-06, + "loss": 0.5721, + "step": 3174 + }, + { + "epoch": 1.5011820330969265, + "grad_norm": 2.8983113765716553, + "learning_rate": 4.2994565765138865e-06, + "loss": 0.5257, + "step": 3175 + }, + { + "epoch": 1.5016548463356973, + "grad_norm": 2.744056463241577, + "learning_rate": 4.299023460112599e-06, + "loss": 0.4892, + "step": 3176 + }, + { + "epoch": 1.5021276595744681, + "grad_norm": 2.5506751537323, + "learning_rate": 4.29859023169401e-06, + "loss": 0.4933, + "step": 3177 + }, + { + "epoch": 1.5026004728132387, + "grad_norm": 2.842615842819214, + "learning_rate": 4.298156891285092e-06, + "loss": 0.6124, + "step": 3178 + }, + { + "epoch": 1.5030732860520093, + "grad_norm": 2.5355329513549805, + "learning_rate": 4.2977234389128305e-06, + "loss": 0.641, + "step": 3179 + }, + { + "epoch": 1.50354609929078, + "grad_norm": 2.674781084060669, + "learning_rate": 4.297289874604213e-06, + "loss": 0.475, + "step": 3180 + }, + { + "epoch": 1.5040189125295509, + "grad_norm": 2.6845548152923584, + "learning_rate": 4.296856198386235e-06, + "loss": 0.5328, + "step": 3181 + }, + { + "epoch": 1.5044917257683215, + "grad_norm": 2.9686241149902344, + "learning_rate": 4.296422410285902e-06, + "loss": 0.6216, + "step": 3182 + }, + { + "epoch": 1.504964539007092, + "grad_norm": 2.5095980167388916, + "learning_rate": 4.295988510330222e-06, + "loss": 0.4993, + "step": 3183 + }, + { + "epoch": 1.5054373522458628, + "grad_norm": 2.4906392097473145, 
+ "learning_rate": 4.2955544985462125e-06, + "loss": 0.4795, + "step": 3184 + }, + { + "epoch": 1.5059101654846336, + "grad_norm": 2.5593366622924805, + "learning_rate": 4.295120374960897e-06, + "loss": 0.5527, + "step": 3185 + }, + { + "epoch": 1.5063829787234042, + "grad_norm": 2.691495180130005, + "learning_rate": 4.294686139601308e-06, + "loss": 0.5646, + "step": 3186 + }, + { + "epoch": 1.5068557919621748, + "grad_norm": 2.74320387840271, + "learning_rate": 4.294251792494483e-06, + "loss": 0.6149, + "step": 3187 + }, + { + "epoch": 1.5073286052009456, + "grad_norm": 2.8827052116394043, + "learning_rate": 4.293817333667465e-06, + "loss": 0.5414, + "step": 3188 + }, + { + "epoch": 1.5078014184397164, + "grad_norm": 2.5652425289154053, + "learning_rate": 4.293382763147308e-06, + "loss": 0.5006, + "step": 3189 + }, + { + "epoch": 1.508274231678487, + "grad_norm": 2.729295253753662, + "learning_rate": 4.29294808096107e-06, + "loss": 0.522, + "step": 3190 + }, + { + "epoch": 1.5087470449172575, + "grad_norm": 2.348118305206299, + "learning_rate": 4.292513287135817e-06, + "loss": 0.4125, + "step": 3191 + }, + { + "epoch": 1.5092198581560283, + "grad_norm": 2.809551954269409, + "learning_rate": 4.292078381698621e-06, + "loss": 0.5577, + "step": 3192 + }, + { + "epoch": 1.5096926713947991, + "grad_norm": 2.6925361156463623, + "learning_rate": 4.291643364676563e-06, + "loss": 0.62, + "step": 3193 + }, + { + "epoch": 1.5101654846335697, + "grad_norm": 2.4200620651245117, + "learning_rate": 4.291208236096729e-06, + "loss": 0.5464, + "step": 3194 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 2.5659191608428955, + "learning_rate": 4.290772995986211e-06, + "loss": 0.5402, + "step": 3195 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 2.3877315521240234, + "learning_rate": 4.290337644372113e-06, + "loss": 0.463, + "step": 3196 + }, + { + "epoch": 1.5115839243498819, + "grad_norm": 2.7063233852386475, + "learning_rate": 4.289902181281538e-06, + "loss": 0.5253, + "step": 3197 + }, + { + "epoch": 1.5120567375886524, + "grad_norm": 2.56788969039917, + "learning_rate": 4.289466606741603e-06, + "loss": 0.5012, + "step": 3198 + }, + { + "epoch": 1.512529550827423, + "grad_norm": 2.637164831161499, + "learning_rate": 4.28903092077943e-06, + "loss": 0.5236, + "step": 3199 + }, + { + "epoch": 1.5130023640661938, + "grad_norm": 2.767526865005493, + "learning_rate": 4.288595123422146e-06, + "loss": 0.5832, + "step": 3200 + }, + { + "epoch": 1.5134751773049646, + "grad_norm": 2.33365535736084, + "learning_rate": 4.2881592146968866e-06, + "loss": 0.4548, + "step": 3201 + }, + { + "epoch": 1.5139479905437352, + "grad_norm": 2.544189453125, + "learning_rate": 4.287723194630793e-06, + "loss": 0.5115, + "step": 3202 + }, + { + "epoch": 1.5144208037825058, + "grad_norm": 2.588793992996216, + "learning_rate": 4.2872870632510155e-06, + "loss": 0.4766, + "step": 3203 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 2.5382184982299805, + "learning_rate": 4.286850820584709e-06, + "loss": 0.5401, + "step": 3204 + }, + { + "epoch": 1.5153664302600474, + "grad_norm": 2.597930669784546, + "learning_rate": 4.286414466659038e-06, + "loss": 0.5346, + "step": 3205 + }, + { + "epoch": 1.515839243498818, + "grad_norm": 2.8522393703460693, + "learning_rate": 4.28597800150117e-06, + "loss": 0.486, + "step": 3206 + }, + { + "epoch": 1.5163120567375885, + "grad_norm": 2.4801454544067383, + "learning_rate": 4.285541425138285e-06, + "loss": 0.5162, + "step": 3207 + }, + { + "epoch": 1.5167848699763593, + "grad_norm": 
2.353665351867676, + "learning_rate": 4.285104737597563e-06, + "loss": 0.5066, + "step": 3208 + }, + { + "epoch": 1.51725768321513, + "grad_norm": 2.767976760864258, + "learning_rate": 4.2846679389061975e-06, + "loss": 0.5331, + "step": 3209 + }, + { + "epoch": 1.5177304964539007, + "grad_norm": 2.9307682514190674, + "learning_rate": 4.284231029091385e-06, + "loss": 0.5291, + "step": 3210 + }, + { + "epoch": 1.5182033096926713, + "grad_norm": 2.39719820022583, + "learning_rate": 4.283794008180329e-06, + "loss": 0.4759, + "step": 3211 + }, + { + "epoch": 1.518676122931442, + "grad_norm": 2.452244758605957, + "learning_rate": 4.283356876200242e-06, + "loss": 0.4283, + "step": 3212 + }, + { + "epoch": 1.5191489361702128, + "grad_norm": 2.4911608695983887, + "learning_rate": 4.282919633178343e-06, + "loss": 0.4812, + "step": 3213 + }, + { + "epoch": 1.5196217494089834, + "grad_norm": 2.5813944339752197, + "learning_rate": 4.282482279141856e-06, + "loss": 0.4911, + "step": 3214 + }, + { + "epoch": 1.520094562647754, + "grad_norm": 2.503542184829712, + "learning_rate": 4.282044814118013e-06, + "loss": 0.4969, + "step": 3215 + }, + { + "epoch": 1.5205673758865248, + "grad_norm": 2.5090713500976562, + "learning_rate": 4.281607238134053e-06, + "loss": 0.5293, + "step": 3216 + }, + { + "epoch": 1.5210401891252956, + "grad_norm": 2.425994396209717, + "learning_rate": 4.281169551217223e-06, + "loss": 0.5365, + "step": 3217 + }, + { + "epoch": 1.5215130023640662, + "grad_norm": 2.637655258178711, + "learning_rate": 4.2807317533947765e-06, + "loss": 0.5589, + "step": 3218 + }, + { + "epoch": 1.5219858156028367, + "grad_norm": 2.9335296154022217, + "learning_rate": 4.28029384469397e-06, + "loss": 0.6071, + "step": 3219 + }, + { + "epoch": 1.5224586288416075, + "grad_norm": 2.898683547973633, + "learning_rate": 4.279855825142073e-06, + "loss": 0.5392, + "step": 3220 + }, + { + "epoch": 1.5229314420803783, + "grad_norm": 2.613914966583252, + "learning_rate": 4.279417694766359e-06, + "loss": 0.4968, + "step": 3221 + }, + { + "epoch": 1.523404255319149, + "grad_norm": 2.500682830810547, + "learning_rate": 4.278979453594106e-06, + "loss": 0.471, + "step": 3222 + }, + { + "epoch": 1.5238770685579195, + "grad_norm": 2.5269598960876465, + "learning_rate": 4.278541101652605e-06, + "loss": 0.471, + "step": 3223 + }, + { + "epoch": 1.5243498817966903, + "grad_norm": 2.8153114318847656, + "learning_rate": 4.2781026389691465e-06, + "loss": 0.5742, + "step": 3224 + }, + { + "epoch": 1.524822695035461, + "grad_norm": 2.5648019313812256, + "learning_rate": 4.277664065571034e-06, + "loss": 0.5315, + "step": 3225 + }, + { + "epoch": 1.5252955082742317, + "grad_norm": 2.778355836868286, + "learning_rate": 4.277225381485575e-06, + "loss": 0.5543, + "step": 3226 + }, + { + "epoch": 1.5257683215130022, + "grad_norm": 2.6736745834350586, + "learning_rate": 4.2767865867400846e-06, + "loss": 0.4947, + "step": 3227 + }, + { + "epoch": 1.526241134751773, + "grad_norm": 2.9560294151306152, + "learning_rate": 4.276347681361884e-06, + "loss": 0.5835, + "step": 3228 + }, + { + "epoch": 1.5267139479905438, + "grad_norm": 2.5580296516418457, + "learning_rate": 4.275908665378302e-06, + "loss": 0.4751, + "step": 3229 + }, + { + "epoch": 1.5271867612293144, + "grad_norm": 3.0705175399780273, + "learning_rate": 4.2754695388166755e-06, + "loss": 0.5327, + "step": 3230 + }, + { + "epoch": 1.527659574468085, + "grad_norm": 2.664652109146118, + "learning_rate": 4.275030301704346e-06, + "loss": 0.4934, + "step": 3231 + }, + { + "epoch": 
1.5281323877068558, + "grad_norm": 2.308499813079834, + "learning_rate": 4.274590954068663e-06, + "loss": 0.4412, + "step": 3232 + }, + { + "epoch": 1.5286052009456266, + "grad_norm": 2.871189594268799, + "learning_rate": 4.2741514959369815e-06, + "loss": 0.5001, + "step": 3233 + }, + { + "epoch": 1.5290780141843971, + "grad_norm": 2.5274453163146973, + "learning_rate": 4.273711927336666e-06, + "loss": 0.4938, + "step": 3234 + }, + { + "epoch": 1.5295508274231677, + "grad_norm": 2.8848133087158203, + "learning_rate": 4.273272248295087e-06, + "loss": 0.5397, + "step": 3235 + }, + { + "epoch": 1.5300236406619385, + "grad_norm": 2.3927090167999268, + "learning_rate": 4.27283245883962e-06, + "loss": 0.5497, + "step": 3236 + }, + { + "epoch": 1.5304964539007093, + "grad_norm": 2.5413873195648193, + "learning_rate": 4.27239255899765e-06, + "loss": 0.5108, + "step": 3237 + }, + { + "epoch": 1.53096926713948, + "grad_norm": 2.7692389488220215, + "learning_rate": 4.271952548796567e-06, + "loss": 0.5768, + "step": 3238 + }, + { + "epoch": 1.5314420803782505, + "grad_norm": 2.4621126651763916, + "learning_rate": 4.271512428263768e-06, + "loss": 0.4698, + "step": 3239 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 2.6423375606536865, + "learning_rate": 4.271072197426659e-06, + "loss": 0.4929, + "step": 3240 + }, + { + "epoch": 1.532387706855792, + "grad_norm": 2.7097692489624023, + "learning_rate": 4.270631856312649e-06, + "loss": 0.4836, + "step": 3241 + }, + { + "epoch": 1.5328605200945626, + "grad_norm": 2.545706272125244, + "learning_rate": 4.270191404949158e-06, + "loss": 0.4636, + "step": 3242 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 3.138781785964966, + "learning_rate": 4.26975084336361e-06, + "loss": 0.5988, + "step": 3243 + }, + { + "epoch": 1.533806146572104, + "grad_norm": 2.492715835571289, + "learning_rate": 4.269310171583438e-06, + "loss": 0.5095, + "step": 3244 + }, + { + "epoch": 1.5342789598108748, + "grad_norm": 2.5705838203430176, + "learning_rate": 4.268869389636077e-06, + "loss": 0.4818, + "step": 3245 + }, + { + "epoch": 1.5347517730496454, + "grad_norm": 2.7633554935455322, + "learning_rate": 4.268428497548979e-06, + "loss": 0.547, + "step": 3246 + }, + { + "epoch": 1.535224586288416, + "grad_norm": 2.654528856277466, + "learning_rate": 4.2679874953495905e-06, + "loss": 0.5261, + "step": 3247 + }, + { + "epoch": 1.5356973995271868, + "grad_norm": 2.5039751529693604, + "learning_rate": 4.2675463830653744e-06, + "loss": 0.4941, + "step": 3248 + }, + { + "epoch": 1.5361702127659576, + "grad_norm": 2.897268295288086, + "learning_rate": 4.267105160723794e-06, + "loss": 0.5404, + "step": 3249 + }, + { + "epoch": 1.5366430260047281, + "grad_norm": 2.500732421875, + "learning_rate": 4.266663828352324e-06, + "loss": 0.5375, + "step": 3250 + }, + { + "epoch": 1.5371158392434987, + "grad_norm": 2.6310064792633057, + "learning_rate": 4.266222385978444e-06, + "loss": 0.5217, + "step": 3251 + }, + { + "epoch": 1.5375886524822695, + "grad_norm": 2.7440476417541504, + "learning_rate": 4.265780833629642e-06, + "loss": 0.5419, + "step": 3252 + }, + { + "epoch": 1.5380614657210403, + "grad_norm": 2.7037577629089355, + "learning_rate": 4.2653391713334095e-06, + "loss": 0.5634, + "step": 3253 + }, + { + "epoch": 1.5385342789598109, + "grad_norm": 2.548525810241699, + "learning_rate": 4.264897399117248e-06, + "loss": 0.535, + "step": 3254 + }, + { + "epoch": 1.5390070921985815, + "grad_norm": 2.6127355098724365, + "learning_rate": 4.264455517008663e-06, + "loss": 0.4619, + 
"step": 3255 + }, + { + "epoch": 1.5394799054373522, + "grad_norm": 2.5597004890441895, + "learning_rate": 4.264013525035171e-06, + "loss": 0.4477, + "step": 3256 + }, + { + "epoch": 1.539952718676123, + "grad_norm": 2.642432689666748, + "learning_rate": 4.263571423224292e-06, + "loss": 0.4749, + "step": 3257 + }, + { + "epoch": 1.5404255319148936, + "grad_norm": 2.5121877193450928, + "learning_rate": 4.2631292116035526e-06, + "loss": 0.4693, + "step": 3258 + }, + { + "epoch": 1.5408983451536642, + "grad_norm": 2.390292167663574, + "learning_rate": 4.262686890200489e-06, + "loss": 0.4872, + "step": 3259 + }, + { + "epoch": 1.541371158392435, + "grad_norm": 2.5898337364196777, + "learning_rate": 4.2622444590426405e-06, + "loss": 0.5193, + "step": 3260 + }, + { + "epoch": 1.5418439716312058, + "grad_norm": 2.508821487426758, + "learning_rate": 4.261801918157558e-06, + "loss": 0.511, + "step": 3261 + }, + { + "epoch": 1.5423167848699764, + "grad_norm": 2.6992101669311523, + "learning_rate": 4.261359267572795e-06, + "loss": 0.5069, + "step": 3262 + }, + { + "epoch": 1.542789598108747, + "grad_norm": 2.6011030673980713, + "learning_rate": 4.2609165073159145e-06, + "loss": 0.5887, + "step": 3263 + }, + { + "epoch": 1.5432624113475177, + "grad_norm": 2.887053966522217, + "learning_rate": 4.260473637414483e-06, + "loss": 0.5556, + "step": 3264 + }, + { + "epoch": 1.5437352245862885, + "grad_norm": 2.6433887481689453, + "learning_rate": 4.260030657896079e-06, + "loss": 0.4728, + "step": 3265 + }, + { + "epoch": 1.544208037825059, + "grad_norm": 2.6134607791900635, + "learning_rate": 4.259587568788282e-06, + "loss": 0.483, + "step": 3266 + }, + { + "epoch": 1.5446808510638297, + "grad_norm": 2.5308640003204346, + "learning_rate": 4.259144370118684e-06, + "loss": 0.5115, + "step": 3267 + }, + { + "epoch": 1.5451536643026005, + "grad_norm": 2.8256733417510986, + "learning_rate": 4.258701061914879e-06, + "loss": 0.5414, + "step": 3268 + }, + { + "epoch": 1.5456264775413713, + "grad_norm": 2.8648319244384766, + "learning_rate": 4.258257644204471e-06, + "loss": 0.5695, + "step": 3269 + }, + { + "epoch": 1.5460992907801419, + "grad_norm": 2.8568081855773926, + "learning_rate": 4.257814117015069e-06, + "loss": 0.5264, + "step": 3270 + }, + { + "epoch": 1.5465721040189124, + "grad_norm": 2.6065011024475098, + "learning_rate": 4.257370480374289e-06, + "loss": 0.5646, + "step": 3271 + }, + { + "epoch": 1.5470449172576832, + "grad_norm": 2.7840216159820557, + "learning_rate": 4.256926734309756e-06, + "loss": 0.5191, + "step": 3272 + }, + { + "epoch": 1.547517730496454, + "grad_norm": 2.85906982421875, + "learning_rate": 4.256482878849099e-06, + "loss": 0.5911, + "step": 3273 + }, + { + "epoch": 1.5479905437352246, + "grad_norm": 2.916029930114746, + "learning_rate": 4.256038914019954e-06, + "loss": 0.5589, + "step": 3274 + }, + { + "epoch": 1.5484633569739952, + "grad_norm": 2.6748716831207275, + "learning_rate": 4.255594839849967e-06, + "loss": 0.5323, + "step": 3275 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 2.717212200164795, + "learning_rate": 4.255150656366787e-06, + "loss": 0.453, + "step": 3276 + }, + { + "epoch": 1.5494089834515368, + "grad_norm": 2.4974849224090576, + "learning_rate": 4.254706363598072e-06, + "loss": 0.4516, + "step": 3277 + }, + { + "epoch": 1.5498817966903073, + "grad_norm": 2.648151397705078, + "learning_rate": 4.254261961571485e-06, + "loss": 0.5452, + "step": 3278 + }, + { + "epoch": 1.550354609929078, + "grad_norm": 2.932905435562134, + "learning_rate": 
4.253817450314699e-06, + "loss": 0.4813, + "step": 3279 + }, + { + "epoch": 1.5508274231678487, + "grad_norm": 2.862912178039551, + "learning_rate": 4.25337282985539e-06, + "loss": 0.5689, + "step": 3280 + }, + { + "epoch": 1.5513002364066195, + "grad_norm": 2.532156467437744, + "learning_rate": 4.2529281002212436e-06, + "loss": 0.485, + "step": 3281 + }, + { + "epoch": 1.55177304964539, + "grad_norm": 2.583299160003662, + "learning_rate": 4.25248326143995e-06, + "loss": 0.4661, + "step": 3282 + }, + { + "epoch": 1.5522458628841607, + "grad_norm": 2.5790653228759766, + "learning_rate": 4.252038313539209e-06, + "loss": 0.5455, + "step": 3283 + }, + { + "epoch": 1.5527186761229315, + "grad_norm": 2.872864007949829, + "learning_rate": 4.251593256546724e-06, + "loss": 0.5317, + "step": 3284 + }, + { + "epoch": 1.5531914893617023, + "grad_norm": 3.0382463932037354, + "learning_rate": 4.251148090490208e-06, + "loss": 0.5131, + "step": 3285 + }, + { + "epoch": 1.5536643026004728, + "grad_norm": 2.574399709701538, + "learning_rate": 4.250702815397379e-06, + "loss": 0.5399, + "step": 3286 + }, + { + "epoch": 1.5541371158392434, + "grad_norm": 2.9784770011901855, + "learning_rate": 4.250257431295962e-06, + "loss": 0.5209, + "step": 3287 + }, + { + "epoch": 1.5546099290780142, + "grad_norm": 2.6482062339782715, + "learning_rate": 4.249811938213689e-06, + "loss": 0.5416, + "step": 3288 + }, + { + "epoch": 1.555082742316785, + "grad_norm": 2.82142972946167, + "learning_rate": 4.2493663361783e-06, + "loss": 0.594, + "step": 3289 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 2.815595865249634, + "learning_rate": 4.24892062521754e-06, + "loss": 0.5381, + "step": 3290 + }, + { + "epoch": 1.5560283687943262, + "grad_norm": 2.689764976501465, + "learning_rate": 4.248474805359161e-06, + "loss": 0.5141, + "step": 3291 + }, + { + "epoch": 1.556501182033097, + "grad_norm": 2.7718515396118164, + "learning_rate": 4.248028876630922e-06, + "loss": 0.5324, + "step": 3292 + }, + { + "epoch": 1.5569739952718678, + "grad_norm": 3.0196774005889893, + "learning_rate": 4.247582839060591e-06, + "loss": 0.4971, + "step": 3293 + }, + { + "epoch": 1.5574468085106383, + "grad_norm": 2.608475923538208, + "learning_rate": 4.247136692675939e-06, + "loss": 0.5795, + "step": 3294 + }, + { + "epoch": 1.557919621749409, + "grad_norm": 2.4912326335906982, + "learning_rate": 4.246690437504746e-06, + "loss": 0.5348, + "step": 3295 + }, + { + "epoch": 1.5583924349881797, + "grad_norm": 2.519303560256958, + "learning_rate": 4.246244073574799e-06, + "loss": 0.4953, + "step": 3296 + }, + { + "epoch": 1.5588652482269505, + "grad_norm": 2.5667171478271484, + "learning_rate": 4.24579760091389e-06, + "loss": 0.5353, + "step": 3297 + }, + { + "epoch": 1.559338061465721, + "grad_norm": 2.8835761547088623, + "learning_rate": 4.24535101954982e-06, + "loss": 0.578, + "step": 3298 + }, + { + "epoch": 1.5598108747044916, + "grad_norm": 3.0506930351257324, + "learning_rate": 4.244904329510395e-06, + "loss": 0.6418, + "step": 3299 + }, + { + "epoch": 1.5602836879432624, + "grad_norm": 2.579446315765381, + "learning_rate": 4.244457530823428e-06, + "loss": 0.5027, + "step": 3300 + }, + { + "epoch": 1.5607565011820332, + "grad_norm": 2.72012996673584, + "learning_rate": 4.24401062351674e-06, + "loss": 0.5438, + "step": 3301 + }, + { + "epoch": 1.5612293144208038, + "grad_norm": 2.527007818222046, + "learning_rate": 4.243563607618158e-06, + "loss": 0.5303, + "step": 3302 + }, + { + "epoch": 1.5617021276595744, + "grad_norm": 2.4415159225463867, + 
"learning_rate": 4.243116483155516e-06, + "loss": 0.4893, + "step": 3303 + }, + { + "epoch": 1.5621749408983452, + "grad_norm": 2.462256669998169, + "learning_rate": 4.242669250156653e-06, + "loss": 0.5671, + "step": 3304 + }, + { + "epoch": 1.562647754137116, + "grad_norm": 2.479865074157715, + "learning_rate": 4.242221908649418e-06, + "loss": 0.5038, + "step": 3305 + }, + { + "epoch": 1.5631205673758866, + "grad_norm": 2.74670672416687, + "learning_rate": 4.241774458661662e-06, + "loss": 0.5689, + "step": 3306 + }, + { + "epoch": 1.5635933806146571, + "grad_norm": 2.55938982963562, + "learning_rate": 4.24132690022125e-06, + "loss": 0.492, + "step": 3307 + }, + { + "epoch": 1.564066193853428, + "grad_norm": 2.634956121444702, + "learning_rate": 4.240879233356048e-06, + "loss": 0.503, + "step": 3308 + }, + { + "epoch": 1.5645390070921987, + "grad_norm": 2.381775140762329, + "learning_rate": 4.240431458093928e-06, + "loss": 0.4939, + "step": 3309 + }, + { + "epoch": 1.5650118203309693, + "grad_norm": 2.8176610469818115, + "learning_rate": 4.239983574462774e-06, + "loss": 0.5609, + "step": 3310 + }, + { + "epoch": 1.5654846335697399, + "grad_norm": 3.0268442630767822, + "learning_rate": 4.239535582490471e-06, + "loss": 0.5427, + "step": 3311 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 2.5881481170654297, + "learning_rate": 4.239087482204916e-06, + "loss": 0.5538, + "step": 3312 + }, + { + "epoch": 1.5664302600472815, + "grad_norm": 2.5317704677581787, + "learning_rate": 4.238639273634008e-06, + "loss": 0.4915, + "step": 3313 + }, + { + "epoch": 1.566903073286052, + "grad_norm": 2.9608731269836426, + "learning_rate": 4.238190956805658e-06, + "loss": 0.564, + "step": 3314 + }, + { + "epoch": 1.5673758865248226, + "grad_norm": 3.022686243057251, + "learning_rate": 4.237742531747777e-06, + "loss": 0.5503, + "step": 3315 + }, + { + "epoch": 1.5678486997635934, + "grad_norm": 2.763622283935547, + "learning_rate": 4.23729399848829e-06, + "loss": 0.5241, + "step": 3316 + }, + { + "epoch": 1.5683215130023642, + "grad_norm": 2.6112794876098633, + "learning_rate": 4.236845357055122e-06, + "loss": 0.4919, + "step": 3317 + }, + { + "epoch": 1.5687943262411348, + "grad_norm": 2.649829149246216, + "learning_rate": 4.23639660747621e-06, + "loss": 0.5472, + "step": 3318 + }, + { + "epoch": 1.5692671394799054, + "grad_norm": 2.8888115882873535, + "learning_rate": 4.2359477497794955e-06, + "loss": 0.5077, + "step": 3319 + }, + { + "epoch": 1.5697399527186762, + "grad_norm": 2.5666911602020264, + "learning_rate": 4.235498783992927e-06, + "loss": 0.5365, + "step": 3320 + }, + { + "epoch": 1.570212765957447, + "grad_norm": 2.448758363723755, + "learning_rate": 4.2350497101444575e-06, + "loss": 0.5043, + "step": 3321 + }, + { + "epoch": 1.5706855791962175, + "grad_norm": 2.595207691192627, + "learning_rate": 4.234600528262052e-06, + "loss": 0.5303, + "step": 3322 + }, + { + "epoch": 1.5711583924349881, + "grad_norm": 2.7814228534698486, + "learning_rate": 4.234151238373676e-06, + "loss": 0.4521, + "step": 3323 + }, + { + "epoch": 1.571631205673759, + "grad_norm": 2.781538724899292, + "learning_rate": 4.233701840507308e-06, + "loss": 0.5193, + "step": 3324 + }, + { + "epoch": 1.5721040189125297, + "grad_norm": 2.771907329559326, + "learning_rate": 4.233252334690928e-06, + "loss": 0.497, + "step": 3325 + }, + { + "epoch": 1.5725768321513003, + "grad_norm": 2.5557498931884766, + "learning_rate": 4.232802720952525e-06, + "loss": 0.4913, + "step": 3326 + }, + { + "epoch": 1.5730496453900709, + "grad_norm": 
2.478267192840576, + "learning_rate": 4.232352999320094e-06, + "loss": 0.4967, + "step": 3327 + }, + { + "epoch": 1.5735224586288417, + "grad_norm": 3.1548502445220947, + "learning_rate": 4.231903169821639e-06, + "loss": 0.5009, + "step": 3328 + }, + { + "epoch": 1.5739952718676125, + "grad_norm": 2.634824275970459, + "learning_rate": 4.231453232485168e-06, + "loss": 0.5223, + "step": 3329 + }, + { + "epoch": 1.574468085106383, + "grad_norm": 2.579102039337158, + "learning_rate": 4.231003187338695e-06, + "loss": 0.5513, + "step": 3330 + }, + { + "epoch": 1.5749408983451536, + "grad_norm": 2.8477070331573486, + "learning_rate": 4.230553034410245e-06, + "loss": 0.561, + "step": 3331 + }, + { + "epoch": 1.5754137115839244, + "grad_norm": 2.6714725494384766, + "learning_rate": 4.2301027737278446e-06, + "loss": 0.4687, + "step": 3332 + }, + { + "epoch": 1.5758865248226952, + "grad_norm": 2.6562764644622803, + "learning_rate": 4.229652405319532e-06, + "loss": 0.5925, + "step": 3333 + }, + { + "epoch": 1.5763593380614658, + "grad_norm": 2.750946283340454, + "learning_rate": 4.229201929213348e-06, + "loss": 0.4748, + "step": 3334 + }, + { + "epoch": 1.5768321513002364, + "grad_norm": 2.760470151901245, + "learning_rate": 4.228751345437342e-06, + "loss": 0.5989, + "step": 3335 + }, + { + "epoch": 1.5773049645390071, + "grad_norm": 3.1451845169067383, + "learning_rate": 4.2283006540195706e-06, + "loss": 0.562, + "step": 3336 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 2.563011407852173, + "learning_rate": 4.227849854988095e-06, + "loss": 0.5473, + "step": 3337 + }, + { + "epoch": 1.5782505910165483, + "grad_norm": 2.310469388961792, + "learning_rate": 4.2273989483709856e-06, + "loss": 0.5033, + "step": 3338 + }, + { + "epoch": 1.578723404255319, + "grad_norm": 2.677978754043579, + "learning_rate": 4.226947934196318e-06, + "loss": 0.5291, + "step": 3339 + }, + { + "epoch": 1.57919621749409, + "grad_norm": 3.0423545837402344, + "learning_rate": 4.226496812492176e-06, + "loss": 0.5201, + "step": 3340 + }, + { + "epoch": 1.5796690307328605, + "grad_norm": 2.357513904571533, + "learning_rate": 4.226045583286647e-06, + "loss": 0.4421, + "step": 3341 + }, + { + "epoch": 1.580141843971631, + "grad_norm": 2.719860315322876, + "learning_rate": 4.225594246607828e-06, + "loss": 0.4855, + "step": 3342 + }, + { + "epoch": 1.5806146572104018, + "grad_norm": 3.2645058631896973, + "learning_rate": 4.2251428024838215e-06, + "loss": 0.6654, + "step": 3343 + }, + { + "epoch": 1.5810874704491726, + "grad_norm": 2.2997004985809326, + "learning_rate": 4.224691250942737e-06, + "loss": 0.4565, + "step": 3344 + }, + { + "epoch": 1.5815602836879432, + "grad_norm": 2.8103034496307373, + "learning_rate": 4.2242395920126926e-06, + "loss": 0.5543, + "step": 3345 + }, + { + "epoch": 1.5820330969267138, + "grad_norm": 2.720254898071289, + "learning_rate": 4.223787825721808e-06, + "loss": 0.5028, + "step": 3346 + }, + { + "epoch": 1.5825059101654846, + "grad_norm": 2.735544204711914, + "learning_rate": 4.223335952098214e-06, + "loss": 0.5169, + "step": 3347 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 2.784254550933838, + "learning_rate": 4.222883971170047e-06, + "loss": 0.4989, + "step": 3348 + }, + { + "epoch": 1.583451536643026, + "grad_norm": 2.7192094326019287, + "learning_rate": 4.22243188296545e-06, + "loss": 0.502, + "step": 3349 + }, + { + "epoch": 1.5839243498817965, + "grad_norm": 2.716501474380493, + "learning_rate": 4.221979687512573e-06, + "loss": 0.5687, + "step": 3350 + }, + { + "epoch": 
1.5843971631205673, + "grad_norm": 2.8420114517211914, + "learning_rate": 4.22152738483957e-06, + "loss": 0.5903, + "step": 3351 + }, + { + "epoch": 1.5848699763593381, + "grad_norm": 2.734872579574585, + "learning_rate": 4.2210749749746065e-06, + "loss": 0.5397, + "step": 3352 + }, + { + "epoch": 1.5853427895981087, + "grad_norm": 2.4343836307525635, + "learning_rate": 4.220622457945851e-06, + "loss": 0.436, + "step": 3353 + }, + { + "epoch": 1.5858156028368793, + "grad_norm": 2.728177547454834, + "learning_rate": 4.2201698337814785e-06, + "loss": 0.5703, + "step": 3354 + }, + { + "epoch": 1.58628841607565, + "grad_norm": 2.502098560333252, + "learning_rate": 4.219717102509674e-06, + "loss": 0.5275, + "step": 3355 + }, + { + "epoch": 1.5867612293144209, + "grad_norm": 2.6595494747161865, + "learning_rate": 4.219264264158627e-06, + "loss": 0.4659, + "step": 3356 + }, + { + "epoch": 1.5872340425531914, + "grad_norm": 2.5307185649871826, + "learning_rate": 4.218811318756532e-06, + "loss": 0.5048, + "step": 3357 + }, + { + "epoch": 1.587706855791962, + "grad_norm": 2.9300129413604736, + "learning_rate": 4.218358266331593e-06, + "loss": 0.5137, + "step": 3358 + }, + { + "epoch": 1.5881796690307328, + "grad_norm": 2.686586618423462, + "learning_rate": 4.21790510691202e-06, + "loss": 0.4529, + "step": 3359 + }, + { + "epoch": 1.5886524822695036, + "grad_norm": 2.9981517791748047, + "learning_rate": 4.217451840526029e-06, + "loss": 0.6054, + "step": 3360 + }, + { + "epoch": 1.5891252955082742, + "grad_norm": 2.6943674087524414, + "learning_rate": 4.216998467201841e-06, + "loss": 0.5153, + "step": 3361 + }, + { + "epoch": 1.5895981087470448, + "grad_norm": 2.707084894180298, + "learning_rate": 4.216544986967689e-06, + "loss": 0.5235, + "step": 3362 + }, + { + "epoch": 1.5900709219858156, + "grad_norm": 2.6553728580474854, + "learning_rate": 4.216091399851808e-06, + "loss": 0.5275, + "step": 3363 + }, + { + "epoch": 1.5905437352245864, + "grad_norm": 2.9136953353881836, + "learning_rate": 4.215637705882439e-06, + "loss": 0.5834, + "step": 3364 + }, + { + "epoch": 1.591016548463357, + "grad_norm": 2.7647159099578857, + "learning_rate": 4.2151839050878325e-06, + "loss": 0.5641, + "step": 3365 + }, + { + "epoch": 1.5914893617021275, + "grad_norm": 2.4556827545166016, + "learning_rate": 4.214729997496246e-06, + "loss": 0.5636, + "step": 3366 + }, + { + "epoch": 1.5919621749408983, + "grad_norm": 2.6111652851104736, + "learning_rate": 4.2142759831359414e-06, + "loss": 0.5097, + "step": 3367 + }, + { + "epoch": 1.592434988179669, + "grad_norm": 2.4886903762817383, + "learning_rate": 4.213821862035189e-06, + "loss": 0.531, + "step": 3368 + }, + { + "epoch": 1.5929078014184397, + "grad_norm": 2.5245840549468994, + "learning_rate": 4.213367634222263e-06, + "loss": 0.5085, + "step": 3369 + }, + { + "epoch": 1.5933806146572103, + "grad_norm": 2.970214605331421, + "learning_rate": 4.212913299725447e-06, + "loss": 0.5851, + "step": 3370 + }, + { + "epoch": 1.593853427895981, + "grad_norm": 2.5433361530303955, + "learning_rate": 4.212458858573032e-06, + "loss": 0.48, + "step": 3371 + }, + { + "epoch": 1.5943262411347519, + "grad_norm": 2.3550102710723877, + "learning_rate": 4.212004310793312e-06, + "loss": 0.4405, + "step": 3372 + }, + { + "epoch": 1.5947990543735224, + "grad_norm": 2.4824719429016113, + "learning_rate": 4.2115496564145896e-06, + "loss": 0.4634, + "step": 3373 + }, + { + "epoch": 1.595271867612293, + "grad_norm": 2.4751930236816406, + "learning_rate": 4.211094895465176e-06, + "loss": 0.5662, + 
"step": 3374 + }, + { + "epoch": 1.5957446808510638, + "grad_norm": 2.4193356037139893, + "learning_rate": 4.210640027973386e-06, + "loss": 0.4441, + "step": 3375 + }, + { + "epoch": 1.5962174940898346, + "grad_norm": 2.4477498531341553, + "learning_rate": 4.210185053967543e-06, + "loss": 0.5205, + "step": 3376 + }, + { + "epoch": 1.5966903073286052, + "grad_norm": 2.7954161167144775, + "learning_rate": 4.209729973475976e-06, + "loss": 0.4951, + "step": 3377 + }, + { + "epoch": 1.5971631205673757, + "grad_norm": 3.1907570362091064, + "learning_rate": 4.209274786527019e-06, + "loss": 0.6024, + "step": 3378 + }, + { + "epoch": 1.5976359338061465, + "grad_norm": 2.485245704650879, + "learning_rate": 4.2088194931490165e-06, + "loss": 0.5652, + "step": 3379 + }, + { + "epoch": 1.5981087470449173, + "grad_norm": 2.589310884475708, + "learning_rate": 4.208364093370317e-06, + "loss": 0.5085, + "step": 3380 + }, + { + "epoch": 1.598581560283688, + "grad_norm": 2.8941214084625244, + "learning_rate": 4.207908587219276e-06, + "loss": 0.53, + "step": 3381 + }, + { + "epoch": 1.5990543735224585, + "grad_norm": 2.480509042739868, + "learning_rate": 4.207452974724258e-06, + "loss": 0.4543, + "step": 3382 + }, + { + "epoch": 1.5995271867612293, + "grad_norm": 2.7884905338287354, + "learning_rate": 4.206997255913629e-06, + "loss": 0.5483, + "step": 3383 + }, + { + "epoch": 1.6, + "grad_norm": 2.7976696491241455, + "learning_rate": 4.206541430815766e-06, + "loss": 0.4734, + "step": 3384 + }, + { + "epoch": 1.6004728132387707, + "grad_norm": 2.5463132858276367, + "learning_rate": 4.206085499459051e-06, + "loss": 0.4931, + "step": 3385 + }, + { + "epoch": 1.6009456264775412, + "grad_norm": 2.8384251594543457, + "learning_rate": 4.205629461871871e-06, + "loss": 0.5066, + "step": 3386 + }, + { + "epoch": 1.601418439716312, + "grad_norm": 2.8578574657440186, + "learning_rate": 4.205173318082626e-06, + "loss": 0.458, + "step": 3387 + }, + { + "epoch": 1.6018912529550828, + "grad_norm": 2.7779932022094727, + "learning_rate": 4.204717068119715e-06, + "loss": 0.5293, + "step": 3388 + }, + { + "epoch": 1.6023640661938534, + "grad_norm": 2.9123778343200684, + "learning_rate": 4.204260712011546e-06, + "loss": 0.4866, + "step": 3389 + }, + { + "epoch": 1.602836879432624, + "grad_norm": 2.757922887802124, + "learning_rate": 4.203804249786537e-06, + "loss": 0.4925, + "step": 3390 + }, + { + "epoch": 1.6033096926713948, + "grad_norm": 3.287733316421509, + "learning_rate": 4.203347681473107e-06, + "loss": 0.6694, + "step": 3391 + }, + { + "epoch": 1.6037825059101656, + "grad_norm": 3.2117912769317627, + "learning_rate": 4.202891007099687e-06, + "loss": 0.5269, + "step": 3392 + }, + { + "epoch": 1.6042553191489362, + "grad_norm": 2.8489456176757812, + "learning_rate": 4.20243422669471e-06, + "loss": 0.5073, + "step": 3393 + }, + { + "epoch": 1.6047281323877067, + "grad_norm": 2.7660224437713623, + "learning_rate": 4.201977340286619e-06, + "loss": 0.5014, + "step": 3394 + }, + { + "epoch": 1.6052009456264775, + "grad_norm": 2.68182110786438, + "learning_rate": 4.201520347903862e-06, + "loss": 0.4542, + "step": 3395 + }, + { + "epoch": 1.6056737588652483, + "grad_norm": 2.7546045780181885, + "learning_rate": 4.2010632495748934e-06, + "loss": 0.516, + "step": 3396 + }, + { + "epoch": 1.606146572104019, + "grad_norm": 2.744668483734131, + "learning_rate": 4.200606045328176e-06, + "loss": 0.5243, + "step": 3397 + }, + { + "epoch": 1.6066193853427895, + "grad_norm": 2.935343027114868, + "learning_rate": 4.200148735192177e-06, + 
"loss": 0.5624, + "step": 3398 + }, + { + "epoch": 1.6070921985815603, + "grad_norm": 2.7392852306365967, + "learning_rate": 4.19969131919537e-06, + "loss": 0.5796, + "step": 3399 + }, + { + "epoch": 1.607565011820331, + "grad_norm": 2.864750385284424, + "learning_rate": 4.199233797366239e-06, + "loss": 0.549, + "step": 3400 + }, + { + "epoch": 1.6080378250591016, + "grad_norm": 2.684157371520996, + "learning_rate": 4.198776169733269e-06, + "loss": 0.5532, + "step": 3401 + }, + { + "epoch": 1.6085106382978722, + "grad_norm": 2.4717135429382324, + "learning_rate": 4.198318436324957e-06, + "loss": 0.5174, + "step": 3402 + }, + { + "epoch": 1.608983451536643, + "grad_norm": 2.640242338180542, + "learning_rate": 4.197860597169802e-06, + "loss": 0.5117, + "step": 3403 + }, + { + "epoch": 1.6094562647754138, + "grad_norm": 2.4957473278045654, + "learning_rate": 4.197402652296313e-06, + "loss": 0.474, + "step": 3404 + }, + { + "epoch": 1.6099290780141844, + "grad_norm": 2.416138172149658, + "learning_rate": 4.196944601733004e-06, + "loss": 0.4858, + "step": 3405 + }, + { + "epoch": 1.610401891252955, + "grad_norm": 2.4498109817504883, + "learning_rate": 4.196486445508395e-06, + "loss": 0.5048, + "step": 3406 + }, + { + "epoch": 1.6108747044917258, + "grad_norm": 2.415895938873291, + "learning_rate": 4.196028183651014e-06, + "loss": 0.4745, + "step": 3407 + }, + { + "epoch": 1.6113475177304966, + "grad_norm": 2.843665838241577, + "learning_rate": 4.195569816189395e-06, + "loss": 0.5219, + "step": 3408 + }, + { + "epoch": 1.6118203309692671, + "grad_norm": 2.608579158782959, + "learning_rate": 4.195111343152079e-06, + "loss": 0.4941, + "step": 3409 + }, + { + "epoch": 1.6122931442080377, + "grad_norm": 2.643789529800415, + "learning_rate": 4.194652764567611e-06, + "loss": 0.515, + "step": 3410 + }, + { + "epoch": 1.6127659574468085, + "grad_norm": 2.8099429607391357, + "learning_rate": 4.194194080464547e-06, + "loss": 0.4935, + "step": 3411 + }, + { + "epoch": 1.6132387706855793, + "grad_norm": 2.595628261566162, + "learning_rate": 4.193735290871446e-06, + "loss": 0.5571, + "step": 3412 + }, + { + "epoch": 1.6137115839243499, + "grad_norm": 2.7903778553009033, + "learning_rate": 4.193276395816876e-06, + "loss": 0.5228, + "step": 3413 + }, + { + "epoch": 1.6141843971631205, + "grad_norm": 2.83910870552063, + "learning_rate": 4.192817395329409e-06, + "loss": 0.6124, + "step": 3414 + }, + { + "epoch": 1.6146572104018913, + "grad_norm": 2.6155734062194824, + "learning_rate": 4.192358289437626e-06, + "loss": 0.552, + "step": 3415 + }, + { + "epoch": 1.615130023640662, + "grad_norm": 2.795832872390747, + "learning_rate": 4.191899078170113e-06, + "loss": 0.5561, + "step": 3416 + }, + { + "epoch": 1.6156028368794326, + "grad_norm": 2.3402161598205566, + "learning_rate": 4.191439761555464e-06, + "loss": 0.4889, + "step": 3417 + }, + { + "epoch": 1.6160756501182032, + "grad_norm": 3.1183433532714844, + "learning_rate": 4.190980339622276e-06, + "loss": 0.5337, + "step": 3418 + }, + { + "epoch": 1.616548463356974, + "grad_norm": 2.6262872219085693, + "learning_rate": 4.190520812399158e-06, + "loss": 0.525, + "step": 3419 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 2.578340530395508, + "learning_rate": 4.190061179914722e-06, + "loss": 0.4975, + "step": 3420 + }, + { + "epoch": 1.6174940898345154, + "grad_norm": 3.19482159614563, + "learning_rate": 4.189601442197586e-06, + "loss": 0.5832, + "step": 3421 + }, + { + "epoch": 1.617966903073286, + "grad_norm": 2.6398792266845703, + "learning_rate": 
4.189141599276378e-06, + "loss": 0.4676, + "step": 3422 + }, + { + "epoch": 1.6184397163120567, + "grad_norm": 2.624865770339966, + "learning_rate": 4.1886816511797275e-06, + "loss": 0.4507, + "step": 3423 + }, + { + "epoch": 1.6189125295508275, + "grad_norm": 2.4136857986450195, + "learning_rate": 4.1882215979362775e-06, + "loss": 0.4616, + "step": 3424 + }, + { + "epoch": 1.6193853427895981, + "grad_norm": 2.6906614303588867, + "learning_rate": 4.18776143957467e-06, + "loss": 0.5142, + "step": 3425 + }, + { + "epoch": 1.6198581560283687, + "grad_norm": 2.5149154663085938, + "learning_rate": 4.187301176123558e-06, + "loss": 0.5252, + "step": 3426 + }, + { + "epoch": 1.6203309692671395, + "grad_norm": 2.677405834197998, + "learning_rate": 4.186840807611602e-06, + "loss": 0.4635, + "step": 3427 + }, + { + "epoch": 1.6208037825059103, + "grad_norm": 2.7164649963378906, + "learning_rate": 4.186380334067464e-06, + "loss": 0.5634, + "step": 3428 + }, + { + "epoch": 1.6212765957446809, + "grad_norm": 2.8299832344055176, + "learning_rate": 4.185919755519817e-06, + "loss": 0.5166, + "step": 3429 + }, + { + "epoch": 1.6217494089834514, + "grad_norm": 2.465848207473755, + "learning_rate": 4.18545907199734e-06, + "loss": 0.4696, + "step": 3430 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 2.407616376876831, + "learning_rate": 4.1849982835287175e-06, + "loss": 0.5111, + "step": 3431 + }, + { + "epoch": 1.622695035460993, + "grad_norm": 2.452146291732788, + "learning_rate": 4.184537390142639e-06, + "loss": 0.4574, + "step": 3432 + }, + { + "epoch": 1.6231678486997636, + "grad_norm": 2.653071165084839, + "learning_rate": 4.1840763918678055e-06, + "loss": 0.5611, + "step": 3433 + }, + { + "epoch": 1.6236406619385342, + "grad_norm": 2.5920350551605225, + "learning_rate": 4.183615288732919e-06, + "loss": 0.5437, + "step": 3434 + }, + { + "epoch": 1.624113475177305, + "grad_norm": 2.782900810241699, + "learning_rate": 4.18315408076669e-06, + "loss": 0.5824, + "step": 3435 + }, + { + "epoch": 1.6245862884160758, + "grad_norm": 2.8769774436950684, + "learning_rate": 4.1826927679978365e-06, + "loss": 0.5271, + "step": 3436 + }, + { + "epoch": 1.6250591016548463, + "grad_norm": 2.488598585128784, + "learning_rate": 4.182231350455084e-06, + "loss": 0.4684, + "step": 3437 + }, + { + "epoch": 1.625531914893617, + "grad_norm": 2.6472036838531494, + "learning_rate": 4.181769828167161e-06, + "loss": 0.5372, + "step": 3438 + }, + { + "epoch": 1.6260047281323877, + "grad_norm": 2.6498794555664062, + "learning_rate": 4.1813082011628045e-06, + "loss": 0.4805, + "step": 3439 + }, + { + "epoch": 1.6264775413711585, + "grad_norm": 2.5386533737182617, + "learning_rate": 4.1808464694707595e-06, + "loss": 0.5015, + "step": 3440 + }, + { + "epoch": 1.626950354609929, + "grad_norm": 2.8812551498413086, + "learning_rate": 4.180384633119775e-06, + "loss": 0.5225, + "step": 3441 + }, + { + "epoch": 1.6274231678486997, + "grad_norm": 2.870124578475952, + "learning_rate": 4.179922692138609e-06, + "loss": 0.537, + "step": 3442 + }, + { + "epoch": 1.6278959810874705, + "grad_norm": 2.5759785175323486, + "learning_rate": 4.179460646556021e-06, + "loss": 0.5142, + "step": 3443 + }, + { + "epoch": 1.6283687943262413, + "grad_norm": 2.629347324371338, + "learning_rate": 4.1789984964007836e-06, + "loss": 0.5007, + "step": 3444 + }, + { + "epoch": 1.6288416075650118, + "grad_norm": 2.751128673553467, + "learning_rate": 4.178536241701672e-06, + "loss": 0.5677, + "step": 3445 + }, + { + "epoch": 1.6293144208037824, + "grad_norm": 
2.7582364082336426, + "learning_rate": 4.178073882487469e-06, + "loss": 0.499, + "step": 3446 + }, + { + "epoch": 1.6297872340425532, + "grad_norm": 3.136711359024048, + "learning_rate": 4.177611418786963e-06, + "loss": 0.5294, + "step": 3447 + }, + { + "epoch": 1.630260047281324, + "grad_norm": 2.7363100051879883, + "learning_rate": 4.17714885062895e-06, + "loss": 0.5264, + "step": 3448 + }, + { + "epoch": 1.6307328605200946, + "grad_norm": 2.7305946350097656, + "learning_rate": 4.176686178042233e-06, + "loss": 0.5235, + "step": 3449 + }, + { + "epoch": 1.6312056737588652, + "grad_norm": 2.6500556468963623, + "learning_rate": 4.176223401055619e-06, + "loss": 0.5463, + "step": 3450 + }, + { + "epoch": 1.631678486997636, + "grad_norm": 2.756321907043457, + "learning_rate": 4.175760519697924e-06, + "loss": 0.545, + "step": 3451 + }, + { + "epoch": 1.6321513002364068, + "grad_norm": 2.6234960556030273, + "learning_rate": 4.17529753399797e-06, + "loss": 0.4927, + "step": 3452 + }, + { + "epoch": 1.6326241134751773, + "grad_norm": 2.6358842849731445, + "learning_rate": 4.174834443984584e-06, + "loss": 0.5445, + "step": 3453 + }, + { + "epoch": 1.633096926713948, + "grad_norm": 2.541147470474243, + "learning_rate": 4.174371249686601e-06, + "loss": 0.4691, + "step": 3454 + }, + { + "epoch": 1.6335697399527187, + "grad_norm": 2.566981077194214, + "learning_rate": 4.173907951132863e-06, + "loss": 0.4932, + "step": 3455 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 2.670940399169922, + "learning_rate": 4.173444548352216e-06, + "loss": 0.4979, + "step": 3456 + }, + { + "epoch": 1.63451536643026, + "grad_norm": 2.5440268516540527, + "learning_rate": 4.172981041373515e-06, + "loss": 0.4716, + "step": 3457 + }, + { + "epoch": 1.6349881796690307, + "grad_norm": 2.3801631927490234, + "learning_rate": 4.17251743022562e-06, + "loss": 0.5126, + "step": 3458 + }, + { + "epoch": 1.6354609929078014, + "grad_norm": 2.5051121711730957, + "learning_rate": 4.1720537149373985e-06, + "loss": 0.4964, + "step": 3459 + }, + { + "epoch": 1.6359338061465722, + "grad_norm": 3.5521697998046875, + "learning_rate": 4.171589895537724e-06, + "loss": 0.5447, + "step": 3460 + }, + { + "epoch": 1.6364066193853428, + "grad_norm": 2.6041572093963623, + "learning_rate": 4.171125972055477e-06, + "loss": 0.4637, + "step": 3461 + }, + { + "epoch": 1.6368794326241134, + "grad_norm": 2.2297258377075195, + "learning_rate": 4.170661944519543e-06, + "loss": 0.4702, + "step": 3462 + }, + { + "epoch": 1.6373522458628842, + "grad_norm": 2.6764535903930664, + "learning_rate": 4.170197812958815e-06, + "loss": 0.5111, + "step": 3463 + }, + { + "epoch": 1.637825059101655, + "grad_norm": 2.86892032623291, + "learning_rate": 4.169733577402193e-06, + "loss": 0.5437, + "step": 3464 + }, + { + "epoch": 1.6382978723404256, + "grad_norm": 2.9007070064544678, + "learning_rate": 4.1692692378785825e-06, + "loss": 0.5425, + "step": 3465 + }, + { + "epoch": 1.6387706855791961, + "grad_norm": 2.5902905464172363, + "learning_rate": 4.168804794416896e-06, + "loss": 0.5252, + "step": 3466 + }, + { + "epoch": 1.639243498817967, + "grad_norm": 2.821183681488037, + "learning_rate": 4.168340247046053e-06, + "loss": 0.5265, + "step": 3467 + }, + { + "epoch": 1.6397163120567377, + "grad_norm": 2.7928314208984375, + "learning_rate": 4.167875595794978e-06, + "loss": 0.5151, + "step": 3468 + }, + { + "epoch": 1.6401891252955083, + "grad_norm": 2.3130412101745605, + "learning_rate": 4.167410840692603e-06, + "loss": 0.4941, + "step": 3469 + }, + { + "epoch": 
1.6406619385342789, + "grad_norm": 2.6078619956970215, + "learning_rate": 4.1669459817678655e-06, + "loss": 0.493, + "step": 3470 + }, + { + "epoch": 1.6411347517730497, + "grad_norm": 2.5335731506347656, + "learning_rate": 4.166481019049712e-06, + "loss": 0.4969, + "step": 3471 + }, + { + "epoch": 1.6416075650118205, + "grad_norm": 2.8181469440460205, + "learning_rate": 4.166015952567093e-06, + "loss": 0.5062, + "step": 3472 + }, + { + "epoch": 1.642080378250591, + "grad_norm": 2.7256782054901123, + "learning_rate": 4.165550782348966e-06, + "loss": 0.5397, + "step": 3473 + }, + { + "epoch": 1.6425531914893616, + "grad_norm": 2.284345865249634, + "learning_rate": 4.1650855084242946e-06, + "loss": 0.4448, + "step": 3474 + }, + { + "epoch": 1.6430260047281324, + "grad_norm": 3.0383145809173584, + "learning_rate": 4.164620130822049e-06, + "loss": 0.5873, + "step": 3475 + }, + { + "epoch": 1.6434988179669032, + "grad_norm": 2.754448652267456, + "learning_rate": 4.1641546495712085e-06, + "loss": 0.4852, + "step": 3476 + }, + { + "epoch": 1.6439716312056738, + "grad_norm": 2.6820101737976074, + "learning_rate": 4.1636890647007535e-06, + "loss": 0.5325, + "step": 3477 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 2.6396398544311523, + "learning_rate": 4.163223376239676e-06, + "loss": 0.466, + "step": 3478 + }, + { + "epoch": 1.6449172576832152, + "grad_norm": 2.395049810409546, + "learning_rate": 4.162757584216972e-06, + "loss": 0.4531, + "step": 3479 + }, + { + "epoch": 1.645390070921986, + "grad_norm": 2.596670627593994, + "learning_rate": 4.162291688661645e-06, + "loss": 0.5207, + "step": 3480 + }, + { + "epoch": 1.6458628841607565, + "grad_norm": 2.4391872882843018, + "learning_rate": 4.161825689602703e-06, + "loss": 0.5133, + "step": 3481 + }, + { + "epoch": 1.6463356973995271, + "grad_norm": 2.6169841289520264, + "learning_rate": 4.161359587069162e-06, + "loss": 0.5096, + "step": 3482 + }, + { + "epoch": 1.646808510638298, + "grad_norm": 2.634089946746826, + "learning_rate": 4.1608933810900445e-06, + "loss": 0.4921, + "step": 3483 + }, + { + "epoch": 1.6472813238770687, + "grad_norm": 2.815877914428711, + "learning_rate": 4.160427071694379e-06, + "loss": 0.5045, + "step": 3484 + }, + { + "epoch": 1.6477541371158393, + "grad_norm": 2.417525053024292, + "learning_rate": 4.159960658911199e-06, + "loss": 0.4997, + "step": 3485 + }, + { + "epoch": 1.6482269503546099, + "grad_norm": 2.5713605880737305, + "learning_rate": 4.15949414276955e-06, + "loss": 0.5246, + "step": 3486 + }, + { + "epoch": 1.6486997635933807, + "grad_norm": 3.49833607673645, + "learning_rate": 4.159027523298475e-06, + "loss": 0.4901, + "step": 3487 + }, + { + "epoch": 1.6491725768321515, + "grad_norm": 2.985464334487915, + "learning_rate": 4.158560800527033e-06, + "loss": 0.5726, + "step": 3488 + }, + { + "epoch": 1.649645390070922, + "grad_norm": 2.72745680809021, + "learning_rate": 4.158093974484282e-06, + "loss": 0.5119, + "step": 3489 + }, + { + "epoch": 1.6501182033096926, + "grad_norm": 2.4885571002960205, + "learning_rate": 4.157627045199289e-06, + "loss": 0.4838, + "step": 3490 + }, + { + "epoch": 1.6505910165484634, + "grad_norm": 2.7622628211975098, + "learning_rate": 4.157160012701128e-06, + "loss": 0.5269, + "step": 3491 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 2.615122079849243, + "learning_rate": 4.156692877018879e-06, + "loss": 0.5501, + "step": 3492 + }, + { + "epoch": 1.6515366430260048, + "grad_norm": 2.827753782272339, + "learning_rate": 4.156225638181631e-06, + "loss": 0.5452, + 
"step": 3493 + }, + { + "epoch": 1.6520094562647754, + "grad_norm": 2.724820137023926, + "learning_rate": 4.155758296218474e-06, + "loss": 0.5155, + "step": 3494 + }, + { + "epoch": 1.6524822695035462, + "grad_norm": 2.5806174278259277, + "learning_rate": 4.155290851158508e-06, + "loss": 0.5292, + "step": 3495 + }, + { + "epoch": 1.652955082742317, + "grad_norm": 2.5655179023742676, + "learning_rate": 4.154823303030838e-06, + "loss": 0.4959, + "step": 3496 + }, + { + "epoch": 1.6534278959810875, + "grad_norm": 2.656548261642456, + "learning_rate": 4.154355651864579e-06, + "loss": 0.5703, + "step": 3497 + }, + { + "epoch": 1.653900709219858, + "grad_norm": 2.9085004329681396, + "learning_rate": 4.153887897688847e-06, + "loss": 0.5061, + "step": 3498 + }, + { + "epoch": 1.654373522458629, + "grad_norm": 2.608010768890381, + "learning_rate": 4.1534200405327665e-06, + "loss": 0.5165, + "step": 3499 + }, + { + "epoch": 1.6548463356973995, + "grad_norm": 2.600463628768921, + "learning_rate": 4.152952080425471e-06, + "loss": 0.4946, + "step": 3500 + }, + { + "epoch": 1.65531914893617, + "grad_norm": 2.5561563968658447, + "learning_rate": 4.152484017396098e-06, + "loss": 0.4804, + "step": 3501 + }, + { + "epoch": 1.6557919621749408, + "grad_norm": 2.788594961166382, + "learning_rate": 4.152015851473791e-06, + "loss": 0.5635, + "step": 3502 + }, + { + "epoch": 1.6562647754137116, + "grad_norm": 2.693302631378174, + "learning_rate": 4.151547582687699e-06, + "loss": 0.5139, + "step": 3503 + }, + { + "epoch": 1.6567375886524822, + "grad_norm": 2.7887485027313232, + "learning_rate": 4.1510792110669825e-06, + "loss": 0.4952, + "step": 3504 + }, + { + "epoch": 1.6572104018912528, + "grad_norm": 2.8982298374176025, + "learning_rate": 4.150610736640803e-06, + "loss": 0.4136, + "step": 3505 + }, + { + "epoch": 1.6576832151300236, + "grad_norm": 2.7569408416748047, + "learning_rate": 4.150142159438331e-06, + "loss": 0.5272, + "step": 3506 + }, + { + "epoch": 1.6581560283687944, + "grad_norm": 2.531648874282837, + "learning_rate": 4.149673479488742e-06, + "loss": 0.5016, + "step": 3507 + }, + { + "epoch": 1.658628841607565, + "grad_norm": 2.7706353664398193, + "learning_rate": 4.149204696821219e-06, + "loss": 0.5512, + "step": 3508 + }, + { + "epoch": 1.6591016548463355, + "grad_norm": 2.7307450771331787, + "learning_rate": 4.148735811464951e-06, + "loss": 0.4968, + "step": 3509 + }, + { + "epoch": 1.6595744680851063, + "grad_norm": 3.0097429752349854, + "learning_rate": 4.1482668234491335e-06, + "loss": 0.4797, + "step": 3510 + }, + { + "epoch": 1.6600472813238771, + "grad_norm": 2.6045308113098145, + "learning_rate": 4.147797732802969e-06, + "loss": 0.5496, + "step": 3511 + }, + { + "epoch": 1.6605200945626477, + "grad_norm": 2.702061176300049, + "learning_rate": 4.147328539555664e-06, + "loss": 0.5302, + "step": 3512 + }, + { + "epoch": 1.6609929078014183, + "grad_norm": 3.3724892139434814, + "learning_rate": 4.1468592437364356e-06, + "loss": 0.5124, + "step": 3513 + }, + { + "epoch": 1.661465721040189, + "grad_norm": 2.5117242336273193, + "learning_rate": 4.146389845374502e-06, + "loss": 0.4953, + "step": 3514 + }, + { + "epoch": 1.6619385342789599, + "grad_norm": 2.86547589302063, + "learning_rate": 4.145920344499092e-06, + "loss": 0.5337, + "step": 3515 + }, + { + "epoch": 1.6624113475177305, + "grad_norm": 2.745149850845337, + "learning_rate": 4.14545074113944e-06, + "loss": 0.5187, + "step": 3516 + }, + { + "epoch": 1.662884160756501, + "grad_norm": 2.5560994148254395, + "learning_rate": 
4.1449810353247855e-06, + "loss": 0.5183, + "step": 3517 + }, + { + "epoch": 1.6633569739952718, + "grad_norm": 2.2318122386932373, + "learning_rate": 4.144511227084374e-06, + "loss": 0.4452, + "step": 3518 + }, + { + "epoch": 1.6638297872340426, + "grad_norm": 2.6980903148651123, + "learning_rate": 4.14404131644746e-06, + "loss": 0.4974, + "step": 3519 + }, + { + "epoch": 1.6643026004728132, + "grad_norm": 2.6875357627868652, + "learning_rate": 4.1435713034433025e-06, + "loss": 0.4582, + "step": 3520 + }, + { + "epoch": 1.6647754137115838, + "grad_norm": 2.9430019855499268, + "learning_rate": 4.143101188101166e-06, + "loss": 0.5004, + "step": 3521 + }, + { + "epoch": 1.6652482269503546, + "grad_norm": 2.4447221755981445, + "learning_rate": 4.142630970450323e-06, + "loss": 0.5436, + "step": 3522 + }, + { + "epoch": 1.6657210401891254, + "grad_norm": 2.571023941040039, + "learning_rate": 4.142160650520053e-06, + "loss": 0.5307, + "step": 3523 + }, + { + "epoch": 1.666193853427896, + "grad_norm": 2.9725306034088135, + "learning_rate": 4.14169022833964e-06, + "loss": 0.5918, + "step": 3524 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.5958926677703857, + "learning_rate": 4.141219703938375e-06, + "loss": 0.5036, + "step": 3525 + }, + { + "epoch": 1.6671394799054373, + "grad_norm": 2.935788631439209, + "learning_rate": 4.140749077345556e-06, + "loss": 0.5773, + "step": 3526 + }, + { + "epoch": 1.6676122931442081, + "grad_norm": 2.5460526943206787, + "learning_rate": 4.140278348590485e-06, + "loss": 0.4762, + "step": 3527 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 2.5729143619537354, + "learning_rate": 4.139807517702475e-06, + "loss": 0.5515, + "step": 3528 + }, + { + "epoch": 1.6685579196217493, + "grad_norm": 2.4377381801605225, + "learning_rate": 4.13933658471084e-06, + "loss": 0.5383, + "step": 3529 + }, + { + "epoch": 1.66903073286052, + "grad_norm": 2.6284425258636475, + "learning_rate": 4.138865549644905e-06, + "loss": 0.5396, + "step": 3530 + }, + { + "epoch": 1.6695035460992909, + "grad_norm": 2.857250928878784, + "learning_rate": 4.138394412533998e-06, + "loss": 0.5861, + "step": 3531 + }, + { + "epoch": 1.6699763593380614, + "grad_norm": 2.9226012229919434, + "learning_rate": 4.137923173407456e-06, + "loss": 0.5262, + "step": 3532 + }, + { + "epoch": 1.670449172576832, + "grad_norm": 4.839131832122803, + "learning_rate": 4.137451832294619e-06, + "loss": 0.651, + "step": 3533 + }, + { + "epoch": 1.6709219858156028, + "grad_norm": 2.4727771282196045, + "learning_rate": 4.1369803892248375e-06, + "loss": 0.5149, + "step": 3534 + }, + { + "epoch": 1.6713947990543736, + "grad_norm": 2.5391688346862793, + "learning_rate": 4.1365088442274635e-06, + "loss": 0.4907, + "step": 3535 + }, + { + "epoch": 1.6718676122931442, + "grad_norm": 2.5168209075927734, + "learning_rate": 4.136037197331862e-06, + "loss": 0.5091, + "step": 3536 + }, + { + "epoch": 1.6723404255319148, + "grad_norm": 2.6278600692749023, + "learning_rate": 4.135565448567396e-06, + "loss": 0.4357, + "step": 3537 + }, + { + "epoch": 1.6728132387706856, + "grad_norm": 2.835184097290039, + "learning_rate": 4.135093597963441e-06, + "loss": 0.4786, + "step": 3538 + }, + { + "epoch": 1.6732860520094563, + "grad_norm": 2.385328531265259, + "learning_rate": 4.134621645549379e-06, + "loss": 0.4849, + "step": 3539 + }, + { + "epoch": 1.673758865248227, + "grad_norm": 2.6504149436950684, + "learning_rate": 4.134149591354593e-06, + "loss": 0.6037, + "step": 3540 + }, + { + "epoch": 1.6742316784869975, + "grad_norm": 
2.945634126663208, + "learning_rate": 4.1336774354084786e-06, + "loss": 0.532, + "step": 3541 + }, + { + "epoch": 1.6747044917257683, + "grad_norm": 2.8373215198516846, + "learning_rate": 4.133205177740434e-06, + "loss": 0.5138, + "step": 3542 + }, + { + "epoch": 1.675177304964539, + "grad_norm": 2.6616621017456055, + "learning_rate": 4.1327328183798634e-06, + "loss": 0.5543, + "step": 3543 + }, + { + "epoch": 1.6756501182033097, + "grad_norm": 3.0843071937561035, + "learning_rate": 4.13226035735618e-06, + "loss": 0.6585, + "step": 3544 + }, + { + "epoch": 1.6761229314420802, + "grad_norm": 2.2214272022247314, + "learning_rate": 4.131787794698802e-06, + "loss": 0.5413, + "step": 3545 + }, + { + "epoch": 1.676595744680851, + "grad_norm": 2.4515018463134766, + "learning_rate": 4.131315130437152e-06, + "loss": 0.4966, + "step": 3546 + }, + { + "epoch": 1.6770685579196218, + "grad_norm": 2.647414207458496, + "learning_rate": 4.130842364600663e-06, + "loss": 0.5401, + "step": 3547 + }, + { + "epoch": 1.6775413711583924, + "grad_norm": 2.648941993713379, + "learning_rate": 4.13036949721877e-06, + "loss": 0.4796, + "step": 3548 + }, + { + "epoch": 1.678014184397163, + "grad_norm": 2.7835679054260254, + "learning_rate": 4.129896528320919e-06, + "loss": 0.5653, + "step": 3549 + }, + { + "epoch": 1.6784869976359338, + "grad_norm": 2.995964288711548, + "learning_rate": 4.129423457936556e-06, + "loss": 0.4999, + "step": 3550 + }, + { + "epoch": 1.6789598108747046, + "grad_norm": 2.5980007648468018, + "learning_rate": 4.1289502860951405e-06, + "loss": 0.5177, + "step": 3551 + }, + { + "epoch": 1.6794326241134752, + "grad_norm": 2.442254066467285, + "learning_rate": 4.128477012826133e-06, + "loss": 0.5062, + "step": 3552 + }, + { + "epoch": 1.6799054373522457, + "grad_norm": 2.3007538318634033, + "learning_rate": 4.1280036381590025e-06, + "loss": 0.5029, + "step": 3553 + }, + { + "epoch": 1.6803782505910165, + "grad_norm": 2.4169347286224365, + "learning_rate": 4.1275301621232245e-06, + "loss": 0.515, + "step": 3554 + }, + { + "epoch": 1.6808510638297873, + "grad_norm": 2.6456379890441895, + "learning_rate": 4.127056584748279e-06, + "loss": 0.5343, + "step": 3555 + }, + { + "epoch": 1.681323877068558, + "grad_norm": 2.6406595706939697, + "learning_rate": 4.1265829060636546e-06, + "loss": 0.5047, + "step": 3556 + }, + { + "epoch": 1.6817966903073285, + "grad_norm": 2.9344475269317627, + "learning_rate": 4.126109126098846e-06, + "loss": 0.5501, + "step": 3557 + }, + { + "epoch": 1.6822695035460993, + "grad_norm": 2.3292455673217773, + "learning_rate": 4.125635244883351e-06, + "loss": 0.463, + "step": 3558 + }, + { + "epoch": 1.68274231678487, + "grad_norm": 2.4150657653808594, + "learning_rate": 4.125161262446677e-06, + "loss": 0.4802, + "step": 3559 + }, + { + "epoch": 1.6832151300236406, + "grad_norm": 2.604292392730713, + "learning_rate": 4.124687178818339e-06, + "loss": 0.5683, + "step": 3560 + }, + { + "epoch": 1.6836879432624112, + "grad_norm": 2.5676791667938232, + "learning_rate": 4.1242129940278544e-06, + "loss": 0.5519, + "step": 3561 + }, + { + "epoch": 1.684160756501182, + "grad_norm": 3.078514814376831, + "learning_rate": 4.123738708104748e-06, + "loss": 0.5194, + "step": 3562 + }, + { + "epoch": 1.6846335697399528, + "grad_norm": 2.893577814102173, + "learning_rate": 4.123264321078552e-06, + "loss": 0.5107, + "step": 3563 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 2.772413730621338, + "learning_rate": 4.122789832978804e-06, + "loss": 0.6147, + "step": 3564 + }, + { + "epoch": 
1.685579196217494, + "grad_norm": 2.5804643630981445, + "learning_rate": 4.12231524383505e-06, + "loss": 0.5057, + "step": 3565 + }, + { + "epoch": 1.6860520094562648, + "grad_norm": 2.599571466445923, + "learning_rate": 4.121840553676839e-06, + "loss": 0.5591, + "step": 3566 + }, + { + "epoch": 1.6865248226950356, + "grad_norm": 2.9124577045440674, + "learning_rate": 4.1213657625337275e-06, + "loss": 0.565, + "step": 3567 + }, + { + "epoch": 1.6869976359338061, + "grad_norm": 2.6582155227661133, + "learning_rate": 4.120890870435281e-06, + "loss": 0.4607, + "step": 3568 + }, + { + "epoch": 1.6874704491725767, + "grad_norm": 2.929227590560913, + "learning_rate": 4.120415877411066e-06, + "loss": 0.5705, + "step": 3569 + }, + { + "epoch": 1.6879432624113475, + "grad_norm": 2.4443247318267822, + "learning_rate": 4.11994078349066e-06, + "loss": 0.4592, + "step": 3570 + }, + { + "epoch": 1.6884160756501183, + "grad_norm": 2.4799163341522217, + "learning_rate": 4.119465588703645e-06, + "loss": 0.5361, + "step": 3571 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 2.9408936500549316, + "learning_rate": 4.1189902930796085e-06, + "loss": 0.5347, + "step": 3572 + }, + { + "epoch": 1.6893617021276595, + "grad_norm": 3.3348076343536377, + "learning_rate": 4.118514896648146e-06, + "loss": 0.5612, + "step": 3573 + }, + { + "epoch": 1.6898345153664303, + "grad_norm": 2.764889717102051, + "learning_rate": 4.118039399438857e-06, + "loss": 0.4745, + "step": 3574 + }, + { + "epoch": 1.690307328605201, + "grad_norm": 2.7023751735687256, + "learning_rate": 4.11756380148135e-06, + "loss": 0.5106, + "step": 3575 + }, + { + "epoch": 1.6907801418439716, + "grad_norm": 2.8816208839416504, + "learning_rate": 4.117088102805238e-06, + "loss": 0.6016, + "step": 3576 + }, + { + "epoch": 1.6912529550827422, + "grad_norm": 2.215733289718628, + "learning_rate": 4.11661230344014e-06, + "loss": 0.4404, + "step": 3577 + }, + { + "epoch": 1.691725768321513, + "grad_norm": 2.8190999031066895, + "learning_rate": 4.116136403415683e-06, + "loss": 0.5038, + "step": 3578 + }, + { + "epoch": 1.6921985815602838, + "grad_norm": 2.616424083709717, + "learning_rate": 4.115660402761499e-06, + "loss": 0.5493, + "step": 3579 + }, + { + "epoch": 1.6926713947990544, + "grad_norm": 2.7738113403320312, + "learning_rate": 4.115184301507226e-06, + "loss": 0.5416, + "step": 3580 + }, + { + "epoch": 1.693144208037825, + "grad_norm": 2.4793593883514404, + "learning_rate": 4.114708099682509e-06, + "loss": 0.4526, + "step": 3581 + }, + { + "epoch": 1.6936170212765957, + "grad_norm": 2.390652894973755, + "learning_rate": 4.114231797316999e-06, + "loss": 0.4908, + "step": 3582 + }, + { + "epoch": 1.6940898345153665, + "grad_norm": 2.513197660446167, + "learning_rate": 4.113755394440352e-06, + "loss": 0.4738, + "step": 3583 + }, + { + "epoch": 1.6945626477541371, + "grad_norm": 2.504497766494751, + "learning_rate": 4.113278891082234e-06, + "loss": 0.4661, + "step": 3584 + }, + { + "epoch": 1.6950354609929077, + "grad_norm": 2.4966917037963867, + "learning_rate": 4.112802287272314e-06, + "loss": 0.4979, + "step": 3585 + }, + { + "epoch": 1.6955082742316785, + "grad_norm": 2.3129689693450928, + "learning_rate": 4.112325583040265e-06, + "loss": 0.4933, + "step": 3586 + }, + { + "epoch": 1.6959810874704493, + "grad_norm": 2.822136878967285, + "learning_rate": 4.111848778415774e-06, + "loss": 0.5087, + "step": 3587 + }, + { + "epoch": 1.6964539007092199, + "grad_norm": 2.5181210041046143, + "learning_rate": 4.111371873428527e-06, + "loss": 0.4836, + 
"step": 3588 + }, + { + "epoch": 1.6969267139479904, + "grad_norm": 2.7564687728881836, + "learning_rate": 4.110894868108218e-06, + "loss": 0.5224, + "step": 3589 + }, + { + "epoch": 1.6973995271867612, + "grad_norm": 2.424421787261963, + "learning_rate": 4.11041776248455e-06, + "loss": 0.4552, + "step": 3590 + }, + { + "epoch": 1.697872340425532, + "grad_norm": 2.7013823986053467, + "learning_rate": 4.10994055658723e-06, + "loss": 0.5535, + "step": 3591 + }, + { + "epoch": 1.6983451536643026, + "grad_norm": 2.5660946369171143, + "learning_rate": 4.10946325044597e-06, + "loss": 0.5351, + "step": 3592 + }, + { + "epoch": 1.6988179669030732, + "grad_norm": 2.5598108768463135, + "learning_rate": 4.10898584409049e-06, + "loss": 0.5246, + "step": 3593 + }, + { + "epoch": 1.699290780141844, + "grad_norm": 2.6318907737731934, + "learning_rate": 4.108508337550518e-06, + "loss": 0.5002, + "step": 3594 + }, + { + "epoch": 1.6997635933806148, + "grad_norm": 2.527099132537842, + "learning_rate": 4.108030730855784e-06, + "loss": 0.5366, + "step": 3595 + }, + { + "epoch": 1.7002364066193854, + "grad_norm": 2.8629603385925293, + "learning_rate": 4.107553024036029e-06, + "loss": 0.5742, + "step": 3596 + }, + { + "epoch": 1.700709219858156, + "grad_norm": 2.8084018230438232, + "learning_rate": 4.107075217120994e-06, + "loss": 0.5618, + "step": 3597 + }, + { + "epoch": 1.7011820330969267, + "grad_norm": 3.6470065116882324, + "learning_rate": 4.1065973101404325e-06, + "loss": 0.508, + "step": 3598 + }, + { + "epoch": 1.7016548463356975, + "grad_norm": 3.0332422256469727, + "learning_rate": 4.106119303124102e-06, + "loss": 0.51, + "step": 3599 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 2.4887590408325195, + "learning_rate": 4.105641196101765e-06, + "loss": 0.5109, + "step": 3600 + }, + { + "epoch": 1.7026004728132387, + "grad_norm": 2.6102066040039062, + "learning_rate": 4.105162989103191e-06, + "loss": 0.5278, + "step": 3601 + }, + { + "epoch": 1.7030732860520095, + "grad_norm": 2.771578073501587, + "learning_rate": 4.104684682158156e-06, + "loss": 0.498, + "step": 3602 + }, + { + "epoch": 1.7035460992907803, + "grad_norm": 2.5452702045440674, + "learning_rate": 4.1042062752964425e-06, + "loss": 0.4939, + "step": 3603 + }, + { + "epoch": 1.7040189125295508, + "grad_norm": 2.4287021160125732, + "learning_rate": 4.103727768547838e-06, + "loss": 0.4819, + "step": 3604 + }, + { + "epoch": 1.7044917257683214, + "grad_norm": 2.412280321121216, + "learning_rate": 4.103249161942138e-06, + "loss": 0.5196, + "step": 3605 + }, + { + "epoch": 1.7049645390070922, + "grad_norm": 2.8850717544555664, + "learning_rate": 4.102770455509142e-06, + "loss": 0.5724, + "step": 3606 + }, + { + "epoch": 1.705437352245863, + "grad_norm": 2.7979609966278076, + "learning_rate": 4.102291649278659e-06, + "loss": 0.5295, + "step": 3607 + }, + { + "epoch": 1.7059101654846336, + "grad_norm": 2.762238025665283, + "learning_rate": 4.1018127432805e-06, + "loss": 0.5166, + "step": 3608 + }, + { + "epoch": 1.7063829787234042, + "grad_norm": 2.921586513519287, + "learning_rate": 4.101333737544485e-06, + "loss": 0.5607, + "step": 3609 + }, + { + "epoch": 1.706855791962175, + "grad_norm": 3.001929998397827, + "learning_rate": 4.100854632100439e-06, + "loss": 0.6255, + "step": 3610 + }, + { + "epoch": 1.7073286052009458, + "grad_norm": 2.752713918685913, + "learning_rate": 4.100375426978196e-06, + "loss": 0.5732, + "step": 3611 + }, + { + "epoch": 1.7078014184397163, + "grad_norm": 2.6496472358703613, + "learning_rate": 
4.099896122207593e-06, + "loss": 0.5138, + "step": 3612 + }, + { + "epoch": 1.708274231678487, + "grad_norm": 3.0079452991485596, + "learning_rate": 4.099416717818473e-06, + "loss": 0.5746, + "step": 3613 + }, + { + "epoch": 1.7087470449172577, + "grad_norm": 2.5762360095977783, + "learning_rate": 4.098937213840687e-06, + "loss": 0.5308, + "step": 3614 + }, + { + "epoch": 1.7092198581560285, + "grad_norm": 2.6026158332824707, + "learning_rate": 4.098457610304092e-06, + "loss": 0.4857, + "step": 3615 + }, + { + "epoch": 1.709692671394799, + "grad_norm": 2.587583541870117, + "learning_rate": 4.097977907238551e-06, + "loss": 0.4591, + "step": 3616 + }, + { + "epoch": 1.7101654846335697, + "grad_norm": 2.6996991634368896, + "learning_rate": 4.097498104673932e-06, + "loss": 0.5298, + "step": 3617 + }, + { + "epoch": 1.7106382978723405, + "grad_norm": 2.600029945373535, + "learning_rate": 4.097018202640111e-06, + "loss": 0.4726, + "step": 3618 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 2.8261220455169678, + "learning_rate": 4.096538201166969e-06, + "loss": 0.5242, + "step": 3619 + }, + { + "epoch": 1.7115839243498818, + "grad_norm": 3.053027629852295, + "learning_rate": 4.096058100284394e-06, + "loss": 0.5568, + "step": 3620 + }, + { + "epoch": 1.7120567375886524, + "grad_norm": 2.9638442993164062, + "learning_rate": 4.0955779000222805e-06, + "loss": 0.5325, + "step": 3621 + }, + { + "epoch": 1.7125295508274232, + "grad_norm": 2.731095790863037, + "learning_rate": 4.095097600410527e-06, + "loss": 0.4733, + "step": 3622 + }, + { + "epoch": 1.713002364066194, + "grad_norm": 2.632490873336792, + "learning_rate": 4.09461720147904e-06, + "loss": 0.5253, + "step": 3623 + }, + { + "epoch": 1.7134751773049646, + "grad_norm": 2.847689390182495, + "learning_rate": 4.094136703257732e-06, + "loss": 0.57, + "step": 3624 + }, + { + "epoch": 1.7139479905437351, + "grad_norm": 3.1078696250915527, + "learning_rate": 4.0936561057765215e-06, + "loss": 0.5368, + "step": 3625 + }, + { + "epoch": 1.714420803782506, + "grad_norm": 2.696349620819092, + "learning_rate": 4.0931754090653334e-06, + "loss": 0.491, + "step": 3626 + }, + { + "epoch": 1.7148936170212767, + "grad_norm": 2.712958812713623, + "learning_rate": 4.092694613154099e-06, + "loss": 0.5768, + "step": 3627 + }, + { + "epoch": 1.7153664302600473, + "grad_norm": 2.5421478748321533, + "learning_rate": 4.092213718072754e-06, + "loss": 0.4839, + "step": 3628 + }, + { + "epoch": 1.715839243498818, + "grad_norm": 2.5176162719726562, + "learning_rate": 4.091732723851243e-06, + "loss": 0.5049, + "step": 3629 + }, + { + "epoch": 1.7163120567375887, + "grad_norm": 2.642185926437378, + "learning_rate": 4.091251630519514e-06, + "loss": 0.589, + "step": 3630 + }, + { + "epoch": 1.7167848699763595, + "grad_norm": 2.587348461151123, + "learning_rate": 4.0907704381075245e-06, + "loss": 0.5281, + "step": 3631 + }, + { + "epoch": 1.71725768321513, + "grad_norm": 2.4628195762634277, + "learning_rate": 4.090289146645234e-06, + "loss": 0.5592, + "step": 3632 + }, + { + "epoch": 1.7177304964539006, + "grad_norm": 2.2751028537750244, + "learning_rate": 4.0898077561626125e-06, + "loss": 0.502, + "step": 3633 + }, + { + "epoch": 1.7182033096926714, + "grad_norm": 2.7712769508361816, + "learning_rate": 4.089326266689632e-06, + "loss": 0.5143, + "step": 3634 + }, + { + "epoch": 1.7186761229314422, + "grad_norm": 2.5297727584838867, + "learning_rate": 4.088844678256275e-06, + "loss": 0.5035, + "step": 3635 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 
2.739130735397339, + "learning_rate": 4.088362990892527e-06, + "loss": 0.5959, + "step": 3636 + }, + { + "epoch": 1.7196217494089834, + "grad_norm": 2.3708314895629883, + "learning_rate": 4.08788120462838e-06, + "loss": 0.4796, + "step": 3637 + }, + { + "epoch": 1.7200945626477542, + "grad_norm": 2.7664241790771484, + "learning_rate": 4.087399319493832e-06, + "loss": 0.6052, + "step": 3638 + }, + { + "epoch": 1.720567375886525, + "grad_norm": 2.5900204181671143, + "learning_rate": 4.0869173355188895e-06, + "loss": 0.4955, + "step": 3639 + }, + { + "epoch": 1.7210401891252955, + "grad_norm": 2.6771862506866455, + "learning_rate": 4.0864352527335635e-06, + "loss": 0.4889, + "step": 3640 + }, + { + "epoch": 1.7215130023640661, + "grad_norm": 2.888479471206665, + "learning_rate": 4.085953071167871e-06, + "loss": 0.5719, + "step": 3641 + }, + { + "epoch": 1.721985815602837, + "grad_norm": 2.5967187881469727, + "learning_rate": 4.085470790851833e-06, + "loss": 0.4959, + "step": 3642 + }, + { + "epoch": 1.7224586288416077, + "grad_norm": 2.5317695140838623, + "learning_rate": 4.084988411815483e-06, + "loss": 0.4596, + "step": 3643 + }, + { + "epoch": 1.7229314420803783, + "grad_norm": 2.6531455516815186, + "learning_rate": 4.084505934088853e-06, + "loss": 0.5346, + "step": 3644 + }, + { + "epoch": 1.7234042553191489, + "grad_norm": 2.6525208950042725, + "learning_rate": 4.084023357701987e-06, + "loss": 0.5178, + "step": 3645 + }, + { + "epoch": 1.7238770685579197, + "grad_norm": 2.461954116821289, + "learning_rate": 4.083540682684932e-06, + "loss": 0.4802, + "step": 3646 + }, + { + "epoch": 1.7243498817966905, + "grad_norm": 2.794696807861328, + "learning_rate": 4.083057909067743e-06, + "loss": 0.5148, + "step": 3647 + }, + { + "epoch": 1.724822695035461, + "grad_norm": 2.867572546005249, + "learning_rate": 4.082575036880479e-06, + "loss": 0.5352, + "step": 3648 + }, + { + "epoch": 1.7252955082742316, + "grad_norm": 2.642820358276367, + "learning_rate": 4.082092066153207e-06, + "loss": 0.4652, + "step": 3649 + }, + { + "epoch": 1.7257683215130024, + "grad_norm": 2.782142400741577, + "learning_rate": 4.081608996915999e-06, + "loss": 0.5591, + "step": 3650 + }, + { + "epoch": 1.7262411347517732, + "grad_norm": 2.327331304550171, + "learning_rate": 4.081125829198934e-06, + "loss": 0.4339, + "step": 3651 + }, + { + "epoch": 1.7267139479905438, + "grad_norm": 2.7959988117218018, + "learning_rate": 4.0806425630320965e-06, + "loss": 0.5783, + "step": 3652 + }, + { + "epoch": 1.7271867612293144, + "grad_norm": 2.595053195953369, + "learning_rate": 4.080159198445578e-06, + "loss": 0.4602, + "step": 3653 + }, + { + "epoch": 1.7276595744680852, + "grad_norm": 3.0968129634857178, + "learning_rate": 4.079675735469475e-06, + "loss": 0.5775, + "step": 3654 + }, + { + "epoch": 1.728132387706856, + "grad_norm": 2.628044605255127, + "learning_rate": 4.07919217413389e-06, + "loss": 0.486, + "step": 3655 + }, + { + "epoch": 1.7286052009456265, + "grad_norm": 2.782799005508423, + "learning_rate": 4.078708514468933e-06, + "loss": 0.5282, + "step": 3656 + }, + { + "epoch": 1.729078014184397, + "grad_norm": 2.655365467071533, + "learning_rate": 4.0782247565047205e-06, + "loss": 0.4873, + "step": 3657 + }, + { + "epoch": 1.729550827423168, + "grad_norm": 2.9461584091186523, + "learning_rate": 4.077740900271371e-06, + "loss": 0.548, + "step": 3658 + }, + { + "epoch": 1.7300236406619387, + "grad_norm": 2.5094761848449707, + "learning_rate": 4.077256945799015e-06, + "loss": 0.5437, + "step": 3659 + }, + { + "epoch": 
1.7304964539007093, + "grad_norm": 2.555793285369873, + "learning_rate": 4.0767728931177845e-06, + "loss": 0.5268, + "step": 3660 + }, + { + "epoch": 1.7309692671394799, + "grad_norm": 2.4433486461639404, + "learning_rate": 4.07628874225782e-06, + "loss": 0.5211, + "step": 3661 + }, + { + "epoch": 1.7314420803782506, + "grad_norm": 2.365206003189087, + "learning_rate": 4.075804493249267e-06, + "loss": 0.5084, + "step": 3662 + }, + { + "epoch": 1.7319148936170212, + "grad_norm": 2.514305830001831, + "learning_rate": 4.075320146122278e-06, + "loss": 0.4693, + "step": 3663 + }, + { + "epoch": 1.7323877068557918, + "grad_norm": 2.9270083904266357, + "learning_rate": 4.074835700907012e-06, + "loss": 0.5724, + "step": 3664 + }, + { + "epoch": 1.7328605200945626, + "grad_norm": 2.938692569732666, + "learning_rate": 4.0743511576336315e-06, + "loss": 0.5361, + "step": 3665 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 3.1978867053985596, + "learning_rate": 4.073866516332307e-06, + "loss": 0.6277, + "step": 3666 + }, + { + "epoch": 1.733806146572104, + "grad_norm": 2.3477370738983154, + "learning_rate": 4.073381777033217e-06, + "loss": 0.5139, + "step": 3667 + }, + { + "epoch": 1.7342789598108745, + "grad_norm": 2.5954184532165527, + "learning_rate": 4.072896939766543e-06, + "loss": 0.537, + "step": 3668 + }, + { + "epoch": 1.7347517730496453, + "grad_norm": 2.8999998569488525, + "learning_rate": 4.072412004562472e-06, + "loss": 0.5486, + "step": 3669 + }, + { + "epoch": 1.7352245862884161, + "grad_norm": 2.7320556640625, + "learning_rate": 4.071926971451201e-06, + "loss": 0.6025, + "step": 3670 + }, + { + "epoch": 1.7356973995271867, + "grad_norm": 2.499234676361084, + "learning_rate": 4.0714418404629304e-06, + "loss": 0.456, + "step": 3671 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 2.485924243927002, + "learning_rate": 4.070956611627867e-06, + "loss": 0.5097, + "step": 3672 + }, + { + "epoch": 1.736643026004728, + "grad_norm": 2.513723373413086, + "learning_rate": 4.070471284976225e-06, + "loss": 0.4744, + "step": 3673 + }, + { + "epoch": 1.7371158392434989, + "grad_norm": 2.281977653503418, + "learning_rate": 4.06998586053822e-06, + "loss": 0.5124, + "step": 3674 + }, + { + "epoch": 1.7375886524822695, + "grad_norm": 2.3683905601501465, + "learning_rate": 4.069500338344081e-06, + "loss": 0.4816, + "step": 3675 + }, + { + "epoch": 1.73806146572104, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.069014718424038e-06, + "loss": 0.5665, + "step": 3676 + }, + { + "epoch": 1.7385342789598108, + "grad_norm": 2.7308456897735596, + "learning_rate": 4.068529000808328e-06, + "loss": 0.534, + "step": 3677 + }, + { + "epoch": 1.7390070921985816, + "grad_norm": 2.788452625274658, + "learning_rate": 4.068043185527196e-06, + "loss": 0.5609, + "step": 3678 + }, + { + "epoch": 1.7394799054373522, + "grad_norm": 2.832368850708008, + "learning_rate": 4.067557272610889e-06, + "loss": 0.553, + "step": 3679 + }, + { + "epoch": 1.7399527186761228, + "grad_norm": 2.9987435340881348, + "learning_rate": 4.067071262089665e-06, + "loss": 0.5, + "step": 3680 + }, + { + "epoch": 1.7404255319148936, + "grad_norm": 3.04913067817688, + "learning_rate": 4.066585153993785e-06, + "loss": 0.5158, + "step": 3681 + }, + { + "epoch": 1.7408983451536644, + "grad_norm": 2.5177130699157715, + "learning_rate": 4.066098948353516e-06, + "loss": 0.4508, + "step": 3682 + }, + { + "epoch": 1.741371158392435, + "grad_norm": 2.8991222381591797, + "learning_rate": 4.065612645199133e-06, + "loss": 0.5268, + "step": 3683 
+ }, + { + "epoch": 1.7418439716312055, + "grad_norm": 2.4928159713745117, + "learning_rate": 4.0651262445609156e-06, + "loss": 0.5024, + "step": 3684 + }, + { + "epoch": 1.7423167848699763, + "grad_norm": 2.9737319946289062, + "learning_rate": 4.06463974646915e-06, + "loss": 0.5429, + "step": 3685 + }, + { + "epoch": 1.7427895981087471, + "grad_norm": 2.6485493183135986, + "learning_rate": 4.064153150954128e-06, + "loss": 0.5619, + "step": 3686 + }, + { + "epoch": 1.7432624113475177, + "grad_norm": 2.564861297607422, + "learning_rate": 4.063666458046148e-06, + "loss": 0.4878, + "step": 3687 + }, + { + "epoch": 1.7437352245862883, + "grad_norm": 2.6048383712768555, + "learning_rate": 4.063179667775514e-06, + "loss": 0.4836, + "step": 3688 + }, + { + "epoch": 1.744208037825059, + "grad_norm": 2.751638650894165, + "learning_rate": 4.062692780172536e-06, + "loss": 0.5558, + "step": 3689 + }, + { + "epoch": 1.7446808510638299, + "grad_norm": 3.3866634368896484, + "learning_rate": 4.062205795267531e-06, + "loss": 0.4825, + "step": 3690 + }, + { + "epoch": 1.7451536643026004, + "grad_norm": 3.0112249851226807, + "learning_rate": 4.061718713090822e-06, + "loss": 0.5732, + "step": 3691 + }, + { + "epoch": 1.745626477541371, + "grad_norm": 2.5889365673065186, + "learning_rate": 4.061231533672736e-06, + "loss": 0.483, + "step": 3692 + }, + { + "epoch": 1.7460992907801418, + "grad_norm": 2.624598979949951, + "learning_rate": 4.0607442570436085e-06, + "loss": 0.5706, + "step": 3693 + }, + { + "epoch": 1.7465721040189126, + "grad_norm": 2.9219250679016113, + "learning_rate": 4.060256883233779e-06, + "loss": 0.5153, + "step": 3694 + }, + { + "epoch": 1.7470449172576832, + "grad_norm": 3.2219252586364746, + "learning_rate": 4.059769412273595e-06, + "loss": 0.5184, + "step": 3695 + }, + { + "epoch": 1.7475177304964538, + "grad_norm": 2.890697956085205, + "learning_rate": 4.05928184419341e-06, + "loss": 0.5312, + "step": 3696 + }, + { + "epoch": 1.7479905437352246, + "grad_norm": 2.673809289932251, + "learning_rate": 4.0587941790235816e-06, + "loss": 0.4893, + "step": 3697 + }, + { + "epoch": 1.7484633569739954, + "grad_norm": 2.5339348316192627, + "learning_rate": 4.058306416794474e-06, + "loss": 0.5115, + "step": 3698 + }, + { + "epoch": 1.748936170212766, + "grad_norm": 2.6525840759277344, + "learning_rate": 4.05781855753646e-06, + "loss": 0.5256, + "step": 3699 + }, + { + "epoch": 1.7494089834515365, + "grad_norm": 2.7868754863739014, + "learning_rate": 4.057330601279914e-06, + "loss": 0.5227, + "step": 3700 + }, + { + "epoch": 1.7498817966903073, + "grad_norm": 3.1629884243011475, + "learning_rate": 4.056842548055221e-06, + "loss": 0.5617, + "step": 3701 + }, + { + "epoch": 1.750354609929078, + "grad_norm": 2.9350688457489014, + "learning_rate": 4.056354397892769e-06, + "loss": 0.4753, + "step": 3702 + }, + { + "epoch": 1.7508274231678487, + "grad_norm": 2.9688615798950195, + "learning_rate": 4.0558661508229525e-06, + "loss": 0.596, + "step": 3703 + }, + { + "epoch": 1.7513002364066192, + "grad_norm": 2.802205801010132, + "learning_rate": 4.055377806876174e-06, + "loss": 0.5793, + "step": 3704 + }, + { + "epoch": 1.75177304964539, + "grad_norm": 2.4933416843414307, + "learning_rate": 4.054889366082839e-06, + "loss": 0.4824, + "step": 3705 + }, + { + "epoch": 1.7522458628841608, + "grad_norm": 3.7904608249664307, + "learning_rate": 4.054400828473361e-06, + "loss": 0.5124, + "step": 3706 + }, + { + "epoch": 1.7527186761229314, + "grad_norm": 2.694838762283325, + "learning_rate": 4.053912194078159e-06, + 
"loss": 0.5604, + "step": 3707 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 2.3721256256103516, + "learning_rate": 4.053423462927659e-06, + "loss": 0.4978, + "step": 3708 + }, + { + "epoch": 1.7536643026004728, + "grad_norm": 2.718512773513794, + "learning_rate": 4.052934635052292e-06, + "loss": 0.5029, + "step": 3709 + }, + { + "epoch": 1.7541371158392436, + "grad_norm": 3.061558246612549, + "learning_rate": 4.052445710482493e-06, + "loss": 0.4886, + "step": 3710 + }, + { + "epoch": 1.7546099290780142, + "grad_norm": 3.0490729808807373, + "learning_rate": 4.051956689248709e-06, + "loss": 0.5363, + "step": 3711 + }, + { + "epoch": 1.7550827423167847, + "grad_norm": 2.611661672592163, + "learning_rate": 4.051467571381385e-06, + "loss": 0.5397, + "step": 3712 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 2.7829177379608154, + "learning_rate": 4.050978356910979e-06, + "loss": 0.4973, + "step": 3713 + }, + { + "epoch": 1.7560283687943263, + "grad_norm": 2.6228256225585938, + "learning_rate": 4.0504890458679525e-06, + "loss": 0.4551, + "step": 3714 + }, + { + "epoch": 1.756501182033097, + "grad_norm": 2.6801326274871826, + "learning_rate": 4.049999638282771e-06, + "loss": 0.5581, + "step": 3715 + }, + { + "epoch": 1.7569739952718675, + "grad_norm": 2.4476819038391113, + "learning_rate": 4.049510134185908e-06, + "loss": 0.5226, + "step": 3716 + }, + { + "epoch": 1.7574468085106383, + "grad_norm": 2.5661075115203857, + "learning_rate": 4.049020533607844e-06, + "loss": 0.5163, + "step": 3717 + }, + { + "epoch": 1.757919621749409, + "grad_norm": 2.3923349380493164, + "learning_rate": 4.048530836579065e-06, + "loss": 0.5076, + "step": 3718 + }, + { + "epoch": 1.7583924349881797, + "grad_norm": 2.8204405307769775, + "learning_rate": 4.0480410431300585e-06, + "loss": 0.5883, + "step": 3719 + }, + { + "epoch": 1.7588652482269502, + "grad_norm": 2.323107957839966, + "learning_rate": 4.047551153291325e-06, + "loss": 0.5116, + "step": 3720 + }, + { + "epoch": 1.759338061465721, + "grad_norm": 2.8306009769439697, + "learning_rate": 4.047061167093368e-06, + "loss": 0.5094, + "step": 3721 + }, + { + "epoch": 1.7598108747044918, + "grad_norm": 2.568765640258789, + "learning_rate": 4.046571084566695e-06, + "loss": 0.4725, + "step": 3722 + }, + { + "epoch": 1.7602836879432624, + "grad_norm": 2.7212061882019043, + "learning_rate": 4.046080905741822e-06, + "loss": 0.4741, + "step": 3723 + }, + { + "epoch": 1.760756501182033, + "grad_norm": 2.802917003631592, + "learning_rate": 4.04559063064927e-06, + "loss": 0.5691, + "step": 3724 + }, + { + "epoch": 1.7612293144208038, + "grad_norm": 3.1044139862060547, + "learning_rate": 4.0451002593195675e-06, + "loss": 0.5472, + "step": 3725 + }, + { + "epoch": 1.7617021276595746, + "grad_norm": 2.5855562686920166, + "learning_rate": 4.044609791783246e-06, + "loss": 0.4852, + "step": 3726 + }, + { + "epoch": 1.7621749408983451, + "grad_norm": 2.6235129833221436, + "learning_rate": 4.0441192280708465e-06, + "loss": 0.5269, + "step": 3727 + }, + { + "epoch": 1.7626477541371157, + "grad_norm": 3.535630464553833, + "learning_rate": 4.043628568212914e-06, + "loss": 0.5266, + "step": 3728 + }, + { + "epoch": 1.7631205673758865, + "grad_norm": 2.7783355712890625, + "learning_rate": 4.043137812239998e-06, + "loss": 0.5609, + "step": 3729 + }, + { + "epoch": 1.7635933806146573, + "grad_norm": 2.9344944953918457, + "learning_rate": 4.042646960182657e-06, + "loss": 0.5056, + "step": 3730 + }, + { + "epoch": 1.7640661938534279, + "grad_norm": 2.6205739974975586, + 
"learning_rate": 4.042156012071453e-06, + "loss": 0.4914, + "step": 3731 + }, + { + "epoch": 1.7645390070921985, + "grad_norm": 2.8004493713378906, + "learning_rate": 4.041664967936958e-06, + "loss": 0.4901, + "step": 3732 + }, + { + "epoch": 1.7650118203309693, + "grad_norm": 2.944589138031006, + "learning_rate": 4.041173827809745e-06, + "loss": 0.5572, + "step": 3733 + }, + { + "epoch": 1.76548463356974, + "grad_norm": 2.5021605491638184, + "learning_rate": 4.040682591720397e-06, + "loss": 0.4637, + "step": 3734 + }, + { + "epoch": 1.7659574468085106, + "grad_norm": 2.448030948638916, + "learning_rate": 4.040191259699497e-06, + "loss": 0.4785, + "step": 3735 + }, + { + "epoch": 1.7664302600472812, + "grad_norm": 2.7171032428741455, + "learning_rate": 4.039699831777643e-06, + "loss": 0.4919, + "step": 3736 + }, + { + "epoch": 1.766903073286052, + "grad_norm": 2.453118324279785, + "learning_rate": 4.03920830798543e-06, + "loss": 0.4326, + "step": 3737 + }, + { + "epoch": 1.7673758865248228, + "grad_norm": 3.112877368927002, + "learning_rate": 4.038716688353466e-06, + "loss": 0.5375, + "step": 3738 + }, + { + "epoch": 1.7678486997635934, + "grad_norm": 2.742239236831665, + "learning_rate": 4.038224972912361e-06, + "loss": 0.5267, + "step": 3739 + }, + { + "epoch": 1.768321513002364, + "grad_norm": 2.544785737991333, + "learning_rate": 4.037733161692731e-06, + "loss": 0.5032, + "step": 3740 + }, + { + "epoch": 1.7687943262411348, + "grad_norm": 2.4639062881469727, + "learning_rate": 4.037241254725201e-06, + "loss": 0.5532, + "step": 3741 + }, + { + "epoch": 1.7692671394799055, + "grad_norm": 2.866290330886841, + "learning_rate": 4.036749252040398e-06, + "loss": 0.5503, + "step": 3742 + }, + { + "epoch": 1.7697399527186761, + "grad_norm": 2.3466262817382812, + "learning_rate": 4.0362571536689575e-06, + "loss": 0.5286, + "step": 3743 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 2.246464967727661, + "learning_rate": 4.03576495964152e-06, + "loss": 0.4656, + "step": 3744 + }, + { + "epoch": 1.7706855791962175, + "grad_norm": 2.667558431625366, + "learning_rate": 4.035272669988733e-06, + "loss": 0.5205, + "step": 3745 + }, + { + "epoch": 1.7711583924349883, + "grad_norm": 2.974666118621826, + "learning_rate": 4.034780284741249e-06, + "loss": 0.6007, + "step": 3746 + }, + { + "epoch": 1.7716312056737589, + "grad_norm": 2.7164433002471924, + "learning_rate": 4.034287803929726e-06, + "loss": 0.4913, + "step": 3747 + }, + { + "epoch": 1.7721040189125294, + "grad_norm": 2.5923962593078613, + "learning_rate": 4.033795227584829e-06, + "loss": 0.5275, + "step": 3748 + }, + { + "epoch": 1.7725768321513002, + "grad_norm": 2.606027126312256, + "learning_rate": 4.033302555737229e-06, + "loss": 0.4869, + "step": 3749 + }, + { + "epoch": 1.773049645390071, + "grad_norm": 3.0110089778900146, + "learning_rate": 4.032809788417602e-06, + "loss": 0.4956, + "step": 3750 + }, + { + "epoch": 1.7735224586288416, + "grad_norm": 3.004598617553711, + "learning_rate": 4.032316925656632e-06, + "loss": 0.5159, + "step": 3751 + }, + { + "epoch": 1.7739952718676122, + "grad_norm": 2.731539249420166, + "learning_rate": 4.031823967485005e-06, + "loss": 0.5237, + "step": 3752 + }, + { + "epoch": 1.774468085106383, + "grad_norm": 2.7466373443603516, + "learning_rate": 4.0313309139334155e-06, + "loss": 0.4948, + "step": 3753 + }, + { + "epoch": 1.7749408983451538, + "grad_norm": 2.8596460819244385, + "learning_rate": 4.030837765032565e-06, + "loss": 0.5016, + "step": 3754 + }, + { + "epoch": 1.7754137115839244, + 
"grad_norm": 3.2886788845062256, + "learning_rate": 4.03034452081316e-06, + "loss": 0.5377, + "step": 3755 + }, + { + "epoch": 1.775886524822695, + "grad_norm": 2.5629258155822754, + "learning_rate": 4.029851181305912e-06, + "loss": 0.519, + "step": 3756 + }, + { + "epoch": 1.7763593380614657, + "grad_norm": 2.5988714694976807, + "learning_rate": 4.029357746541539e-06, + "loss": 0.5521, + "step": 3757 + }, + { + "epoch": 1.7768321513002365, + "grad_norm": 2.987884759902954, + "learning_rate": 4.028864216550765e-06, + "loss": 0.6225, + "step": 3758 + }, + { + "epoch": 1.777304964539007, + "grad_norm": 2.6875851154327393, + "learning_rate": 4.02837059136432e-06, + "loss": 0.5321, + "step": 3759 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.6414570808410645, + "learning_rate": 4.02787687101294e-06, + "loss": 0.4831, + "step": 3760 + }, + { + "epoch": 1.7782505910165485, + "grad_norm": 2.581475019454956, + "learning_rate": 4.027383055527368e-06, + "loss": 0.5204, + "step": 3761 + }, + { + "epoch": 1.7787234042553193, + "grad_norm": 2.811298131942749, + "learning_rate": 4.026889144938349e-06, + "loss": 0.5486, + "step": 3762 + }, + { + "epoch": 1.7791962174940898, + "grad_norm": 3.1589081287384033, + "learning_rate": 4.026395139276639e-06, + "loss": 0.4979, + "step": 3763 + }, + { + "epoch": 1.7796690307328604, + "grad_norm": 2.3773093223571777, + "learning_rate": 4.025901038572996e-06, + "loss": 0.503, + "step": 3764 + }, + { + "epoch": 1.7801418439716312, + "grad_norm": 2.962541341781616, + "learning_rate": 4.025406842858187e-06, + "loss": 0.4613, + "step": 3765 + }, + { + "epoch": 1.780614657210402, + "grad_norm": 2.603092908859253, + "learning_rate": 4.024912552162982e-06, + "loss": 0.5142, + "step": 3766 + }, + { + "epoch": 1.7810874704491726, + "grad_norm": 2.648927927017212, + "learning_rate": 4.024418166518159e-06, + "loss": 0.4491, + "step": 3767 + }, + { + "epoch": 1.7815602836879432, + "grad_norm": 3.3239917755126953, + "learning_rate": 4.023923685954502e-06, + "loss": 0.6272, + "step": 3768 + }, + { + "epoch": 1.782033096926714, + "grad_norm": 2.672821283340454, + "learning_rate": 4.023429110502798e-06, + "loss": 0.5171, + "step": 3769 + }, + { + "epoch": 1.7825059101654848, + "grad_norm": 2.364332437515259, + "learning_rate": 4.022934440193844e-06, + "loss": 0.4513, + "step": 3770 + }, + { + "epoch": 1.7829787234042553, + "grad_norm": 3.03108549118042, + "learning_rate": 4.022439675058441e-06, + "loss": 0.4324, + "step": 3771 + }, + { + "epoch": 1.783451536643026, + "grad_norm": 2.647557020187378, + "learning_rate": 4.021944815127393e-06, + "loss": 0.5162, + "step": 3772 + }, + { + "epoch": 1.7839243498817967, + "grad_norm": 2.4111907482147217, + "learning_rate": 4.021449860431517e-06, + "loss": 0.4712, + "step": 3773 + }, + { + "epoch": 1.7843971631205675, + "grad_norm": 2.796175718307495, + "learning_rate": 4.020954811001629e-06, + "loss": 0.5131, + "step": 3774 + }, + { + "epoch": 1.784869976359338, + "grad_norm": 2.4594924449920654, + "learning_rate": 4.020459666868553e-06, + "loss": 0.4739, + "step": 3775 + }, + { + "epoch": 1.7853427895981087, + "grad_norm": 2.5735671520233154, + "learning_rate": 4.0199644280631215e-06, + "loss": 0.4716, + "step": 3776 + }, + { + "epoch": 1.7858156028368795, + "grad_norm": 2.419990062713623, + "learning_rate": 4.01946909461617e-06, + "loss": 0.4866, + "step": 3777 + }, + { + "epoch": 1.7862884160756503, + "grad_norm": 2.5597951412200928, + "learning_rate": 4.01897366655854e-06, + "loss": 0.5569, + "step": 3778 + }, + { + "epoch": 
1.7867612293144208, + "grad_norm": 2.462383985519409, + "learning_rate": 4.018478143921081e-06, + "loss": 0.4588, + "step": 3779 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 2.536701202392578, + "learning_rate": 4.017982526734646e-06, + "loss": 0.5278, + "step": 3780 + }, + { + "epoch": 1.7877068557919622, + "grad_norm": 2.691077470779419, + "learning_rate": 4.017486815030095e-06, + "loss": 0.4815, + "step": 3781 + }, + { + "epoch": 1.788179669030733, + "grad_norm": 2.4277288913726807, + "learning_rate": 4.016991008838294e-06, + "loss": 0.4877, + "step": 3782 + }, + { + "epoch": 1.7886524822695036, + "grad_norm": 2.6740009784698486, + "learning_rate": 4.016495108190115e-06, + "loss": 0.572, + "step": 3783 + }, + { + "epoch": 1.7891252955082741, + "grad_norm": 3.179232120513916, + "learning_rate": 4.0159991131164355e-06, + "loss": 0.4821, + "step": 3784 + }, + { + "epoch": 1.789598108747045, + "grad_norm": 3.2747793197631836, + "learning_rate": 4.015503023648138e-06, + "loss": 0.5517, + "step": 3785 + }, + { + "epoch": 1.7900709219858157, + "grad_norm": 2.671367645263672, + "learning_rate": 4.015006839816113e-06, + "loss": 0.5158, + "step": 3786 + }, + { + "epoch": 1.7905437352245863, + "grad_norm": 2.6600193977355957, + "learning_rate": 4.014510561651256e-06, + "loss": 0.535, + "step": 3787 + }, + { + "epoch": 1.791016548463357, + "grad_norm": 2.481509208679199, + "learning_rate": 4.014014189184466e-06, + "loss": 0.5596, + "step": 3788 + }, + { + "epoch": 1.7914893617021277, + "grad_norm": 2.759816884994507, + "learning_rate": 4.013517722446652e-06, + "loss": 0.5201, + "step": 3789 + }, + { + "epoch": 1.7919621749408985, + "grad_norm": 2.6913561820983887, + "learning_rate": 4.013021161468724e-06, + "loss": 0.5758, + "step": 3790 + }, + { + "epoch": 1.792434988179669, + "grad_norm": 2.775087594985962, + "learning_rate": 4.0125245062816044e-06, + "loss": 0.499, + "step": 3791 + }, + { + "epoch": 1.7929078014184396, + "grad_norm": 2.6134777069091797, + "learning_rate": 4.012027756916216e-06, + "loss": 0.5659, + "step": 3792 + }, + { + "epoch": 1.7933806146572104, + "grad_norm": 2.7109756469726562, + "learning_rate": 4.0115309134034895e-06, + "loss": 0.5337, + "step": 3793 + }, + { + "epoch": 1.7938534278959812, + "grad_norm": 2.5389950275421143, + "learning_rate": 4.0110339757743595e-06, + "loss": 0.4501, + "step": 3794 + }, + { + "epoch": 1.7943262411347518, + "grad_norm": 2.634648561477661, + "learning_rate": 4.010536944059771e-06, + "loss": 0.4411, + "step": 3795 + }, + { + "epoch": 1.7947990543735224, + "grad_norm": 2.527070999145508, + "learning_rate": 4.0100398182906695e-06, + "loss": 0.5145, + "step": 3796 + }, + { + "epoch": 1.7952718676122932, + "grad_norm": 2.62988543510437, + "learning_rate": 4.0095425984980105e-06, + "loss": 0.4981, + "step": 3797 + }, + { + "epoch": 1.795744680851064, + "grad_norm": 2.6032519340515137, + "learning_rate": 4.009045284712752e-06, + "loss": 0.453, + "step": 3798 + }, + { + "epoch": 1.7962174940898346, + "grad_norm": 2.735173463821411, + "learning_rate": 4.008547876965863e-06, + "loss": 0.5925, + "step": 3799 + }, + { + "epoch": 1.7966903073286051, + "grad_norm": 2.6296730041503906, + "learning_rate": 4.00805037528831e-06, + "loss": 0.5651, + "step": 3800 + }, + { + "epoch": 1.797163120567376, + "grad_norm": 2.641214370727539, + "learning_rate": 4.0075527797110735e-06, + "loss": 0.4973, + "step": 3801 + }, + { + "epoch": 1.7976359338061467, + "grad_norm": 2.6104819774627686, + "learning_rate": 4.007055090265136e-06, + "loss": 0.4432, + 
"step": 3802 + }, + { + "epoch": 1.7981087470449173, + "grad_norm": 2.8200619220733643, + "learning_rate": 4.0065573069814865e-06, + "loss": 0.4899, + "step": 3803 + }, + { + "epoch": 1.7985815602836879, + "grad_norm": 2.982354164123535, + "learning_rate": 4.006059429891119e-06, + "loss": 0.5488, + "step": 3804 + }, + { + "epoch": 1.7990543735224587, + "grad_norm": 2.7561678886413574, + "learning_rate": 4.005561459025034e-06, + "loss": 0.5637, + "step": 3805 + }, + { + "epoch": 1.7995271867612295, + "grad_norm": 2.702212333679199, + "learning_rate": 4.005063394414241e-06, + "loss": 0.4804, + "step": 3806 + }, + { + "epoch": 1.8, + "grad_norm": 2.8655319213867188, + "learning_rate": 4.004565236089748e-06, + "loss": 0.5759, + "step": 3807 + }, + { + "epoch": 1.8004728132387706, + "grad_norm": 2.703676223754883, + "learning_rate": 4.0040669840825756e-06, + "loss": 0.4728, + "step": 3808 + }, + { + "epoch": 1.8009456264775414, + "grad_norm": 2.802645683288574, + "learning_rate": 4.003568638423747e-06, + "loss": 0.5421, + "step": 3809 + }, + { + "epoch": 1.8014184397163122, + "grad_norm": 2.4723124504089355, + "learning_rate": 4.003070199144292e-06, + "loss": 0.4944, + "step": 3810 + }, + { + "epoch": 1.8018912529550828, + "grad_norm": 2.4889068603515625, + "learning_rate": 4.0025716662752475e-06, + "loss": 0.4774, + "step": 3811 + }, + { + "epoch": 1.8023640661938534, + "grad_norm": 2.5408077239990234, + "learning_rate": 4.002073039847653e-06, + "loss": 0.5233, + "step": 3812 + }, + { + "epoch": 1.8028368794326242, + "grad_norm": 2.734602689743042, + "learning_rate": 4.001574319892557e-06, + "loss": 0.5403, + "step": 3813 + }, + { + "epoch": 1.803309692671395, + "grad_norm": 3.3786163330078125, + "learning_rate": 4.001075506441012e-06, + "loss": 0.6969, + "step": 3814 + }, + { + "epoch": 1.8037825059101655, + "grad_norm": 2.7375378608703613, + "learning_rate": 4.000576599524078e-06, + "loss": 0.4907, + "step": 3815 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 3.041804075241089, + "learning_rate": 4.000077599172818e-06, + "loss": 0.6021, + "step": 3816 + }, + { + "epoch": 1.804728132387707, + "grad_norm": 2.697599411010742, + "learning_rate": 3.999578505418305e-06, + "loss": 0.4743, + "step": 3817 + }, + { + "epoch": 1.8052009456264777, + "grad_norm": 2.276921272277832, + "learning_rate": 3.999079318291612e-06, + "loss": 0.4885, + "step": 3818 + }, + { + "epoch": 1.8056737588652483, + "grad_norm": 2.4896953105926514, + "learning_rate": 3.998580037823825e-06, + "loss": 0.503, + "step": 3819 + }, + { + "epoch": 1.8061465721040189, + "grad_norm": 2.6232175827026367, + "learning_rate": 3.998080664046029e-06, + "loss": 0.5058, + "step": 3820 + }, + { + "epoch": 1.8066193853427897, + "grad_norm": 2.695861339569092, + "learning_rate": 3.997581196989319e-06, + "loss": 0.4949, + "step": 3821 + }, + { + "epoch": 1.8070921985815604, + "grad_norm": 2.912886142730713, + "learning_rate": 3.997081636684795e-06, + "loss": 0.4971, + "step": 3822 + }, + { + "epoch": 1.807565011820331, + "grad_norm": 2.876500368118286, + "learning_rate": 3.996581983163561e-06, + "loss": 0.5584, + "step": 3823 + }, + { + "epoch": 1.8080378250591016, + "grad_norm": 2.857069730758667, + "learning_rate": 3.99608223645673e-06, + "loss": 0.5457, + "step": 3824 + }, + { + "epoch": 1.8085106382978724, + "grad_norm": 2.486743211746216, + "learning_rate": 3.995582396595419e-06, + "loss": 0.5291, + "step": 3825 + }, + { + "epoch": 1.808983451536643, + "grad_norm": 2.509441375732422, + "learning_rate": 3.9950824636107486e-06, + 
"loss": 0.4747, + "step": 3826 + }, + { + "epoch": 1.8094562647754135, + "grad_norm": 2.931394100189209, + "learning_rate": 3.99458243753385e-06, + "loss": 0.5116, + "step": 3827 + }, + { + "epoch": 1.8099290780141843, + "grad_norm": 2.4868650436401367, + "learning_rate": 3.994082318395856e-06, + "loss": 0.4671, + "step": 3828 + }, + { + "epoch": 1.8104018912529551, + "grad_norm": 2.5554752349853516, + "learning_rate": 3.993582106227907e-06, + "loss": 0.4969, + "step": 3829 + }, + { + "epoch": 1.8108747044917257, + "grad_norm": 2.8367133140563965, + "learning_rate": 3.99308180106115e-06, + "loss": 0.5507, + "step": 3830 + }, + { + "epoch": 1.8113475177304963, + "grad_norm": 2.68245792388916, + "learning_rate": 3.992581402926737e-06, + "loss": 0.5115, + "step": 3831 + }, + { + "epoch": 1.811820330969267, + "grad_norm": 2.406674385070801, + "learning_rate": 3.992080911855824e-06, + "loss": 0.545, + "step": 3832 + }, + { + "epoch": 1.8122931442080379, + "grad_norm": 2.5003464221954346, + "learning_rate": 3.991580327879575e-06, + "loss": 0.4331, + "step": 3833 + }, + { + "epoch": 1.8127659574468085, + "grad_norm": 2.49320912361145, + "learning_rate": 3.99107965102916e-06, + "loss": 0.5118, + "step": 3834 + }, + { + "epoch": 1.813238770685579, + "grad_norm": 2.6183295249938965, + "learning_rate": 3.990578881335752e-06, + "loss": 0.5286, + "step": 3835 + }, + { + "epoch": 1.8137115839243498, + "grad_norm": 3.1999518871307373, + "learning_rate": 3.990078018830534e-06, + "loss": 0.5048, + "step": 3836 + }, + { + "epoch": 1.8141843971631206, + "grad_norm": 2.4351117610931396, + "learning_rate": 3.9895770635446915e-06, + "loss": 0.514, + "step": 3837 + }, + { + "epoch": 1.8146572104018912, + "grad_norm": 2.6859259605407715, + "learning_rate": 3.989076015509416e-06, + "loss": 0.5575, + "step": 3838 + }, + { + "epoch": 1.8151300236406618, + "grad_norm": 2.790421962738037, + "learning_rate": 3.988574874755909e-06, + "loss": 0.5467, + "step": 3839 + }, + { + "epoch": 1.8156028368794326, + "grad_norm": 2.5202765464782715, + "learning_rate": 3.988073641315369e-06, + "loss": 0.5229, + "step": 3840 + }, + { + "epoch": 1.8160756501182034, + "grad_norm": 2.623652219772339, + "learning_rate": 3.987572315219009e-06, + "loss": 0.509, + "step": 3841 + }, + { + "epoch": 1.816548463356974, + "grad_norm": 2.6038360595703125, + "learning_rate": 3.987070896498044e-06, + "loss": 0.5304, + "step": 3842 + }, + { + "epoch": 1.8170212765957445, + "grad_norm": 2.9378011226654053, + "learning_rate": 3.9865693851836955e-06, + "loss": 0.5845, + "step": 3843 + }, + { + "epoch": 1.8174940898345153, + "grad_norm": 2.4061124324798584, + "learning_rate": 3.98606778130719e-06, + "loss": 0.4333, + "step": 3844 + }, + { + "epoch": 1.8179669030732861, + "grad_norm": 2.483489751815796, + "learning_rate": 3.985566084899759e-06, + "loss": 0.4827, + "step": 3845 + }, + { + "epoch": 1.8184397163120567, + "grad_norm": 2.7774932384490967, + "learning_rate": 3.985064295992642e-06, + "loss": 0.5016, + "step": 3846 + }, + { + "epoch": 1.8189125295508273, + "grad_norm": 2.5936765670776367, + "learning_rate": 3.984562414617083e-06, + "loss": 0.4448, + "step": 3847 + }, + { + "epoch": 1.819385342789598, + "grad_norm": 2.8608627319335938, + "learning_rate": 3.9840604408043325e-06, + "loss": 0.5735, + "step": 3848 + }, + { + "epoch": 1.8198581560283689, + "grad_norm": 2.6212472915649414, + "learning_rate": 3.983558374585646e-06, + "loss": 0.5091, + "step": 3849 + }, + { + "epoch": 1.8203309692671394, + "grad_norm": 2.832460641860962, + 
"learning_rate": 3.983056215992284e-06, + "loss": 0.5169, + "step": 3850 + }, + { + "epoch": 1.82080378250591, + "grad_norm": 2.5293610095977783, + "learning_rate": 3.982553965055514e-06, + "loss": 0.4708, + "step": 3851 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 2.9362871646881104, + "learning_rate": 3.982051621806611e-06, + "loss": 0.575, + "step": 3852 + }, + { + "epoch": 1.8217494089834516, + "grad_norm": 2.69073486328125, + "learning_rate": 3.98154918627685e-06, + "loss": 0.5278, + "step": 3853 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 2.6711034774780273, + "learning_rate": 3.98104665849752e-06, + "loss": 0.4918, + "step": 3854 + }, + { + "epoch": 1.8226950354609928, + "grad_norm": 2.571110963821411, + "learning_rate": 3.980544038499907e-06, + "loss": 0.5234, + "step": 3855 + }, + { + "epoch": 1.8231678486997636, + "grad_norm": 3.2603371143341064, + "learning_rate": 3.980041326315309e-06, + "loss": 0.5996, + "step": 3856 + }, + { + "epoch": 1.8236406619385344, + "grad_norm": 2.8472323417663574, + "learning_rate": 3.979538521975028e-06, + "loss": 0.4769, + "step": 3857 + }, + { + "epoch": 1.824113475177305, + "grad_norm": 2.6714751720428467, + "learning_rate": 3.979035625510371e-06, + "loss": 0.4826, + "step": 3858 + }, + { + "epoch": 1.8245862884160755, + "grad_norm": 2.6816468238830566, + "learning_rate": 3.97853263695265e-06, + "loss": 0.5127, + "step": 3859 + }, + { + "epoch": 1.8250591016548463, + "grad_norm": 2.6464123725891113, + "learning_rate": 3.978029556333185e-06, + "loss": 0.4925, + "step": 3860 + }, + { + "epoch": 1.825531914893617, + "grad_norm": 2.5317227840423584, + "learning_rate": 3.977526383683301e-06, + "loss": 0.4765, + "step": 3861 + }, + { + "epoch": 1.8260047281323877, + "grad_norm": 2.5052425861358643, + "learning_rate": 3.977023119034328e-06, + "loss": 0.4804, + "step": 3862 + }, + { + "epoch": 1.8264775413711583, + "grad_norm": 2.7022836208343506, + "learning_rate": 3.976519762417602e-06, + "loss": 0.4824, + "step": 3863 + }, + { + "epoch": 1.826950354609929, + "grad_norm": 2.7445900440216064, + "learning_rate": 3.976016313864464e-06, + "loss": 0.5698, + "step": 3864 + }, + { + "epoch": 1.8274231678486998, + "grad_norm": 2.442518711090088, + "learning_rate": 3.975512773406262e-06, + "loss": 0.5133, + "step": 3865 + }, + { + "epoch": 1.8278959810874704, + "grad_norm": 2.4100050926208496, + "learning_rate": 3.975009141074351e-06, + "loss": 0.5044, + "step": 3866 + }, + { + "epoch": 1.828368794326241, + "grad_norm": 2.9507648944854736, + "learning_rate": 3.974505416900088e-06, + "loss": 0.5367, + "step": 3867 + }, + { + "epoch": 1.8288416075650118, + "grad_norm": 2.5662600994110107, + "learning_rate": 3.974001600914837e-06, + "loss": 0.5878, + "step": 3868 + }, + { + "epoch": 1.8293144208037826, + "grad_norm": 2.4306657314300537, + "learning_rate": 3.973497693149971e-06, + "loss": 0.4647, + "step": 3869 + }, + { + "epoch": 1.8297872340425532, + "grad_norm": 2.974686622619629, + "learning_rate": 3.972993693636864e-06, + "loss": 0.4911, + "step": 3870 + }, + { + "epoch": 1.8302600472813237, + "grad_norm": 2.5711987018585205, + "learning_rate": 3.972489602406899e-06, + "loss": 0.5089, + "step": 3871 + }, + { + "epoch": 1.8307328605200945, + "grad_norm": 3.259617328643799, + "learning_rate": 3.971985419491463e-06, + "loss": 0.5966, + "step": 3872 + }, + { + "epoch": 1.8312056737588653, + "grad_norm": 2.7437000274658203, + "learning_rate": 3.971481144921949e-06, + "loss": 0.5097, + "step": 3873 + }, + { + "epoch": 1.831678486997636, + 
"grad_norm": 2.9597461223602295, + "learning_rate": 3.970976778729757e-06, + "loss": 0.5672, + "step": 3874 + }, + { + "epoch": 1.8321513002364065, + "grad_norm": 2.5775723457336426, + "learning_rate": 3.970472320946291e-06, + "loss": 0.4749, + "step": 3875 + }, + { + "epoch": 1.8326241134751773, + "grad_norm": 2.7381200790405273, + "learning_rate": 3.969967771602961e-06, + "loss": 0.5255, + "step": 3876 + }, + { + "epoch": 1.833096926713948, + "grad_norm": 2.651698350906372, + "learning_rate": 3.969463130731183e-06, + "loss": 0.5098, + "step": 3877 + }, + { + "epoch": 1.8335697399527187, + "grad_norm": 2.7277021408081055, + "learning_rate": 3.968958398362381e-06, + "loss": 0.5251, + "step": 3878 + }, + { + "epoch": 1.8340425531914892, + "grad_norm": 2.5184953212738037, + "learning_rate": 3.968453574527978e-06, + "loss": 0.5086, + "step": 3879 + }, + { + "epoch": 1.83451536643026, + "grad_norm": 2.8227882385253906, + "learning_rate": 3.967948659259412e-06, + "loss": 0.5742, + "step": 3880 + }, + { + "epoch": 1.8349881796690308, + "grad_norm": 2.547922134399414, + "learning_rate": 3.967443652588119e-06, + "loss": 0.5411, + "step": 3881 + }, + { + "epoch": 1.8354609929078014, + "grad_norm": 2.6572835445404053, + "learning_rate": 3.966938554545545e-06, + "loss": 0.4854, + "step": 3882 + }, + { + "epoch": 1.835933806146572, + "grad_norm": 2.9416658878326416, + "learning_rate": 3.966433365163139e-06, + "loss": 0.5236, + "step": 3883 + }, + { + "epoch": 1.8364066193853428, + "grad_norm": 2.344325304031372, + "learning_rate": 3.965928084472357e-06, + "loss": 0.4916, + "step": 3884 + }, + { + "epoch": 1.8368794326241136, + "grad_norm": 2.890418291091919, + "learning_rate": 3.965422712504662e-06, + "loss": 0.5287, + "step": 3885 + }, + { + "epoch": 1.8373522458628841, + "grad_norm": 2.6063363552093506, + "learning_rate": 3.96491724929152e-06, + "loss": 0.4842, + "step": 3886 + }, + { + "epoch": 1.8378250591016547, + "grad_norm": 2.5582427978515625, + "learning_rate": 3.964411694864404e-06, + "loss": 0.4768, + "step": 3887 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 2.84356951713562, + "learning_rate": 3.963906049254793e-06, + "loss": 0.5284, + "step": 3888 + }, + { + "epoch": 1.8387706855791963, + "grad_norm": 2.7048516273498535, + "learning_rate": 3.963400312494172e-06, + "loss": 0.5271, + "step": 3889 + }, + { + "epoch": 1.839243498817967, + "grad_norm": 2.5401699542999268, + "learning_rate": 3.962894484614031e-06, + "loss": 0.4734, + "step": 3890 + }, + { + "epoch": 1.8397163120567375, + "grad_norm": 2.208256244659424, + "learning_rate": 3.962388565645864e-06, + "loss": 0.4113, + "step": 3891 + }, + { + "epoch": 1.8401891252955083, + "grad_norm": 2.775139331817627, + "learning_rate": 3.961882555621173e-06, + "loss": 0.5172, + "step": 3892 + }, + { + "epoch": 1.840661938534279, + "grad_norm": 2.7540855407714844, + "learning_rate": 3.961376454571466e-06, + "loss": 0.5252, + "step": 3893 + }, + { + "epoch": 1.8411347517730496, + "grad_norm": 2.6731574535369873, + "learning_rate": 3.960870262528255e-06, + "loss": 0.4495, + "step": 3894 + }, + { + "epoch": 1.8416075650118202, + "grad_norm": 2.791492223739624, + "learning_rate": 3.960363979523058e-06, + "loss": 0.5457, + "step": 3895 + }, + { + "epoch": 1.842080378250591, + "grad_norm": 2.9280290603637695, + "learning_rate": 3.959857605587401e-06, + "loss": 0.5373, + "step": 3896 + }, + { + "epoch": 1.8425531914893618, + "grad_norm": 2.5652217864990234, + "learning_rate": 3.95935114075281e-06, + "loss": 0.5191, + "step": 3897 + }, + { + 
"epoch": 1.8430260047281324, + "grad_norm": 2.7297749519348145, + "learning_rate": 3.958844585050824e-06, + "loss": 0.5366, + "step": 3898 + }, + { + "epoch": 1.843498817966903, + "grad_norm": 2.5302982330322266, + "learning_rate": 3.958337938512983e-06, + "loss": 0.569, + "step": 3899 + }, + { + "epoch": 1.8439716312056738, + "grad_norm": 2.644777297973633, + "learning_rate": 3.957831201170832e-06, + "loss": 0.521, + "step": 3900 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 2.8375515937805176, + "learning_rate": 3.957324373055925e-06, + "loss": 0.573, + "step": 3901 + }, + { + "epoch": 1.8449172576832151, + "grad_norm": 2.512296676635742, + "learning_rate": 3.956817454199819e-06, + "loss": 0.5081, + "step": 3902 + }, + { + "epoch": 1.8453900709219857, + "grad_norm": 2.3662109375, + "learning_rate": 3.956310444634079e-06, + "loss": 0.4989, + "step": 3903 + }, + { + "epoch": 1.8458628841607565, + "grad_norm": 2.6849682331085205, + "learning_rate": 3.955803344390272e-06, + "loss": 0.5459, + "step": 3904 + }, + { + "epoch": 1.8463356973995273, + "grad_norm": 2.8364317417144775, + "learning_rate": 3.9552961534999756e-06, + "loss": 0.5704, + "step": 3905 + }, + { + "epoch": 1.8468085106382979, + "grad_norm": 2.6006948947906494, + "learning_rate": 3.954788871994768e-06, + "loss": 0.5696, + "step": 3906 + }, + { + "epoch": 1.8472813238770684, + "grad_norm": 2.558300018310547, + "learning_rate": 3.9542814999062375e-06, + "loss": 0.5047, + "step": 3907 + }, + { + "epoch": 1.8477541371158392, + "grad_norm": 2.6343321800231934, + "learning_rate": 3.953774037265974e-06, + "loss": 0.525, + "step": 3908 + }, + { + "epoch": 1.84822695035461, + "grad_norm": 2.5050008296966553, + "learning_rate": 3.953266484105576e-06, + "loss": 0.4867, + "step": 3909 + }, + { + "epoch": 1.8486997635933806, + "grad_norm": 2.3775103092193604, + "learning_rate": 3.952758840456647e-06, + "loss": 0.4349, + "step": 3910 + }, + { + "epoch": 1.8491725768321512, + "grad_norm": 2.508376359939575, + "learning_rate": 3.952251106350794e-06, + "loss": 0.539, + "step": 3911 + }, + { + "epoch": 1.849645390070922, + "grad_norm": 2.7403106689453125, + "learning_rate": 3.951743281819633e-06, + "loss": 0.4478, + "step": 3912 + }, + { + "epoch": 1.8501182033096928, + "grad_norm": 2.5332062244415283, + "learning_rate": 3.951235366894784e-06, + "loss": 0.4658, + "step": 3913 + }, + { + "epoch": 1.8505910165484634, + "grad_norm": 3.0137248039245605, + "learning_rate": 3.950727361607872e-06, + "loss": 0.5047, + "step": 3914 + }, + { + "epoch": 1.851063829787234, + "grad_norm": 2.5820653438568115, + "learning_rate": 3.950219265990528e-06, + "loss": 0.542, + "step": 3915 + }, + { + "epoch": 1.8515366430260047, + "grad_norm": 2.555133819580078, + "learning_rate": 3.949711080074389e-06, + "loss": 0.5253, + "step": 3916 + }, + { + "epoch": 1.8520094562647755, + "grad_norm": 2.876882791519165, + "learning_rate": 3.949202803891099e-06, + "loss": 0.5242, + "step": 3917 + }, + { + "epoch": 1.852482269503546, + "grad_norm": 2.5929203033447266, + "learning_rate": 3.948694437472305e-06, + "loss": 0.5358, + "step": 3918 + }, + { + "epoch": 1.8529550827423167, + "grad_norm": 2.468513250350952, + "learning_rate": 3.948185980849659e-06, + "loss": 0.5119, + "step": 3919 + }, + { + "epoch": 1.8534278959810875, + "grad_norm": 2.9259560108184814, + "learning_rate": 3.947677434054824e-06, + "loss": 0.4756, + "step": 3920 + }, + { + "epoch": 1.8539007092198583, + "grad_norm": 2.5247011184692383, + "learning_rate": 3.947168797119462e-06, + "loss": 0.4627, + 
"step": 3921 + }, + { + "epoch": 1.8543735224586289, + "grad_norm": 2.7396671772003174, + "learning_rate": 3.946660070075245e-06, + "loss": 0.5013, + "step": 3922 + }, + { + "epoch": 1.8548463356973994, + "grad_norm": 2.7059738636016846, + "learning_rate": 3.946151252953849e-06, + "loss": 0.5875, + "step": 3923 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 2.5638437271118164, + "learning_rate": 3.945642345786955e-06, + "loss": 0.5063, + "step": 3924 + }, + { + "epoch": 1.855791962174941, + "grad_norm": 2.6647839546203613, + "learning_rate": 3.945133348606251e-06, + "loss": 0.5421, + "step": 3925 + }, + { + "epoch": 1.8562647754137116, + "grad_norm": 3.7235286235809326, + "learning_rate": 3.944624261443431e-06, + "loss": 0.5958, + "step": 3926 + }, + { + "epoch": 1.8567375886524822, + "grad_norm": 2.769984245300293, + "learning_rate": 3.944115084330192e-06, + "loss": 0.5678, + "step": 3927 + }, + { + "epoch": 1.857210401891253, + "grad_norm": 2.567249059677124, + "learning_rate": 3.9436058172982395e-06, + "loss": 0.4767, + "step": 3928 + }, + { + "epoch": 1.8576832151300238, + "grad_norm": 2.6196048259735107, + "learning_rate": 3.943096460379283e-06, + "loss": 0.5345, + "step": 3929 + }, + { + "epoch": 1.8581560283687943, + "grad_norm": 2.5999555587768555, + "learning_rate": 3.942587013605037e-06, + "loss": 0.5482, + "step": 3930 + }, + { + "epoch": 1.858628841607565, + "grad_norm": 2.630387783050537, + "learning_rate": 3.942077477007224e-06, + "loss": 0.6023, + "step": 3931 + }, + { + "epoch": 1.8591016548463357, + "grad_norm": 2.543503761291504, + "learning_rate": 3.941567850617569e-06, + "loss": 0.5157, + "step": 3932 + }, + { + "epoch": 1.8595744680851065, + "grad_norm": 2.5109236240386963, + "learning_rate": 3.941058134467805e-06, + "loss": 0.4774, + "step": 3933 + }, + { + "epoch": 1.860047281323877, + "grad_norm": 2.5110230445861816, + "learning_rate": 3.94054832858967e-06, + "loss": 0.5064, + "step": 3934 + }, + { + "epoch": 1.8605200945626477, + "grad_norm": 2.4780776500701904, + "learning_rate": 3.940038433014908e-06, + "loss": 0.5216, + "step": 3935 + }, + { + "epoch": 1.8609929078014185, + "grad_norm": 2.4398856163024902, + "learning_rate": 3.939528447775266e-06, + "loss": 0.4958, + "step": 3936 + }, + { + "epoch": 1.8614657210401893, + "grad_norm": 2.449498176574707, + "learning_rate": 3.9390183729025e-06, + "loss": 0.5165, + "step": 3937 + }, + { + "epoch": 1.8619385342789598, + "grad_norm": 2.982544422149658, + "learning_rate": 3.938508208428371e-06, + "loss": 0.4803, + "step": 3938 + }, + { + "epoch": 1.8624113475177304, + "grad_norm": 2.6574015617370605, + "learning_rate": 3.937997954384641e-06, + "loss": 0.4797, + "step": 3939 + }, + { + "epoch": 1.8628841607565012, + "grad_norm": 2.7773542404174805, + "learning_rate": 3.937487610803086e-06, + "loss": 0.4843, + "step": 3940 + }, + { + "epoch": 1.863356973995272, + "grad_norm": 2.588937759399414, + "learning_rate": 3.9369771777154805e-06, + "loss": 0.5426, + "step": 3941 + }, + { + "epoch": 1.8638297872340426, + "grad_norm": 2.855442523956299, + "learning_rate": 3.936466655153607e-06, + "loss": 0.5443, + "step": 3942 + }, + { + "epoch": 1.8643026004728132, + "grad_norm": 2.554676055908203, + "learning_rate": 3.935956043149253e-06, + "loss": 0.5334, + "step": 3943 + }, + { + "epoch": 1.864775413711584, + "grad_norm": 2.901599884033203, + "learning_rate": 3.935445341734212e-06, + "loss": 0.5842, + "step": 3944 + }, + { + "epoch": 1.8652482269503547, + "grad_norm": 2.554485321044922, + "learning_rate": 
3.934934550940285e-06, + "loss": 0.4941, + "step": 3945 + }, + { + "epoch": 1.8657210401891253, + "grad_norm": 2.357203245162964, + "learning_rate": 3.934423670799275e-06, + "loss": 0.4402, + "step": 3946 + }, + { + "epoch": 1.866193853427896, + "grad_norm": 2.7036049365997314, + "learning_rate": 3.933912701342993e-06, + "loss": 0.4966, + "step": 3947 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 2.7817211151123047, + "learning_rate": 3.933401642603255e-06, + "loss": 0.4908, + "step": 3948 + }, + { + "epoch": 1.8671394799054375, + "grad_norm": 2.439490795135498, + "learning_rate": 3.932890494611882e-06, + "loss": 0.4322, + "step": 3949 + }, + { + "epoch": 1.867612293144208, + "grad_norm": 3.187152147293091, + "learning_rate": 3.9323792574007e-06, + "loss": 0.501, + "step": 3950 + }, + { + "epoch": 1.8680851063829786, + "grad_norm": 2.405773401260376, + "learning_rate": 3.931867931001543e-06, + "loss": 0.4477, + "step": 3951 + }, + { + "epoch": 1.8685579196217494, + "grad_norm": 2.4922525882720947, + "learning_rate": 3.931356515446248e-06, + "loss": 0.5098, + "step": 3952 + }, + { + "epoch": 1.8690307328605202, + "grad_norm": 2.7781267166137695, + "learning_rate": 3.93084501076666e-06, + "loss": 0.5815, + "step": 3953 + }, + { + "epoch": 1.8695035460992908, + "grad_norm": 2.74621844291687, + "learning_rate": 3.930333416994626e-06, + "loss": 0.5605, + "step": 3954 + }, + { + "epoch": 1.8699763593380614, + "grad_norm": 2.5527689456939697, + "learning_rate": 3.929821734162004e-06, + "loss": 0.5141, + "step": 3955 + }, + { + "epoch": 1.8704491725768322, + "grad_norm": 2.5730628967285156, + "learning_rate": 3.92930996230065e-06, + "loss": 0.5446, + "step": 3956 + }, + { + "epoch": 1.870921985815603, + "grad_norm": 2.7053353786468506, + "learning_rate": 3.9287981014424334e-06, + "loss": 0.4722, + "step": 3957 + }, + { + "epoch": 1.8713947990543736, + "grad_norm": 2.7591893672943115, + "learning_rate": 3.928286151619224e-06, + "loss": 0.509, + "step": 3958 + }, + { + "epoch": 1.8718676122931441, + "grad_norm": 2.6233739852905273, + "learning_rate": 3.927774112862898e-06, + "loss": 0.5266, + "step": 3959 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 2.7715370655059814, + "learning_rate": 3.9272619852053396e-06, + "loss": 0.5612, + "step": 3960 + }, + { + "epoch": 1.8728132387706857, + "grad_norm": 2.4815211296081543, + "learning_rate": 3.926749768678435e-06, + "loss": 0.5498, + "step": 3961 + }, + { + "epoch": 1.8732860520094563, + "grad_norm": 2.6819605827331543, + "learning_rate": 3.926237463314078e-06, + "loss": 0.5499, + "step": 3962 + }, + { + "epoch": 1.8737588652482269, + "grad_norm": 2.638664722442627, + "learning_rate": 3.925725069144168e-06, + "loss": 0.5429, + "step": 3963 + }, + { + "epoch": 1.8742316784869977, + "grad_norm": 2.527294874191284, + "learning_rate": 3.925212586200611e-06, + "loss": 0.5451, + "step": 3964 + }, + { + "epoch": 1.8747044917257685, + "grad_norm": 2.831638813018799, + "learning_rate": 3.924700014515315e-06, + "loss": 0.5276, + "step": 3965 + }, + { + "epoch": 1.875177304964539, + "grad_norm": 2.5906996726989746, + "learning_rate": 3.924187354120196e-06, + "loss": 0.5323, + "step": 3966 + }, + { + "epoch": 1.8756501182033096, + "grad_norm": 2.5482442378997803, + "learning_rate": 3.923674605047175e-06, + "loss": 0.4882, + "step": 3967 + }, + { + "epoch": 1.8761229314420804, + "grad_norm": 2.56402850151062, + "learning_rate": 3.923161767328179e-06, + "loss": 0.5111, + "step": 3968 + }, + { + "epoch": 1.8765957446808512, + "grad_norm": 
3.223782539367676, + "learning_rate": 3.9226488409951405e-06, + "loss": 0.5829, + "step": 3969 + }, + { + "epoch": 1.8770685579196218, + "grad_norm": 2.665964365005493, + "learning_rate": 3.922135826079997e-06, + "loss": 0.4739, + "step": 3970 + }, + { + "epoch": 1.8775413711583924, + "grad_norm": 2.602696418762207, + "learning_rate": 3.921622722614691e-06, + "loss": 0.5199, + "step": 3971 + }, + { + "epoch": 1.8780141843971632, + "grad_norm": 2.5384418964385986, + "learning_rate": 3.921109530631172e-06, + "loss": 0.5086, + "step": 3972 + }, + { + "epoch": 1.878486997635934, + "grad_norm": 2.7961080074310303, + "learning_rate": 3.920596250161394e-06, + "loss": 0.5454, + "step": 3973 + }, + { + "epoch": 1.8789598108747045, + "grad_norm": 3.022007465362549, + "learning_rate": 3.920082881237317e-06, + "loss": 0.5537, + "step": 3974 + }, + { + "epoch": 1.8794326241134751, + "grad_norm": 2.699885129928589, + "learning_rate": 3.9195694238909045e-06, + "loss": 0.5274, + "step": 3975 + }, + { + "epoch": 1.879905437352246, + "grad_norm": 2.3994593620300293, + "learning_rate": 3.919055878154129e-06, + "loss": 0.4134, + "step": 3976 + }, + { + "epoch": 1.8803782505910167, + "grad_norm": 4.093045711517334, + "learning_rate": 3.918542244058967e-06, + "loss": 0.5305, + "step": 3977 + }, + { + "epoch": 1.8808510638297873, + "grad_norm": 3.011643171310425, + "learning_rate": 3.9180285216374e-06, + "loss": 0.5481, + "step": 3978 + }, + { + "epoch": 1.8813238770685579, + "grad_norm": 2.6426854133605957, + "learning_rate": 3.917514710921414e-06, + "loss": 0.5415, + "step": 3979 + }, + { + "epoch": 1.8817966903073287, + "grad_norm": 2.4379019737243652, + "learning_rate": 3.917000811943002e-06, + "loss": 0.4566, + "step": 3980 + }, + { + "epoch": 1.8822695035460995, + "grad_norm": 3.18522047996521, + "learning_rate": 3.9164868247341634e-06, + "loss": 0.6079, + "step": 3981 + }, + { + "epoch": 1.88274231678487, + "grad_norm": 2.6451141834259033, + "learning_rate": 3.915972749326903e-06, + "loss": 0.515, + "step": 3982 + }, + { + "epoch": 1.8832151300236406, + "grad_norm": 2.565598726272583, + "learning_rate": 3.915458585753226e-06, + "loss": 0.4799, + "step": 3983 + }, + { + "epoch": 1.8836879432624114, + "grad_norm": 2.711651563644409, + "learning_rate": 3.91494433404515e-06, + "loss": 0.5595, + "step": 3984 + }, + { + "epoch": 1.8841607565011822, + "grad_norm": 2.749328851699829, + "learning_rate": 3.914429994234695e-06, + "loss": 0.495, + "step": 3985 + }, + { + "epoch": 1.8846335697399526, + "grad_norm": 2.9492287635803223, + "learning_rate": 3.913915566353886e-06, + "loss": 0.5683, + "step": 3986 + }, + { + "epoch": 1.8851063829787233, + "grad_norm": 3.07747745513916, + "learning_rate": 3.913401050434756e-06, + "loss": 0.4953, + "step": 3987 + }, + { + "epoch": 1.8855791962174941, + "grad_norm": 2.8746345043182373, + "learning_rate": 3.912886446509338e-06, + "loss": 0.4752, + "step": 3988 + }, + { + "epoch": 1.8860520094562647, + "grad_norm": 2.772954225540161, + "learning_rate": 3.912371754609677e-06, + "loss": 0.5473, + "step": 3989 + }, + { + "epoch": 1.8865248226950353, + "grad_norm": 2.8906044960021973, + "learning_rate": 3.911856974767821e-06, + "loss": 0.5285, + "step": 3990 + }, + { + "epoch": 1.886997635933806, + "grad_norm": 2.8992726802825928, + "learning_rate": 3.9113421070158206e-06, + "loss": 0.571, + "step": 3991 + }, + { + "epoch": 1.887470449172577, + "grad_norm": 2.624662160873413, + "learning_rate": 3.910827151385737e-06, + "loss": 0.5183, + "step": 3992 + }, + { + "epoch": 
1.8879432624113475, + "grad_norm": 2.4491732120513916, + "learning_rate": 3.910312107909632e-06, + "loss": 0.4205, + "step": 3993 + }, + { + "epoch": 1.888416075650118, + "grad_norm": 2.278259515762329, + "learning_rate": 3.909796976619575e-06, + "loss": 0.4464, + "step": 3994 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 2.6481523513793945, + "learning_rate": 3.909281757547644e-06, + "loss": 0.5023, + "step": 3995 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 2.6687493324279785, + "learning_rate": 3.908766450725917e-06, + "loss": 0.495, + "step": 3996 + }, + { + "epoch": 1.8898345153664302, + "grad_norm": 2.507525682449341, + "learning_rate": 3.908251056186481e-06, + "loss": 0.4155, + "step": 3997 + }, + { + "epoch": 1.8903073286052008, + "grad_norm": 2.7048323154449463, + "learning_rate": 3.907735573961426e-06, + "loss": 0.4601, + "step": 3998 + }, + { + "epoch": 1.8907801418439716, + "grad_norm": 2.6825389862060547, + "learning_rate": 3.907220004082848e-06, + "loss": 0.5067, + "step": 3999 + }, + { + "epoch": 1.8912529550827424, + "grad_norm": 2.775696039199829, + "learning_rate": 3.906704346582852e-06, + "loss": 0.5411, + "step": 4000 + }, + { + "epoch": 1.891725768321513, + "grad_norm": 2.4492077827453613, + "learning_rate": 3.906188601493545e-06, + "loss": 0.4931, + "step": 4001 + }, + { + "epoch": 1.8921985815602835, + "grad_norm": 2.320810556411743, + "learning_rate": 3.905672768847041e-06, + "loss": 0.4908, + "step": 4002 + }, + { + "epoch": 1.8926713947990543, + "grad_norm": 2.455162525177002, + "learning_rate": 3.905156848675455e-06, + "loss": 0.508, + "step": 4003 + }, + { + "epoch": 1.8931442080378251, + "grad_norm": 2.515921115875244, + "learning_rate": 3.904640841010915e-06, + "loss": 0.5318, + "step": 4004 + }, + { + "epoch": 1.8936170212765957, + "grad_norm": 2.7230770587921143, + "learning_rate": 3.904124745885548e-06, + "loss": 0.4793, + "step": 4005 + }, + { + "epoch": 1.8940898345153663, + "grad_norm": 2.519934892654419, + "learning_rate": 3.903608563331491e-06, + "loss": 0.5013, + "step": 4006 + }, + { + "epoch": 1.894562647754137, + "grad_norm": 2.719674587249756, + "learning_rate": 3.903092293380883e-06, + "loss": 0.516, + "step": 4007 + }, + { + "epoch": 1.8950354609929079, + "grad_norm": 3.2107343673706055, + "learning_rate": 3.902575936065869e-06, + "loss": 0.6297, + "step": 4008 + }, + { + "epoch": 1.8955082742316784, + "grad_norm": 2.9773149490356445, + "learning_rate": 3.902059491418603e-06, + "loss": 0.566, + "step": 4009 + }, + { + "epoch": 1.895981087470449, + "grad_norm": 2.6754770278930664, + "learning_rate": 3.90154295947124e-06, + "loss": 0.5187, + "step": 4010 + }, + { + "epoch": 1.8964539007092198, + "grad_norm": 2.457303762435913, + "learning_rate": 3.901026340255943e-06, + "loss": 0.5757, + "step": 4011 + }, + { + "epoch": 1.8969267139479906, + "grad_norm": 2.5944161415100098, + "learning_rate": 3.900509633804878e-06, + "loss": 0.5049, + "step": 4012 + }, + { + "epoch": 1.8973995271867612, + "grad_norm": 2.610445022583008, + "learning_rate": 3.89999284015022e-06, + "loss": 0.521, + "step": 4013 + }, + { + "epoch": 1.8978723404255318, + "grad_norm": 2.6949338912963867, + "learning_rate": 3.899475959324146e-06, + "loss": 0.5619, + "step": 4014 + }, + { + "epoch": 1.8983451536643026, + "grad_norm": 2.7889559268951416, + "learning_rate": 3.898958991358841e-06, + "loss": 0.5223, + "step": 4015 + }, + { + "epoch": 1.8988179669030734, + "grad_norm": 2.569265842437744, + "learning_rate": 3.898441936286493e-06, + "loss": 0.5724, + "step": 
4016 + }, + { + "epoch": 1.899290780141844, + "grad_norm": 2.3567774295806885, + "learning_rate": 3.897924794139299e-06, + "loss": 0.4784, + "step": 4017 + }, + { + "epoch": 1.8997635933806145, + "grad_norm": 2.9176526069641113, + "learning_rate": 3.897407564949457e-06, + "loss": 0.646, + "step": 4018 + }, + { + "epoch": 1.9002364066193853, + "grad_norm": 2.7870090007781982, + "learning_rate": 3.896890248749174e-06, + "loss": 0.4922, + "step": 4019 + }, + { + "epoch": 1.900709219858156, + "grad_norm": 2.8310980796813965, + "learning_rate": 3.89637284557066e-06, + "loss": 0.4746, + "step": 4020 + }, + { + "epoch": 1.9011820330969267, + "grad_norm": 2.434915542602539, + "learning_rate": 3.895855355446131e-06, + "loss": 0.4537, + "step": 4021 + }, + { + "epoch": 1.9016548463356973, + "grad_norm": 3.0547034740448, + "learning_rate": 3.89533777840781e-06, + "loss": 0.6161, + "step": 4022 + }, + { + "epoch": 1.902127659574468, + "grad_norm": 3.416774272918701, + "learning_rate": 3.894820114487925e-06, + "loss": 0.5448, + "step": 4023 + }, + { + "epoch": 1.9026004728132389, + "grad_norm": 2.606951951980591, + "learning_rate": 3.894302363718707e-06, + "loss": 0.5501, + "step": 4024 + }, + { + "epoch": 1.9030732860520094, + "grad_norm": 3.082165002822876, + "learning_rate": 3.8937845261323945e-06, + "loss": 0.6035, + "step": 4025 + }, + { + "epoch": 1.90354609929078, + "grad_norm": 2.616093397140503, + "learning_rate": 3.893266601761231e-06, + "loss": 0.5294, + "step": 4026 + }, + { + "epoch": 1.9040189125295508, + "grad_norm": 2.7141637802124023, + "learning_rate": 3.8927485906374654e-06, + "loss": 0.5481, + "step": 4027 + }, + { + "epoch": 1.9044917257683216, + "grad_norm": 2.5129404067993164, + "learning_rate": 3.892230492793352e-06, + "loss": 0.4958, + "step": 4028 + }, + { + "epoch": 1.9049645390070922, + "grad_norm": 2.703403949737549, + "learning_rate": 3.891712308261151e-06, + "loss": 0.4852, + "step": 4029 + }, + { + "epoch": 1.9054373522458627, + "grad_norm": 2.881058931350708, + "learning_rate": 3.891194037073127e-06, + "loss": 0.4662, + "step": 4030 + }, + { + "epoch": 1.9059101654846335, + "grad_norm": 3.216769218444824, + "learning_rate": 3.8906756792615505e-06, + "loss": 0.5076, + "step": 4031 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 2.442265748977661, + "learning_rate": 3.890157234858697e-06, + "loss": 0.4748, + "step": 4032 + }, + { + "epoch": 1.906855791962175, + "grad_norm": 3.088672399520874, + "learning_rate": 3.889638703896849e-06, + "loss": 0.5729, + "step": 4033 + }, + { + "epoch": 1.9073286052009455, + "grad_norm": 2.9304986000061035, + "learning_rate": 3.889120086408291e-06, + "loss": 0.603, + "step": 4034 + }, + { + "epoch": 1.9078014184397163, + "grad_norm": 2.686093807220459, + "learning_rate": 3.888601382425318e-06, + "loss": 0.4978, + "step": 4035 + }, + { + "epoch": 1.908274231678487, + "grad_norm": 2.5668389797210693, + "learning_rate": 3.888082591980225e-06, + "loss": 0.5086, + "step": 4036 + }, + { + "epoch": 1.9087470449172577, + "grad_norm": 2.530996561050415, + "learning_rate": 3.887563715105315e-06, + "loss": 0.4678, + "step": 4037 + }, + { + "epoch": 1.9092198581560282, + "grad_norm": 3.043342351913452, + "learning_rate": 3.887044751832897e-06, + "loss": 0.5452, + "step": 4038 + }, + { + "epoch": 1.909692671394799, + "grad_norm": 2.799734115600586, + "learning_rate": 3.886525702195284e-06, + "loss": 0.5265, + "step": 4039 + }, + { + "epoch": 1.9101654846335698, + "grad_norm": 2.890022039413452, + "learning_rate": 3.886006566224796e-06, + "loss": 
0.4634, + "step": 4040 + }, + { + "epoch": 1.9106382978723404, + "grad_norm": 2.6804237365722656, + "learning_rate": 3.8854873439537555e-06, + "loss": 0.5031, + "step": 4041 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 2.43038272857666, + "learning_rate": 3.884968035414495e-06, + "loss": 0.5098, + "step": 4042 + }, + { + "epoch": 1.9115839243498818, + "grad_norm": 2.589583396911621, + "learning_rate": 3.884448640639346e-06, + "loss": 0.498, + "step": 4043 + }, + { + "epoch": 1.9120567375886526, + "grad_norm": 2.4565231800079346, + "learning_rate": 3.8839291596606524e-06, + "loss": 0.4318, + "step": 4044 + }, + { + "epoch": 1.9125295508274232, + "grad_norm": 2.66762638092041, + "learning_rate": 3.8834095925107575e-06, + "loss": 0.5441, + "step": 4045 + }, + { + "epoch": 1.9130023640661937, + "grad_norm": 2.7334461212158203, + "learning_rate": 3.882889939222013e-06, + "loss": 0.5209, + "step": 4046 + }, + { + "epoch": 1.9134751773049645, + "grad_norm": 2.6398537158966064, + "learning_rate": 3.8823701998267765e-06, + "loss": 0.4874, + "step": 4047 + }, + { + "epoch": 1.9139479905437353, + "grad_norm": 2.82405161857605, + "learning_rate": 3.881850374357409e-06, + "loss": 0.4519, + "step": 4048 + }, + { + "epoch": 1.914420803782506, + "grad_norm": 2.7552523612976074, + "learning_rate": 3.8813304628462776e-06, + "loss": 0.547, + "step": 4049 + }, + { + "epoch": 1.9148936170212765, + "grad_norm": 2.5287928581237793, + "learning_rate": 3.880810465325755e-06, + "loss": 0.5226, + "step": 4050 + }, + { + "epoch": 1.9153664302600473, + "grad_norm": 2.7597358226776123, + "learning_rate": 3.88029038182822e-06, + "loss": 0.5171, + "step": 4051 + }, + { + "epoch": 1.915839243498818, + "grad_norm": 2.563899278640747, + "learning_rate": 3.879770212386055e-06, + "loss": 0.4911, + "step": 4052 + }, + { + "epoch": 1.9163120567375886, + "grad_norm": 2.499404191970825, + "learning_rate": 3.879249957031649e-06, + "loss": 0.5072, + "step": 4053 + }, + { + "epoch": 1.9167848699763592, + "grad_norm": 2.817713499069214, + "learning_rate": 3.878729615797396e-06, + "loss": 0.5452, + "step": 4054 + }, + { + "epoch": 1.91725768321513, + "grad_norm": 2.7152490615844727, + "learning_rate": 3.878209188715696e-06, + "loss": 0.4917, + "step": 4055 + }, + { + "epoch": 1.9177304964539008, + "grad_norm": 2.384265661239624, + "learning_rate": 3.877688675818953e-06, + "loss": 0.4823, + "step": 4056 + }, + { + "epoch": 1.9182033096926714, + "grad_norm": 2.61059308052063, + "learning_rate": 3.877168077139577e-06, + "loss": 0.478, + "step": 4057 + }, + { + "epoch": 1.918676122931442, + "grad_norm": 2.6107938289642334, + "learning_rate": 3.8766473927099824e-06, + "loss": 0.5202, + "step": 4058 + }, + { + "epoch": 1.9191489361702128, + "grad_norm": 2.2339766025543213, + "learning_rate": 3.876126622562592e-06, + "loss": 0.547, + "step": 4059 + }, + { + "epoch": 1.9196217494089836, + "grad_norm": 2.4324610233306885, + "learning_rate": 3.8756057667298304e-06, + "loss": 0.5333, + "step": 4060 + }, + { + "epoch": 1.9200945626477541, + "grad_norm": 2.5521230697631836, + "learning_rate": 3.875084825244131e-06, + "loss": 0.5503, + "step": 4061 + }, + { + "epoch": 1.9205673758865247, + "grad_norm": 2.6985747814178467, + "learning_rate": 3.874563798137928e-06, + "loss": 0.4944, + "step": 4062 + }, + { + "epoch": 1.9210401891252955, + "grad_norm": 2.422332525253296, + "learning_rate": 3.874042685443664e-06, + "loss": 0.4807, + "step": 4063 + }, + { + "epoch": 1.9215130023640663, + "grad_norm": 2.914553165435791, + "learning_rate": 
3.873521487193788e-06, + "loss": 0.4439, + "step": 4064 + }, + { + "epoch": 1.9219858156028369, + "grad_norm": 2.8098697662353516, + "learning_rate": 3.873000203420752e-06, + "loss": 0.5433, + "step": 4065 + }, + { + "epoch": 1.9224586288416075, + "grad_norm": 2.6124703884124756, + "learning_rate": 3.872478834157013e-06, + "loss": 0.4812, + "step": 4066 + }, + { + "epoch": 1.9229314420803783, + "grad_norm": 2.511059522628784, + "learning_rate": 3.871957379435035e-06, + "loss": 0.4666, + "step": 4067 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 2.950542688369751, + "learning_rate": 3.871435839287287e-06, + "loss": 0.5687, + "step": 4068 + }, + { + "epoch": 1.9238770685579196, + "grad_norm": 2.4969422817230225, + "learning_rate": 3.870914213746243e-06, + "loss": 0.5235, + "step": 4069 + }, + { + "epoch": 1.9243498817966902, + "grad_norm": 2.512152910232544, + "learning_rate": 3.870392502844382e-06, + "loss": 0.4524, + "step": 4070 + }, + { + "epoch": 1.924822695035461, + "grad_norm": 3.0212557315826416, + "learning_rate": 3.86987070661419e-06, + "loss": 0.4868, + "step": 4071 + }, + { + "epoch": 1.9252955082742318, + "grad_norm": 2.8949966430664062, + "learning_rate": 3.869348825088154e-06, + "loss": 0.5556, + "step": 4072 + }, + { + "epoch": 1.9257683215130024, + "grad_norm": 2.402043581008911, + "learning_rate": 3.868826858298772e-06, + "loss": 0.5307, + "step": 4073 + }, + { + "epoch": 1.926241134751773, + "grad_norm": 2.980992078781128, + "learning_rate": 3.868304806278543e-06, + "loss": 0.6313, + "step": 4074 + }, + { + "epoch": 1.9267139479905437, + "grad_norm": 2.7140514850616455, + "learning_rate": 3.867782669059975e-06, + "loss": 0.5359, + "step": 4075 + }, + { + "epoch": 1.9271867612293145, + "grad_norm": 2.499631643295288, + "learning_rate": 3.867260446675577e-06, + "loss": 0.4873, + "step": 4076 + }, + { + "epoch": 1.9276595744680851, + "grad_norm": 2.915583610534668, + "learning_rate": 3.866738139157866e-06, + "loss": 0.5736, + "step": 4077 + }, + { + "epoch": 1.9281323877068557, + "grad_norm": 2.4231131076812744, + "learning_rate": 3.866215746539363e-06, + "loss": 0.5096, + "step": 4078 + }, + { + "epoch": 1.9286052009456265, + "grad_norm": 2.360074996948242, + "learning_rate": 3.865693268852599e-06, + "loss": 0.4907, + "step": 4079 + }, + { + "epoch": 1.9290780141843973, + "grad_norm": 2.5410032272338867, + "learning_rate": 3.865170706130101e-06, + "loss": 0.473, + "step": 4080 + }, + { + "epoch": 1.9295508274231679, + "grad_norm": 2.780090808868408, + "learning_rate": 3.86464805840441e-06, + "loss": 0.5213, + "step": 4081 + }, + { + "epoch": 1.9300236406619384, + "grad_norm": 2.7318382263183594, + "learning_rate": 3.864125325708068e-06, + "loss": 0.5617, + "step": 4082 + }, + { + "epoch": 1.9304964539007092, + "grad_norm": 2.76509165763855, + "learning_rate": 3.863602508073623e-06, + "loss": 0.52, + "step": 4083 + }, + { + "epoch": 1.93096926713948, + "grad_norm": 2.8041110038757324, + "learning_rate": 3.863079605533631e-06, + "loss": 0.5343, + "step": 4084 + }, + { + "epoch": 1.9314420803782506, + "grad_norm": 2.4462404251098633, + "learning_rate": 3.862556618120647e-06, + "loss": 0.4657, + "step": 4085 + }, + { + "epoch": 1.9319148936170212, + "grad_norm": 2.460864305496216, + "learning_rate": 3.862033545867238e-06, + "loss": 0.517, + "step": 4086 + }, + { + "epoch": 1.932387706855792, + "grad_norm": 2.6480276584625244, + "learning_rate": 3.8615103888059715e-06, + "loss": 0.4702, + "step": 4087 + }, + { + "epoch": 1.9328605200945628, + "grad_norm": 
2.7175381183624268, + "learning_rate": 3.860987146969424e-06, + "loss": 0.5073, + "step": 4088 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 2.4963486194610596, + "learning_rate": 3.860463820390175e-06, + "loss": 0.4491, + "step": 4089 + }, + { + "epoch": 1.933806146572104, + "grad_norm": 2.548135757446289, + "learning_rate": 3.8599404091008075e-06, + "loss": 0.5134, + "step": 4090 + }, + { + "epoch": 1.9342789598108747, + "grad_norm": 2.8693668842315674, + "learning_rate": 3.859416913133916e-06, + "loss": 0.5467, + "step": 4091 + }, + { + "epoch": 1.9347517730496455, + "grad_norm": 2.711273670196533, + "learning_rate": 3.858893332522092e-06, + "loss": 0.6287, + "step": 4092 + }, + { + "epoch": 1.935224586288416, + "grad_norm": 2.8604533672332764, + "learning_rate": 3.858369667297941e-06, + "loss": 0.5661, + "step": 4093 + }, + { + "epoch": 1.9356973995271867, + "grad_norm": 2.936988353729248, + "learning_rate": 3.857845917494066e-06, + "loss": 0.5311, + "step": 4094 + }, + { + "epoch": 1.9361702127659575, + "grad_norm": 2.414093494415283, + "learning_rate": 3.857322083143079e-06, + "loss": 0.505, + "step": 4095 + }, + { + "epoch": 1.9366430260047283, + "grad_norm": 2.5528934001922607, + "learning_rate": 3.856798164277599e-06, + "loss": 0.4759, + "step": 4096 + }, + { + "epoch": 1.9371158392434988, + "grad_norm": 2.592893600463867, + "learning_rate": 3.8562741609302456e-06, + "loss": 0.4932, + "step": 4097 + }, + { + "epoch": 1.9375886524822694, + "grad_norm": 2.9619107246398926, + "learning_rate": 3.855750073133648e-06, + "loss": 0.5563, + "step": 4098 + }, + { + "epoch": 1.9380614657210402, + "grad_norm": 2.864889621734619, + "learning_rate": 3.855225900920438e-06, + "loss": 0.5069, + "step": 4099 + }, + { + "epoch": 1.938534278959811, + "grad_norm": 2.3951032161712646, + "learning_rate": 3.854701644323253e-06, + "loss": 0.4883, + "step": 4100 + }, + { + "epoch": 1.9390070921985816, + "grad_norm": 2.6339633464813232, + "learning_rate": 3.854177303374737e-06, + "loss": 0.5207, + "step": 4101 + }, + { + "epoch": 1.9394799054373522, + "grad_norm": 2.6435508728027344, + "learning_rate": 3.853652878107539e-06, + "loss": 0.4679, + "step": 4102 + }, + { + "epoch": 1.939952718676123, + "grad_norm": 2.4635629653930664, + "learning_rate": 3.853128368554311e-06, + "loss": 0.5639, + "step": 4103 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 2.664635419845581, + "learning_rate": 3.852603774747714e-06, + "loss": 0.5697, + "step": 4104 + }, + { + "epoch": 1.9408983451536643, + "grad_norm": 2.7020363807678223, + "learning_rate": 3.8520790967204095e-06, + "loss": 0.5462, + "step": 4105 + }, + { + "epoch": 1.941371158392435, + "grad_norm": 3.529282331466675, + "learning_rate": 3.851554334505069e-06, + "loss": 0.54, + "step": 4106 + }, + { + "epoch": 1.9418439716312057, + "grad_norm": 2.7125768661499023, + "learning_rate": 3.851029488134367e-06, + "loss": 0.5355, + "step": 4107 + }, + { + "epoch": 1.9423167848699765, + "grad_norm": 2.5226643085479736, + "learning_rate": 3.850504557640981e-06, + "loss": 0.5106, + "step": 4108 + }, + { + "epoch": 1.942789598108747, + "grad_norm": 2.834352731704712, + "learning_rate": 3.8499795430575995e-06, + "loss": 0.6069, + "step": 4109 + }, + { + "epoch": 1.9432624113475176, + "grad_norm": 2.8484177589416504, + "learning_rate": 3.849454444416911e-06, + "loss": 0.5542, + "step": 4110 + }, + { + "epoch": 1.9437352245862884, + "grad_norm": 2.402539014816284, + "learning_rate": 3.848929261751612e-06, + "loss": 0.47, + "step": 4111 + }, + { + "epoch": 
1.9442080378250592, + "grad_norm": 2.7010042667388916, + "learning_rate": 3.848403995094402e-06, + "loss": 0.5263, + "step": 4112 + }, + { + "epoch": 1.9446808510638298, + "grad_norm": 2.441689968109131, + "learning_rate": 3.847878644477988e-06, + "loss": 0.5607, + "step": 4113 + }, + { + "epoch": 1.9451536643026004, + "grad_norm": 2.5994722843170166, + "learning_rate": 3.847353209935081e-06, + "loss": 0.5103, + "step": 4114 + }, + { + "epoch": 1.9456264775413712, + "grad_norm": 2.452242136001587, + "learning_rate": 3.8468276914983975e-06, + "loss": 0.4409, + "step": 4115 + }, + { + "epoch": 1.946099290780142, + "grad_norm": 2.421023368835449, + "learning_rate": 3.84630208920066e-06, + "loss": 0.4429, + "step": 4116 + }, + { + "epoch": 1.9465721040189126, + "grad_norm": 2.696399688720703, + "learning_rate": 3.8457764030745945e-06, + "loss": 0.5352, + "step": 4117 + }, + { + "epoch": 1.9470449172576831, + "grad_norm": 2.3963489532470703, + "learning_rate": 3.845250633152933e-06, + "loss": 0.4505, + "step": 4118 + }, + { + "epoch": 1.947517730496454, + "grad_norm": 2.610649585723877, + "learning_rate": 3.8447247794684135e-06, + "loss": 0.501, + "step": 4119 + }, + { + "epoch": 1.9479905437352247, + "grad_norm": 2.740412712097168, + "learning_rate": 3.8441988420537775e-06, + "loss": 0.5362, + "step": 4120 + }, + { + "epoch": 1.9484633569739953, + "grad_norm": 2.2614004611968994, + "learning_rate": 3.8436728209417755e-06, + "loss": 0.4199, + "step": 4121 + }, + { + "epoch": 1.9489361702127659, + "grad_norm": 3.0683481693267822, + "learning_rate": 3.843146716165158e-06, + "loss": 0.5248, + "step": 4122 + }, + { + "epoch": 1.9494089834515367, + "grad_norm": 3.005174398422241, + "learning_rate": 3.842620527756684e-06, + "loss": 0.5246, + "step": 4123 + }, + { + "epoch": 1.9498817966903075, + "grad_norm": 2.672896385192871, + "learning_rate": 3.842094255749117e-06, + "loss": 0.5586, + "step": 4124 + }, + { + "epoch": 1.950354609929078, + "grad_norm": 2.5481197834014893, + "learning_rate": 3.8415679001752255e-06, + "loss": 0.5061, + "step": 4125 + }, + { + "epoch": 1.9508274231678486, + "grad_norm": 2.515789270401001, + "learning_rate": 3.8410414610677835e-06, + "loss": 0.4645, + "step": 4126 + }, + { + "epoch": 1.9513002364066194, + "grad_norm": 2.7236077785491943, + "learning_rate": 3.84051493845957e-06, + "loss": 0.5623, + "step": 4127 + }, + { + "epoch": 1.9517730496453902, + "grad_norm": 2.6252009868621826, + "learning_rate": 3.839988332383369e-06, + "loss": 0.5078, + "step": 4128 + }, + { + "epoch": 1.9522458628841608, + "grad_norm": 2.719196081161499, + "learning_rate": 3.83946164287197e-06, + "loss": 0.5481, + "step": 4129 + }, + { + "epoch": 1.9527186761229314, + "grad_norm": 2.484163284301758, + "learning_rate": 3.838934869958169e-06, + "loss": 0.5332, + "step": 4130 + }, + { + "epoch": 1.9531914893617022, + "grad_norm": 2.615382671356201, + "learning_rate": 3.838408013674764e-06, + "loss": 0.4742, + "step": 4131 + }, + { + "epoch": 1.953664302600473, + "grad_norm": 2.735321044921875, + "learning_rate": 3.83788107405456e-06, + "loss": 0.421, + "step": 4132 + }, + { + "epoch": 1.9541371158392435, + "grad_norm": 2.892652750015259, + "learning_rate": 3.837354051130369e-06, + "loss": 0.5326, + "step": 4133 + }, + { + "epoch": 1.9546099290780141, + "grad_norm": 2.6800546646118164, + "learning_rate": 3.8368269449350055e-06, + "loss": 0.5041, + "step": 4134 + }, + { + "epoch": 1.955082742316785, + "grad_norm": 2.362470865249634, + "learning_rate": 3.836299755501289e-06, + "loss": 0.4697, + 
"step": 4135 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 2.3855135440826416, + "learning_rate": 3.835772482862047e-06, + "loss": 0.5148, + "step": 4136 + }, + { + "epoch": 1.9560283687943263, + "grad_norm": 2.3338418006896973, + "learning_rate": 3.83524512705011e-06, + "loss": 0.4643, + "step": 4137 + }, + { + "epoch": 1.9565011820330969, + "grad_norm": 2.261355400085449, + "learning_rate": 3.834717688098313e-06, + "loss": 0.5573, + "step": 4138 + }, + { + "epoch": 1.9569739952718677, + "grad_norm": 2.8166391849517822, + "learning_rate": 3.834190166039498e-06, + "loss": 0.4868, + "step": 4139 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 2.4155869483947754, + "learning_rate": 3.833662560906512e-06, + "loss": 0.4923, + "step": 4140 + }, + { + "epoch": 1.957919621749409, + "grad_norm": 2.3977696895599365, + "learning_rate": 3.833134872732206e-06, + "loss": 0.5106, + "step": 4141 + }, + { + "epoch": 1.9583924349881796, + "grad_norm": 2.9541378021240234, + "learning_rate": 3.832607101549438e-06, + "loss": 0.4683, + "step": 4142 + }, + { + "epoch": 1.9588652482269504, + "grad_norm": 2.5862700939178467, + "learning_rate": 3.832079247391068e-06, + "loss": 0.4453, + "step": 4143 + }, + { + "epoch": 1.9593380614657212, + "grad_norm": 2.7459371089935303, + "learning_rate": 3.8315513102899644e-06, + "loss": 0.5511, + "step": 4144 + }, + { + "epoch": 1.9598108747044918, + "grad_norm": 2.904869556427002, + "learning_rate": 3.831023290279e-06, + "loss": 0.5348, + "step": 4145 + }, + { + "epoch": 1.9602836879432624, + "grad_norm": 3.092846632003784, + "learning_rate": 3.830495187391051e-06, + "loss": 0.5664, + "step": 4146 + }, + { + "epoch": 1.9607565011820332, + "grad_norm": 3.2838528156280518, + "learning_rate": 3.829967001659001e-06, + "loss": 0.5115, + "step": 4147 + }, + { + "epoch": 1.961229314420804, + "grad_norm": 2.7799549102783203, + "learning_rate": 3.829438733115738e-06, + "loss": 0.5145, + "step": 4148 + }, + { + "epoch": 1.9617021276595743, + "grad_norm": 2.436084270477295, + "learning_rate": 3.828910381794154e-06, + "loss": 0.4718, + "step": 4149 + }, + { + "epoch": 1.962174940898345, + "grad_norm": 2.6662371158599854, + "learning_rate": 3.828381947727148e-06, + "loss": 0.6129, + "step": 4150 + }, + { + "epoch": 1.962647754137116, + "grad_norm": 2.937000036239624, + "learning_rate": 3.827853430947622e-06, + "loss": 0.522, + "step": 4151 + }, + { + "epoch": 1.9631205673758865, + "grad_norm": 2.5737369060516357, + "learning_rate": 3.827324831488486e-06, + "loss": 0.4916, + "step": 4152 + }, + { + "epoch": 1.963593380614657, + "grad_norm": 2.70232892036438, + "learning_rate": 3.826796149382653e-06, + "loss": 0.4726, + "step": 4153 + }, + { + "epoch": 1.9640661938534278, + "grad_norm": 2.6899707317352295, + "learning_rate": 3.826267384663042e-06, + "loss": 0.529, + "step": 4154 + }, + { + "epoch": 1.9645390070921986, + "grad_norm": 2.6142728328704834, + "learning_rate": 3.825738537362575e-06, + "loss": 0.4999, + "step": 4155 + }, + { + "epoch": 1.9650118203309692, + "grad_norm": 2.43949818611145, + "learning_rate": 3.825209607514183e-06, + "loss": 0.5035, + "step": 4156 + }, + { + "epoch": 1.9654846335697398, + "grad_norm": 2.3735458850860596, + "learning_rate": 3.824680595150801e-06, + "loss": 0.4779, + "step": 4157 + }, + { + "epoch": 1.9659574468085106, + "grad_norm": 2.444307565689087, + "learning_rate": 3.824151500305365e-06, + "loss": 0.4825, + "step": 4158 + }, + { + "epoch": 1.9664302600472814, + "grad_norm": 2.8219668865203857, + "learning_rate": 
3.8236223230108224e-06, + "loss": 0.5354, + "step": 4159 + }, + { + "epoch": 1.966903073286052, + "grad_norm": 2.720721483230591, + "learning_rate": 3.823093063300121e-06, + "loss": 0.5064, + "step": 4160 + }, + { + "epoch": 1.9673758865248225, + "grad_norm": 2.324190616607666, + "learning_rate": 3.822563721206217e-06, + "loss": 0.5348, + "step": 4161 + }, + { + "epoch": 1.9678486997635933, + "grad_norm": 2.702155351638794, + "learning_rate": 3.8220342967620695e-06, + "loss": 0.5388, + "step": 4162 + }, + { + "epoch": 1.9683215130023641, + "grad_norm": 2.4956369400024414, + "learning_rate": 3.821504790000642e-06, + "loss": 0.5071, + "step": 4163 + }, + { + "epoch": 1.9687943262411347, + "grad_norm": 2.568039655685425, + "learning_rate": 3.820975200954906e-06, + "loss": 0.5133, + "step": 4164 + }, + { + "epoch": 1.9692671394799053, + "grad_norm": 2.810868978500366, + "learning_rate": 3.820445529657837e-06, + "loss": 0.4856, + "step": 4165 + }, + { + "epoch": 1.969739952718676, + "grad_norm": 2.66365647315979, + "learning_rate": 3.819915776142415e-06, + "loss": 0.5235, + "step": 4166 + }, + { + "epoch": 1.9702127659574469, + "grad_norm": 2.2982139587402344, + "learning_rate": 3.8193859404416265e-06, + "loss": 0.4361, + "step": 4167 + }, + { + "epoch": 1.9706855791962175, + "grad_norm": 2.585672378540039, + "learning_rate": 3.818856022588458e-06, + "loss": 0.4842, + "step": 4168 + }, + { + "epoch": 1.971158392434988, + "grad_norm": 2.57857346534729, + "learning_rate": 3.81832602261591e-06, + "loss": 0.5249, + "step": 4169 + }, + { + "epoch": 1.9716312056737588, + "grad_norm": 2.6947224140167236, + "learning_rate": 3.817795940556981e-06, + "loss": 0.5234, + "step": 4170 + }, + { + "epoch": 1.9721040189125296, + "grad_norm": 2.7453415393829346, + "learning_rate": 3.8172657764446764e-06, + "loss": 0.5219, + "step": 4171 + }, + { + "epoch": 1.9725768321513002, + "grad_norm": 8.424073219299316, + "learning_rate": 3.816735530312009e-06, + "loss": 0.5162, + "step": 4172 + }, + { + "epoch": 1.9730496453900708, + "grad_norm": 2.8229739665985107, + "learning_rate": 3.816205202191993e-06, + "loss": 0.4621, + "step": 4173 + }, + { + "epoch": 1.9735224586288416, + "grad_norm": 2.5969009399414062, + "learning_rate": 3.815674792117651e-06, + "loss": 0.5044, + "step": 4174 + }, + { + "epoch": 1.9739952718676124, + "grad_norm": 2.646024227142334, + "learning_rate": 3.815144300122009e-06, + "loss": 0.5094, + "step": 4175 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 2.4950616359710693, + "learning_rate": 3.814613726238097e-06, + "loss": 0.4827, + "step": 4176 + }, + { + "epoch": 1.9749408983451535, + "grad_norm": 2.5636119842529297, + "learning_rate": 3.8140830704989535e-06, + "loss": 0.5241, + "step": 4177 + }, + { + "epoch": 1.9754137115839243, + "grad_norm": 2.7936553955078125, + "learning_rate": 3.813552332937619e-06, + "loss": 0.5344, + "step": 4178 + }, + { + "epoch": 1.9758865248226951, + "grad_norm": 2.8085341453552246, + "learning_rate": 3.8130215135871405e-06, + "loss": 0.5647, + "step": 4179 + }, + { + "epoch": 1.9763593380614657, + "grad_norm": 2.4776322841644287, + "learning_rate": 3.8124906124805694e-06, + "loss": 0.542, + "step": 4180 + }, + { + "epoch": 1.9768321513002363, + "grad_norm": 2.3227856159210205, + "learning_rate": 3.8119596296509635e-06, + "loss": 0.4618, + "step": 4181 + }, + { + "epoch": 1.977304964539007, + "grad_norm": 2.5157814025878906, + "learning_rate": 3.8114285651313848e-06, + "loss": 0.538, + "step": 4182 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 
2.5630218982696533, + "learning_rate": 3.8108974189548987e-06, + "loss": 0.5254, + "step": 4183 + }, + { + "epoch": 1.9782505910165484, + "grad_norm": 2.703237533569336, + "learning_rate": 3.8103661911545787e-06, + "loss": 0.4859, + "step": 4184 + }, + { + "epoch": 1.978723404255319, + "grad_norm": 2.8808000087738037, + "learning_rate": 3.809834881763502e-06, + "loss": 0.5585, + "step": 4185 + }, + { + "epoch": 1.9791962174940898, + "grad_norm": 2.9047577381134033, + "learning_rate": 3.8093034908147507e-06, + "loss": 0.5022, + "step": 4186 + }, + { + "epoch": 1.9796690307328606, + "grad_norm": 2.7417640686035156, + "learning_rate": 3.8087720183414125e-06, + "loss": 0.5275, + "step": 4187 + }, + { + "epoch": 1.9801418439716312, + "grad_norm": 2.952012062072754, + "learning_rate": 3.8082404643765786e-06, + "loss": 0.543, + "step": 4188 + }, + { + "epoch": 1.9806146572104018, + "grad_norm": 2.538376569747925, + "learning_rate": 3.807708828953348e-06, + "loss": 0.4969, + "step": 4189 + }, + { + "epoch": 1.9810874704491725, + "grad_norm": 2.3476181030273438, + "learning_rate": 3.807177112104823e-06, + "loss": 0.4979, + "step": 4190 + }, + { + "epoch": 1.9815602836879433, + "grad_norm": 2.6480464935302734, + "learning_rate": 3.80664531386411e-06, + "loss": 0.4894, + "step": 4191 + }, + { + "epoch": 1.982033096926714, + "grad_norm": 2.792916774749756, + "learning_rate": 3.8061134342643235e-06, + "loss": 0.5468, + "step": 4192 + }, + { + "epoch": 1.9825059101654845, + "grad_norm": 2.368736743927002, + "learning_rate": 3.805581473338581e-06, + "loss": 0.4672, + "step": 4193 + }, + { + "epoch": 1.9829787234042553, + "grad_norm": 2.379084348678589, + "learning_rate": 3.8050494311200037e-06, + "loss": 0.4577, + "step": 4194 + }, + { + "epoch": 1.983451536643026, + "grad_norm": 2.722471237182617, + "learning_rate": 3.804517307641722e-06, + "loss": 0.4988, + "step": 4195 + }, + { + "epoch": 1.9839243498817967, + "grad_norm": 2.356649875640869, + "learning_rate": 3.8039851029368674e-06, + "loss": 0.4933, + "step": 4196 + }, + { + "epoch": 1.9843971631205672, + "grad_norm": 2.9182281494140625, + "learning_rate": 3.8034528170385776e-06, + "loss": 0.4873, + "step": 4197 + }, + { + "epoch": 1.984869976359338, + "grad_norm": 2.6232199668884277, + "learning_rate": 3.8029204499799976e-06, + "loss": 0.4425, + "step": 4198 + }, + { + "epoch": 1.9853427895981088, + "grad_norm": 2.667541980743408, + "learning_rate": 3.802388001794274e-06, + "loss": 0.5022, + "step": 4199 + }, + { + "epoch": 1.9858156028368794, + "grad_norm": 3.168470621109009, + "learning_rate": 3.8018554725145596e-06, + "loss": 0.5505, + "step": 4200 + }, + { + "epoch": 1.98628841607565, + "grad_norm": 2.716625452041626, + "learning_rate": 3.8013228621740132e-06, + "loss": 0.4937, + "step": 4201 + }, + { + "epoch": 1.9867612293144208, + "grad_norm": 2.3014442920684814, + "learning_rate": 3.800790170805799e-06, + "loss": 0.4734, + "step": 4202 + }, + { + "epoch": 1.9872340425531916, + "grad_norm": 2.9426841735839844, + "learning_rate": 3.8002573984430847e-06, + "loss": 0.4983, + "step": 4203 + }, + { + "epoch": 1.9877068557919622, + "grad_norm": 2.5598278045654297, + "learning_rate": 3.7997245451190435e-06, + "loss": 0.4834, + "step": 4204 + }, + { + "epoch": 1.9881796690307327, + "grad_norm": 2.86458420753479, + "learning_rate": 3.7991916108668538e-06, + "loss": 0.5613, + "step": 4205 + }, + { + "epoch": 1.9886524822695035, + "grad_norm": 2.842914342880249, + "learning_rate": 3.7986585957196997e-06, + "loss": 0.4951, + "step": 4206 + }, + { + 
"epoch": 1.9891252955082743, + "grad_norm": 3.1828150749206543, + "learning_rate": 3.7981254997107686e-06, + "loss": 0.5913, + "step": 4207 + }, + { + "epoch": 1.989598108747045, + "grad_norm": 2.5765931606292725, + "learning_rate": 3.7975923228732547e-06, + "loss": 0.5544, + "step": 4208 + }, + { + "epoch": 1.9900709219858155, + "grad_norm": 2.492234945297241, + "learning_rate": 3.797059065240357e-06, + "loss": 0.5046, + "step": 4209 + }, + { + "epoch": 1.9905437352245863, + "grad_norm": 2.870346784591675, + "learning_rate": 3.7965257268452795e-06, + "loss": 0.5354, + "step": 4210 + }, + { + "epoch": 1.991016548463357, + "grad_norm": 2.4989993572235107, + "learning_rate": 3.795992307721229e-06, + "loss": 0.4677, + "step": 4211 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 2.931114673614502, + "learning_rate": 3.7954588079014206e-06, + "loss": 0.5504, + "step": 4212 + }, + { + "epoch": 1.9919621749408982, + "grad_norm": 2.5247652530670166, + "learning_rate": 3.794925227419073e-06, + "loss": 0.4736, + "step": 4213 + }, + { + "epoch": 1.992434988179669, + "grad_norm": 2.6238436698913574, + "learning_rate": 3.794391566307409e-06, + "loss": 0.4591, + "step": 4214 + }, + { + "epoch": 1.9929078014184398, + "grad_norm": 2.654886245727539, + "learning_rate": 3.7938578245996584e-06, + "loss": 0.5149, + "step": 4215 + }, + { + "epoch": 1.9933806146572104, + "grad_norm": 2.509164810180664, + "learning_rate": 3.793324002329054e-06, + "loss": 0.4951, + "step": 4216 + }, + { + "epoch": 1.993853427895981, + "grad_norm": 2.909632921218872, + "learning_rate": 3.7927900995288345e-06, + "loss": 0.5131, + "step": 4217 + }, + { + "epoch": 1.9943262411347518, + "grad_norm": 2.4354615211486816, + "learning_rate": 3.7922561162322456e-06, + "loss": 0.4716, + "step": 4218 + }, + { + "epoch": 1.9947990543735226, + "grad_norm": 2.6514649391174316, + "learning_rate": 3.791722052472534e-06, + "loss": 0.5714, + "step": 4219 + }, + { + "epoch": 1.9952718676122931, + "grad_norm": 2.77089262008667, + "learning_rate": 3.791187908282954e-06, + "loss": 0.5736, + "step": 4220 + }, + { + "epoch": 1.9957446808510637, + "grad_norm": 2.7651021480560303, + "learning_rate": 3.7906536836967657e-06, + "loss": 0.4948, + "step": 4221 + }, + { + "epoch": 1.9962174940898345, + "grad_norm": 2.7536795139312744, + "learning_rate": 3.7901193787472306e-06, + "loss": 0.512, + "step": 4222 + }, + { + "epoch": 1.9966903073286053, + "grad_norm": 2.684893846511841, + "learning_rate": 3.78958499346762e-06, + "loss": 0.5118, + "step": 4223 + }, + { + "epoch": 1.9971631205673759, + "grad_norm": 2.7616753578186035, + "learning_rate": 3.7890505278912054e-06, + "loss": 0.4516, + "step": 4224 + }, + { + "epoch": 1.9976359338061465, + "grad_norm": 2.4731967449188232, + "learning_rate": 3.7885159820512666e-06, + "loss": 0.4736, + "step": 4225 + }, + { + "epoch": 1.9981087470449173, + "grad_norm": 2.366631031036377, + "learning_rate": 3.7879813559810884e-06, + "loss": 0.4999, + "step": 4226 + }, + { + "epoch": 1.998581560283688, + "grad_norm": 2.994624137878418, + "learning_rate": 3.7874466497139582e-06, + "loss": 0.5273, + "step": 4227 + }, + { + "epoch": 1.9990543735224586, + "grad_norm": 2.4499242305755615, + "learning_rate": 3.7869118632831712e-06, + "loss": 0.5761, + "step": 4228 + }, + { + "epoch": 1.9995271867612292, + "grad_norm": 2.3370113372802734, + "learning_rate": 3.7863769967220243e-06, + "loss": 0.4673, + "step": 4229 + }, + { + "epoch": 2.0, + "grad_norm": 3.1131203174591064, + "learning_rate": 3.7858420500638236e-06, + "loss": 
0.5118, + "step": 4230 + }, + { + "epoch": 2.000472813238771, + "grad_norm": 2.2747561931610107, + "learning_rate": 3.785307023341876e-06, + "loss": 0.4166, + "step": 4231 + }, + { + "epoch": 2.000945626477541, + "grad_norm": 2.4347424507141113, + "learning_rate": 3.7847719165894963e-06, + "loss": 0.4161, + "step": 4232 + }, + { + "epoch": 2.001418439716312, + "grad_norm": 2.398805618286133, + "learning_rate": 3.784236729840003e-06, + "loss": 0.4652, + "step": 4233 + }, + { + "epoch": 2.0018912529550827, + "grad_norm": 2.1904916763305664, + "learning_rate": 3.783701463126719e-06, + "loss": 0.4554, + "step": 4234 + }, + { + "epoch": 2.0023640661938535, + "grad_norm": 2.237330913543701, + "learning_rate": 3.7831661164829735e-06, + "loss": 0.4471, + "step": 4235 + }, + { + "epoch": 2.002836879432624, + "grad_norm": 2.3656628131866455, + "learning_rate": 3.7826306899421016e-06, + "loss": 0.4052, + "step": 4236 + }, + { + "epoch": 2.0033096926713947, + "grad_norm": 2.615489959716797, + "learning_rate": 3.7820951835374405e-06, + "loss": 0.4847, + "step": 4237 + }, + { + "epoch": 2.0037825059101655, + "grad_norm": 2.453036308288574, + "learning_rate": 3.7815595973023347e-06, + "loss": 0.4672, + "step": 4238 + }, + { + "epoch": 2.0042553191489363, + "grad_norm": 2.537468671798706, + "learning_rate": 3.7810239312701306e-06, + "loss": 0.467, + "step": 4239 + }, + { + "epoch": 2.0047281323877066, + "grad_norm": 2.3321666717529297, + "learning_rate": 3.780488185474184e-06, + "loss": 0.3557, + "step": 4240 + }, + { + "epoch": 2.0052009456264774, + "grad_norm": 2.9051828384399414, + "learning_rate": 3.779952359947854e-06, + "loss": 0.5474, + "step": 4241 + }, + { + "epoch": 2.0056737588652482, + "grad_norm": 2.7458817958831787, + "learning_rate": 3.7794164547245015e-06, + "loss": 0.4659, + "step": 4242 + }, + { + "epoch": 2.006146572104019, + "grad_norm": 2.627046585083008, + "learning_rate": 3.778880469837497e-06, + "loss": 0.4179, + "step": 4243 + }, + { + "epoch": 2.0066193853427894, + "grad_norm": 2.4186174869537354, + "learning_rate": 3.7783444053202135e-06, + "loss": 0.3976, + "step": 4244 + }, + { + "epoch": 2.00709219858156, + "grad_norm": 3.109376907348633, + "learning_rate": 3.7778082612060296e-06, + "loss": 0.4095, + "step": 4245 + }, + { + "epoch": 2.007565011820331, + "grad_norm": 2.583376169204712, + "learning_rate": 3.7772720375283282e-06, + "loss": 0.4325, + "step": 4246 + }, + { + "epoch": 2.0080378250591018, + "grad_norm": 2.6199896335601807, + "learning_rate": 3.776735734320497e-06, + "loss": 0.4207, + "step": 4247 + }, + { + "epoch": 2.008510638297872, + "grad_norm": 2.545353651046753, + "learning_rate": 3.77619935161593e-06, + "loss": 0.4483, + "step": 4248 + }, + { + "epoch": 2.008983451536643, + "grad_norm": 2.770266056060791, + "learning_rate": 3.7756628894480263e-06, + "loss": 0.457, + "step": 4249 + }, + { + "epoch": 2.0094562647754137, + "grad_norm": 2.903254985809326, + "learning_rate": 3.7751263478501878e-06, + "loss": 0.4171, + "step": 4250 + }, + { + "epoch": 2.0099290780141845, + "grad_norm": 2.5576963424682617, + "learning_rate": 3.774589726855822e-06, + "loss": 0.3631, + "step": 4251 + }, + { + "epoch": 2.010401891252955, + "grad_norm": 3.7584285736083984, + "learning_rate": 3.7740530264983434e-06, + "loss": 0.4827, + "step": 4252 + }, + { + "epoch": 2.0108747044917257, + "grad_norm": 3.3116581439971924, + "learning_rate": 3.77351624681117e-06, + "loss": 0.5071, + "step": 4253 + }, + { + "epoch": 2.0113475177304965, + "grad_norm": 3.1370885372161865, + "learning_rate": 
3.772979387827723e-06, + "loss": 0.4963, + "step": 4254 + }, + { + "epoch": 2.0118203309692673, + "grad_norm": 2.4832639694213867, + "learning_rate": 3.772442449581432e-06, + "loss": 0.4442, + "step": 4255 + }, + { + "epoch": 2.0122931442080376, + "grad_norm": 2.7645785808563232, + "learning_rate": 3.7719054321057293e-06, + "loss": 0.4572, + "step": 4256 + }, + { + "epoch": 2.0127659574468084, + "grad_norm": 2.7962236404418945, + "learning_rate": 3.7713683354340515e-06, + "loss": 0.4906, + "step": 4257 + }, + { + "epoch": 2.013238770685579, + "grad_norm": 2.647991895675659, + "learning_rate": 3.7708311595998425e-06, + "loss": 0.4027, + "step": 4258 + }, + { + "epoch": 2.01371158392435, + "grad_norm": 2.3780267238616943, + "learning_rate": 3.7702939046365504e-06, + "loss": 0.4285, + "step": 4259 + }, + { + "epoch": 2.0141843971631204, + "grad_norm": 2.5185933113098145, + "learning_rate": 3.7697565705776266e-06, + "loss": 0.4834, + "step": 4260 + }, + { + "epoch": 2.014657210401891, + "grad_norm": 2.432507276535034, + "learning_rate": 3.7692191574565294e-06, + "loss": 0.3695, + "step": 4261 + }, + { + "epoch": 2.015130023640662, + "grad_norm": 2.8010706901550293, + "learning_rate": 3.76868166530672e-06, + "loss": 0.478, + "step": 4262 + }, + { + "epoch": 2.0156028368794328, + "grad_norm": 2.32817006111145, + "learning_rate": 3.768144094161666e-06, + "loss": 0.4154, + "step": 4263 + }, + { + "epoch": 2.016075650118203, + "grad_norm": 3.062812328338623, + "learning_rate": 3.7676064440548405e-06, + "loss": 0.5015, + "step": 4264 + }, + { + "epoch": 2.016548463356974, + "grad_norm": 2.6129536628723145, + "learning_rate": 3.7670687150197194e-06, + "loss": 0.3843, + "step": 4265 + }, + { + "epoch": 2.0170212765957447, + "grad_norm": 2.838259696960449, + "learning_rate": 3.766530907089786e-06, + "loss": 0.4937, + "step": 4266 + }, + { + "epoch": 2.0174940898345155, + "grad_norm": 2.601203680038452, + "learning_rate": 3.7659930202985263e-06, + "loss": 0.4644, + "step": 4267 + }, + { + "epoch": 2.017966903073286, + "grad_norm": 2.5964133739471436, + "learning_rate": 3.7654550546794322e-06, + "loss": 0.4365, + "step": 4268 + }, + { + "epoch": 2.0184397163120567, + "grad_norm": 3.0028915405273438, + "learning_rate": 3.764917010266001e-06, + "loss": 0.434, + "step": 4269 + }, + { + "epoch": 2.0189125295508275, + "grad_norm": 2.719252586364746, + "learning_rate": 3.764378887091734e-06, + "loss": 0.4401, + "step": 4270 + }, + { + "epoch": 2.0193853427895982, + "grad_norm": 2.400254011154175, + "learning_rate": 3.7638406851901377e-06, + "loss": 0.4904, + "step": 4271 + }, + { + "epoch": 2.0198581560283686, + "grad_norm": 2.8015363216400146, + "learning_rate": 3.763302404594724e-06, + "loss": 0.4569, + "step": 4272 + }, + { + "epoch": 2.0203309692671394, + "grad_norm": 2.718416452407837, + "learning_rate": 3.762764045339009e-06, + "loss": 0.5124, + "step": 4273 + }, + { + "epoch": 2.02080378250591, + "grad_norm": 2.484049081802368, + "learning_rate": 3.762225607456514e-06, + "loss": 0.4255, + "step": 4274 + }, + { + "epoch": 2.021276595744681, + "grad_norm": 2.6377930641174316, + "learning_rate": 3.7616870909807645e-06, + "loss": 0.5044, + "step": 4275 + }, + { + "epoch": 2.0217494089834513, + "grad_norm": 2.8845038414001465, + "learning_rate": 3.7611484959452927e-06, + "loss": 0.4924, + "step": 4276 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 2.5939974784851074, + "learning_rate": 3.7606098223836342e-06, + "loss": 0.4873, + "step": 4277 + }, + { + "epoch": 2.022695035460993, + "grad_norm": 
2.499826431274414, + "learning_rate": 3.76007107032933e-06, + "loss": 0.4515, + "step": 4278 + }, + { + "epoch": 2.0231678486997637, + "grad_norm": 3.0318663120269775, + "learning_rate": 3.759532239815924e-06, + "loss": 0.4901, + "step": 4279 + }, + { + "epoch": 2.023640661938534, + "grad_norm": 2.857977867126465, + "learning_rate": 3.758993330876969e-06, + "loss": 0.4659, + "step": 4280 + }, + { + "epoch": 2.024113475177305, + "grad_norm": 2.47918438911438, + "learning_rate": 3.7584543435460196e-06, + "loss": 0.4323, + "step": 4281 + }, + { + "epoch": 2.0245862884160757, + "grad_norm": 2.6033785343170166, + "learning_rate": 3.757915277856637e-06, + "loss": 0.4437, + "step": 4282 + }, + { + "epoch": 2.0250591016548465, + "grad_norm": 2.799781322479248, + "learning_rate": 3.757376133842386e-06, + "loss": 0.4523, + "step": 4283 + }, + { + "epoch": 2.025531914893617, + "grad_norm": 2.6092529296875, + "learning_rate": 3.756836911536836e-06, + "loss": 0.3898, + "step": 4284 + }, + { + "epoch": 2.0260047281323876, + "grad_norm": 2.66229248046875, + "learning_rate": 3.7562976109735627e-06, + "loss": 0.4731, + "step": 4285 + }, + { + "epoch": 2.0264775413711584, + "grad_norm": 2.90142822265625, + "learning_rate": 3.7557582321861463e-06, + "loss": 0.4285, + "step": 4286 + }, + { + "epoch": 2.0269503546099292, + "grad_norm": 2.5138802528381348, + "learning_rate": 3.7552187752081707e-06, + "loss": 0.4467, + "step": 4287 + }, + { + "epoch": 2.0274231678486996, + "grad_norm": 3.0656235218048096, + "learning_rate": 3.754679240073226e-06, + "loss": 0.4718, + "step": 4288 + }, + { + "epoch": 2.0278959810874704, + "grad_norm": 2.9633383750915527, + "learning_rate": 3.754139626814907e-06, + "loss": 0.4741, + "step": 4289 + }, + { + "epoch": 2.028368794326241, + "grad_norm": 2.5925145149230957, + "learning_rate": 3.753599935466812e-06, + "loss": 0.4281, + "step": 4290 + }, + { + "epoch": 2.028841607565012, + "grad_norm": 2.837740659713745, + "learning_rate": 3.7530601660625456e-06, + "loss": 0.4757, + "step": 4291 + }, + { + "epoch": 2.0293144208037823, + "grad_norm": 2.3995790481567383, + "learning_rate": 3.752520318635718e-06, + "loss": 0.4148, + "step": 4292 + }, + { + "epoch": 2.029787234042553, + "grad_norm": 2.572601795196533, + "learning_rate": 3.7519803932199424e-06, + "loss": 0.4051, + "step": 4293 + }, + { + "epoch": 2.030260047281324, + "grad_norm": 2.6780295372009277, + "learning_rate": 3.751440389848837e-06, + "loss": 0.4626, + "step": 4294 + }, + { + "epoch": 2.0307328605200947, + "grad_norm": 2.8666839599609375, + "learning_rate": 3.7509003085560257e-06, + "loss": 0.4255, + "step": 4295 + }, + { + "epoch": 2.031205673758865, + "grad_norm": 2.4398207664489746, + "learning_rate": 3.750360149375138e-06, + "loss": 0.4235, + "step": 4296 + }, + { + "epoch": 2.031678486997636, + "grad_norm": 2.436840534210205, + "learning_rate": 3.7498199123398062e-06, + "loss": 0.3907, + "step": 4297 + }, + { + "epoch": 2.0321513002364067, + "grad_norm": 3.3945820331573486, + "learning_rate": 3.7492795974836683e-06, + "loss": 0.465, + "step": 4298 + }, + { + "epoch": 2.0326241134751775, + "grad_norm": 2.6693103313446045, + "learning_rate": 3.7487392048403678e-06, + "loss": 0.4948, + "step": 4299 + }, + { + "epoch": 2.033096926713948, + "grad_norm": 2.7642734050750732, + "learning_rate": 3.748198734443553e-06, + "loss": 0.4538, + "step": 4300 + }, + { + "epoch": 2.0335697399527186, + "grad_norm": 3.1436543464660645, + "learning_rate": 3.747658186326876e-06, + "loss": 0.5137, + "step": 4301 + }, + { + "epoch": 
2.0340425531914894, + "grad_norm": 3.482678174972534, + "learning_rate": 3.7471175605239947e-06, + "loss": 0.4982, + "step": 4302 + }, + { + "epoch": 2.03451536643026, + "grad_norm": 2.712557077407837, + "learning_rate": 3.746576857068571e-06, + "loss": 0.4459, + "step": 4303 + }, + { + "epoch": 2.0349881796690306, + "grad_norm": 3.147440195083618, + "learning_rate": 3.7460360759942726e-06, + "loss": 0.5063, + "step": 4304 + }, + { + "epoch": 2.0354609929078014, + "grad_norm": 2.840672492980957, + "learning_rate": 3.7454952173347714e-06, + "loss": 0.5041, + "step": 4305 + }, + { + "epoch": 2.035933806146572, + "grad_norm": 2.584122657775879, + "learning_rate": 3.744954281123745e-06, + "loss": 0.4487, + "step": 4306 + }, + { + "epoch": 2.036406619385343, + "grad_norm": 2.9869542121887207, + "learning_rate": 3.7444132673948737e-06, + "loss": 0.479, + "step": 4307 + }, + { + "epoch": 2.0368794326241133, + "grad_norm": 2.478459358215332, + "learning_rate": 3.7438721761818446e-06, + "loss": 0.4636, + "step": 4308 + }, + { + "epoch": 2.037352245862884, + "grad_norm": 2.5524215698242188, + "learning_rate": 3.7433310075183504e-06, + "loss": 0.4601, + "step": 4309 + }, + { + "epoch": 2.037825059101655, + "grad_norm": 2.3709988594055176, + "learning_rate": 3.742789761438086e-06, + "loss": 0.4163, + "step": 4310 + }, + { + "epoch": 2.0382978723404257, + "grad_norm": 3.140355348587036, + "learning_rate": 3.742248437974752e-06, + "loss": 0.4433, + "step": 4311 + }, + { + "epoch": 2.038770685579196, + "grad_norm": 2.940948486328125, + "learning_rate": 3.741707037162055e-06, + "loss": 0.4299, + "step": 4312 + }, + { + "epoch": 2.039243498817967, + "grad_norm": 3.009157419204712, + "learning_rate": 3.7411655590337055e-06, + "loss": 0.463, + "step": 4313 + }, + { + "epoch": 2.0397163120567376, + "grad_norm": 2.672945737838745, + "learning_rate": 3.7406240036234185e-06, + "loss": 0.4696, + "step": 4314 + }, + { + "epoch": 2.0401891252955084, + "grad_norm": 2.745962142944336, + "learning_rate": 3.740082370964916e-06, + "loss": 0.4931, + "step": 4315 + }, + { + "epoch": 2.040661938534279, + "grad_norm": 2.3939316272735596, + "learning_rate": 3.7395406610919217e-06, + "loss": 0.4396, + "step": 4316 + }, + { + "epoch": 2.0411347517730496, + "grad_norm": 2.4364447593688965, + "learning_rate": 3.738998874038165e-06, + "loss": 0.4807, + "step": 4317 + }, + { + "epoch": 2.0416075650118204, + "grad_norm": 2.360489845275879, + "learning_rate": 3.738457009837381e-06, + "loss": 0.4426, + "step": 4318 + }, + { + "epoch": 2.042080378250591, + "grad_norm": 2.5494935512542725, + "learning_rate": 3.7379150685233108e-06, + "loss": 0.4189, + "step": 4319 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 2.635472059249878, + "learning_rate": 3.7373730501296963e-06, + "loss": 0.5014, + "step": 4320 + }, + { + "epoch": 2.0430260047281323, + "grad_norm": 2.4982943534851074, + "learning_rate": 3.7368309546902876e-06, + "loss": 0.4658, + "step": 4321 + }, + { + "epoch": 2.043498817966903, + "grad_norm": 2.692742109298706, + "learning_rate": 3.736288782238839e-06, + "loss": 0.4454, + "step": 4322 + }, + { + "epoch": 2.043971631205674, + "grad_norm": 2.6774091720581055, + "learning_rate": 3.7357465328091086e-06, + "loss": 0.5002, + "step": 4323 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 2.695138692855835, + "learning_rate": 3.735204206434861e-06, + "loss": 0.448, + "step": 4324 + }, + { + "epoch": 2.044917257683215, + "grad_norm": 2.5383570194244385, + "learning_rate": 3.7346618031498635e-06, + "loss": 0.4352, + 
"step": 4325 + }, + { + "epoch": 2.045390070921986, + "grad_norm": 2.267277240753174, + "learning_rate": 3.7341193229878886e-06, + "loss": 0.4162, + "step": 4326 + }, + { + "epoch": 2.0458628841607567, + "grad_norm": 2.6037328243255615, + "learning_rate": 3.733576765982715e-06, + "loss": 0.4471, + "step": 4327 + }, + { + "epoch": 2.046335697399527, + "grad_norm": 3.261385440826416, + "learning_rate": 3.7330341321681253e-06, + "loss": 0.4618, + "step": 4328 + }, + { + "epoch": 2.046808510638298, + "grad_norm": 2.440650463104248, + "learning_rate": 3.7324914215779072e-06, + "loss": 0.4476, + "step": 4329 + }, + { + "epoch": 2.0472813238770686, + "grad_norm": 2.5940682888031006, + "learning_rate": 3.731948634245853e-06, + "loss": 0.4389, + "step": 4330 + }, + { + "epoch": 2.0477541371158394, + "grad_norm": 2.7428150177001953, + "learning_rate": 3.7314057702057582e-06, + "loss": 0.4477, + "step": 4331 + }, + { + "epoch": 2.0482269503546098, + "grad_norm": 2.3546223640441895, + "learning_rate": 3.730862829491427e-06, + "loss": 0.4047, + "step": 4332 + }, + { + "epoch": 2.0486997635933806, + "grad_norm": 2.552422523498535, + "learning_rate": 3.7303198121366637e-06, + "loss": 0.4438, + "step": 4333 + }, + { + "epoch": 2.0491725768321514, + "grad_norm": 2.99226713180542, + "learning_rate": 3.729776718175281e-06, + "loss": 0.491, + "step": 4334 + }, + { + "epoch": 2.049645390070922, + "grad_norm": 3.2003321647644043, + "learning_rate": 3.7292335476410935e-06, + "loss": 0.5458, + "step": 4335 + }, + { + "epoch": 2.0501182033096925, + "grad_norm": 2.739847183227539, + "learning_rate": 3.7286903005679237e-06, + "loss": 0.4499, + "step": 4336 + }, + { + "epoch": 2.0505910165484633, + "grad_norm": 2.5917470455169678, + "learning_rate": 3.7281469769895963e-06, + "loss": 0.4714, + "step": 4337 + }, + { + "epoch": 2.051063829787234, + "grad_norm": 2.8029327392578125, + "learning_rate": 3.7276035769399422e-06, + "loss": 0.42, + "step": 4338 + }, + { + "epoch": 2.051536643026005, + "grad_norm": 2.484879493713379, + "learning_rate": 3.727060100452796e-06, + "loss": 0.4163, + "step": 4339 + }, + { + "epoch": 2.0520094562647753, + "grad_norm": 2.7126030921936035, + "learning_rate": 3.7265165475619973e-06, + "loss": 0.4112, + "step": 4340 + }, + { + "epoch": 2.052482269503546, + "grad_norm": 2.618267774581909, + "learning_rate": 3.7259729183013927e-06, + "loss": 0.4281, + "step": 4341 + }, + { + "epoch": 2.052955082742317, + "grad_norm": 2.703270673751831, + "learning_rate": 3.7254292127048293e-06, + "loss": 0.4437, + "step": 4342 + }, + { + "epoch": 2.0534278959810877, + "grad_norm": 2.429150104522705, + "learning_rate": 3.7248854308061623e-06, + "loss": 0.3971, + "step": 4343 + }, + { + "epoch": 2.053900709219858, + "grad_norm": 2.54354190826416, + "learning_rate": 3.7243415726392508e-06, + "loss": 0.4485, + "step": 4344 + }, + { + "epoch": 2.054373522458629, + "grad_norm": 2.9515016078948975, + "learning_rate": 3.723797638237957e-06, + "loss": 0.4386, + "step": 4345 + }, + { + "epoch": 2.0548463356973996, + "grad_norm": 2.9129958152770996, + "learning_rate": 3.7232536276361514e-06, + "loss": 0.4595, + "step": 4346 + }, + { + "epoch": 2.0553191489361704, + "grad_norm": 2.5397512912750244, + "learning_rate": 3.722709540867706e-06, + "loss": 0.3681, + "step": 4347 + }, + { + "epoch": 2.0557919621749408, + "grad_norm": 2.79884672164917, + "learning_rate": 3.722165377966499e-06, + "loss": 0.4576, + "step": 4348 + }, + { + "epoch": 2.0562647754137116, + "grad_norm": 2.669936180114746, + "learning_rate": 
3.7216211389664137e-06, + "loss": 0.3692, + "step": 4349 + }, + { + "epoch": 2.0567375886524824, + "grad_norm": 2.512326240539551, + "learning_rate": 3.7210768239013355e-06, + "loss": 0.4554, + "step": 4350 + }, + { + "epoch": 2.057210401891253, + "grad_norm": 2.913693904876709, + "learning_rate": 3.7205324328051583e-06, + "loss": 0.5282, + "step": 4351 + }, + { + "epoch": 2.0576832151300235, + "grad_norm": 3.040891170501709, + "learning_rate": 3.719987965711778e-06, + "loss": 0.4778, + "step": 4352 + }, + { + "epoch": 2.0581560283687943, + "grad_norm": 2.7504117488861084, + "learning_rate": 3.7194434226550966e-06, + "loss": 0.4217, + "step": 4353 + }, + { + "epoch": 2.058628841607565, + "grad_norm": 2.5522971153259277, + "learning_rate": 3.718898803669021e-06, + "loss": 0.437, + "step": 4354 + }, + { + "epoch": 2.059101654846336, + "grad_norm": 2.8531908988952637, + "learning_rate": 3.718354108787461e-06, + "loss": 0.4251, + "step": 4355 + }, + { + "epoch": 2.0595744680851062, + "grad_norm": 2.5812065601348877, + "learning_rate": 3.7178093380443337e-06, + "loss": 0.4374, + "step": 4356 + }, + { + "epoch": 2.060047281323877, + "grad_norm": 2.627871513366699, + "learning_rate": 3.7172644914735583e-06, + "loss": 0.436, + "step": 4357 + }, + { + "epoch": 2.060520094562648, + "grad_norm": 2.7146239280700684, + "learning_rate": 3.7167195691090607e-06, + "loss": 0.4204, + "step": 4358 + }, + { + "epoch": 2.0609929078014186, + "grad_norm": 2.486483573913574, + "learning_rate": 3.7161745709847706e-06, + "loss": 0.4015, + "step": 4359 + }, + { + "epoch": 2.061465721040189, + "grad_norm": 2.866049289703369, + "learning_rate": 3.7156294971346226e-06, + "loss": 0.4087, + "step": 4360 + }, + { + "epoch": 2.06193853427896, + "grad_norm": 2.9345552921295166, + "learning_rate": 3.715084347592556e-06, + "loss": 0.5074, + "step": 4361 + }, + { + "epoch": 2.0624113475177306, + "grad_norm": 2.502455711364746, + "learning_rate": 3.7145391223925155e-06, + "loss": 0.469, + "step": 4362 + }, + { + "epoch": 2.0628841607565014, + "grad_norm": 2.6419875621795654, + "learning_rate": 3.713993821568449e-06, + "loss": 0.4493, + "step": 4363 + }, + { + "epoch": 2.0633569739952717, + "grad_norm": 3.812079429626465, + "learning_rate": 3.7134484451543114e-06, + "loss": 0.4764, + "step": 4364 + }, + { + "epoch": 2.0638297872340425, + "grad_norm": 2.581780195236206, + "learning_rate": 3.712902993184059e-06, + "loss": 0.3994, + "step": 4365 + }, + { + "epoch": 2.0643026004728133, + "grad_norm": 2.282508134841919, + "learning_rate": 3.712357465691656e-06, + "loss": 0.4252, + "step": 4366 + }, + { + "epoch": 2.064775413711584, + "grad_norm": 2.4727818965911865, + "learning_rate": 3.71181186271107e-06, + "loss": 0.4558, + "step": 4367 + }, + { + "epoch": 2.0652482269503545, + "grad_norm": 2.7661173343658447, + "learning_rate": 3.711266184276272e-06, + "loss": 0.505, + "step": 4368 + }, + { + "epoch": 2.0657210401891253, + "grad_norm": 2.6264543533325195, + "learning_rate": 3.71072043042124e-06, + "loss": 0.4297, + "step": 4369 + }, + { + "epoch": 2.066193853427896, + "grad_norm": 2.773699998855591, + "learning_rate": 3.7101746011799565e-06, + "loss": 0.4267, + "step": 4370 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 2.686955213546753, + "learning_rate": 3.709628696586407e-06, + "loss": 0.4099, + "step": 4371 + }, + { + "epoch": 2.0671394799054372, + "grad_norm": 2.6066620349884033, + "learning_rate": 3.709082716674582e-06, + "loss": 0.4146, + "step": 4372 + }, + { + "epoch": 2.067612293144208, + "grad_norm": 
2.7769250869750977, + "learning_rate": 3.7085366614784784e-06, + "loss": 0.4047, + "step": 4373 + }, + { + "epoch": 2.068085106382979, + "grad_norm": 2.4986939430236816, + "learning_rate": 3.7079905310320957e-06, + "loss": 0.4021, + "step": 4374 + }, + { + "epoch": 2.0685579196217496, + "grad_norm": 2.5456206798553467, + "learning_rate": 3.7074443253694402e-06, + "loss": 0.3569, + "step": 4375 + }, + { + "epoch": 2.06903073286052, + "grad_norm": 2.4079296588897705, + "learning_rate": 3.70689804452452e-06, + "loss": 0.4308, + "step": 4376 + }, + { + "epoch": 2.0695035460992908, + "grad_norm": 2.86014723777771, + "learning_rate": 3.7063516885313513e-06, + "loss": 0.4577, + "step": 4377 + }, + { + "epoch": 2.0699763593380616, + "grad_norm": 2.8025779724121094, + "learning_rate": 3.7058052574239523e-06, + "loss": 0.4615, + "step": 4378 + }, + { + "epoch": 2.0704491725768324, + "grad_norm": 2.902676820755005, + "learning_rate": 3.7052587512363475e-06, + "loss": 0.4765, + "step": 4379 + }, + { + "epoch": 2.0709219858156027, + "grad_norm": 2.814509391784668, + "learning_rate": 3.704712170002566e-06, + "loss": 0.434, + "step": 4380 + }, + { + "epoch": 2.0713947990543735, + "grad_norm": 2.7923502922058105, + "learning_rate": 3.704165513756639e-06, + "loss": 0.4626, + "step": 4381 + }, + { + "epoch": 2.0718676122931443, + "grad_norm": 2.6802031993865967, + "learning_rate": 3.703618782532606e-06, + "loss": 0.4835, + "step": 4382 + }, + { + "epoch": 2.072340425531915, + "grad_norm": 3.0963687896728516, + "learning_rate": 3.7030719763645085e-06, + "loss": 0.4813, + "step": 4383 + }, + { + "epoch": 2.0728132387706855, + "grad_norm": 2.5658695697784424, + "learning_rate": 3.7025250952863956e-06, + "loss": 0.4428, + "step": 4384 + }, + { + "epoch": 2.0732860520094563, + "grad_norm": 2.7738289833068848, + "learning_rate": 3.7019781393323167e-06, + "loss": 0.4376, + "step": 4385 + }, + { + "epoch": 2.073758865248227, + "grad_norm": 2.6446938514709473, + "learning_rate": 3.7014311085363303e-06, + "loss": 0.4208, + "step": 4386 + }, + { + "epoch": 2.0742316784869974, + "grad_norm": 2.7556118965148926, + "learning_rate": 3.7008840029324967e-06, + "loss": 0.3831, + "step": 4387 + }, + { + "epoch": 2.074704491725768, + "grad_norm": 2.573141574859619, + "learning_rate": 3.700336822554882e-06, + "loss": 0.4396, + "step": 4388 + }, + { + "epoch": 2.075177304964539, + "grad_norm": 2.762319803237915, + "learning_rate": 3.6997895674375566e-06, + "loss": 0.4579, + "step": 4389 + }, + { + "epoch": 2.07565011820331, + "grad_norm": 2.729780435562134, + "learning_rate": 3.699242237614596e-06, + "loss": 0.4262, + "step": 4390 + }, + { + "epoch": 2.0761229314420806, + "grad_norm": 2.657480001449585, + "learning_rate": 3.698694833120079e-06, + "loss": 0.4176, + "step": 4391 + }, + { + "epoch": 2.076595744680851, + "grad_norm": 2.8433303833007812, + "learning_rate": 3.6981473539880914e-06, + "loss": 0.457, + "step": 4392 + }, + { + "epoch": 2.0770685579196217, + "grad_norm": 2.819047212600708, + "learning_rate": 3.6975998002527225e-06, + "loss": 0.4244, + "step": 4393 + }, + { + "epoch": 2.0775413711583925, + "grad_norm": 2.6565003395080566, + "learning_rate": 3.697052171948064e-06, + "loss": 0.4384, + "step": 4394 + }, + { + "epoch": 2.078014184397163, + "grad_norm": 2.5795063972473145, + "learning_rate": 3.696504469108216e-06, + "loss": 0.4958, + "step": 4395 + }, + { + "epoch": 2.0784869976359337, + "grad_norm": 2.455730676651001, + "learning_rate": 3.6959566917672822e-06, + "loss": 0.4191, + "step": 4396 + }, + { + 
"epoch": 2.0789598108747045, + "grad_norm": 2.6706607341766357, + "learning_rate": 3.6954088399593684e-06, + "loss": 0.4709, + "step": 4397 + }, + { + "epoch": 2.0794326241134753, + "grad_norm": 2.3758466243743896, + "learning_rate": 3.694860913718589e-06, + "loss": 0.4231, + "step": 4398 + }, + { + "epoch": 2.079905437352246, + "grad_norm": 2.3488340377807617, + "learning_rate": 3.6943129130790583e-06, + "loss": 0.4321, + "step": 4399 + }, + { + "epoch": 2.0803782505910164, + "grad_norm": 2.6438148021698, + "learning_rate": 3.6937648380748996e-06, + "loss": 0.4907, + "step": 4400 + }, + { + "epoch": 2.0808510638297872, + "grad_norm": 2.9826784133911133, + "learning_rate": 3.6932166887402395e-06, + "loss": 0.4404, + "step": 4401 + }, + { + "epoch": 2.081323877068558, + "grad_norm": 2.5203495025634766, + "learning_rate": 3.6926684651092076e-06, + "loss": 0.4337, + "step": 4402 + }, + { + "epoch": 2.0817966903073284, + "grad_norm": 2.7704148292541504, + "learning_rate": 3.692120167215941e-06, + "loss": 0.4195, + "step": 4403 + }, + { + "epoch": 2.082269503546099, + "grad_norm": 2.879430055618286, + "learning_rate": 3.6915717950945782e-06, + "loss": 0.4498, + "step": 4404 + }, + { + "epoch": 2.08274231678487, + "grad_norm": 2.7659497261047363, + "learning_rate": 3.6910233487792655e-06, + "loss": 0.4017, + "step": 4405 + }, + { + "epoch": 2.083215130023641, + "grad_norm": 3.4017205238342285, + "learning_rate": 3.6904748283041503e-06, + "loss": 0.4733, + "step": 4406 + }, + { + "epoch": 2.083687943262411, + "grad_norm": 2.706223249435425, + "learning_rate": 3.6899262337033887e-06, + "loss": 0.4926, + "step": 4407 + }, + { + "epoch": 2.084160756501182, + "grad_norm": 2.644932508468628, + "learning_rate": 3.6893775650111372e-06, + "loss": 0.3904, + "step": 4408 + }, + { + "epoch": 2.0846335697399527, + "grad_norm": 2.666585683822632, + "learning_rate": 3.6888288222615603e-06, + "loss": 0.4698, + "step": 4409 + }, + { + "epoch": 2.0851063829787235, + "grad_norm": 3.0058486461639404, + "learning_rate": 3.688280005488826e-06, + "loss": 0.5291, + "step": 4410 + }, + { + "epoch": 2.085579196217494, + "grad_norm": 2.533088445663452, + "learning_rate": 3.687731114727105e-06, + "loss": 0.393, + "step": 4411 + }, + { + "epoch": 2.0860520094562647, + "grad_norm": 2.921687364578247, + "learning_rate": 3.6871821500105763e-06, + "loss": 0.4719, + "step": 4412 + }, + { + "epoch": 2.0865248226950355, + "grad_norm": 2.291804313659668, + "learning_rate": 3.686633111373421e-06, + "loss": 0.4105, + "step": 4413 + }, + { + "epoch": 2.0869976359338063, + "grad_norm": 2.496333122253418, + "learning_rate": 3.6860839988498255e-06, + "loss": 0.4704, + "step": 4414 + }, + { + "epoch": 2.0874704491725766, + "grad_norm": 2.8059427738189697, + "learning_rate": 3.6855348124739787e-06, + "loss": 0.4961, + "step": 4415 + }, + { + "epoch": 2.0879432624113474, + "grad_norm": 2.683922290802002, + "learning_rate": 3.6849855522800795e-06, + "loss": 0.4838, + "step": 4416 + }, + { + "epoch": 2.088416075650118, + "grad_norm": 2.694148540496826, + "learning_rate": 3.684436218302324e-06, + "loss": 0.4812, + "step": 4417 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 2.724531888961792, + "learning_rate": 3.683886810574919e-06, + "loss": 0.4495, + "step": 4418 + }, + { + "epoch": 2.0893617021276594, + "grad_norm": 2.6176564693450928, + "learning_rate": 3.6833373291320746e-06, + "loss": 0.4698, + "step": 4419 + }, + { + "epoch": 2.08983451536643, + "grad_norm": 2.534116268157959, + "learning_rate": 3.6827877740080032e-06, + "loss": 
0.3912, + "step": 4420 + }, + { + "epoch": 2.090307328605201, + "grad_norm": 2.5747432708740234, + "learning_rate": 3.682238145236924e-06, + "loss": 0.4072, + "step": 4421 + }, + { + "epoch": 2.0907801418439718, + "grad_norm": 2.5947659015655518, + "learning_rate": 3.6816884428530588e-06, + "loss": 0.4638, + "step": 4422 + }, + { + "epoch": 2.091252955082742, + "grad_norm": 2.811992883682251, + "learning_rate": 3.6811386668906353e-06, + "loss": 0.4345, + "step": 4423 + }, + { + "epoch": 2.091725768321513, + "grad_norm": 2.7482287883758545, + "learning_rate": 3.680588817383886e-06, + "loss": 0.4541, + "step": 4424 + }, + { + "epoch": 2.0921985815602837, + "grad_norm": 2.987131357192993, + "learning_rate": 3.6800388943670484e-06, + "loss": 0.4571, + "step": 4425 + }, + { + "epoch": 2.0926713947990545, + "grad_norm": 3.1918671131134033, + "learning_rate": 3.6794888978743637e-06, + "loss": 0.5722, + "step": 4426 + }, + { + "epoch": 2.093144208037825, + "grad_norm": 2.5654571056365967, + "learning_rate": 3.678938827940076e-06, + "loss": 0.4686, + "step": 4427 + }, + { + "epoch": 2.0936170212765957, + "grad_norm": 2.942084789276123, + "learning_rate": 3.6783886845984383e-06, + "loss": 0.4512, + "step": 4428 + }, + { + "epoch": 2.0940898345153665, + "grad_norm": 2.74847674369812, + "learning_rate": 3.677838467883703e-06, + "loss": 0.4506, + "step": 4429 + }, + { + "epoch": 2.0945626477541373, + "grad_norm": 2.7569334506988525, + "learning_rate": 3.6772881778301322e-06, + "loss": 0.502, + "step": 4430 + }, + { + "epoch": 2.0950354609929076, + "grad_norm": 2.969966173171997, + "learning_rate": 3.6767378144719884e-06, + "loss": 0.4772, + "step": 4431 + }, + { + "epoch": 2.0955082742316784, + "grad_norm": 2.773524522781372, + "learning_rate": 3.67618737784354e-06, + "loss": 0.5183, + "step": 4432 + }, + { + "epoch": 2.095981087470449, + "grad_norm": 2.6760106086730957, + "learning_rate": 3.6756368679790617e-06, + "loss": 0.4787, + "step": 4433 + }, + { + "epoch": 2.09645390070922, + "grad_norm": 2.8758978843688965, + "learning_rate": 3.6750862849128304e-06, + "loss": 0.4275, + "step": 4434 + }, + { + "epoch": 2.0969267139479904, + "grad_norm": 2.670509099960327, + "learning_rate": 3.6745356286791288e-06, + "loss": 0.4401, + "step": 4435 + }, + { + "epoch": 2.097399527186761, + "grad_norm": 2.8453969955444336, + "learning_rate": 3.673984899312244e-06, + "loss": 0.4303, + "step": 4436 + }, + { + "epoch": 2.097872340425532, + "grad_norm": 2.6212339401245117, + "learning_rate": 3.673434096846468e-06, + "loss": 0.4675, + "step": 4437 + }, + { + "epoch": 2.0983451536643027, + "grad_norm": 2.8211941719055176, + "learning_rate": 3.672883221316095e-06, + "loss": 0.4678, + "step": 4438 + }, + { + "epoch": 2.098817966903073, + "grad_norm": 2.4838058948516846, + "learning_rate": 3.672332272755427e-06, + "loss": 0.4128, + "step": 4439 + }, + { + "epoch": 2.099290780141844, + "grad_norm": 2.596660852432251, + "learning_rate": 3.671781251198769e-06, + "loss": 0.423, + "step": 4440 + }, + { + "epoch": 2.0997635933806147, + "grad_norm": 2.9979989528656006, + "learning_rate": 3.67123015668043e-06, + "loss": 0.4493, + "step": 4441 + }, + { + "epoch": 2.1002364066193855, + "grad_norm": 2.6232850551605225, + "learning_rate": 3.670678989234725e-06, + "loss": 0.4237, + "step": 4442 + }, + { + "epoch": 2.100709219858156, + "grad_norm": 2.575039863586426, + "learning_rate": 3.670127748895973e-06, + "loss": 0.4464, + "step": 4443 + }, + { + "epoch": 2.1011820330969266, + "grad_norm": 2.3381190299987793, + "learning_rate": 
3.669576435698497e-06, + "loss": 0.4208, + "step": 4444 + }, + { + "epoch": 2.1016548463356974, + "grad_norm": 2.9645180702209473, + "learning_rate": 3.669025049676625e-06, + "loss": 0.5272, + "step": 4445 + }, + { + "epoch": 2.1021276595744682, + "grad_norm": 2.719320297241211, + "learning_rate": 3.668473590864689e-06, + "loss": 0.4485, + "step": 4446 + }, + { + "epoch": 2.1026004728132386, + "grad_norm": 2.8665547370910645, + "learning_rate": 3.6679220592970254e-06, + "loss": 0.4433, + "step": 4447 + }, + { + "epoch": 2.1030732860520094, + "grad_norm": 2.6922879219055176, + "learning_rate": 3.667370455007977e-06, + "loss": 0.502, + "step": 4448 + }, + { + "epoch": 2.10354609929078, + "grad_norm": 3.018228530883789, + "learning_rate": 3.6668187780318894e-06, + "loss": 0.4939, + "step": 4449 + }, + { + "epoch": 2.104018912529551, + "grad_norm": 3.187901735305786, + "learning_rate": 3.666267028403112e-06, + "loss": 0.4151, + "step": 4450 + }, + { + "epoch": 2.1044917257683213, + "grad_norm": 2.9521446228027344, + "learning_rate": 3.6657152061560012e-06, + "loss": 0.4343, + "step": 4451 + }, + { + "epoch": 2.104964539007092, + "grad_norm": 2.5125739574432373, + "learning_rate": 3.6651633113249164e-06, + "loss": 0.4071, + "step": 4452 + }, + { + "epoch": 2.105437352245863, + "grad_norm": 2.9164133071899414, + "learning_rate": 3.664611343944221e-06, + "loss": 0.4173, + "step": 4453 + }, + { + "epoch": 2.1059101654846337, + "grad_norm": 2.680893898010254, + "learning_rate": 3.6640593040482834e-06, + "loss": 0.4917, + "step": 4454 + }, + { + "epoch": 2.106382978723404, + "grad_norm": 2.6823534965515137, + "learning_rate": 3.6635071916714774e-06, + "loss": 0.4668, + "step": 4455 + }, + { + "epoch": 2.106855791962175, + "grad_norm": 2.6221907138824463, + "learning_rate": 3.6629550068481806e-06, + "loss": 0.4956, + "step": 4456 + }, + { + "epoch": 2.1073286052009457, + "grad_norm": 3.096370220184326, + "learning_rate": 3.6624027496127745e-06, + "loss": 0.3995, + "step": 4457 + }, + { + "epoch": 2.1078014184397165, + "grad_norm": 2.752885341644287, + "learning_rate": 3.661850419999647e-06, + "loss": 0.4838, + "step": 4458 + }, + { + "epoch": 2.108274231678487, + "grad_norm": 2.6806766986846924, + "learning_rate": 3.661298018043188e-06, + "loss": 0.4817, + "step": 4459 + }, + { + "epoch": 2.1087470449172576, + "grad_norm": 2.6317873001098633, + "learning_rate": 3.660745543777794e-06, + "loss": 0.4777, + "step": 4460 + }, + { + "epoch": 2.1092198581560284, + "grad_norm": 2.4939377307891846, + "learning_rate": 3.6601929972378634e-06, + "loss": 0.4525, + "step": 4461 + }, + { + "epoch": 2.109692671394799, + "grad_norm": 2.4902873039245605, + "learning_rate": 3.659640378457803e-06, + "loss": 0.4392, + "step": 4462 + }, + { + "epoch": 2.1101654846335696, + "grad_norm": 2.5082345008850098, + "learning_rate": 3.6590876874720216e-06, + "loss": 0.4224, + "step": 4463 + }, + { + "epoch": 2.1106382978723404, + "grad_norm": 2.658407211303711, + "learning_rate": 3.6585349243149313e-06, + "loss": 0.4316, + "step": 4464 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 2.562883138656616, + "learning_rate": 3.6579820890209515e-06, + "loss": 0.4491, + "step": 4465 + }, + { + "epoch": 2.111583924349882, + "grad_norm": 2.5719261169433594, + "learning_rate": 3.657429181624505e-06, + "loss": 0.4406, + "step": 4466 + }, + { + "epoch": 2.1120567375886523, + "grad_norm": 2.8840596675872803, + "learning_rate": 3.6568762021600184e-06, + "loss": 0.4267, + "step": 4467 + }, + { + "epoch": 2.112529550827423, + "grad_norm": 
2.660304546356201, + "learning_rate": 3.656323150661924e-06, + "loss": 0.4502, + "step": 4468 + }, + { + "epoch": 2.113002364066194, + "grad_norm": 2.610996961593628, + "learning_rate": 3.655770027164657e-06, + "loss": 0.3934, + "step": 4469 + }, + { + "epoch": 2.1134751773049647, + "grad_norm": 2.6000053882598877, + "learning_rate": 3.655216831702658e-06, + "loss": 0.4582, + "step": 4470 + }, + { + "epoch": 2.113947990543735, + "grad_norm": 2.73124098777771, + "learning_rate": 3.654663564310372e-06, + "loss": 0.4748, + "step": 4471 + }, + { + "epoch": 2.114420803782506, + "grad_norm": 2.711091995239258, + "learning_rate": 3.6541102250222495e-06, + "loss": 0.4145, + "step": 4472 + }, + { + "epoch": 2.1148936170212767, + "grad_norm": 2.655996561050415, + "learning_rate": 3.6535568138727438e-06, + "loss": 0.4407, + "step": 4473 + }, + { + "epoch": 2.1153664302600474, + "grad_norm": 2.7630865573883057, + "learning_rate": 3.653003330896313e-06, + "loss": 0.4298, + "step": 4474 + }, + { + "epoch": 2.115839243498818, + "grad_norm": 2.554415464401245, + "learning_rate": 3.6524497761274214e-06, + "loss": 0.44, + "step": 4475 + }, + { + "epoch": 2.1163120567375886, + "grad_norm": 2.790328025817871, + "learning_rate": 3.651896149600535e-06, + "loss": 0.5061, + "step": 4476 + }, + { + "epoch": 2.1167848699763594, + "grad_norm": 2.755267381668091, + "learning_rate": 3.651342451350127e-06, + "loss": 0.4588, + "step": 4477 + }, + { + "epoch": 2.11725768321513, + "grad_norm": 2.8936638832092285, + "learning_rate": 3.6507886814106722e-06, + "loss": 0.468, + "step": 4478 + }, + { + "epoch": 2.1177304964539005, + "grad_norm": 2.7394332885742188, + "learning_rate": 3.6502348398166525e-06, + "loss": 0.383, + "step": 4479 + }, + { + "epoch": 2.1182033096926713, + "grad_norm": 2.3359546661376953, + "learning_rate": 3.649680926602553e-06, + "loss": 0.3903, + "step": 4480 + }, + { + "epoch": 2.118676122931442, + "grad_norm": 3.102202892303467, + "learning_rate": 3.6491269418028637e-06, + "loss": 0.4525, + "step": 4481 + }, + { + "epoch": 2.119148936170213, + "grad_norm": 2.467970848083496, + "learning_rate": 3.648572885452078e-06, + "loss": 0.414, + "step": 4482 + }, + { + "epoch": 2.1196217494089833, + "grad_norm": 2.8984131813049316, + "learning_rate": 3.6480187575846952e-06, + "loss": 0.4571, + "step": 4483 + }, + { + "epoch": 2.120094562647754, + "grad_norm": 2.674834966659546, + "learning_rate": 3.6474645582352187e-06, + "loss": 0.455, + "step": 4484 + }, + { + "epoch": 2.120567375886525, + "grad_norm": 2.8713369369506836, + "learning_rate": 3.6469102874381552e-06, + "loss": 0.4567, + "step": 4485 + }, + { + "epoch": 2.1210401891252957, + "grad_norm": 3.174814462661743, + "learning_rate": 3.646355945228017e-06, + "loss": 0.5295, + "step": 4486 + }, + { + "epoch": 2.121513002364066, + "grad_norm": 2.6409823894500732, + "learning_rate": 3.6458015316393215e-06, + "loss": 0.4308, + "step": 4487 + }, + { + "epoch": 2.121985815602837, + "grad_norm": 2.4228954315185547, + "learning_rate": 3.645247046706588e-06, + "loss": 0.4042, + "step": 4488 + }, + { + "epoch": 2.1224586288416076, + "grad_norm": 2.553551435470581, + "learning_rate": 3.6446924904643427e-06, + "loss": 0.3925, + "step": 4489 + }, + { + "epoch": 2.1229314420803784, + "grad_norm": 2.8019237518310547, + "learning_rate": 3.6441378629471157e-06, + "loss": 0.4079, + "step": 4490 + }, + { + "epoch": 2.123404255319149, + "grad_norm": 2.993251085281372, + "learning_rate": 3.643583164189441e-06, + "loss": 0.4558, + "step": 4491 + }, + { + "epoch": 
2.1238770685579196, + "grad_norm": 2.4531471729278564, + "learning_rate": 3.643028394225857e-06, + "loss": 0.4167, + "step": 4492 + }, + { + "epoch": 2.1243498817966904, + "grad_norm": 2.6827852725982666, + "learning_rate": 3.6424735530909065e-06, + "loss": 0.4311, + "step": 4493 + }, + { + "epoch": 2.124822695035461, + "grad_norm": 3.1232128143310547, + "learning_rate": 3.6419186408191377e-06, + "loss": 0.4537, + "step": 4494 + }, + { + "epoch": 2.1252955082742315, + "grad_norm": 2.816348075866699, + "learning_rate": 3.641363657445103e-06, + "loss": 0.4869, + "step": 4495 + }, + { + "epoch": 2.1257683215130023, + "grad_norm": 2.6269683837890625, + "learning_rate": 3.6408086030033575e-06, + "loss": 0.4066, + "step": 4496 + }, + { + "epoch": 2.126241134751773, + "grad_norm": 4.6375956535339355, + "learning_rate": 3.640253477528462e-06, + "loss": 0.4488, + "step": 4497 + }, + { + "epoch": 2.126713947990544, + "grad_norm": 3.020970582962036, + "learning_rate": 3.639698281054983e-06, + "loss": 0.4197, + "step": 4498 + }, + { + "epoch": 2.1271867612293143, + "grad_norm": 2.87904691696167, + "learning_rate": 3.6391430136174892e-06, + "loss": 0.4743, + "step": 4499 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 2.719892978668213, + "learning_rate": 3.6385876752505554e-06, + "loss": 0.388, + "step": 4500 + }, + { + "epoch": 2.128132387706856, + "grad_norm": 2.7321808338165283, + "learning_rate": 3.638032265988759e-06, + "loss": 0.4857, + "step": 4501 + }, + { + "epoch": 2.1286052009456267, + "grad_norm": 2.700814723968506, + "learning_rate": 3.6374767858666836e-06, + "loss": 0.4819, + "step": 4502 + }, + { + "epoch": 2.129078014184397, + "grad_norm": 2.658423662185669, + "learning_rate": 3.6369212349189164e-06, + "loss": 0.4113, + "step": 4503 + }, + { + "epoch": 2.129550827423168, + "grad_norm": 2.673877716064453, + "learning_rate": 3.63636561318005e-06, + "loss": 0.3745, + "step": 4504 + }, + { + "epoch": 2.1300236406619386, + "grad_norm": 2.607758045196533, + "learning_rate": 3.6358099206846787e-06, + "loss": 0.4409, + "step": 4505 + }, + { + "epoch": 2.1304964539007094, + "grad_norm": 2.8117682933807373, + "learning_rate": 3.6352541574674044e-06, + "loss": 0.426, + "step": 4506 + }, + { + "epoch": 2.1309692671394798, + "grad_norm": 2.6970250606536865, + "learning_rate": 3.634698323562832e-06, + "loss": 0.4295, + "step": 4507 + }, + { + "epoch": 2.1314420803782506, + "grad_norm": 2.7133560180664062, + "learning_rate": 3.6341424190055696e-06, + "loss": 0.4443, + "step": 4508 + }, + { + "epoch": 2.1319148936170214, + "grad_norm": 2.57181715965271, + "learning_rate": 3.6335864438302328e-06, + "loss": 0.3995, + "step": 4509 + }, + { + "epoch": 2.132387706855792, + "grad_norm": 2.8618004322052, + "learning_rate": 3.633030398071438e-06, + "loss": 0.5075, + "step": 4510 + }, + { + "epoch": 2.1328605200945625, + "grad_norm": 2.7586729526519775, + "learning_rate": 3.6324742817638087e-06, + "loss": 0.4322, + "step": 4511 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 2.913256883621216, + "learning_rate": 3.631918094941972e-06, + "loss": 0.4708, + "step": 4512 + }, + { + "epoch": 2.133806146572104, + "grad_norm": 2.7715728282928467, + "learning_rate": 3.6313618376405585e-06, + "loss": 0.5194, + "step": 4513 + }, + { + "epoch": 2.134278959810875, + "grad_norm": 2.7986366748809814, + "learning_rate": 3.6308055098942042e-06, + "loss": 0.4419, + "step": 4514 + }, + { + "epoch": 2.1347517730496453, + "grad_norm": 3.043549060821533, + "learning_rate": 3.6302491117375492e-06, + "loss": 0.4441, + 
"step": 4515 + }, + { + "epoch": 2.135224586288416, + "grad_norm": 2.771761417388916, + "learning_rate": 3.629692643205238e-06, + "loss": 0.4752, + "step": 4516 + }, + { + "epoch": 2.135697399527187, + "grad_norm": 2.804941415786743, + "learning_rate": 3.6291361043319202e-06, + "loss": 0.4089, + "step": 4517 + }, + { + "epoch": 2.1361702127659576, + "grad_norm": 2.9897940158843994, + "learning_rate": 3.628579495152248e-06, + "loss": 0.4829, + "step": 4518 + }, + { + "epoch": 2.136643026004728, + "grad_norm": 2.9273486137390137, + "learning_rate": 3.6280228157008784e-06, + "loss": 0.4469, + "step": 4519 + }, + { + "epoch": 2.137115839243499, + "grad_norm": 2.584373950958252, + "learning_rate": 3.627466066012475e-06, + "loss": 0.4277, + "step": 4520 + }, + { + "epoch": 2.1375886524822696, + "grad_norm": 3.009333848953247, + "learning_rate": 3.626909246121703e-06, + "loss": 0.4025, + "step": 4521 + }, + { + "epoch": 2.1380614657210404, + "grad_norm": 2.634615659713745, + "learning_rate": 3.626352356063234e-06, + "loss": 0.4046, + "step": 4522 + }, + { + "epoch": 2.1385342789598107, + "grad_norm": 2.87310528755188, + "learning_rate": 3.625795395871743e-06, + "loss": 0.4426, + "step": 4523 + }, + { + "epoch": 2.1390070921985815, + "grad_norm": 2.94985032081604, + "learning_rate": 3.625238365581909e-06, + "loss": 0.445, + "step": 4524 + }, + { + "epoch": 2.1394799054373523, + "grad_norm": 2.470189332962036, + "learning_rate": 3.624681265228416e-06, + "loss": 0.4082, + "step": 4525 + }, + { + "epoch": 2.139952718676123, + "grad_norm": 2.5304040908813477, + "learning_rate": 3.624124094845952e-06, + "loss": 0.403, + "step": 4526 + }, + { + "epoch": 2.1404255319148935, + "grad_norm": 2.6148900985717773, + "learning_rate": 3.62356685446921e-06, + "loss": 0.3867, + "step": 4527 + }, + { + "epoch": 2.1408983451536643, + "grad_norm": 2.885549783706665, + "learning_rate": 3.623009544132886e-06, + "loss": 0.4706, + "step": 4528 + }, + { + "epoch": 2.141371158392435, + "grad_norm": 3.00490665435791, + "learning_rate": 3.6224521638716827e-06, + "loss": 0.4733, + "step": 4529 + }, + { + "epoch": 2.141843971631206, + "grad_norm": 2.925879716873169, + "learning_rate": 3.6218947137203043e-06, + "loss": 0.4581, + "step": 4530 + }, + { + "epoch": 2.1423167848699762, + "grad_norm": 3.10861873626709, + "learning_rate": 3.621337193713462e-06, + "loss": 0.4579, + "step": 4531 + }, + { + "epoch": 2.142789598108747, + "grad_norm": 2.7386577129364014, + "learning_rate": 3.6207796038858693e-06, + "loss": 0.4248, + "step": 4532 + }, + { + "epoch": 2.143262411347518, + "grad_norm": 2.601836681365967, + "learning_rate": 3.6202219442722453e-06, + "loss": 0.4928, + "step": 4533 + }, + { + "epoch": 2.1437352245862886, + "grad_norm": 2.598778247833252, + "learning_rate": 3.6196642149073123e-06, + "loss": 0.4415, + "step": 4534 + }, + { + "epoch": 2.144208037825059, + "grad_norm": 2.443995714187622, + "learning_rate": 3.619106415825798e-06, + "loss": 0.3917, + "step": 4535 + }, + { + "epoch": 2.1446808510638298, + "grad_norm": 2.84643816947937, + "learning_rate": 3.6185485470624354e-06, + "loss": 0.4162, + "step": 4536 + }, + { + "epoch": 2.1451536643026006, + "grad_norm": 2.4568188190460205, + "learning_rate": 3.617990608651959e-06, + "loss": 0.4298, + "step": 4537 + }, + { + "epoch": 2.145626477541371, + "grad_norm": 2.968804359436035, + "learning_rate": 3.61743260062911e-06, + "loss": 0.4696, + "step": 4538 + }, + { + "epoch": 2.1460992907801417, + "grad_norm": 2.629075288772583, + "learning_rate": 3.6168745230286327e-06, + 
"loss": 0.4234, + "step": 4539 + }, + { + "epoch": 2.1465721040189125, + "grad_norm": 2.7680578231811523, + "learning_rate": 3.6163163758852754e-06, + "loss": 0.4669, + "step": 4540 + }, + { + "epoch": 2.1470449172576833, + "grad_norm": 2.782825469970703, + "learning_rate": 3.615758159233793e-06, + "loss": 0.4552, + "step": 4541 + }, + { + "epoch": 2.147517730496454, + "grad_norm": 2.653047561645508, + "learning_rate": 3.615199873108942e-06, + "loss": 0.4393, + "step": 4542 + }, + { + "epoch": 2.1479905437352245, + "grad_norm": 2.4175806045532227, + "learning_rate": 3.6146415175454852e-06, + "loss": 0.4114, + "step": 4543 + }, + { + "epoch": 2.1484633569739953, + "grad_norm": 2.627943515777588, + "learning_rate": 3.614083092578189e-06, + "loss": 0.4215, + "step": 4544 + }, + { + "epoch": 2.148936170212766, + "grad_norm": 2.8934123516082764, + "learning_rate": 3.6135245982418227e-06, + "loss": 0.4815, + "step": 4545 + }, + { + "epoch": 2.1494089834515364, + "grad_norm": 2.8535244464874268, + "learning_rate": 3.612966034571164e-06, + "loss": 0.4683, + "step": 4546 + }, + { + "epoch": 2.149881796690307, + "grad_norm": 2.7826647758483887, + "learning_rate": 3.6124074016009893e-06, + "loss": 0.4351, + "step": 4547 + }, + { + "epoch": 2.150354609929078, + "grad_norm": 2.6906018257141113, + "learning_rate": 3.6118486993660834e-06, + "loss": 0.4585, + "step": 4548 + }, + { + "epoch": 2.150827423167849, + "grad_norm": 2.726766586303711, + "learning_rate": 3.6112899279012346e-06, + "loss": 0.4753, + "step": 4549 + }, + { + "epoch": 2.1513002364066196, + "grad_norm": 3.0193991661071777, + "learning_rate": 3.6107310872412348e-06, + "loss": 0.4827, + "step": 4550 + }, + { + "epoch": 2.15177304964539, + "grad_norm": 2.6788697242736816, + "learning_rate": 3.610172177420881e-06, + "loss": 0.4333, + "step": 4551 + }, + { + "epoch": 2.1522458628841608, + "grad_norm": 2.865410327911377, + "learning_rate": 3.609613198474973e-06, + "loss": 0.4569, + "step": 4552 + }, + { + "epoch": 2.1527186761229316, + "grad_norm": 2.9199366569519043, + "learning_rate": 3.609054150438317e-06, + "loss": 0.5097, + "step": 4553 + }, + { + "epoch": 2.153191489361702, + "grad_norm": 2.761035203933716, + "learning_rate": 3.6084950333457215e-06, + "loss": 0.5002, + "step": 4554 + }, + { + "epoch": 2.1536643026004727, + "grad_norm": 2.514223337173462, + "learning_rate": 3.607935847232002e-06, + "loss": 0.4171, + "step": 4555 + }, + { + "epoch": 2.1541371158392435, + "grad_norm": 2.5167524814605713, + "learning_rate": 3.6073765921319747e-06, + "loss": 0.4494, + "step": 4556 + }, + { + "epoch": 2.1546099290780143, + "grad_norm": 2.7540643215179443, + "learning_rate": 3.606817268080463e-06, + "loss": 0.4472, + "step": 4557 + }, + { + "epoch": 2.155082742316785, + "grad_norm": 2.7728664875030518, + "learning_rate": 3.6062578751122936e-06, + "loss": 0.4669, + "step": 4558 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 2.7788400650024414, + "learning_rate": 3.605698413262296e-06, + "loss": 0.4613, + "step": 4559 + }, + { + "epoch": 2.1560283687943262, + "grad_norm": 2.7811810970306396, + "learning_rate": 3.605138882565308e-06, + "loss": 0.4242, + "step": 4560 + }, + { + "epoch": 2.156501182033097, + "grad_norm": 2.7819995880126953, + "learning_rate": 3.6045792830561664e-06, + "loss": 0.443, + "step": 4561 + }, + { + "epoch": 2.1569739952718674, + "grad_norm": 2.671259641647339, + "learning_rate": 3.6040196147697166e-06, + "loss": 0.4336, + "step": 4562 + }, + { + "epoch": 2.157446808510638, + "grad_norm": 2.9296300411224365, + 
"learning_rate": 3.603459877740807e-06, + "loss": 0.479, + "step": 4563 + }, + { + "epoch": 2.157919621749409, + "grad_norm": 2.834937334060669, + "learning_rate": 3.602900072004289e-06, + "loss": 0.4603, + "step": 4564 + }, + { + "epoch": 2.15839243498818, + "grad_norm": 2.8434760570526123, + "learning_rate": 3.602340197595019e-06, + "loss": 0.4288, + "step": 4565 + }, + { + "epoch": 2.1588652482269506, + "grad_norm": 2.7245426177978516, + "learning_rate": 3.6017802545478593e-06, + "loss": 0.4194, + "step": 4566 + }, + { + "epoch": 2.159338061465721, + "grad_norm": 2.7795023918151855, + "learning_rate": 3.6012202428976735e-06, + "loss": 0.4481, + "step": 4567 + }, + { + "epoch": 2.1598108747044917, + "grad_norm": 2.9482083320617676, + "learning_rate": 3.6006601626793325e-06, + "loss": 0.468, + "step": 4568 + }, + { + "epoch": 2.1602836879432625, + "grad_norm": 2.9563326835632324, + "learning_rate": 3.6001000139277094e-06, + "loss": 0.4427, + "step": 4569 + }, + { + "epoch": 2.160756501182033, + "grad_norm": 2.7755916118621826, + "learning_rate": 3.599539796677682e-06, + "loss": 0.4258, + "step": 4570 + }, + { + "epoch": 2.1612293144208037, + "grad_norm": 2.961045265197754, + "learning_rate": 3.5989795109641333e-06, + "loss": 0.4645, + "step": 4571 + }, + { + "epoch": 2.1617021276595745, + "grad_norm": 3.0184407234191895, + "learning_rate": 3.5984191568219482e-06, + "loss": 0.4192, + "step": 4572 + }, + { + "epoch": 2.1621749408983453, + "grad_norm": 2.9811131954193115, + "learning_rate": 3.5978587342860192e-06, + "loss": 0.408, + "step": 4573 + }, + { + "epoch": 2.162647754137116, + "grad_norm": 2.9172329902648926, + "learning_rate": 3.597298243391242e-06, + "loss": 0.4528, + "step": 4574 + }, + { + "epoch": 2.1631205673758864, + "grad_norm": 2.7798452377319336, + "learning_rate": 3.596737684172513e-06, + "loss": 0.391, + "step": 4575 + }, + { + "epoch": 2.1635933806146572, + "grad_norm": 2.526277542114258, + "learning_rate": 3.596177056664738e-06, + "loss": 0.3699, + "step": 4576 + }, + { + "epoch": 2.164066193853428, + "grad_norm": 2.856269121170044, + "learning_rate": 3.5956163609028244e-06, + "loss": 0.4082, + "step": 4577 + }, + { + "epoch": 2.1645390070921984, + "grad_norm": 2.7681572437286377, + "learning_rate": 3.5950555969216845e-06, + "loss": 0.4064, + "step": 4578 + }, + { + "epoch": 2.165011820330969, + "grad_norm": 2.2924954891204834, + "learning_rate": 3.5944947647562333e-06, + "loss": 0.416, + "step": 4579 + }, + { + "epoch": 2.16548463356974, + "grad_norm": 2.439929485321045, + "learning_rate": 3.5939338644413936e-06, + "loss": 0.4476, + "step": 4580 + }, + { + "epoch": 2.1659574468085108, + "grad_norm": 2.786442518234253, + "learning_rate": 3.5933728960120877e-06, + "loss": 0.4525, + "step": 4581 + }, + { + "epoch": 2.166430260047281, + "grad_norm": 2.5910253524780273, + "learning_rate": 3.5928118595032465e-06, + "loss": 0.4441, + "step": 4582 + }, + { + "epoch": 2.166903073286052, + "grad_norm": 2.8144876956939697, + "learning_rate": 3.5922507549498024e-06, + "loss": 0.497, + "step": 4583 + }, + { + "epoch": 2.1673758865248227, + "grad_norm": 2.5714170932769775, + "learning_rate": 3.591689582386694e-06, + "loss": 0.4625, + "step": 4584 + }, + { + "epoch": 2.1678486997635935, + "grad_norm": 2.878187894821167, + "learning_rate": 3.591128341848861e-06, + "loss": 0.4835, + "step": 4585 + }, + { + "epoch": 2.168321513002364, + "grad_norm": 2.4946508407592773, + "learning_rate": 3.5905670333712504e-06, + "loss": 0.4278, + "step": 4586 + }, + { + "epoch": 2.1687943262411347, + 
"grad_norm": 2.9186196327209473, + "learning_rate": 3.590005656988814e-06, + "loss": 0.465, + "step": 4587 + }, + { + "epoch": 2.1692671394799055, + "grad_norm": 3.136807441711426, + "learning_rate": 3.5894442127365046e-06, + "loss": 0.4146, + "step": 4588 + }, + { + "epoch": 2.1697399527186763, + "grad_norm": 2.8106343746185303, + "learning_rate": 3.5888827006492804e-06, + "loss": 0.4737, + "step": 4589 + }, + { + "epoch": 2.1702127659574466, + "grad_norm": 2.874553680419922, + "learning_rate": 3.5883211207621047e-06, + "loss": 0.3962, + "step": 4590 + }, + { + "epoch": 2.1706855791962174, + "grad_norm": 2.7914116382598877, + "learning_rate": 3.587759473109946e-06, + "loss": 0.4705, + "step": 4591 + }, + { + "epoch": 2.171158392434988, + "grad_norm": 2.7273290157318115, + "learning_rate": 3.5871977577277745e-06, + "loss": 0.4827, + "step": 4592 + }, + { + "epoch": 2.171631205673759, + "grad_norm": 2.4167256355285645, + "learning_rate": 3.5866359746505653e-06, + "loss": 0.4181, + "step": 4593 + }, + { + "epoch": 2.1721040189125294, + "grad_norm": 2.8929779529571533, + "learning_rate": 3.586074123913299e-06, + "loss": 0.4006, + "step": 4594 + }, + { + "epoch": 2.1725768321513, + "grad_norm": 2.6996190547943115, + "learning_rate": 3.5855122055509593e-06, + "loss": 0.4792, + "step": 4595 + }, + { + "epoch": 2.173049645390071, + "grad_norm": 2.9341464042663574, + "learning_rate": 3.584950219598534e-06, + "loss": 0.3903, + "step": 4596 + }, + { + "epoch": 2.1735224586288417, + "grad_norm": 2.799330234527588, + "learning_rate": 3.5843881660910166e-06, + "loss": 0.4717, + "step": 4597 + }, + { + "epoch": 2.173995271867612, + "grad_norm": 2.5028693675994873, + "learning_rate": 3.5838260450634028e-06, + "loss": 0.4462, + "step": 4598 + }, + { + "epoch": 2.174468085106383, + "grad_norm": 2.5845541954040527, + "learning_rate": 3.583263856550693e-06, + "loss": 0.4327, + "step": 4599 + }, + { + "epoch": 2.1749408983451537, + "grad_norm": 2.4804906845092773, + "learning_rate": 3.5827016005878933e-06, + "loss": 0.4555, + "step": 4600 + }, + { + "epoch": 2.1754137115839245, + "grad_norm": 2.625746011734009, + "learning_rate": 3.5821392772100125e-06, + "loss": 0.455, + "step": 4601 + }, + { + "epoch": 2.175886524822695, + "grad_norm": 2.6230757236480713, + "learning_rate": 3.581576886452064e-06, + "loss": 0.4422, + "step": 4602 + }, + { + "epoch": 2.1763593380614656, + "grad_norm": 3.3104100227355957, + "learning_rate": 3.5810144283490656e-06, + "loss": 0.4212, + "step": 4603 + }, + { + "epoch": 2.1768321513002364, + "grad_norm": 2.6799755096435547, + "learning_rate": 3.5804519029360384e-06, + "loss": 0.4575, + "step": 4604 + }, + { + "epoch": 2.1773049645390072, + "grad_norm": 2.462216854095459, + "learning_rate": 3.5798893102480085e-06, + "loss": 0.4096, + "step": 4605 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 2.8600878715515137, + "learning_rate": 3.5793266503200074e-06, + "loss": 0.4798, + "step": 4606 + }, + { + "epoch": 2.1782505910165484, + "grad_norm": 2.935746431350708, + "learning_rate": 3.5787639231870673e-06, + "loss": 0.4021, + "step": 4607 + }, + { + "epoch": 2.178723404255319, + "grad_norm": 2.8655526638031006, + "learning_rate": 3.578201128884229e-06, + "loss": 0.4553, + "step": 4608 + }, + { + "epoch": 2.17919621749409, + "grad_norm": 3.219498634338379, + "learning_rate": 3.577638267446533e-06, + "loss": 0.4692, + "step": 4609 + }, + { + "epoch": 2.1796690307328603, + "grad_norm": 3.0449860095977783, + "learning_rate": 3.5770753389090283e-06, + "loss": 0.4675, + "step": 4610 + 
}, + { + "epoch": 2.180141843971631, + "grad_norm": 2.7045507431030273, + "learning_rate": 3.576512343306765e-06, + "loss": 0.4773, + "step": 4611 + }, + { + "epoch": 2.180614657210402, + "grad_norm": 2.601499557495117, + "learning_rate": 3.5759492806747985e-06, + "loss": 0.4112, + "step": 4612 + }, + { + "epoch": 2.1810874704491727, + "grad_norm": 2.987741470336914, + "learning_rate": 3.575386151048188e-06, + "loss": 0.4651, + "step": 4613 + }, + { + "epoch": 2.181560283687943, + "grad_norm": 2.961228847503662, + "learning_rate": 3.5748229544619973e-06, + "loss": 0.5116, + "step": 4614 + }, + { + "epoch": 2.182033096926714, + "grad_norm": 2.8008430004119873, + "learning_rate": 3.574259690951295e-06, + "loss": 0.4152, + "step": 4615 + }, + { + "epoch": 2.1825059101654847, + "grad_norm": 2.5429348945617676, + "learning_rate": 3.573696360551151e-06, + "loss": 0.4188, + "step": 4616 + }, + { + "epoch": 2.1829787234042555, + "grad_norm": 2.9566478729248047, + "learning_rate": 3.5731329632966428e-06, + "loss": 0.5156, + "step": 4617 + }, + { + "epoch": 2.183451536643026, + "grad_norm": 2.5302467346191406, + "learning_rate": 3.572569499222851e-06, + "loss": 0.4361, + "step": 4618 + }, + { + "epoch": 2.1839243498817966, + "grad_norm": 3.206803560256958, + "learning_rate": 3.5720059683648593e-06, + "loss": 0.5149, + "step": 4619 + }, + { + "epoch": 2.1843971631205674, + "grad_norm": 2.9432034492492676, + "learning_rate": 3.5714423707577573e-06, + "loss": 0.4411, + "step": 4620 + }, + { + "epoch": 2.184869976359338, + "grad_norm": 2.9412078857421875, + "learning_rate": 3.5708787064366358e-06, + "loss": 0.4372, + "step": 4621 + }, + { + "epoch": 2.1853427895981086, + "grad_norm": 3.1702330112457275, + "learning_rate": 3.5703149754365935e-06, + "loss": 0.4761, + "step": 4622 + }, + { + "epoch": 2.1858156028368794, + "grad_norm": 3.1240456104278564, + "learning_rate": 3.569751177792731e-06, + "loss": 0.4854, + "step": 4623 + }, + { + "epoch": 2.18628841607565, + "grad_norm": 2.7221994400024414, + "learning_rate": 3.5691873135401534e-06, + "loss": 0.4048, + "step": 4624 + }, + { + "epoch": 2.186761229314421, + "grad_norm": 2.74397873878479, + "learning_rate": 3.5686233827139695e-06, + "loss": 0.4747, + "step": 4625 + }, + { + "epoch": 2.1872340425531913, + "grad_norm": 2.7379889488220215, + "learning_rate": 3.5680593853492932e-06, + "loss": 0.4963, + "step": 4626 + }, + { + "epoch": 2.187706855791962, + "grad_norm": 3.040205478668213, + "learning_rate": 3.5674953214812435e-06, + "loss": 0.4917, + "step": 4627 + }, + { + "epoch": 2.188179669030733, + "grad_norm": 2.95302677154541, + "learning_rate": 3.56693119114494e-06, + "loss": 0.4758, + "step": 4628 + }, + { + "epoch": 2.1886524822695037, + "grad_norm": 2.5488312244415283, + "learning_rate": 3.56636699437551e-06, + "loss": 0.4057, + "step": 4629 + }, + { + "epoch": 2.189125295508274, + "grad_norm": 2.8379666805267334, + "learning_rate": 3.565802731208083e-06, + "loss": 0.4755, + "step": 4630 + }, + { + "epoch": 2.189598108747045, + "grad_norm": 2.8765869140625, + "learning_rate": 3.565238401677793e-06, + "loss": 0.4232, + "step": 4631 + }, + { + "epoch": 2.1900709219858157, + "grad_norm": 2.9091262817382812, + "learning_rate": 3.5646740058197784e-06, + "loss": 0.3874, + "step": 4632 + }, + { + "epoch": 2.1905437352245865, + "grad_norm": 2.7067387104034424, + "learning_rate": 3.5641095436691826e-06, + "loss": 0.4771, + "step": 4633 + }, + { + "epoch": 2.191016548463357, + "grad_norm": 2.403043508529663, + "learning_rate": 3.563545015261151e-06, + 
"loss": 0.4062, + "step": 4634 + }, + { + "epoch": 2.1914893617021276, + "grad_norm": 2.8059732913970947, + "learning_rate": 3.562980420630836e-06, + "loss": 0.4635, + "step": 4635 + }, + { + "epoch": 2.1919621749408984, + "grad_norm": 2.5467724800109863, + "learning_rate": 3.56241575981339e-06, + "loss": 0.4552, + "step": 4636 + }, + { + "epoch": 2.192434988179669, + "grad_norm": 2.651024103164673, + "learning_rate": 3.561851032843973e-06, + "loss": 0.38, + "step": 4637 + }, + { + "epoch": 2.1929078014184396, + "grad_norm": 2.5529849529266357, + "learning_rate": 3.5612862397577496e-06, + "loss": 0.4106, + "step": 4638 + }, + { + "epoch": 2.1933806146572103, + "grad_norm": 3.069258451461792, + "learning_rate": 3.5607213805898844e-06, + "loss": 0.461, + "step": 4639 + }, + { + "epoch": 2.193853427895981, + "grad_norm": 2.5652637481689453, + "learning_rate": 3.56015645537555e-06, + "loss": 0.4497, + "step": 4640 + }, + { + "epoch": 2.194326241134752, + "grad_norm": 2.699101209640503, + "learning_rate": 3.5595914641499224e-06, + "loss": 0.4887, + "step": 4641 + }, + { + "epoch": 2.1947990543735223, + "grad_norm": 2.9292235374450684, + "learning_rate": 3.5590264069481805e-06, + "loss": 0.4462, + "step": 4642 + }, + { + "epoch": 2.195271867612293, + "grad_norm": 2.6151106357574463, + "learning_rate": 3.5584612838055077e-06, + "loss": 0.4334, + "step": 4643 + }, + { + "epoch": 2.195744680851064, + "grad_norm": 2.895798444747925, + "learning_rate": 3.5578960947570923e-06, + "loss": 0.4448, + "step": 4644 + }, + { + "epoch": 2.1962174940898347, + "grad_norm": 2.627631425857544, + "learning_rate": 3.557330839838125e-06, + "loss": 0.436, + "step": 4645 + }, + { + "epoch": 2.196690307328605, + "grad_norm": 2.8803584575653076, + "learning_rate": 3.556765519083803e-06, + "loss": 0.4698, + "step": 4646 + }, + { + "epoch": 2.197163120567376, + "grad_norm": 2.436609983444214, + "learning_rate": 3.5562001325293265e-06, + "loss": 0.4043, + "step": 4647 + }, + { + "epoch": 2.1976359338061466, + "grad_norm": 2.5090718269348145, + "learning_rate": 3.5556346802098985e-06, + "loss": 0.4505, + "step": 4648 + }, + { + "epoch": 2.1981087470449174, + "grad_norm": 2.792783737182617, + "learning_rate": 3.5550691621607277e-06, + "loss": 0.43, + "step": 4649 + }, + { + "epoch": 2.198581560283688, + "grad_norm": 2.74153470993042, + "learning_rate": 3.554503578417026e-06, + "loss": 0.4496, + "step": 4650 + }, + { + "epoch": 2.1990543735224586, + "grad_norm": 3.0262627601623535, + "learning_rate": 3.5539379290140114e-06, + "loss": 0.4503, + "step": 4651 + }, + { + "epoch": 2.1995271867612294, + "grad_norm": 2.783811330795288, + "learning_rate": 3.553372213986903e-06, + "loss": 0.432, + "step": 4652 + }, + { + "epoch": 2.2, + "grad_norm": 3.091191053390503, + "learning_rate": 3.5528064333709255e-06, + "loss": 0.4658, + "step": 4653 + }, + { + "epoch": 2.2004728132387705, + "grad_norm": 2.814634084701538, + "learning_rate": 3.5522405872013076e-06, + "loss": 0.4473, + "step": 4654 + }, + { + "epoch": 2.2009456264775413, + "grad_norm": 2.6918299198150635, + "learning_rate": 3.5516746755132824e-06, + "loss": 0.5323, + "step": 4655 + }, + { + "epoch": 2.201418439716312, + "grad_norm": 2.9902455806732178, + "learning_rate": 3.5511086983420867e-06, + "loss": 0.5166, + "step": 4656 + }, + { + "epoch": 2.201891252955083, + "grad_norm": 2.932699203491211, + "learning_rate": 3.5505426557229616e-06, + "loss": 0.5197, + "step": 4657 + }, + { + "epoch": 2.2023640661938533, + "grad_norm": 2.585712432861328, + "learning_rate": 
3.549976547691152e-06, + "loss": 0.425, + "step": 4658 + }, + { + "epoch": 2.202836879432624, + "grad_norm": 3.1019949913024902, + "learning_rate": 3.5494103742819065e-06, + "loss": 0.485, + "step": 4659 + }, + { + "epoch": 2.203309692671395, + "grad_norm": 2.3169195652008057, + "learning_rate": 3.548844135530478e-06, + "loss": 0.4064, + "step": 4660 + }, + { + "epoch": 2.2037825059101657, + "grad_norm": 2.779240846633911, + "learning_rate": 3.5482778314721257e-06, + "loss": 0.427, + "step": 4661 + }, + { + "epoch": 2.204255319148936, + "grad_norm": 2.765423059463501, + "learning_rate": 3.5477114621421078e-06, + "loss": 0.5125, + "step": 4662 + }, + { + "epoch": 2.204728132387707, + "grad_norm": 2.5590033531188965, + "learning_rate": 3.5471450275756913e-06, + "loss": 0.4009, + "step": 4663 + }, + { + "epoch": 2.2052009456264776, + "grad_norm": 2.706068515777588, + "learning_rate": 3.546578527808146e-06, + "loss": 0.4604, + "step": 4664 + }, + { + "epoch": 2.2056737588652484, + "grad_norm": 2.7995102405548096, + "learning_rate": 3.546011962874745e-06, + "loss": 0.4088, + "step": 4665 + }, + { + "epoch": 2.2061465721040188, + "grad_norm": 2.6369729042053223, + "learning_rate": 3.5454453328107656e-06, + "loss": 0.4634, + "step": 4666 + }, + { + "epoch": 2.2066193853427896, + "grad_norm": 3.1426475048065186, + "learning_rate": 3.54487863765149e-06, + "loss": 0.4761, + "step": 4667 + }, + { + "epoch": 2.2070921985815604, + "grad_norm": 2.7739460468292236, + "learning_rate": 3.5443118774322027e-06, + "loss": 0.467, + "step": 4668 + }, + { + "epoch": 2.207565011820331, + "grad_norm": 2.559105157852173, + "learning_rate": 3.5437450521881934e-06, + "loss": 0.4268, + "step": 4669 + }, + { + "epoch": 2.2080378250591015, + "grad_norm": 2.726593017578125, + "learning_rate": 3.543178161954758e-06, + "loss": 0.462, + "step": 4670 + }, + { + "epoch": 2.2085106382978723, + "grad_norm": 2.796109199523926, + "learning_rate": 3.5426112067671907e-06, + "loss": 0.4571, + "step": 4671 + }, + { + "epoch": 2.208983451536643, + "grad_norm": 2.7989072799682617, + "learning_rate": 3.5420441866607964e-06, + "loss": 0.4648, + "step": 4672 + }, + { + "epoch": 2.209456264775414, + "grad_norm": 2.6750967502593994, + "learning_rate": 3.5414771016708795e-06, + "loss": 0.4717, + "step": 4673 + }, + { + "epoch": 2.2099290780141843, + "grad_norm": 2.705659866333008, + "learning_rate": 3.5409099518327507e-06, + "loss": 0.4738, + "step": 4674 + }, + { + "epoch": 2.210401891252955, + "grad_norm": 2.79276442527771, + "learning_rate": 3.5403427371817234e-06, + "loss": 0.4625, + "step": 4675 + }, + { + "epoch": 2.210874704491726, + "grad_norm": 2.781339406967163, + "learning_rate": 3.539775457753115e-06, + "loss": 0.438, + "step": 4676 + }, + { + "epoch": 2.2113475177304966, + "grad_norm": 3.0088918209075928, + "learning_rate": 3.5392081135822488e-06, + "loss": 0.4776, + "step": 4677 + }, + { + "epoch": 2.211820330969267, + "grad_norm": 3.0291390419006348, + "learning_rate": 3.538640704704449e-06, + "loss": 0.4634, + "step": 4678 + }, + { + "epoch": 2.212293144208038, + "grad_norm": 2.967867374420166, + "learning_rate": 3.5380732311550477e-06, + "loss": 0.4776, + "step": 4679 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 2.6268832683563232, + "learning_rate": 3.5375056929693787e-06, + "loss": 0.4646, + "step": 4680 + }, + { + "epoch": 2.2132387706855794, + "grad_norm": 2.6688554286956787, + "learning_rate": 3.536938090182778e-06, + "loss": 0.3975, + "step": 4681 + }, + { + "epoch": 2.2137115839243497, + "grad_norm": 
3.0079736709594727, + "learning_rate": 3.5363704228305906e-06, + "loss": 0.4724, + "step": 4682 + }, + { + "epoch": 2.2141843971631205, + "grad_norm": 2.4287586212158203, + "learning_rate": 3.535802690948161e-06, + "loss": 0.4371, + "step": 4683 + }, + { + "epoch": 2.2146572104018913, + "grad_norm": 2.960679531097412, + "learning_rate": 3.53523489457084e-06, + "loss": 0.4347, + "step": 4684 + }, + { + "epoch": 2.215130023640662, + "grad_norm": 2.9646008014678955, + "learning_rate": 3.5346670337339807e-06, + "loss": 0.4803, + "step": 4685 + }, + { + "epoch": 2.2156028368794325, + "grad_norm": 3.0518898963928223, + "learning_rate": 3.534099108472942e-06, + "loss": 0.4712, + "step": 4686 + }, + { + "epoch": 2.2160756501182033, + "grad_norm": 2.776681900024414, + "learning_rate": 3.533531118823086e-06, + "loss": 0.4347, + "step": 4687 + }, + { + "epoch": 2.216548463356974, + "grad_norm": 2.18019437789917, + "learning_rate": 3.53296306481978e-06, + "loss": 0.3551, + "step": 4688 + }, + { + "epoch": 2.217021276595745, + "grad_norm": 2.9400811195373535, + "learning_rate": 3.5323949464983937e-06, + "loss": 0.4912, + "step": 4689 + }, + { + "epoch": 2.2174940898345152, + "grad_norm": 2.798386812210083, + "learning_rate": 3.5318267638943e-06, + "loss": 0.3967, + "step": 4690 + }, + { + "epoch": 2.217966903073286, + "grad_norm": 2.5452775955200195, + "learning_rate": 3.531258517042879e-06, + "loss": 0.3773, + "step": 4691 + }, + { + "epoch": 2.218439716312057, + "grad_norm": 2.711137294769287, + "learning_rate": 3.5306902059795113e-06, + "loss": 0.4123, + "step": 4692 + }, + { + "epoch": 2.2189125295508276, + "grad_norm": 3.0022387504577637, + "learning_rate": 3.530121830739584e-06, + "loss": 0.4898, + "step": 4693 + }, + { + "epoch": 2.219385342789598, + "grad_norm": 2.871814250946045, + "learning_rate": 3.5295533913584877e-06, + "loss": 0.4497, + "step": 4694 + }, + { + "epoch": 2.219858156028369, + "grad_norm": 2.9782521724700928, + "learning_rate": 3.528984887871616e-06, + "loss": 0.4797, + "step": 4695 + }, + { + "epoch": 2.2203309692671396, + "grad_norm": 2.6896398067474365, + "learning_rate": 3.5284163203143673e-06, + "loss": 0.439, + "step": 4696 + }, + { + "epoch": 2.2208037825059104, + "grad_norm": 2.7898833751678467, + "learning_rate": 3.5278476887221436e-06, + "loss": 0.4656, + "step": 4697 + }, + { + "epoch": 2.2212765957446807, + "grad_norm": 2.800416946411133, + "learning_rate": 3.527278993130352e-06, + "loss": 0.4452, + "step": 4698 + }, + { + "epoch": 2.2217494089834515, + "grad_norm": 3.653228998184204, + "learning_rate": 3.526710233574401e-06, + "loss": 0.4189, + "step": 4699 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 2.856956958770752, + "learning_rate": 3.5261414100897064e-06, + "loss": 0.4298, + "step": 4700 + }, + { + "epoch": 2.222695035460993, + "grad_norm": 2.8576223850250244, + "learning_rate": 3.5255725227116854e-06, + "loss": 0.4425, + "step": 4701 + }, + { + "epoch": 2.2231678486997635, + "grad_norm": 3.1161351203918457, + "learning_rate": 3.5250035714757603e-06, + "loss": 0.4609, + "step": 4702 + }, + { + "epoch": 2.2236406619385343, + "grad_norm": 2.843379259109497, + "learning_rate": 3.5244345564173578e-06, + "loss": 0.3589, + "step": 4703 + }, + { + "epoch": 2.224113475177305, + "grad_norm": 2.877157211303711, + "learning_rate": 3.5238654775719068e-06, + "loss": 0.4591, + "step": 4704 + }, + { + "epoch": 2.2245862884160754, + "grad_norm": 3.488954782485962, + "learning_rate": 3.5232963349748424e-06, + "loss": 0.4836, + "step": 4705 + }, + { + "epoch": 
2.225059101654846, + "grad_norm": 2.929037570953369, + "learning_rate": 3.5227271286616025e-06, + "loss": 0.5293, + "step": 4706 + }, + { + "epoch": 2.225531914893617, + "grad_norm": 2.6230576038360596, + "learning_rate": 3.5221578586676286e-06, + "loss": 0.4235, + "step": 4707 + }, + { + "epoch": 2.226004728132388, + "grad_norm": 2.529998302459717, + "learning_rate": 3.5215885250283664e-06, + "loss": 0.4369, + "step": 4708 + }, + { + "epoch": 2.2264775413711586, + "grad_norm": 2.817279577255249, + "learning_rate": 3.521019127779267e-06, + "loss": 0.481, + "step": 4709 + }, + { + "epoch": 2.226950354609929, + "grad_norm": 3.1513843536376953, + "learning_rate": 3.5204496669557833e-06, + "loss": 0.463, + "step": 4710 + }, + { + "epoch": 2.2274231678486998, + "grad_norm": 2.9403610229492188, + "learning_rate": 3.5198801425933725e-06, + "loss": 0.455, + "step": 4711 + }, + { + "epoch": 2.2278959810874706, + "grad_norm": 2.648346424102783, + "learning_rate": 3.5193105547274987e-06, + "loss": 0.4441, + "step": 4712 + }, + { + "epoch": 2.228368794326241, + "grad_norm": 2.791898727416992, + "learning_rate": 3.5187409033936252e-06, + "loss": 0.4682, + "step": 4713 + }, + { + "epoch": 2.2288416075650117, + "grad_norm": 2.8157432079315186, + "learning_rate": 3.5181711886272242e-06, + "loss": 0.4572, + "step": 4714 + }, + { + "epoch": 2.2293144208037825, + "grad_norm": 3.250319480895996, + "learning_rate": 3.5176014104637665e-06, + "loss": 0.4599, + "step": 4715 + }, + { + "epoch": 2.2297872340425533, + "grad_norm": 2.6747050285339355, + "learning_rate": 3.5170315689387307e-06, + "loss": 0.4328, + "step": 4716 + }, + { + "epoch": 2.230260047281324, + "grad_norm": 2.584094762802124, + "learning_rate": 3.5164616640875993e-06, + "loss": 0.4268, + "step": 4717 + }, + { + "epoch": 2.2307328605200945, + "grad_norm": 2.480710506439209, + "learning_rate": 3.5158916959458573e-06, + "loss": 0.438, + "step": 4718 + }, + { + "epoch": 2.2312056737588652, + "grad_norm": 2.9338483810424805, + "learning_rate": 3.515321664548993e-06, + "loss": 0.4937, + "step": 4719 + }, + { + "epoch": 2.231678486997636, + "grad_norm": 2.7880783081054688, + "learning_rate": 3.5147515699325013e-06, + "loss": 0.4624, + "step": 4720 + }, + { + "epoch": 2.2321513002364064, + "grad_norm": 2.740841865539551, + "learning_rate": 3.5141814121318797e-06, + "loss": 0.3689, + "step": 4721 + }, + { + "epoch": 2.232624113475177, + "grad_norm": 2.9541244506835938, + "learning_rate": 3.5136111911826277e-06, + "loss": 0.4092, + "step": 4722 + }, + { + "epoch": 2.233096926713948, + "grad_norm": 2.7205398082733154, + "learning_rate": 3.5130409071202515e-06, + "loss": 0.445, + "step": 4723 + }, + { + "epoch": 2.233569739952719, + "grad_norm": 2.563406229019165, + "learning_rate": 3.51247055998026e-06, + "loss": 0.4335, + "step": 4724 + }, + { + "epoch": 2.2340425531914896, + "grad_norm": 2.4249489307403564, + "learning_rate": 3.5119001497981666e-06, + "loss": 0.4671, + "step": 4725 + }, + { + "epoch": 2.23451536643026, + "grad_norm": 2.711630344390869, + "learning_rate": 3.5113296766094875e-06, + "loss": 0.4177, + "step": 4726 + }, + { + "epoch": 2.2349881796690307, + "grad_norm": 3.0257632732391357, + "learning_rate": 3.5107591404497443e-06, + "loss": 0.4976, + "step": 4727 + }, + { + "epoch": 2.2354609929078015, + "grad_norm": 2.717303991317749, + "learning_rate": 3.5101885413544614e-06, + "loss": 0.4621, + "step": 4728 + }, + { + "epoch": 2.235933806146572, + "grad_norm": 3.2846004962921143, + "learning_rate": 3.509617879359167e-06, + "loss": 0.4284, 
+ "step": 4729 + }, + { + "epoch": 2.2364066193853427, + "grad_norm": 2.7217819690704346, + "learning_rate": 3.5090471544993953e-06, + "loss": 0.4247, + "step": 4730 + }, + { + "epoch": 2.2368794326241135, + "grad_norm": 2.5003223419189453, + "learning_rate": 3.5084763668106812e-06, + "loss": 0.4096, + "step": 4731 + }, + { + "epoch": 2.2373522458628843, + "grad_norm": 2.7312731742858887, + "learning_rate": 3.5079055163285658e-06, + "loss": 0.4741, + "step": 4732 + }, + { + "epoch": 2.237825059101655, + "grad_norm": 2.84940767288208, + "learning_rate": 3.5073346030885934e-06, + "loss": 0.4887, + "step": 4733 + }, + { + "epoch": 2.2382978723404254, + "grad_norm": 3.1188511848449707, + "learning_rate": 3.506763627126313e-06, + "loss": 0.5335, + "step": 4734 + }, + { + "epoch": 2.2387706855791962, + "grad_norm": 2.6741397380828857, + "learning_rate": 3.5061925884772753e-06, + "loss": 0.4137, + "step": 4735 + }, + { + "epoch": 2.239243498817967, + "grad_norm": 3.1542465686798096, + "learning_rate": 3.505621487177037e-06, + "loss": 0.5303, + "step": 4736 + }, + { + "epoch": 2.2397163120567374, + "grad_norm": 5.448268890380859, + "learning_rate": 3.505050323261159e-06, + "loss": 0.4995, + "step": 4737 + }, + { + "epoch": 2.240189125295508, + "grad_norm": 2.7317898273468018, + "learning_rate": 3.5044790967652037e-06, + "loss": 0.4595, + "step": 4738 + }, + { + "epoch": 2.240661938534279, + "grad_norm": 2.8135695457458496, + "learning_rate": 3.50390780772474e-06, + "loss": 0.4593, + "step": 4739 + }, + { + "epoch": 2.2411347517730498, + "grad_norm": 3.1391844749450684, + "learning_rate": 3.5033364561753393e-06, + "loss": 0.4902, + "step": 4740 + }, + { + "epoch": 2.24160756501182, + "grad_norm": 2.6383132934570312, + "learning_rate": 3.5027650421525762e-06, + "loss": 0.3832, + "step": 4741 + }, + { + "epoch": 2.242080378250591, + "grad_norm": 2.742546558380127, + "learning_rate": 3.5021935656920314e-06, + "loss": 0.4012, + "step": 4742 + }, + { + "epoch": 2.2425531914893617, + "grad_norm": 3.1243674755096436, + "learning_rate": 3.5016220268292873e-06, + "loss": 0.4271, + "step": 4743 + }, + { + "epoch": 2.2430260047281325, + "grad_norm": 2.794717788696289, + "learning_rate": 3.501050425599932e-06, + "loss": 0.4604, + "step": 4744 + }, + { + "epoch": 2.243498817966903, + "grad_norm": 2.8481621742248535, + "learning_rate": 3.5004787620395565e-06, + "loss": 0.4814, + "step": 4745 + }, + { + "epoch": 2.2439716312056737, + "grad_norm": 2.8842051029205322, + "learning_rate": 3.499907036183755e-06, + "loss": 0.4987, + "step": 4746 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 3.074805974960327, + "learning_rate": 3.4993352480681265e-06, + "loss": 0.4966, + "step": 4747 + }, + { + "epoch": 2.2449172576832153, + "grad_norm": 2.7204246520996094, + "learning_rate": 3.4987633977282742e-06, + "loss": 0.4, + "step": 4748 + }, + { + "epoch": 2.2453900709219856, + "grad_norm": 2.685884952545166, + "learning_rate": 3.4981914851998055e-06, + "loss": 0.4285, + "step": 4749 + }, + { + "epoch": 2.2458628841607564, + "grad_norm": 2.1666336059570312, + "learning_rate": 3.4976195105183287e-06, + "loss": 0.3756, + "step": 4750 + }, + { + "epoch": 2.246335697399527, + "grad_norm": 2.863006353378296, + "learning_rate": 3.49704747371946e-06, + "loss": 0.4535, + "step": 4751 + }, + { + "epoch": 2.246808510638298, + "grad_norm": 2.5558736324310303, + "learning_rate": 3.496475374838817e-06, + "loss": 0.4129, + "step": 4752 + }, + { + "epoch": 2.2472813238770684, + "grad_norm": 2.9780309200286865, + "learning_rate": 
3.495903213912022e-06, + "loss": 0.4871, + "step": 4753 + }, + { + "epoch": 2.247754137115839, + "grad_norm": 2.951779365539551, + "learning_rate": 3.4953309909747e-06, + "loss": 0.5162, + "step": 4754 + }, + { + "epoch": 2.24822695035461, + "grad_norm": 2.7654693126678467, + "learning_rate": 3.4947587060624834e-06, + "loss": 0.4662, + "step": 4755 + }, + { + "epoch": 2.2486997635933808, + "grad_norm": 2.708247184753418, + "learning_rate": 3.494186359211002e-06, + "loss": 0.4279, + "step": 4756 + }, + { + "epoch": 2.249172576832151, + "grad_norm": 3.09916615486145, + "learning_rate": 3.4936139504558963e-06, + "loss": 0.4085, + "step": 4757 + }, + { + "epoch": 2.249645390070922, + "grad_norm": 2.913806200027466, + "learning_rate": 3.493041479832807e-06, + "loss": 0.4653, + "step": 4758 + }, + { + "epoch": 2.2501182033096927, + "grad_norm": 3.2903928756713867, + "learning_rate": 3.4924689473773787e-06, + "loss": 0.5167, + "step": 4759 + }, + { + "epoch": 2.2505910165484635, + "grad_norm": 3.1302902698516846, + "learning_rate": 3.4918963531252607e-06, + "loss": 0.5398, + "step": 4760 + }, + { + "epoch": 2.251063829787234, + "grad_norm": 2.8858273029327393, + "learning_rate": 3.4913236971121063e-06, + "loss": 0.4395, + "step": 4761 + }, + { + "epoch": 2.2515366430260046, + "grad_norm": 3.194521903991699, + "learning_rate": 3.4907509793735727e-06, + "loss": 0.5258, + "step": 4762 + }, + { + "epoch": 2.2520094562647754, + "grad_norm": 2.8640544414520264, + "learning_rate": 3.49017819994532e-06, + "loss": 0.4073, + "step": 4763 + }, + { + "epoch": 2.2524822695035462, + "grad_norm": 3.139995813369751, + "learning_rate": 3.489605358863011e-06, + "loss": 0.4653, + "step": 4764 + }, + { + "epoch": 2.2529550827423166, + "grad_norm": 2.6228537559509277, + "learning_rate": 3.489032456162317e-06, + "loss": 0.4546, + "step": 4765 + }, + { + "epoch": 2.2534278959810874, + "grad_norm": 2.8197672367095947, + "learning_rate": 3.4884594918789083e-06, + "loss": 0.479, + "step": 4766 + }, + { + "epoch": 2.253900709219858, + "grad_norm": 2.7839298248291016, + "learning_rate": 3.4878864660484612e-06, + "loss": 0.5081, + "step": 4767 + }, + { + "epoch": 2.254373522458629, + "grad_norm": 2.8630709648132324, + "learning_rate": 3.487313378706656e-06, + "loss": 0.4345, + "step": 4768 + }, + { + "epoch": 2.2548463356973993, + "grad_norm": 2.5661563873291016, + "learning_rate": 3.4867402298891755e-06, + "loss": 0.4266, + "step": 4769 + }, + { + "epoch": 2.25531914893617, + "grad_norm": 2.6274025440216064, + "learning_rate": 3.4861670196317084e-06, + "loss": 0.4645, + "step": 4770 + }, + { + "epoch": 2.255791962174941, + "grad_norm": 2.578702449798584, + "learning_rate": 3.485593747969944e-06, + "loss": 0.4242, + "step": 4771 + }, + { + "epoch": 2.2562647754137117, + "grad_norm": 2.322476625442505, + "learning_rate": 3.48502041493958e-06, + "loss": 0.3975, + "step": 4772 + }, + { + "epoch": 2.256737588652482, + "grad_norm": 2.8412630558013916, + "learning_rate": 3.484447020576313e-06, + "loss": 0.4276, + "step": 4773 + }, + { + "epoch": 2.257210401891253, + "grad_norm": 2.6090497970581055, + "learning_rate": 3.483873564915847e-06, + "loss": 0.429, + "step": 4774 + }, + { + "epoch": 2.2576832151300237, + "grad_norm": 2.692458152770996, + "learning_rate": 3.4833000479938877e-06, + "loss": 0.4211, + "step": 4775 + }, + { + "epoch": 2.2581560283687945, + "grad_norm": 2.5546815395355225, + "learning_rate": 3.482726469846146e-06, + "loss": 0.4751, + "step": 4776 + }, + { + "epoch": 2.258628841607565, + "grad_norm": 
2.8409626483917236, + "learning_rate": 3.4821528305083376e-06, + "loss": 0.4821, + "step": 4777 + }, + { + "epoch": 2.2591016548463356, + "grad_norm": 2.722966432571411, + "learning_rate": 3.4815791300161785e-06, + "loss": 0.5029, + "step": 4778 + }, + { + "epoch": 2.2595744680851064, + "grad_norm": 2.691603899002075, + "learning_rate": 3.48100536840539e-06, + "loss": 0.4242, + "step": 4779 + }, + { + "epoch": 2.260047281323877, + "grad_norm": 2.64035964012146, + "learning_rate": 3.4804315457116992e-06, + "loss": 0.4033, + "step": 4780 + }, + { + "epoch": 2.2605200945626476, + "grad_norm": 2.758819580078125, + "learning_rate": 3.4798576619708357e-06, + "loss": 0.4321, + "step": 4781 + }, + { + "epoch": 2.2609929078014184, + "grad_norm": 2.8204405307769775, + "learning_rate": 3.4792837172185324e-06, + "loss": 0.4309, + "step": 4782 + }, + { + "epoch": 2.261465721040189, + "grad_norm": 2.529771327972412, + "learning_rate": 3.478709711490525e-06, + "loss": 0.4398, + "step": 4783 + }, + { + "epoch": 2.26193853427896, + "grad_norm": 2.8156251907348633, + "learning_rate": 3.4781356448225557e-06, + "loss": 0.447, + "step": 4784 + }, + { + "epoch": 2.2624113475177303, + "grad_norm": 2.689528703689575, + "learning_rate": 3.477561517250369e-06, + "loss": 0.3907, + "step": 4785 + }, + { + "epoch": 2.262884160756501, + "grad_norm": 2.9148027896881104, + "learning_rate": 3.476987328809713e-06, + "loss": 0.4287, + "step": 4786 + }, + { + "epoch": 2.263356973995272, + "grad_norm": 2.933021306991577, + "learning_rate": 3.4764130795363404e-06, + "loss": 0.4847, + "step": 4787 + }, + { + "epoch": 2.2638297872340427, + "grad_norm": 2.8559257984161377, + "learning_rate": 3.4758387694660064e-06, + "loss": 0.4554, + "step": 4788 + }, + { + "epoch": 2.264302600472813, + "grad_norm": 3.0355522632598877, + "learning_rate": 3.4752643986344707e-06, + "loss": 0.4286, + "step": 4789 + }, + { + "epoch": 2.264775413711584, + "grad_norm": 2.9768362045288086, + "learning_rate": 3.474689967077498e-06, + "loss": 0.4917, + "step": 4790 + }, + { + "epoch": 2.2652482269503547, + "grad_norm": 2.827971935272217, + "learning_rate": 3.474115474830855e-06, + "loss": 0.4542, + "step": 4791 + }, + { + "epoch": 2.2657210401891255, + "grad_norm": 2.559659719467163, + "learning_rate": 3.4735409219303123e-06, + "loss": 0.4168, + "step": 4792 + }, + { + "epoch": 2.266193853427896, + "grad_norm": 2.3172824382781982, + "learning_rate": 3.472966308411645e-06, + "loss": 0.3535, + "step": 4793 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 2.6779656410217285, + "learning_rate": 3.4723916343106327e-06, + "loss": 0.4599, + "step": 4794 + }, + { + "epoch": 2.2671394799054374, + "grad_norm": 2.55780291557312, + "learning_rate": 3.4718168996630573e-06, + "loss": 0.4185, + "step": 4795 + }, + { + "epoch": 2.267612293144208, + "grad_norm": 2.4929800033569336, + "learning_rate": 3.471242104504704e-06, + "loss": 0.4008, + "step": 4796 + }, + { + "epoch": 2.2680851063829786, + "grad_norm": 2.849475145339966, + "learning_rate": 3.4706672488713642e-06, + "loss": 0.396, + "step": 4797 + }, + { + "epoch": 2.2685579196217494, + "grad_norm": 2.4830739498138428, + "learning_rate": 3.4700923327988306e-06, + "loss": 0.4087, + "step": 4798 + }, + { + "epoch": 2.26903073286052, + "grad_norm": 3.2748119831085205, + "learning_rate": 3.469517356322901e-06, + "loss": 0.4496, + "step": 4799 + }, + { + "epoch": 2.269503546099291, + "grad_norm": 3.0440170764923096, + "learning_rate": 3.468942319479378e-06, + "loss": 0.4903, + "step": 4800 + }, + { + "epoch": 
2.2699763593380613, + "grad_norm": 2.8200504779815674, + "learning_rate": 3.4683672223040645e-06, + "loss": 0.4588, + "step": 4801 + }, + { + "epoch": 2.270449172576832, + "grad_norm": 2.675206184387207, + "learning_rate": 3.4677920648327707e-06, + "loss": 0.4257, + "step": 4802 + }, + { + "epoch": 2.270921985815603, + "grad_norm": 2.862675905227661, + "learning_rate": 3.4672168471013084e-06, + "loss": 0.466, + "step": 4803 + }, + { + "epoch": 2.2713947990543737, + "grad_norm": 2.65663743019104, + "learning_rate": 3.4666415691454947e-06, + "loss": 0.4784, + "step": 4804 + }, + { + "epoch": 2.271867612293144, + "grad_norm": 2.5610506534576416, + "learning_rate": 3.4660662310011483e-06, + "loss": 0.4429, + "step": 4805 + }, + { + "epoch": 2.272340425531915, + "grad_norm": 2.6459643840789795, + "learning_rate": 3.465490832704094e-06, + "loss": 0.4345, + "step": 4806 + }, + { + "epoch": 2.2728132387706856, + "grad_norm": 2.426013469696045, + "learning_rate": 3.4649153742901585e-06, + "loss": 0.4533, + "step": 4807 + }, + { + "epoch": 2.2732860520094564, + "grad_norm": 2.6714842319488525, + "learning_rate": 3.4643398557951745e-06, + "loss": 0.4409, + "step": 4808 + }, + { + "epoch": 2.273758865248227, + "grad_norm": 2.703629493713379, + "learning_rate": 3.463764277254976e-06, + "loss": 0.3656, + "step": 4809 + }, + { + "epoch": 2.2742316784869976, + "grad_norm": 2.811753988265991, + "learning_rate": 3.4631886387054025e-06, + "loss": 0.4957, + "step": 4810 + }, + { + "epoch": 2.2747044917257684, + "grad_norm": 2.9469289779663086, + "learning_rate": 3.462612940182295e-06, + "loss": 0.4582, + "step": 4811 + }, + { + "epoch": 2.275177304964539, + "grad_norm": 2.6287801265716553, + "learning_rate": 3.462037181721501e-06, + "loss": 0.4072, + "step": 4812 + }, + { + "epoch": 2.2756501182033095, + "grad_norm": 2.7104952335357666, + "learning_rate": 3.46146136335887e-06, + "loss": 0.4998, + "step": 4813 + }, + { + "epoch": 2.2761229314420803, + "grad_norm": 3.170363187789917, + "learning_rate": 3.460885485130256e-06, + "loss": 0.4722, + "step": 4814 + }, + { + "epoch": 2.276595744680851, + "grad_norm": 2.7315151691436768, + "learning_rate": 3.460309547071516e-06, + "loss": 0.4482, + "step": 4815 + }, + { + "epoch": 2.277068557919622, + "grad_norm": 2.685988187789917, + "learning_rate": 3.4597335492185113e-06, + "loss": 0.4419, + "step": 4816 + }, + { + "epoch": 2.2775413711583923, + "grad_norm": 2.532790184020996, + "learning_rate": 3.459157491607107e-06, + "loss": 0.3961, + "step": 4817 + }, + { + "epoch": 2.278014184397163, + "grad_norm": 2.920729875564575, + "learning_rate": 3.458581374273171e-06, + "loss": 0.4767, + "step": 4818 + }, + { + "epoch": 2.278486997635934, + "grad_norm": 3.2481250762939453, + "learning_rate": 3.458005197252577e-06, + "loss": 0.4985, + "step": 4819 + }, + { + "epoch": 2.2789598108747047, + "grad_norm": 2.373809814453125, + "learning_rate": 3.4574289605811994e-06, + "loss": 0.4259, + "step": 4820 + }, + { + "epoch": 2.279432624113475, + "grad_norm": 2.7851033210754395, + "learning_rate": 3.4568526642949184e-06, + "loss": 0.4829, + "step": 4821 + }, + { + "epoch": 2.279905437352246, + "grad_norm": 2.9777133464813232, + "learning_rate": 3.456276308429618e-06, + "loss": 0.4896, + "step": 4822 + }, + { + "epoch": 2.2803782505910166, + "grad_norm": 2.7922022342681885, + "learning_rate": 3.4556998930211853e-06, + "loss": 0.4908, + "step": 4823 + }, + { + "epoch": 2.2808510638297874, + "grad_norm": 2.699180841445923, + "learning_rate": 3.4551234181055104e-06, + "loss": 0.4518, + 
"step": 4824 + }, + { + "epoch": 2.2813238770685578, + "grad_norm": 3.1200520992279053, + "learning_rate": 3.4545468837184885e-06, + "loss": 0.4877, + "step": 4825 + }, + { + "epoch": 2.2817966903073286, + "grad_norm": 2.56782603263855, + "learning_rate": 3.453970289896018e-06, + "loss": 0.4281, + "step": 4826 + }, + { + "epoch": 2.2822695035460994, + "grad_norm": 3.241356372833252, + "learning_rate": 3.4533936366740007e-06, + "loss": 0.4338, + "step": 4827 + }, + { + "epoch": 2.28274231678487, + "grad_norm": 3.560295343399048, + "learning_rate": 3.452816924088342e-06, + "loss": 0.4121, + "step": 4828 + }, + { + "epoch": 2.2832151300236405, + "grad_norm": 2.8512449264526367, + "learning_rate": 3.452240152174951e-06, + "loss": 0.4357, + "step": 4829 + }, + { + "epoch": 2.2836879432624113, + "grad_norm": 3.0332651138305664, + "learning_rate": 3.4516633209697408e-06, + "loss": 0.4985, + "step": 4830 + }, + { + "epoch": 2.284160756501182, + "grad_norm": 2.520930528640747, + "learning_rate": 3.451086430508629e-06, + "loss": 0.4021, + "step": 4831 + }, + { + "epoch": 2.284633569739953, + "grad_norm": 2.508227825164795, + "learning_rate": 3.4505094808275363e-06, + "loss": 0.3935, + "step": 4832 + }, + { + "epoch": 2.2851063829787233, + "grad_norm": 2.56752610206604, + "learning_rate": 3.449932471962385e-06, + "loss": 0.4689, + "step": 4833 + }, + { + "epoch": 2.285579196217494, + "grad_norm": 2.7757534980773926, + "learning_rate": 3.449355403949105e-06, + "loss": 0.4565, + "step": 4834 + }, + { + "epoch": 2.286052009456265, + "grad_norm": 3.364821195602417, + "learning_rate": 3.448778276823626e-06, + "loss": 0.4729, + "step": 4835 + }, + { + "epoch": 2.2865248226950357, + "grad_norm": 3.0045557022094727, + "learning_rate": 3.448201090621884e-06, + "loss": 0.4834, + "step": 4836 + }, + { + "epoch": 2.286997635933806, + "grad_norm": 2.9451794624328613, + "learning_rate": 3.4476238453798183e-06, + "loss": 0.489, + "step": 4837 + }, + { + "epoch": 2.287470449172577, + "grad_norm": 2.8307435512542725, + "learning_rate": 3.4470465411333708e-06, + "loss": 0.5079, + "step": 4838 + }, + { + "epoch": 2.2879432624113476, + "grad_norm": 2.7118136882781982, + "learning_rate": 3.4464691779184876e-06, + "loss": 0.4794, + "step": 4839 + }, + { + "epoch": 2.2884160756501184, + "grad_norm": 2.6724441051483154, + "learning_rate": 3.445891755771119e-06, + "loss": 0.4619, + "step": 4840 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 2.8161258697509766, + "learning_rate": 3.445314274727218e-06, + "loss": 0.4287, + "step": 4841 + }, + { + "epoch": 2.2893617021276595, + "grad_norm": 2.5681750774383545, + "learning_rate": 3.4447367348227433e-06, + "loss": 0.4167, + "step": 4842 + }, + { + "epoch": 2.2898345153664303, + "grad_norm": 2.8136284351348877, + "learning_rate": 3.444159136093654e-06, + "loss": 0.4195, + "step": 4843 + }, + { + "epoch": 2.290307328605201, + "grad_norm": 3.153651714324951, + "learning_rate": 3.443581478575915e-06, + "loss": 0.4821, + "step": 4844 + }, + { + "epoch": 2.2907801418439715, + "grad_norm": 2.980883836746216, + "learning_rate": 3.4430037623054953e-06, + "loss": 0.4627, + "step": 4845 + }, + { + "epoch": 2.2912529550827423, + "grad_norm": 2.786182403564453, + "learning_rate": 3.4424259873183664e-06, + "loss": 0.4342, + "step": 4846 + }, + { + "epoch": 2.291725768321513, + "grad_norm": 2.8938279151916504, + "learning_rate": 3.4418481536505026e-06, + "loss": 0.3997, + "step": 4847 + }, + { + "epoch": 2.2921985815602834, + "grad_norm": 2.5534510612487793, + "learning_rate": 
3.4412702613378844e-06, + "loss": 0.3982, + "step": 4848 + }, + { + "epoch": 2.2926713947990542, + "grad_norm": 2.7907063961029053, + "learning_rate": 3.4406923104164956e-06, + "loss": 0.4484, + "step": 4849 + }, + { + "epoch": 2.293144208037825, + "grad_norm": 3.162702798843384, + "learning_rate": 3.4401143009223203e-06, + "loss": 0.4528, + "step": 4850 + }, + { + "epoch": 2.293617021276596, + "grad_norm": 2.4647393226623535, + "learning_rate": 3.4395362328913505e-06, + "loss": 0.3759, + "step": 4851 + }, + { + "epoch": 2.2940898345153666, + "grad_norm": 2.8219876289367676, + "learning_rate": 3.438958106359579e-06, + "loss": 0.4903, + "step": 4852 + }, + { + "epoch": 2.294562647754137, + "grad_norm": 2.827073097229004, + "learning_rate": 3.438379921363003e-06, + "loss": 0.4315, + "step": 4853 + }, + { + "epoch": 2.295035460992908, + "grad_norm": 2.472470283508301, + "learning_rate": 3.4378016779376244e-06, + "loss": 0.4478, + "step": 4854 + }, + { + "epoch": 2.2955082742316786, + "grad_norm": 3.3994734287261963, + "learning_rate": 3.4372233761194473e-06, + "loss": 0.5086, + "step": 4855 + }, + { + "epoch": 2.295981087470449, + "grad_norm": 3.030465602874756, + "learning_rate": 3.4366450159444796e-06, + "loss": 0.4159, + "step": 4856 + }, + { + "epoch": 2.2964539007092197, + "grad_norm": 2.5460705757141113, + "learning_rate": 3.4360665974487346e-06, + "loss": 0.4097, + "step": 4857 + }, + { + "epoch": 2.2969267139479905, + "grad_norm": 2.884469509124756, + "learning_rate": 3.4354881206682273e-06, + "loss": 0.4478, + "step": 4858 + }, + { + "epoch": 2.2973995271867613, + "grad_norm": 2.5139710903167725, + "learning_rate": 3.4349095856389765e-06, + "loss": 0.4286, + "step": 4859 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 3.1628260612487793, + "learning_rate": 3.4343309923970053e-06, + "loss": 0.4617, + "step": 4860 + }, + { + "epoch": 2.2983451536643025, + "grad_norm": 2.6141695976257324, + "learning_rate": 3.4337523409783395e-06, + "loss": 0.3841, + "step": 4861 + }, + { + "epoch": 2.2988179669030733, + "grad_norm": 2.766834259033203, + "learning_rate": 3.43317363141901e-06, + "loss": 0.4484, + "step": 4862 + }, + { + "epoch": 2.299290780141844, + "grad_norm": 2.785491943359375, + "learning_rate": 3.4325948637550503e-06, + "loss": 0.4363, + "step": 4863 + }, + { + "epoch": 2.2997635933806144, + "grad_norm": 2.624929189682007, + "learning_rate": 3.4320160380224988e-06, + "loss": 0.4518, + "step": 4864 + }, + { + "epoch": 2.300236406619385, + "grad_norm": 2.895413398742676, + "learning_rate": 3.4314371542573944e-06, + "loss": 0.4745, + "step": 4865 + }, + { + "epoch": 2.300709219858156, + "grad_norm": 2.603816270828247, + "learning_rate": 3.430858212495783e-06, + "loss": 0.4444, + "step": 4866 + }, + { + "epoch": 2.301182033096927, + "grad_norm": 3.387360095977783, + "learning_rate": 3.4302792127737116e-06, + "loss": 0.4169, + "step": 4867 + }, + { + "epoch": 2.3016548463356976, + "grad_norm": 2.894054651260376, + "learning_rate": 3.4297001551272334e-06, + "loss": 0.4493, + "step": 4868 + }, + { + "epoch": 2.302127659574468, + "grad_norm": 3.0432028770446777, + "learning_rate": 3.4291210395924035e-06, + "loss": 0.4854, + "step": 4869 + }, + { + "epoch": 2.3026004728132388, + "grad_norm": 2.5144734382629395, + "learning_rate": 3.42854186620528e-06, + "loss": 0.4556, + "step": 4870 + }, + { + "epoch": 2.3030732860520096, + "grad_norm": 2.964812755584717, + "learning_rate": 3.427962635001926e-06, + "loss": 0.495, + "step": 4871 + }, + { + "epoch": 2.30354609929078, + "grad_norm": 
2.9991118907928467, + "learning_rate": 3.4273833460184077e-06, + "loss": 0.4787, + "step": 4872 + }, + { + "epoch": 2.3040189125295507, + "grad_norm": 2.9424328804016113, + "learning_rate": 3.4268039992907955e-06, + "loss": 0.5006, + "step": 4873 + }, + { + "epoch": 2.3044917257683215, + "grad_norm": 2.792880058288574, + "learning_rate": 3.426224594855162e-06, + "loss": 0.4399, + "step": 4874 + }, + { + "epoch": 2.3049645390070923, + "grad_norm": 2.5308053493499756, + "learning_rate": 3.4256451327475838e-06, + "loss": 0.4843, + "step": 4875 + }, + { + "epoch": 2.305437352245863, + "grad_norm": 2.7937564849853516, + "learning_rate": 3.425065613004142e-06, + "loss": 0.4428, + "step": 4876 + }, + { + "epoch": 2.3059101654846335, + "grad_norm": 2.4231557846069336, + "learning_rate": 3.424486035660921e-06, + "loss": 0.4054, + "step": 4877 + }, + { + "epoch": 2.3063829787234043, + "grad_norm": 3.0622596740722656, + "learning_rate": 3.423906400754009e-06, + "loss": 0.4623, + "step": 4878 + }, + { + "epoch": 2.306855791962175, + "grad_norm": 2.6532933712005615, + "learning_rate": 3.4233267083194955e-06, + "loss": 0.4387, + "step": 4879 + }, + { + "epoch": 2.3073286052009454, + "grad_norm": 2.793325185775757, + "learning_rate": 3.422746958393477e-06, + "loss": 0.4047, + "step": 4880 + }, + { + "epoch": 2.307801418439716, + "grad_norm": 2.9178314208984375, + "learning_rate": 3.422167151012052e-06, + "loss": 0.4397, + "step": 4881 + }, + { + "epoch": 2.308274231678487, + "grad_norm": 3.463913917541504, + "learning_rate": 3.4215872862113214e-06, + "loss": 0.4347, + "step": 4882 + }, + { + "epoch": 2.308747044917258, + "grad_norm": 3.228403091430664, + "learning_rate": 3.421007364027392e-06, + "loss": 0.4405, + "step": 4883 + }, + { + "epoch": 2.3092198581560286, + "grad_norm": 2.896933078765869, + "learning_rate": 3.420427384496372e-06, + "loss": 0.4429, + "step": 4884 + }, + { + "epoch": 2.309692671394799, + "grad_norm": 2.5559937953948975, + "learning_rate": 3.4198473476543755e-06, + "loss": 0.4281, + "step": 4885 + }, + { + "epoch": 2.3101654846335697, + "grad_norm": 3.457918167114258, + "learning_rate": 3.419267253537517e-06, + "loss": 0.4495, + "step": 4886 + }, + { + "epoch": 2.3106382978723405, + "grad_norm": 2.6554839611053467, + "learning_rate": 3.418687102181918e-06, + "loss": 0.4682, + "step": 4887 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 2.8171639442443848, + "learning_rate": 3.4181068936237024e-06, + "loss": 0.4184, + "step": 4888 + }, + { + "epoch": 2.3115839243498817, + "grad_norm": 2.9272499084472656, + "learning_rate": 3.4175266278989955e-06, + "loss": 0.5445, + "step": 4889 + }, + { + "epoch": 2.3120567375886525, + "grad_norm": 2.5928499698638916, + "learning_rate": 3.4169463050439284e-06, + "loss": 0.3808, + "step": 4890 + }, + { + "epoch": 2.3125295508274233, + "grad_norm": 2.6624577045440674, + "learning_rate": 3.4163659250946356e-06, + "loss": 0.4678, + "step": 4891 + }, + { + "epoch": 2.313002364066194, + "grad_norm": 2.666555643081665, + "learning_rate": 3.4157854880872553e-06, + "loss": 0.457, + "step": 4892 + }, + { + "epoch": 2.3134751773049644, + "grad_norm": 3.2987406253814697, + "learning_rate": 3.4152049940579278e-06, + "loss": 0.551, + "step": 4893 + }, + { + "epoch": 2.3139479905437352, + "grad_norm": 2.728119134902954, + "learning_rate": 3.414624443042799e-06, + "loss": 0.3935, + "step": 4894 + }, + { + "epoch": 2.314420803782506, + "grad_norm": 3.133005380630493, + "learning_rate": 3.4140438350780157e-06, + "loss": 0.4981, + "step": 4895 + }, + { + 
"epoch": 2.3148936170212764, + "grad_norm": 2.591252565383911, + "learning_rate": 3.4134631701997312e-06, + "loss": 0.4251, + "step": 4896 + }, + { + "epoch": 2.315366430260047, + "grad_norm": 3.007136344909668, + "learning_rate": 3.412882448444101e-06, + "loss": 0.4492, + "step": 4897 + }, + { + "epoch": 2.315839243498818, + "grad_norm": 2.6391026973724365, + "learning_rate": 3.412301669847284e-06, + "loss": 0.5151, + "step": 4898 + }, + { + "epoch": 2.3163120567375888, + "grad_norm": 7.453699111938477, + "learning_rate": 3.411720834445441e-06, + "loss": 0.4983, + "step": 4899 + }, + { + "epoch": 2.3167848699763596, + "grad_norm": 2.667712688446045, + "learning_rate": 3.41113994227474e-06, + "loss": 0.4581, + "step": 4900 + }, + { + "epoch": 2.31725768321513, + "grad_norm": 2.7727627754211426, + "learning_rate": 3.41055899337135e-06, + "loss": 0.4731, + "step": 4901 + }, + { + "epoch": 2.3177304964539007, + "grad_norm": 3.0096890926361084, + "learning_rate": 3.409977987771444e-06, + "loss": 0.4996, + "step": 4902 + }, + { + "epoch": 2.3182033096926715, + "grad_norm": 2.725830078125, + "learning_rate": 3.4093969255111993e-06, + "loss": 0.4544, + "step": 4903 + }, + { + "epoch": 2.318676122931442, + "grad_norm": 2.7596993446350098, + "learning_rate": 3.4088158066267945e-06, + "loss": 0.4846, + "step": 4904 + }, + { + "epoch": 2.3191489361702127, + "grad_norm": 2.702620029449463, + "learning_rate": 3.4082346311544156e-06, + "loss": 0.4849, + "step": 4905 + }, + { + "epoch": 2.3196217494089835, + "grad_norm": 2.725374460220337, + "learning_rate": 3.407653399130249e-06, + "loss": 0.4116, + "step": 4906 + }, + { + "epoch": 2.3200945626477543, + "grad_norm": 2.6770219802856445, + "learning_rate": 3.4070721105904847e-06, + "loss": 0.4606, + "step": 4907 + }, + { + "epoch": 2.320567375886525, + "grad_norm": 2.9249117374420166, + "learning_rate": 3.406490765571317e-06, + "loss": 0.461, + "step": 4908 + }, + { + "epoch": 2.3210401891252954, + "grad_norm": 2.7568278312683105, + "learning_rate": 3.405909364108944e-06, + "loss": 0.4065, + "step": 4909 + }, + { + "epoch": 2.321513002364066, + "grad_norm": 2.7231340408325195, + "learning_rate": 3.4053279062395676e-06, + "loss": 0.4173, + "step": 4910 + }, + { + "epoch": 2.321985815602837, + "grad_norm": 3.1401100158691406, + "learning_rate": 3.404746391999393e-06, + "loss": 0.4287, + "step": 4911 + }, + { + "epoch": 2.3224586288416074, + "grad_norm": 2.714853525161743, + "learning_rate": 3.404164821424627e-06, + "loss": 0.4552, + "step": 4912 + }, + { + "epoch": 2.322931442080378, + "grad_norm": 3.1509978771209717, + "learning_rate": 3.4035831945514825e-06, + "loss": 0.5296, + "step": 4913 + }, + { + "epoch": 2.323404255319149, + "grad_norm": 2.567194938659668, + "learning_rate": 3.403001511416174e-06, + "loss": 0.4306, + "step": 4914 + }, + { + "epoch": 2.3238770685579198, + "grad_norm": 2.7473888397216797, + "learning_rate": 3.402419772054922e-06, + "loss": 0.4009, + "step": 4915 + }, + { + "epoch": 2.3243498817966906, + "grad_norm": 2.8617780208587646, + "learning_rate": 3.401837976503947e-06, + "loss": 0.4545, + "step": 4916 + }, + { + "epoch": 2.324822695035461, + "grad_norm": 2.3650572299957275, + "learning_rate": 3.401256124799475e-06, + "loss": 0.4046, + "step": 4917 + }, + { + "epoch": 2.3252955082742317, + "grad_norm": 2.418407678604126, + "learning_rate": 3.4006742169777364e-06, + "loss": 0.4222, + "step": 4918 + }, + { + "epoch": 2.3257683215130025, + "grad_norm": 2.7232494354248047, + "learning_rate": 3.400092253074964e-06, + "loss": 0.4373, 
+ "step": 4919 + }, + { + "epoch": 2.326241134751773, + "grad_norm": 2.702965497970581, + "learning_rate": 3.399510233127394e-06, + "loss": 0.437, + "step": 4920 + }, + { + "epoch": 2.3267139479905437, + "grad_norm": 2.8381760120391846, + "learning_rate": 3.3989281571712664e-06, + "loss": 0.4294, + "step": 4921 + }, + { + "epoch": 2.3271867612293144, + "grad_norm": 2.767131805419922, + "learning_rate": 3.398346025242823e-06, + "loss": 0.4673, + "step": 4922 + }, + { + "epoch": 2.3276595744680852, + "grad_norm": 2.5261805057525635, + "learning_rate": 3.3977638373783123e-06, + "loss": 0.4147, + "step": 4923 + }, + { + "epoch": 2.3281323877068556, + "grad_norm": 2.7176897525787354, + "learning_rate": 3.3971815936139836e-06, + "loss": 0.3885, + "step": 4924 + }, + { + "epoch": 2.3286052009456264, + "grad_norm": 2.849043130874634, + "learning_rate": 3.396599293986092e-06, + "loss": 0.4842, + "step": 4925 + }, + { + "epoch": 2.329078014184397, + "grad_norm": 2.550673484802246, + "learning_rate": 3.3960169385308927e-06, + "loss": 0.4049, + "step": 4926 + }, + { + "epoch": 2.329550827423168, + "grad_norm": 3.0821585655212402, + "learning_rate": 3.3954345272846477e-06, + "loss": 0.53, + "step": 4927 + }, + { + "epoch": 2.3300236406619383, + "grad_norm": 2.68658185005188, + "learning_rate": 3.3948520602836223e-06, + "loss": 0.4592, + "step": 4928 + }, + { + "epoch": 2.330496453900709, + "grad_norm": 2.7391903400421143, + "learning_rate": 3.394269537564082e-06, + "loss": 0.4773, + "step": 4929 + }, + { + "epoch": 2.33096926713948, + "grad_norm": 2.665114164352417, + "learning_rate": 3.393686959162299e-06, + "loss": 0.4671, + "step": 4930 + }, + { + "epoch": 2.3314420803782507, + "grad_norm": 2.6827399730682373, + "learning_rate": 3.3931043251145477e-06, + "loss": 0.4669, + "step": 4931 + }, + { + "epoch": 2.331914893617021, + "grad_norm": 3.1760666370391846, + "learning_rate": 3.392521635457106e-06, + "loss": 0.4729, + "step": 4932 + }, + { + "epoch": 2.332387706855792, + "grad_norm": 2.9686226844787598, + "learning_rate": 3.3919388902262555e-06, + "loss": 0.5017, + "step": 4933 + }, + { + "epoch": 2.3328605200945627, + "grad_norm": 2.471325397491455, + "learning_rate": 3.3913560894582818e-06, + "loss": 0.4195, + "step": 4934 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 2.4062955379486084, + "learning_rate": 3.3907732331894732e-06, + "loss": 0.3666, + "step": 4935 + }, + { + "epoch": 2.333806146572104, + "grad_norm": 2.6800320148468018, + "learning_rate": 3.3901903214561206e-06, + "loss": 0.4774, + "step": 4936 + }, + { + "epoch": 2.3342789598108746, + "grad_norm": 2.923741102218628, + "learning_rate": 3.389607354294521e-06, + "loss": 0.4546, + "step": 4937 + }, + { + "epoch": 2.3347517730496454, + "grad_norm": 3.0034096240997314, + "learning_rate": 3.3890243317409716e-06, + "loss": 0.5373, + "step": 4938 + }, + { + "epoch": 2.3352245862884162, + "grad_norm": 3.0757339000701904, + "learning_rate": 3.388441253831775e-06, + "loss": 0.4655, + "step": 4939 + }, + { + "epoch": 2.3356973995271866, + "grad_norm": 2.5352041721343994, + "learning_rate": 3.3878581206032373e-06, + "loss": 0.4391, + "step": 4940 + }, + { + "epoch": 2.3361702127659574, + "grad_norm": 2.9332237243652344, + "learning_rate": 3.3872749320916675e-06, + "loss": 0.4685, + "step": 4941 + }, + { + "epoch": 2.336643026004728, + "grad_norm": 2.4871222972869873, + "learning_rate": 3.386691688333379e-06, + "loss": 0.3952, + "step": 4942 + }, + { + "epoch": 2.337115839243499, + "grad_norm": 2.6384918689727783, + "learning_rate": 
3.386108389364687e-06, + "loss": 0.4044, + "step": 4943 + }, + { + "epoch": 2.3375886524822693, + "grad_norm": 2.3545165061950684, + "learning_rate": 3.3855250352219102e-06, + "loss": 0.426, + "step": 4944 + }, + { + "epoch": 2.33806146572104, + "grad_norm": 2.972242593765259, + "learning_rate": 3.3849416259413735e-06, + "loss": 0.5033, + "step": 4945 + }, + { + "epoch": 2.338534278959811, + "grad_norm": 3.117351770401001, + "learning_rate": 3.384358161559401e-06, + "loss": 0.4695, + "step": 4946 + }, + { + "epoch": 2.3390070921985817, + "grad_norm": 2.888916492462158, + "learning_rate": 3.383774642112324e-06, + "loss": 0.437, + "step": 4947 + }, + { + "epoch": 2.339479905437352, + "grad_norm": 3.0677435398101807, + "learning_rate": 3.3831910676364753e-06, + "loss": 0.4293, + "step": 4948 + }, + { + "epoch": 2.339952718676123, + "grad_norm": 2.8571784496307373, + "learning_rate": 3.3826074381681916e-06, + "loss": 0.4574, + "step": 4949 + }, + { + "epoch": 2.3404255319148937, + "grad_norm": 2.907276153564453, + "learning_rate": 3.3820237537438127e-06, + "loss": 0.4731, + "step": 4950 + }, + { + "epoch": 2.3408983451536645, + "grad_norm": 2.923762559890747, + "learning_rate": 3.3814400143996823e-06, + "loss": 0.4648, + "step": 4951 + }, + { + "epoch": 2.341371158392435, + "grad_norm": 2.6206982135772705, + "learning_rate": 3.3808562201721473e-06, + "loss": 0.436, + "step": 4952 + }, + { + "epoch": 2.3418439716312056, + "grad_norm": 6.279088973999023, + "learning_rate": 3.380272371097558e-06, + "loss": 0.4461, + "step": 4953 + }, + { + "epoch": 2.3423167848699764, + "grad_norm": 2.785297155380249, + "learning_rate": 3.3796884672122684e-06, + "loss": 0.4619, + "step": 4954 + }, + { + "epoch": 2.342789598108747, + "grad_norm": 2.6241793632507324, + "learning_rate": 3.379104508552634e-06, + "loss": 0.4323, + "step": 4955 + }, + { + "epoch": 2.3432624113475176, + "grad_norm": 2.6052167415618896, + "learning_rate": 3.378520495155017e-06, + "loss": 0.3943, + "step": 4956 + }, + { + "epoch": 2.3437352245862884, + "grad_norm": 2.8247411251068115, + "learning_rate": 3.3779364270557818e-06, + "loss": 0.4689, + "step": 4957 + }, + { + "epoch": 2.344208037825059, + "grad_norm": 2.5348927974700928, + "learning_rate": 3.377352304291294e-06, + "loss": 0.4619, + "step": 4958 + }, + { + "epoch": 2.34468085106383, + "grad_norm": 2.906648874282837, + "learning_rate": 3.376768126897926e-06, + "loss": 0.5191, + "step": 4959 + }, + { + "epoch": 2.3451536643026003, + "grad_norm": 2.796870470046997, + "learning_rate": 3.3761838949120514e-06, + "loss": 0.4227, + "step": 4960 + }, + { + "epoch": 2.345626477541371, + "grad_norm": 2.789635419845581, + "learning_rate": 3.3755996083700464e-06, + "loss": 0.3927, + "step": 4961 + }, + { + "epoch": 2.346099290780142, + "grad_norm": 2.86641263961792, + "learning_rate": 3.375015267308295e-06, + "loss": 0.4097, + "step": 4962 + }, + { + "epoch": 2.3465721040189127, + "grad_norm": 2.8374414443969727, + "learning_rate": 3.374430871763178e-06, + "loss": 0.4566, + "step": 4963 + }, + { + "epoch": 2.347044917257683, + "grad_norm": 2.71951961517334, + "learning_rate": 3.3738464217710854e-06, + "loss": 0.4748, + "step": 4964 + }, + { + "epoch": 2.347517730496454, + "grad_norm": 2.6939785480499268, + "learning_rate": 3.373261917368408e-06, + "loss": 0.4499, + "step": 4965 + }, + { + "epoch": 2.3479905437352246, + "grad_norm": 2.862661600112915, + "learning_rate": 3.37267735859154e-06, + "loss": 0.415, + "step": 4966 + }, + { + "epoch": 2.3484633569739954, + "grad_norm": 
2.3657119274139404, + "learning_rate": 3.3720927454768793e-06, + "loss": 0.4112, + "step": 4967 + }, + { + "epoch": 2.348936170212766, + "grad_norm": 3.701571464538574, + "learning_rate": 3.3715080780608277e-06, + "loss": 0.4735, + "step": 4968 + }, + { + "epoch": 2.3494089834515366, + "grad_norm": 2.894350528717041, + "learning_rate": 3.3709233563797895e-06, + "loss": 0.4278, + "step": 4969 + }, + { + "epoch": 2.3498817966903074, + "grad_norm": 3.0072877407073975, + "learning_rate": 3.3703385804701727e-06, + "loss": 0.4718, + "step": 4970 + }, + { + "epoch": 2.350354609929078, + "grad_norm": 2.9920408725738525, + "learning_rate": 3.369753750368389e-06, + "loss": 0.4636, + "step": 4971 + }, + { + "epoch": 2.3508274231678485, + "grad_norm": 2.381770372390747, + "learning_rate": 3.369168866110853e-06, + "loss": 0.3841, + "step": 4972 + }, + { + "epoch": 2.3513002364066193, + "grad_norm": 2.6195342540740967, + "learning_rate": 3.3685839277339825e-06, + "loss": 0.4422, + "step": 4973 + }, + { + "epoch": 2.35177304964539, + "grad_norm": 2.885852575302124, + "learning_rate": 3.3679989352741992e-06, + "loss": 0.4798, + "step": 4974 + }, + { + "epoch": 2.352245862884161, + "grad_norm": 2.820004940032959, + "learning_rate": 3.367413888767929e-06, + "loss": 0.4498, + "step": 4975 + }, + { + "epoch": 2.3527186761229313, + "grad_norm": 2.579680919647217, + "learning_rate": 3.366828788251599e-06, + "loss": 0.4894, + "step": 4976 + }, + { + "epoch": 2.353191489361702, + "grad_norm": 2.7509915828704834, + "learning_rate": 3.366243633761642e-06, + "loss": 0.4354, + "step": 4977 + }, + { + "epoch": 2.353664302600473, + "grad_norm": 3.061767339706421, + "learning_rate": 3.3656584253344917e-06, + "loss": 0.4651, + "step": 4978 + }, + { + "epoch": 2.3541371158392437, + "grad_norm": 2.6109485626220703, + "learning_rate": 3.365073163006587e-06, + "loss": 0.44, + "step": 4979 + }, + { + "epoch": 2.354609929078014, + "grad_norm": 3.4247376918792725, + "learning_rate": 3.36448784681437e-06, + "loss": 0.3993, + "step": 4980 + }, + { + "epoch": 2.355082742316785, + "grad_norm": 2.953695297241211, + "learning_rate": 3.363902476794285e-06, + "loss": 0.4763, + "step": 4981 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 2.836543083190918, + "learning_rate": 3.3633170529827806e-06, + "loss": 0.4755, + "step": 4982 + }, + { + "epoch": 2.3560283687943264, + "grad_norm": 2.944082021713257, + "learning_rate": 3.36273157541631e-06, + "loss": 0.472, + "step": 4983 + }, + { + "epoch": 2.3565011820330968, + "grad_norm": 2.891716957092285, + "learning_rate": 3.3621460441313262e-06, + "loss": 0.5259, + "step": 4984 + }, + { + "epoch": 2.3569739952718676, + "grad_norm": 2.8448829650878906, + "learning_rate": 3.3615604591642896e-06, + "loss": 0.4587, + "step": 4985 + }, + { + "epoch": 2.3574468085106384, + "grad_norm": 3.114393711090088, + "learning_rate": 3.36097482055166e-06, + "loss": 0.4352, + "step": 4986 + }, + { + "epoch": 2.357919621749409, + "grad_norm": 2.964851140975952, + "learning_rate": 3.360389128329904e-06, + "loss": 0.5015, + "step": 4987 + }, + { + "epoch": 2.3583924349881795, + "grad_norm": 2.4819815158843994, + "learning_rate": 3.3598033825354893e-06, + "loss": 0.3459, + "step": 4988 + }, + { + "epoch": 2.3588652482269503, + "grad_norm": 2.635754346847534, + "learning_rate": 3.359217583204889e-06, + "loss": 0.4367, + "step": 4989 + }, + { + "epoch": 2.359338061465721, + "grad_norm": 2.542482376098633, + "learning_rate": 3.358631730374576e-06, + "loss": 0.3978, + "step": 4990 + }, + { + "epoch": 
2.359810874704492, + "grad_norm": 2.614018678665161, + "learning_rate": 3.358045824081031e-06, + "loss": 0.424, + "step": 4991 + }, + { + "epoch": 2.3602836879432623, + "grad_norm": 2.775373697280884, + "learning_rate": 3.3574598643607354e-06, + "loss": 0.4901, + "step": 4992 + }, + { + "epoch": 2.360756501182033, + "grad_norm": 3.091381311416626, + "learning_rate": 3.356873851250173e-06, + "loss": 0.4954, + "step": 4993 + }, + { + "epoch": 2.361229314420804, + "grad_norm": 2.440023422241211, + "learning_rate": 3.3562877847858337e-06, + "loss": 0.4053, + "step": 4994 + }, + { + "epoch": 2.3617021276595747, + "grad_norm": 2.8879518508911133, + "learning_rate": 3.3557016650042084e-06, + "loss": 0.4766, + "step": 4995 + }, + { + "epoch": 2.362174940898345, + "grad_norm": 3.1298391819000244, + "learning_rate": 3.355115491941793e-06, + "loss": 0.4743, + "step": 4996 + }, + { + "epoch": 2.362647754137116, + "grad_norm": 3.3325259685516357, + "learning_rate": 3.3545292656350845e-06, + "loss": 0.4703, + "step": 4997 + }, + { + "epoch": 2.3631205673758866, + "grad_norm": 2.7935359477996826, + "learning_rate": 3.353942986120587e-06, + "loss": 0.432, + "step": 4998 + }, + { + "epoch": 2.3635933806146574, + "grad_norm": 2.623624324798584, + "learning_rate": 3.3533566534348033e-06, + "loss": 0.4302, + "step": 4999 + }, + { + "epoch": 2.3640661938534278, + "grad_norm": 3.1467108726501465, + "learning_rate": 3.3527702676142426e-06, + "loss": 0.4661, + "step": 5000 + }, + { + "epoch": 2.3645390070921986, + "grad_norm": 2.5364840030670166, + "learning_rate": 3.352183828695418e-06, + "loss": 0.4134, + "step": 5001 + }, + { + "epoch": 2.3650118203309693, + "grad_norm": 3.002777338027954, + "learning_rate": 3.3515973367148415e-06, + "loss": 0.3771, + "step": 5002 + }, + { + "epoch": 2.36548463356974, + "grad_norm": 2.660043954849243, + "learning_rate": 3.3510107917090335e-06, + "loss": 0.4254, + "step": 5003 + }, + { + "epoch": 2.3659574468085105, + "grad_norm": 2.7041075229644775, + "learning_rate": 3.3504241937145148e-06, + "loss": 0.4651, + "step": 5004 + }, + { + "epoch": 2.3664302600472813, + "grad_norm": 2.7387280464172363, + "learning_rate": 3.349837542767811e-06, + "loss": 0.3874, + "step": 5005 + }, + { + "epoch": 2.366903073286052, + "grad_norm": 3.012188196182251, + "learning_rate": 3.349250838905449e-06, + "loss": 0.4508, + "step": 5006 + }, + { + "epoch": 2.3673758865248224, + "grad_norm": 2.3108484745025635, + "learning_rate": 3.3486640821639616e-06, + "loss": 0.3783, + "step": 5007 + }, + { + "epoch": 2.3678486997635932, + "grad_norm": 3.2188332080841064, + "learning_rate": 3.3480772725798837e-06, + "loss": 0.4879, + "step": 5008 + }, + { + "epoch": 2.368321513002364, + "grad_norm": 2.566087484359741, + "learning_rate": 3.3474904101897526e-06, + "loss": 0.3847, + "step": 5009 + }, + { + "epoch": 2.368794326241135, + "grad_norm": 2.5581698417663574, + "learning_rate": 3.3469034950301092e-06, + "loss": 0.4201, + "step": 5010 + }, + { + "epoch": 2.3692671394799056, + "grad_norm": 2.900296926498413, + "learning_rate": 3.3463165271374992e-06, + "loss": 0.4568, + "step": 5011 + }, + { + "epoch": 2.369739952718676, + "grad_norm": 2.8239312171936035, + "learning_rate": 3.34572950654847e-06, + "loss": 0.4583, + "step": 5012 + }, + { + "epoch": 2.370212765957447, + "grad_norm": 3.219465970993042, + "learning_rate": 3.3451424332995723e-06, + "loss": 0.5435, + "step": 5013 + }, + { + "epoch": 2.3706855791962176, + "grad_norm": 3.3111915588378906, + "learning_rate": 3.344555307427362e-06, + "loss": 0.435, + 
"step": 5014 + }, + { + "epoch": 2.371158392434988, + "grad_norm": 3.296668529510498, + "learning_rate": 3.3439681289683946e-06, + "loss": 0.4738, + "step": 5015 + }, + { + "epoch": 2.3716312056737587, + "grad_norm": 3.005722761154175, + "learning_rate": 3.343380897959234e-06, + "loss": 0.4267, + "step": 5016 + }, + { + "epoch": 2.3721040189125295, + "grad_norm": 2.7844085693359375, + "learning_rate": 3.3427936144364425e-06, + "loss": 0.4558, + "step": 5017 + }, + { + "epoch": 2.3725768321513003, + "grad_norm": 2.7532076835632324, + "learning_rate": 3.3422062784365884e-06, + "loss": 0.4144, + "step": 5018 + }, + { + "epoch": 2.373049645390071, + "grad_norm": 2.835764169692993, + "learning_rate": 3.3416188899962413e-06, + "loss": 0.4945, + "step": 5019 + }, + { + "epoch": 2.3735224586288415, + "grad_norm": 3.1513726711273193, + "learning_rate": 3.3410314491519767e-06, + "loss": 0.4971, + "step": 5020 + }, + { + "epoch": 2.3739952718676123, + "grad_norm": 3.0162220001220703, + "learning_rate": 3.3404439559403723e-06, + "loss": 0.4477, + "step": 5021 + }, + { + "epoch": 2.374468085106383, + "grad_norm": 2.676391363143921, + "learning_rate": 3.3398564103980073e-06, + "loss": 0.432, + "step": 5022 + }, + { + "epoch": 2.3749408983451534, + "grad_norm": 2.7806248664855957, + "learning_rate": 3.3392688125614663e-06, + "loss": 0.4818, + "step": 5023 + }, + { + "epoch": 2.3754137115839242, + "grad_norm": 2.968806505203247, + "learning_rate": 3.3386811624673373e-06, + "loss": 0.4893, + "step": 5024 + }, + { + "epoch": 2.375886524822695, + "grad_norm": 2.992684841156006, + "learning_rate": 3.3380934601522087e-06, + "loss": 0.4423, + "step": 5025 + }, + { + "epoch": 2.376359338061466, + "grad_norm": 2.578420639038086, + "learning_rate": 3.3375057056526762e-06, + "loss": 0.3682, + "step": 5026 + }, + { + "epoch": 2.3768321513002366, + "grad_norm": 2.7683115005493164, + "learning_rate": 3.336917899005335e-06, + "loss": 0.4038, + "step": 5027 + }, + { + "epoch": 2.377304964539007, + "grad_norm": 2.838812828063965, + "learning_rate": 3.336330040246786e-06, + "loss": 0.442, + "step": 5028 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 2.766136646270752, + "learning_rate": 3.335742129413633e-06, + "loss": 0.4745, + "step": 5029 + }, + { + "epoch": 2.3782505910165486, + "grad_norm": 2.862656593322754, + "learning_rate": 3.3351541665424812e-06, + "loss": 0.4324, + "step": 5030 + }, + { + "epoch": 2.378723404255319, + "grad_norm": 2.71425199508667, + "learning_rate": 3.3345661516699433e-06, + "loss": 0.4013, + "step": 5031 + }, + { + "epoch": 2.3791962174940897, + "grad_norm": 2.8404030799865723, + "learning_rate": 3.333978084832629e-06, + "loss": 0.5038, + "step": 5032 + }, + { + "epoch": 2.3796690307328605, + "grad_norm": 2.965851068496704, + "learning_rate": 3.3333899660671574e-06, + "loss": 0.4668, + "step": 5033 + }, + { + "epoch": 2.3801418439716313, + "grad_norm": 2.686452627182007, + "learning_rate": 3.3328017954101464e-06, + "loss": 0.4167, + "step": 5034 + }, + { + "epoch": 2.380614657210402, + "grad_norm": 2.8676156997680664, + "learning_rate": 3.3322135728982197e-06, + "loss": 0.4531, + "step": 5035 + }, + { + "epoch": 2.3810874704491725, + "grad_norm": 2.4456300735473633, + "learning_rate": 3.3316252985680026e-06, + "loss": 0.4173, + "step": 5036 + }, + { + "epoch": 2.3815602836879433, + "grad_norm": 2.5472559928894043, + "learning_rate": 3.331036972456124e-06, + "loss": 0.3926, + "step": 5037 + }, + { + "epoch": 2.382033096926714, + "grad_norm": 2.81900954246521, + "learning_rate": 
3.330448594599218e-06, + "loss": 0.4785, + "step": 5038 + }, + { + "epoch": 2.3825059101654844, + "grad_norm": 3.0930590629577637, + "learning_rate": 3.329860165033919e-06, + "loss": 0.4587, + "step": 5039 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 3.0553040504455566, + "learning_rate": 3.3292716837968673e-06, + "loss": 0.5285, + "step": 5040 + }, + { + "epoch": 2.383451536643026, + "grad_norm": 2.577580690383911, + "learning_rate": 3.328683150924704e-06, + "loss": 0.4184, + "step": 5041 + }, + { + "epoch": 2.383924349881797, + "grad_norm": 2.6430366039276123, + "learning_rate": 3.3280945664540735e-06, + "loss": 0.4636, + "step": 5042 + }, + { + "epoch": 2.3843971631205676, + "grad_norm": 3.228360891342163, + "learning_rate": 3.3275059304216255e-06, + "loss": 0.455, + "step": 5043 + }, + { + "epoch": 2.384869976359338, + "grad_norm": 2.776142120361328, + "learning_rate": 3.3269172428640125e-06, + "loss": 0.4785, + "step": 5044 + }, + { + "epoch": 2.3853427895981087, + "grad_norm": 2.755671739578247, + "learning_rate": 3.3263285038178882e-06, + "loss": 0.4625, + "step": 5045 + }, + { + "epoch": 2.3858156028368795, + "grad_norm": 3.061004400253296, + "learning_rate": 3.3257397133199114e-06, + "loss": 0.4641, + "step": 5046 + }, + { + "epoch": 2.38628841607565, + "grad_norm": 2.8391458988189697, + "learning_rate": 3.3251508714067432e-06, + "loss": 0.5003, + "step": 5047 + }, + { + "epoch": 2.3867612293144207, + "grad_norm": 2.390810966491699, + "learning_rate": 3.324561978115049e-06, + "loss": 0.4446, + "step": 5048 + }, + { + "epoch": 2.3872340425531915, + "grad_norm": 2.7760825157165527, + "learning_rate": 3.323973033481496e-06, + "loss": 0.4443, + "step": 5049 + }, + { + "epoch": 2.3877068557919623, + "grad_norm": 3.157893419265747, + "learning_rate": 3.3233840375427552e-06, + "loss": 0.4934, + "step": 5050 + }, + { + "epoch": 2.388179669030733, + "grad_norm": 2.7245349884033203, + "learning_rate": 3.3227949903355e-06, + "loss": 0.4254, + "step": 5051 + }, + { + "epoch": 2.3886524822695034, + "grad_norm": 2.6674044132232666, + "learning_rate": 3.322205891896409e-06, + "loss": 0.4116, + "step": 5052 + }, + { + "epoch": 2.3891252955082742, + "grad_norm": 3.1490554809570312, + "learning_rate": 3.3216167422621627e-06, + "loss": 0.4604, + "step": 5053 + }, + { + "epoch": 2.389598108747045, + "grad_norm": 2.725731134414673, + "learning_rate": 3.321027541469444e-06, + "loss": 0.4836, + "step": 5054 + }, + { + "epoch": 2.3900709219858154, + "grad_norm": 2.5378828048706055, + "learning_rate": 3.3204382895549407e-06, + "loss": 0.4228, + "step": 5055 + }, + { + "epoch": 2.390543735224586, + "grad_norm": 2.8191192150115967, + "learning_rate": 3.3198489865553427e-06, + "loss": 0.4371, + "step": 5056 + }, + { + "epoch": 2.391016548463357, + "grad_norm": 2.5676498413085938, + "learning_rate": 3.3192596325073433e-06, + "loss": 0.4463, + "step": 5057 + }, + { + "epoch": 2.391489361702128, + "grad_norm": 3.0846121311187744, + "learning_rate": 3.3186702274476397e-06, + "loss": 0.5049, + "step": 5058 + }, + { + "epoch": 2.3919621749408986, + "grad_norm": 2.6085152626037598, + "learning_rate": 3.3180807714129293e-06, + "loss": 0.4376, + "step": 5059 + }, + { + "epoch": 2.392434988179669, + "grad_norm": 3.0218591690063477, + "learning_rate": 3.3174912644399172e-06, + "loss": 0.4734, + "step": 5060 + }, + { + "epoch": 2.3929078014184397, + "grad_norm": 2.5904781818389893, + "learning_rate": 3.316901706565308e-06, + "loss": 0.4924, + "step": 5061 + }, + { + "epoch": 2.3933806146572105, + "grad_norm": 
2.675478458404541, + "learning_rate": 3.3163120978258123e-06, + "loss": 0.4072, + "step": 5062 + }, + { + "epoch": 2.393853427895981, + "grad_norm": 2.7944445610046387, + "learning_rate": 3.3157224382581415e-06, + "loss": 0.4328, + "step": 5063 + }, + { + "epoch": 2.3943262411347517, + "grad_norm": 2.846224546432495, + "learning_rate": 3.315132727899012e-06, + "loss": 0.4447, + "step": 5064 + }, + { + "epoch": 2.3947990543735225, + "grad_norm": 2.6825828552246094, + "learning_rate": 3.3145429667851402e-06, + "loss": 0.4528, + "step": 5065 + }, + { + "epoch": 2.3952718676122933, + "grad_norm": 3.0305285453796387, + "learning_rate": 3.3139531549532505e-06, + "loss": 0.4538, + "step": 5066 + }, + { + "epoch": 2.395744680851064, + "grad_norm": 2.707540988922119, + "learning_rate": 3.313363292440067e-06, + "loss": 0.4412, + "step": 5067 + }, + { + "epoch": 2.3962174940898344, + "grad_norm": 3.0458385944366455, + "learning_rate": 3.3127733792823173e-06, + "loss": 0.4587, + "step": 5068 + }, + { + "epoch": 2.396690307328605, + "grad_norm": 2.7711992263793945, + "learning_rate": 3.312183415516733e-06, + "loss": 0.4157, + "step": 5069 + }, + { + "epoch": 2.397163120567376, + "grad_norm": 2.6953988075256348, + "learning_rate": 3.3115934011800494e-06, + "loss": 0.3828, + "step": 5070 + }, + { + "epoch": 2.3976359338061464, + "grad_norm": 3.033721923828125, + "learning_rate": 3.311003336309003e-06, + "loss": 0.5204, + "step": 5071 + }, + { + "epoch": 2.398108747044917, + "grad_norm": 2.6134517192840576, + "learning_rate": 3.3104132209403355e-06, + "loss": 0.4181, + "step": 5072 + }, + { + "epoch": 2.398581560283688, + "grad_norm": 2.8800251483917236, + "learning_rate": 3.30982305511079e-06, + "loss": 0.466, + "step": 5073 + }, + { + "epoch": 2.3990543735224588, + "grad_norm": 2.5043210983276367, + "learning_rate": 3.309232838857114e-06, + "loss": 0.4161, + "step": 5074 + }, + { + "epoch": 2.3995271867612296, + "grad_norm": 2.6577322483062744, + "learning_rate": 3.308642572216057e-06, + "loss": 0.465, + "step": 5075 + }, + { + "epoch": 2.4, + "grad_norm": 2.549098253250122, + "learning_rate": 3.3080522552243734e-06, + "loss": 0.4571, + "step": 5076 + }, + { + "epoch": 2.4004728132387707, + "grad_norm": 2.881958246231079, + "learning_rate": 3.3074618879188186e-06, + "loss": 0.4443, + "step": 5077 + }, + { + "epoch": 2.4009456264775415, + "grad_norm": 2.608397960662842, + "learning_rate": 3.3068714703361528e-06, + "loss": 0.3843, + "step": 5078 + }, + { + "epoch": 2.401418439716312, + "grad_norm": 2.8666789531707764, + "learning_rate": 3.306281002513139e-06, + "loss": 0.4857, + "step": 5079 + }, + { + "epoch": 2.4018912529550827, + "grad_norm": 2.9008588790893555, + "learning_rate": 3.3056904844865422e-06, + "loss": 0.4454, + "step": 5080 + }, + { + "epoch": 2.4023640661938535, + "grad_norm": 2.7446060180664062, + "learning_rate": 3.3050999162931315e-06, + "loss": 0.4522, + "step": 5081 + }, + { + "epoch": 2.4028368794326243, + "grad_norm": 2.787116765975952, + "learning_rate": 3.3045092979696804e-06, + "loss": 0.4714, + "step": 5082 + }, + { + "epoch": 2.403309692671395, + "grad_norm": 2.7494192123413086, + "learning_rate": 3.3039186295529613e-06, + "loss": 0.4107, + "step": 5083 + }, + { + "epoch": 2.4037825059101654, + "grad_norm": 2.733794927597046, + "learning_rate": 3.303327911079755e-06, + "loss": 0.4169, + "step": 5084 + }, + { + "epoch": 2.404255319148936, + "grad_norm": 2.7313334941864014, + "learning_rate": 3.3027371425868422e-06, + "loss": 0.4287, + "step": 5085 + }, + { + "epoch": 
2.404728132387707, + "grad_norm": 2.7832977771759033, + "learning_rate": 3.3021463241110075e-06, + "loss": 0.5307, + "step": 5086 + }, + { + "epoch": 2.4052009456264773, + "grad_norm": 2.6615281105041504, + "learning_rate": 3.301555455689038e-06, + "loss": 0.4519, + "step": 5087 + }, + { + "epoch": 2.405673758865248, + "grad_norm": 2.343921422958374, + "learning_rate": 3.3009645373577264e-06, + "loss": 0.46, + "step": 5088 + }, + { + "epoch": 2.406146572104019, + "grad_norm": 2.6115355491638184, + "learning_rate": 3.300373569153864e-06, + "loss": 0.4782, + "step": 5089 + }, + { + "epoch": 2.4066193853427897, + "grad_norm": 2.730625629425049, + "learning_rate": 3.299782551114249e-06, + "loss": 0.4632, + "step": 5090 + }, + { + "epoch": 2.40709219858156, + "grad_norm": 2.4495043754577637, + "learning_rate": 3.2991914832756824e-06, + "loss": 0.4243, + "step": 5091 + }, + { + "epoch": 2.407565011820331, + "grad_norm": 2.8731648921966553, + "learning_rate": 3.2986003656749654e-06, + "loss": 0.4262, + "step": 5092 + }, + { + "epoch": 2.4080378250591017, + "grad_norm": 2.870342969894409, + "learning_rate": 3.2980091983489053e-06, + "loss": 0.4735, + "step": 5093 + }, + { + "epoch": 2.4085106382978725, + "grad_norm": 2.500786542892456, + "learning_rate": 3.297417981334312e-06, + "loss": 0.4007, + "step": 5094 + }, + { + "epoch": 2.408983451536643, + "grad_norm": 2.7787322998046875, + "learning_rate": 3.2968267146679978e-06, + "loss": 0.493, + "step": 5095 + }, + { + "epoch": 2.4094562647754136, + "grad_norm": 2.5229599475860596, + "learning_rate": 3.2962353983867783e-06, + "loss": 0.3676, + "step": 5096 + }, + { + "epoch": 2.4099290780141844, + "grad_norm": 3.1955904960632324, + "learning_rate": 3.2956440325274715e-06, + "loss": 0.4888, + "step": 5097 + }, + { + "epoch": 2.4104018912529552, + "grad_norm": 2.8580288887023926, + "learning_rate": 3.2950526171268995e-06, + "loss": 0.4892, + "step": 5098 + }, + { + "epoch": 2.4108747044917256, + "grad_norm": 2.6321749687194824, + "learning_rate": 3.294461152221887e-06, + "loss": 0.3823, + "step": 5099 + }, + { + "epoch": 2.4113475177304964, + "grad_norm": 2.881127119064331, + "learning_rate": 3.293869637849263e-06, + "loss": 0.4569, + "step": 5100 + }, + { + "epoch": 2.411820330969267, + "grad_norm": 2.7742316722869873, + "learning_rate": 3.293278074045857e-06, + "loss": 0.4445, + "step": 5101 + }, + { + "epoch": 2.412293144208038, + "grad_norm": 2.546701431274414, + "learning_rate": 3.2926864608485037e-06, + "loss": 0.3995, + "step": 5102 + }, + { + "epoch": 2.4127659574468083, + "grad_norm": 2.588226318359375, + "learning_rate": 3.292094798294041e-06, + "loss": 0.4081, + "step": 5103 + }, + { + "epoch": 2.413238770685579, + "grad_norm": 2.968689441680908, + "learning_rate": 3.2915030864193077e-06, + "loss": 0.4475, + "step": 5104 + }, + { + "epoch": 2.41371158392435, + "grad_norm": 2.9249184131622314, + "learning_rate": 3.290911325261148e-06, + "loss": 0.4763, + "step": 5105 + }, + { + "epoch": 2.4141843971631207, + "grad_norm": 2.817596673965454, + "learning_rate": 3.2903195148564083e-06, + "loss": 0.4451, + "step": 5106 + }, + { + "epoch": 2.414657210401891, + "grad_norm": 2.6465954780578613, + "learning_rate": 3.2897276552419377e-06, + "loss": 0.4665, + "step": 5107 + }, + { + "epoch": 2.415130023640662, + "grad_norm": 2.8613853454589844, + "learning_rate": 3.2891357464545885e-06, + "loss": 0.4398, + "step": 5108 + }, + { + "epoch": 2.4156028368794327, + "grad_norm": 2.756321907043457, + "learning_rate": 3.2885437885312175e-06, + "loss": 0.4634, + 
"step": 5109 + }, + { + "epoch": 2.4160756501182035, + "grad_norm": 2.8965282440185547, + "learning_rate": 3.287951781508682e-06, + "loss": 0.4319, + "step": 5110 + }, + { + "epoch": 2.416548463356974, + "grad_norm": 2.896756172180176, + "learning_rate": 3.287359725423844e-06, + "loss": 0.4771, + "step": 5111 + }, + { + "epoch": 2.4170212765957446, + "grad_norm": 2.952911376953125, + "learning_rate": 3.286767620313569e-06, + "loss": 0.5026, + "step": 5112 + }, + { + "epoch": 2.4174940898345154, + "grad_norm": 3.850515604019165, + "learning_rate": 3.2861754662147234e-06, + "loss": 0.4387, + "step": 5113 + }, + { + "epoch": 2.417966903073286, + "grad_norm": 3.0072689056396484, + "learning_rate": 3.2855832631641794e-06, + "loss": 0.4586, + "step": 5114 + }, + { + "epoch": 2.4184397163120566, + "grad_norm": 3.166790246963501, + "learning_rate": 3.2849910111988092e-06, + "loss": 0.4842, + "step": 5115 + }, + { + "epoch": 2.4189125295508274, + "grad_norm": 3.5397679805755615, + "learning_rate": 3.284398710355492e-06, + "loss": 0.5138, + "step": 5116 + }, + { + "epoch": 2.419385342789598, + "grad_norm": 2.779609441757202, + "learning_rate": 3.283806360671106e-06, + "loss": 0.4049, + "step": 5117 + }, + { + "epoch": 2.419858156028369, + "grad_norm": 2.5924575328826904, + "learning_rate": 3.283213962182535e-06, + "loss": 0.433, + "step": 5118 + }, + { + "epoch": 2.4203309692671393, + "grad_norm": 2.7429699897766113, + "learning_rate": 3.282621514926665e-06, + "loss": 0.4674, + "step": 5119 + }, + { + "epoch": 2.42080378250591, + "grad_norm": 2.8113889694213867, + "learning_rate": 3.2820290189403846e-06, + "loss": 0.3898, + "step": 5120 + }, + { + "epoch": 2.421276595744681, + "grad_norm": 2.867105722427368, + "learning_rate": 3.2814364742605863e-06, + "loss": 0.4439, + "step": 5121 + }, + { + "epoch": 2.4217494089834517, + "grad_norm": 2.428597927093506, + "learning_rate": 3.2808438809241654e-06, + "loss": 0.4339, + "step": 5122 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 3.071735143661499, + "learning_rate": 3.2802512389680203e-06, + "loss": 0.4583, + "step": 5123 + }, + { + "epoch": 2.422695035460993, + "grad_norm": 3.046313762664795, + "learning_rate": 3.279658548429051e-06, + "loss": 0.5351, + "step": 5124 + }, + { + "epoch": 2.4231678486997636, + "grad_norm": 2.8412697315216064, + "learning_rate": 3.279065809344163e-06, + "loss": 0.5258, + "step": 5125 + }, + { + "epoch": 2.4236406619385344, + "grad_norm": 2.887169122695923, + "learning_rate": 3.278473021750263e-06, + "loss": 0.4568, + "step": 5126 + }, + { + "epoch": 2.424113475177305, + "grad_norm": 2.8316574096679688, + "learning_rate": 3.2778801856842624e-06, + "loss": 0.46, + "step": 5127 + }, + { + "epoch": 2.4245862884160756, + "grad_norm": 2.7660772800445557, + "learning_rate": 3.277287301183073e-06, + "loss": 0.4323, + "step": 5128 + }, + { + "epoch": 2.4250591016548464, + "grad_norm": 2.737682819366455, + "learning_rate": 3.276694368283611e-06, + "loss": 0.4296, + "step": 5129 + }, + { + "epoch": 2.425531914893617, + "grad_norm": 2.8807425498962402, + "learning_rate": 3.276101387022797e-06, + "loss": 0.4673, + "step": 5130 + }, + { + "epoch": 2.4260047281323875, + "grad_norm": 2.530526876449585, + "learning_rate": 3.275508357437552e-06, + "loss": 0.416, + "step": 5131 + }, + { + "epoch": 2.4264775413711583, + "grad_norm": 3.1189746856689453, + "learning_rate": 3.274915279564803e-06, + "loss": 0.4171, + "step": 5132 + }, + { + "epoch": 2.426950354609929, + "grad_norm": 2.6612462997436523, + "learning_rate": 
3.274322153441477e-06, + "loss": 0.4104, + "step": 5133 + }, + { + "epoch": 2.4274231678487, + "grad_norm": 2.717973470687866, + "learning_rate": 3.2737289791045064e-06, + "loss": 0.479, + "step": 5134 + }, + { + "epoch": 2.4278959810874703, + "grad_norm": 2.764216661453247, + "learning_rate": 3.2731357565908247e-06, + "loss": 0.481, + "step": 5135 + }, + { + "epoch": 2.428368794326241, + "grad_norm": 2.5081393718719482, + "learning_rate": 3.272542485937369e-06, + "loss": 0.4592, + "step": 5136 + }, + { + "epoch": 2.428841607565012, + "grad_norm": 3.1380364894866943, + "learning_rate": 3.271949167181081e-06, + "loss": 0.4179, + "step": 5137 + }, + { + "epoch": 2.4293144208037827, + "grad_norm": 2.9275963306427, + "learning_rate": 3.2713558003589026e-06, + "loss": 0.5196, + "step": 5138 + }, + { + "epoch": 2.429787234042553, + "grad_norm": 2.8215506076812744, + "learning_rate": 3.270762385507781e-06, + "loss": 0.4081, + "step": 5139 + }, + { + "epoch": 2.430260047281324, + "grad_norm": 2.9185614585876465, + "learning_rate": 3.270168922664665e-06, + "loss": 0.4936, + "step": 5140 + }, + { + "epoch": 2.4307328605200946, + "grad_norm": 2.6507248878479004, + "learning_rate": 3.269575411866507e-06, + "loss": 0.4834, + "step": 5141 + }, + { + "epoch": 2.4312056737588654, + "grad_norm": 2.864741563796997, + "learning_rate": 3.2689818531502637e-06, + "loss": 0.4562, + "step": 5142 + }, + { + "epoch": 2.431678486997636, + "grad_norm": 2.806919813156128, + "learning_rate": 3.2683882465528917e-06, + "loss": 0.4645, + "step": 5143 + }, + { + "epoch": 2.4321513002364066, + "grad_norm": 2.733372211456299, + "learning_rate": 3.267794592111353e-06, + "loss": 0.4123, + "step": 5144 + }, + { + "epoch": 2.4326241134751774, + "grad_norm": 2.8005833625793457, + "learning_rate": 3.2672008898626116e-06, + "loss": 0.4343, + "step": 5145 + }, + { + "epoch": 2.433096926713948, + "grad_norm": 3.2339670658111572, + "learning_rate": 3.2666071398436354e-06, + "loss": 0.4017, + "step": 5146 + }, + { + "epoch": 2.4335697399527185, + "grad_norm": 2.510251760482788, + "learning_rate": 3.2660133420913932e-06, + "loss": 0.3882, + "step": 5147 + }, + { + "epoch": 2.4340425531914893, + "grad_norm": 3.5633628368377686, + "learning_rate": 3.26541949664286e-06, + "loss": 0.4766, + "step": 5148 + }, + { + "epoch": 2.43451536643026, + "grad_norm": 2.8246724605560303, + "learning_rate": 3.26482560353501e-06, + "loss": 0.3728, + "step": 5149 + }, + { + "epoch": 2.434988179669031, + "grad_norm": 2.4923641681671143, + "learning_rate": 3.264231662804823e-06, + "loss": 0.4346, + "step": 5150 + }, + { + "epoch": 2.4354609929078013, + "grad_norm": 3.180874824523926, + "learning_rate": 3.2636376744892827e-06, + "loss": 0.4351, + "step": 5151 + }, + { + "epoch": 2.435933806146572, + "grad_norm": 2.6933515071868896, + "learning_rate": 3.263043638625373e-06, + "loss": 0.4293, + "step": 5152 + }, + { + "epoch": 2.436406619385343, + "grad_norm": 2.584132194519043, + "learning_rate": 3.262449555250081e-06, + "loss": 0.4589, + "step": 5153 + }, + { + "epoch": 2.4368794326241137, + "grad_norm": 2.8103036880493164, + "learning_rate": 3.2618554244003985e-06, + "loss": 0.463, + "step": 5154 + }, + { + "epoch": 2.437352245862884, + "grad_norm": 2.809070587158203, + "learning_rate": 3.2612612461133197e-06, + "loss": 0.4629, + "step": 5155 + }, + { + "epoch": 2.437825059101655, + "grad_norm": 2.98148512840271, + "learning_rate": 3.2606670204258405e-06, + "loss": 0.451, + "step": 5156 + }, + { + "epoch": 2.4382978723404256, + "grad_norm": 
2.691047191619873, + "learning_rate": 3.2600727473749614e-06, + "loss": 0.3878, + "step": 5157 + }, + { + "epoch": 2.4387706855791964, + "grad_norm": 2.900360345840454, + "learning_rate": 3.2594784269976856e-06, + "loss": 0.4216, + "step": 5158 + }, + { + "epoch": 2.4392434988179668, + "grad_norm": 2.8449952602386475, + "learning_rate": 3.258884059331019e-06, + "loss": 0.4268, + "step": 5159 + }, + { + "epoch": 2.4397163120567376, + "grad_norm": 2.7226388454437256, + "learning_rate": 3.258289644411969e-06, + "loss": 0.4381, + "step": 5160 + }, + { + "epoch": 2.4401891252955084, + "grad_norm": 2.513946056365967, + "learning_rate": 3.257695182277547e-06, + "loss": 0.4566, + "step": 5161 + }, + { + "epoch": 2.440661938534279, + "grad_norm": 2.9941394329071045, + "learning_rate": 3.2571006729647693e-06, + "loss": 0.4395, + "step": 5162 + }, + { + "epoch": 2.4411347517730495, + "grad_norm": 2.699094533920288, + "learning_rate": 3.2565061165106523e-06, + "loss": 0.4274, + "step": 5163 + }, + { + "epoch": 2.4416075650118203, + "grad_norm": 2.574193000793457, + "learning_rate": 3.255911512952216e-06, + "loss": 0.4187, + "step": 5164 + }, + { + "epoch": 2.442080378250591, + "grad_norm": 2.920766592025757, + "learning_rate": 3.2553168623264854e-06, + "loss": 0.4911, + "step": 5165 + }, + { + "epoch": 2.4425531914893615, + "grad_norm": 2.728421926498413, + "learning_rate": 3.2547221646704853e-06, + "loss": 0.4466, + "step": 5166 + }, + { + "epoch": 2.4430260047281322, + "grad_norm": 2.8171417713165283, + "learning_rate": 3.254127420021246e-06, + "loss": 0.4331, + "step": 5167 + }, + { + "epoch": 2.443498817966903, + "grad_norm": 2.4069135189056396, + "learning_rate": 3.2535326284157975e-06, + "loss": 0.389, + "step": 5168 + }, + { + "epoch": 2.443971631205674, + "grad_norm": 2.912405490875244, + "learning_rate": 3.2529377898911777e-06, + "loss": 0.4681, + "step": 5169 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 2.987558126449585, + "learning_rate": 3.2523429044844228e-06, + "loss": 0.4715, + "step": 5170 + }, + { + "epoch": 2.444917257683215, + "grad_norm": 2.5117199420928955, + "learning_rate": 3.251747972232574e-06, + "loss": 0.4531, + "step": 5171 + }, + { + "epoch": 2.445390070921986, + "grad_norm": 2.5405385494232178, + "learning_rate": 3.2511529931726752e-06, + "loss": 0.4323, + "step": 5172 + }, + { + "epoch": 2.4458628841607566, + "grad_norm": 2.989932060241699, + "learning_rate": 3.250557967341773e-06, + "loss": 0.4039, + "step": 5173 + }, + { + "epoch": 2.446335697399527, + "grad_norm": 2.6331627368927, + "learning_rate": 3.2499628947769186e-06, + "loss": 0.5147, + "step": 5174 + }, + { + "epoch": 2.4468085106382977, + "grad_norm": 2.71699857711792, + "learning_rate": 3.249367775515162e-06, + "loss": 0.3748, + "step": 5175 + }, + { + "epoch": 2.4472813238770685, + "grad_norm": 2.9508471488952637, + "learning_rate": 3.2487726095935606e-06, + "loss": 0.5145, + "step": 5176 + }, + { + "epoch": 2.4477541371158393, + "grad_norm": 2.8276431560516357, + "learning_rate": 3.2481773970491713e-06, + "loss": 0.4295, + "step": 5177 + }, + { + "epoch": 2.44822695035461, + "grad_norm": 2.5500540733337402, + "learning_rate": 3.2475821379190565e-06, + "loss": 0.4246, + "step": 5178 + }, + { + "epoch": 2.4486997635933805, + "grad_norm": 2.845641613006592, + "learning_rate": 3.246986832240281e-06, + "loss": 0.4211, + "step": 5179 + }, + { + "epoch": 2.4491725768321513, + "grad_norm": 3.1215856075286865, + "learning_rate": 3.2463914800499097e-06, + "loss": 0.4378, + "step": 5180 + }, + { + "epoch": 
2.449645390070922, + "grad_norm": 2.4685606956481934, + "learning_rate": 3.2457960813850137e-06, + "loss": 0.4836, + "step": 5181 + }, + { + "epoch": 2.4501182033096924, + "grad_norm": 2.508028268814087, + "learning_rate": 3.245200636282666e-06, + "loss": 0.4377, + "step": 5182 + }, + { + "epoch": 2.4505910165484632, + "grad_norm": 2.899949312210083, + "learning_rate": 3.244605144779943e-06, + "loss": 0.501, + "step": 5183 + }, + { + "epoch": 2.451063829787234, + "grad_norm": 2.6494483947753906, + "learning_rate": 3.244009606913923e-06, + "loss": 0.4255, + "step": 5184 + }, + { + "epoch": 2.451536643026005, + "grad_norm": 2.4363760948181152, + "learning_rate": 3.243414022721686e-06, + "loss": 0.4402, + "step": 5185 + }, + { + "epoch": 2.4520094562647756, + "grad_norm": 2.4725022315979004, + "learning_rate": 3.242818392240317e-06, + "loss": 0.4388, + "step": 5186 + }, + { + "epoch": 2.452482269503546, + "grad_norm": 2.7010514736175537, + "learning_rate": 3.242222715506905e-06, + "loss": 0.4388, + "step": 5187 + }, + { + "epoch": 2.4529550827423168, + "grad_norm": 2.811464548110962, + "learning_rate": 3.241626992558539e-06, + "loss": 0.4634, + "step": 5188 + }, + { + "epoch": 2.4534278959810876, + "grad_norm": 2.6473052501678467, + "learning_rate": 3.2410312234323123e-06, + "loss": 0.4752, + "step": 5189 + }, + { + "epoch": 2.453900709219858, + "grad_norm": 2.5587213039398193, + "learning_rate": 3.24043540816532e-06, + "loss": 0.4458, + "step": 5190 + }, + { + "epoch": 2.4543735224586287, + "grad_norm": 2.6306557655334473, + "learning_rate": 3.239839546794662e-06, + "loss": 0.4081, + "step": 5191 + }, + { + "epoch": 2.4548463356973995, + "grad_norm": 2.4613633155822754, + "learning_rate": 3.23924363935744e-06, + "loss": 0.4165, + "step": 5192 + }, + { + "epoch": 2.4553191489361703, + "grad_norm": 2.7189204692840576, + "learning_rate": 3.238647685890757e-06, + "loss": 0.4822, + "step": 5193 + }, + { + "epoch": 2.455791962174941, + "grad_norm": 3.015977382659912, + "learning_rate": 3.238051686431722e-06, + "loss": 0.4964, + "step": 5194 + }, + { + "epoch": 2.4562647754137115, + "grad_norm": 2.8868937492370605, + "learning_rate": 3.2374556410174445e-06, + "loss": 0.4514, + "step": 5195 + }, + { + "epoch": 2.4567375886524823, + "grad_norm": 2.7959537506103516, + "learning_rate": 3.2368595496850375e-06, + "loss": 0.475, + "step": 5196 + }, + { + "epoch": 2.457210401891253, + "grad_norm": 3.0086777210235596, + "learning_rate": 3.2362634124716187e-06, + "loss": 0.4913, + "step": 5197 + }, + { + "epoch": 2.4576832151300234, + "grad_norm": 2.621335506439209, + "learning_rate": 3.2356672294143044e-06, + "loss": 0.4259, + "step": 5198 + }, + { + "epoch": 2.458156028368794, + "grad_norm": 3.1620380878448486, + "learning_rate": 3.235071000550218e-06, + "loss": 0.451, + "step": 5199 + }, + { + "epoch": 2.458628841607565, + "grad_norm": 2.7663278579711914, + "learning_rate": 3.234474725916484e-06, + "loss": 0.3854, + "step": 5200 + }, + { + "epoch": 2.459101654846336, + "grad_norm": 2.5187132358551025, + "learning_rate": 3.2338784055502288e-06, + "loss": 0.4068, + "step": 5201 + }, + { + "epoch": 2.4595744680851066, + "grad_norm": 2.6022701263427734, + "learning_rate": 3.233282039488583e-06, + "loss": 0.4484, + "step": 5202 + }, + { + "epoch": 2.460047281323877, + "grad_norm": 2.874750852584839, + "learning_rate": 3.2326856277686807e-06, + "loss": 0.45, + "step": 5203 + }, + { + "epoch": 2.4605200945626478, + "grad_norm": 2.671008586883545, + "learning_rate": 3.232089170427656e-06, + "loss": 0.4446, + 
"step": 5204 + }, + { + "epoch": 2.4609929078014185, + "grad_norm": 2.7365503311157227, + "learning_rate": 3.2314926675026498e-06, + "loss": 0.4402, + "step": 5205 + }, + { + "epoch": 2.461465721040189, + "grad_norm": 2.8163657188415527, + "learning_rate": 3.230896119030803e-06, + "loss": 0.3881, + "step": 5206 + }, + { + "epoch": 2.4619385342789597, + "grad_norm": 2.812433958053589, + "learning_rate": 3.2302995250492584e-06, + "loss": 0.4897, + "step": 5207 + }, + { + "epoch": 2.4624113475177305, + "grad_norm": 2.786033868789673, + "learning_rate": 3.2297028855951664e-06, + "loss": 0.4069, + "step": 5208 + }, + { + "epoch": 2.4628841607565013, + "grad_norm": 3.0247974395751953, + "learning_rate": 3.229106200705674e-06, + "loss": 0.4048, + "step": 5209 + }, + { + "epoch": 2.463356973995272, + "grad_norm": 3.3280487060546875, + "learning_rate": 3.2285094704179353e-06, + "loss": 0.5613, + "step": 5210 + }, + { + "epoch": 2.4638297872340424, + "grad_norm": 2.603219985961914, + "learning_rate": 3.2279126947691073e-06, + "loss": 0.432, + "step": 5211 + }, + { + "epoch": 2.4643026004728132, + "grad_norm": 3.1532180309295654, + "learning_rate": 3.2273158737963472e-06, + "loss": 0.4602, + "step": 5212 + }, + { + "epoch": 2.464775413711584, + "grad_norm": 2.7512969970703125, + "learning_rate": 3.2267190075368164e-06, + "loss": 0.5064, + "step": 5213 + }, + { + "epoch": 2.4652482269503544, + "grad_norm": 2.926992177963257, + "learning_rate": 3.22612209602768e-06, + "loss": 0.4753, + "step": 5214 + }, + { + "epoch": 2.465721040189125, + "grad_norm": 4.052840709686279, + "learning_rate": 3.2255251393061047e-06, + "loss": 0.5235, + "step": 5215 + }, + { + "epoch": 2.466193853427896, + "grad_norm": 2.8266959190368652, + "learning_rate": 3.2249281374092606e-06, + "loss": 0.3931, + "step": 5216 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 2.564359426498413, + "learning_rate": 3.2243310903743196e-06, + "loss": 0.4146, + "step": 5217 + }, + { + "epoch": 2.4671394799054376, + "grad_norm": 2.387925148010254, + "learning_rate": 3.2237339982384576e-06, + "loss": 0.4142, + "step": 5218 + }, + { + "epoch": 2.467612293144208, + "grad_norm": 2.7045164108276367, + "learning_rate": 3.223136861038853e-06, + "loss": 0.4345, + "step": 5219 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 2.6963284015655518, + "learning_rate": 3.2225396788126872e-06, + "loss": 0.4243, + "step": 5220 + }, + { + "epoch": 2.4685579196217495, + "grad_norm": 2.8247268199920654, + "learning_rate": 3.221942451597144e-06, + "loss": 0.3919, + "step": 5221 + }, + { + "epoch": 2.46903073286052, + "grad_norm": 3.843836784362793, + "learning_rate": 3.2213451794294093e-06, + "loss": 0.4183, + "step": 5222 + }, + { + "epoch": 2.4695035460992907, + "grad_norm": 2.8579909801483154, + "learning_rate": 3.220747862346674e-06, + "loss": 0.4844, + "step": 5223 + }, + { + "epoch": 2.4699763593380615, + "grad_norm": 3.744027853012085, + "learning_rate": 3.2201505003861294e-06, + "loss": 0.4563, + "step": 5224 + }, + { + "epoch": 2.4704491725768323, + "grad_norm": 2.835108995437622, + "learning_rate": 3.219553093584971e-06, + "loss": 0.4394, + "step": 5225 + }, + { + "epoch": 2.470921985815603, + "grad_norm": 2.5681865215301514, + "learning_rate": 3.218955641980397e-06, + "loss": 0.3907, + "step": 5226 + }, + { + "epoch": 2.4713947990543734, + "grad_norm": 2.963172674179077, + "learning_rate": 3.2183581456096067e-06, + "loss": 0.5163, + "step": 5227 + }, + { + "epoch": 2.4718676122931442, + "grad_norm": 2.7840685844421387, + "learning_rate": 
3.2177606045098047e-06, + "loss": 0.411, + "step": 5228 + }, + { + "epoch": 2.472340425531915, + "grad_norm": 2.7849979400634766, + "learning_rate": 3.2171630187181977e-06, + "loss": 0.4671, + "step": 5229 + }, + { + "epoch": 2.4728132387706854, + "grad_norm": 2.736406087875366, + "learning_rate": 3.216565388271994e-06, + "loss": 0.5225, + "step": 5230 + }, + { + "epoch": 2.473286052009456, + "grad_norm": 2.978271007537842, + "learning_rate": 3.215967713208406e-06, + "loss": 0.4668, + "step": 5231 + }, + { + "epoch": 2.473758865248227, + "grad_norm": 2.687560796737671, + "learning_rate": 3.2153699935646475e-06, + "loss": 0.4683, + "step": 5232 + }, + { + "epoch": 2.4742316784869978, + "grad_norm": 2.7096521854400635, + "learning_rate": 3.214772229377936e-06, + "loss": 0.4999, + "step": 5233 + }, + { + "epoch": 2.4747044917257686, + "grad_norm": 3.1861157417297363, + "learning_rate": 3.214174420685493e-06, + "loss": 0.4365, + "step": 5234 + }, + { + "epoch": 2.475177304964539, + "grad_norm": 2.623061418533325, + "learning_rate": 3.2135765675245394e-06, + "loss": 0.3717, + "step": 5235 + }, + { + "epoch": 2.4756501182033097, + "grad_norm": 2.680921792984009, + "learning_rate": 3.2129786699323016e-06, + "loss": 0.4688, + "step": 5236 + }, + { + "epoch": 2.4761229314420805, + "grad_norm": 2.80426025390625, + "learning_rate": 3.2123807279460096e-06, + "loss": 0.5043, + "step": 5237 + }, + { + "epoch": 2.476595744680851, + "grad_norm": 2.676156997680664, + "learning_rate": 3.211782741602893e-06, + "loss": 0.4486, + "step": 5238 + }, + { + "epoch": 2.4770685579196217, + "grad_norm": 2.700822591781616, + "learning_rate": 3.2111847109401855e-06, + "loss": 0.4097, + "step": 5239 + }, + { + "epoch": 2.4775413711583925, + "grad_norm": 2.735387086868286, + "learning_rate": 3.2105866359951254e-06, + "loss": 0.4357, + "step": 5240 + }, + { + "epoch": 2.4780141843971633, + "grad_norm": 2.961874485015869, + "learning_rate": 3.2099885168049507e-06, + "loss": 0.4942, + "step": 5241 + }, + { + "epoch": 2.478486997635934, + "grad_norm": 2.546588659286499, + "learning_rate": 3.209390353406904e-06, + "loss": 0.3852, + "step": 5242 + }, + { + "epoch": 2.4789598108747044, + "grad_norm": 2.6269772052764893, + "learning_rate": 3.208792145838231e-06, + "loss": 0.3935, + "step": 5243 + }, + { + "epoch": 2.479432624113475, + "grad_norm": 2.9009883403778076, + "learning_rate": 3.208193894136179e-06, + "loss": 0.4003, + "step": 5244 + }, + { + "epoch": 2.479905437352246, + "grad_norm": 2.772834300994873, + "learning_rate": 3.2075955983379982e-06, + "loss": 0.4742, + "step": 5245 + }, + { + "epoch": 2.4803782505910164, + "grad_norm": 2.728703737258911, + "learning_rate": 3.2069972584809423e-06, + "loss": 0.4405, + "step": 5246 + }, + { + "epoch": 2.480851063829787, + "grad_norm": 2.72868275642395, + "learning_rate": 3.206398874602268e-06, + "loss": 0.4714, + "step": 5247 + }, + { + "epoch": 2.481323877068558, + "grad_norm": 2.6804213523864746, + "learning_rate": 3.2058004467392323e-06, + "loss": 0.4106, + "step": 5248 + }, + { + "epoch": 2.4817966903073287, + "grad_norm": 2.6740739345550537, + "learning_rate": 3.205201974929098e-06, + "loss": 0.3855, + "step": 5249 + }, + { + "epoch": 2.482269503546099, + "grad_norm": 2.8131754398345947, + "learning_rate": 3.204603459209129e-06, + "loss": 0.418, + "step": 5250 + }, + { + "epoch": 2.48274231678487, + "grad_norm": 2.5242888927459717, + "learning_rate": 3.204004899616592e-06, + "loss": 0.4914, + "step": 5251 + }, + { + "epoch": 2.4832151300236407, + "grad_norm": 
2.969191551208496, + "learning_rate": 3.2034062961887567e-06, + "loss": 0.4634, + "step": 5252 + }, + { + "epoch": 2.4836879432624115, + "grad_norm": 2.967968463897705, + "learning_rate": 3.2028076489628963e-06, + "loss": 0.456, + "step": 5253 + }, + { + "epoch": 2.484160756501182, + "grad_norm": 2.9006540775299072, + "learning_rate": 3.2022089579762845e-06, + "loss": 0.4203, + "step": 5254 + }, + { + "epoch": 2.4846335697399526, + "grad_norm": 2.6377336978912354, + "learning_rate": 3.2016102232662003e-06, + "loss": 0.4518, + "step": 5255 + }, + { + "epoch": 2.4851063829787234, + "grad_norm": 2.757749319076538, + "learning_rate": 3.201011444869925e-06, + "loss": 0.4314, + "step": 5256 + }, + { + "epoch": 2.4855791962174942, + "grad_norm": 2.571560859680176, + "learning_rate": 3.20041262282474e-06, + "loss": 0.427, + "step": 5257 + }, + { + "epoch": 2.4860520094562646, + "grad_norm": 3.1367194652557373, + "learning_rate": 3.1998137571679316e-06, + "loss": 0.4901, + "step": 5258 + }, + { + "epoch": 2.4865248226950354, + "grad_norm": 3.194042205810547, + "learning_rate": 3.1992148479367896e-06, + "loss": 0.466, + "step": 5259 + }, + { + "epoch": 2.486997635933806, + "grad_norm": 2.5546324253082275, + "learning_rate": 3.1986158951686052e-06, + "loss": 0.4182, + "step": 5260 + }, + { + "epoch": 2.487470449172577, + "grad_norm": 2.919783115386963, + "learning_rate": 3.198016898900672e-06, + "loss": 0.4234, + "step": 5261 + }, + { + "epoch": 2.4879432624113473, + "grad_norm": 2.865248918533325, + "learning_rate": 3.1974178591702877e-06, + "loss": 0.4291, + "step": 5262 + }, + { + "epoch": 2.488416075650118, + "grad_norm": 2.685737133026123, + "learning_rate": 3.196818776014752e-06, + "loss": 0.4548, + "step": 5263 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 2.826974630355835, + "learning_rate": 3.196219649471365e-06, + "loss": 0.4152, + "step": 5264 + }, + { + "epoch": 2.4893617021276597, + "grad_norm": 2.764975070953369, + "learning_rate": 3.1956204795774336e-06, + "loss": 0.5209, + "step": 5265 + }, + { + "epoch": 2.48983451536643, + "grad_norm": 2.4184255599975586, + "learning_rate": 3.1950212663702662e-06, + "loss": 0.3969, + "step": 5266 + }, + { + "epoch": 2.490307328605201, + "grad_norm": 2.9361133575439453, + "learning_rate": 3.1944220098871713e-06, + "loss": 0.4589, + "step": 5267 + }, + { + "epoch": 2.4907801418439717, + "grad_norm": 2.377051830291748, + "learning_rate": 3.193822710165463e-06, + "loss": 0.4328, + "step": 5268 + }, + { + "epoch": 2.4912529550827425, + "grad_norm": 3.1302497386932373, + "learning_rate": 3.1932233672424563e-06, + "loss": 0.3918, + "step": 5269 + }, + { + "epoch": 2.491725768321513, + "grad_norm": 2.89577579498291, + "learning_rate": 3.192623981155471e-06, + "loss": 0.5004, + "step": 5270 + }, + { + "epoch": 2.4921985815602836, + "grad_norm": 2.7735235691070557, + "learning_rate": 3.1920245519418273e-06, + "loss": 0.4206, + "step": 5271 + }, + { + "epoch": 2.4926713947990544, + "grad_norm": 2.5424516201019287, + "learning_rate": 3.1914250796388493e-06, + "loss": 0.4419, + "step": 5272 + }, + { + "epoch": 2.493144208037825, + "grad_norm": 3.1216981410980225, + "learning_rate": 3.1908255642838628e-06, + "loss": 0.4552, + "step": 5273 + }, + { + "epoch": 2.4936170212765956, + "grad_norm": 3.044045925140381, + "learning_rate": 3.1902260059141978e-06, + "loss": 0.4967, + "step": 5274 + }, + { + "epoch": 2.4940898345153664, + "grad_norm": 2.5630741119384766, + "learning_rate": 3.189626404567186e-06, + "loss": 0.3908, + "step": 5275 + }, + { + "epoch": 
2.494562647754137, + "grad_norm": 2.7177648544311523, + "learning_rate": 3.189026760280162e-06, + "loss": 0.4915, + "step": 5276 + }, + { + "epoch": 2.495035460992908, + "grad_norm": 2.653416395187378, + "learning_rate": 3.1884270730904632e-06, + "loss": 0.4633, + "step": 5277 + }, + { + "epoch": 2.4955082742316783, + "grad_norm": 3.7212321758270264, + "learning_rate": 3.1878273430354284e-06, + "loss": 0.4549, + "step": 5278 + }, + { + "epoch": 2.495981087470449, + "grad_norm": 2.4152729511260986, + "learning_rate": 3.187227570152402e-06, + "loss": 0.4674, + "step": 5279 + }, + { + "epoch": 2.49645390070922, + "grad_norm": 2.5354862213134766, + "learning_rate": 3.1866277544787284e-06, + "loss": 0.4135, + "step": 5280 + }, + { + "epoch": 2.4969267139479907, + "grad_norm": 3.1766583919525146, + "learning_rate": 3.186027896051754e-06, + "loss": 0.5656, + "step": 5281 + }, + { + "epoch": 2.497399527186761, + "grad_norm": 2.5636754035949707, + "learning_rate": 3.1854279949088313e-06, + "loss": 0.4138, + "step": 5282 + }, + { + "epoch": 2.497872340425532, + "grad_norm": 2.7615602016448975, + "learning_rate": 3.1848280510873124e-06, + "loss": 0.4936, + "step": 5283 + }, + { + "epoch": 2.4983451536643027, + "grad_norm": 2.964721918106079, + "learning_rate": 3.1842280646245543e-06, + "loss": 0.4865, + "step": 5284 + }, + { + "epoch": 2.4988179669030735, + "grad_norm": 2.6915178298950195, + "learning_rate": 3.1836280355579152e-06, + "loss": 0.4179, + "step": 5285 + }, + { + "epoch": 2.499290780141844, + "grad_norm": 2.820451259613037, + "learning_rate": 3.183027963924755e-06, + "loss": 0.4785, + "step": 5286 + }, + { + "epoch": 2.4997635933806146, + "grad_norm": 2.841719627380371, + "learning_rate": 3.1824278497624393e-06, + "loss": 0.4535, + "step": 5287 + }, + { + "epoch": 2.5002364066193854, + "grad_norm": 2.459167957305908, + "learning_rate": 3.181827693108333e-06, + "loss": 0.4353, + "step": 5288 + }, + { + "epoch": 2.500709219858156, + "grad_norm": 3.2538363933563232, + "learning_rate": 3.1812274939998066e-06, + "loss": 0.4037, + "step": 5289 + }, + { + "epoch": 2.5011820330969265, + "grad_norm": 2.6980504989624023, + "learning_rate": 3.180627252474231e-06, + "loss": 0.4181, + "step": 5290 + }, + { + "epoch": 2.5016548463356973, + "grad_norm": 2.9400012493133545, + "learning_rate": 3.1800269685689804e-06, + "loss": 0.4642, + "step": 5291 + }, + { + "epoch": 2.502127659574468, + "grad_norm": 2.7832958698272705, + "learning_rate": 3.1794266423214328e-06, + "loss": 0.3936, + "step": 5292 + }, + { + "epoch": 2.5026004728132385, + "grad_norm": 2.4017868041992188, + "learning_rate": 3.178826273768967e-06, + "loss": 0.3984, + "step": 5293 + }, + { + "epoch": 2.5030732860520093, + "grad_norm": 2.398120641708374, + "learning_rate": 3.1782258629489665e-06, + "loss": 0.4219, + "step": 5294 + }, + { + "epoch": 2.50354609929078, + "grad_norm": 2.973947763442993, + "learning_rate": 3.177625409898815e-06, + "loss": 0.4192, + "step": 5295 + }, + { + "epoch": 2.504018912529551, + "grad_norm": 3.1169888973236084, + "learning_rate": 3.1770249146559006e-06, + "loss": 0.5098, + "step": 5296 + }, + { + "epoch": 2.5044917257683217, + "grad_norm": 2.816964864730835, + "learning_rate": 3.1764243772576132e-06, + "loss": 0.4228, + "step": 5297 + }, + { + "epoch": 2.504964539007092, + "grad_norm": 2.5624163150787354, + "learning_rate": 3.1758237977413452e-06, + "loss": 0.4389, + "step": 5298 + }, + { + "epoch": 2.505437352245863, + "grad_norm": 2.7477777004241943, + "learning_rate": 3.175223176144494e-06, + "loss": 
0.4564, + "step": 5299 + }, + { + "epoch": 2.5059101654846336, + "grad_norm": 3.1478309631347656, + "learning_rate": 3.174622512504456e-06, + "loss": 0.4859, + "step": 5300 + }, + { + "epoch": 2.506382978723404, + "grad_norm": 2.8400418758392334, + "learning_rate": 3.1740218068586315e-06, + "loss": 0.4476, + "step": 5301 + }, + { + "epoch": 2.506855791962175, + "grad_norm": 2.7097036838531494, + "learning_rate": 3.173421059244426e-06, + "loss": 0.4559, + "step": 5302 + }, + { + "epoch": 2.5073286052009456, + "grad_norm": 2.864760637283325, + "learning_rate": 3.172820269699243e-06, + "loss": 0.5124, + "step": 5303 + }, + { + "epoch": 2.5078014184397164, + "grad_norm": 2.877110004425049, + "learning_rate": 3.1722194382604926e-06, + "loss": 0.5083, + "step": 5304 + }, + { + "epoch": 2.508274231678487, + "grad_norm": 3.2369656562805176, + "learning_rate": 3.1716185649655844e-06, + "loss": 0.4894, + "step": 5305 + }, + { + "epoch": 2.5087470449172575, + "grad_norm": 2.7377753257751465, + "learning_rate": 3.171017649851934e-06, + "loss": 0.4324, + "step": 5306 + }, + { + "epoch": 2.5092198581560283, + "grad_norm": 2.883364200592041, + "learning_rate": 3.1704166929569564e-06, + "loss": 0.3731, + "step": 5307 + }, + { + "epoch": 2.509692671394799, + "grad_norm": 2.5724737644195557, + "learning_rate": 3.1698156943180716e-06, + "loss": 0.4768, + "step": 5308 + }, + { + "epoch": 2.5101654846335695, + "grad_norm": 2.7532460689544678, + "learning_rate": 3.1692146539727e-06, + "loss": 0.4385, + "step": 5309 + }, + { + "epoch": 2.5106382978723403, + "grad_norm": 2.786505699157715, + "learning_rate": 3.168613571958267e-06, + "loss": 0.4241, + "step": 5310 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 3.1674118041992188, + "learning_rate": 3.1680124483121975e-06, + "loss": 0.4445, + "step": 5311 + }, + { + "epoch": 2.511583924349882, + "grad_norm": 2.7861545085906982, + "learning_rate": 3.167411283071923e-06, + "loss": 0.4264, + "step": 5312 + }, + { + "epoch": 2.5120567375886527, + "grad_norm": 2.7412493228912354, + "learning_rate": 3.1668100762748745e-06, + "loss": 0.4725, + "step": 5313 + }, + { + "epoch": 2.512529550827423, + "grad_norm": 2.710019588470459, + "learning_rate": 3.1662088279584858e-06, + "loss": 0.5207, + "step": 5314 + }, + { + "epoch": 2.513002364066194, + "grad_norm": 2.694812297821045, + "learning_rate": 3.165607538160194e-06, + "loss": 0.3666, + "step": 5315 + }, + { + "epoch": 2.5134751773049646, + "grad_norm": 2.4390623569488525, + "learning_rate": 3.1650062069174405e-06, + "loss": 0.4025, + "step": 5316 + }, + { + "epoch": 2.513947990543735, + "grad_norm": 3.055738925933838, + "learning_rate": 3.1644048342676663e-06, + "loss": 0.4288, + "step": 5317 + }, + { + "epoch": 2.5144208037825058, + "grad_norm": 3.065824508666992, + "learning_rate": 3.163803420248316e-06, + "loss": 0.4592, + "step": 5318 + }, + { + "epoch": 2.5148936170212766, + "grad_norm": 2.6011085510253906, + "learning_rate": 3.163201964896838e-06, + "loss": 0.4081, + "step": 5319 + }, + { + "epoch": 2.5153664302600474, + "grad_norm": 2.4833033084869385, + "learning_rate": 3.162600468250681e-06, + "loss": 0.4343, + "step": 5320 + }, + { + "epoch": 2.515839243498818, + "grad_norm": 2.9035534858703613, + "learning_rate": 3.161998930347299e-06, + "loss": 0.4972, + "step": 5321 + }, + { + "epoch": 2.5163120567375885, + "grad_norm": 2.788752317428589, + "learning_rate": 3.161397351224146e-06, + "loss": 0.4597, + "step": 5322 + }, + { + "epoch": 2.5167848699763593, + "grad_norm": 2.4344491958618164, + 
"learning_rate": 3.16079573091868e-06, + "loss": 0.359, + "step": 5323 + }, + { + "epoch": 2.51725768321513, + "grad_norm": 2.750150680541992, + "learning_rate": 3.160194069468361e-06, + "loss": 0.4596, + "step": 5324 + }, + { + "epoch": 2.5177304964539005, + "grad_norm": 2.826902389526367, + "learning_rate": 3.1595923669106526e-06, + "loss": 0.4377, + "step": 5325 + }, + { + "epoch": 2.5182033096926713, + "grad_norm": 2.554439067840576, + "learning_rate": 3.15899062328302e-06, + "loss": 0.4517, + "step": 5326 + }, + { + "epoch": 2.518676122931442, + "grad_norm": 3.0882742404937744, + "learning_rate": 3.158388838622931e-06, + "loss": 0.47, + "step": 5327 + }, + { + "epoch": 2.519148936170213, + "grad_norm": 2.918947696685791, + "learning_rate": 3.157787012967856e-06, + "loss": 0.522, + "step": 5328 + }, + { + "epoch": 2.5196217494089836, + "grad_norm": 2.8057637214660645, + "learning_rate": 3.1571851463552674e-06, + "loss": 0.4837, + "step": 5329 + }, + { + "epoch": 2.520094562647754, + "grad_norm": 2.66241455078125, + "learning_rate": 3.156583238822641e-06, + "loss": 0.3988, + "step": 5330 + }, + { + "epoch": 2.520567375886525, + "grad_norm": 2.9793803691864014, + "learning_rate": 3.155981290407456e-06, + "loss": 0.4737, + "step": 5331 + }, + { + "epoch": 2.5210401891252956, + "grad_norm": 2.847522258758545, + "learning_rate": 3.1553793011471924e-06, + "loss": 0.4394, + "step": 5332 + }, + { + "epoch": 2.521513002364066, + "grad_norm": 2.9561474323272705, + "learning_rate": 3.154777271079333e-06, + "loss": 0.47, + "step": 5333 + }, + { + "epoch": 2.5219858156028367, + "grad_norm": 2.8353018760681152, + "learning_rate": 3.154175200241365e-06, + "loss": 0.4015, + "step": 5334 + }, + { + "epoch": 2.5224586288416075, + "grad_norm": 2.609049081802368, + "learning_rate": 3.153573088670775e-06, + "loss": 0.4723, + "step": 5335 + }, + { + "epoch": 2.5229314420803783, + "grad_norm": 2.8538455963134766, + "learning_rate": 3.1529709364050556e-06, + "loss": 0.4665, + "step": 5336 + }, + { + "epoch": 2.523404255319149, + "grad_norm": 2.768310785293579, + "learning_rate": 3.1523687434816978e-06, + "loss": 0.4933, + "step": 5337 + }, + { + "epoch": 2.5238770685579195, + "grad_norm": 2.9300906658172607, + "learning_rate": 3.1517665099382e-06, + "loss": 0.4651, + "step": 5338 + }, + { + "epoch": 2.5243498817966903, + "grad_norm": 2.6984703540802, + "learning_rate": 3.1511642358120585e-06, + "loss": 0.4442, + "step": 5339 + }, + { + "epoch": 2.524822695035461, + "grad_norm": 2.8148467540740967, + "learning_rate": 3.1505619211407762e-06, + "loss": 0.4611, + "step": 5340 + }, + { + "epoch": 2.5252955082742314, + "grad_norm": 2.816436290740967, + "learning_rate": 3.1499595659618556e-06, + "loss": 0.5291, + "step": 5341 + }, + { + "epoch": 2.5257683215130022, + "grad_norm": 2.902805805206299, + "learning_rate": 3.149357170312802e-06, + "loss": 0.4394, + "step": 5342 + }, + { + "epoch": 2.526241134751773, + "grad_norm": 2.6443474292755127, + "learning_rate": 3.148754734231126e-06, + "loss": 0.4444, + "step": 5343 + }, + { + "epoch": 2.526713947990544, + "grad_norm": 2.6818583011627197, + "learning_rate": 3.148152257754336e-06, + "loss": 0.4256, + "step": 5344 + }, + { + "epoch": 2.5271867612293146, + "grad_norm": 2.5266945362091064, + "learning_rate": 3.1475497409199485e-06, + "loss": 0.4087, + "step": 5345 + }, + { + "epoch": 2.527659574468085, + "grad_norm": 2.6326711177825928, + "learning_rate": 3.146947183765477e-06, + "loss": 0.3842, + "step": 5346 + }, + { + "epoch": 2.5281323877068558, + "grad_norm": 
3.122880697250366, + "learning_rate": 3.1463445863284413e-06, + "loss": 0.482, + "step": 5347 + }, + { + "epoch": 2.5286052009456266, + "grad_norm": 2.819258213043213, + "learning_rate": 3.145741948646362e-06, + "loss": 0.4628, + "step": 5348 + }, + { + "epoch": 2.529078014184397, + "grad_norm": 2.5842230319976807, + "learning_rate": 3.145139270756764e-06, + "loss": 0.4479, + "step": 5349 + }, + { + "epoch": 2.5295508274231677, + "grad_norm": 2.7257237434387207, + "learning_rate": 3.144536552697172e-06, + "loss": 0.473, + "step": 5350 + }, + { + "epoch": 2.5300236406619385, + "grad_norm": 2.6876981258392334, + "learning_rate": 3.143933794505115e-06, + "loss": 0.4615, + "step": 5351 + }, + { + "epoch": 2.5304964539007093, + "grad_norm": 2.7942895889282227, + "learning_rate": 3.143330996218124e-06, + "loss": 0.4982, + "step": 5352 + }, + { + "epoch": 2.53096926713948, + "grad_norm": 2.3150579929351807, + "learning_rate": 3.1427281578737327e-06, + "loss": 0.3905, + "step": 5353 + }, + { + "epoch": 2.5314420803782505, + "grad_norm": 2.7326138019561768, + "learning_rate": 3.142125279509478e-06, + "loss": 0.4076, + "step": 5354 + }, + { + "epoch": 2.5319148936170213, + "grad_norm": 2.46362566947937, + "learning_rate": 3.1415223611628976e-06, + "loss": 0.4043, + "step": 5355 + }, + { + "epoch": 2.532387706855792, + "grad_norm": 2.6670427322387695, + "learning_rate": 3.1409194028715323e-06, + "loss": 0.484, + "step": 5356 + }, + { + "epoch": 2.5328605200945624, + "grad_norm": 2.917771100997925, + "learning_rate": 3.140316404672926e-06, + "loss": 0.4539, + "step": 5357 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 2.7964110374450684, + "learning_rate": 3.1397133666046254e-06, + "loss": 0.4706, + "step": 5358 + }, + { + "epoch": 2.533806146572104, + "grad_norm": 2.6481330394744873, + "learning_rate": 3.139110288704179e-06, + "loss": 0.4101, + "step": 5359 + }, + { + "epoch": 2.534278959810875, + "grad_norm": 2.859452962875366, + "learning_rate": 3.1385071710091365e-06, + "loss": 0.4842, + "step": 5360 + }, + { + "epoch": 2.5347517730496456, + "grad_norm": 2.686077356338501, + "learning_rate": 3.137904013557052e-06, + "loss": 0.4073, + "step": 5361 + }, + { + "epoch": 2.535224586288416, + "grad_norm": 3.7147045135498047, + "learning_rate": 3.137300816385482e-06, + "loss": 0.4536, + "step": 5362 + }, + { + "epoch": 2.5356973995271868, + "grad_norm": 2.51054048538208, + "learning_rate": 3.1366975795319856e-06, + "loss": 0.4171, + "step": 5363 + }, + { + "epoch": 2.5361702127659576, + "grad_norm": 3.043149471282959, + "learning_rate": 3.136094303034121e-06, + "loss": 0.5179, + "step": 5364 + }, + { + "epoch": 2.536643026004728, + "grad_norm": 2.398878812789917, + "learning_rate": 3.1354909869294548e-06, + "loss": 0.4144, + "step": 5365 + }, + { + "epoch": 2.5371158392434987, + "grad_norm": 2.969712257385254, + "learning_rate": 3.134887631255551e-06, + "loss": 0.3983, + "step": 5366 + }, + { + "epoch": 2.5375886524822695, + "grad_norm": 2.7707982063293457, + "learning_rate": 3.134284236049978e-06, + "loss": 0.4405, + "step": 5367 + }, + { + "epoch": 2.5380614657210403, + "grad_norm": 2.579742193222046, + "learning_rate": 3.1336808013503073e-06, + "loss": 0.4402, + "step": 5368 + }, + { + "epoch": 2.538534278959811, + "grad_norm": 2.6041927337646484, + "learning_rate": 3.1330773271941113e-06, + "loss": 0.396, + "step": 5369 + }, + { + "epoch": 2.5390070921985815, + "grad_norm": 2.7383856773376465, + "learning_rate": 3.1324738136189658e-06, + "loss": 0.4424, + "step": 5370 + }, + { + "epoch": 
2.5394799054373522, + "grad_norm": 3.053644895553589, + "learning_rate": 3.13187026066245e-06, + "loss": 0.473, + "step": 5371 + }, + { + "epoch": 2.539952718676123, + "grad_norm": 2.684244155883789, + "learning_rate": 3.1312666683621428e-06, + "loss": 0.3963, + "step": 5372 + }, + { + "epoch": 2.5404255319148934, + "grad_norm": 2.6505017280578613, + "learning_rate": 3.130663036755629e-06, + "loss": 0.4292, + "step": 5373 + }, + { + "epoch": 2.540898345153664, + "grad_norm": 3.025965929031372, + "learning_rate": 3.1300593658804935e-06, + "loss": 0.4539, + "step": 5374 + }, + { + "epoch": 2.541371158392435, + "grad_norm": 2.72106671333313, + "learning_rate": 3.1294556557743237e-06, + "loss": 0.4519, + "step": 5375 + }, + { + "epoch": 2.541843971631206, + "grad_norm": 2.759995222091675, + "learning_rate": 3.12885190647471e-06, + "loss": 0.451, + "step": 5376 + }, + { + "epoch": 2.5423167848699766, + "grad_norm": 2.697950601577759, + "learning_rate": 3.1282481180192457e-06, + "loss": 0.4328, + "step": 5377 + }, + { + "epoch": 2.542789598108747, + "grad_norm": 2.6970415115356445, + "learning_rate": 3.127644290445526e-06, + "loss": 0.4489, + "step": 5378 + }, + { + "epoch": 2.5432624113475177, + "grad_norm": 2.5856997966766357, + "learning_rate": 3.127040423791148e-06, + "loss": 0.3848, + "step": 5379 + }, + { + "epoch": 2.5437352245862885, + "grad_norm": 2.9798166751861572, + "learning_rate": 3.1264365180937127e-06, + "loss": 0.5038, + "step": 5380 + }, + { + "epoch": 2.544208037825059, + "grad_norm": 3.413175106048584, + "learning_rate": 3.1258325733908224e-06, + "loss": 0.5247, + "step": 5381 + }, + { + "epoch": 2.5446808510638297, + "grad_norm": 2.838517904281616, + "learning_rate": 3.1252285897200818e-06, + "loss": 0.4652, + "step": 5382 + }, + { + "epoch": 2.5451536643026005, + "grad_norm": 2.8342528343200684, + "learning_rate": 3.1246245671190983e-06, + "loss": 0.4245, + "step": 5383 + }, + { + "epoch": 2.5456264775413713, + "grad_norm": 3.06026029586792, + "learning_rate": 3.124020505625482e-06, + "loss": 0.469, + "step": 5384 + }, + { + "epoch": 2.546099290780142, + "grad_norm": 2.633894681930542, + "learning_rate": 3.1234164052768452e-06, + "loss": 0.4509, + "step": 5385 + }, + { + "epoch": 2.5465721040189124, + "grad_norm": 2.634819984436035, + "learning_rate": 3.1228122661108023e-06, + "loss": 0.4879, + "step": 5386 + }, + { + "epoch": 2.5470449172576832, + "grad_norm": 3.9843504428863525, + "learning_rate": 3.1222080881649707e-06, + "loss": 0.4472, + "step": 5387 + }, + { + "epoch": 2.547517730496454, + "grad_norm": 2.5480258464813232, + "learning_rate": 3.1216038714769694e-06, + "loss": 0.4396, + "step": 5388 + }, + { + "epoch": 2.5479905437352244, + "grad_norm": 2.7461917400360107, + "learning_rate": 3.12099961608442e-06, + "loss": 0.4735, + "step": 5389 + }, + { + "epoch": 2.548463356973995, + "grad_norm": 3.167769193649292, + "learning_rate": 3.1203953220249493e-06, + "loss": 0.4196, + "step": 5390 + }, + { + "epoch": 2.548936170212766, + "grad_norm": 2.721696615219116, + "learning_rate": 3.1197909893361814e-06, + "loss": 0.4571, + "step": 5391 + }, + { + "epoch": 2.5494089834515368, + "grad_norm": 2.726668119430542, + "learning_rate": 3.1191866180557463e-06, + "loss": 0.4856, + "step": 5392 + }, + { + "epoch": 2.5498817966903076, + "grad_norm": 2.602205276489258, + "learning_rate": 3.1185822082212754e-06, + "loss": 0.4631, + "step": 5393 + }, + { + "epoch": 2.550354609929078, + "grad_norm": 2.7715859413146973, + "learning_rate": 3.1179777598704025e-06, + "loss": 0.4136, + 
"step": 5394 + }, + { + "epoch": 2.5508274231678487, + "grad_norm": 2.8081955909729004, + "learning_rate": 3.1173732730407647e-06, + "loss": 0.4963, + "step": 5395 + }, + { + "epoch": 2.5513002364066195, + "grad_norm": 2.946772336959839, + "learning_rate": 3.1167687477700006e-06, + "loss": 0.4443, + "step": 5396 + }, + { + "epoch": 2.55177304964539, + "grad_norm": 2.89345383644104, + "learning_rate": 3.1161641840957503e-06, + "loss": 0.4377, + "step": 5397 + }, + { + "epoch": 2.5522458628841607, + "grad_norm": 2.908317804336548, + "learning_rate": 3.115559582055659e-06, + "loss": 0.4702, + "step": 5398 + }, + { + "epoch": 2.5527186761229315, + "grad_norm": 2.554417848587036, + "learning_rate": 3.1149549416873704e-06, + "loss": 0.3738, + "step": 5399 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 2.3132457733154297, + "learning_rate": 3.1143502630285356e-06, + "loss": 0.4074, + "step": 5400 + }, + { + "epoch": 2.553664302600473, + "grad_norm": 2.751666784286499, + "learning_rate": 3.1137455461168026e-06, + "loss": 0.4697, + "step": 5401 + }, + { + "epoch": 2.5541371158392434, + "grad_norm": 2.7088871002197266, + "learning_rate": 3.113140790989826e-06, + "loss": 0.4754, + "step": 5402 + }, + { + "epoch": 2.554609929078014, + "grad_norm": 3.0633046627044678, + "learning_rate": 3.1125359976852605e-06, + "loss": 0.4874, + "step": 5403 + }, + { + "epoch": 2.555082742316785, + "grad_norm": 3.399456024169922, + "learning_rate": 3.111931166240764e-06, + "loss": 0.5529, + "step": 5404 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 2.7729690074920654, + "learning_rate": 3.1113262966939985e-06, + "loss": 0.4677, + "step": 5405 + }, + { + "epoch": 2.556028368794326, + "grad_norm": 2.81025767326355, + "learning_rate": 3.1107213890826244e-06, + "loss": 0.4954, + "step": 5406 + }, + { + "epoch": 2.556501182033097, + "grad_norm": 2.4837241172790527, + "learning_rate": 3.110116443444307e-06, + "loss": 0.3681, + "step": 5407 + }, + { + "epoch": 2.5569739952718678, + "grad_norm": 2.6406874656677246, + "learning_rate": 3.109511459816714e-06, + "loss": 0.4569, + "step": 5408 + }, + { + "epoch": 2.5574468085106385, + "grad_norm": 2.6093738079071045, + "learning_rate": 3.1089064382375155e-06, + "loss": 0.413, + "step": 5409 + }, + { + "epoch": 2.557919621749409, + "grad_norm": 2.6629011631011963, + "learning_rate": 3.108301378744383e-06, + "loss": 0.4286, + "step": 5410 + }, + { + "epoch": 2.5583924349881797, + "grad_norm": 2.694796323776245, + "learning_rate": 3.10769628137499e-06, + "loss": 0.4316, + "step": 5411 + }, + { + "epoch": 2.5588652482269505, + "grad_norm": 2.88023042678833, + "learning_rate": 3.107091146167015e-06, + "loss": 0.4378, + "step": 5412 + }, + { + "epoch": 2.559338061465721, + "grad_norm": 2.8804919719696045, + "learning_rate": 3.1064859731581365e-06, + "loss": 0.4971, + "step": 5413 + }, + { + "epoch": 2.5598108747044916, + "grad_norm": 2.850468397140503, + "learning_rate": 3.1058807623860353e-06, + "loss": 0.4686, + "step": 5414 + }, + { + "epoch": 2.5602836879432624, + "grad_norm": 3.0548019409179688, + "learning_rate": 3.1052755138883963e-06, + "loss": 0.4497, + "step": 5415 + }, + { + "epoch": 2.5607565011820332, + "grad_norm": 3.10168719291687, + "learning_rate": 3.1046702277029046e-06, + "loss": 0.569, + "step": 5416 + }, + { + "epoch": 2.561229314420804, + "grad_norm": 2.5887374877929688, + "learning_rate": 3.1040649038672494e-06, + "loss": 0.3812, + "step": 5417 + }, + { + "epoch": 2.5617021276595744, + "grad_norm": 2.9928438663482666, + "learning_rate": 
3.1034595424191212e-06, + "loss": 0.4308, + "step": 5418 + }, + { + "epoch": 2.562174940898345, + "grad_norm": 2.7003073692321777, + "learning_rate": 3.102854143396214e-06, + "loss": 0.4967, + "step": 5419 + }, + { + "epoch": 2.562647754137116, + "grad_norm": 3.172868490219116, + "learning_rate": 3.102248706836222e-06, + "loss": 0.5311, + "step": 5420 + }, + { + "epoch": 2.5631205673758863, + "grad_norm": 3.0146191120147705, + "learning_rate": 3.101643232776844e-06, + "loss": 0.4714, + "step": 5421 + }, + { + "epoch": 2.563593380614657, + "grad_norm": 3.0683791637420654, + "learning_rate": 3.1010377212557806e-06, + "loss": 0.4047, + "step": 5422 + }, + { + "epoch": 2.564066193853428, + "grad_norm": 2.8260676860809326, + "learning_rate": 3.1004321723107334e-06, + "loss": 0.5282, + "step": 5423 + }, + { + "epoch": 2.5645390070921987, + "grad_norm": 3.0792388916015625, + "learning_rate": 3.0998265859794074e-06, + "loss": 0.5323, + "step": 5424 + }, + { + "epoch": 2.5650118203309695, + "grad_norm": 2.7332866191864014, + "learning_rate": 3.09922096229951e-06, + "loss": 0.4401, + "step": 5425 + }, + { + "epoch": 2.56548463356974, + "grad_norm": 2.9366047382354736, + "learning_rate": 3.098615301308751e-06, + "loss": 0.4495, + "step": 5426 + }, + { + "epoch": 2.5659574468085107, + "grad_norm": 2.982088565826416, + "learning_rate": 3.098009603044842e-06, + "loss": 0.495, + "step": 5427 + }, + { + "epoch": 2.5664302600472815, + "grad_norm": 3.1204755306243896, + "learning_rate": 3.0974038675454976e-06, + "loss": 0.4354, + "step": 5428 + }, + { + "epoch": 2.566903073286052, + "grad_norm": 2.835238218307495, + "learning_rate": 3.0967980948484333e-06, + "loss": 0.4161, + "step": 5429 + }, + { + "epoch": 2.5673758865248226, + "grad_norm": 2.8104958534240723, + "learning_rate": 3.096192284991369e-06, + "loss": 0.5045, + "step": 5430 + }, + { + "epoch": 2.5678486997635934, + "grad_norm": 3.1636080741882324, + "learning_rate": 3.0955864380120247e-06, + "loss": 0.4533, + "step": 5431 + }, + { + "epoch": 2.568321513002364, + "grad_norm": 2.980112314224243, + "learning_rate": 3.0949805539481247e-06, + "loss": 0.3998, + "step": 5432 + }, + { + "epoch": 2.568794326241135, + "grad_norm": 2.6379945278167725, + "learning_rate": 3.0943746328373953e-06, + "loss": 0.3785, + "step": 5433 + }, + { + "epoch": 2.5692671394799054, + "grad_norm": 2.780930757522583, + "learning_rate": 3.0937686747175627e-06, + "loss": 0.4801, + "step": 5434 + }, + { + "epoch": 2.569739952718676, + "grad_norm": 2.6608550548553467, + "learning_rate": 3.0931626796263585e-06, + "loss": 0.4047, + "step": 5435 + }, + { + "epoch": 2.570212765957447, + "grad_norm": 3.130584716796875, + "learning_rate": 3.0925566476015156e-06, + "loss": 0.5049, + "step": 5436 + }, + { + "epoch": 2.5706855791962173, + "grad_norm": 2.9699313640594482, + "learning_rate": 3.0919505786807687e-06, + "loss": 0.3847, + "step": 5437 + }, + { + "epoch": 2.571158392434988, + "grad_norm": 2.919260025024414, + "learning_rate": 3.091344472901855e-06, + "loss": 0.4631, + "step": 5438 + }, + { + "epoch": 2.571631205673759, + "grad_norm": 2.956587553024292, + "learning_rate": 3.0907383303025134e-06, + "loss": 0.4974, + "step": 5439 + }, + { + "epoch": 2.5721040189125297, + "grad_norm": 2.758542776107788, + "learning_rate": 3.090132150920486e-06, + "loss": 0.4785, + "step": 5440 + }, + { + "epoch": 2.5725768321513005, + "grad_norm": 2.678469657897949, + "learning_rate": 3.0895259347935175e-06, + "loss": 0.4453, + "step": 5441 + }, + { + "epoch": 2.573049645390071, + "grad_norm": 
2.6508545875549316, + "learning_rate": 3.088919681959355e-06, + "loss": 0.4426, + "step": 5442 + }, + { + "epoch": 2.5735224586288417, + "grad_norm": 2.6156187057495117, + "learning_rate": 3.0883133924557453e-06, + "loss": 0.4445, + "step": 5443 + }, + { + "epoch": 2.5739952718676125, + "grad_norm": 2.484374761581421, + "learning_rate": 3.08770706632044e-06, + "loss": 0.4155, + "step": 5444 + }, + { + "epoch": 2.574468085106383, + "grad_norm": 2.7465295791625977, + "learning_rate": 3.087100703591193e-06, + "loss": 0.4085, + "step": 5445 + }, + { + "epoch": 2.5749408983451536, + "grad_norm": 2.771740198135376, + "learning_rate": 3.08649430430576e-06, + "loss": 0.4313, + "step": 5446 + }, + { + "epoch": 2.5754137115839244, + "grad_norm": 2.7480874061584473, + "learning_rate": 3.0858878685018984e-06, + "loss": 0.3471, + "step": 5447 + }, + { + "epoch": 2.575886524822695, + "grad_norm": 2.894913673400879, + "learning_rate": 3.085281396217368e-06, + "loss": 0.4888, + "step": 5448 + }, + { + "epoch": 2.576359338061466, + "grad_norm": 3.037628173828125, + "learning_rate": 3.0846748874899306e-06, + "loss": 0.3976, + "step": 5449 + }, + { + "epoch": 2.5768321513002364, + "grad_norm": 2.4811434745788574, + "learning_rate": 3.0840683423573526e-06, + "loss": 0.4822, + "step": 5450 + }, + { + "epoch": 2.577304964539007, + "grad_norm": 3.0078725814819336, + "learning_rate": 3.0834617608573998e-06, + "loss": 0.4999, + "step": 5451 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 3.174154043197632, + "learning_rate": 3.0828551430278413e-06, + "loss": 0.4626, + "step": 5452 + }, + { + "epoch": 2.5782505910165483, + "grad_norm": 2.8277535438537598, + "learning_rate": 3.082248488906449e-06, + "loss": 0.4633, + "step": 5453 + }, + { + "epoch": 2.578723404255319, + "grad_norm": 2.731767416000366, + "learning_rate": 3.0816417985309966e-06, + "loss": 0.4148, + "step": 5454 + }, + { + "epoch": 2.57919621749409, + "grad_norm": 2.5480549335479736, + "learning_rate": 3.0810350719392597e-06, + "loss": 0.4773, + "step": 5455 + }, + { + "epoch": 2.5796690307328607, + "grad_norm": 2.9755172729492188, + "learning_rate": 3.080428309169017e-06, + "loss": 0.5107, + "step": 5456 + }, + { + "epoch": 2.580141843971631, + "grad_norm": 2.6499290466308594, + "learning_rate": 3.079821510258048e-06, + "loss": 0.3982, + "step": 5457 + }, + { + "epoch": 2.580614657210402, + "grad_norm": 2.663214921951294, + "learning_rate": 3.079214675244136e-06, + "loss": 0.4419, + "step": 5458 + }, + { + "epoch": 2.5810874704491726, + "grad_norm": 2.595489263534546, + "learning_rate": 3.078607804165066e-06, + "loss": 0.3958, + "step": 5459 + }, + { + "epoch": 2.581560283687943, + "grad_norm": 3.031458854675293, + "learning_rate": 3.0780008970586255e-06, + "loss": 0.518, + "step": 5460 + }, + { + "epoch": 2.582033096926714, + "grad_norm": 2.827071189880371, + "learning_rate": 3.077393953962603e-06, + "loss": 0.4397, + "step": 5461 + }, + { + "epoch": 2.5825059101654846, + "grad_norm": 2.656111240386963, + "learning_rate": 3.0767869749147917e-06, + "loss": 0.4912, + "step": 5462 + }, + { + "epoch": 2.5829787234042554, + "grad_norm": 2.545365333557129, + "learning_rate": 3.076179959952984e-06, + "loss": 0.3991, + "step": 5463 + }, + { + "epoch": 2.583451536643026, + "grad_norm": 2.5794365406036377, + "learning_rate": 3.075572909114977e-06, + "loss": 0.4499, + "step": 5464 + }, + { + "epoch": 2.5839243498817965, + "grad_norm": 2.787140369415283, + "learning_rate": 3.074965822438568e-06, + "loss": 0.386, + "step": 5465 + }, + { + "epoch": 
2.5843971631205673, + "grad_norm": 2.6406853199005127, + "learning_rate": 3.0743586999615594e-06, + "loss": 0.4853, + "step": 5466 + }, + { + "epoch": 2.584869976359338, + "grad_norm": 2.8082082271575928, + "learning_rate": 3.073751541721752e-06, + "loss": 0.4669, + "step": 5467 + }, + { + "epoch": 2.5853427895981085, + "grad_norm": 2.8808975219726562, + "learning_rate": 3.073144347756952e-06, + "loss": 0.4193, + "step": 5468 + }, + { + "epoch": 2.5858156028368793, + "grad_norm": 2.823352813720703, + "learning_rate": 3.072537118104968e-06, + "loss": 0.482, + "step": 5469 + }, + { + "epoch": 2.58628841607565, + "grad_norm": 2.6454555988311768, + "learning_rate": 3.0719298528036073e-06, + "loss": 0.4667, + "step": 5470 + }, + { + "epoch": 2.586761229314421, + "grad_norm": 2.871145486831665, + "learning_rate": 3.0713225518906826e-06, + "loss": 0.5125, + "step": 5471 + }, + { + "epoch": 2.5872340425531917, + "grad_norm": 3.1301417350769043, + "learning_rate": 3.070715215404007e-06, + "loss": 0.4827, + "step": 5472 + }, + { + "epoch": 2.587706855791962, + "grad_norm": 2.31062912940979, + "learning_rate": 3.070107843381398e-06, + "loss": 0.3954, + "step": 5473 + }, + { + "epoch": 2.588179669030733, + "grad_norm": 2.8366353511810303, + "learning_rate": 3.069500435860674e-06, + "loss": 0.4597, + "step": 5474 + }, + { + "epoch": 2.5886524822695036, + "grad_norm": 2.900143623352051, + "learning_rate": 3.068892992879654e-06, + "loss": 0.4294, + "step": 5475 + }, + { + "epoch": 2.589125295508274, + "grad_norm": 2.923313617706299, + "learning_rate": 3.0682855144761626e-06, + "loss": 0.505, + "step": 5476 + }, + { + "epoch": 2.5895981087470448, + "grad_norm": 2.726475954055786, + "learning_rate": 3.0676780006880242e-06, + "loss": 0.4208, + "step": 5477 + }, + { + "epoch": 2.5900709219858156, + "grad_norm": 4.115052223205566, + "learning_rate": 3.0670704515530654e-06, + "loss": 0.466, + "step": 5478 + }, + { + "epoch": 2.5905437352245864, + "grad_norm": 2.6018717288970947, + "learning_rate": 3.0664628671091163e-06, + "loss": 0.4697, + "step": 5479 + }, + { + "epoch": 2.591016548463357, + "grad_norm": 2.7393722534179688, + "learning_rate": 3.0658552473940085e-06, + "loss": 0.4618, + "step": 5480 + }, + { + "epoch": 2.5914893617021275, + "grad_norm": 2.8406929969787598, + "learning_rate": 3.065247592445575e-06, + "loss": 0.4806, + "step": 5481 + }, + { + "epoch": 2.5919621749408983, + "grad_norm": 2.9773001670837402, + "learning_rate": 3.0646399023016525e-06, + "loss": 0.4764, + "step": 5482 + }, + { + "epoch": 2.592434988179669, + "grad_norm": 3.374643325805664, + "learning_rate": 3.0640321770000804e-06, + "loss": 0.4481, + "step": 5483 + }, + { + "epoch": 2.5929078014184395, + "grad_norm": 2.5742013454437256, + "learning_rate": 3.0634244165786965e-06, + "loss": 0.432, + "step": 5484 + }, + { + "epoch": 2.5933806146572103, + "grad_norm": 2.9390289783477783, + "learning_rate": 3.062816621075346e-06, + "loss": 0.3941, + "step": 5485 + }, + { + "epoch": 2.593853427895981, + "grad_norm": 2.683414936065674, + "learning_rate": 3.062208790527871e-06, + "loss": 0.4268, + "step": 5486 + }, + { + "epoch": 2.594326241134752, + "grad_norm": 2.689647674560547, + "learning_rate": 3.06160092497412e-06, + "loss": 0.4569, + "step": 5487 + }, + { + "epoch": 2.5947990543735227, + "grad_norm": 3.1170310974121094, + "learning_rate": 3.060993024451943e-06, + "loss": 0.4387, + "step": 5488 + }, + { + "epoch": 2.595271867612293, + "grad_norm": 2.8732447624206543, + "learning_rate": 3.0603850889991894e-06, + "loss": 0.451, + 
"step": 5489 + }, + { + "epoch": 2.595744680851064, + "grad_norm": 3.0444157123565674, + "learning_rate": 3.0597771186537135e-06, + "loss": 0.4691, + "step": 5490 + }, + { + "epoch": 2.5962174940898346, + "grad_norm": 2.3791720867156982, + "learning_rate": 3.0591691134533714e-06, + "loss": 0.4771, + "step": 5491 + }, + { + "epoch": 2.596690307328605, + "grad_norm": 3.0677225589752197, + "learning_rate": 3.05856107343602e-06, + "loss": 0.459, + "step": 5492 + }, + { + "epoch": 2.5971631205673757, + "grad_norm": 3.1702635288238525, + "learning_rate": 3.05795299863952e-06, + "loss": 0.4816, + "step": 5493 + }, + { + "epoch": 2.5976359338061465, + "grad_norm": 2.964869499206543, + "learning_rate": 3.057344889101734e-06, + "loss": 0.4369, + "step": 5494 + }, + { + "epoch": 2.5981087470449173, + "grad_norm": 3.1333882808685303, + "learning_rate": 3.056736744860525e-06, + "loss": 0.4178, + "step": 5495 + }, + { + "epoch": 2.598581560283688, + "grad_norm": 2.4340405464172363, + "learning_rate": 3.05612856595376e-06, + "loss": 0.4359, + "step": 5496 + }, + { + "epoch": 2.5990543735224585, + "grad_norm": 2.638620615005493, + "learning_rate": 3.0555203524193083e-06, + "loss": 0.3915, + "step": 5497 + }, + { + "epoch": 2.5995271867612293, + "grad_norm": 2.8218815326690674, + "learning_rate": 3.054912104295039e-06, + "loss": 0.4684, + "step": 5498 + }, + { + "epoch": 2.6, + "grad_norm": 2.6696009635925293, + "learning_rate": 3.054303821618827e-06, + "loss": 0.4073, + "step": 5499 + }, + { + "epoch": 2.6004728132387704, + "grad_norm": 2.3880512714385986, + "learning_rate": 3.0536955044285465e-06, + "loss": 0.3576, + "step": 5500 + }, + { + "epoch": 2.6009456264775412, + "grad_norm": 2.762890100479126, + "learning_rate": 3.053087152762075e-06, + "loss": 0.3857, + "step": 5501 + }, + { + "epoch": 2.601418439716312, + "grad_norm": 2.729033946990967, + "learning_rate": 3.052478766657292e-06, + "loss": 0.3935, + "step": 5502 + }, + { + "epoch": 2.601891252955083, + "grad_norm": 2.630490303039551, + "learning_rate": 3.051870346152078e-06, + "loss": 0.3932, + "step": 5503 + }, + { + "epoch": 2.6023640661938536, + "grad_norm": 3.0335981845855713, + "learning_rate": 3.051261891284318e-06, + "loss": 0.4313, + "step": 5504 + }, + { + "epoch": 2.602836879432624, + "grad_norm": 2.969888687133789, + "learning_rate": 3.0506534020918963e-06, + "loss": 0.4698, + "step": 5505 + }, + { + "epoch": 2.603309692671395, + "grad_norm": 3.093996524810791, + "learning_rate": 3.050044878612703e-06, + "loss": 0.5338, + "step": 5506 + }, + { + "epoch": 2.6037825059101656, + "grad_norm": 2.759993314743042, + "learning_rate": 3.049436320884626e-06, + "loss": 0.4429, + "step": 5507 + }, + { + "epoch": 2.604255319148936, + "grad_norm": 2.979422092437744, + "learning_rate": 3.0488277289455587e-06, + "loss": 0.4489, + "step": 5508 + }, + { + "epoch": 2.6047281323877067, + "grad_norm": 2.8266701698303223, + "learning_rate": 3.048219102833396e-06, + "loss": 0.489, + "step": 5509 + }, + { + "epoch": 2.6052009456264775, + "grad_norm": 2.2582461833953857, + "learning_rate": 3.047610442586033e-06, + "loss": 0.3759, + "step": 5510 + }, + { + "epoch": 2.6056737588652483, + "grad_norm": 3.078152894973755, + "learning_rate": 3.0470017482413694e-06, + "loss": 0.5059, + "step": 5511 + }, + { + "epoch": 2.606146572104019, + "grad_norm": 2.7895498275756836, + "learning_rate": 3.0463930198373047e-06, + "loss": 0.4752, + "step": 5512 + }, + { + "epoch": 2.6066193853427895, + "grad_norm": 3.2307958602905273, + "learning_rate": 3.045784257411743e-06, + 
"loss": 0.4847, + "step": 5513 + }, + { + "epoch": 2.6070921985815603, + "grad_norm": 2.793661594390869, + "learning_rate": 3.0451754610025884e-06, + "loss": 0.4492, + "step": 5514 + }, + { + "epoch": 2.607565011820331, + "grad_norm": 2.4443132877349854, + "learning_rate": 3.0445666306477484e-06, + "loss": 0.4174, + "step": 5515 + }, + { + "epoch": 2.6080378250591014, + "grad_norm": 2.628769636154175, + "learning_rate": 3.0439577663851326e-06, + "loss": 0.3889, + "step": 5516 + }, + { + "epoch": 2.608510638297872, + "grad_norm": 2.9367563724517822, + "learning_rate": 3.0433488682526525e-06, + "loss": 0.437, + "step": 5517 + }, + { + "epoch": 2.608983451536643, + "grad_norm": 3.171353340148926, + "learning_rate": 3.04273993628822e-06, + "loss": 0.47, + "step": 5518 + }, + { + "epoch": 2.609456264775414, + "grad_norm": 2.856576442718506, + "learning_rate": 3.0421309705297513e-06, + "loss": 0.4797, + "step": 5519 + }, + { + "epoch": 2.6099290780141846, + "grad_norm": 2.4926068782806396, + "learning_rate": 3.041521971015165e-06, + "loss": 0.4294, + "step": 5520 + }, + { + "epoch": 2.610401891252955, + "grad_norm": 2.7897613048553467, + "learning_rate": 3.040912937782379e-06, + "loss": 0.4388, + "step": 5521 + }, + { + "epoch": 2.6108747044917258, + "grad_norm": 3.588188886642456, + "learning_rate": 3.0403038708693173e-06, + "loss": 0.4027, + "step": 5522 + }, + { + "epoch": 2.6113475177304966, + "grad_norm": 3.5394980907440186, + "learning_rate": 3.0396947703139017e-06, + "loss": 0.4866, + "step": 5523 + }, + { + "epoch": 2.611820330969267, + "grad_norm": 3.086865186691284, + "learning_rate": 3.03908563615406e-06, + "loss": 0.4344, + "step": 5524 + }, + { + "epoch": 2.6122931442080377, + "grad_norm": 2.649564504623413, + "learning_rate": 3.0384764684277194e-06, + "loss": 0.4571, + "step": 5525 + }, + { + "epoch": 2.6127659574468085, + "grad_norm": 2.945234775543213, + "learning_rate": 3.0378672671728105e-06, + "loss": 0.4885, + "step": 5526 + }, + { + "epoch": 2.6132387706855793, + "grad_norm": 2.625424861907959, + "learning_rate": 3.037258032427265e-06, + "loss": 0.4095, + "step": 5527 + }, + { + "epoch": 2.61371158392435, + "grad_norm": 2.7597248554229736, + "learning_rate": 3.0366487642290175e-06, + "loss": 0.4393, + "step": 5528 + }, + { + "epoch": 2.6141843971631205, + "grad_norm": 2.721189260482788, + "learning_rate": 3.0360394626160043e-06, + "loss": 0.3865, + "step": 5529 + }, + { + "epoch": 2.6146572104018913, + "grad_norm": 2.624056339263916, + "learning_rate": 3.0354301276261656e-06, + "loss": 0.4273, + "step": 5530 + }, + { + "epoch": 2.615130023640662, + "grad_norm": 2.7764177322387695, + "learning_rate": 3.034820759297439e-06, + "loss": 0.4756, + "step": 5531 + }, + { + "epoch": 2.6156028368794324, + "grad_norm": 3.0841729640960693, + "learning_rate": 3.0342113576677696e-06, + "loss": 0.4907, + "step": 5532 + }, + { + "epoch": 2.616075650118203, + "grad_norm": 2.678715705871582, + "learning_rate": 3.0336019227751017e-06, + "loss": 0.4478, + "step": 5533 + }, + { + "epoch": 2.616548463356974, + "grad_norm": 2.378679037094116, + "learning_rate": 3.032992454657382e-06, + "loss": 0.3678, + "step": 5534 + }, + { + "epoch": 2.617021276595745, + "grad_norm": 2.792079210281372, + "learning_rate": 3.0323829533525583e-06, + "loss": 0.4115, + "step": 5535 + }, + { + "epoch": 2.6174940898345156, + "grad_norm": 2.738133192062378, + "learning_rate": 3.0317734188985832e-06, + "loss": 0.4152, + "step": 5536 + }, + { + "epoch": 2.617966903073286, + "grad_norm": 2.6963796615600586, + 
"learning_rate": 3.0311638513334084e-06, + "loss": 0.4096, + "step": 5537 + }, + { + "epoch": 2.6184397163120567, + "grad_norm": 2.694145679473877, + "learning_rate": 3.03055425069499e-06, + "loss": 0.3793, + "step": 5538 + }, + { + "epoch": 2.6189125295508275, + "grad_norm": 2.762403964996338, + "learning_rate": 3.0299446170212855e-06, + "loss": 0.459, + "step": 5539 + }, + { + "epoch": 2.619385342789598, + "grad_norm": 2.804382562637329, + "learning_rate": 3.0293349503502522e-06, + "loss": 0.4853, + "step": 5540 + }, + { + "epoch": 2.6198581560283687, + "grad_norm": 2.7768518924713135, + "learning_rate": 3.0287252507198537e-06, + "loss": 0.4496, + "step": 5541 + }, + { + "epoch": 2.6203309692671395, + "grad_norm": 2.9075138568878174, + "learning_rate": 3.028115518168052e-06, + "loss": 0.4498, + "step": 5542 + }, + { + "epoch": 2.6208037825059103, + "grad_norm": 2.8966822624206543, + "learning_rate": 3.0275057527328126e-06, + "loss": 0.4434, + "step": 5543 + }, + { + "epoch": 2.621276595744681, + "grad_norm": 2.8140156269073486, + "learning_rate": 3.0268959544521027e-06, + "loss": 0.3935, + "step": 5544 + }, + { + "epoch": 2.6217494089834514, + "grad_norm": 2.8606276512145996, + "learning_rate": 3.0262861233638924e-06, + "loss": 0.4222, + "step": 5545 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 3.003610134124756, + "learning_rate": 3.0256762595061522e-06, + "loss": 0.428, + "step": 5546 + }, + { + "epoch": 2.622695035460993, + "grad_norm": 2.725907802581787, + "learning_rate": 3.025066362916857e-06, + "loss": 0.3975, + "step": 5547 + }, + { + "epoch": 2.6231678486997634, + "grad_norm": 2.5247902870178223, + "learning_rate": 3.024456433633982e-06, + "loss": 0.4584, + "step": 5548 + }, + { + "epoch": 2.623640661938534, + "grad_norm": 2.932798147201538, + "learning_rate": 3.0238464716955045e-06, + "loss": 0.4991, + "step": 5549 + }, + { + "epoch": 2.624113475177305, + "grad_norm": 2.693547010421753, + "learning_rate": 3.023236477139404e-06, + "loss": 0.4405, + "step": 5550 + }, + { + "epoch": 2.6245862884160758, + "grad_norm": 3.2600035667419434, + "learning_rate": 3.022626450003662e-06, + "loss": 0.4904, + "step": 5551 + }, + { + "epoch": 2.6250591016548466, + "grad_norm": 2.9471960067749023, + "learning_rate": 3.0220163903262627e-06, + "loss": 0.4487, + "step": 5552 + }, + { + "epoch": 2.625531914893617, + "grad_norm": 2.583944082260132, + "learning_rate": 3.0214062981451926e-06, + "loss": 0.3552, + "step": 5553 + }, + { + "epoch": 2.6260047281323877, + "grad_norm": 2.675062656402588, + "learning_rate": 3.0207961734984377e-06, + "loss": 0.4524, + "step": 5554 + }, + { + "epoch": 2.6264775413711585, + "grad_norm": 3.0126802921295166, + "learning_rate": 3.0201860164239887e-06, + "loss": 0.4124, + "step": 5555 + }, + { + "epoch": 2.626950354609929, + "grad_norm": 2.490734577178955, + "learning_rate": 3.019575826959838e-06, + "loss": 0.4095, + "step": 5556 + }, + { + "epoch": 2.6274231678486997, + "grad_norm": 2.72817063331604, + "learning_rate": 3.018965605143978e-06, + "loss": 0.4298, + "step": 5557 + }, + { + "epoch": 2.6278959810874705, + "grad_norm": 3.1298327445983887, + "learning_rate": 3.0183553510144064e-06, + "loss": 0.4961, + "step": 5558 + }, + { + "epoch": 2.6283687943262413, + "grad_norm": 3.2379956245422363, + "learning_rate": 3.0177450646091195e-06, + "loss": 0.4943, + "step": 5559 + }, + { + "epoch": 2.628841607565012, + "grad_norm": 2.5040571689605713, + "learning_rate": 3.017134745966117e-06, + "loss": 0.3701, + "step": 5560 + }, + { + "epoch": 2.6293144208037824, 
+ "grad_norm": 3.047184944152832, + "learning_rate": 3.0165243951234025e-06, + "loss": 0.4587, + "step": 5561 + }, + { + "epoch": 2.629787234042553, + "grad_norm": 2.4926774501800537, + "learning_rate": 3.0159140121189783e-06, + "loss": 0.3723, + "step": 5562 + }, + { + "epoch": 2.630260047281324, + "grad_norm": 2.5434961318969727, + "learning_rate": 3.015303596990851e-06, + "loss": 0.4176, + "step": 5563 + }, + { + "epoch": 2.6307328605200944, + "grad_norm": 2.5117976665496826, + "learning_rate": 3.0146931497770284e-06, + "loss": 0.4218, + "step": 5564 + }, + { + "epoch": 2.631205673758865, + "grad_norm": 2.9408798217773438, + "learning_rate": 3.0140826705155196e-06, + "loss": 0.4473, + "step": 5565 + }, + { + "epoch": 2.631678486997636, + "grad_norm": 2.996422052383423, + "learning_rate": 3.0134721592443385e-06, + "loss": 0.4513, + "step": 5566 + }, + { + "epoch": 2.6321513002364068, + "grad_norm": 2.984356164932251, + "learning_rate": 3.0128616160014955e-06, + "loss": 0.4749, + "step": 5567 + }, + { + "epoch": 2.6326241134751776, + "grad_norm": 2.6075069904327393, + "learning_rate": 3.0122510408250095e-06, + "loss": 0.4707, + "step": 5568 + }, + { + "epoch": 2.633096926713948, + "grad_norm": 2.9463071823120117, + "learning_rate": 3.0116404337528972e-06, + "loss": 0.5125, + "step": 5569 + }, + { + "epoch": 2.6335697399527187, + "grad_norm": 2.98574161529541, + "learning_rate": 3.0110297948231787e-06, + "loss": 0.4487, + "step": 5570 + }, + { + "epoch": 2.6340425531914895, + "grad_norm": 2.6039397716522217, + "learning_rate": 3.010419124073876e-06, + "loss": 0.4516, + "step": 5571 + }, + { + "epoch": 2.63451536643026, + "grad_norm": 2.8480236530303955, + "learning_rate": 3.0098084215430124e-06, + "loss": 0.4962, + "step": 5572 + }, + { + "epoch": 2.6349881796690307, + "grad_norm": 2.527597427368164, + "learning_rate": 3.0091976872686133e-06, + "loss": 0.435, + "step": 5573 + }, + { + "epoch": 2.6354609929078014, + "grad_norm": 2.898303508758545, + "learning_rate": 3.0085869212887076e-06, + "loss": 0.4473, + "step": 5574 + }, + { + "epoch": 2.6359338061465722, + "grad_norm": 2.981414318084717, + "learning_rate": 3.007976123641324e-06, + "loss": 0.4203, + "step": 5575 + }, + { + "epoch": 2.636406619385343, + "grad_norm": 3.219064474105835, + "learning_rate": 3.0073652943644947e-06, + "loss": 0.4596, + "step": 5576 + }, + { + "epoch": 2.6368794326241134, + "grad_norm": 2.7287049293518066, + "learning_rate": 3.0067544334962532e-06, + "loss": 0.433, + "step": 5577 + }, + { + "epoch": 2.637352245862884, + "grad_norm": 2.6232664585113525, + "learning_rate": 3.0061435410746352e-06, + "loss": 0.4254, + "step": 5578 + }, + { + "epoch": 2.637825059101655, + "grad_norm": 2.908311605453491, + "learning_rate": 3.0055326171376788e-06, + "loss": 0.4349, + "step": 5579 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 2.8369064331054688, + "learning_rate": 3.0049216617234224e-06, + "loss": 0.4675, + "step": 5580 + }, + { + "epoch": 2.638770685579196, + "grad_norm": 2.659499406814575, + "learning_rate": 3.0043106748699085e-06, + "loss": 0.4073, + "step": 5581 + }, + { + "epoch": 2.639243498817967, + "grad_norm": 2.579765558242798, + "learning_rate": 3.00369965661518e-06, + "loss": 0.4536, + "step": 5582 + }, + { + "epoch": 2.6397163120567377, + "grad_norm": 3.572861909866333, + "learning_rate": 3.0030886069972827e-06, + "loss": 0.5227, + "step": 5583 + }, + { + "epoch": 2.6401891252955085, + "grad_norm": 2.6523196697235107, + "learning_rate": 3.002477526054263e-06, + "loss": 0.3846, + "step": 5584 + 
}, + { + "epoch": 2.640661938534279, + "grad_norm": 3.072181463241577, + "learning_rate": 3.001866413824173e-06, + "loss": 0.5399, + "step": 5585 + }, + { + "epoch": 2.6411347517730497, + "grad_norm": 2.7304325103759766, + "learning_rate": 3.0012552703450597e-06, + "loss": 0.4048, + "step": 5586 + }, + { + "epoch": 2.6416075650118205, + "grad_norm": 3.039491891860962, + "learning_rate": 3.0006440956549798e-06, + "loss": 0.5035, + "step": 5587 + }, + { + "epoch": 2.642080378250591, + "grad_norm": 2.7623798847198486, + "learning_rate": 3.000032889791988e-06, + "loss": 0.4369, + "step": 5588 + }, + { + "epoch": 2.6425531914893616, + "grad_norm": 3.391052722930908, + "learning_rate": 2.9994216527941394e-06, + "loss": 0.5308, + "step": 5589 + }, + { + "epoch": 2.6430260047281324, + "grad_norm": 3.0263915061950684, + "learning_rate": 2.9988103846994954e-06, + "loss": 0.4319, + "step": 5590 + }, + { + "epoch": 2.6434988179669032, + "grad_norm": 2.786607027053833, + "learning_rate": 2.998199085546115e-06, + "loss": 0.4695, + "step": 5591 + }, + { + "epoch": 2.643971631205674, + "grad_norm": 2.884674310684204, + "learning_rate": 2.9975877553720627e-06, + "loss": 0.4615, + "step": 5592 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 2.6100499629974365, + "learning_rate": 2.996976394215402e-06, + "loss": 0.4784, + "step": 5593 + }, + { + "epoch": 2.644917257683215, + "grad_norm": 2.6978676319122314, + "learning_rate": 2.9963650021142018e-06, + "loss": 0.3911, + "step": 5594 + }, + { + "epoch": 2.645390070921986, + "grad_norm": 2.8080835342407227, + "learning_rate": 2.9957535791065284e-06, + "loss": 0.4997, + "step": 5595 + }, + { + "epoch": 2.6458628841607563, + "grad_norm": 2.6639578342437744, + "learning_rate": 2.9951421252304537e-06, + "loss": 0.4066, + "step": 5596 + }, + { + "epoch": 2.646335697399527, + "grad_norm": 3.102456569671631, + "learning_rate": 2.9945306405240505e-06, + "loss": 0.5554, + "step": 5597 + }, + { + "epoch": 2.646808510638298, + "grad_norm": 2.6524150371551514, + "learning_rate": 2.993919125025392e-06, + "loss": 0.3881, + "step": 5598 + }, + { + "epoch": 2.6472813238770687, + "grad_norm": 2.926316499710083, + "learning_rate": 2.993307578772556e-06, + "loss": 0.4845, + "step": 5599 + }, + { + "epoch": 2.6477541371158395, + "grad_norm": 3.346550703048706, + "learning_rate": 2.9926960018036195e-06, + "loss": 0.4481, + "step": 5600 + }, + { + "epoch": 2.64822695035461, + "grad_norm": 2.6211020946502686, + "learning_rate": 2.9920843941566634e-06, + "loss": 0.4355, + "step": 5601 + }, + { + "epoch": 2.6486997635933807, + "grad_norm": 2.7479333877563477, + "learning_rate": 2.99147275586977e-06, + "loss": 0.4373, + "step": 5602 + }, + { + "epoch": 2.6491725768321515, + "grad_norm": 2.523385524749756, + "learning_rate": 2.9908610869810235e-06, + "loss": 0.4467, + "step": 5603 + }, + { + "epoch": 2.649645390070922, + "grad_norm": 2.93886137008667, + "learning_rate": 2.9902493875285086e-06, + "loss": 0.4956, + "step": 5604 + }, + { + "epoch": 2.6501182033096926, + "grad_norm": 2.7630443572998047, + "learning_rate": 2.989637657550315e-06, + "loss": 0.5012, + "step": 5605 + }, + { + "epoch": 2.6505910165484634, + "grad_norm": 2.6733906269073486, + "learning_rate": 2.989025897084531e-06, + "loss": 0.446, + "step": 5606 + }, + { + "epoch": 2.651063829787234, + "grad_norm": 2.8411107063293457, + "learning_rate": 2.9884141061692484e-06, + "loss": 0.4817, + "step": 5607 + }, + { + "epoch": 2.651536643026005, + "grad_norm": 2.8667192459106445, + "learning_rate": 2.987802284842562e-06, 
+ "loss": 0.3909, + "step": 5608 + }, + { + "epoch": 2.6520094562647754, + "grad_norm": 3.4640755653381348, + "learning_rate": 2.987190433142565e-06, + "loss": 0.4379, + "step": 5609 + }, + { + "epoch": 2.652482269503546, + "grad_norm": 2.675121307373047, + "learning_rate": 2.9865785511073565e-06, + "loss": 0.4833, + "step": 5610 + }, + { + "epoch": 2.652955082742317, + "grad_norm": 2.4375529289245605, + "learning_rate": 2.9859666387750353e-06, + "loss": 0.3949, + "step": 5611 + }, + { + "epoch": 2.6534278959810873, + "grad_norm": 2.7312581539154053, + "learning_rate": 2.9853546961837026e-06, + "loss": 0.4546, + "step": 5612 + }, + { + "epoch": 2.653900709219858, + "grad_norm": 2.7695999145507812, + "learning_rate": 2.9847427233714617e-06, + "loss": 0.4696, + "step": 5613 + }, + { + "epoch": 2.654373522458629, + "grad_norm": 2.6313109397888184, + "learning_rate": 2.984130720376416e-06, + "loss": 0.4733, + "step": 5614 + }, + { + "epoch": 2.6548463356973997, + "grad_norm": 2.656864881515503, + "learning_rate": 2.9835186872366733e-06, + "loss": 0.3806, + "step": 5615 + }, + { + "epoch": 2.65531914893617, + "grad_norm": 2.720075845718384, + "learning_rate": 2.982906623990342e-06, + "loss": 0.4041, + "step": 5616 + }, + { + "epoch": 2.655791962174941, + "grad_norm": 2.6684951782226562, + "learning_rate": 2.9822945306755334e-06, + "loss": 0.4552, + "step": 5617 + }, + { + "epoch": 2.6562647754137116, + "grad_norm": 2.567751884460449, + "learning_rate": 2.9816824073303585e-06, + "loss": 0.465, + "step": 5618 + }, + { + "epoch": 2.656737588652482, + "grad_norm": 2.7490367889404297, + "learning_rate": 2.981070253992933e-06, + "loss": 0.4647, + "step": 5619 + }, + { + "epoch": 2.657210401891253, + "grad_norm": 2.548656463623047, + "learning_rate": 2.9804580707013715e-06, + "loss": 0.4226, + "step": 5620 + }, + { + "epoch": 2.6576832151300236, + "grad_norm": 2.5484731197357178, + "learning_rate": 2.9798458574937927e-06, + "loss": 0.382, + "step": 5621 + }, + { + "epoch": 2.6581560283687944, + "grad_norm": 2.7293949127197266, + "learning_rate": 2.979233614408317e-06, + "loss": 0.4418, + "step": 5622 + }, + { + "epoch": 2.658628841607565, + "grad_norm": 2.645036458969116, + "learning_rate": 2.9786213414830646e-06, + "loss": 0.414, + "step": 5623 + }, + { + "epoch": 2.6591016548463355, + "grad_norm": 2.5287609100341797, + "learning_rate": 2.9780090387561604e-06, + "loss": 0.3914, + "step": 5624 + }, + { + "epoch": 2.6595744680851063, + "grad_norm": 2.5570411682128906, + "learning_rate": 2.9773967062657293e-06, + "loss": 0.4431, + "step": 5625 + }, + { + "epoch": 2.660047281323877, + "grad_norm": 2.681749105453491, + "learning_rate": 2.9767843440498983e-06, + "loss": 0.4245, + "step": 5626 + }, + { + "epoch": 2.6605200945626475, + "grad_norm": 2.8629777431488037, + "learning_rate": 2.976171952146798e-06, + "loss": 0.4643, + "step": 5627 + }, + { + "epoch": 2.6609929078014183, + "grad_norm": 2.577148199081421, + "learning_rate": 2.9755595305945573e-06, + "loss": 0.43, + "step": 5628 + }, + { + "epoch": 2.661465721040189, + "grad_norm": 2.747218370437622, + "learning_rate": 2.97494707943131e-06, + "loss": 0.5194, + "step": 5629 + }, + { + "epoch": 2.66193853427896, + "grad_norm": 2.535604953765869, + "learning_rate": 2.9743345986951904e-06, + "loss": 0.4401, + "step": 5630 + }, + { + "epoch": 2.6624113475177307, + "grad_norm": 3.3341166973114014, + "learning_rate": 2.973722088424336e-06, + "loss": 0.4925, + "step": 5631 + }, + { + "epoch": 2.662884160756501, + "grad_norm": 2.9264349937438965, + 
"learning_rate": 2.973109548656884e-06, + "loss": 0.4787, + "step": 5632 + }, + { + "epoch": 2.663356973995272, + "grad_norm": 2.7132506370544434, + "learning_rate": 2.9724969794309742e-06, + "loss": 0.4138, + "step": 5633 + }, + { + "epoch": 2.6638297872340426, + "grad_norm": 2.7970192432403564, + "learning_rate": 2.9718843807847497e-06, + "loss": 0.4896, + "step": 5634 + }, + { + "epoch": 2.664302600472813, + "grad_norm": 2.610208749771118, + "learning_rate": 2.9712717527563545e-06, + "loss": 0.3997, + "step": 5635 + }, + { + "epoch": 2.6647754137115838, + "grad_norm": 3.5483577251434326, + "learning_rate": 2.9706590953839335e-06, + "loss": 0.5109, + "step": 5636 + }, + { + "epoch": 2.6652482269503546, + "grad_norm": 2.746933698654175, + "learning_rate": 2.9700464087056345e-06, + "loss": 0.4672, + "step": 5637 + }, + { + "epoch": 2.6657210401891254, + "grad_norm": 2.704436779022217, + "learning_rate": 2.969433692759607e-06, + "loss": 0.4402, + "step": 5638 + }, + { + "epoch": 2.666193853427896, + "grad_norm": 2.859520196914673, + "learning_rate": 2.9688209475840005e-06, + "loss": 0.4679, + "step": 5639 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 2.518580436706543, + "learning_rate": 2.968208173216971e-06, + "loss": 0.3772, + "step": 5640 + }, + { + "epoch": 2.6671394799054373, + "grad_norm": 2.7624926567077637, + "learning_rate": 2.967595369696671e-06, + "loss": 0.4753, + "step": 5641 + }, + { + "epoch": 2.667612293144208, + "grad_norm": 2.654003620147705, + "learning_rate": 2.966982537061257e-06, + "loss": 0.4583, + "step": 5642 + }, + { + "epoch": 2.6680851063829785, + "grad_norm": 2.8473968505859375, + "learning_rate": 2.966369675348888e-06, + "loss": 0.4623, + "step": 5643 + }, + { + "epoch": 2.6685579196217493, + "grad_norm": 2.5587947368621826, + "learning_rate": 2.9657567845977253e-06, + "loss": 0.4014, + "step": 5644 + }, + { + "epoch": 2.66903073286052, + "grad_norm": 2.572220802307129, + "learning_rate": 2.96514386484593e-06, + "loss": 0.4249, + "step": 5645 + }, + { + "epoch": 2.669503546099291, + "grad_norm": 2.7995707988739014, + "learning_rate": 2.964530916131665e-06, + "loss": 0.4575, + "step": 5646 + }, + { + "epoch": 2.6699763593380617, + "grad_norm": 2.8712687492370605, + "learning_rate": 2.963917938493097e-06, + "loss": 0.4353, + "step": 5647 + }, + { + "epoch": 2.670449172576832, + "grad_norm": 2.856473207473755, + "learning_rate": 2.963304931968393e-06, + "loss": 0.4345, + "step": 5648 + }, + { + "epoch": 2.670921985815603, + "grad_norm": 2.709198474884033, + "learning_rate": 2.9626918965957224e-06, + "loss": 0.4116, + "step": 5649 + }, + { + "epoch": 2.6713947990543736, + "grad_norm": 2.8144607543945312, + "learning_rate": 2.962078832413257e-06, + "loss": 0.4575, + "step": 5650 + }, + { + "epoch": 2.671867612293144, + "grad_norm": 3.131911039352417, + "learning_rate": 2.961465739459168e-06, + "loss": 0.4743, + "step": 5651 + }, + { + "epoch": 2.6723404255319148, + "grad_norm": 2.8487515449523926, + "learning_rate": 2.9608526177716316e-06, + "loss": 0.4314, + "step": 5652 + }, + { + "epoch": 2.6728132387706856, + "grad_norm": 2.613229751586914, + "learning_rate": 2.960239467388823e-06, + "loss": 0.4807, + "step": 5653 + }, + { + "epoch": 2.6732860520094563, + "grad_norm": 2.5049116611480713, + "learning_rate": 2.9596262883489213e-06, + "loss": 0.4708, + "step": 5654 + }, + { + "epoch": 2.673758865248227, + "grad_norm": 2.6347460746765137, + "learning_rate": 2.9590130806901052e-06, + "loss": 0.3689, + "step": 5655 + }, + { + "epoch": 2.6742316784869975, + 
"grad_norm": 3.3290371894836426, + "learning_rate": 2.9583998444505578e-06, + "loss": 0.4674, + "step": 5656 + }, + { + "epoch": 2.6747044917257683, + "grad_norm": 2.748403549194336, + "learning_rate": 2.957786579668462e-06, + "loss": 0.3852, + "step": 5657 + }, + { + "epoch": 2.675177304964539, + "grad_norm": 2.837573766708374, + "learning_rate": 2.957173286382003e-06, + "loss": 0.4541, + "step": 5658 + }, + { + "epoch": 2.6756501182033094, + "grad_norm": 3.0976510047912598, + "learning_rate": 2.9565599646293686e-06, + "loss": 0.4669, + "step": 5659 + }, + { + "epoch": 2.6761229314420802, + "grad_norm": 2.7059597969055176, + "learning_rate": 2.955946614448747e-06, + "loss": 0.3935, + "step": 5660 + }, + { + "epoch": 2.676595744680851, + "grad_norm": 2.6700541973114014, + "learning_rate": 2.9553332358783294e-06, + "loss": 0.4322, + "step": 5661 + }, + { + "epoch": 2.677068557919622, + "grad_norm": 2.9782698154449463, + "learning_rate": 2.9547198289563068e-06, + "loss": 0.4338, + "step": 5662 + }, + { + "epoch": 2.6775413711583926, + "grad_norm": 2.637876510620117, + "learning_rate": 2.9541063937208755e-06, + "loss": 0.4289, + "step": 5663 + }, + { + "epoch": 2.678014184397163, + "grad_norm": 3.421949863433838, + "learning_rate": 2.953492930210229e-06, + "loss": 0.5458, + "step": 5664 + }, + { + "epoch": 2.678486997635934, + "grad_norm": 2.8273842334747314, + "learning_rate": 2.952879438462567e-06, + "loss": 0.4529, + "step": 5665 + }, + { + "epoch": 2.6789598108747046, + "grad_norm": 2.9090168476104736, + "learning_rate": 2.9522659185160873e-06, + "loss": 0.444, + "step": 5666 + }, + { + "epoch": 2.679432624113475, + "grad_norm": 2.646710157394409, + "learning_rate": 2.9516523704089927e-06, + "loss": 0.4226, + "step": 5667 + }, + { + "epoch": 2.6799054373522457, + "grad_norm": 2.65915584564209, + "learning_rate": 2.951038794179486e-06, + "loss": 0.4307, + "step": 5668 + }, + { + "epoch": 2.6803782505910165, + "grad_norm": 3.004507303237915, + "learning_rate": 2.950425189865771e-06, + "loss": 0.4799, + "step": 5669 + }, + { + "epoch": 2.6808510638297873, + "grad_norm": 2.5210134983062744, + "learning_rate": 2.949811557506054e-06, + "loss": 0.3842, + "step": 5670 + }, + { + "epoch": 2.681323877068558, + "grad_norm": 2.8072893619537354, + "learning_rate": 2.9491978971385436e-06, + "loss": 0.435, + "step": 5671 + }, + { + "epoch": 2.6817966903073285, + "grad_norm": 2.5701990127563477, + "learning_rate": 2.9485842088014498e-06, + "loss": 0.4932, + "step": 5672 + }, + { + "epoch": 2.6822695035460993, + "grad_norm": 2.9368457794189453, + "learning_rate": 2.9479704925329854e-06, + "loss": 0.455, + "step": 5673 + }, + { + "epoch": 2.68274231678487, + "grad_norm": 2.8576247692108154, + "learning_rate": 2.947356748371362e-06, + "loss": 0.4254, + "step": 5674 + }, + { + "epoch": 2.6832151300236404, + "grad_norm": 2.8999195098876953, + "learning_rate": 2.946742976354795e-06, + "loss": 0.4159, + "step": 5675 + }, + { + "epoch": 2.6836879432624112, + "grad_norm": 2.8439736366271973, + "learning_rate": 2.946129176521502e-06, + "loss": 0.4035, + "step": 5676 + }, + { + "epoch": 2.684160756501182, + "grad_norm": 2.8525729179382324, + "learning_rate": 2.945515348909702e-06, + "loss": 0.4137, + "step": 5677 + }, + { + "epoch": 2.684633569739953, + "grad_norm": 2.6573562622070312, + "learning_rate": 2.9449014935576147e-06, + "loss": 0.4203, + "step": 5678 + }, + { + "epoch": 2.6851063829787236, + "grad_norm": 2.765794277191162, + "learning_rate": 2.9442876105034616e-06, + "loss": 0.5184, + "step": 5679 + }, + 
{ + "epoch": 2.685579196217494, + "grad_norm": 2.694617748260498, + "learning_rate": 2.943673699785467e-06, + "loss": 0.417, + "step": 5680 + }, + { + "epoch": 2.6860520094562648, + "grad_norm": 2.740774393081665, + "learning_rate": 2.943059761441857e-06, + "loss": 0.4431, + "step": 5681 + }, + { + "epoch": 2.6865248226950356, + "grad_norm": 2.670642614364624, + "learning_rate": 2.942445795510859e-06, + "loss": 0.4298, + "step": 5682 + }, + { + "epoch": 2.686997635933806, + "grad_norm": 2.838907241821289, + "learning_rate": 2.9418318020307e-06, + "loss": 0.4529, + "step": 5683 + }, + { + "epoch": 2.6874704491725767, + "grad_norm": 2.562317371368408, + "learning_rate": 2.9412177810396135e-06, + "loss": 0.4251, + "step": 5684 + }, + { + "epoch": 2.6879432624113475, + "grad_norm": 2.5805928707122803, + "learning_rate": 2.9406037325758298e-06, + "loss": 0.4405, + "step": 5685 + }, + { + "epoch": 2.6884160756501183, + "grad_norm": 2.5701205730438232, + "learning_rate": 2.939989656677583e-06, + "loss": 0.4184, + "step": 5686 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 2.7990400791168213, + "learning_rate": 2.939375553383111e-06, + "loss": 0.4866, + "step": 5687 + }, + { + "epoch": 2.6893617021276595, + "grad_norm": 3.063319206237793, + "learning_rate": 2.9387614227306487e-06, + "loss": 0.4202, + "step": 5688 + }, + { + "epoch": 2.6898345153664303, + "grad_norm": 3.0891315937042236, + "learning_rate": 2.938147264758437e-06, + "loss": 0.4344, + "step": 5689 + }, + { + "epoch": 2.690307328605201, + "grad_norm": 2.8982670307159424, + "learning_rate": 2.9375330795047165e-06, + "loss": 0.4548, + "step": 5690 + }, + { + "epoch": 2.6907801418439714, + "grad_norm": 2.7947235107421875, + "learning_rate": 2.9369188670077293e-06, + "loss": 0.5028, + "step": 5691 + }, + { + "epoch": 2.691252955082742, + "grad_norm": 3.1615960597991943, + "learning_rate": 2.9363046273057206e-06, + "loss": 0.4855, + "step": 5692 + }, + { + "epoch": 2.691725768321513, + "grad_norm": 2.669516086578369, + "learning_rate": 2.935690360436935e-06, + "loss": 0.3813, + "step": 5693 + }, + { + "epoch": 2.692198581560284, + "grad_norm": 2.8743274211883545, + "learning_rate": 2.935076066439622e-06, + "loss": 0.4302, + "step": 5694 + }, + { + "epoch": 2.6926713947990546, + "grad_norm": 2.6829612255096436, + "learning_rate": 2.9344617453520295e-06, + "loss": 0.4063, + "step": 5695 + }, + { + "epoch": 2.693144208037825, + "grad_norm": 2.776447057723999, + "learning_rate": 2.9338473972124097e-06, + "loss": 0.4921, + "step": 5696 + }, + { + "epoch": 2.6936170212765957, + "grad_norm": 2.7865772247314453, + "learning_rate": 2.9332330220590143e-06, + "loss": 0.4939, + "step": 5697 + }, + { + "epoch": 2.6940898345153665, + "grad_norm": 3.020526170730591, + "learning_rate": 2.932618619930098e-06, + "loss": 0.4839, + "step": 5698 + }, + { + "epoch": 2.694562647754137, + "grad_norm": 2.637057065963745, + "learning_rate": 2.932004190863918e-06, + "loss": 0.4343, + "step": 5699 + }, + { + "epoch": 2.6950354609929077, + "grad_norm": 2.7426512241363525, + "learning_rate": 2.9313897348987314e-06, + "loss": 0.3609, + "step": 5700 + }, + { + "epoch": 2.6955082742316785, + "grad_norm": 2.767186164855957, + "learning_rate": 2.9307752520727974e-06, + "loss": 0.3793, + "step": 5701 + }, + { + "epoch": 2.6959810874704493, + "grad_norm": 2.4791622161865234, + "learning_rate": 2.930160742424377e-06, + "loss": 0.4192, + "step": 5702 + }, + { + "epoch": 2.69645390070922, + "grad_norm": 2.661461591720581, + "learning_rate": 2.9295462059917336e-06, + 
"loss": 0.4758, + "step": 5703 + }, + { + "epoch": 2.6969267139479904, + "grad_norm": 2.896242380142212, + "learning_rate": 2.928931642813131e-06, + "loss": 0.42, + "step": 5704 + }, + { + "epoch": 2.6973995271867612, + "grad_norm": 2.783813238143921, + "learning_rate": 2.9283170529268366e-06, + "loss": 0.4726, + "step": 5705 + }, + { + "epoch": 2.697872340425532, + "grad_norm": 2.4347333908081055, + "learning_rate": 2.927702436371117e-06, + "loss": 0.4199, + "step": 5706 + }, + { + "epoch": 2.6983451536643024, + "grad_norm": 2.4643805027008057, + "learning_rate": 2.927087793184242e-06, + "loss": 0.3578, + "step": 5707 + }, + { + "epoch": 2.698817966903073, + "grad_norm": 2.6396660804748535, + "learning_rate": 2.9264731234044835e-06, + "loss": 0.4509, + "step": 5708 + }, + { + "epoch": 2.699290780141844, + "grad_norm": 2.7341182231903076, + "learning_rate": 2.925858427070113e-06, + "loss": 0.4331, + "step": 5709 + }, + { + "epoch": 2.699763593380615, + "grad_norm": 2.7578938007354736, + "learning_rate": 2.9252437042194058e-06, + "loss": 0.4508, + "step": 5710 + }, + { + "epoch": 2.7002364066193856, + "grad_norm": 2.557788133621216, + "learning_rate": 2.9246289548906375e-06, + "loss": 0.3775, + "step": 5711 + }, + { + "epoch": 2.700709219858156, + "grad_norm": 2.802851676940918, + "learning_rate": 2.924014179122086e-06, + "loss": 0.4518, + "step": 5712 + }, + { + "epoch": 2.7011820330969267, + "grad_norm": 2.4773001670837402, + "learning_rate": 2.9233993769520313e-06, + "loss": 0.4019, + "step": 5713 + }, + { + "epoch": 2.7016548463356975, + "grad_norm": 3.108971357345581, + "learning_rate": 2.922784548418754e-06, + "loss": 0.4715, + "step": 5714 + }, + { + "epoch": 2.702127659574468, + "grad_norm": 2.8596770763397217, + "learning_rate": 2.9221696935605366e-06, + "loss": 0.4361, + "step": 5715 + }, + { + "epoch": 2.7026004728132387, + "grad_norm": 2.570604085922241, + "learning_rate": 2.9215548124156633e-06, + "loss": 0.3982, + "step": 5716 + }, + { + "epoch": 2.7030732860520095, + "grad_norm": 2.3157799243927, + "learning_rate": 2.9209399050224206e-06, + "loss": 0.456, + "step": 5717 + }, + { + "epoch": 2.7035460992907803, + "grad_norm": 2.6865758895874023, + "learning_rate": 2.9203249714190952e-06, + "loss": 0.4441, + "step": 5718 + }, + { + "epoch": 2.704018912529551, + "grad_norm": 2.76723313331604, + "learning_rate": 2.919710011643978e-06, + "loss": 0.464, + "step": 5719 + }, + { + "epoch": 2.7044917257683214, + "grad_norm": 2.648792028427124, + "learning_rate": 2.9190950257353578e-06, + "loss": 0.3426, + "step": 5720 + }, + { + "epoch": 2.704964539007092, + "grad_norm": 2.878739833831787, + "learning_rate": 2.9184800137315276e-06, + "loss": 0.4431, + "step": 5721 + }, + { + "epoch": 2.705437352245863, + "grad_norm": 2.670567274093628, + "learning_rate": 2.917864975670783e-06, + "loss": 0.4347, + "step": 5722 + }, + { + "epoch": 2.7059101654846334, + "grad_norm": 2.7031569480895996, + "learning_rate": 2.9172499115914184e-06, + "loss": 0.4557, + "step": 5723 + }, + { + "epoch": 2.706382978723404, + "grad_norm": 2.5225696563720703, + "learning_rate": 2.9166348215317314e-06, + "loss": 0.4159, + "step": 5724 + }, + { + "epoch": 2.706855791962175, + "grad_norm": 2.8676085472106934, + "learning_rate": 2.916019705530021e-06, + "loss": 0.5018, + "step": 5725 + }, + { + "epoch": 2.7073286052009458, + "grad_norm": 2.576463460922241, + "learning_rate": 2.915404563624587e-06, + "loss": 0.4317, + "step": 5726 + }, + { + "epoch": 2.7078014184397166, + "grad_norm": 3.155565023422241, + 
"learning_rate": 2.9147893958537328e-06, + "loss": 0.5029, + "step": 5727 + }, + { + "epoch": 2.708274231678487, + "grad_norm": 2.604079008102417, + "learning_rate": 2.9141742022557622e-06, + "loss": 0.4324, + "step": 5728 + }, + { + "epoch": 2.7087470449172577, + "grad_norm": 2.6597228050231934, + "learning_rate": 2.913558982868979e-06, + "loss": 0.4335, + "step": 5729 + }, + { + "epoch": 2.7092198581560285, + "grad_norm": 2.811384439468384, + "learning_rate": 2.9129437377316923e-06, + "loss": 0.4031, + "step": 5730 + }, + { + "epoch": 2.709692671394799, + "grad_norm": 3.1041207313537598, + "learning_rate": 2.91232846688221e-06, + "loss": 0.481, + "step": 5731 + }, + { + "epoch": 2.7101654846335697, + "grad_norm": 2.5992188453674316, + "learning_rate": 2.9117131703588414e-06, + "loss": 0.4266, + "step": 5732 + }, + { + "epoch": 2.7106382978723405, + "grad_norm": 2.7726242542266846, + "learning_rate": 2.911097848199899e-06, + "loss": 0.4464, + "step": 5733 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 2.8683483600616455, + "learning_rate": 2.9104825004436966e-06, + "loss": 0.4248, + "step": 5734 + }, + { + "epoch": 2.711583924349882, + "grad_norm": 2.776386022567749, + "learning_rate": 2.9098671271285484e-06, + "loss": 0.4556, + "step": 5735 + }, + { + "epoch": 2.7120567375886524, + "grad_norm": 2.7612528800964355, + "learning_rate": 2.909251728292771e-06, + "loss": 0.455, + "step": 5736 + }, + { + "epoch": 2.712529550827423, + "grad_norm": 2.9223551750183105, + "learning_rate": 2.908636303974684e-06, + "loss": 0.4302, + "step": 5737 + }, + { + "epoch": 2.713002364066194, + "grad_norm": 2.898226022720337, + "learning_rate": 2.908020854212606e-06, + "loss": 0.4827, + "step": 5738 + }, + { + "epoch": 2.7134751773049643, + "grad_norm": 2.706361770629883, + "learning_rate": 2.9074053790448576e-06, + "loss": 0.4444, + "step": 5739 + }, + { + "epoch": 2.713947990543735, + "grad_norm": 2.8227248191833496, + "learning_rate": 2.9067898785097637e-06, + "loss": 0.4661, + "step": 5740 + }, + { + "epoch": 2.714420803782506, + "grad_norm": 2.597837448120117, + "learning_rate": 2.9061743526456474e-06, + "loss": 0.4646, + "step": 5741 + }, + { + "epoch": 2.7148936170212767, + "grad_norm": 2.5525131225585938, + "learning_rate": 2.9055588014908354e-06, + "loss": 0.4172, + "step": 5742 + }, + { + "epoch": 2.7153664302600475, + "grad_norm": 2.713071823120117, + "learning_rate": 2.904943225083655e-06, + "loss": 0.4893, + "step": 5743 + }, + { + "epoch": 2.715839243498818, + "grad_norm": 2.538623571395874, + "learning_rate": 2.9043276234624353e-06, + "loss": 0.3905, + "step": 5744 + }, + { + "epoch": 2.7163120567375887, + "grad_norm": 2.5190389156341553, + "learning_rate": 2.9037119966655076e-06, + "loss": 0.4318, + "step": 5745 + }, + { + "epoch": 2.7167848699763595, + "grad_norm": 2.6587612628936768, + "learning_rate": 2.903096344731204e-06, + "loss": 0.4153, + "step": 5746 + }, + { + "epoch": 2.71725768321513, + "grad_norm": 2.836731433868408, + "learning_rate": 2.902480667697859e-06, + "loss": 0.4779, + "step": 5747 + }, + { + "epoch": 2.7177304964539006, + "grad_norm": 2.8076045513153076, + "learning_rate": 2.9018649656038074e-06, + "loss": 0.5126, + "step": 5748 + }, + { + "epoch": 2.7182033096926714, + "grad_norm": 2.8930516242980957, + "learning_rate": 2.9012492384873865e-06, + "loss": 0.4561, + "step": 5749 + }, + { + "epoch": 2.7186761229314422, + "grad_norm": 2.7000370025634766, + "learning_rate": 2.9006334863869343e-06, + "loss": 0.4659, + "step": 5750 + }, + { + "epoch": 2.719148936170213, 
+ "grad_norm": 2.927011251449585, + "learning_rate": 2.9000177093407926e-06, + "loss": 0.5123, + "step": 5751 + }, + { + "epoch": 2.7196217494089834, + "grad_norm": 3.0102779865264893, + "learning_rate": 2.8994019073873015e-06, + "loss": 0.3972, + "step": 5752 + }, + { + "epoch": 2.720094562647754, + "grad_norm": 2.778838634490967, + "learning_rate": 2.8987860805648054e-06, + "loss": 0.4922, + "step": 5753 + }, + { + "epoch": 2.720567375886525, + "grad_norm": 2.6150314807891846, + "learning_rate": 2.898170228911648e-06, + "loss": 0.4425, + "step": 5754 + }, + { + "epoch": 2.7210401891252953, + "grad_norm": 2.9329984188079834, + "learning_rate": 2.8975543524661777e-06, + "loss": 0.4872, + "step": 5755 + }, + { + "epoch": 2.721513002364066, + "grad_norm": 2.756803512573242, + "learning_rate": 2.8969384512667404e-06, + "loss": 0.4362, + "step": 5756 + }, + { + "epoch": 2.721985815602837, + "grad_norm": 2.600877285003662, + "learning_rate": 2.896322525351686e-06, + "loss": 0.4802, + "step": 5757 + }, + { + "epoch": 2.7224586288416077, + "grad_norm": 2.647069215774536, + "learning_rate": 2.8957065747593655e-06, + "loss": 0.4649, + "step": 5758 + }, + { + "epoch": 2.7229314420803785, + "grad_norm": 2.845388174057007, + "learning_rate": 2.895090599528132e-06, + "loss": 0.4533, + "step": 5759 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 2.973881721496582, + "learning_rate": 2.8944745996963397e-06, + "loss": 0.4959, + "step": 5760 + }, + { + "epoch": 2.7238770685579197, + "grad_norm": 2.8995487689971924, + "learning_rate": 2.8938585753023435e-06, + "loss": 0.4597, + "step": 5761 + }, + { + "epoch": 2.7243498817966905, + "grad_norm": 2.903693437576294, + "learning_rate": 2.8932425263845004e-06, + "loss": 0.4521, + "step": 5762 + }, + { + "epoch": 2.724822695035461, + "grad_norm": 2.7609009742736816, + "learning_rate": 2.8926264529811702e-06, + "loss": 0.4399, + "step": 5763 + }, + { + "epoch": 2.7252955082742316, + "grad_norm": 2.788787603378296, + "learning_rate": 2.892010355130712e-06, + "loss": 0.4614, + "step": 5764 + }, + { + "epoch": 2.7257683215130024, + "grad_norm": 2.786498785018921, + "learning_rate": 2.8913942328714887e-06, + "loss": 0.4798, + "step": 5765 + }, + { + "epoch": 2.726241134751773, + "grad_norm": 2.9809393882751465, + "learning_rate": 2.8907780862418616e-06, + "loss": 0.5108, + "step": 5766 + }, + { + "epoch": 2.726713947990544, + "grad_norm": 2.6621177196502686, + "learning_rate": 2.8901619152801967e-06, + "loss": 0.4031, + "step": 5767 + }, + { + "epoch": 2.7271867612293144, + "grad_norm": 3.3092098236083984, + "learning_rate": 2.8895457200248607e-06, + "loss": 0.4671, + "step": 5768 + }, + { + "epoch": 2.727659574468085, + "grad_norm": 2.866306781768799, + "learning_rate": 2.8889295005142204e-06, + "loss": 0.4434, + "step": 5769 + }, + { + "epoch": 2.728132387706856, + "grad_norm": 2.6861231327056885, + "learning_rate": 2.888313256786646e-06, + "loss": 0.429, + "step": 5770 + }, + { + "epoch": 2.7286052009456263, + "grad_norm": 2.873180389404297, + "learning_rate": 2.8876969888805072e-06, + "loss": 0.4412, + "step": 5771 + }, + { + "epoch": 2.729078014184397, + "grad_norm": 2.511678695678711, + "learning_rate": 2.887080696834178e-06, + "loss": 0.4024, + "step": 5772 + }, + { + "epoch": 2.729550827423168, + "grad_norm": 2.6502726078033447, + "learning_rate": 2.88646438068603e-06, + "loss": 0.4357, + "step": 5773 + }, + { + "epoch": 2.7300236406619387, + "grad_norm": 2.7156145572662354, + "learning_rate": 2.8858480404744403e-06, + "loss": 0.4511, + "step": 5774 + }, 
+ { + "epoch": 2.7304964539007095, + "grad_norm": 2.882582187652588, + "learning_rate": 2.8852316762377842e-06, + "loss": 0.4822, + "step": 5775 + }, + { + "epoch": 2.73096926713948, + "grad_norm": 2.7139666080474854, + "learning_rate": 2.8846152880144413e-06, + "loss": 0.4666, + "step": 5776 + }, + { + "epoch": 2.7314420803782506, + "grad_norm": 2.7453949451446533, + "learning_rate": 2.8839988758427907e-06, + "loss": 0.3927, + "step": 5777 + }, + { + "epoch": 2.731914893617021, + "grad_norm": 2.7859580516815186, + "learning_rate": 2.883382439761214e-06, + "loss": 0.4466, + "step": 5778 + }, + { + "epoch": 2.732387706855792, + "grad_norm": 2.695234537124634, + "learning_rate": 2.882765979808094e-06, + "loss": 0.4227, + "step": 5779 + }, + { + "epoch": 2.7328605200945626, + "grad_norm": 2.8081552982330322, + "learning_rate": 2.8821494960218148e-06, + "loss": 0.447, + "step": 5780 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 2.887643337249756, + "learning_rate": 2.881532988440762e-06, + "loss": 0.5018, + "step": 5781 + }, + { + "epoch": 2.733806146572104, + "grad_norm": 3.108212471008301, + "learning_rate": 2.8809164571033233e-06, + "loss": 0.4132, + "step": 5782 + }, + { + "epoch": 2.7342789598108745, + "grad_norm": 2.874328374862671, + "learning_rate": 2.880299902047886e-06, + "loss": 0.4618, + "step": 5783 + }, + { + "epoch": 2.7347517730496453, + "grad_norm": 3.089132308959961, + "learning_rate": 2.879683323312843e-06, + "loss": 0.4956, + "step": 5784 + }, + { + "epoch": 2.735224586288416, + "grad_norm": 2.5173206329345703, + "learning_rate": 2.879066720936583e-06, + "loss": 0.4087, + "step": 5785 + }, + { + "epoch": 2.7356973995271865, + "grad_norm": 2.6401286125183105, + "learning_rate": 2.8784500949575014e-06, + "loss": 0.3995, + "step": 5786 + }, + { + "epoch": 2.7361702127659573, + "grad_norm": 2.9371910095214844, + "learning_rate": 2.877833445413991e-06, + "loss": 0.5209, + "step": 5787 + }, + { + "epoch": 2.736643026004728, + "grad_norm": 3.218158006668091, + "learning_rate": 2.8772167723444498e-06, + "loss": 0.4275, + "step": 5788 + }, + { + "epoch": 2.737115839243499, + "grad_norm": 2.9072160720825195, + "learning_rate": 2.8766000757872736e-06, + "loss": 0.4244, + "step": 5789 + }, + { + "epoch": 2.7375886524822697, + "grad_norm": 3.0378096103668213, + "learning_rate": 2.8759833557808614e-06, + "loss": 0.507, + "step": 5790 + }, + { + "epoch": 2.73806146572104, + "grad_norm": 2.728353977203369, + "learning_rate": 2.8753666123636148e-06, + "loss": 0.413, + "step": 5791 + }, + { + "epoch": 2.738534278959811, + "grad_norm": 2.6869957447052, + "learning_rate": 2.874749845573935e-06, + "loss": 0.44, + "step": 5792 + }, + { + "epoch": 2.7390070921985816, + "grad_norm": 2.6381702423095703, + "learning_rate": 2.8741330554502263e-06, + "loss": 0.4708, + "step": 5793 + }, + { + "epoch": 2.739479905437352, + "grad_norm": 2.6944689750671387, + "learning_rate": 2.873516242030892e-06, + "loss": 0.4555, + "step": 5794 + }, + { + "epoch": 2.739952718676123, + "grad_norm": 3.168473243713379, + "learning_rate": 2.8728994053543396e-06, + "loss": 0.4538, + "step": 5795 + }, + { + "epoch": 2.7404255319148936, + "grad_norm": 2.7504515647888184, + "learning_rate": 2.872282545458976e-06, + "loss": 0.4628, + "step": 5796 + }, + { + "epoch": 2.7408983451536644, + "grad_norm": 2.896462917327881, + "learning_rate": 2.8716656623832114e-06, + "loss": 0.4946, + "step": 5797 + }, + { + "epoch": 2.741371158392435, + "grad_norm": 2.8053417205810547, + "learning_rate": 2.8710487561654547e-06, + "loss": 
0.4893, + "step": 5798 + }, + { + "epoch": 2.7418439716312055, + "grad_norm": 2.63171124458313, + "learning_rate": 2.870431826844119e-06, + "loss": 0.4257, + "step": 5799 + }, + { + "epoch": 2.7423167848699763, + "grad_norm": 3.0963807106018066, + "learning_rate": 2.869814874457618e-06, + "loss": 0.5404, + "step": 5800 + }, + { + "epoch": 2.742789598108747, + "grad_norm": 2.591132164001465, + "learning_rate": 2.8691978990443664e-06, + "loss": 0.4015, + "step": 5801 + }, + { + "epoch": 2.7432624113475175, + "grad_norm": 3.0319552421569824, + "learning_rate": 2.8685809006427812e-06, + "loss": 0.4411, + "step": 5802 + }, + { + "epoch": 2.7437352245862883, + "grad_norm": 2.7791874408721924, + "learning_rate": 2.8679638792912784e-06, + "loss": 0.43, + "step": 5803 + }, + { + "epoch": 2.744208037825059, + "grad_norm": 3.530632495880127, + "learning_rate": 2.867346835028279e-06, + "loss": 0.4581, + "step": 5804 + }, + { + "epoch": 2.74468085106383, + "grad_norm": 3.2043099403381348, + "learning_rate": 2.8667297678922024e-06, + "loss": 0.4375, + "step": 5805 + }, + { + "epoch": 2.7451536643026007, + "grad_norm": 2.8442344665527344, + "learning_rate": 2.8661126779214716e-06, + "loss": 0.4059, + "step": 5806 + }, + { + "epoch": 2.745626477541371, + "grad_norm": 2.7561380863189697, + "learning_rate": 2.86549556515451e-06, + "loss": 0.4391, + "step": 5807 + }, + { + "epoch": 2.746099290780142, + "grad_norm": 3.229663848876953, + "learning_rate": 2.8648784296297418e-06, + "loss": 0.4579, + "step": 5808 + }, + { + "epoch": 2.7465721040189126, + "grad_norm": 2.8375027179718018, + "learning_rate": 2.864261271385593e-06, + "loss": 0.4566, + "step": 5809 + }, + { + "epoch": 2.747044917257683, + "grad_norm": 2.392998695373535, + "learning_rate": 2.863644090460493e-06, + "loss": 0.4123, + "step": 5810 + }, + { + "epoch": 2.7475177304964538, + "grad_norm": 2.707610607147217, + "learning_rate": 2.86302688689287e-06, + "loss": 0.4299, + "step": 5811 + }, + { + "epoch": 2.7479905437352246, + "grad_norm": 2.824042797088623, + "learning_rate": 2.8624096607211547e-06, + "loss": 0.3799, + "step": 5812 + }, + { + "epoch": 2.7484633569739954, + "grad_norm": 2.933102607727051, + "learning_rate": 2.861792411983779e-06, + "loss": 0.4416, + "step": 5813 + }, + { + "epoch": 2.748936170212766, + "grad_norm": 2.7000277042388916, + "learning_rate": 2.8611751407191757e-06, + "loss": 0.4346, + "step": 5814 + }, + { + "epoch": 2.7494089834515365, + "grad_norm": 2.6757142543792725, + "learning_rate": 2.86055784696578e-06, + "loss": 0.4559, + "step": 5815 + }, + { + "epoch": 2.7498817966903073, + "grad_norm": 2.5791053771972656, + "learning_rate": 2.8599405307620287e-06, + "loss": 0.4619, + "step": 5816 + }, + { + "epoch": 2.750354609929078, + "grad_norm": 3.0327374935150146, + "learning_rate": 2.859323192146359e-06, + "loss": 0.4163, + "step": 5817 + }, + { + "epoch": 2.7508274231678485, + "grad_norm": 2.5580220222473145, + "learning_rate": 2.8587058311572084e-06, + "loss": 0.4005, + "step": 5818 + }, + { + "epoch": 2.7513002364066192, + "grad_norm": 2.592179536819458, + "learning_rate": 2.85808844783302e-06, + "loss": 0.4404, + "step": 5819 + }, + { + "epoch": 2.75177304964539, + "grad_norm": 3.2779927253723145, + "learning_rate": 2.8574710422122342e-06, + "loss": 0.54, + "step": 5820 + }, + { + "epoch": 2.752245862884161, + "grad_norm": 2.4804370403289795, + "learning_rate": 2.8568536143332933e-06, + "loss": 0.4476, + "step": 5821 + }, + { + "epoch": 2.7527186761229316, + "grad_norm": 2.649477481842041, + "learning_rate": 
2.8562361642346427e-06, + "loss": 0.4336, + "step": 5822 + }, + { + "epoch": 2.753191489361702, + "grad_norm": 3.138587474822998, + "learning_rate": 2.855618691954728e-06, + "loss": 0.5042, + "step": 5823 + }, + { + "epoch": 2.753664302600473, + "grad_norm": 2.75093412399292, + "learning_rate": 2.855001197531997e-06, + "loss": 0.4327, + "step": 5824 + }, + { + "epoch": 2.7541371158392436, + "grad_norm": 2.678809642791748, + "learning_rate": 2.854383681004898e-06, + "loss": 0.4409, + "step": 5825 + }, + { + "epoch": 2.754609929078014, + "grad_norm": 2.965386390686035, + "learning_rate": 2.853766142411881e-06, + "loss": 0.4716, + "step": 5826 + }, + { + "epoch": 2.7550827423167847, + "grad_norm": 2.6419436931610107, + "learning_rate": 2.853148581791398e-06, + "loss": 0.4367, + "step": 5827 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 3.205794095993042, + "learning_rate": 2.8525309991819004e-06, + "loss": 0.4869, + "step": 5828 + }, + { + "epoch": 2.7560283687943263, + "grad_norm": 3.041008472442627, + "learning_rate": 2.851913394621844e-06, + "loss": 0.5087, + "step": 5829 + }, + { + "epoch": 2.756501182033097, + "grad_norm": 2.6525566577911377, + "learning_rate": 2.851295768149684e-06, + "loss": 0.3951, + "step": 5830 + }, + { + "epoch": 2.7569739952718675, + "grad_norm": 2.732220411300659, + "learning_rate": 2.850678119803876e-06, + "loss": 0.4797, + "step": 5831 + }, + { + "epoch": 2.7574468085106383, + "grad_norm": 2.8965251445770264, + "learning_rate": 2.8500604496228797e-06, + "loss": 0.4938, + "step": 5832 + }, + { + "epoch": 2.757919621749409, + "grad_norm": 2.48020076751709, + "learning_rate": 2.849442757645154e-06, + "loss": 0.4172, + "step": 5833 + }, + { + "epoch": 2.7583924349881794, + "grad_norm": 2.4764912128448486, + "learning_rate": 2.8488250439091603e-06, + "loss": 0.4123, + "step": 5834 + }, + { + "epoch": 2.7588652482269502, + "grad_norm": 2.4547016620635986, + "learning_rate": 2.84820730845336e-06, + "loss": 0.4116, + "step": 5835 + }, + { + "epoch": 2.759338061465721, + "grad_norm": 2.55476975440979, + "learning_rate": 2.847589551316218e-06, + "loss": 0.4744, + "step": 5836 + }, + { + "epoch": 2.759810874704492, + "grad_norm": 2.3866238594055176, + "learning_rate": 2.846971772536199e-06, + "loss": 0.4406, + "step": 5837 + }, + { + "epoch": 2.7602836879432626, + "grad_norm": 2.855318784713745, + "learning_rate": 2.8463539721517687e-06, + "loss": 0.4517, + "step": 5838 + }, + { + "epoch": 2.760756501182033, + "grad_norm": 2.527198314666748, + "learning_rate": 2.8457361502013954e-06, + "loss": 0.3588, + "step": 5839 + }, + { + "epoch": 2.7612293144208038, + "grad_norm": 2.6761462688446045, + "learning_rate": 2.8451183067235476e-06, + "loss": 0.4192, + "step": 5840 + }, + { + "epoch": 2.7617021276595746, + "grad_norm": 2.5692319869995117, + "learning_rate": 2.8445004417566967e-06, + "loss": 0.4108, + "step": 5841 + }, + { + "epoch": 2.762174940898345, + "grad_norm": 2.5721096992492676, + "learning_rate": 2.8438825553393133e-06, + "loss": 0.3941, + "step": 5842 + }, + { + "epoch": 2.7626477541371157, + "grad_norm": 2.699430227279663, + "learning_rate": 2.843264647509872e-06, + "loss": 0.4418, + "step": 5843 + }, + { + "epoch": 2.7631205673758865, + "grad_norm": 2.6943318843841553, + "learning_rate": 2.842646718306846e-06, + "loss": 0.4505, + "step": 5844 + }, + { + "epoch": 2.7635933806146573, + "grad_norm": 2.661656379699707, + "learning_rate": 2.8420287677687107e-06, + "loss": 0.4413, + "step": 5845 + }, + { + "epoch": 2.764066193853428, + "grad_norm": 
2.830467939376831, + "learning_rate": 2.8414107959339444e-06, + "loss": 0.5095, + "step": 5846 + }, + { + "epoch": 2.7645390070921985, + "grad_norm": 2.598053455352783, + "learning_rate": 2.840792802841024e-06, + "loss": 0.4029, + "step": 5847 + }, + { + "epoch": 2.7650118203309693, + "grad_norm": 2.641700029373169, + "learning_rate": 2.8401747885284316e-06, + "loss": 0.4237, + "step": 5848 + }, + { + "epoch": 2.76548463356974, + "grad_norm": 2.6672768592834473, + "learning_rate": 2.8395567530346454e-06, + "loss": 0.4181, + "step": 5849 + }, + { + "epoch": 2.7659574468085104, + "grad_norm": 2.5851705074310303, + "learning_rate": 2.838938696398149e-06, + "loss": 0.4165, + "step": 5850 + }, + { + "epoch": 2.766430260047281, + "grad_norm": 2.318120002746582, + "learning_rate": 2.8383206186574276e-06, + "loss": 0.3578, + "step": 5851 + }, + { + "epoch": 2.766903073286052, + "grad_norm": 2.6199793815612793, + "learning_rate": 2.8377025198509635e-06, + "loss": 0.4719, + "step": 5852 + }, + { + "epoch": 2.767375886524823, + "grad_norm": 2.7186086177825928, + "learning_rate": 2.837084400017245e-06, + "loss": 0.41, + "step": 5853 + }, + { + "epoch": 2.7678486997635936, + "grad_norm": 2.702514886856079, + "learning_rate": 2.8364662591947583e-06, + "loss": 0.4659, + "step": 5854 + }, + { + "epoch": 2.768321513002364, + "grad_norm": 2.612375259399414, + "learning_rate": 2.835848097421993e-06, + "loss": 0.4252, + "step": 5855 + }, + { + "epoch": 2.7687943262411348, + "grad_norm": 3.0127978324890137, + "learning_rate": 2.8352299147374394e-06, + "loss": 0.4084, + "step": 5856 + }, + { + "epoch": 2.7692671394799055, + "grad_norm": 2.6460049152374268, + "learning_rate": 2.83461171117959e-06, + "loss": 0.4035, + "step": 5857 + }, + { + "epoch": 2.769739952718676, + "grad_norm": 2.9844725131988525, + "learning_rate": 2.8339934867869357e-06, + "loss": 0.4912, + "step": 5858 + }, + { + "epoch": 2.7702127659574467, + "grad_norm": 2.731217861175537, + "learning_rate": 2.833375241597972e-06, + "loss": 0.4112, + "step": 5859 + }, + { + "epoch": 2.7706855791962175, + "grad_norm": 2.731194496154785, + "learning_rate": 2.832756975651193e-06, + "loss": 0.4516, + "step": 5860 + }, + { + "epoch": 2.7711583924349883, + "grad_norm": 3.0532076358795166, + "learning_rate": 2.8321386889850965e-06, + "loss": 0.3959, + "step": 5861 + }, + { + "epoch": 2.771631205673759, + "grad_norm": 3.5437800884246826, + "learning_rate": 2.831520381638181e-06, + "loss": 0.6055, + "step": 5862 + }, + { + "epoch": 2.7721040189125294, + "grad_norm": 2.4297714233398438, + "learning_rate": 2.830902053648944e-06, + "loss": 0.4038, + "step": 5863 + }, + { + "epoch": 2.7725768321513002, + "grad_norm": 2.696768045425415, + "learning_rate": 2.8302837050558876e-06, + "loss": 0.3983, + "step": 5864 + }, + { + "epoch": 2.773049645390071, + "grad_norm": 2.6574649810791016, + "learning_rate": 2.8296653358975122e-06, + "loss": 0.4937, + "step": 5865 + }, + { + "epoch": 2.7735224586288414, + "grad_norm": 2.9393341541290283, + "learning_rate": 2.8290469462123234e-06, + "loss": 0.4603, + "step": 5866 + }, + { + "epoch": 2.773995271867612, + "grad_norm": 2.7630696296691895, + "learning_rate": 2.828428536038824e-06, + "loss": 0.4663, + "step": 5867 + }, + { + "epoch": 2.774468085106383, + "grad_norm": 2.7354233264923096, + "learning_rate": 2.8278101054155183e-06, + "loss": 0.4444, + "step": 5868 + }, + { + "epoch": 2.774940898345154, + "grad_norm": 3.0489425659179688, + "learning_rate": 2.827191654380915e-06, + "loss": 0.4684, + "step": 5869 + }, + { + "epoch": 
2.7754137115839246, + "grad_norm": 2.9602572917938232, + "learning_rate": 2.8265731829735226e-06, + "loss": 0.4571, + "step": 5870 + }, + { + "epoch": 2.775886524822695, + "grad_norm": 2.774132013320923, + "learning_rate": 2.825954691231851e-06, + "loss": 0.4458, + "step": 5871 + }, + { + "epoch": 2.7763593380614657, + "grad_norm": 2.696622133255005, + "learning_rate": 2.825336179194409e-06, + "loss": 0.4933, + "step": 5872 + }, + { + "epoch": 2.7768321513002365, + "grad_norm": 2.742184638977051, + "learning_rate": 2.8247176468997096e-06, + "loss": 0.4464, + "step": 5873 + }, + { + "epoch": 2.777304964539007, + "grad_norm": 2.7033183574676514, + "learning_rate": 2.824099094386266e-06, + "loss": 0.4369, + "step": 5874 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 2.7264044284820557, + "learning_rate": 2.8234805216925935e-06, + "loss": 0.4621, + "step": 5875 + }, + { + "epoch": 2.7782505910165485, + "grad_norm": 2.6417739391326904, + "learning_rate": 2.822861928857208e-06, + "loss": 0.4254, + "step": 5876 + }, + { + "epoch": 2.7787234042553193, + "grad_norm": 3.17209529876709, + "learning_rate": 2.8222433159186245e-06, + "loss": 0.5011, + "step": 5877 + }, + { + "epoch": 2.77919621749409, + "grad_norm": 3.1434381008148193, + "learning_rate": 2.8216246829153633e-06, + "loss": 0.4567, + "step": 5878 + }, + { + "epoch": 2.7796690307328604, + "grad_norm": 2.781608819961548, + "learning_rate": 2.821006029885943e-06, + "loss": 0.4723, + "step": 5879 + }, + { + "epoch": 2.780141843971631, + "grad_norm": 3.00079345703125, + "learning_rate": 2.820387356868885e-06, + "loss": 0.4796, + "step": 5880 + }, + { + "epoch": 2.780614657210402, + "grad_norm": 2.703555107116699, + "learning_rate": 2.819768663902712e-06, + "loss": 0.4577, + "step": 5881 + }, + { + "epoch": 2.7810874704491724, + "grad_norm": 2.5741801261901855, + "learning_rate": 2.8191499510259453e-06, + "loss": 0.4255, + "step": 5882 + }, + { + "epoch": 2.781560283687943, + "grad_norm": 2.9871208667755127, + "learning_rate": 2.8185312182771112e-06, + "loss": 0.4495, + "step": 5883 + }, + { + "epoch": 2.782033096926714, + "grad_norm": 2.525317668914795, + "learning_rate": 2.8179124656947343e-06, + "loss": 0.4428, + "step": 5884 + }, + { + "epoch": 2.7825059101654848, + "grad_norm": 2.525092840194702, + "learning_rate": 2.817293693317343e-06, + "loss": 0.4348, + "step": 5885 + }, + { + "epoch": 2.7829787234042556, + "grad_norm": 2.8485171794891357, + "learning_rate": 2.816674901183464e-06, + "loss": 0.4206, + "step": 5886 + }, + { + "epoch": 2.783451536643026, + "grad_norm": 2.6612746715545654, + "learning_rate": 2.8160560893316272e-06, + "loss": 0.396, + "step": 5887 + }, + { + "epoch": 2.7839243498817967, + "grad_norm": 2.7093865871429443, + "learning_rate": 2.815437257800364e-06, + "loss": 0.4468, + "step": 5888 + }, + { + "epoch": 2.7843971631205675, + "grad_norm": 2.6130900382995605, + "learning_rate": 2.814818406628206e-06, + "loss": 0.443, + "step": 5889 + }, + { + "epoch": 2.784869976359338, + "grad_norm": 2.8147552013397217, + "learning_rate": 2.8141995358536866e-06, + "loss": 0.4454, + "step": 5890 + }, + { + "epoch": 2.7853427895981087, + "grad_norm": 2.5621275901794434, + "learning_rate": 2.8135806455153395e-06, + "loss": 0.439, + "step": 5891 + }, + { + "epoch": 2.7858156028368795, + "grad_norm": 2.880228281021118, + "learning_rate": 2.812961735651701e-06, + "loss": 0.3895, + "step": 5892 + }, + { + "epoch": 2.7862884160756503, + "grad_norm": 2.5861377716064453, + "learning_rate": 2.8123428063013068e-06, + "loss": 0.4402, + 
"step": 5893 + }, + { + "epoch": 2.786761229314421, + "grad_norm": 2.9707765579223633, + "learning_rate": 2.811723857502696e-06, + "loss": 0.4461, + "step": 5894 + }, + { + "epoch": 2.7872340425531914, + "grad_norm": 2.923999309539795, + "learning_rate": 2.811104889294408e-06, + "loss": 0.4395, + "step": 5895 + }, + { + "epoch": 2.787706855791962, + "grad_norm": 2.846933603286743, + "learning_rate": 2.810485901714981e-06, + "loss": 0.5168, + "step": 5896 + }, + { + "epoch": 2.788179669030733, + "grad_norm": 4.1052350997924805, + "learning_rate": 2.8098668948029597e-06, + "loss": 0.5152, + "step": 5897 + }, + { + "epoch": 2.7886524822695034, + "grad_norm": 2.7391018867492676, + "learning_rate": 2.8092478685968856e-06, + "loss": 0.4515, + "step": 5898 + }, + { + "epoch": 2.789125295508274, + "grad_norm": 2.976088285446167, + "learning_rate": 2.8086288231353027e-06, + "loss": 0.5156, + "step": 5899 + }, + { + "epoch": 2.789598108747045, + "grad_norm": 2.6139633655548096, + "learning_rate": 2.8080097584567562e-06, + "loss": 0.4237, + "step": 5900 + }, + { + "epoch": 2.7900709219858157, + "grad_norm": 2.501654624938965, + "learning_rate": 2.807390674599792e-06, + "loss": 0.4349, + "step": 5901 + }, + { + "epoch": 2.7905437352245865, + "grad_norm": 2.8814525604248047, + "learning_rate": 2.8067715716029586e-06, + "loss": 0.4866, + "step": 5902 + }, + { + "epoch": 2.791016548463357, + "grad_norm": 2.7953200340270996, + "learning_rate": 2.8061524495048046e-06, + "loss": 0.3964, + "step": 5903 + }, + { + "epoch": 2.7914893617021277, + "grad_norm": 2.7362849712371826, + "learning_rate": 2.8055333083438808e-06, + "loss": 0.4181, + "step": 5904 + }, + { + "epoch": 2.7919621749408985, + "grad_norm": 2.9740512371063232, + "learning_rate": 2.8049141481587366e-06, + "loss": 0.4784, + "step": 5905 + }, + { + "epoch": 2.792434988179669, + "grad_norm": 2.595813274383545, + "learning_rate": 2.8042949689879262e-06, + "loss": 0.4421, + "step": 5906 + }, + { + "epoch": 2.7929078014184396, + "grad_norm": 2.886899948120117, + "learning_rate": 2.803675770870002e-06, + "loss": 0.4435, + "step": 5907 + }, + { + "epoch": 2.7933806146572104, + "grad_norm": 2.6057486534118652, + "learning_rate": 2.8030565538435196e-06, + "loss": 0.4472, + "step": 5908 + }, + { + "epoch": 2.7938534278959812, + "grad_norm": 2.7422802448272705, + "learning_rate": 2.802437317947034e-06, + "loss": 0.4799, + "step": 5909 + }, + { + "epoch": 2.794326241134752, + "grad_norm": 2.3904244899749756, + "learning_rate": 2.801818063219102e-06, + "loss": 0.4508, + "step": 5910 + }, + { + "epoch": 2.7947990543735224, + "grad_norm": 2.8434207439422607, + "learning_rate": 2.8011987896982835e-06, + "loss": 0.4473, + "step": 5911 + }, + { + "epoch": 2.795271867612293, + "grad_norm": 2.916088819503784, + "learning_rate": 2.8005794974231366e-06, + "loss": 0.464, + "step": 5912 + }, + { + "epoch": 2.795744680851064, + "grad_norm": 2.6483397483825684, + "learning_rate": 2.7999601864322236e-06, + "loss": 0.441, + "step": 5913 + }, + { + "epoch": 2.7962174940898343, + "grad_norm": 2.9287428855895996, + "learning_rate": 2.7993408567641033e-06, + "loss": 0.4551, + "step": 5914 + }, + { + "epoch": 2.796690307328605, + "grad_norm": 2.575024127960205, + "learning_rate": 2.798721508457342e-06, + "loss": 0.4494, + "step": 5915 + }, + { + "epoch": 2.797163120567376, + "grad_norm": 2.7156829833984375, + "learning_rate": 2.7981021415505015e-06, + "loss": 0.419, + "step": 5916 + }, + { + "epoch": 2.7976359338061467, + "grad_norm": 2.850553035736084, + "learning_rate": 
2.7974827560821482e-06, + "loss": 0.4709, + "step": 5917 + }, + { + "epoch": 2.7981087470449175, + "grad_norm": 2.673846483230591, + "learning_rate": 2.796863352090847e-06, + "loss": 0.4224, + "step": 5918 + }, + { + "epoch": 2.798581560283688, + "grad_norm": 2.9093217849731445, + "learning_rate": 2.796243929615168e-06, + "loss": 0.468, + "step": 5919 + }, + { + "epoch": 2.7990543735224587, + "grad_norm": 2.4853813648223877, + "learning_rate": 2.7956244886936775e-06, + "loss": 0.4723, + "step": 5920 + }, + { + "epoch": 2.7995271867612295, + "grad_norm": 3.026428461074829, + "learning_rate": 2.795005029364946e-06, + "loss": 0.4721, + "step": 5921 + }, + { + "epoch": 2.8, + "grad_norm": 2.886295795440674, + "learning_rate": 2.794385551667546e-06, + "loss": 0.456, + "step": 5922 + }, + { + "epoch": 2.8004728132387706, + "grad_norm": 3.2260656356811523, + "learning_rate": 2.7937660556400486e-06, + "loss": 0.4499, + "step": 5923 + }, + { + "epoch": 2.8009456264775414, + "grad_norm": 2.7971982955932617, + "learning_rate": 2.793146541321027e-06, + "loss": 0.3982, + "step": 5924 + }, + { + "epoch": 2.801418439716312, + "grad_norm": 2.85461163520813, + "learning_rate": 2.7925270087490546e-06, + "loss": 0.4841, + "step": 5925 + }, + { + "epoch": 2.801891252955083, + "grad_norm": 3.0642316341400146, + "learning_rate": 2.7919074579627086e-06, + "loss": 0.4538, + "step": 5926 + }, + { + "epoch": 2.8023640661938534, + "grad_norm": 2.9053616523742676, + "learning_rate": 2.7912878890005657e-06, + "loss": 0.434, + "step": 5927 + }, + { + "epoch": 2.802836879432624, + "grad_norm": 2.7649240493774414, + "learning_rate": 2.7906683019012027e-06, + "loss": 0.414, + "step": 5928 + }, + { + "epoch": 2.803309692671395, + "grad_norm": 2.8717660903930664, + "learning_rate": 2.7900486967031987e-06, + "loss": 0.4337, + "step": 5929 + }, + { + "epoch": 2.8037825059101653, + "grad_norm": 2.6860995292663574, + "learning_rate": 2.789429073445135e-06, + "loss": 0.447, + "step": 5930 + }, + { + "epoch": 2.804255319148936, + "grad_norm": 2.67509126663208, + "learning_rate": 2.7888094321655918e-06, + "loss": 0.4955, + "step": 5931 + }, + { + "epoch": 2.804728132387707, + "grad_norm": 2.7426326274871826, + "learning_rate": 2.7881897729031514e-06, + "loss": 0.4564, + "step": 5932 + }, + { + "epoch": 2.8052009456264777, + "grad_norm": 2.7087252140045166, + "learning_rate": 2.7875700956963973e-06, + "loss": 0.4571, + "step": 5933 + }, + { + "epoch": 2.8056737588652485, + "grad_norm": 2.513526439666748, + "learning_rate": 2.7869504005839147e-06, + "loss": 0.4361, + "step": 5934 + }, + { + "epoch": 2.806146572104019, + "grad_norm": 3.2246084213256836, + "learning_rate": 2.7863306876042885e-06, + "loss": 0.4612, + "step": 5935 + }, + { + "epoch": 2.8066193853427897, + "grad_norm": 3.226325511932373, + "learning_rate": 2.7857109567961066e-06, + "loss": 0.4528, + "step": 5936 + }, + { + "epoch": 2.8070921985815604, + "grad_norm": 2.8861422538757324, + "learning_rate": 2.785091208197956e-06, + "loss": 0.5049, + "step": 5937 + }, + { + "epoch": 2.807565011820331, + "grad_norm": 2.76279616355896, + "learning_rate": 2.7844714418484257e-06, + "loss": 0.4714, + "step": 5938 + }, + { + "epoch": 2.8080378250591016, + "grad_norm": 2.9591920375823975, + "learning_rate": 2.7838516577861063e-06, + "loss": 0.4633, + "step": 5939 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 2.536916971206665, + "learning_rate": 2.7832318560495885e-06, + "loss": 0.4108, + "step": 5940 + }, + { + "epoch": 2.808983451536643, + "grad_norm": 
3.2484991550445557, + "learning_rate": 2.7826120366774657e-06, + "loss": 0.4888, + "step": 5941 + }, + { + "epoch": 2.8094562647754135, + "grad_norm": 2.7129359245300293, + "learning_rate": 2.781992199708329e-06, + "loss": 0.4008, + "step": 5942 + }, + { + "epoch": 2.8099290780141843, + "grad_norm": 2.4176113605499268, + "learning_rate": 2.781372345180776e-06, + "loss": 0.3864, + "step": 5943 + }, + { + "epoch": 2.810401891252955, + "grad_norm": 2.6557252407073975, + "learning_rate": 2.7807524731334e-06, + "loss": 0.4295, + "step": 5944 + }, + { + "epoch": 2.8108747044917255, + "grad_norm": 2.9191324710845947, + "learning_rate": 2.7801325836047993e-06, + "loss": 0.4854, + "step": 5945 + }, + { + "epoch": 2.8113475177304963, + "grad_norm": 2.6325371265411377, + "learning_rate": 2.7795126766335705e-06, + "loss": 0.4332, + "step": 5946 + }, + { + "epoch": 2.811820330969267, + "grad_norm": 2.658337116241455, + "learning_rate": 2.778892752258314e-06, + "loss": 0.4276, + "step": 5947 + }, + { + "epoch": 2.812293144208038, + "grad_norm": 2.763782262802124, + "learning_rate": 2.778272810517627e-06, + "loss": 0.4246, + "step": 5948 + }, + { + "epoch": 2.8127659574468087, + "grad_norm": 2.407607078552246, + "learning_rate": 2.777652851450113e-06, + "loss": 0.3788, + "step": 5949 + }, + { + "epoch": 2.813238770685579, + "grad_norm": 3.0339951515197754, + "learning_rate": 2.7770328750943736e-06, + "loss": 0.477, + "step": 5950 + }, + { + "epoch": 2.81371158392435, + "grad_norm": 2.3475773334503174, + "learning_rate": 2.776412881489012e-06, + "loss": 0.4206, + "step": 5951 + }, + { + "epoch": 2.8141843971631206, + "grad_norm": 3.0455260276794434, + "learning_rate": 2.7757928706726318e-06, + "loss": 0.4301, + "step": 5952 + }, + { + "epoch": 2.814657210401891, + "grad_norm": 2.803920030593872, + "learning_rate": 2.7751728426838386e-06, + "loss": 0.3738, + "step": 5953 + }, + { + "epoch": 2.815130023640662, + "grad_norm": 3.1083319187164307, + "learning_rate": 2.77455279756124e-06, + "loss": 0.5365, + "step": 5954 + }, + { + "epoch": 2.8156028368794326, + "grad_norm": 3.180809497833252, + "learning_rate": 2.7739327353434427e-06, + "loss": 0.4789, + "step": 5955 + }, + { + "epoch": 2.8160756501182034, + "grad_norm": 2.975043773651123, + "learning_rate": 2.7733126560690543e-06, + "loss": 0.4798, + "step": 5956 + }, + { + "epoch": 2.816548463356974, + "grad_norm": 2.765475034713745, + "learning_rate": 2.772692559776685e-06, + "loss": 0.4206, + "step": 5957 + }, + { + "epoch": 2.8170212765957445, + "grad_norm": 2.48612380027771, + "learning_rate": 2.7720724465049463e-06, + "loss": 0.4234, + "step": 5958 + }, + { + "epoch": 2.8174940898345153, + "grad_norm": 2.7145729064941406, + "learning_rate": 2.77145231629245e-06, + "loss": 0.4713, + "step": 5959 + }, + { + "epoch": 2.817966903073286, + "grad_norm": 2.5993762016296387, + "learning_rate": 2.7708321691778074e-06, + "loss": 0.4144, + "step": 5960 + }, + { + "epoch": 2.8184397163120565, + "grad_norm": 3.0902538299560547, + "learning_rate": 2.770212005199633e-06, + "loss": 0.4822, + "step": 5961 + }, + { + "epoch": 2.8189125295508273, + "grad_norm": 2.849757671356201, + "learning_rate": 2.7695918243965424e-06, + "loss": 0.4449, + "step": 5962 + }, + { + "epoch": 2.819385342789598, + "grad_norm": 2.77148699760437, + "learning_rate": 2.768971626807151e-06, + "loss": 0.4448, + "step": 5963 + }, + { + "epoch": 2.819858156028369, + "grad_norm": 2.7865898609161377, + "learning_rate": 2.7683514124700757e-06, + "loss": 0.4944, + "step": 5964 + }, + { + "epoch": 
2.8203309692671397, + "grad_norm": 2.9057955741882324, + "learning_rate": 2.767731181423934e-06, + "loss": 0.5074, + "step": 5965 + }, + { + "epoch": 2.82080378250591, + "grad_norm": 2.725837469100952, + "learning_rate": 2.7671109337073465e-06, + "loss": 0.4207, + "step": 5966 + }, + { + "epoch": 2.821276595744681, + "grad_norm": 3.078531265258789, + "learning_rate": 2.7664906693589315e-06, + "loss": 0.4835, + "step": 5967 + }, + { + "epoch": 2.8217494089834516, + "grad_norm": 2.8692002296447754, + "learning_rate": 2.765870388417312e-06, + "loss": 0.4284, + "step": 5968 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 2.8519723415374756, + "learning_rate": 2.765250090921109e-06, + "loss": 0.541, + "step": 5969 + }, + { + "epoch": 2.8226950354609928, + "grad_norm": 3.2037532329559326, + "learning_rate": 2.7646297769089457e-06, + "loss": 0.4276, + "step": 5970 + }, + { + "epoch": 2.8231678486997636, + "grad_norm": 2.8637137413024902, + "learning_rate": 2.7640094464194468e-06, + "loss": 0.4904, + "step": 5971 + }, + { + "epoch": 2.8236406619385344, + "grad_norm": 2.681516408920288, + "learning_rate": 2.7633890994912372e-06, + "loss": 0.4942, + "step": 5972 + }, + { + "epoch": 2.824113475177305, + "grad_norm": 3.0035219192504883, + "learning_rate": 2.7627687361629434e-06, + "loss": 0.4556, + "step": 5973 + }, + { + "epoch": 2.8245862884160755, + "grad_norm": 2.8107759952545166, + "learning_rate": 2.7621483564731923e-06, + "loss": 0.4225, + "step": 5974 + }, + { + "epoch": 2.8250591016548463, + "grad_norm": 2.87276029586792, + "learning_rate": 2.7615279604606126e-06, + "loss": 0.5045, + "step": 5975 + }, + { + "epoch": 2.825531914893617, + "grad_norm": 2.687953233718872, + "learning_rate": 2.760907548163833e-06, + "loss": 0.4018, + "step": 5976 + }, + { + "epoch": 2.8260047281323875, + "grad_norm": 2.587979555130005, + "learning_rate": 2.760287119621486e-06, + "loss": 0.4407, + "step": 5977 + }, + { + "epoch": 2.8264775413711583, + "grad_norm": 2.805602550506592, + "learning_rate": 2.7596666748722e-06, + "loss": 0.4559, + "step": 5978 + }, + { + "epoch": 2.826950354609929, + "grad_norm": 2.320763111114502, + "learning_rate": 2.759046213954609e-06, + "loss": 0.3847, + "step": 5979 + }, + { + "epoch": 2.8274231678487, + "grad_norm": 2.6876401901245117, + "learning_rate": 2.758425736907347e-06, + "loss": 0.4528, + "step": 5980 + }, + { + "epoch": 2.8278959810874706, + "grad_norm": 2.6852915287017822, + "learning_rate": 2.757805243769046e-06, + "loss": 0.395, + "step": 5981 + }, + { + "epoch": 2.828368794326241, + "grad_norm": 2.808326005935669, + "learning_rate": 2.7571847345783447e-06, + "loss": 0.4647, + "step": 5982 + }, + { + "epoch": 2.828841607565012, + "grad_norm": 2.641479015350342, + "learning_rate": 2.7565642093738766e-06, + "loss": 0.3798, + "step": 5983 + }, + { + "epoch": 2.8293144208037826, + "grad_norm": 2.8066110610961914, + "learning_rate": 2.7559436681942803e-06, + "loss": 0.5072, + "step": 5984 + }, + { + "epoch": 2.829787234042553, + "grad_norm": 2.898375988006592, + "learning_rate": 2.7553231110781936e-06, + "loss": 0.5182, + "step": 5985 + }, + { + "epoch": 2.8302600472813237, + "grad_norm": 2.704890489578247, + "learning_rate": 2.7547025380642574e-06, + "loss": 0.3999, + "step": 5986 + }, + { + "epoch": 2.8307328605200945, + "grad_norm": 2.6024270057678223, + "learning_rate": 2.7540819491911106e-06, + "loss": 0.4302, + "step": 5987 + }, + { + "epoch": 2.8312056737588653, + "grad_norm": 2.8006081581115723, + "learning_rate": 2.7534613444973946e-06, + "loss": 0.4492, + 
"step": 5988 + }, + { + "epoch": 2.831678486997636, + "grad_norm": 2.9532058238983154, + "learning_rate": 2.752840724021752e-06, + "loss": 0.4552, + "step": 5989 + }, + { + "epoch": 2.8321513002364065, + "grad_norm": 3.1830217838287354, + "learning_rate": 2.7522200878028265e-06, + "loss": 0.5013, + "step": 5990 + }, + { + "epoch": 2.8326241134751773, + "grad_norm": 2.716176748275757, + "learning_rate": 2.7515994358792624e-06, + "loss": 0.4569, + "step": 5991 + }, + { + "epoch": 2.833096926713948, + "grad_norm": 2.6852715015411377, + "learning_rate": 2.7509787682897044e-06, + "loss": 0.4764, + "step": 5992 + }, + { + "epoch": 2.8335697399527184, + "grad_norm": 2.9383316040039062, + "learning_rate": 2.7503580850727985e-06, + "loss": 0.5205, + "step": 5993 + }, + { + "epoch": 2.8340425531914892, + "grad_norm": 2.703132152557373, + "learning_rate": 2.749737386267193e-06, + "loss": 0.4543, + "step": 5994 + }, + { + "epoch": 2.83451536643026, + "grad_norm": 2.4304885864257812, + "learning_rate": 2.7491166719115354e-06, + "loss": 0.4479, + "step": 5995 + }, + { + "epoch": 2.834988179669031, + "grad_norm": 2.975722551345825, + "learning_rate": 2.748495942044475e-06, + "loss": 0.4074, + "step": 5996 + }, + { + "epoch": 2.8354609929078016, + "grad_norm": 3.440208911895752, + "learning_rate": 2.7478751967046617e-06, + "loss": 0.4497, + "step": 5997 + }, + { + "epoch": 2.835933806146572, + "grad_norm": 2.734673261642456, + "learning_rate": 2.747254435930747e-06, + "loss": 0.437, + "step": 5998 + }, + { + "epoch": 2.8364066193853428, + "grad_norm": 3.1918959617614746, + "learning_rate": 2.7466336597613826e-06, + "loss": 0.4197, + "step": 5999 + }, + { + "epoch": 2.8368794326241136, + "grad_norm": 3.1440329551696777, + "learning_rate": 2.7460128682352216e-06, + "loss": 0.4425, + "step": 6000 + }, + { + "epoch": 2.837352245862884, + "grad_norm": 2.582993507385254, + "learning_rate": 2.7453920613909183e-06, + "loss": 0.4475, + "step": 6001 + }, + { + "epoch": 2.8378250591016547, + "grad_norm": 3.2682149410247803, + "learning_rate": 2.744771239267128e-06, + "loss": 0.4615, + "step": 6002 + }, + { + "epoch": 2.8382978723404255, + "grad_norm": 2.848477840423584, + "learning_rate": 2.7441504019025046e-06, + "loss": 0.4093, + "step": 6003 + }, + { + "epoch": 2.8387706855791963, + "grad_norm": 2.3582282066345215, + "learning_rate": 2.7435295493357067e-06, + "loss": 0.3911, + "step": 6004 + }, + { + "epoch": 2.839243498817967, + "grad_norm": 2.7707207202911377, + "learning_rate": 2.742908681605392e-06, + "loss": 0.4069, + "step": 6005 + }, + { + "epoch": 2.8397163120567375, + "grad_norm": 3.0763752460479736, + "learning_rate": 2.7422877987502183e-06, + "loss": 0.512, + "step": 6006 + }, + { + "epoch": 2.8401891252955083, + "grad_norm": 2.8027124404907227, + "learning_rate": 2.741666900808846e-06, + "loss": 0.4922, + "step": 6007 + }, + { + "epoch": 2.840661938534279, + "grad_norm": 2.487982988357544, + "learning_rate": 2.7410459878199353e-06, + "loss": 0.4368, + "step": 6008 + }, + { + "epoch": 2.8411347517730494, + "grad_norm": 2.8727993965148926, + "learning_rate": 2.7404250598221484e-06, + "loss": 0.4639, + "step": 6009 + }, + { + "epoch": 2.84160756501182, + "grad_norm": 2.5556678771972656, + "learning_rate": 2.739804116854147e-06, + "loss": 0.4217, + "step": 6010 + }, + { + "epoch": 2.842080378250591, + "grad_norm": 2.6306912899017334, + "learning_rate": 2.7391831589545948e-06, + "loss": 0.4816, + "step": 6011 + }, + { + "epoch": 2.842553191489362, + "grad_norm": 2.7340946197509766, + "learning_rate": 
2.7385621861621557e-06, + "loss": 0.4113, + "step": 6012 + }, + { + "epoch": 2.8430260047281326, + "grad_norm": 2.834190607070923, + "learning_rate": 2.737941198515495e-06, + "loss": 0.4691, + "step": 6013 + }, + { + "epoch": 2.843498817966903, + "grad_norm": 2.7139697074890137, + "learning_rate": 2.737320196053281e-06, + "loss": 0.3798, + "step": 6014 + }, + { + "epoch": 2.8439716312056738, + "grad_norm": 2.7934985160827637, + "learning_rate": 2.736699178814177e-06, + "loss": 0.446, + "step": 6015 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 2.6941518783569336, + "learning_rate": 2.7360781468368534e-06, + "loss": 0.4787, + "step": 6016 + }, + { + "epoch": 2.844917257683215, + "grad_norm": 3.1530468463897705, + "learning_rate": 2.7354571001599792e-06, + "loss": 0.474, + "step": 6017 + }, + { + "epoch": 2.8453900709219857, + "grad_norm": 2.613875389099121, + "learning_rate": 2.7348360388222243e-06, + "loss": 0.4297, + "step": 6018 + }, + { + "epoch": 2.8458628841607565, + "grad_norm": 2.5481486320495605, + "learning_rate": 2.7342149628622587e-06, + "loss": 0.3762, + "step": 6019 + }, + { + "epoch": 2.8463356973995273, + "grad_norm": 2.6425609588623047, + "learning_rate": 2.7335938723187544e-06, + "loss": 0.4077, + "step": 6020 + }, + { + "epoch": 2.846808510638298, + "grad_norm": 2.6281731128692627, + "learning_rate": 2.7329727672303836e-06, + "loss": 0.466, + "step": 6021 + }, + { + "epoch": 2.8472813238770684, + "grad_norm": 2.8862180709838867, + "learning_rate": 2.7323516476358197e-06, + "loss": 0.4191, + "step": 6022 + }, + { + "epoch": 2.8477541371158392, + "grad_norm": 2.907731533050537, + "learning_rate": 2.7317305135737383e-06, + "loss": 0.4867, + "step": 6023 + }, + { + "epoch": 2.84822695035461, + "grad_norm": 2.825593948364258, + "learning_rate": 2.731109365082814e-06, + "loss": 0.4888, + "step": 6024 + }, + { + "epoch": 2.8486997635933804, + "grad_norm": 2.478163003921509, + "learning_rate": 2.730488202201722e-06, + "loss": 0.4714, + "step": 6025 + }, + { + "epoch": 2.849172576832151, + "grad_norm": 2.928899049758911, + "learning_rate": 2.7298670249691418e-06, + "loss": 0.4671, + "step": 6026 + }, + { + "epoch": 2.849645390070922, + "grad_norm": 2.778256893157959, + "learning_rate": 2.7292458334237488e-06, + "loss": 0.429, + "step": 6027 + }, + { + "epoch": 2.850118203309693, + "grad_norm": 3.0689055919647217, + "learning_rate": 2.7286246276042234e-06, + "loss": 0.4727, + "step": 6028 + }, + { + "epoch": 2.8505910165484636, + "grad_norm": 2.582066774368286, + "learning_rate": 2.7280034075492447e-06, + "loss": 0.4025, + "step": 6029 + }, + { + "epoch": 2.851063829787234, + "grad_norm": 3.6679015159606934, + "learning_rate": 2.7273821732974936e-06, + "loss": 0.4856, + "step": 6030 + }, + { + "epoch": 2.8515366430260047, + "grad_norm": 2.7222588062286377, + "learning_rate": 2.7267609248876516e-06, + "loss": 0.4255, + "step": 6031 + }, + { + "epoch": 2.8520094562647755, + "grad_norm": 2.455038547515869, + "learning_rate": 2.726139662358401e-06, + "loss": 0.4234, + "step": 6032 + }, + { + "epoch": 2.852482269503546, + "grad_norm": 2.8277318477630615, + "learning_rate": 2.7255183857484253e-06, + "loss": 0.4146, + "step": 6033 + }, + { + "epoch": 2.8529550827423167, + "grad_norm": 2.523615837097168, + "learning_rate": 2.724897095096409e-06, + "loss": 0.4227, + "step": 6034 + }, + { + "epoch": 2.8534278959810875, + "grad_norm": 3.353646755218506, + "learning_rate": 2.724275790441036e-06, + "loss": 0.5041, + "step": 6035 + }, + { + "epoch": 2.8539007092198583, + "grad_norm": 
2.753981828689575, + "learning_rate": 2.7236544718209934e-06, + "loss": 0.4646, + "step": 6036 + }, + { + "epoch": 2.854373522458629, + "grad_norm": 2.954744577407837, + "learning_rate": 2.723033139274967e-06, + "loss": 0.5182, + "step": 6037 + }, + { + "epoch": 2.8548463356973994, + "grad_norm": 2.4814131259918213, + "learning_rate": 2.7224117928416462e-06, + "loss": 0.4626, + "step": 6038 + }, + { + "epoch": 2.8553191489361702, + "grad_norm": 2.7414886951446533, + "learning_rate": 2.721790432559717e-06, + "loss": 0.4111, + "step": 6039 + }, + { + "epoch": 2.855791962174941, + "grad_norm": 2.8743896484375, + "learning_rate": 2.7211690584678706e-06, + "loss": 0.4986, + "step": 6040 + }, + { + "epoch": 2.8562647754137114, + "grad_norm": 3.0691921710968018, + "learning_rate": 2.720547670604797e-06, + "loss": 0.4743, + "step": 6041 + }, + { + "epoch": 2.856737588652482, + "grad_norm": 2.7273411750793457, + "learning_rate": 2.7199262690091872e-06, + "loss": 0.4403, + "step": 6042 + }, + { + "epoch": 2.857210401891253, + "grad_norm": 2.8022944927215576, + "learning_rate": 2.7193048537197325e-06, + "loss": 0.4413, + "step": 6043 + }, + { + "epoch": 2.8576832151300238, + "grad_norm": 2.4883248805999756, + "learning_rate": 2.718683424775126e-06, + "loss": 0.4485, + "step": 6044 + }, + { + "epoch": 2.8581560283687946, + "grad_norm": 2.457249879837036, + "learning_rate": 2.718061982214062e-06, + "loss": 0.4167, + "step": 6045 + }, + { + "epoch": 2.858628841607565, + "grad_norm": 2.7210328578948975, + "learning_rate": 2.717440526075234e-06, + "loss": 0.4419, + "step": 6046 + }, + { + "epoch": 2.8591016548463357, + "grad_norm": 2.684483766555786, + "learning_rate": 2.7168190563973386e-06, + "loss": 0.4449, + "step": 6047 + }, + { + "epoch": 2.8595744680851065, + "grad_norm": 2.5305230617523193, + "learning_rate": 2.7161975732190706e-06, + "loss": 0.3829, + "step": 6048 + }, + { + "epoch": 2.860047281323877, + "grad_norm": 3.0284602642059326, + "learning_rate": 2.7155760765791278e-06, + "loss": 0.5164, + "step": 6049 + }, + { + "epoch": 2.8605200945626477, + "grad_norm": 3.154599189758301, + "learning_rate": 2.7149545665162085e-06, + "loss": 0.527, + "step": 6050 + }, + { + "epoch": 2.8609929078014185, + "grad_norm": 2.6798126697540283, + "learning_rate": 2.7143330430690113e-06, + "loss": 0.4379, + "step": 6051 + }, + { + "epoch": 2.8614657210401893, + "grad_norm": 2.9531302452087402, + "learning_rate": 2.7137115062762344e-06, + "loss": 0.4549, + "step": 6052 + }, + { + "epoch": 2.86193853427896, + "grad_norm": 2.779531240463257, + "learning_rate": 2.7130899561765787e-06, + "loss": 0.4037, + "step": 6053 + }, + { + "epoch": 2.8624113475177304, + "grad_norm": 2.786763906478882, + "learning_rate": 2.7124683928087466e-06, + "loss": 0.3986, + "step": 6054 + }, + { + "epoch": 2.862884160756501, + "grad_norm": 2.430415630340576, + "learning_rate": 2.7118468162114385e-06, + "loss": 0.4402, + "step": 6055 + }, + { + "epoch": 2.863356973995272, + "grad_norm": 3.027268409729004, + "learning_rate": 2.7112252264233596e-06, + "loss": 0.4737, + "step": 6056 + }, + { + "epoch": 2.8638297872340424, + "grad_norm": 3.024935483932495, + "learning_rate": 2.710603623483211e-06, + "loss": 0.3997, + "step": 6057 + }, + { + "epoch": 2.864302600472813, + "grad_norm": 2.8862195014953613, + "learning_rate": 2.7099820074296985e-06, + "loss": 0.4896, + "step": 6058 + }, + { + "epoch": 2.864775413711584, + "grad_norm": 2.595579147338867, + "learning_rate": 2.709360378301527e-06, + "loss": 0.4387, + "step": 6059 + }, + { + "epoch": 
2.8652482269503547, + "grad_norm": 2.8046188354492188, + "learning_rate": 2.708738736137403e-06, + "loss": 0.4726, + "step": 6060 + }, + { + "epoch": 2.8657210401891255, + "grad_norm": 3.040304660797119, + "learning_rate": 2.708117080976033e-06, + "loss": 0.4642, + "step": 6061 + }, + { + "epoch": 2.866193853427896, + "grad_norm": 2.618128538131714, + "learning_rate": 2.7074954128561248e-06, + "loss": 0.3171, + "step": 6062 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 2.7966055870056152, + "learning_rate": 2.706873731816387e-06, + "loss": 0.4893, + "step": 6063 + }, + { + "epoch": 2.8671394799054375, + "grad_norm": 2.9198038578033447, + "learning_rate": 2.706252037895529e-06, + "loss": 0.4428, + "step": 6064 + }, + { + "epoch": 2.867612293144208, + "grad_norm": 2.417705774307251, + "learning_rate": 2.7056303311322617e-06, + "loss": 0.3704, + "step": 6065 + }, + { + "epoch": 2.8680851063829786, + "grad_norm": 3.143918752670288, + "learning_rate": 2.7050086115652953e-06, + "loss": 0.5247, + "step": 6066 + }, + { + "epoch": 2.8685579196217494, + "grad_norm": 2.620781183242798, + "learning_rate": 2.704386879233341e-06, + "loss": 0.4131, + "step": 6067 + }, + { + "epoch": 2.8690307328605202, + "grad_norm": 2.6929845809936523, + "learning_rate": 2.703765134175112e-06, + "loss": 0.4833, + "step": 6068 + }, + { + "epoch": 2.869503546099291, + "grad_norm": 2.695920944213867, + "learning_rate": 2.7031433764293214e-06, + "loss": 0.435, + "step": 6069 + }, + { + "epoch": 2.8699763593380614, + "grad_norm": 2.6184475421905518, + "learning_rate": 2.702521606034684e-06, + "loss": 0.3898, + "step": 6070 + }, + { + "epoch": 2.870449172576832, + "grad_norm": 3.130624532699585, + "learning_rate": 2.7018998230299136e-06, + "loss": 0.4934, + "step": 6071 + }, + { + "epoch": 2.870921985815603, + "grad_norm": 2.947936534881592, + "learning_rate": 2.701278027453727e-06, + "loss": 0.4167, + "step": 6072 + }, + { + "epoch": 2.8713947990543733, + "grad_norm": 2.389263391494751, + "learning_rate": 2.7006562193448406e-06, + "loss": 0.3854, + "step": 6073 + }, + { + "epoch": 2.871867612293144, + "grad_norm": 2.9040684700012207, + "learning_rate": 2.700034398741971e-06, + "loss": 0.4656, + "step": 6074 + }, + { + "epoch": 2.872340425531915, + "grad_norm": 2.8671910762786865, + "learning_rate": 2.6994125656838365e-06, + "loss": 0.4642, + "step": 6075 + }, + { + "epoch": 2.8728132387706857, + "grad_norm": 2.6957180500030518, + "learning_rate": 2.698790720209156e-06, + "loss": 0.4894, + "step": 6076 + }, + { + "epoch": 2.8732860520094565, + "grad_norm": 2.748342514038086, + "learning_rate": 2.698168862356648e-06, + "loss": 0.4552, + "step": 6077 + }, + { + "epoch": 2.873758865248227, + "grad_norm": 2.7459912300109863, + "learning_rate": 2.6975469921650344e-06, + "loss": 0.4244, + "step": 6078 + }, + { + "epoch": 2.8742316784869977, + "grad_norm": 2.515650987625122, + "learning_rate": 2.6969251096730366e-06, + "loss": 0.4178, + "step": 6079 + }, + { + "epoch": 2.8747044917257685, + "grad_norm": 2.747373342514038, + "learning_rate": 2.696303214919375e-06, + "loss": 0.4623, + "step": 6080 + }, + { + "epoch": 2.875177304964539, + "grad_norm": 2.72092604637146, + "learning_rate": 2.695681307942773e-06, + "loss": 0.4227, + "step": 6081 + }, + { + "epoch": 2.8756501182033096, + "grad_norm": 2.6925108432769775, + "learning_rate": 2.695059388781955e-06, + "loss": 0.3807, + "step": 6082 + }, + { + "epoch": 2.8761229314420804, + "grad_norm": 2.673546314239502, + "learning_rate": 2.6944374574756427e-06, + "loss": 0.424, + 
"step": 6083 + }, + { + "epoch": 2.876595744680851, + "grad_norm": 2.7018187046051025, + "learning_rate": 2.6938155140625636e-06, + "loss": 0.4367, + "step": 6084 + }, + { + "epoch": 2.877068557919622, + "grad_norm": 2.9420957565307617, + "learning_rate": 2.6931935585814416e-06, + "loss": 0.4223, + "step": 6085 + }, + { + "epoch": 2.8775413711583924, + "grad_norm": 2.6523385047912598, + "learning_rate": 2.6925715910710036e-06, + "loss": 0.4074, + "step": 6086 + }, + { + "epoch": 2.878014184397163, + "grad_norm": 2.6104063987731934, + "learning_rate": 2.691949611569978e-06, + "loss": 0.423, + "step": 6087 + }, + { + "epoch": 2.878486997635934, + "grad_norm": 2.6463685035705566, + "learning_rate": 2.691327620117091e-06, + "loss": 0.4354, + "step": 6088 + }, + { + "epoch": 2.8789598108747043, + "grad_norm": 2.5863583087921143, + "learning_rate": 2.6907056167510725e-06, + "loss": 0.4177, + "step": 6089 + }, + { + "epoch": 2.879432624113475, + "grad_norm": 2.6946942806243896, + "learning_rate": 2.690083601510651e-06, + "loss": 0.4176, + "step": 6090 + }, + { + "epoch": 2.879905437352246, + "grad_norm": 3.0649454593658447, + "learning_rate": 2.6894615744345575e-06, + "loss": 0.4827, + "step": 6091 + }, + { + "epoch": 2.8803782505910167, + "grad_norm": 2.6454906463623047, + "learning_rate": 2.6888395355615226e-06, + "loss": 0.4757, + "step": 6092 + }, + { + "epoch": 2.8808510638297875, + "grad_norm": 3.251805067062378, + "learning_rate": 2.688217484930278e-06, + "loss": 0.5651, + "step": 6093 + }, + { + "epoch": 2.881323877068558, + "grad_norm": 2.543999433517456, + "learning_rate": 2.687595422579555e-06, + "loss": 0.4196, + "step": 6094 + }, + { + "epoch": 2.8817966903073287, + "grad_norm": 3.1502909660339355, + "learning_rate": 2.686973348548088e-06, + "loss": 0.4376, + "step": 6095 + }, + { + "epoch": 2.8822695035460995, + "grad_norm": 2.7800376415252686, + "learning_rate": 2.686351262874611e-06, + "loss": 0.444, + "step": 6096 + }, + { + "epoch": 2.88274231678487, + "grad_norm": 3.1529603004455566, + "learning_rate": 2.685729165597858e-06, + "loss": 0.5137, + "step": 6097 + }, + { + "epoch": 2.8832151300236406, + "grad_norm": 2.6079602241516113, + "learning_rate": 2.685107056756564e-06, + "loss": 0.4213, + "step": 6098 + }, + { + "epoch": 2.8836879432624114, + "grad_norm": 2.8969249725341797, + "learning_rate": 2.6844849363894648e-06, + "loss": 0.4679, + "step": 6099 + }, + { + "epoch": 2.884160756501182, + "grad_norm": 2.5882437229156494, + "learning_rate": 2.6838628045352977e-06, + "loss": 0.3891, + "step": 6100 + }, + { + "epoch": 2.8846335697399526, + "grad_norm": 2.9458062648773193, + "learning_rate": 2.6832406612328007e-06, + "loss": 0.4802, + "step": 6101 + }, + { + "epoch": 2.8851063829787233, + "grad_norm": 2.8463058471679688, + "learning_rate": 2.6826185065207105e-06, + "loss": 0.4332, + "step": 6102 + }, + { + "epoch": 2.885579196217494, + "grad_norm": 2.8799285888671875, + "learning_rate": 2.6819963404377667e-06, + "loss": 0.4474, + "step": 6103 + }, + { + "epoch": 2.8860520094562645, + "grad_norm": 2.846860408782959, + "learning_rate": 2.681374163022709e-06, + "loss": 0.4317, + "step": 6104 + }, + { + "epoch": 2.8865248226950353, + "grad_norm": 2.7918877601623535, + "learning_rate": 2.6807519743142775e-06, + "loss": 0.4243, + "step": 6105 + }, + { + "epoch": 2.886997635933806, + "grad_norm": 2.9351487159729004, + "learning_rate": 2.6801297743512127e-06, + "loss": 0.5253, + "step": 6106 + }, + { + "epoch": 2.887470449172577, + "grad_norm": 2.9422426223754883, + "learning_rate": 
2.6795075631722576e-06, + "loss": 0.4887, + "step": 6107 + }, + { + "epoch": 2.8879432624113477, + "grad_norm": 2.6837220191955566, + "learning_rate": 2.678885340816153e-06, + "loss": 0.4761, + "step": 6108 + }, + { + "epoch": 2.888416075650118, + "grad_norm": 2.6800777912139893, + "learning_rate": 2.6782631073216425e-06, + "loss": 0.4248, + "step": 6109 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 2.9654436111450195, + "learning_rate": 2.6776408627274702e-06, + "loss": 0.487, + "step": 6110 + }, + { + "epoch": 2.8893617021276596, + "grad_norm": 2.7725181579589844, + "learning_rate": 2.6770186070723804e-06, + "loss": 0.4166, + "step": 6111 + }, + { + "epoch": 2.88983451536643, + "grad_norm": 2.6547815799713135, + "learning_rate": 2.676396340395118e-06, + "loss": 0.4039, + "step": 6112 + }, + { + "epoch": 2.890307328605201, + "grad_norm": 2.690997838973999, + "learning_rate": 2.6757740627344292e-06, + "loss": 0.4639, + "step": 6113 + }, + { + "epoch": 2.8907801418439716, + "grad_norm": 2.4693069458007812, + "learning_rate": 2.67515177412906e-06, + "loss": 0.4052, + "step": 6114 + }, + { + "epoch": 2.8912529550827424, + "grad_norm": 2.7137033939361572, + "learning_rate": 2.6745294746177576e-06, + "loss": 0.4442, + "step": 6115 + }, + { + "epoch": 2.891725768321513, + "grad_norm": 3.7417004108428955, + "learning_rate": 2.6739071642392712e-06, + "loss": 0.4809, + "step": 6116 + }, + { + "epoch": 2.8921985815602835, + "grad_norm": 2.707094669342041, + "learning_rate": 2.673284843032347e-06, + "loss": 0.411, + "step": 6117 + }, + { + "epoch": 2.8926713947990543, + "grad_norm": 2.7864158153533936, + "learning_rate": 2.672662511035736e-06, + "loss": 0.4939, + "step": 6118 + }, + { + "epoch": 2.893144208037825, + "grad_norm": 2.8753504753112793, + "learning_rate": 2.672040168288187e-06, + "loss": 0.4396, + "step": 6119 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 2.7581071853637695, + "learning_rate": 2.6714178148284516e-06, + "loss": 0.427, + "step": 6120 + }, + { + "epoch": 2.8940898345153663, + "grad_norm": 2.9754791259765625, + "learning_rate": 2.6707954506952803e-06, + "loss": 0.4255, + "step": 6121 + }, + { + "epoch": 2.894562647754137, + "grad_norm": 2.876939296722412, + "learning_rate": 2.670173075927426e-06, + "loss": 0.4699, + "step": 6122 + }, + { + "epoch": 2.895035460992908, + "grad_norm": 2.4875400066375732, + "learning_rate": 2.6695506905636397e-06, + "loss": 0.3568, + "step": 6123 + }, + { + "epoch": 2.8955082742316787, + "grad_norm": 2.703606128692627, + "learning_rate": 2.668928294642675e-06, + "loss": 0.3646, + "step": 6124 + }, + { + "epoch": 2.895981087470449, + "grad_norm": 2.8618338108062744, + "learning_rate": 2.6683058882032868e-06, + "loss": 0.378, + "step": 6125 + }, + { + "epoch": 2.89645390070922, + "grad_norm": 2.9756760597229004, + "learning_rate": 2.667683471284229e-06, + "loss": 0.4348, + "step": 6126 + }, + { + "epoch": 2.8969267139479906, + "grad_norm": 2.7861104011535645, + "learning_rate": 2.667061043924256e-06, + "loss": 0.4435, + "step": 6127 + }, + { + "epoch": 2.897399527186761, + "grad_norm": 2.7932238578796387, + "learning_rate": 2.6664386061621243e-06, + "loss": 0.4824, + "step": 6128 + }, + { + "epoch": 2.8978723404255318, + "grad_norm": 2.85483455657959, + "learning_rate": 2.6658161580365917e-06, + "loss": 0.4925, + "step": 6129 + }, + { + "epoch": 2.8983451536643026, + "grad_norm": 2.4242141246795654, + "learning_rate": 2.6651936995864136e-06, + "loss": 0.3466, + "step": 6130 + }, + { + "epoch": 2.8988179669030734, + "grad_norm": 
3.385214328765869, + "learning_rate": 2.6645712308503473e-06, + "loss": 0.4751, + "step": 6131 + }, + { + "epoch": 2.899290780141844, + "grad_norm": 2.7109622955322266, + "learning_rate": 2.6639487518671525e-06, + "loss": 0.4469, + "step": 6132 + }, + { + "epoch": 2.8997635933806145, + "grad_norm": 2.6537814140319824, + "learning_rate": 2.6633262626755877e-06, + "loss": 0.4678, + "step": 6133 + }, + { + "epoch": 2.9002364066193853, + "grad_norm": 2.5992231369018555, + "learning_rate": 2.6627037633144124e-06, + "loss": 0.4206, + "step": 6134 + }, + { + "epoch": 2.900709219858156, + "grad_norm": 2.988940954208374, + "learning_rate": 2.6620812538223885e-06, + "loss": 0.4554, + "step": 6135 + }, + { + "epoch": 2.9011820330969265, + "grad_norm": 3.0678138732910156, + "learning_rate": 2.661458734238274e-06, + "loss": 0.4671, + "step": 6136 + }, + { + "epoch": 2.9016548463356973, + "grad_norm": 2.6902482509613037, + "learning_rate": 2.6608362046008335e-06, + "loss": 0.372, + "step": 6137 + }, + { + "epoch": 2.902127659574468, + "grad_norm": 3.031597375869751, + "learning_rate": 2.660213664948827e-06, + "loss": 0.4424, + "step": 6138 + }, + { + "epoch": 2.902600472813239, + "grad_norm": 2.8376755714416504, + "learning_rate": 2.6595911153210187e-06, + "loss": 0.4599, + "step": 6139 + }, + { + "epoch": 2.9030732860520096, + "grad_norm": 3.3164854049682617, + "learning_rate": 2.6589685557561707e-06, + "loss": 0.3897, + "step": 6140 + }, + { + "epoch": 2.90354609929078, + "grad_norm": 2.9535014629364014, + "learning_rate": 2.658345986293048e-06, + "loss": 0.4957, + "step": 6141 + }, + { + "epoch": 2.904018912529551, + "grad_norm": 2.821276903152466, + "learning_rate": 2.657723406970415e-06, + "loss": 0.4453, + "step": 6142 + }, + { + "epoch": 2.9044917257683216, + "grad_norm": 2.7314651012420654, + "learning_rate": 2.657100817827037e-06, + "loss": 0.4406, + "step": 6143 + }, + { + "epoch": 2.904964539007092, + "grad_norm": 2.9509520530700684, + "learning_rate": 2.6564782189016804e-06, + "loss": 0.4629, + "step": 6144 + }, + { + "epoch": 2.9054373522458627, + "grad_norm": 2.6234960556030273, + "learning_rate": 2.655855610233111e-06, + "loss": 0.4306, + "step": 6145 + }, + { + "epoch": 2.9059101654846335, + "grad_norm": 2.7209644317626953, + "learning_rate": 2.6552329918600962e-06, + "loss": 0.3643, + "step": 6146 + }, + { + "epoch": 2.9063829787234043, + "grad_norm": 2.9797747135162354, + "learning_rate": 2.654610363821404e-06, + "loss": 0.4616, + "step": 6147 + }, + { + "epoch": 2.906855791962175, + "grad_norm": 2.8179666996002197, + "learning_rate": 2.6539877261558016e-06, + "loss": 0.4526, + "step": 6148 + }, + { + "epoch": 2.9073286052009455, + "grad_norm": 2.7492244243621826, + "learning_rate": 2.653365078902059e-06, + "loss": 0.4862, + "step": 6149 + }, + { + "epoch": 2.9078014184397163, + "grad_norm": 3.0262451171875, + "learning_rate": 2.6527424220989457e-06, + "loss": 0.3728, + "step": 6150 + }, + { + "epoch": 2.908274231678487, + "grad_norm": 2.8092808723449707, + "learning_rate": 2.6521197557852315e-06, + "loss": 0.4668, + "step": 6151 + }, + { + "epoch": 2.9087470449172574, + "grad_norm": 2.915719985961914, + "learning_rate": 2.651497079999687e-06, + "loss": 0.5124, + "step": 6152 + }, + { + "epoch": 2.9092198581560282, + "grad_norm": 2.9794204235076904, + "learning_rate": 2.6508743947810834e-06, + "loss": 0.5207, + "step": 6153 + }, + { + "epoch": 2.909692671394799, + "grad_norm": 2.882453680038452, + "learning_rate": 2.650251700168193e-06, + "loss": 0.4382, + "step": 6154 + }, + { + 
"epoch": 2.91016548463357, + "grad_norm": 3.183680534362793, + "learning_rate": 2.6496289961997886e-06, + "loss": 0.5134, + "step": 6155 + }, + { + "epoch": 2.9106382978723406, + "grad_norm": 2.9374759197235107, + "learning_rate": 2.649006282914642e-06, + "loss": 0.4748, + "step": 6156 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 2.8096041679382324, + "learning_rate": 2.648383560351527e-06, + "loss": 0.4672, + "step": 6157 + }, + { + "epoch": 2.911583924349882, + "grad_norm": 2.8799238204956055, + "learning_rate": 2.6477608285492196e-06, + "loss": 0.4679, + "step": 6158 + }, + { + "epoch": 2.9120567375886526, + "grad_norm": 2.689310073852539, + "learning_rate": 2.6471380875464923e-06, + "loss": 0.4069, + "step": 6159 + }, + { + "epoch": 2.912529550827423, + "grad_norm": 2.909323215484619, + "learning_rate": 2.6465153373821216e-06, + "loss": 0.4463, + "step": 6160 + }, + { + "epoch": 2.9130023640661937, + "grad_norm": 2.797724962234497, + "learning_rate": 2.6458925780948845e-06, + "loss": 0.4269, + "step": 6161 + }, + { + "epoch": 2.9134751773049645, + "grad_norm": 2.7533204555511475, + "learning_rate": 2.645269809723556e-06, + "loss": 0.453, + "step": 6162 + }, + { + "epoch": 2.9139479905437353, + "grad_norm": 2.6615989208221436, + "learning_rate": 2.6446470323069122e-06, + "loss": 0.3921, + "step": 6163 + }, + { + "epoch": 2.914420803782506, + "grad_norm": 3.0493314266204834, + "learning_rate": 2.644024245883733e-06, + "loss": 0.4779, + "step": 6164 + }, + { + "epoch": 2.9148936170212765, + "grad_norm": 2.649845600128174, + "learning_rate": 2.643401450492795e-06, + "loss": 0.454, + "step": 6165 + }, + { + "epoch": 2.9153664302600473, + "grad_norm": 2.7931838035583496, + "learning_rate": 2.642778646172877e-06, + "loss": 0.504, + "step": 6166 + }, + { + "epoch": 2.915839243498818, + "grad_norm": 2.9518136978149414, + "learning_rate": 2.64215583296276e-06, + "loss": 0.4767, + "step": 6167 + }, + { + "epoch": 2.9163120567375884, + "grad_norm": 2.6047427654266357, + "learning_rate": 2.6415330109012216e-06, + "loss": 0.4316, + "step": 6168 + }, + { + "epoch": 2.916784869976359, + "grad_norm": 2.7732112407684326, + "learning_rate": 2.640910180027044e-06, + "loss": 0.4213, + "step": 6169 + }, + { + "epoch": 2.91725768321513, + "grad_norm": 3.1157236099243164, + "learning_rate": 2.6402873403790068e-06, + "loss": 0.4559, + "step": 6170 + }, + { + "epoch": 2.917730496453901, + "grad_norm": 2.68424129486084, + "learning_rate": 2.6396644919958917e-06, + "loss": 0.3456, + "step": 6171 + }, + { + "epoch": 2.9182033096926716, + "grad_norm": 3.1093270778656006, + "learning_rate": 2.639041634916482e-06, + "loss": 0.4172, + "step": 6172 + }, + { + "epoch": 2.918676122931442, + "grad_norm": 2.9844655990600586, + "learning_rate": 2.6384187691795594e-06, + "loss": 0.4844, + "step": 6173 + }, + { + "epoch": 2.9191489361702128, + "grad_norm": 2.907151222229004, + "learning_rate": 2.637795894823906e-06, + "loss": 0.5126, + "step": 6174 + }, + { + "epoch": 2.9196217494089836, + "grad_norm": 2.804105520248413, + "learning_rate": 2.637173011888307e-06, + "loss": 0.3919, + "step": 6175 + }, + { + "epoch": 2.920094562647754, + "grad_norm": 2.8809266090393066, + "learning_rate": 2.636550120411547e-06, + "loss": 0.4468, + "step": 6176 + }, + { + "epoch": 2.9205673758865247, + "grad_norm": 2.686290979385376, + "learning_rate": 2.6359272204324087e-06, + "loss": 0.4352, + "step": 6177 + }, + { + "epoch": 2.9210401891252955, + "grad_norm": 2.448101758956909, + "learning_rate": 2.635304311989678e-06, + "loss": 
0.4218, + "step": 6178 + }, + { + "epoch": 2.9215130023640663, + "grad_norm": 2.81024169921875, + "learning_rate": 2.6346813951221416e-06, + "loss": 0.5177, + "step": 6179 + }, + { + "epoch": 2.921985815602837, + "grad_norm": 2.7590086460113525, + "learning_rate": 2.6340584698685856e-06, + "loss": 0.3897, + "step": 6180 + }, + { + "epoch": 2.9224586288416075, + "grad_norm": 3.1226227283477783, + "learning_rate": 2.6334355362677965e-06, + "loss": 0.4595, + "step": 6181 + }, + { + "epoch": 2.9229314420803783, + "grad_norm": 2.673828125, + "learning_rate": 2.6328125943585607e-06, + "loss": 0.4932, + "step": 6182 + }, + { + "epoch": 2.923404255319149, + "grad_norm": 2.8297293186187744, + "learning_rate": 2.632189644179668e-06, + "loss": 0.3819, + "step": 6183 + }, + { + "epoch": 2.9238770685579194, + "grad_norm": 2.9661548137664795, + "learning_rate": 2.6315666857699056e-06, + "loss": 0.4419, + "step": 6184 + }, + { + "epoch": 2.92434988179669, + "grad_norm": 2.9745798110961914, + "learning_rate": 2.6309437191680627e-06, + "loss": 0.4423, + "step": 6185 + }, + { + "epoch": 2.924822695035461, + "grad_norm": 2.8351712226867676, + "learning_rate": 2.6303207444129285e-06, + "loss": 0.5043, + "step": 6186 + }, + { + "epoch": 2.925295508274232, + "grad_norm": 2.6442384719848633, + "learning_rate": 2.6296977615432927e-06, + "loss": 0.4431, + "step": 6187 + }, + { + "epoch": 2.9257683215130026, + "grad_norm": 2.4128029346466064, + "learning_rate": 2.6290747705979457e-06, + "loss": 0.3603, + "step": 6188 + }, + { + "epoch": 2.926241134751773, + "grad_norm": 2.730424642562866, + "learning_rate": 2.6284517716156786e-06, + "loss": 0.439, + "step": 6189 + }, + { + "epoch": 2.9267139479905437, + "grad_norm": 2.6215405464172363, + "learning_rate": 2.627828764635284e-06, + "loss": 0.4117, + "step": 6190 + }, + { + "epoch": 2.9271867612293145, + "grad_norm": 2.56585955619812, + "learning_rate": 2.627205749695552e-06, + "loss": 0.4404, + "step": 6191 + }, + { + "epoch": 2.927659574468085, + "grad_norm": 2.9587886333465576, + "learning_rate": 2.6265827268352763e-06, + "loss": 0.4295, + "step": 6192 + }, + { + "epoch": 2.9281323877068557, + "grad_norm": 2.6611828804016113, + "learning_rate": 2.625959696093249e-06, + "loss": 0.4441, + "step": 6193 + }, + { + "epoch": 2.9286052009456265, + "grad_norm": 2.4391369819641113, + "learning_rate": 2.6253366575082634e-06, + "loss": 0.4447, + "step": 6194 + }, + { + "epoch": 2.9290780141843973, + "grad_norm": 2.710763454437256, + "learning_rate": 2.6247136111191144e-06, + "loss": 0.4662, + "step": 6195 + }, + { + "epoch": 2.929550827423168, + "grad_norm": 2.770697593688965, + "learning_rate": 2.6240905569645952e-06, + "loss": 0.4263, + "step": 6196 + }, + { + "epoch": 2.9300236406619384, + "grad_norm": 2.5885732173919678, + "learning_rate": 2.623467495083501e-06, + "loss": 0.4303, + "step": 6197 + }, + { + "epoch": 2.9304964539007092, + "grad_norm": 2.5716748237609863, + "learning_rate": 2.6228444255146274e-06, + "loss": 0.3714, + "step": 6198 + }, + { + "epoch": 2.93096926713948, + "grad_norm": 3.0437910556793213, + "learning_rate": 2.6222213482967703e-06, + "loss": 0.4077, + "step": 6199 + }, + { + "epoch": 2.9314420803782504, + "grad_norm": 2.7861344814300537, + "learning_rate": 2.6215982634687253e-06, + "loss": 0.4157, + "step": 6200 + }, + { + "epoch": 2.931914893617021, + "grad_norm": 2.5265355110168457, + "learning_rate": 2.6209751710692905e-06, + "loss": 0.4586, + "step": 6201 + }, + { + "epoch": 2.932387706855792, + "grad_norm": 2.940112590789795, + 
"learning_rate": 2.6203520711372615e-06, + "loss": 0.4208, + "step": 6202 + }, + { + "epoch": 2.9328605200945628, + "grad_norm": 2.7124581336975098, + "learning_rate": 2.6197289637114363e-06, + "loss": 0.4173, + "step": 6203 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 2.818523406982422, + "learning_rate": 2.619105848830615e-06, + "loss": 0.4349, + "step": 6204 + }, + { + "epoch": 2.933806146572104, + "grad_norm": 2.7630393505096436, + "learning_rate": 2.6184827265335937e-06, + "loss": 0.5078, + "step": 6205 + }, + { + "epoch": 2.9342789598108747, + "grad_norm": 3.0554699897766113, + "learning_rate": 2.6178595968591726e-06, + "loss": 0.4712, + "step": 6206 + }, + { + "epoch": 2.9347517730496455, + "grad_norm": 2.721992254257202, + "learning_rate": 2.6172364598461507e-06, + "loss": 0.4847, + "step": 6207 + }, + { + "epoch": 2.935224586288416, + "grad_norm": 2.809663772583008, + "learning_rate": 2.6166133155333303e-06, + "loss": 0.4447, + "step": 6208 + }, + { + "epoch": 2.9356973995271867, + "grad_norm": 2.568394660949707, + "learning_rate": 2.6159901639595088e-06, + "loss": 0.4543, + "step": 6209 + }, + { + "epoch": 2.9361702127659575, + "grad_norm": 3.3670637607574463, + "learning_rate": 2.6153670051634884e-06, + "loss": 0.4901, + "step": 6210 + }, + { + "epoch": 2.9366430260047283, + "grad_norm": 3.082508087158203, + "learning_rate": 2.614743839184071e-06, + "loss": 0.4862, + "step": 6211 + }, + { + "epoch": 2.937115839243499, + "grad_norm": 2.692139148712158, + "learning_rate": 2.6141206660600566e-06, + "loss": 0.5199, + "step": 6212 + }, + { + "epoch": 2.9375886524822694, + "grad_norm": 3.231433391571045, + "learning_rate": 2.6134974858302504e-06, + "loss": 0.464, + "step": 6213 + }, + { + "epoch": 2.93806146572104, + "grad_norm": 3.224238157272339, + "learning_rate": 2.612874298533452e-06, + "loss": 0.4507, + "step": 6214 + }, + { + "epoch": 2.938534278959811, + "grad_norm": 2.812755584716797, + "learning_rate": 2.6122511042084663e-06, + "loss": 0.4527, + "step": 6215 + }, + { + "epoch": 2.9390070921985814, + "grad_norm": 2.837811231613159, + "learning_rate": 2.611627902894098e-06, + "loss": 0.4782, + "step": 6216 + }, + { + "epoch": 2.939479905437352, + "grad_norm": 3.093817710876465, + "learning_rate": 2.6110046946291476e-06, + "loss": 0.4933, + "step": 6217 + }, + { + "epoch": 2.939952718676123, + "grad_norm": 2.950119733810425, + "learning_rate": 2.6103814794524235e-06, + "loss": 0.4884, + "step": 6218 + }, + { + "epoch": 2.9404255319148938, + "grad_norm": 2.469681978225708, + "learning_rate": 2.6097582574027274e-06, + "loss": 0.4135, + "step": 6219 + }, + { + "epoch": 2.9408983451536646, + "grad_norm": 2.779238224029541, + "learning_rate": 2.609135028518866e-06, + "loss": 0.5165, + "step": 6220 + }, + { + "epoch": 2.941371158392435, + "grad_norm": 2.807705879211426, + "learning_rate": 2.608511792839645e-06, + "loss": 0.4046, + "step": 6221 + }, + { + "epoch": 2.9418439716312057, + "grad_norm": 2.6067750453948975, + "learning_rate": 2.607888550403871e-06, + "loss": 0.406, + "step": 6222 + }, + { + "epoch": 2.9423167848699765, + "grad_norm": 2.865766763687134, + "learning_rate": 2.607265301250349e-06, + "loss": 0.471, + "step": 6223 + }, + { + "epoch": 2.942789598108747, + "grad_norm": 2.977681875228882, + "learning_rate": 2.6066420454178876e-06, + "loss": 0.4666, + "step": 6224 + }, + { + "epoch": 2.9432624113475176, + "grad_norm": 2.870884418487549, + "learning_rate": 2.606018782945294e-06, + "loss": 0.4768, + "step": 6225 + }, + { + "epoch": 2.9437352245862884, + 
"grad_norm": 2.992851495742798, + "learning_rate": 2.6053955138713756e-06, + "loss": 0.4657, + "step": 6226 + }, + { + "epoch": 2.9442080378250592, + "grad_norm": 2.7279815673828125, + "learning_rate": 2.6047722382349406e-06, + "loss": 0.4087, + "step": 6227 + }, + { + "epoch": 2.94468085106383, + "grad_norm": 2.8587028980255127, + "learning_rate": 2.604148956074797e-06, + "loss": 0.4452, + "step": 6228 + }, + { + "epoch": 2.9451536643026004, + "grad_norm": 3.001694679260254, + "learning_rate": 2.6035256674297555e-06, + "loss": 0.4852, + "step": 6229 + }, + { + "epoch": 2.945626477541371, + "grad_norm": 2.858069896697998, + "learning_rate": 2.6029023723386237e-06, + "loss": 0.4281, + "step": 6230 + }, + { + "epoch": 2.946099290780142, + "grad_norm": 2.675856828689575, + "learning_rate": 2.602279070840213e-06, + "loss": 0.4545, + "step": 6231 + }, + { + "epoch": 2.9465721040189123, + "grad_norm": 2.530245065689087, + "learning_rate": 2.6016557629733334e-06, + "loss": 0.4619, + "step": 6232 + }, + { + "epoch": 2.947044917257683, + "grad_norm": 2.7533743381500244, + "learning_rate": 2.601032448776795e-06, + "loss": 0.4879, + "step": 6233 + }, + { + "epoch": 2.947517730496454, + "grad_norm": 3.130453109741211, + "learning_rate": 2.600409128289409e-06, + "loss": 0.4056, + "step": 6234 + }, + { + "epoch": 2.9479905437352247, + "grad_norm": 3.4736509323120117, + "learning_rate": 2.5997858015499867e-06, + "loss": 0.5063, + "step": 6235 + }, + { + "epoch": 2.9484633569739955, + "grad_norm": 2.871978282928467, + "learning_rate": 2.5991624685973406e-06, + "loss": 0.4562, + "step": 6236 + }, + { + "epoch": 2.948936170212766, + "grad_norm": 2.976503372192383, + "learning_rate": 2.5985391294702817e-06, + "loss": 0.5079, + "step": 6237 + }, + { + "epoch": 2.9494089834515367, + "grad_norm": 2.578122615814209, + "learning_rate": 2.597915784207623e-06, + "loss": 0.4069, + "step": 6238 + }, + { + "epoch": 2.9498817966903075, + "grad_norm": 2.885911226272583, + "learning_rate": 2.597292432848178e-06, + "loss": 0.4382, + "step": 6239 + }, + { + "epoch": 2.950354609929078, + "grad_norm": 2.9301681518554688, + "learning_rate": 2.5966690754307605e-06, + "loss": 0.4888, + "step": 6240 + }, + { + "epoch": 2.9508274231678486, + "grad_norm": 2.9912192821502686, + "learning_rate": 2.5960457119941834e-06, + "loss": 0.4699, + "step": 6241 + }, + { + "epoch": 2.9513002364066194, + "grad_norm": 2.6612601280212402, + "learning_rate": 2.5954223425772607e-06, + "loss": 0.3736, + "step": 6242 + }, + { + "epoch": 2.9517730496453902, + "grad_norm": 2.9325380325317383, + "learning_rate": 2.5947989672188067e-06, + "loss": 0.4771, + "step": 6243 + }, + { + "epoch": 2.952245862884161, + "grad_norm": 2.8143959045410156, + "learning_rate": 2.594175585957637e-06, + "loss": 0.5103, + "step": 6244 + }, + { + "epoch": 2.9527186761229314, + "grad_norm": 2.355078935623169, + "learning_rate": 2.5935521988325674e-06, + "loss": 0.44, + "step": 6245 + }, + { + "epoch": 2.953191489361702, + "grad_norm": 2.733156442642212, + "learning_rate": 2.5929288058824114e-06, + "loss": 0.4306, + "step": 6246 + }, + { + "epoch": 2.953664302600473, + "grad_norm": 3.182563304901123, + "learning_rate": 2.5923054071459865e-06, + "loss": 0.417, + "step": 6247 + }, + { + "epoch": 2.9541371158392433, + "grad_norm": 2.4162323474884033, + "learning_rate": 2.5916820026621094e-06, + "loss": 0.3802, + "step": 6248 + }, + { + "epoch": 2.954609929078014, + "grad_norm": 2.772706985473633, + "learning_rate": 2.591058592469595e-06, + "loss": 0.4654, + "step": 6249 + }, + { 
+ "epoch": 2.955082742316785, + "grad_norm": 2.6011102199554443, + "learning_rate": 2.5904351766072616e-06, + "loss": 0.4619, + "step": 6250 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 2.5700361728668213, + "learning_rate": 2.589811755113926e-06, + "loss": 0.3991, + "step": 6251 + }, + { + "epoch": 2.9560283687943265, + "grad_norm": 2.6444971561431885, + "learning_rate": 2.589188328028407e-06, + "loss": 0.4388, + "step": 6252 + }, + { + "epoch": 2.956501182033097, + "grad_norm": 2.739567279815674, + "learning_rate": 2.588564895389521e-06, + "loss": 0.4193, + "step": 6253 + }, + { + "epoch": 2.9569739952718677, + "grad_norm": 2.7070045471191406, + "learning_rate": 2.5879414572360877e-06, + "loss": 0.4347, + "step": 6254 + }, + { + "epoch": 2.9574468085106385, + "grad_norm": 2.7811532020568848, + "learning_rate": 2.587318013606926e-06, + "loss": 0.43, + "step": 6255 + }, + { + "epoch": 2.957919621749409, + "grad_norm": 3.0036091804504395, + "learning_rate": 2.5866945645408537e-06, + "loss": 0.4855, + "step": 6256 + }, + { + "epoch": 2.9583924349881796, + "grad_norm": 2.948573112487793, + "learning_rate": 2.5860711100766918e-06, + "loss": 0.4594, + "step": 6257 + }, + { + "epoch": 2.9588652482269504, + "grad_norm": 2.6371593475341797, + "learning_rate": 2.5854476502532583e-06, + "loss": 0.446, + "step": 6258 + }, + { + "epoch": 2.959338061465721, + "grad_norm": 2.668677806854248, + "learning_rate": 2.5848241851093754e-06, + "loss": 0.3991, + "step": 6259 + }, + { + "epoch": 2.959810874704492, + "grad_norm": 3.1640663146972656, + "learning_rate": 2.5842007146838614e-06, + "loss": 0.5146, + "step": 6260 + }, + { + "epoch": 2.9602836879432624, + "grad_norm": 2.9412102699279785, + "learning_rate": 2.5835772390155382e-06, + "loss": 0.4798, + "step": 6261 + }, + { + "epoch": 2.960756501182033, + "grad_norm": 2.7674343585968018, + "learning_rate": 2.582953758143227e-06, + "loss": 0.4262, + "step": 6262 + }, + { + "epoch": 2.961229314420804, + "grad_norm": 3.5219457149505615, + "learning_rate": 2.582330272105749e-06, + "loss": 0.4905, + "step": 6263 + }, + { + "epoch": 2.9617021276595743, + "grad_norm": 2.4274468421936035, + "learning_rate": 2.5817067809419267e-06, + "loss": 0.4048, + "step": 6264 + }, + { + "epoch": 2.962174940898345, + "grad_norm": 2.6907944679260254, + "learning_rate": 2.5810832846905814e-06, + "loss": 0.388, + "step": 6265 + }, + { + "epoch": 2.962647754137116, + "grad_norm": 2.603151321411133, + "learning_rate": 2.5804597833905347e-06, + "loss": 0.4377, + "step": 6266 + }, + { + "epoch": 2.9631205673758867, + "grad_norm": 2.685837507247925, + "learning_rate": 2.57983627708061e-06, + "loss": 0.4409, + "step": 6267 + }, + { + "epoch": 2.963593380614657, + "grad_norm": 2.8281500339508057, + "learning_rate": 2.579212765799631e-06, + "loss": 0.4567, + "step": 6268 + }, + { + "epoch": 2.964066193853428, + "grad_norm": 2.6387875080108643, + "learning_rate": 2.57858924958642e-06, + "loss": 0.4061, + "step": 6269 + }, + { + "epoch": 2.9645390070921986, + "grad_norm": 2.64139986038208, + "learning_rate": 2.5779657284798017e-06, + "loss": 0.4539, + "step": 6270 + }, + { + "epoch": 2.965011820330969, + "grad_norm": 2.7384836673736572, + "learning_rate": 2.5773422025185983e-06, + "loss": 0.408, + "step": 6271 + }, + { + "epoch": 2.96548463356974, + "grad_norm": 2.262514352798462, + "learning_rate": 2.576718671741636e-06, + "loss": 0.3726, + "step": 6272 + }, + { + "epoch": 2.9659574468085106, + "grad_norm": 2.53800106048584, + "learning_rate": 2.5760951361877384e-06, + "loss": 
0.4716, + "step": 6273 + }, + { + "epoch": 2.9664302600472814, + "grad_norm": 3.256701707839966, + "learning_rate": 2.57547159589573e-06, + "loss": 0.518, + "step": 6274 + }, + { + "epoch": 2.966903073286052, + "grad_norm": 2.9427342414855957, + "learning_rate": 2.574848050904436e-06, + "loss": 0.4255, + "step": 6275 + }, + { + "epoch": 2.9673758865248225, + "grad_norm": 2.5794098377227783, + "learning_rate": 2.574224501252682e-06, + "loss": 0.4412, + "step": 6276 + }, + { + "epoch": 2.9678486997635933, + "grad_norm": 2.5894877910614014, + "learning_rate": 2.573600946979294e-06, + "loss": 0.4356, + "step": 6277 + }, + { + "epoch": 2.968321513002364, + "grad_norm": 2.9597361087799072, + "learning_rate": 2.572977388123098e-06, + "loss": 0.4376, + "step": 6278 + }, + { + "epoch": 2.9687943262411345, + "grad_norm": 2.779303550720215, + "learning_rate": 2.5723538247229197e-06, + "loss": 0.3985, + "step": 6279 + }, + { + "epoch": 2.9692671394799053, + "grad_norm": 2.9173855781555176, + "learning_rate": 2.5717302568175866e-06, + "loss": 0.4581, + "step": 6280 + }, + { + "epoch": 2.969739952718676, + "grad_norm": 2.703721284866333, + "learning_rate": 2.5711066844459242e-06, + "loss": 0.3705, + "step": 6281 + }, + { + "epoch": 2.970212765957447, + "grad_norm": 2.5415029525756836, + "learning_rate": 2.5704831076467613e-06, + "loss": 0.4089, + "step": 6282 + }, + { + "epoch": 2.9706855791962177, + "grad_norm": 2.791780948638916, + "learning_rate": 2.5698595264589234e-06, + "loss": 0.4357, + "step": 6283 + }, + { + "epoch": 2.971158392434988, + "grad_norm": 2.887662887573242, + "learning_rate": 2.5692359409212392e-06, + "loss": 0.4093, + "step": 6284 + }, + { + "epoch": 2.971631205673759, + "grad_norm": 3.0309557914733887, + "learning_rate": 2.5686123510725364e-06, + "loss": 0.4461, + "step": 6285 + }, + { + "epoch": 2.9721040189125296, + "grad_norm": 2.6861515045166016, + "learning_rate": 2.5679887569516437e-06, + "loss": 0.4199, + "step": 6286 + }, + { + "epoch": 2.9725768321513, + "grad_norm": 2.7014012336730957, + "learning_rate": 2.5673651585973897e-06, + "loss": 0.4373, + "step": 6287 + }, + { + "epoch": 2.9730496453900708, + "grad_norm": 2.951265811920166, + "learning_rate": 2.5667415560486026e-06, + "loss": 0.4426, + "step": 6288 + }, + { + "epoch": 2.9735224586288416, + "grad_norm": 2.7664504051208496, + "learning_rate": 2.5661179493441106e-06, + "loss": 0.474, + "step": 6289 + }, + { + "epoch": 2.9739952718676124, + "grad_norm": 2.6081087589263916, + "learning_rate": 2.5654943385227445e-06, + "loss": 0.4058, + "step": 6290 + }, + { + "epoch": 2.974468085106383, + "grad_norm": 2.9416966438293457, + "learning_rate": 2.564870723623333e-06, + "loss": 0.506, + "step": 6291 + }, + { + "epoch": 2.9749408983451535, + "grad_norm": 2.9441659450531006, + "learning_rate": 2.564247104684706e-06, + "loss": 0.4505, + "step": 6292 + }, + { + "epoch": 2.9754137115839243, + "grad_norm": 2.7110862731933594, + "learning_rate": 2.563623481745693e-06, + "loss": 0.4493, + "step": 6293 + }, + { + "epoch": 2.975886524822695, + "grad_norm": 2.88459849357605, + "learning_rate": 2.562999854845125e-06, + "loss": 0.4462, + "step": 6294 + }, + { + "epoch": 2.9763593380614655, + "grad_norm": 3.0491793155670166, + "learning_rate": 2.5623762240218327e-06, + "loss": 0.4928, + "step": 6295 + }, + { + "epoch": 2.9768321513002363, + "grad_norm": 2.9475483894348145, + "learning_rate": 2.561752589314646e-06, + "loss": 0.4535, + "step": 6296 + }, + { + "epoch": 2.977304964539007, + "grad_norm": 2.879495859146118, + "learning_rate": 
2.561128950762397e-06, + "loss": 0.4393, + "step": 6297 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 2.8478336334228516, + "learning_rate": 2.560505308403916e-06, + "loss": 0.4363, + "step": 6298 + }, + { + "epoch": 2.9782505910165487, + "grad_norm": 2.5475094318389893, + "learning_rate": 2.5598816622780343e-06, + "loss": 0.3825, + "step": 6299 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 2.85430908203125, + "learning_rate": 2.5592580124235838e-06, + "loss": 0.4226, + "step": 6300 + }, + { + "epoch": 2.97919621749409, + "grad_norm": 2.569775104522705, + "learning_rate": 2.5586343588793975e-06, + "loss": 0.4045, + "step": 6301 + }, + { + "epoch": 2.9796690307328606, + "grad_norm": 2.4482202529907227, + "learning_rate": 2.558010701684307e-06, + "loss": 0.4625, + "step": 6302 + }, + { + "epoch": 2.980141843971631, + "grad_norm": 2.9301230907440186, + "learning_rate": 2.5573870408771436e-06, + "loss": 0.4358, + "step": 6303 + }, + { + "epoch": 2.9806146572104018, + "grad_norm": 2.9865870475769043, + "learning_rate": 2.5567633764967416e-06, + "loss": 0.497, + "step": 6304 + }, + { + "epoch": 2.9810874704491725, + "grad_norm": 2.523524522781372, + "learning_rate": 2.556139708581933e-06, + "loss": 0.4141, + "step": 6305 + }, + { + "epoch": 2.9815602836879433, + "grad_norm": 2.8489344120025635, + "learning_rate": 2.5555160371715504e-06, + "loss": 0.4205, + "step": 6306 + }, + { + "epoch": 2.982033096926714, + "grad_norm": 2.417759895324707, + "learning_rate": 2.5548923623044274e-06, + "loss": 0.44, + "step": 6307 + }, + { + "epoch": 2.9825059101654845, + "grad_norm": 2.7626900672912598, + "learning_rate": 2.554268684019398e-06, + "loss": 0.4646, + "step": 6308 + }, + { + "epoch": 2.9829787234042553, + "grad_norm": 3.0916266441345215, + "learning_rate": 2.5536450023552956e-06, + "loss": 0.4443, + "step": 6309 + }, + { + "epoch": 2.983451536643026, + "grad_norm": 2.721992015838623, + "learning_rate": 2.5530213173509542e-06, + "loss": 0.4008, + "step": 6310 + }, + { + "epoch": 2.9839243498817964, + "grad_norm": 2.825334072113037, + "learning_rate": 2.552397629045208e-06, + "loss": 0.4513, + "step": 6311 + }, + { + "epoch": 2.9843971631205672, + "grad_norm": 2.912050485610962, + "learning_rate": 2.5517739374768915e-06, + "loss": 0.4104, + "step": 6312 + }, + { + "epoch": 2.984869976359338, + "grad_norm": 2.760650634765625, + "learning_rate": 2.551150242684838e-06, + "loss": 0.4372, + "step": 6313 + }, + { + "epoch": 2.985342789598109, + "grad_norm": 2.8926033973693848, + "learning_rate": 2.5505265447078838e-06, + "loss": 0.475, + "step": 6314 + }, + { + "epoch": 2.9858156028368796, + "grad_norm": 2.6279892921447754, + "learning_rate": 2.5499028435848633e-06, + "loss": 0.4589, + "step": 6315 + }, + { + "epoch": 2.98628841607565, + "grad_norm": 3.2147316932678223, + "learning_rate": 2.549279139354611e-06, + "loss": 0.4968, + "step": 6316 + }, + { + "epoch": 2.986761229314421, + "grad_norm": 2.4510674476623535, + "learning_rate": 2.5486554320559626e-06, + "loss": 0.4291, + "step": 6317 + }, + { + "epoch": 2.9872340425531916, + "grad_norm": 2.6919643878936768, + "learning_rate": 2.5480317217277544e-06, + "loss": 0.4704, + "step": 6318 + }, + { + "epoch": 2.987706855791962, + "grad_norm": 2.9832234382629395, + "learning_rate": 2.5474080084088215e-06, + "loss": 0.4129, + "step": 6319 + }, + { + "epoch": 2.9881796690307327, + "grad_norm": 2.893209218978882, + "learning_rate": 2.5467842921380004e-06, + "loss": 0.5099, + "step": 6320 + }, + { + "epoch": 2.9886524822695035, + "grad_norm": 
2.6734580993652344, + "learning_rate": 2.5461605729541254e-06, + "loss": 0.4588, + "step": 6321 + }, + { + "epoch": 2.9891252955082743, + "grad_norm": 2.5591681003570557, + "learning_rate": 2.5455368508960343e-06, + "loss": 0.4162, + "step": 6322 + }, + { + "epoch": 2.989598108747045, + "grad_norm": 3.2619881629943848, + "learning_rate": 2.5449131260025626e-06, + "loss": 0.4412, + "step": 6323 + }, + { + "epoch": 2.9900709219858155, + "grad_norm": 2.897914409637451, + "learning_rate": 2.544289398312549e-06, + "loss": 0.5079, + "step": 6324 + }, + { + "epoch": 2.9905437352245863, + "grad_norm": 2.7891685962677, + "learning_rate": 2.5436656678648274e-06, + "loss": 0.42, + "step": 6325 + }, + { + "epoch": 2.991016548463357, + "grad_norm": 3.022341728210449, + "learning_rate": 2.5430419346982367e-06, + "loss": 0.4739, + "step": 6326 + }, + { + "epoch": 2.9914893617021274, + "grad_norm": 3.395775556564331, + "learning_rate": 2.542418198851614e-06, + "loss": 0.4822, + "step": 6327 + }, + { + "epoch": 2.9919621749408982, + "grad_norm": 3.0200490951538086, + "learning_rate": 2.541794460363795e-06, + "loss": 0.4755, + "step": 6328 + }, + { + "epoch": 2.992434988179669, + "grad_norm": 3.302020311355591, + "learning_rate": 2.541170719273619e-06, + "loss": 0.4603, + "step": 6329 + }, + { + "epoch": 2.99290780141844, + "grad_norm": 2.5985910892486572, + "learning_rate": 2.5405469756199226e-06, + "loss": 0.4475, + "step": 6330 + }, + { + "epoch": 2.9933806146572106, + "grad_norm": 2.9413928985595703, + "learning_rate": 2.5399232294415434e-06, + "loss": 0.4695, + "step": 6331 + }, + { + "epoch": 2.993853427895981, + "grad_norm": 2.942777156829834, + "learning_rate": 2.53929948077732e-06, + "loss": 0.4462, + "step": 6332 + }, + { + "epoch": 2.9943262411347518, + "grad_norm": 2.971120595932007, + "learning_rate": 2.53867572966609e-06, + "loss": 0.4546, + "step": 6333 + }, + { + "epoch": 2.9947990543735226, + "grad_norm": 2.8248138427734375, + "learning_rate": 2.5380519761466927e-06, + "loss": 0.453, + "step": 6334 + }, + { + "epoch": 2.995271867612293, + "grad_norm": 3.0819008350372314, + "learning_rate": 2.5374282202579647e-06, + "loss": 0.4774, + "step": 6335 + }, + { + "epoch": 2.9957446808510637, + "grad_norm": 2.742570161819458, + "learning_rate": 2.5368044620387466e-06, + "loss": 0.5059, + "step": 6336 + }, + { + "epoch": 2.9962174940898345, + "grad_norm": 2.9087419509887695, + "learning_rate": 2.5361807015278757e-06, + "loss": 0.3606, + "step": 6337 + }, + { + "epoch": 2.9966903073286053, + "grad_norm": 2.6887354850769043, + "learning_rate": 2.5355569387641908e-06, + "loss": 0.4247, + "step": 6338 + }, + { + "epoch": 2.997163120567376, + "grad_norm": 2.8516008853912354, + "learning_rate": 2.534933173786531e-06, + "loss": 0.4502, + "step": 6339 + }, + { + "epoch": 2.9976359338061465, + "grad_norm": 2.4463164806365967, + "learning_rate": 2.5343094066337366e-06, + "loss": 0.3883, + "step": 6340 + }, + { + "epoch": 2.9981087470449173, + "grad_norm": 2.87025785446167, + "learning_rate": 2.533685637344645e-06, + "loss": 0.4534, + "step": 6341 + }, + { + "epoch": 2.998581560283688, + "grad_norm": 3.0706169605255127, + "learning_rate": 2.5330618659580967e-06, + "loss": 0.5426, + "step": 6342 + }, + { + "epoch": 2.9990543735224584, + "grad_norm": 2.7185773849487305, + "learning_rate": 2.532438092512931e-06, + "loss": 0.497, + "step": 6343 + }, + { + "epoch": 2.999527186761229, + "grad_norm": 2.840207815170288, + "learning_rate": 2.531814317047988e-06, + "loss": 0.4073, + "step": 6344 + }, + { + "epoch": 3.0, 
+ "grad_norm": 3.1592655181884766, + "learning_rate": 2.5311905396021063e-06, + "loss": 0.4728, + "step": 6345 + }, + { + "epoch": 3.000472813238771, + "grad_norm": 2.190042495727539, + "learning_rate": 2.530566760214127e-06, + "loss": 0.3588, + "step": 6346 + }, + { + "epoch": 3.000945626477541, + "grad_norm": 2.749516248703003, + "learning_rate": 2.5299429789228898e-06, + "loss": 0.3495, + "step": 6347 + }, + { + "epoch": 3.001418439716312, + "grad_norm": 2.6181938648223877, + "learning_rate": 2.5293191957672335e-06, + "loss": 0.3611, + "step": 6348 + }, + { + "epoch": 3.0018912529550827, + "grad_norm": 2.7235212326049805, + "learning_rate": 2.528695410786e-06, + "loss": 0.4173, + "step": 6349 + }, + { + "epoch": 3.0023640661938535, + "grad_norm": 2.5408031940460205, + "learning_rate": 2.528071624018029e-06, + "loss": 0.3651, + "step": 6350 + }, + { + "epoch": 3.002836879432624, + "grad_norm": 2.7824409008026123, + "learning_rate": 2.5274478355021615e-06, + "loss": 0.378, + "step": 6351 + }, + { + "epoch": 3.0033096926713947, + "grad_norm": 2.7671427726745605, + "learning_rate": 2.526824045277238e-06, + "loss": 0.446, + "step": 6352 + }, + { + "epoch": 3.0037825059101655, + "grad_norm": 2.6746346950531006, + "learning_rate": 2.526200253382098e-06, + "loss": 0.3831, + "step": 6353 + }, + { + "epoch": 3.0042553191489363, + "grad_norm": 2.437439441680908, + "learning_rate": 2.525576459855583e-06, + "loss": 0.352, + "step": 6354 + }, + { + "epoch": 3.0047281323877066, + "grad_norm": 2.7632546424865723, + "learning_rate": 2.5249526647365343e-06, + "loss": 0.4636, + "step": 6355 + }, + { + "epoch": 3.0052009456264774, + "grad_norm": 2.681955099105835, + "learning_rate": 2.524328868063793e-06, + "loss": 0.3978, + "step": 6356 + }, + { + "epoch": 3.0056737588652482, + "grad_norm": 2.9575345516204834, + "learning_rate": 2.523705069876199e-06, + "loss": 0.3803, + "step": 6357 + }, + { + "epoch": 3.006146572104019, + "grad_norm": 2.7368216514587402, + "learning_rate": 2.523081270212594e-06, + "loss": 0.3968, + "step": 6358 + }, + { + "epoch": 3.0066193853427894, + "grad_norm": 2.637592077255249, + "learning_rate": 2.522457469111821e-06, + "loss": 0.3629, + "step": 6359 + }, + { + "epoch": 3.00709219858156, + "grad_norm": 2.579331398010254, + "learning_rate": 2.5218336666127187e-06, + "loss": 0.4044, + "step": 6360 + }, + { + "epoch": 3.007565011820331, + "grad_norm": 3.014544725418091, + "learning_rate": 2.5212098627541296e-06, + "loss": 0.3518, + "step": 6361 + }, + { + "epoch": 3.0080378250591018, + "grad_norm": 2.5261058807373047, + "learning_rate": 2.520586057574896e-06, + "loss": 0.3763, + "step": 6362 + }, + { + "epoch": 3.008510638297872, + "grad_norm": 3.234910249710083, + "learning_rate": 2.519962251113858e-06, + "loss": 0.3691, + "step": 6363 + }, + { + "epoch": 3.008983451536643, + "grad_norm": 3.2930967807769775, + "learning_rate": 2.519338443409859e-06, + "loss": 0.4363, + "step": 6364 + }, + { + "epoch": 3.0094562647754137, + "grad_norm": 2.807910442352295, + "learning_rate": 2.51871463450174e-06, + "loss": 0.3984, + "step": 6365 + }, + { + "epoch": 3.0099290780141845, + "grad_norm": 3.1555075645446777, + "learning_rate": 2.518090824428342e-06, + "loss": 0.4006, + "step": 6366 + }, + { + "epoch": 3.010401891252955, + "grad_norm": 3.1793272495269775, + "learning_rate": 2.5174670132285084e-06, + "loss": 0.4966, + "step": 6367 + }, + { + "epoch": 3.0108747044917257, + "grad_norm": 2.7007548809051514, + "learning_rate": 2.5168432009410805e-06, + "loss": 0.3755, + "step": 6368 + }, + { + 
"epoch": 3.0113475177304965, + "grad_norm": 2.914792537689209, + "learning_rate": 2.5162193876048995e-06, + "loss": 0.39, + "step": 6369 + }, + { + "epoch": 3.0118203309692673, + "grad_norm": 2.935516119003296, + "learning_rate": 2.5155955732588093e-06, + "loss": 0.4045, + "step": 6370 + }, + { + "epoch": 3.0122931442080376, + "grad_norm": 2.8817989826202393, + "learning_rate": 2.5149717579416503e-06, + "loss": 0.3751, + "step": 6371 + }, + { + "epoch": 3.0127659574468084, + "grad_norm": 2.9181740283966064, + "learning_rate": 2.514347941692266e-06, + "loss": 0.3689, + "step": 6372 + }, + { + "epoch": 3.013238770685579, + "grad_norm": 3.052060604095459, + "learning_rate": 2.5137241245494982e-06, + "loss": 0.3874, + "step": 6373 + }, + { + "epoch": 3.01371158392435, + "grad_norm": 2.6931657791137695, + "learning_rate": 2.513100306552189e-06, + "loss": 0.3673, + "step": 6374 + }, + { + "epoch": 3.0141843971631204, + "grad_norm": 2.3422248363494873, + "learning_rate": 2.5124764877391824e-06, + "loss": 0.3753, + "step": 6375 + }, + { + "epoch": 3.014657210401891, + "grad_norm": 2.5826265811920166, + "learning_rate": 2.5118526681493186e-06, + "loss": 0.3661, + "step": 6376 + }, + { + "epoch": 3.015130023640662, + "grad_norm": 2.7407493591308594, + "learning_rate": 2.5112288478214415e-06, + "loss": 0.3887, + "step": 6377 + }, + { + "epoch": 3.0156028368794328, + "grad_norm": 2.7378315925598145, + "learning_rate": 2.510605026794393e-06, + "loss": 0.3623, + "step": 6378 + }, + { + "epoch": 3.016075650118203, + "grad_norm": 2.59541654586792, + "learning_rate": 2.5099812051070167e-06, + "loss": 0.3804, + "step": 6379 + }, + { + "epoch": 3.016548463356974, + "grad_norm": 3.1022770404815674, + "learning_rate": 2.509357382798154e-06, + "loss": 0.4092, + "step": 6380 + }, + { + "epoch": 3.0170212765957447, + "grad_norm": 2.521545648574829, + "learning_rate": 2.5087335599066476e-06, + "loss": 0.3509, + "step": 6381 + }, + { + "epoch": 3.0174940898345155, + "grad_norm": 2.949395179748535, + "learning_rate": 2.5081097364713407e-06, + "loss": 0.387, + "step": 6382 + }, + { + "epoch": 3.017966903073286, + "grad_norm": 2.4806487560272217, + "learning_rate": 2.507485912531077e-06, + "loss": 0.4004, + "step": 6383 + }, + { + "epoch": 3.0184397163120567, + "grad_norm": 2.6480894088745117, + "learning_rate": 2.506862088124698e-06, + "loss": 0.3366, + "step": 6384 + }, + { + "epoch": 3.0189125295508275, + "grad_norm": 2.62559175491333, + "learning_rate": 2.5062382632910463e-06, + "loss": 0.3676, + "step": 6385 + }, + { + "epoch": 3.0193853427895982, + "grad_norm": 2.694767951965332, + "learning_rate": 2.5056144380689657e-06, + "loss": 0.3438, + "step": 6386 + }, + { + "epoch": 3.0198581560283686, + "grad_norm": 2.808107614517212, + "learning_rate": 2.504990612497299e-06, + "loss": 0.3831, + "step": 6387 + }, + { + "epoch": 3.0203309692671394, + "grad_norm": 3.2392303943634033, + "learning_rate": 2.504366786614888e-06, + "loss": 0.3493, + "step": 6388 + }, + { + "epoch": 3.02080378250591, + "grad_norm": 2.6899030208587646, + "learning_rate": 2.5037429604605774e-06, + "loss": 0.3998, + "step": 6389 + }, + { + "epoch": 3.021276595744681, + "grad_norm": 2.5622799396514893, + "learning_rate": 2.503119134073208e-06, + "loss": 0.3443, + "step": 6390 + }, + { + "epoch": 3.0217494089834513, + "grad_norm": 2.716832399368286, + "learning_rate": 2.502495307491625e-06, + "loss": 0.4465, + "step": 6391 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 2.8117692470550537, + "learning_rate": 2.501871480754669e-06, + "loss": 
0.3513, + "step": 6392 + }, + { + "epoch": 3.022695035460993, + "grad_norm": 3.1260762214660645, + "learning_rate": 2.501247653901185e-06, + "loss": 0.4336, + "step": 6393 + }, + { + "epoch": 3.0231678486997637, + "grad_norm": 2.5076897144317627, + "learning_rate": 2.5006238269700137e-06, + "loss": 0.3437, + "step": 6394 + }, + { + "epoch": 3.023640661938534, + "grad_norm": 2.781937837600708, + "learning_rate": 2.5e-06, + "loss": 0.3583, + "step": 6395 + }, + { + "epoch": 3.024113475177305, + "grad_norm": 3.084050178527832, + "learning_rate": 2.499376173029987e-06, + "loss": 0.3785, + "step": 6396 + }, + { + "epoch": 3.0245862884160757, + "grad_norm": 3.2292473316192627, + "learning_rate": 2.498752346098816e-06, + "loss": 0.3858, + "step": 6397 + }, + { + "epoch": 3.0250591016548465, + "grad_norm": 2.738614797592163, + "learning_rate": 2.498128519245332e-06, + "loss": 0.4166, + "step": 6398 + }, + { + "epoch": 3.025531914893617, + "grad_norm": 2.940103054046631, + "learning_rate": 2.4975046925083764e-06, + "loss": 0.4117, + "step": 6399 + }, + { + "epoch": 3.0260047281323876, + "grad_norm": 2.5177032947540283, + "learning_rate": 2.4968808659267927e-06, + "loss": 0.3704, + "step": 6400 + }, + { + "epoch": 3.0264775413711584, + "grad_norm": 2.6969990730285645, + "learning_rate": 2.4962570395394243e-06, + "loss": 0.3721, + "step": 6401 + }, + { + "epoch": 3.0269503546099292, + "grad_norm": 2.9696028232574463, + "learning_rate": 2.495633213385112e-06, + "loss": 0.3934, + "step": 6402 + }, + { + "epoch": 3.0274231678486996, + "grad_norm": 3.4032552242279053, + "learning_rate": 2.495009387502702e-06, + "loss": 0.3877, + "step": 6403 + }, + { + "epoch": 3.0278959810874704, + "grad_norm": 2.6801865100860596, + "learning_rate": 2.4943855619310343e-06, + "loss": 0.3421, + "step": 6404 + }, + { + "epoch": 3.028368794326241, + "grad_norm": 2.827056884765625, + "learning_rate": 2.493761736708954e-06, + "loss": 0.3791, + "step": 6405 + }, + { + "epoch": 3.028841607565012, + "grad_norm": 2.6393566131591797, + "learning_rate": 2.4931379118753034e-06, + "loss": 0.3729, + "step": 6406 + }, + { + "epoch": 3.0293144208037823, + "grad_norm": 2.833519458770752, + "learning_rate": 2.4925140874689236e-06, + "loss": 0.3836, + "step": 6407 + }, + { + "epoch": 3.029787234042553, + "grad_norm": 2.8852169513702393, + "learning_rate": 2.4918902635286597e-06, + "loss": 0.4307, + "step": 6408 + }, + { + "epoch": 3.030260047281324, + "grad_norm": 2.7166404724121094, + "learning_rate": 2.491266440093354e-06, + "loss": 0.3825, + "step": 6409 + }, + { + "epoch": 3.0307328605200947, + "grad_norm": 2.5828018188476562, + "learning_rate": 2.4906426172018474e-06, + "loss": 0.3579, + "step": 6410 + }, + { + "epoch": 3.031205673758865, + "grad_norm": 2.915632724761963, + "learning_rate": 2.490018794892985e-06, + "loss": 0.4099, + "step": 6411 + }, + { + "epoch": 3.031678486997636, + "grad_norm": 2.7117249965667725, + "learning_rate": 2.489394973205607e-06, + "loss": 0.4063, + "step": 6412 + }, + { + "epoch": 3.0321513002364067, + "grad_norm": 2.3989102840423584, + "learning_rate": 2.488771152178559e-06, + "loss": 0.3377, + "step": 6413 + }, + { + "epoch": 3.0326241134751775, + "grad_norm": 2.6560115814208984, + "learning_rate": 2.488147331850682e-06, + "loss": 0.4072, + "step": 6414 + }, + { + "epoch": 3.033096926713948, + "grad_norm": 2.9466328620910645, + "learning_rate": 2.4875235122608184e-06, + "loss": 0.3559, + "step": 6415 + }, + { + "epoch": 3.0335697399527186, + "grad_norm": 2.765348196029663, + "learning_rate": 
2.4868996934478114e-06, + "loss": 0.336, + "step": 6416 + }, + { + "epoch": 3.0340425531914894, + "grad_norm": 2.6021807193756104, + "learning_rate": 2.4862758754505017e-06, + "loss": 0.3861, + "step": 6417 + }, + { + "epoch": 3.03451536643026, + "grad_norm": 2.7293684482574463, + "learning_rate": 2.4856520583077344e-06, + "loss": 0.3926, + "step": 6418 + }, + { + "epoch": 3.0349881796690306, + "grad_norm": 2.9704763889312744, + "learning_rate": 2.485028242058351e-06, + "loss": 0.4303, + "step": 6419 + }, + { + "epoch": 3.0354609929078014, + "grad_norm": 3.385713815689087, + "learning_rate": 2.484404426741191e-06, + "loss": 0.44, + "step": 6420 + }, + { + "epoch": 3.035933806146572, + "grad_norm": 3.177983045578003, + "learning_rate": 2.4837806123951013e-06, + "loss": 0.4256, + "step": 6421 + }, + { + "epoch": 3.036406619385343, + "grad_norm": 2.6287200450897217, + "learning_rate": 2.4831567990589203e-06, + "loss": 0.3764, + "step": 6422 + }, + { + "epoch": 3.0368794326241133, + "grad_norm": 2.81823992729187, + "learning_rate": 2.4825329867714924e-06, + "loss": 0.3645, + "step": 6423 + }, + { + "epoch": 3.037352245862884, + "grad_norm": 3.1826934814453125, + "learning_rate": 2.4819091755716586e-06, + "loss": 0.3666, + "step": 6424 + }, + { + "epoch": 3.037825059101655, + "grad_norm": 3.0880346298217773, + "learning_rate": 2.481285365498261e-06, + "loss": 0.4339, + "step": 6425 + }, + { + "epoch": 3.0382978723404257, + "grad_norm": 3.1764965057373047, + "learning_rate": 2.480661556590142e-06, + "loss": 0.4804, + "step": 6426 + }, + { + "epoch": 3.038770685579196, + "grad_norm": 2.89469313621521, + "learning_rate": 2.480037748886142e-06, + "loss": 0.3875, + "step": 6427 + }, + { + "epoch": 3.039243498817967, + "grad_norm": 2.6043636798858643, + "learning_rate": 2.479413942425105e-06, + "loss": 0.3859, + "step": 6428 + }, + { + "epoch": 3.0397163120567376, + "grad_norm": 2.6570727825164795, + "learning_rate": 2.4787901372458712e-06, + "loss": 0.3508, + "step": 6429 + }, + { + "epoch": 3.0401891252955084, + "grad_norm": 2.914050579071045, + "learning_rate": 2.4781663333872825e-06, + "loss": 0.3904, + "step": 6430 + }, + { + "epoch": 3.040661938534279, + "grad_norm": 2.595606803894043, + "learning_rate": 2.47754253088818e-06, + "loss": 0.3753, + "step": 6431 + }, + { + "epoch": 3.0411347517730496, + "grad_norm": 2.68186616897583, + "learning_rate": 2.4769187297874065e-06, + "loss": 0.3545, + "step": 6432 + }, + { + "epoch": 3.0416075650118204, + "grad_norm": 2.956507921218872, + "learning_rate": 2.476294930123802e-06, + "loss": 0.3778, + "step": 6433 + }, + { + "epoch": 3.042080378250591, + "grad_norm": 2.8327226638793945, + "learning_rate": 2.475671131936209e-06, + "loss": 0.3205, + "step": 6434 + }, + { + "epoch": 3.0425531914893615, + "grad_norm": 2.594348430633545, + "learning_rate": 2.475047335263466e-06, + "loss": 0.3859, + "step": 6435 + }, + { + "epoch": 3.0430260047281323, + "grad_norm": 3.5030717849731445, + "learning_rate": 2.4744235401444177e-06, + "loss": 0.3611, + "step": 6436 + }, + { + "epoch": 3.043498817966903, + "grad_norm": 2.8478317260742188, + "learning_rate": 2.4737997466179034e-06, + "loss": 0.3927, + "step": 6437 + }, + { + "epoch": 3.043971631205674, + "grad_norm": 2.677827835083008, + "learning_rate": 2.4731759547227627e-06, + "loss": 0.3784, + "step": 6438 + }, + { + "epoch": 3.0444444444444443, + "grad_norm": 3.0059866905212402, + "learning_rate": 2.4725521644978393e-06, + "loss": 0.4279, + "step": 6439 + }, + { + "epoch": 3.044917257683215, + "grad_norm": 
3.012500047683716, + "learning_rate": 2.4719283759819713e-06, + "loss": 0.4007, + "step": 6440 + }, + { + "epoch": 3.045390070921986, + "grad_norm": 2.758204936981201, + "learning_rate": 2.4713045892140007e-06, + "loss": 0.3668, + "step": 6441 + }, + { + "epoch": 3.0458628841607567, + "grad_norm": 2.9551615715026855, + "learning_rate": 2.4706808042327678e-06, + "loss": 0.3524, + "step": 6442 + }, + { + "epoch": 3.046335697399527, + "grad_norm": 2.8639965057373047, + "learning_rate": 2.4700570210771115e-06, + "loss": 0.3886, + "step": 6443 + }, + { + "epoch": 3.046808510638298, + "grad_norm": 2.718219757080078, + "learning_rate": 2.4694332397858738e-06, + "loss": 0.3693, + "step": 6444 + }, + { + "epoch": 3.0472813238770686, + "grad_norm": 3.050135612487793, + "learning_rate": 2.4688094603978933e-06, + "loss": 0.3979, + "step": 6445 + }, + { + "epoch": 3.0477541371158394, + "grad_norm": 2.786186456680298, + "learning_rate": 2.468185682952013e-06, + "loss": 0.3809, + "step": 6446 + }, + { + "epoch": 3.0482269503546098, + "grad_norm": 2.6462252140045166, + "learning_rate": 2.4675619074870697e-06, + "loss": 0.3746, + "step": 6447 + }, + { + "epoch": 3.0486997635933806, + "grad_norm": 2.984783887863159, + "learning_rate": 2.4669381340419037e-06, + "loss": 0.4092, + "step": 6448 + }, + { + "epoch": 3.0491725768321514, + "grad_norm": 2.936380624771118, + "learning_rate": 2.466314362655356e-06, + "loss": 0.4335, + "step": 6449 + }, + { + "epoch": 3.049645390070922, + "grad_norm": 2.730738639831543, + "learning_rate": 2.465690593366264e-06, + "loss": 0.364, + "step": 6450 + }, + { + "epoch": 3.0501182033096925, + "grad_norm": 2.7273590564727783, + "learning_rate": 2.4650668262134693e-06, + "loss": 0.3905, + "step": 6451 + }, + { + "epoch": 3.0505910165484633, + "grad_norm": 2.9588208198547363, + "learning_rate": 2.4644430612358105e-06, + "loss": 0.3936, + "step": 6452 + }, + { + "epoch": 3.051063829787234, + "grad_norm": 2.8721611499786377, + "learning_rate": 2.4638192984721247e-06, + "loss": 0.4279, + "step": 6453 + }, + { + "epoch": 3.051536643026005, + "grad_norm": 3.7179651260375977, + "learning_rate": 2.463195537961254e-06, + "loss": 0.427, + "step": 6454 + }, + { + "epoch": 3.0520094562647753, + "grad_norm": 2.651731491088867, + "learning_rate": 2.4625717797420353e-06, + "loss": 0.3471, + "step": 6455 + }, + { + "epoch": 3.052482269503546, + "grad_norm": 3.898737668991089, + "learning_rate": 2.4619480238533085e-06, + "loss": 0.4574, + "step": 6456 + }, + { + "epoch": 3.052955082742317, + "grad_norm": 2.916252374649048, + "learning_rate": 2.4613242703339108e-06, + "loss": 0.3622, + "step": 6457 + }, + { + "epoch": 3.0534278959810877, + "grad_norm": 3.122565507888794, + "learning_rate": 2.4607005192226806e-06, + "loss": 0.3954, + "step": 6458 + }, + { + "epoch": 3.053900709219858, + "grad_norm": 3.2377424240112305, + "learning_rate": 2.4600767705584575e-06, + "loss": 0.4082, + "step": 6459 + }, + { + "epoch": 3.054373522458629, + "grad_norm": 2.941102981567383, + "learning_rate": 2.459453024380079e-06, + "loss": 0.4324, + "step": 6460 + }, + { + "epoch": 3.0548463356973996, + "grad_norm": 2.964313507080078, + "learning_rate": 2.4588292807263816e-06, + "loss": 0.3037, + "step": 6461 + }, + { + "epoch": 3.0553191489361704, + "grad_norm": 2.824669599533081, + "learning_rate": 2.4582055396362055e-06, + "loss": 0.4076, + "step": 6462 + }, + { + "epoch": 3.0557919621749408, + "grad_norm": 2.7739884853363037, + "learning_rate": 2.457581801148387e-06, + "loss": 0.3615, + "step": 6463 + }, + { + 
"epoch": 3.0562647754137116, + "grad_norm": 3.2974464893341064, + "learning_rate": 2.456958065301764e-06, + "loss": 0.426, + "step": 6464 + }, + { + "epoch": 3.0567375886524824, + "grad_norm": 3.0801217555999756, + "learning_rate": 2.456334332135174e-06, + "loss": 0.3737, + "step": 6465 + }, + { + "epoch": 3.057210401891253, + "grad_norm": 2.788851022720337, + "learning_rate": 2.455710601687452e-06, + "loss": 0.4367, + "step": 6466 + }, + { + "epoch": 3.0576832151300235, + "grad_norm": 2.8078136444091797, + "learning_rate": 2.4550868739974378e-06, + "loss": 0.3796, + "step": 6467 + }, + { + "epoch": 3.0581560283687943, + "grad_norm": 2.9871349334716797, + "learning_rate": 2.4544631491039657e-06, + "loss": 0.3869, + "step": 6468 + }, + { + "epoch": 3.058628841607565, + "grad_norm": 2.9170174598693848, + "learning_rate": 2.453839427045875e-06, + "loss": 0.4591, + "step": 6469 + }, + { + "epoch": 3.059101654846336, + "grad_norm": 2.7316131591796875, + "learning_rate": 2.4532157078620013e-06, + "loss": 0.3723, + "step": 6470 + }, + { + "epoch": 3.0595744680851062, + "grad_norm": 3.047921657562256, + "learning_rate": 2.4525919915911793e-06, + "loss": 0.3804, + "step": 6471 + }, + { + "epoch": 3.060047281323877, + "grad_norm": 3.047934055328369, + "learning_rate": 2.4519682782722465e-06, + "loss": 0.3949, + "step": 6472 + }, + { + "epoch": 3.060520094562648, + "grad_norm": 2.4911186695098877, + "learning_rate": 2.4513445679440374e-06, + "loss": 0.3629, + "step": 6473 + }, + { + "epoch": 3.0609929078014186, + "grad_norm": 2.5353519916534424, + "learning_rate": 2.4507208606453895e-06, + "loss": 0.3417, + "step": 6474 + }, + { + "epoch": 3.061465721040189, + "grad_norm": 2.474622964859009, + "learning_rate": 2.4500971564151384e-06, + "loss": 0.3468, + "step": 6475 + }, + { + "epoch": 3.06193853427896, + "grad_norm": 2.7016963958740234, + "learning_rate": 2.4494734552921166e-06, + "loss": 0.3872, + "step": 6476 + }, + { + "epoch": 3.0624113475177306, + "grad_norm": 2.912144184112549, + "learning_rate": 2.4488497573151625e-06, + "loss": 0.3727, + "step": 6477 + }, + { + "epoch": 3.0628841607565014, + "grad_norm": 2.8234877586364746, + "learning_rate": 2.4482260625231093e-06, + "loss": 0.3472, + "step": 6478 + }, + { + "epoch": 3.0633569739952717, + "grad_norm": 2.6554179191589355, + "learning_rate": 2.447602370954793e-06, + "loss": 0.343, + "step": 6479 + }, + { + "epoch": 3.0638297872340425, + "grad_norm": 2.666419744491577, + "learning_rate": 2.446978682649047e-06, + "loss": 0.3932, + "step": 6480 + }, + { + "epoch": 3.0643026004728133, + "grad_norm": 2.968574285507202, + "learning_rate": 2.446354997644705e-06, + "loss": 0.4418, + "step": 6481 + }, + { + "epoch": 3.064775413711584, + "grad_norm": 2.692253589630127, + "learning_rate": 2.4457313159806028e-06, + "loss": 0.3141, + "step": 6482 + }, + { + "epoch": 3.0652482269503545, + "grad_norm": 2.5857295989990234, + "learning_rate": 2.445107637695574e-06, + "loss": 0.3392, + "step": 6483 + }, + { + "epoch": 3.0657210401891253, + "grad_norm": 3.2332825660705566, + "learning_rate": 2.4444839628284504e-06, + "loss": 0.4694, + "step": 6484 + }, + { + "epoch": 3.066193853427896, + "grad_norm": 2.7391014099121094, + "learning_rate": 2.4438602914180684e-06, + "loss": 0.3966, + "step": 6485 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 2.7882139682769775, + "learning_rate": 2.4432366235032593e-06, + "loss": 0.3552, + "step": 6486 + }, + { + "epoch": 3.0671394799054372, + "grad_norm": 2.8907811641693115, + "learning_rate": 2.4426129591228573e-06, + 
"loss": 0.4478, + "step": 6487 + }, + { + "epoch": 3.067612293144208, + "grad_norm": 2.878929853439331, + "learning_rate": 2.4419892983156947e-06, + "loss": 0.3457, + "step": 6488 + }, + { + "epoch": 3.068085106382979, + "grad_norm": 2.7087442874908447, + "learning_rate": 2.441365641120603e-06, + "loss": 0.3491, + "step": 6489 + }, + { + "epoch": 3.0685579196217496, + "grad_norm": 3.2330431938171387, + "learning_rate": 2.4407419875764167e-06, + "loss": 0.3901, + "step": 6490 + }, + { + "epoch": 3.06903073286052, + "grad_norm": 3.0529370307922363, + "learning_rate": 2.440118337721966e-06, + "loss": 0.4059, + "step": 6491 + }, + { + "epoch": 3.0695035460992908, + "grad_norm": 2.4786794185638428, + "learning_rate": 2.439494691596085e-06, + "loss": 0.3153, + "step": 6492 + }, + { + "epoch": 3.0699763593380616, + "grad_norm": 2.956310510635376, + "learning_rate": 2.438871049237604e-06, + "loss": 0.3973, + "step": 6493 + }, + { + "epoch": 3.0704491725768324, + "grad_norm": 3.0816991329193115, + "learning_rate": 2.4382474106853543e-06, + "loss": 0.388, + "step": 6494 + }, + { + "epoch": 3.0709219858156027, + "grad_norm": 2.6103477478027344, + "learning_rate": 2.4376237759781686e-06, + "loss": 0.3656, + "step": 6495 + }, + { + "epoch": 3.0713947990543735, + "grad_norm": 2.974076271057129, + "learning_rate": 2.437000145154875e-06, + "loss": 0.3246, + "step": 6496 + }, + { + "epoch": 3.0718676122931443, + "grad_norm": 2.633605718612671, + "learning_rate": 2.4363765182543075e-06, + "loss": 0.3556, + "step": 6497 + }, + { + "epoch": 3.072340425531915, + "grad_norm": 2.49161434173584, + "learning_rate": 2.4357528953152953e-06, + "loss": 0.3506, + "step": 6498 + }, + { + "epoch": 3.0728132387706855, + "grad_norm": 2.6435935497283936, + "learning_rate": 2.4351292763766676e-06, + "loss": 0.3652, + "step": 6499 + }, + { + "epoch": 3.0732860520094563, + "grad_norm": 2.9710617065429688, + "learning_rate": 2.4345056614772563e-06, + "loss": 0.3713, + "step": 6500 + }, + { + "epoch": 3.073758865248227, + "grad_norm": 2.6947052478790283, + "learning_rate": 2.43388205065589e-06, + "loss": 0.378, + "step": 6501 + }, + { + "epoch": 3.0742316784869974, + "grad_norm": 2.9686238765716553, + "learning_rate": 2.433258443951398e-06, + "loss": 0.3936, + "step": 6502 + }, + { + "epoch": 3.074704491725768, + "grad_norm": 2.6008691787719727, + "learning_rate": 2.432634841402611e-06, + "loss": 0.3709, + "step": 6503 + }, + { + "epoch": 3.075177304964539, + "grad_norm": 2.595116376876831, + "learning_rate": 2.4320112430483563e-06, + "loss": 0.3884, + "step": 6504 + }, + { + "epoch": 3.07565011820331, + "grad_norm": 2.685241460800171, + "learning_rate": 2.431387648927464e-06, + "loss": 0.3751, + "step": 6505 + }, + { + "epoch": 3.0761229314420806, + "grad_norm": 2.8863797187805176, + "learning_rate": 2.430764059078762e-06, + "loss": 0.3765, + "step": 6506 + }, + { + "epoch": 3.076595744680851, + "grad_norm": 3.020766019821167, + "learning_rate": 2.430140473541077e-06, + "loss": 0.362, + "step": 6507 + }, + { + "epoch": 3.0770685579196217, + "grad_norm": 2.9521167278289795, + "learning_rate": 2.42951689235324e-06, + "loss": 0.41, + "step": 6508 + }, + { + "epoch": 3.0775413711583925, + "grad_norm": 2.5844924449920654, + "learning_rate": 2.4288933155540757e-06, + "loss": 0.3258, + "step": 6509 + }, + { + "epoch": 3.078014184397163, + "grad_norm": 3.052661657333374, + "learning_rate": 2.4282697431824138e-06, + "loss": 0.363, + "step": 6510 + }, + { + "epoch": 3.0784869976359337, + "grad_norm": 3.109342575073242, + "learning_rate": 
2.427646175277081e-06, + "loss": 0.4105, + "step": 6511 + }, + { + "epoch": 3.0789598108747045, + "grad_norm": 3.3141326904296875, + "learning_rate": 2.427022611876903e-06, + "loss": 0.405, + "step": 6512 + }, + { + "epoch": 3.0794326241134753, + "grad_norm": 3.054673194885254, + "learning_rate": 2.426399053020707e-06, + "loss": 0.3532, + "step": 6513 + }, + { + "epoch": 3.079905437352246, + "grad_norm": 2.823489189147949, + "learning_rate": 2.425775498747318e-06, + "loss": 0.3762, + "step": 6514 + }, + { + "epoch": 3.0803782505910164, + "grad_norm": 2.6739792823791504, + "learning_rate": 2.425151949095565e-06, + "loss": 0.4044, + "step": 6515 + }, + { + "epoch": 3.0808510638297872, + "grad_norm": 2.7313177585601807, + "learning_rate": 2.4245284041042714e-06, + "loss": 0.3136, + "step": 6516 + }, + { + "epoch": 3.081323877068558, + "grad_norm": 3.1661181449890137, + "learning_rate": 2.4239048638122624e-06, + "loss": 0.44, + "step": 6517 + }, + { + "epoch": 3.0817966903073284, + "grad_norm": 3.326542377471924, + "learning_rate": 2.4232813282583647e-06, + "loss": 0.3798, + "step": 6518 + }, + { + "epoch": 3.082269503546099, + "grad_norm": 3.0194952487945557, + "learning_rate": 2.422657797481402e-06, + "loss": 0.423, + "step": 6519 + }, + { + "epoch": 3.08274231678487, + "grad_norm": 2.6704318523406982, + "learning_rate": 2.4220342715201995e-06, + "loss": 0.41, + "step": 6520 + }, + { + "epoch": 3.083215130023641, + "grad_norm": 3.057990312576294, + "learning_rate": 2.421410750413581e-06, + "loss": 0.4096, + "step": 6521 + }, + { + "epoch": 3.083687943262411, + "grad_norm": 2.6242079734802246, + "learning_rate": 2.4207872342003693e-06, + "loss": 0.3673, + "step": 6522 + }, + { + "epoch": 3.084160756501182, + "grad_norm": 2.933910846710205, + "learning_rate": 2.4201637229193904e-06, + "loss": 0.4018, + "step": 6523 + }, + { + "epoch": 3.0846335697399527, + "grad_norm": 2.6973681449890137, + "learning_rate": 2.4195402166094657e-06, + "loss": 0.3533, + "step": 6524 + }, + { + "epoch": 3.0851063829787235, + "grad_norm": 3.096013307571411, + "learning_rate": 2.4189167153094194e-06, + "loss": 0.3872, + "step": 6525 + }, + { + "epoch": 3.085579196217494, + "grad_norm": 3.0707414150238037, + "learning_rate": 2.4182932190580737e-06, + "loss": 0.3775, + "step": 6526 + }, + { + "epoch": 3.0860520094562647, + "grad_norm": 2.873190402984619, + "learning_rate": 2.417669727894251e-06, + "loss": 0.3144, + "step": 6527 + }, + { + "epoch": 3.0865248226950355, + "grad_norm": 2.316431999206543, + "learning_rate": 2.4170462418567732e-06, + "loss": 0.3238, + "step": 6528 + }, + { + "epoch": 3.0869976359338063, + "grad_norm": 2.3672494888305664, + "learning_rate": 2.4164227609844626e-06, + "loss": 0.3585, + "step": 6529 + }, + { + "epoch": 3.0874704491725766, + "grad_norm": 2.904538154602051, + "learning_rate": 2.415799285316139e-06, + "loss": 0.366, + "step": 6530 + }, + { + "epoch": 3.0879432624113474, + "grad_norm": 2.914602279663086, + "learning_rate": 2.415175814890626e-06, + "loss": 0.3793, + "step": 6531 + }, + { + "epoch": 3.088416075650118, + "grad_norm": 2.652005672454834, + "learning_rate": 2.4145523497467417e-06, + "loss": 0.362, + "step": 6532 + }, + { + "epoch": 3.088888888888889, + "grad_norm": 2.5137813091278076, + "learning_rate": 2.413928889923309e-06, + "loss": 0.2974, + "step": 6533 + }, + { + "epoch": 3.0893617021276594, + "grad_norm": 3.2166645526885986, + "learning_rate": 2.413305435459147e-06, + "loss": 0.4151, + "step": 6534 + }, + { + "epoch": 3.08983451536643, + "grad_norm": 
3.0506820678710938, + "learning_rate": 2.412681986393075e-06, + "loss": 0.4223, + "step": 6535 + }, + { + "epoch": 3.090307328605201, + "grad_norm": 3.035275936126709, + "learning_rate": 2.412058542763913e-06, + "loss": 0.4841, + "step": 6536 + }, + { + "epoch": 3.0907801418439718, + "grad_norm": 3.3195009231567383, + "learning_rate": 2.4114351046104793e-06, + "loss": 0.4205, + "step": 6537 + }, + { + "epoch": 3.091252955082742, + "grad_norm": 2.8700361251831055, + "learning_rate": 2.410811671971594e-06, + "loss": 0.3704, + "step": 6538 + }, + { + "epoch": 3.091725768321513, + "grad_norm": 2.900595188140869, + "learning_rate": 2.410188244886075e-06, + "loss": 0.4184, + "step": 6539 + }, + { + "epoch": 3.0921985815602837, + "grad_norm": 2.88179349899292, + "learning_rate": 2.409564823392739e-06, + "loss": 0.4156, + "step": 6540 + }, + { + "epoch": 3.0926713947990545, + "grad_norm": 2.677568197250366, + "learning_rate": 2.408941407530406e-06, + "loss": 0.4084, + "step": 6541 + }, + { + "epoch": 3.093144208037825, + "grad_norm": 3.0236027240753174, + "learning_rate": 2.408317997337892e-06, + "loss": 0.4384, + "step": 6542 + }, + { + "epoch": 3.0936170212765957, + "grad_norm": 3.1708545684814453, + "learning_rate": 2.4076945928540143e-06, + "loss": 0.3876, + "step": 6543 + }, + { + "epoch": 3.0940898345153665, + "grad_norm": 3.248821973800659, + "learning_rate": 2.40707119411759e-06, + "loss": 0.3865, + "step": 6544 + }, + { + "epoch": 3.0945626477541373, + "grad_norm": 3.0961649417877197, + "learning_rate": 2.4064478011674334e-06, + "loss": 0.3982, + "step": 6545 + }, + { + "epoch": 3.0950354609929076, + "grad_norm": 3.1989805698394775, + "learning_rate": 2.4058244140423637e-06, + "loss": 0.4777, + "step": 6546 + }, + { + "epoch": 3.0955082742316784, + "grad_norm": 2.805640459060669, + "learning_rate": 2.4052010327811933e-06, + "loss": 0.3764, + "step": 6547 + }, + { + "epoch": 3.095981087470449, + "grad_norm": 2.7225050926208496, + "learning_rate": 2.40457765742274e-06, + "loss": 0.3286, + "step": 6548 + }, + { + "epoch": 3.09645390070922, + "grad_norm": 3.119915008544922, + "learning_rate": 2.4039542880058174e-06, + "loss": 0.4463, + "step": 6549 + }, + { + "epoch": 3.0969267139479904, + "grad_norm": 2.8503530025482178, + "learning_rate": 2.4033309245692403e-06, + "loss": 0.395, + "step": 6550 + }, + { + "epoch": 3.097399527186761, + "grad_norm": 2.947504758834839, + "learning_rate": 2.4027075671518225e-06, + "loss": 0.4024, + "step": 6551 + }, + { + "epoch": 3.097872340425532, + "grad_norm": 3.170905113220215, + "learning_rate": 2.402084215792377e-06, + "loss": 0.4302, + "step": 6552 + }, + { + "epoch": 3.0983451536643027, + "grad_norm": 2.910475492477417, + "learning_rate": 2.4014608705297195e-06, + "loss": 0.4037, + "step": 6553 + }, + { + "epoch": 3.098817966903073, + "grad_norm": 2.627511978149414, + "learning_rate": 2.400837531402661e-06, + "loss": 0.3972, + "step": 6554 + }, + { + "epoch": 3.099290780141844, + "grad_norm": 2.6485681533813477, + "learning_rate": 2.4002141984500133e-06, + "loss": 0.4044, + "step": 6555 + }, + { + "epoch": 3.0997635933806147, + "grad_norm": 2.930954694747925, + "learning_rate": 2.399590871710592e-06, + "loss": 0.4214, + "step": 6556 + }, + { + "epoch": 3.1002364066193855, + "grad_norm": 2.6014554500579834, + "learning_rate": 2.3989675512232063e-06, + "loss": 0.3493, + "step": 6557 + }, + { + "epoch": 3.100709219858156, + "grad_norm": 2.899001121520996, + "learning_rate": 2.398344237026667e-06, + "loss": 0.382, + "step": 6558 + }, + { + "epoch": 
3.1011820330969266, + "grad_norm": 2.4698870182037354, + "learning_rate": 2.3977209291597876e-06, + "loss": 0.3558, + "step": 6559 + }, + { + "epoch": 3.1016548463356974, + "grad_norm": 3.2926251888275146, + "learning_rate": 2.3970976276613763e-06, + "loss": 0.4078, + "step": 6560 + }, + { + "epoch": 3.1021276595744682, + "grad_norm": 2.5306150913238525, + "learning_rate": 2.3964743325702454e-06, + "loss": 0.3657, + "step": 6561 + }, + { + "epoch": 3.1026004728132386, + "grad_norm": 2.727583408355713, + "learning_rate": 2.395851043925204e-06, + "loss": 0.3791, + "step": 6562 + }, + { + "epoch": 3.1030732860520094, + "grad_norm": 3.1403541564941406, + "learning_rate": 2.3952277617650602e-06, + "loss": 0.3934, + "step": 6563 + }, + { + "epoch": 3.10354609929078, + "grad_norm": 2.5816383361816406, + "learning_rate": 2.3946044861286256e-06, + "loss": 0.3703, + "step": 6564 + }, + { + "epoch": 3.104018912529551, + "grad_norm": 2.5742220878601074, + "learning_rate": 2.3939812170547067e-06, + "loss": 0.3628, + "step": 6565 + }, + { + "epoch": 3.1044917257683213, + "grad_norm": 2.7276530265808105, + "learning_rate": 2.393357954582113e-06, + "loss": 0.3789, + "step": 6566 + }, + { + "epoch": 3.104964539007092, + "grad_norm": 3.05595064163208, + "learning_rate": 2.3927346987496515e-06, + "loss": 0.3766, + "step": 6567 + }, + { + "epoch": 3.105437352245863, + "grad_norm": 2.786970615386963, + "learning_rate": 2.39211144959613e-06, + "loss": 0.3329, + "step": 6568 + }, + { + "epoch": 3.1059101654846337, + "grad_norm": 3.499018430709839, + "learning_rate": 2.391488207160356e-06, + "loss": 0.4175, + "step": 6569 + }, + { + "epoch": 3.106382978723404, + "grad_norm": 2.969735860824585, + "learning_rate": 2.3908649714811346e-06, + "loss": 0.3893, + "step": 6570 + }, + { + "epoch": 3.106855791962175, + "grad_norm": 3.1494929790496826, + "learning_rate": 2.3902417425972734e-06, + "loss": 0.4048, + "step": 6571 + }, + { + "epoch": 3.1073286052009457, + "grad_norm": 2.6393489837646484, + "learning_rate": 2.3896185205475782e-06, + "loss": 0.3216, + "step": 6572 + }, + { + "epoch": 3.1078014184397165, + "grad_norm": 3.6984152793884277, + "learning_rate": 2.3889953053708528e-06, + "loss": 0.3646, + "step": 6573 + }, + { + "epoch": 3.108274231678487, + "grad_norm": 3.518547534942627, + "learning_rate": 2.388372097105903e-06, + "loss": 0.3627, + "step": 6574 + }, + { + "epoch": 3.1087470449172576, + "grad_norm": 3.422043800354004, + "learning_rate": 2.3877488957915333e-06, + "loss": 0.4116, + "step": 6575 + }, + { + "epoch": 3.1092198581560284, + "grad_norm": 2.8088064193725586, + "learning_rate": 2.3871257014665486e-06, + "loss": 0.3477, + "step": 6576 + }, + { + "epoch": 3.109692671394799, + "grad_norm": 2.7877607345581055, + "learning_rate": 2.3865025141697513e-06, + "loss": 0.351, + "step": 6577 + }, + { + "epoch": 3.1101654846335696, + "grad_norm": 2.9446799755096436, + "learning_rate": 2.3858793339399433e-06, + "loss": 0.4025, + "step": 6578 + }, + { + "epoch": 3.1106382978723404, + "grad_norm": 2.886584758758545, + "learning_rate": 2.3852561608159304e-06, + "loss": 0.3765, + "step": 6579 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 3.45711088180542, + "learning_rate": 2.384632994836513e-06, + "loss": 0.3744, + "step": 6580 + }, + { + "epoch": 3.111583924349882, + "grad_norm": 2.737441301345825, + "learning_rate": 2.3840098360404916e-06, + "loss": 0.4048, + "step": 6581 + }, + { + "epoch": 3.1120567375886523, + "grad_norm": 2.742567300796509, + "learning_rate": 2.383386684466671e-06, + "loss": 
0.3717, + "step": 6582 + }, + { + "epoch": 3.112529550827423, + "grad_norm": 3.017970561981201, + "learning_rate": 2.382763540153849e-06, + "loss": 0.3922, + "step": 6583 + }, + { + "epoch": 3.113002364066194, + "grad_norm": 3.132004499435425, + "learning_rate": 2.3821404031408283e-06, + "loss": 0.3969, + "step": 6584 + }, + { + "epoch": 3.1134751773049647, + "grad_norm": 2.910820245742798, + "learning_rate": 2.3815172734664075e-06, + "loss": 0.4241, + "step": 6585 + }, + { + "epoch": 3.113947990543735, + "grad_norm": 3.0029842853546143, + "learning_rate": 2.380894151169386e-06, + "loss": 0.4007, + "step": 6586 + }, + { + "epoch": 3.114420803782506, + "grad_norm": 3.0309178829193115, + "learning_rate": 2.380271036288564e-06, + "loss": 0.3876, + "step": 6587 + }, + { + "epoch": 3.1148936170212767, + "grad_norm": 2.963204860687256, + "learning_rate": 2.379647928862739e-06, + "loss": 0.4017, + "step": 6588 + }, + { + "epoch": 3.1153664302600474, + "grad_norm": 3.0127944946289062, + "learning_rate": 2.3790248289307103e-06, + "loss": 0.3651, + "step": 6589 + }, + { + "epoch": 3.115839243498818, + "grad_norm": 2.557485580444336, + "learning_rate": 2.3784017365312755e-06, + "loss": 0.3419, + "step": 6590 + }, + { + "epoch": 3.1163120567375886, + "grad_norm": 2.8577969074249268, + "learning_rate": 2.3777786517032306e-06, + "loss": 0.372, + "step": 6591 + }, + { + "epoch": 3.1167848699763594, + "grad_norm": 2.450324058532715, + "learning_rate": 2.3771555744853735e-06, + "loss": 0.3442, + "step": 6592 + }, + { + "epoch": 3.11725768321513, + "grad_norm": 2.7939295768737793, + "learning_rate": 2.3765325049164996e-06, + "loss": 0.401, + "step": 6593 + }, + { + "epoch": 3.1177304964539005, + "grad_norm": 2.9690325260162354, + "learning_rate": 2.3759094430354056e-06, + "loss": 0.3962, + "step": 6594 + }, + { + "epoch": 3.1182033096926713, + "grad_norm": 2.7630631923675537, + "learning_rate": 2.375286388880887e-06, + "loss": 0.4126, + "step": 6595 + }, + { + "epoch": 3.118676122931442, + "grad_norm": 2.6259944438934326, + "learning_rate": 2.3746633424917366e-06, + "loss": 0.3285, + "step": 6596 + }, + { + "epoch": 3.119148936170213, + "grad_norm": 2.7107701301574707, + "learning_rate": 2.3740403039067516e-06, + "loss": 0.3636, + "step": 6597 + }, + { + "epoch": 3.1196217494089833, + "grad_norm": 2.985301971435547, + "learning_rate": 2.373417273164724e-06, + "loss": 0.3928, + "step": 6598 + }, + { + "epoch": 3.120094562647754, + "grad_norm": 3.2578976154327393, + "learning_rate": 2.3727942503044483e-06, + "loss": 0.3379, + "step": 6599 + }, + { + "epoch": 3.120567375886525, + "grad_norm": 3.1681406497955322, + "learning_rate": 2.372171235364717e-06, + "loss": 0.4023, + "step": 6600 + }, + { + "epoch": 3.1210401891252957, + "grad_norm": 3.120147705078125, + "learning_rate": 2.371548228384321e-06, + "loss": 0.4228, + "step": 6601 + }, + { + "epoch": 3.121513002364066, + "grad_norm": 2.7786099910736084, + "learning_rate": 2.3709252294020547e-06, + "loss": 0.4386, + "step": 6602 + }, + { + "epoch": 3.121985815602837, + "grad_norm": 2.698849678039551, + "learning_rate": 2.3703022384567086e-06, + "loss": 0.3861, + "step": 6603 + }, + { + "epoch": 3.1224586288416076, + "grad_norm": 2.7917959690093994, + "learning_rate": 2.3696792555870724e-06, + "loss": 0.3535, + "step": 6604 + }, + { + "epoch": 3.1229314420803784, + "grad_norm": 2.8249263763427734, + "learning_rate": 2.3690562808319385e-06, + "loss": 0.3415, + "step": 6605 + }, + { + "epoch": 3.123404255319149, + "grad_norm": 2.567458391189575, + 
"learning_rate": 2.368433314230095e-06, + "loss": 0.3827, + "step": 6606 + }, + { + "epoch": 3.1238770685579196, + "grad_norm": 2.9670443534851074, + "learning_rate": 2.3678103558203328e-06, + "loss": 0.4238, + "step": 6607 + }, + { + "epoch": 3.1243498817966904, + "grad_norm": 2.6893439292907715, + "learning_rate": 2.36718740564144e-06, + "loss": 0.3461, + "step": 6608 + }, + { + "epoch": 3.124822695035461, + "grad_norm": 3.2669708728790283, + "learning_rate": 2.3665644637322044e-06, + "loss": 0.3992, + "step": 6609 + }, + { + "epoch": 3.1252955082742315, + "grad_norm": 2.889340400695801, + "learning_rate": 2.3659415301314152e-06, + "loss": 0.3829, + "step": 6610 + }, + { + "epoch": 3.1257683215130023, + "grad_norm": 2.625603199005127, + "learning_rate": 2.3653186048778584e-06, + "loss": 0.3559, + "step": 6611 + }, + { + "epoch": 3.126241134751773, + "grad_norm": 2.8128650188446045, + "learning_rate": 2.3646956880103224e-06, + "loss": 0.4035, + "step": 6612 + }, + { + "epoch": 3.126713947990544, + "grad_norm": 3.1887412071228027, + "learning_rate": 2.3640727795675925e-06, + "loss": 0.3938, + "step": 6613 + }, + { + "epoch": 3.1271867612293143, + "grad_norm": 2.886514186859131, + "learning_rate": 2.363449879588454e-06, + "loss": 0.3504, + "step": 6614 + }, + { + "epoch": 3.127659574468085, + "grad_norm": 3.2149860858917236, + "learning_rate": 2.3628269881116937e-06, + "loss": 0.4137, + "step": 6615 + }, + { + "epoch": 3.128132387706856, + "grad_norm": 3.3155312538146973, + "learning_rate": 2.362204105176094e-06, + "loss": 0.3811, + "step": 6616 + }, + { + "epoch": 3.1286052009456267, + "grad_norm": 2.6228792667388916, + "learning_rate": 2.3615812308204415e-06, + "loss": 0.3511, + "step": 6617 + }, + { + "epoch": 3.129078014184397, + "grad_norm": 2.7686524391174316, + "learning_rate": 2.3609583650835187e-06, + "loss": 0.3722, + "step": 6618 + }, + { + "epoch": 3.129550827423168, + "grad_norm": 3.396368980407715, + "learning_rate": 2.3603355080041083e-06, + "loss": 0.4678, + "step": 6619 + }, + { + "epoch": 3.1300236406619386, + "grad_norm": 2.7329437732696533, + "learning_rate": 2.359712659620994e-06, + "loss": 0.3775, + "step": 6620 + }, + { + "epoch": 3.1304964539007094, + "grad_norm": 2.7633914947509766, + "learning_rate": 2.3590898199729567e-06, + "loss": 0.3306, + "step": 6621 + }, + { + "epoch": 3.1309692671394798, + "grad_norm": 3.020887613296509, + "learning_rate": 2.3584669890987792e-06, + "loss": 0.4121, + "step": 6622 + }, + { + "epoch": 3.1314420803782506, + "grad_norm": 2.8912103176116943, + "learning_rate": 2.3578441670372414e-06, + "loss": 0.4297, + "step": 6623 + }, + { + "epoch": 3.1319148936170214, + "grad_norm": 3.0654027462005615, + "learning_rate": 2.3572213538271234e-06, + "loss": 0.3856, + "step": 6624 + }, + { + "epoch": 3.132387706855792, + "grad_norm": 3.1126575469970703, + "learning_rate": 2.356598549507206e-06, + "loss": 0.3886, + "step": 6625 + }, + { + "epoch": 3.1328605200945625, + "grad_norm": 2.7066447734832764, + "learning_rate": 2.3559757541162687e-06, + "loss": 0.4212, + "step": 6626 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 2.876338243484497, + "learning_rate": 2.355352967693088e-06, + "loss": 0.3607, + "step": 6627 + }, + { + "epoch": 3.133806146572104, + "grad_norm": 2.9011716842651367, + "learning_rate": 2.3547301902764454e-06, + "loss": 0.428, + "step": 6628 + }, + { + "epoch": 3.134278959810875, + "grad_norm": 2.805656909942627, + "learning_rate": 2.3541074219051163e-06, + "loss": 0.4038, + "step": 6629 + }, + { + "epoch": 
3.1347517730496453, + "grad_norm": 2.89546275138855, + "learning_rate": 2.353484662617879e-06, + "loss": 0.3798, + "step": 6630 + }, + { + "epoch": 3.135224586288416, + "grad_norm": 3.0290539264678955, + "learning_rate": 2.352861912453508e-06, + "loss": 0.3916, + "step": 6631 + }, + { + "epoch": 3.135697399527187, + "grad_norm": 2.848393440246582, + "learning_rate": 2.352239171450781e-06, + "loss": 0.3423, + "step": 6632 + }, + { + "epoch": 3.1361702127659576, + "grad_norm": 2.871372938156128, + "learning_rate": 2.3516164396484737e-06, + "loss": 0.3872, + "step": 6633 + }, + { + "epoch": 3.136643026004728, + "grad_norm": 3.120682716369629, + "learning_rate": 2.3509937170853585e-06, + "loss": 0.3952, + "step": 6634 + }, + { + "epoch": 3.137115839243499, + "grad_norm": 2.6936683654785156, + "learning_rate": 2.3503710038002127e-06, + "loss": 0.3643, + "step": 6635 + }, + { + "epoch": 3.1375886524822696, + "grad_norm": 3.749519109725952, + "learning_rate": 2.349748299831808e-06, + "loss": 0.4519, + "step": 6636 + }, + { + "epoch": 3.1380614657210404, + "grad_norm": 2.8034276962280273, + "learning_rate": 2.3491256052189175e-06, + "loss": 0.401, + "step": 6637 + }, + { + "epoch": 3.1385342789598107, + "grad_norm": 2.6201975345611572, + "learning_rate": 2.348502920000314e-06, + "loss": 0.3491, + "step": 6638 + }, + { + "epoch": 3.1390070921985815, + "grad_norm": 2.890552043914795, + "learning_rate": 2.347880244214769e-06, + "loss": 0.3439, + "step": 6639 + }, + { + "epoch": 3.1394799054373523, + "grad_norm": 2.899594306945801, + "learning_rate": 2.347257577901055e-06, + "loss": 0.3707, + "step": 6640 + }, + { + "epoch": 3.139952718676123, + "grad_norm": 2.8660130500793457, + "learning_rate": 2.346634921097942e-06, + "loss": 0.3582, + "step": 6641 + }, + { + "epoch": 3.1404255319148935, + "grad_norm": 2.9805452823638916, + "learning_rate": 2.346012273844199e-06, + "loss": 0.3466, + "step": 6642 + }, + { + "epoch": 3.1408983451536643, + "grad_norm": 3.162977457046509, + "learning_rate": 2.345389636178597e-06, + "loss": 0.3657, + "step": 6643 + }, + { + "epoch": 3.141371158392435, + "grad_norm": 2.838988780975342, + "learning_rate": 2.344767008139904e-06, + "loss": 0.3826, + "step": 6644 + }, + { + "epoch": 3.141843971631206, + "grad_norm": 3.8427252769470215, + "learning_rate": 2.3441443897668893e-06, + "loss": 0.3697, + "step": 6645 + }, + { + "epoch": 3.1423167848699762, + "grad_norm": 2.9233880043029785, + "learning_rate": 2.34352178109832e-06, + "loss": 0.3481, + "step": 6646 + }, + { + "epoch": 3.142789598108747, + "grad_norm": 2.5840606689453125, + "learning_rate": 2.342899182172963e-06, + "loss": 0.3746, + "step": 6647 + }, + { + "epoch": 3.143262411347518, + "grad_norm": 2.806793451309204, + "learning_rate": 2.3422765930295857e-06, + "loss": 0.419, + "step": 6648 + }, + { + "epoch": 3.1437352245862886, + "grad_norm": 2.803952693939209, + "learning_rate": 2.3416540137069522e-06, + "loss": 0.3965, + "step": 6649 + }, + { + "epoch": 3.144208037825059, + "grad_norm": 2.8416364192962646, + "learning_rate": 2.3410314442438297e-06, + "loss": 0.4317, + "step": 6650 + }, + { + "epoch": 3.1446808510638298, + "grad_norm": 2.9956440925598145, + "learning_rate": 2.3404088846789826e-06, + "loss": 0.4268, + "step": 6651 + }, + { + "epoch": 3.1451536643026006, + "grad_norm": 3.1649162769317627, + "learning_rate": 2.339786335051173e-06, + "loss": 0.4149, + "step": 6652 + }, + { + "epoch": 3.145626477541371, + "grad_norm": 2.909107208251953, + "learning_rate": 2.3391637953991673e-06, + "loss": 0.4085, + 
"step": 6653 + }, + { + "epoch": 3.1460992907801417, + "grad_norm": 2.416755199432373, + "learning_rate": 2.3385412657617264e-06, + "loss": 0.3585, + "step": 6654 + }, + { + "epoch": 3.1465721040189125, + "grad_norm": 3.1122629642486572, + "learning_rate": 2.3379187461776123e-06, + "loss": 0.3876, + "step": 6655 + }, + { + "epoch": 3.1470449172576833, + "grad_norm": 2.6854658126831055, + "learning_rate": 2.337296236685588e-06, + "loss": 0.3125, + "step": 6656 + }, + { + "epoch": 3.147517730496454, + "grad_norm": 2.779876708984375, + "learning_rate": 2.3366737373244127e-06, + "loss": 0.3688, + "step": 6657 + }, + { + "epoch": 3.1479905437352245, + "grad_norm": 3.1444761753082275, + "learning_rate": 2.3360512481328484e-06, + "loss": 0.4089, + "step": 6658 + }, + { + "epoch": 3.1484633569739953, + "grad_norm": 2.71445894241333, + "learning_rate": 2.335428769149654e-06, + "loss": 0.3532, + "step": 6659 + }, + { + "epoch": 3.148936170212766, + "grad_norm": 2.9788241386413574, + "learning_rate": 2.334806300413587e-06, + "loss": 0.4238, + "step": 6660 + }, + { + "epoch": 3.1494089834515364, + "grad_norm": 3.0118865966796875, + "learning_rate": 2.334183841963409e-06, + "loss": 0.4437, + "step": 6661 + }, + { + "epoch": 3.149881796690307, + "grad_norm": 3.2229537963867188, + "learning_rate": 2.3335613938378753e-06, + "loss": 0.3582, + "step": 6662 + }, + { + "epoch": 3.150354609929078, + "grad_norm": 2.734997034072876, + "learning_rate": 2.3329389560757447e-06, + "loss": 0.3737, + "step": 6663 + }, + { + "epoch": 3.150827423167849, + "grad_norm": 3.4746382236480713, + "learning_rate": 2.3323165287157724e-06, + "loss": 0.3516, + "step": 6664 + }, + { + "epoch": 3.1513002364066196, + "grad_norm": 2.9428153038024902, + "learning_rate": 2.3316941117967137e-06, + "loss": 0.3985, + "step": 6665 + }, + { + "epoch": 3.15177304964539, + "grad_norm": 2.6840944290161133, + "learning_rate": 2.3310717053573257e-06, + "loss": 0.3274, + "step": 6666 + }, + { + "epoch": 3.1522458628841608, + "grad_norm": 3.048335552215576, + "learning_rate": 2.3304493094363607e-06, + "loss": 0.4262, + "step": 6667 + }, + { + "epoch": 3.1527186761229316, + "grad_norm": 2.87381911277771, + "learning_rate": 2.329826924072575e-06, + "loss": 0.3867, + "step": 6668 + }, + { + "epoch": 3.153191489361702, + "grad_norm": 2.6236355304718018, + "learning_rate": 2.32920454930472e-06, + "loss": 0.3649, + "step": 6669 + }, + { + "epoch": 3.1536643026004727, + "grad_norm": 3.1326401233673096, + "learning_rate": 2.328582185171549e-06, + "loss": 0.3451, + "step": 6670 + }, + { + "epoch": 3.1541371158392435, + "grad_norm": 3.011826992034912, + "learning_rate": 2.327959831711814e-06, + "loss": 0.4118, + "step": 6671 + }, + { + "epoch": 3.1546099290780143, + "grad_norm": 2.834933280944824, + "learning_rate": 2.3273374889642646e-06, + "loss": 0.4378, + "step": 6672 + }, + { + "epoch": 3.155082742316785, + "grad_norm": 3.085756778717041, + "learning_rate": 2.326715156967654e-06, + "loss": 0.4389, + "step": 6673 + }, + { + "epoch": 3.1555555555555554, + "grad_norm": 2.7912232875823975, + "learning_rate": 2.3260928357607305e-06, + "loss": 0.3352, + "step": 6674 + }, + { + "epoch": 3.1560283687943262, + "grad_norm": 2.7643113136291504, + "learning_rate": 2.3254705253822424e-06, + "loss": 0.3449, + "step": 6675 + }, + { + "epoch": 3.156501182033097, + "grad_norm": 2.8984663486480713, + "learning_rate": 2.3248482258709405e-06, + "loss": 0.4231, + "step": 6676 + }, + { + "epoch": 3.1569739952718674, + "grad_norm": 3.214996814727783, + "learning_rate": 
2.324225937265572e-06, + "loss": 0.4616, + "step": 6677 + }, + { + "epoch": 3.157446808510638, + "grad_norm": 2.58534836769104, + "learning_rate": 2.3236036596048827e-06, + "loss": 0.3264, + "step": 6678 + }, + { + "epoch": 3.157919621749409, + "grad_norm": 2.790714740753174, + "learning_rate": 2.322981392927621e-06, + "loss": 0.4086, + "step": 6679 + }, + { + "epoch": 3.15839243498818, + "grad_norm": 2.726029872894287, + "learning_rate": 2.32235913727253e-06, + "loss": 0.3344, + "step": 6680 + }, + { + "epoch": 3.1588652482269506, + "grad_norm": 2.8392906188964844, + "learning_rate": 2.3217368926783583e-06, + "loss": 0.3468, + "step": 6681 + }, + { + "epoch": 3.159338061465721, + "grad_norm": 2.9796900749206543, + "learning_rate": 2.321114659183848e-06, + "loss": 0.4051, + "step": 6682 + }, + { + "epoch": 3.1598108747044917, + "grad_norm": 3.0399303436279297, + "learning_rate": 2.320492436827743e-06, + "loss": 0.402, + "step": 6683 + }, + { + "epoch": 3.1602836879432625, + "grad_norm": 2.9295334815979004, + "learning_rate": 2.3198702256487877e-06, + "loss": 0.3975, + "step": 6684 + }, + { + "epoch": 3.160756501182033, + "grad_norm": 2.881552219390869, + "learning_rate": 2.319248025685723e-06, + "loss": 0.4342, + "step": 6685 + }, + { + "epoch": 3.1612293144208037, + "grad_norm": 3.0711705684661865, + "learning_rate": 2.3186258369772916e-06, + "loss": 0.3829, + "step": 6686 + }, + { + "epoch": 3.1617021276595745, + "grad_norm": 2.6614468097686768, + "learning_rate": 2.3180036595622345e-06, + "loss": 0.3473, + "step": 6687 + }, + { + "epoch": 3.1621749408983453, + "grad_norm": 3.0084400177001953, + "learning_rate": 2.3173814934792903e-06, + "loss": 0.4363, + "step": 6688 + }, + { + "epoch": 3.162647754137116, + "grad_norm": 2.9340786933898926, + "learning_rate": 2.3167593387672006e-06, + "loss": 0.4235, + "step": 6689 + }, + { + "epoch": 3.1631205673758864, + "grad_norm": 3.0765340328216553, + "learning_rate": 2.3161371954647023e-06, + "loss": 0.4601, + "step": 6690 + }, + { + "epoch": 3.1635933806146572, + "grad_norm": 2.816096067428589, + "learning_rate": 2.3155150636105356e-06, + "loss": 0.3764, + "step": 6691 + }, + { + "epoch": 3.164066193853428, + "grad_norm": 3.0476551055908203, + "learning_rate": 2.3148929432434372e-06, + "loss": 0.3956, + "step": 6692 + }, + { + "epoch": 3.1645390070921984, + "grad_norm": 2.628934860229492, + "learning_rate": 2.314270834402143e-06, + "loss": 0.3551, + "step": 6693 + }, + { + "epoch": 3.165011820330969, + "grad_norm": 3.3933539390563965, + "learning_rate": 2.31364873712539e-06, + "loss": 0.4523, + "step": 6694 + }, + { + "epoch": 3.16548463356974, + "grad_norm": 3.256176233291626, + "learning_rate": 2.313026651451912e-06, + "loss": 0.417, + "step": 6695 + }, + { + "epoch": 3.1659574468085108, + "grad_norm": 2.92926025390625, + "learning_rate": 2.312404577420445e-06, + "loss": 0.4365, + "step": 6696 + }, + { + "epoch": 3.166430260047281, + "grad_norm": 2.9514732360839844, + "learning_rate": 2.3117825150697233e-06, + "loss": 0.4632, + "step": 6697 + }, + { + "epoch": 3.166903073286052, + "grad_norm": 2.8635852336883545, + "learning_rate": 2.3111604644384778e-06, + "loss": 0.4018, + "step": 6698 + }, + { + "epoch": 3.1673758865248227, + "grad_norm": 2.5937020778656006, + "learning_rate": 2.3105384255654433e-06, + "loss": 0.3682, + "step": 6699 + }, + { + "epoch": 3.1678486997635935, + "grad_norm": 2.857851266860962, + "learning_rate": 2.3099163984893497e-06, + "loss": 0.3293, + "step": 6700 + }, + { + "epoch": 3.168321513002364, + "grad_norm": 
2.5903947353363037, + "learning_rate": 2.3092943832489283e-06, + "loss": 0.3543, + "step": 6701 + }, + { + "epoch": 3.1687943262411347, + "grad_norm": 2.9783661365509033, + "learning_rate": 2.30867237988291e-06, + "loss": 0.3707, + "step": 6702 + }, + { + "epoch": 3.1692671394799055, + "grad_norm": 3.0133306980133057, + "learning_rate": 2.3080503884300225e-06, + "loss": 0.439, + "step": 6703 + }, + { + "epoch": 3.1697399527186763, + "grad_norm": 2.7119483947753906, + "learning_rate": 2.3074284089289968e-06, + "loss": 0.3956, + "step": 6704 + }, + { + "epoch": 3.1702127659574466, + "grad_norm": 3.0499672889709473, + "learning_rate": 2.3068064414185597e-06, + "loss": 0.434, + "step": 6705 + }, + { + "epoch": 3.1706855791962174, + "grad_norm": 2.862807512283325, + "learning_rate": 2.306184485937437e-06, + "loss": 0.3644, + "step": 6706 + }, + { + "epoch": 3.171158392434988, + "grad_norm": 2.9445149898529053, + "learning_rate": 2.305562542524358e-06, + "loss": 0.3894, + "step": 6707 + }, + { + "epoch": 3.171631205673759, + "grad_norm": 3.0442428588867188, + "learning_rate": 2.304940611218046e-06, + "loss": 0.3816, + "step": 6708 + }, + { + "epoch": 3.1721040189125294, + "grad_norm": 2.7101798057556152, + "learning_rate": 2.304318692057228e-06, + "loss": 0.3708, + "step": 6709 + }, + { + "epoch": 3.1725768321513, + "grad_norm": 2.7874515056610107, + "learning_rate": 2.303696785080626e-06, + "loss": 0.404, + "step": 6710 + }, + { + "epoch": 3.173049645390071, + "grad_norm": 3.0438833236694336, + "learning_rate": 2.303074890326964e-06, + "loss": 0.4342, + "step": 6711 + }, + { + "epoch": 3.1735224586288417, + "grad_norm": 2.6079208850860596, + "learning_rate": 2.302453007834966e-06, + "loss": 0.3725, + "step": 6712 + }, + { + "epoch": 3.173995271867612, + "grad_norm": 3.3353021144866943, + "learning_rate": 2.3018311376433523e-06, + "loss": 0.4372, + "step": 6713 + }, + { + "epoch": 3.174468085106383, + "grad_norm": 2.840771436691284, + "learning_rate": 2.3012092797908454e-06, + "loss": 0.3979, + "step": 6714 + }, + { + "epoch": 3.1749408983451537, + "grad_norm": 3.0474867820739746, + "learning_rate": 2.3005874343161648e-06, + "loss": 0.4077, + "step": 6715 + }, + { + "epoch": 3.1754137115839245, + "grad_norm": 2.849835157394409, + "learning_rate": 2.2999656012580296e-06, + "loss": 0.393, + "step": 6716 + }, + { + "epoch": 3.175886524822695, + "grad_norm": 2.6361217498779297, + "learning_rate": 2.29934378065516e-06, + "loss": 0.3894, + "step": 6717 + }, + { + "epoch": 3.1763593380614656, + "grad_norm": 3.139700174331665, + "learning_rate": 2.298721972546273e-06, + "loss": 0.36, + "step": 6718 + }, + { + "epoch": 3.1768321513002364, + "grad_norm": 2.987861156463623, + "learning_rate": 2.298100176970087e-06, + "loss": 0.4306, + "step": 6719 + }, + { + "epoch": 3.1773049645390072, + "grad_norm": 2.6403157711029053, + "learning_rate": 2.297478393965317e-06, + "loss": 0.3978, + "step": 6720 + }, + { + "epoch": 3.1777777777777776, + "grad_norm": 2.819519281387329, + "learning_rate": 2.296856623570679e-06, + "loss": 0.3467, + "step": 6721 + }, + { + "epoch": 3.1782505910165484, + "grad_norm": 2.7195916175842285, + "learning_rate": 2.296234865824889e-06, + "loss": 0.3685, + "step": 6722 + }, + { + "epoch": 3.178723404255319, + "grad_norm": 3.015488624572754, + "learning_rate": 2.2956131207666604e-06, + "loss": 0.3751, + "step": 6723 + }, + { + "epoch": 3.17919621749409, + "grad_norm": 2.9283792972564697, + "learning_rate": 2.2949913884347055e-06, + "loss": 0.3261, + "step": 6724 + }, + { + "epoch": 
3.1796690307328603, + "grad_norm": 3.358991861343384, + "learning_rate": 2.294369668867739e-06, + "loss": 0.4505, + "step": 6725 + }, + { + "epoch": 3.180141843971631, + "grad_norm": 2.9143471717834473, + "learning_rate": 2.2937479621044712e-06, + "loss": 0.3612, + "step": 6726 + }, + { + "epoch": 3.180614657210402, + "grad_norm": 3.020519495010376, + "learning_rate": 2.2931262681836136e-06, + "loss": 0.4241, + "step": 6727 + }, + { + "epoch": 3.1810874704491727, + "grad_norm": 2.693737745285034, + "learning_rate": 2.2925045871438765e-06, + "loss": 0.366, + "step": 6728 + }, + { + "epoch": 3.181560283687943, + "grad_norm": 2.9427194595336914, + "learning_rate": 2.2918829190239677e-06, + "loss": 0.3741, + "step": 6729 + }, + { + "epoch": 3.182033096926714, + "grad_norm": 2.529383659362793, + "learning_rate": 2.291261263862598e-06, + "loss": 0.4469, + "step": 6730 + }, + { + "epoch": 3.1825059101654847, + "grad_norm": 3.0097804069519043, + "learning_rate": 2.290639621698473e-06, + "loss": 0.4167, + "step": 6731 + }, + { + "epoch": 3.1829787234042555, + "grad_norm": 2.7047014236450195, + "learning_rate": 2.290017992570302e-06, + "loss": 0.3615, + "step": 6732 + }, + { + "epoch": 3.183451536643026, + "grad_norm": 2.676964282989502, + "learning_rate": 2.2893963765167897e-06, + "loss": 0.3722, + "step": 6733 + }, + { + "epoch": 3.1839243498817966, + "grad_norm": 3.0529778003692627, + "learning_rate": 2.2887747735766413e-06, + "loss": 0.395, + "step": 6734 + }, + { + "epoch": 3.1843971631205674, + "grad_norm": 2.826725721359253, + "learning_rate": 2.288153183788562e-06, + "loss": 0.3713, + "step": 6735 + }, + { + "epoch": 3.184869976359338, + "grad_norm": 2.8689587116241455, + "learning_rate": 2.287531607191254e-06, + "loss": 0.4383, + "step": 6736 + }, + { + "epoch": 3.1853427895981086, + "grad_norm": 3.1835694313049316, + "learning_rate": 2.2869100438234217e-06, + "loss": 0.3908, + "step": 6737 + }, + { + "epoch": 3.1858156028368794, + "grad_norm": 3.227262020111084, + "learning_rate": 2.286288493723767e-06, + "loss": 0.3549, + "step": 6738 + }, + { + "epoch": 3.18628841607565, + "grad_norm": 2.7543468475341797, + "learning_rate": 2.2856669569309896e-06, + "loss": 0.351, + "step": 6739 + }, + { + "epoch": 3.186761229314421, + "grad_norm": 2.5381555557250977, + "learning_rate": 2.2850454334837923e-06, + "loss": 0.3473, + "step": 6740 + }, + { + "epoch": 3.1872340425531913, + "grad_norm": 2.785923957824707, + "learning_rate": 2.284423923420872e-06, + "loss": 0.4144, + "step": 6741 + }, + { + "epoch": 3.187706855791962, + "grad_norm": 2.583853006362915, + "learning_rate": 2.28380242678093e-06, + "loss": 0.3088, + "step": 6742 + }, + { + "epoch": 3.188179669030733, + "grad_norm": 2.604647159576416, + "learning_rate": 2.2831809436026627e-06, + "loss": 0.3474, + "step": 6743 + }, + { + "epoch": 3.1886524822695037, + "grad_norm": 6.13611364364624, + "learning_rate": 2.2825594739247662e-06, + "loss": 0.4089, + "step": 6744 + }, + { + "epoch": 3.189125295508274, + "grad_norm": 3.034011125564575, + "learning_rate": 2.281938017785939e-06, + "loss": 0.4569, + "step": 6745 + }, + { + "epoch": 3.189598108747045, + "grad_norm": 2.9352638721466064, + "learning_rate": 2.281316575224874e-06, + "loss": 0.4293, + "step": 6746 + }, + { + "epoch": 3.1900709219858157, + "grad_norm": 3.860957384109497, + "learning_rate": 2.280695146280268e-06, + "loss": 0.4082, + "step": 6747 + }, + { + "epoch": 3.1905437352245865, + "grad_norm": 2.8131468296051025, + "learning_rate": 2.280073730990814e-06, + "loss": 0.3194, + "step": 
6748 + }, + { + "epoch": 3.191016548463357, + "grad_norm": 3.1310737133026123, + "learning_rate": 2.2794523293952033e-06, + "loss": 0.4454, + "step": 6749 + }, + { + "epoch": 3.1914893617021276, + "grad_norm": 3.065091133117676, + "learning_rate": 2.27883094153213e-06, + "loss": 0.3789, + "step": 6750 + }, + { + "epoch": 3.1919621749408984, + "grad_norm": 3.315216541290283, + "learning_rate": 2.278209567440284e-06, + "loss": 0.4037, + "step": 6751 + }, + { + "epoch": 3.192434988179669, + "grad_norm": 3.0228476524353027, + "learning_rate": 2.2775882071583546e-06, + "loss": 0.3652, + "step": 6752 + }, + { + "epoch": 3.1929078014184396, + "grad_norm": 3.703540802001953, + "learning_rate": 2.2769668607250336e-06, + "loss": 0.3477, + "step": 6753 + }, + { + "epoch": 3.1933806146572103, + "grad_norm": 2.952481508255005, + "learning_rate": 2.2763455281790065e-06, + "loss": 0.4026, + "step": 6754 + }, + { + "epoch": 3.193853427895981, + "grad_norm": 2.5798189640045166, + "learning_rate": 2.275724209558965e-06, + "loss": 0.3475, + "step": 6755 + }, + { + "epoch": 3.194326241134752, + "grad_norm": 2.599669933319092, + "learning_rate": 2.2751029049035923e-06, + "loss": 0.3499, + "step": 6756 + }, + { + "epoch": 3.1947990543735223, + "grad_norm": 3.0463781356811523, + "learning_rate": 2.2744816142515756e-06, + "loss": 0.3927, + "step": 6757 + }, + { + "epoch": 3.195271867612293, + "grad_norm": 3.134199380874634, + "learning_rate": 2.2738603376416003e-06, + "loss": 0.3957, + "step": 6758 + }, + { + "epoch": 3.195744680851064, + "grad_norm": 3.1326372623443604, + "learning_rate": 2.273239075112349e-06, + "loss": 0.4305, + "step": 6759 + }, + { + "epoch": 3.1962174940898347, + "grad_norm": 2.847128391265869, + "learning_rate": 2.2726178267025072e-06, + "loss": 0.3825, + "step": 6760 + }, + { + "epoch": 3.196690307328605, + "grad_norm": 2.697584629058838, + "learning_rate": 2.2719965924507566e-06, + "loss": 0.3517, + "step": 6761 + }, + { + "epoch": 3.197163120567376, + "grad_norm": 2.881446599960327, + "learning_rate": 2.271375372395777e-06, + "loss": 0.3791, + "step": 6762 + }, + { + "epoch": 3.1976359338061466, + "grad_norm": 3.085054874420166, + "learning_rate": 2.270754166576252e-06, + "loss": 0.4324, + "step": 6763 + }, + { + "epoch": 3.1981087470449174, + "grad_norm": 3.3494462966918945, + "learning_rate": 2.270132975030859e-06, + "loss": 0.4242, + "step": 6764 + }, + { + "epoch": 3.198581560283688, + "grad_norm": 2.8617660999298096, + "learning_rate": 2.2695117977982785e-06, + "loss": 0.3563, + "step": 6765 + }, + { + "epoch": 3.1990543735224586, + "grad_norm": 2.7437968254089355, + "learning_rate": 2.2688906349171873e-06, + "loss": 0.4042, + "step": 6766 + }, + { + "epoch": 3.1995271867612294, + "grad_norm": 3.1129143238067627, + "learning_rate": 2.268269486426262e-06, + "loss": 0.3761, + "step": 6767 + }, + { + "epoch": 3.2, + "grad_norm": 3.32441782951355, + "learning_rate": 2.2676483523641807e-06, + "loss": 0.4439, + "step": 6768 + }, + { + "epoch": 3.2004728132387705, + "grad_norm": 2.8744730949401855, + "learning_rate": 2.267027232769617e-06, + "loss": 0.4015, + "step": 6769 + }, + { + "epoch": 3.2009456264775413, + "grad_norm": 3.6283397674560547, + "learning_rate": 2.2664061276812465e-06, + "loss": 0.3634, + "step": 6770 + }, + { + "epoch": 3.201418439716312, + "grad_norm": 2.7826597690582275, + "learning_rate": 2.2657850371377426e-06, + "loss": 0.3178, + "step": 6771 + }, + { + "epoch": 3.201891252955083, + "grad_norm": 2.668173313140869, + "learning_rate": 2.265163961177776e-06, + 
"loss": 0.3662, + "step": 6772 + }, + { + "epoch": 3.2023640661938533, + "grad_norm": 2.868441104888916, + "learning_rate": 2.264542899840021e-06, + "loss": 0.4235, + "step": 6773 + }, + { + "epoch": 3.202836879432624, + "grad_norm": 3.2715935707092285, + "learning_rate": 2.263921853163147e-06, + "loss": 0.4741, + "step": 6774 + }, + { + "epoch": 3.203309692671395, + "grad_norm": 2.8647544384002686, + "learning_rate": 2.2633008211858233e-06, + "loss": 0.3885, + "step": 6775 + }, + { + "epoch": 3.2037825059101657, + "grad_norm": 3.070164680480957, + "learning_rate": 2.2626798039467207e-06, + "loss": 0.4191, + "step": 6776 + }, + { + "epoch": 3.204255319148936, + "grad_norm": 2.846686840057373, + "learning_rate": 2.262058801484505e-06, + "loss": 0.3619, + "step": 6777 + }, + { + "epoch": 3.204728132387707, + "grad_norm": 2.767031192779541, + "learning_rate": 2.261437813837845e-06, + "loss": 0.3248, + "step": 6778 + }, + { + "epoch": 3.2052009456264776, + "grad_norm": 2.6819260120391846, + "learning_rate": 2.2608168410454065e-06, + "loss": 0.3871, + "step": 6779 + }, + { + "epoch": 3.2056737588652484, + "grad_norm": 3.1176788806915283, + "learning_rate": 2.260195883145854e-06, + "loss": 0.3929, + "step": 6780 + }, + { + "epoch": 3.2061465721040188, + "grad_norm": 3.143209457397461, + "learning_rate": 2.2595749401778524e-06, + "loss": 0.4188, + "step": 6781 + }, + { + "epoch": 3.2066193853427896, + "grad_norm": 2.9685657024383545, + "learning_rate": 2.2589540121800647e-06, + "loss": 0.4049, + "step": 6782 + }, + { + "epoch": 3.2070921985815604, + "grad_norm": 2.6853368282318115, + "learning_rate": 2.258333099191155e-06, + "loss": 0.349, + "step": 6783 + }, + { + "epoch": 3.207565011820331, + "grad_norm": 2.8418309688568115, + "learning_rate": 2.257712201249783e-06, + "loss": 0.4121, + "step": 6784 + }, + { + "epoch": 3.2080378250591015, + "grad_norm": 2.9441449642181396, + "learning_rate": 2.2570913183946085e-06, + "loss": 0.3846, + "step": 6785 + }, + { + "epoch": 3.2085106382978723, + "grad_norm": 2.9956493377685547, + "learning_rate": 2.256470450664294e-06, + "loss": 0.3941, + "step": 6786 + }, + { + "epoch": 3.208983451536643, + "grad_norm": 3.1774401664733887, + "learning_rate": 2.255849598097496e-06, + "loss": 0.4252, + "step": 6787 + }, + { + "epoch": 3.209456264775414, + "grad_norm": 2.8948934078216553, + "learning_rate": 2.255228760732873e-06, + "loss": 0.3963, + "step": 6788 + }, + { + "epoch": 3.2099290780141843, + "grad_norm": 3.440021276473999, + "learning_rate": 2.2546079386090825e-06, + "loss": 0.3777, + "step": 6789 + }, + { + "epoch": 3.210401891252955, + "grad_norm": 3.1573195457458496, + "learning_rate": 2.253987131764779e-06, + "loss": 0.3896, + "step": 6790 + }, + { + "epoch": 3.210874704491726, + "grad_norm": 3.4218719005584717, + "learning_rate": 2.2533663402386183e-06, + "loss": 0.3979, + "step": 6791 + }, + { + "epoch": 3.2113475177304966, + "grad_norm": 3.3442487716674805, + "learning_rate": 2.252745564069253e-06, + "loss": 0.406, + "step": 6792 + }, + { + "epoch": 3.211820330969267, + "grad_norm": 2.6089327335357666, + "learning_rate": 2.2521248032953387e-06, + "loss": 0.3539, + "step": 6793 + }, + { + "epoch": 3.212293144208038, + "grad_norm": 3.8015971183776855, + "learning_rate": 2.251504057955526e-06, + "loss": 0.4184, + "step": 6794 + }, + { + "epoch": 3.2127659574468086, + "grad_norm": 3.797565460205078, + "learning_rate": 2.250883328088465e-06, + "loss": 0.3392, + "step": 6795 + }, + { + "epoch": 3.2132387706855794, + "grad_norm": 3.290762186050415, + 
"learning_rate": 2.2502626137328077e-06, + "loss": 0.3726, + "step": 6796 + }, + { + "epoch": 3.2137115839243497, + "grad_norm": 3.149158000946045, + "learning_rate": 2.2496419149272023e-06, + "loss": 0.3869, + "step": 6797 + }, + { + "epoch": 3.2141843971631205, + "grad_norm": 2.652902364730835, + "learning_rate": 2.2490212317102964e-06, + "loss": 0.3256, + "step": 6798 + }, + { + "epoch": 3.2146572104018913, + "grad_norm": 3.3039770126342773, + "learning_rate": 2.248400564120739e-06, + "loss": 0.4231, + "step": 6799 + }, + { + "epoch": 3.215130023640662, + "grad_norm": 3.0190038681030273, + "learning_rate": 2.247779912197174e-06, + "loss": 0.4319, + "step": 6800 + }, + { + "epoch": 3.2156028368794325, + "grad_norm": 2.861393690109253, + "learning_rate": 2.2471592759782485e-06, + "loss": 0.465, + "step": 6801 + }, + { + "epoch": 3.2160756501182033, + "grad_norm": 2.7796146869659424, + "learning_rate": 2.246538655502606e-06, + "loss": 0.3896, + "step": 6802 + }, + { + "epoch": 3.216548463356974, + "grad_norm": 3.1849005222320557, + "learning_rate": 2.24591805080889e-06, + "loss": 0.3782, + "step": 6803 + }, + { + "epoch": 3.217021276595745, + "grad_norm": 3.076164960861206, + "learning_rate": 2.2452974619357435e-06, + "loss": 0.4023, + "step": 6804 + }, + { + "epoch": 3.2174940898345152, + "grad_norm": 2.7006006240844727, + "learning_rate": 2.2446768889218064e-06, + "loss": 0.3902, + "step": 6805 + }, + { + "epoch": 3.217966903073286, + "grad_norm": 2.9310474395751953, + "learning_rate": 2.2440563318057205e-06, + "loss": 0.366, + "step": 6806 + }, + { + "epoch": 3.218439716312057, + "grad_norm": 3.057248592376709, + "learning_rate": 2.2434357906261246e-06, + "loss": 0.4042, + "step": 6807 + }, + { + "epoch": 3.2189125295508276, + "grad_norm": 3.3720197677612305, + "learning_rate": 2.242815265421656e-06, + "loss": 0.3816, + "step": 6808 + }, + { + "epoch": 3.219385342789598, + "grad_norm": 2.9626352787017822, + "learning_rate": 2.2421947562309545e-06, + "loss": 0.363, + "step": 6809 + }, + { + "epoch": 3.219858156028369, + "grad_norm": 2.7848782539367676, + "learning_rate": 2.2415742630926533e-06, + "loss": 0.3597, + "step": 6810 + }, + { + "epoch": 3.2203309692671396, + "grad_norm": 2.757319450378418, + "learning_rate": 2.2409537860453913e-06, + "loss": 0.3304, + "step": 6811 + }, + { + "epoch": 3.2208037825059104, + "grad_norm": 2.7765560150146484, + "learning_rate": 2.240333325127801e-06, + "loss": 0.3896, + "step": 6812 + }, + { + "epoch": 3.2212765957446807, + "grad_norm": 2.9882447719573975, + "learning_rate": 2.239712880378515e-06, + "loss": 0.4004, + "step": 6813 + }, + { + "epoch": 3.2217494089834515, + "grad_norm": 2.8551244735717773, + "learning_rate": 2.2390924518361673e-06, + "loss": 0.4167, + "step": 6814 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 2.8051679134368896, + "learning_rate": 2.2384720395393878e-06, + "loss": 0.3319, + "step": 6815 + }, + { + "epoch": 3.222695035460993, + "grad_norm": 3.1172873973846436, + "learning_rate": 2.2378516435268086e-06, + "loss": 0.379, + "step": 6816 + }, + { + "epoch": 3.2231678486997635, + "grad_norm": 3.0282177925109863, + "learning_rate": 2.237231263837058e-06, + "loss": 0.3855, + "step": 6817 + }, + { + "epoch": 3.2236406619385343, + "grad_norm": 2.7156803607940674, + "learning_rate": 2.236610900508763e-06, + "loss": 0.4062, + "step": 6818 + }, + { + "epoch": 3.224113475177305, + "grad_norm": 2.721327781677246, + "learning_rate": 2.235990553580554e-06, + "loss": 0.3726, + "step": 6819 + }, + { + "epoch": 
3.2245862884160754, + "grad_norm": 2.881181240081787, + "learning_rate": 2.235370223091055e-06, + "loss": 0.421, + "step": 6820 + }, + { + "epoch": 3.225059101654846, + "grad_norm": 2.8074657917022705, + "learning_rate": 2.234749909078892e-06, + "loss": 0.3628, + "step": 6821 + }, + { + "epoch": 3.225531914893617, + "grad_norm": 2.8781638145446777, + "learning_rate": 2.234129611582689e-06, + "loss": 0.3857, + "step": 6822 + }, + { + "epoch": 3.226004728132388, + "grad_norm": 2.9473299980163574, + "learning_rate": 2.233509330641068e-06, + "loss": 0.4358, + "step": 6823 + }, + { + "epoch": 3.2264775413711586, + "grad_norm": 3.261209011077881, + "learning_rate": 2.2328890662926543e-06, + "loss": 0.4115, + "step": 6824 + }, + { + "epoch": 3.226950354609929, + "grad_norm": 3.2796943187713623, + "learning_rate": 2.232268818576067e-06, + "loss": 0.3846, + "step": 6825 + }, + { + "epoch": 3.2274231678486998, + "grad_norm": 3.1083059310913086, + "learning_rate": 2.2316485875299247e-06, + "loss": 0.3452, + "step": 6826 + }, + { + "epoch": 3.2278959810874706, + "grad_norm": 2.7947003841400146, + "learning_rate": 2.23102837319285e-06, + "loss": 0.3733, + "step": 6827 + }, + { + "epoch": 3.228368794326241, + "grad_norm": 2.792348861694336, + "learning_rate": 2.230408175603458e-06, + "loss": 0.411, + "step": 6828 + }, + { + "epoch": 3.2288416075650117, + "grad_norm": 2.8563876152038574, + "learning_rate": 2.229787994800368e-06, + "loss": 0.4303, + "step": 6829 + }, + { + "epoch": 3.2293144208037825, + "grad_norm": 2.9573659896850586, + "learning_rate": 2.2291678308221943e-06, + "loss": 0.4124, + "step": 6830 + }, + { + "epoch": 3.2297872340425533, + "grad_norm": 2.8554422855377197, + "learning_rate": 2.228547683707551e-06, + "loss": 0.3715, + "step": 6831 + }, + { + "epoch": 3.230260047281324, + "grad_norm": 2.9457242488861084, + "learning_rate": 2.227927553495054e-06, + "loss": 0.4339, + "step": 6832 + }, + { + "epoch": 3.2307328605200945, + "grad_norm": 2.799135684967041, + "learning_rate": 2.227307440223315e-06, + "loss": 0.3335, + "step": 6833 + }, + { + "epoch": 3.2312056737588652, + "grad_norm": 2.768529176712036, + "learning_rate": 2.2266873439309465e-06, + "loss": 0.3929, + "step": 6834 + }, + { + "epoch": 3.231678486997636, + "grad_norm": 3.124069929122925, + "learning_rate": 2.2260672646565585e-06, + "loss": 0.4205, + "step": 6835 + }, + { + "epoch": 3.2321513002364064, + "grad_norm": 2.8153982162475586, + "learning_rate": 2.2254472024387603e-06, + "loss": 0.3565, + "step": 6836 + }, + { + "epoch": 3.232624113475177, + "grad_norm": 3.1802141666412354, + "learning_rate": 2.224827157316162e-06, + "loss": 0.4614, + "step": 6837 + }, + { + "epoch": 3.233096926713948, + "grad_norm": 2.669651746749878, + "learning_rate": 2.2242071293273682e-06, + "loss": 0.3581, + "step": 6838 + }, + { + "epoch": 3.233569739952719, + "grad_norm": 3.073127269744873, + "learning_rate": 2.223587118510989e-06, + "loss": 0.3581, + "step": 6839 + }, + { + "epoch": 3.2340425531914896, + "grad_norm": 2.875955820083618, + "learning_rate": 2.222967124905627e-06, + "loss": 0.3905, + "step": 6840 + }, + { + "epoch": 3.23451536643026, + "grad_norm": 2.887744903564453, + "learning_rate": 2.2223471485498872e-06, + "loss": 0.4131, + "step": 6841 + }, + { + "epoch": 3.2349881796690307, + "grad_norm": 2.6957902908325195, + "learning_rate": 2.2217271894823735e-06, + "loss": 0.3631, + "step": 6842 + }, + { + "epoch": 3.2354609929078015, + "grad_norm": 2.7098400592803955, + "learning_rate": 2.221107247741688e-06, + "loss": 0.3959, + 
"step": 6843 + }, + { + "epoch": 3.235933806146572, + "grad_norm": 2.986271858215332, + "learning_rate": 2.22048732336643e-06, + "loss": 0.3515, + "step": 6844 + }, + { + "epoch": 3.2364066193853427, + "grad_norm": 3.0537121295928955, + "learning_rate": 2.2198674163952015e-06, + "loss": 0.438, + "step": 6845 + }, + { + "epoch": 3.2368794326241135, + "grad_norm": 2.8351151943206787, + "learning_rate": 2.2192475268666e-06, + "loss": 0.4069, + "step": 6846 + }, + { + "epoch": 3.2373522458628843, + "grad_norm": 2.6455280780792236, + "learning_rate": 2.218627654819225e-06, + "loss": 0.3626, + "step": 6847 + }, + { + "epoch": 3.237825059101655, + "grad_norm": 3.060352325439453, + "learning_rate": 2.2180078002916717e-06, + "loss": 0.3306, + "step": 6848 + }, + { + "epoch": 3.2382978723404254, + "grad_norm": 3.0178887844085693, + "learning_rate": 2.2173879633225355e-06, + "loss": 0.4111, + "step": 6849 + }, + { + "epoch": 3.2387706855791962, + "grad_norm": 2.895822763442993, + "learning_rate": 2.2167681439504123e-06, + "loss": 0.4053, + "step": 6850 + }, + { + "epoch": 3.239243498817967, + "grad_norm": 2.7295608520507812, + "learning_rate": 2.2161483422138945e-06, + "loss": 0.4021, + "step": 6851 + }, + { + "epoch": 3.2397163120567374, + "grad_norm": 3.1004912853240967, + "learning_rate": 2.2155285581515747e-06, + "loss": 0.3882, + "step": 6852 + }, + { + "epoch": 3.240189125295508, + "grad_norm": 2.927987813949585, + "learning_rate": 2.214908791802045e-06, + "loss": 0.4036, + "step": 6853 + }, + { + "epoch": 3.240661938534279, + "grad_norm": 3.1679599285125732, + "learning_rate": 2.2142890432038943e-06, + "loss": 0.3897, + "step": 6854 + }, + { + "epoch": 3.2411347517730498, + "grad_norm": 3.2094008922576904, + "learning_rate": 2.213669312395712e-06, + "loss": 0.4429, + "step": 6855 + }, + { + "epoch": 3.24160756501182, + "grad_norm": 4.637594223022461, + "learning_rate": 2.2130495994160857e-06, + "loss": 0.3708, + "step": 6856 + }, + { + "epoch": 3.242080378250591, + "grad_norm": 3.0063490867614746, + "learning_rate": 2.212429904303603e-06, + "loss": 0.3949, + "step": 6857 + }, + { + "epoch": 3.2425531914893617, + "grad_norm": 3.285444736480713, + "learning_rate": 2.21181022709685e-06, + "loss": 0.4236, + "step": 6858 + }, + { + "epoch": 3.2430260047281325, + "grad_norm": 3.02506422996521, + "learning_rate": 2.2111905678344086e-06, + "loss": 0.368, + "step": 6859 + }, + { + "epoch": 3.243498817966903, + "grad_norm": 2.9845006465911865, + "learning_rate": 2.2105709265548657e-06, + "loss": 0.4154, + "step": 6860 + }, + { + "epoch": 3.2439716312056737, + "grad_norm": 3.2537527084350586, + "learning_rate": 2.2099513032968013e-06, + "loss": 0.4385, + "step": 6861 + }, + { + "epoch": 3.2444444444444445, + "grad_norm": 2.8521063327789307, + "learning_rate": 2.2093316980987985e-06, + "loss": 0.384, + "step": 6862 + }, + { + "epoch": 3.2449172576832153, + "grad_norm": 3.186844825744629, + "learning_rate": 2.208712110999436e-06, + "loss": 0.4131, + "step": 6863 + }, + { + "epoch": 3.2453900709219856, + "grad_norm": 2.932058095932007, + "learning_rate": 2.208092542037292e-06, + "loss": 0.3341, + "step": 6864 + }, + { + "epoch": 3.2458628841607564, + "grad_norm": 3.0818707942962646, + "learning_rate": 2.2074729912509462e-06, + "loss": 0.4149, + "step": 6865 + }, + { + "epoch": 3.246335697399527, + "grad_norm": 2.9788503646850586, + "learning_rate": 2.2068534586789735e-06, + "loss": 0.3572, + "step": 6866 + }, + { + "epoch": 3.246808510638298, + "grad_norm": 2.84075665473938, + "learning_rate": 
2.206233944359952e-06, + "loss": 0.3561, + "step": 6867 + }, + { + "epoch": 3.2472813238770684, + "grad_norm": 2.966459035873413, + "learning_rate": 2.2056144483324545e-06, + "loss": 0.3909, + "step": 6868 + }, + { + "epoch": 3.247754137115839, + "grad_norm": 2.892038106918335, + "learning_rate": 2.204994970635054e-06, + "loss": 0.3557, + "step": 6869 + }, + { + "epoch": 3.24822695035461, + "grad_norm": 2.7458810806274414, + "learning_rate": 2.2043755113063233e-06, + "loss": 0.3551, + "step": 6870 + }, + { + "epoch": 3.2486997635933808, + "grad_norm": 2.766803741455078, + "learning_rate": 2.2037560703848334e-06, + "loss": 0.3343, + "step": 6871 + }, + { + "epoch": 3.249172576832151, + "grad_norm": 2.9780561923980713, + "learning_rate": 2.2031366479091533e-06, + "loss": 0.4004, + "step": 6872 + }, + { + "epoch": 3.249645390070922, + "grad_norm": 2.8848516941070557, + "learning_rate": 2.202517243917853e-06, + "loss": 0.3467, + "step": 6873 + }, + { + "epoch": 3.2501182033096927, + "grad_norm": 2.9962213039398193, + "learning_rate": 2.201897858449499e-06, + "loss": 0.3796, + "step": 6874 + }, + { + "epoch": 3.2505910165484635, + "grad_norm": 2.838131904602051, + "learning_rate": 2.201278491542659e-06, + "loss": 0.3683, + "step": 6875 + }, + { + "epoch": 3.251063829787234, + "grad_norm": 3.0232505798339844, + "learning_rate": 2.200659143235897e-06, + "loss": 0.3793, + "step": 6876 + }, + { + "epoch": 3.2515366430260046, + "grad_norm": 3.0690126419067383, + "learning_rate": 2.2000398135677776e-06, + "loss": 0.417, + "step": 6877 + }, + { + "epoch": 3.2520094562647754, + "grad_norm": 3.1838719844818115, + "learning_rate": 2.1994205025768643e-06, + "loss": 0.4608, + "step": 6878 + }, + { + "epoch": 3.2524822695035462, + "grad_norm": 3.1187257766723633, + "learning_rate": 2.198801210301717e-06, + "loss": 0.3396, + "step": 6879 + }, + { + "epoch": 3.2529550827423166, + "grad_norm": 2.7608656883239746, + "learning_rate": 2.1981819367808984e-06, + "loss": 0.386, + "step": 6880 + }, + { + "epoch": 3.2534278959810874, + "grad_norm": 3.027456283569336, + "learning_rate": 2.197562682052968e-06, + "loss": 0.3941, + "step": 6881 + }, + { + "epoch": 3.253900709219858, + "grad_norm": 2.925515651702881, + "learning_rate": 2.1969434461564816e-06, + "loss": 0.3608, + "step": 6882 + }, + { + "epoch": 3.254373522458629, + "grad_norm": 2.946770668029785, + "learning_rate": 2.196324229129999e-06, + "loss": 0.4116, + "step": 6883 + }, + { + "epoch": 3.2548463356973993, + "grad_norm": 2.6497952938079834, + "learning_rate": 2.1957050310120746e-06, + "loss": 0.338, + "step": 6884 + }, + { + "epoch": 3.25531914893617, + "grad_norm": 2.6915128231048584, + "learning_rate": 2.195085851841264e-06, + "loss": 0.3372, + "step": 6885 + }, + { + "epoch": 3.255791962174941, + "grad_norm": 3.4022350311279297, + "learning_rate": 2.1944666916561205e-06, + "loss": 0.3844, + "step": 6886 + }, + { + "epoch": 3.2562647754137117, + "grad_norm": 2.7463366985321045, + "learning_rate": 2.1938475504951958e-06, + "loss": 0.3268, + "step": 6887 + }, + { + "epoch": 3.256737588652482, + "grad_norm": 2.828810691833496, + "learning_rate": 2.193228428397042e-06, + "loss": 0.3275, + "step": 6888 + }, + { + "epoch": 3.257210401891253, + "grad_norm": 3.4016268253326416, + "learning_rate": 2.192609325400208e-06, + "loss": 0.3916, + "step": 6889 + }, + { + "epoch": 3.2576832151300237, + "grad_norm": 2.4980733394622803, + "learning_rate": 2.191990241543245e-06, + "loss": 0.3636, + "step": 6890 + }, + { + "epoch": 3.2581560283687945, + "grad_norm": 
3.0384702682495117, + "learning_rate": 2.191371176864698e-06, + "loss": 0.398, + "step": 6891 + }, + { + "epoch": 3.258628841607565, + "grad_norm": 2.8949527740478516, + "learning_rate": 2.190752131403115e-06, + "loss": 0.3919, + "step": 6892 + }, + { + "epoch": 3.2591016548463356, + "grad_norm": 2.765617609024048, + "learning_rate": 2.190133105197041e-06, + "loss": 0.3799, + "step": 6893 + }, + { + "epoch": 3.2595744680851064, + "grad_norm": 2.6149277687072754, + "learning_rate": 2.18951409828502e-06, + "loss": 0.3895, + "step": 6894 + }, + { + "epoch": 3.260047281323877, + "grad_norm": 2.9738945960998535, + "learning_rate": 2.1888951107055934e-06, + "loss": 0.3879, + "step": 6895 + }, + { + "epoch": 3.2605200945626476, + "grad_norm": 2.9438633918762207, + "learning_rate": 2.1882761424973053e-06, + "loss": 0.438, + "step": 6896 + }, + { + "epoch": 3.2609929078014184, + "grad_norm": 3.114243984222412, + "learning_rate": 2.1876571936986936e-06, + "loss": 0.4737, + "step": 6897 + }, + { + "epoch": 3.261465721040189, + "grad_norm": 3.017526388168335, + "learning_rate": 2.1870382643483e-06, + "loss": 0.4039, + "step": 6898 + }, + { + "epoch": 3.26193853427896, + "grad_norm": 3.1475703716278076, + "learning_rate": 2.1864193544846613e-06, + "loss": 0.3825, + "step": 6899 + }, + { + "epoch": 3.2624113475177303, + "grad_norm": 2.75502872467041, + "learning_rate": 2.1858004641463142e-06, + "loss": 0.3507, + "step": 6900 + }, + { + "epoch": 3.262884160756501, + "grad_norm": 3.0467209815979004, + "learning_rate": 2.1851815933717944e-06, + "loss": 0.3938, + "step": 6901 + }, + { + "epoch": 3.263356973995272, + "grad_norm": 2.993014097213745, + "learning_rate": 2.184562742199636e-06, + "loss": 0.3711, + "step": 6902 + }, + { + "epoch": 3.2638297872340427, + "grad_norm": 2.607309341430664, + "learning_rate": 2.183943910668373e-06, + "loss": 0.3689, + "step": 6903 + }, + { + "epoch": 3.264302600472813, + "grad_norm": 2.961653470993042, + "learning_rate": 2.1833250988165373e-06, + "loss": 0.3806, + "step": 6904 + }, + { + "epoch": 3.264775413711584, + "grad_norm": 2.8202552795410156, + "learning_rate": 2.1827063066826574e-06, + "loss": 0.391, + "step": 6905 + }, + { + "epoch": 3.2652482269503547, + "grad_norm": 3.032648801803589, + "learning_rate": 2.1820875343052666e-06, + "loss": 0.4011, + "step": 6906 + }, + { + "epoch": 3.2657210401891255, + "grad_norm": 2.8265180587768555, + "learning_rate": 2.1814687817228896e-06, + "loss": 0.3923, + "step": 6907 + }, + { + "epoch": 3.266193853427896, + "grad_norm": 3.1425564289093018, + "learning_rate": 2.1808500489740555e-06, + "loss": 0.4913, + "step": 6908 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 2.977809429168701, + "learning_rate": 2.18023133609729e-06, + "loss": 0.379, + "step": 6909 + }, + { + "epoch": 3.2671394799054374, + "grad_norm": 3.509551525115967, + "learning_rate": 2.1796126431311153e-06, + "loss": 0.4025, + "step": 6910 + }, + { + "epoch": 3.267612293144208, + "grad_norm": 2.9133846759796143, + "learning_rate": 2.178993970114058e-06, + "loss": 0.4209, + "step": 6911 + }, + { + "epoch": 3.2680851063829786, + "grad_norm": 2.945513963699341, + "learning_rate": 2.178375317084637e-06, + "loss": 0.3882, + "step": 6912 + }, + { + "epoch": 3.2685579196217494, + "grad_norm": 2.7868733406066895, + "learning_rate": 2.1777566840813763e-06, + "loss": 0.3456, + "step": 6913 + }, + { + "epoch": 3.26903073286052, + "grad_norm": 2.803220748901367, + "learning_rate": 2.1771380711427937e-06, + "loss": 0.3394, + "step": 6914 + }, + { + "epoch": 
3.269503546099291, + "grad_norm": 3.1293554306030273, + "learning_rate": 2.176519478307407e-06, + "loss": 0.402, + "step": 6915 + }, + { + "epoch": 3.2699763593380613, + "grad_norm": 2.843971014022827, + "learning_rate": 2.1759009056137347e-06, + "loss": 0.3449, + "step": 6916 + }, + { + "epoch": 3.270449172576832, + "grad_norm": 2.9983274936676025, + "learning_rate": 2.1752823531002917e-06, + "loss": 0.4091, + "step": 6917 + }, + { + "epoch": 3.270921985815603, + "grad_norm": 2.686722993850708, + "learning_rate": 2.174663820805592e-06, + "loss": 0.4303, + "step": 6918 + }, + { + "epoch": 3.2713947990543737, + "grad_norm": 2.669349431991577, + "learning_rate": 2.1740453087681508e-06, + "loss": 0.3796, + "step": 6919 + }, + { + "epoch": 3.271867612293144, + "grad_norm": 2.992138624191284, + "learning_rate": 2.173426817026477e-06, + "loss": 0.4125, + "step": 6920 + }, + { + "epoch": 3.272340425531915, + "grad_norm": 3.332834243774414, + "learning_rate": 2.1728083456190852e-06, + "loss": 0.3885, + "step": 6921 + }, + { + "epoch": 3.2728132387706856, + "grad_norm": 2.869673013687134, + "learning_rate": 2.1721898945844825e-06, + "loss": 0.3941, + "step": 6922 + }, + { + "epoch": 3.2732860520094564, + "grad_norm": 2.804440975189209, + "learning_rate": 2.1715714639611774e-06, + "loss": 0.4007, + "step": 6923 + }, + { + "epoch": 3.273758865248227, + "grad_norm": 3.1751439571380615, + "learning_rate": 2.1709530537876774e-06, + "loss": 0.3981, + "step": 6924 + }, + { + "epoch": 3.2742316784869976, + "grad_norm": 2.6367175579071045, + "learning_rate": 2.1703346641024878e-06, + "loss": 0.3582, + "step": 6925 + }, + { + "epoch": 3.2747044917257684, + "grad_norm": 2.99164080619812, + "learning_rate": 2.1697162949441137e-06, + "loss": 0.3846, + "step": 6926 + }, + { + "epoch": 3.275177304964539, + "grad_norm": 3.3206982612609863, + "learning_rate": 2.169097946351057e-06, + "loss": 0.3689, + "step": 6927 + }, + { + "epoch": 3.2756501182033095, + "grad_norm": 2.927907943725586, + "learning_rate": 2.16847961836182e-06, + "loss": 0.3536, + "step": 6928 + }, + { + "epoch": 3.2761229314420803, + "grad_norm": 3.1950864791870117, + "learning_rate": 2.167861311014904e-06, + "loss": 0.4154, + "step": 6929 + }, + { + "epoch": 3.276595744680851, + "grad_norm": 2.888383388519287, + "learning_rate": 2.1672430243488073e-06, + "loss": 0.3702, + "step": 6930 + }, + { + "epoch": 3.277068557919622, + "grad_norm": 2.842287063598633, + "learning_rate": 2.166624758402029e-06, + "loss": 0.3623, + "step": 6931 + }, + { + "epoch": 3.2775413711583923, + "grad_norm": 2.84350323677063, + "learning_rate": 2.166006513213065e-06, + "loss": 0.3757, + "step": 6932 + }, + { + "epoch": 3.278014184397163, + "grad_norm": 3.105626344680786, + "learning_rate": 2.165388288820411e-06, + "loss": 0.3955, + "step": 6933 + }, + { + "epoch": 3.278486997635934, + "grad_norm": 3.273508071899414, + "learning_rate": 2.164770085262561e-06, + "loss": 0.4046, + "step": 6934 + }, + { + "epoch": 3.2789598108747047, + "grad_norm": 3.2530124187469482, + "learning_rate": 2.1641519025780066e-06, + "loss": 0.3141, + "step": 6935 + }, + { + "epoch": 3.279432624113475, + "grad_norm": 2.822849750518799, + "learning_rate": 2.163533740805242e-06, + "loss": 0.3973, + "step": 6936 + }, + { + "epoch": 3.279905437352246, + "grad_norm": 2.772097587585449, + "learning_rate": 2.162915599982756e-06, + "loss": 0.3606, + "step": 6937 + }, + { + "epoch": 3.2803782505910166, + "grad_norm": 3.150696039199829, + "learning_rate": 2.1622974801490365e-06, + "loss": 0.4709, + "step": 
6938 + }, + { + "epoch": 3.2808510638297874, + "grad_norm": 3.2072134017944336, + "learning_rate": 2.1616793813425736e-06, + "loss": 0.3946, + "step": 6939 + }, + { + "epoch": 3.2813238770685578, + "grad_norm": 2.9922473430633545, + "learning_rate": 2.1610613036018515e-06, + "loss": 0.3263, + "step": 6940 + }, + { + "epoch": 3.2817966903073286, + "grad_norm": 2.7818009853363037, + "learning_rate": 2.1604432469653555e-06, + "loss": 0.3887, + "step": 6941 + }, + { + "epoch": 3.2822695035460994, + "grad_norm": 3.12998628616333, + "learning_rate": 2.15982521147157e-06, + "loss": 0.3522, + "step": 6942 + }, + { + "epoch": 3.28274231678487, + "grad_norm": 2.876678228378296, + "learning_rate": 2.159207197158976e-06, + "loss": 0.3643, + "step": 6943 + }, + { + "epoch": 3.2832151300236405, + "grad_norm": 2.825488805770874, + "learning_rate": 2.1585892040660565e-06, + "loss": 0.3223, + "step": 6944 + }, + { + "epoch": 3.2836879432624113, + "grad_norm": 2.8724498748779297, + "learning_rate": 2.1579712322312906e-06, + "loss": 0.3855, + "step": 6945 + }, + { + "epoch": 3.284160756501182, + "grad_norm": 2.841064691543579, + "learning_rate": 2.1573532816931547e-06, + "loss": 0.4106, + "step": 6946 + }, + { + "epoch": 3.284633569739953, + "grad_norm": 3.053391218185425, + "learning_rate": 2.1567353524901288e-06, + "loss": 0.4875, + "step": 6947 + }, + { + "epoch": 3.2851063829787233, + "grad_norm": 2.7294771671295166, + "learning_rate": 2.156117444660687e-06, + "loss": 0.3856, + "step": 6948 + }, + { + "epoch": 3.285579196217494, + "grad_norm": 3.0965659618377686, + "learning_rate": 2.155499558243304e-06, + "loss": 0.4104, + "step": 6949 + }, + { + "epoch": 3.286052009456265, + "grad_norm": 2.778923511505127, + "learning_rate": 2.1548816932764536e-06, + "loss": 0.3636, + "step": 6950 + }, + { + "epoch": 3.2865248226950357, + "grad_norm": 2.890679121017456, + "learning_rate": 2.1542638497986054e-06, + "loss": 0.4026, + "step": 6951 + }, + { + "epoch": 3.286997635933806, + "grad_norm": 3.0466806888580322, + "learning_rate": 2.1536460278482326e-06, + "loss": 0.3856, + "step": 6952 + }, + { + "epoch": 3.287470449172577, + "grad_norm": 3.1367077827453613, + "learning_rate": 2.1530282274638013e-06, + "loss": 0.3767, + "step": 6953 + }, + { + "epoch": 3.2879432624113476, + "grad_norm": 2.984694719314575, + "learning_rate": 2.1524104486837823e-06, + "loss": 0.4142, + "step": 6954 + }, + { + "epoch": 3.2884160756501184, + "grad_norm": 3.1542797088623047, + "learning_rate": 2.151792691546641e-06, + "loss": 0.4361, + "step": 6955 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 2.7306816577911377, + "learning_rate": 2.1511749560908405e-06, + "loss": 0.3692, + "step": 6956 + }, + { + "epoch": 3.2893617021276595, + "grad_norm": 3.6679904460906982, + "learning_rate": 2.150557242354847e-06, + "loss": 0.4496, + "step": 6957 + }, + { + "epoch": 3.2898345153664303, + "grad_norm": 3.2040863037109375, + "learning_rate": 2.1499395503771207e-06, + "loss": 0.3526, + "step": 6958 + }, + { + "epoch": 3.290307328605201, + "grad_norm": 3.2416043281555176, + "learning_rate": 2.1493218801961246e-06, + "loss": 0.3955, + "step": 6959 + }, + { + "epoch": 3.2907801418439715, + "grad_norm": 2.8164525032043457, + "learning_rate": 2.1487042318503174e-06, + "loss": 0.3727, + "step": 6960 + }, + { + "epoch": 3.2912529550827423, + "grad_norm": 2.5954513549804688, + "learning_rate": 2.148086605378156e-06, + "loss": 0.3315, + "step": 6961 + }, + { + "epoch": 3.291725768321513, + "grad_norm": 2.8068149089813232, + "learning_rate": 
2.1474690008181e-06, + "loss": 0.3702, + "step": 6962 + }, + { + "epoch": 3.2921985815602834, + "grad_norm": 2.9063730239868164, + "learning_rate": 2.1468514182086025e-06, + "loss": 0.3357, + "step": 6963 + }, + { + "epoch": 3.2926713947990542, + "grad_norm": 2.7623207569122314, + "learning_rate": 2.1462338575881197e-06, + "loss": 0.381, + "step": 6964 + }, + { + "epoch": 3.293144208037825, + "grad_norm": 2.6818830966949463, + "learning_rate": 2.145616318995103e-06, + "loss": 0.3733, + "step": 6965 + }, + { + "epoch": 3.293617021276596, + "grad_norm": 2.7966864109039307, + "learning_rate": 2.1449988024680034e-06, + "loss": 0.3993, + "step": 6966 + }, + { + "epoch": 3.2940898345153666, + "grad_norm": 3.0644514560699463, + "learning_rate": 2.1443813080452728e-06, + "loss": 0.3541, + "step": 6967 + }, + { + "epoch": 3.294562647754137, + "grad_norm": 3.03204607963562, + "learning_rate": 2.1437638357653586e-06, + "loss": 0.3864, + "step": 6968 + }, + { + "epoch": 3.295035460992908, + "grad_norm": 2.980565071105957, + "learning_rate": 2.143146385666707e-06, + "loss": 0.36, + "step": 6969 + }, + { + "epoch": 3.2955082742316786, + "grad_norm": 3.1261661052703857, + "learning_rate": 2.1425289577877675e-06, + "loss": 0.4053, + "step": 6970 + }, + { + "epoch": 3.295981087470449, + "grad_norm": 3.0194897651672363, + "learning_rate": 2.1419115521669804e-06, + "loss": 0.4553, + "step": 6971 + }, + { + "epoch": 3.2964539007092197, + "grad_norm": 2.7620482444763184, + "learning_rate": 2.141294168842792e-06, + "loss": 0.3846, + "step": 6972 + }, + { + "epoch": 3.2969267139479905, + "grad_norm": 2.9575016498565674, + "learning_rate": 2.1406768078536427e-06, + "loss": 0.4415, + "step": 6973 + }, + { + "epoch": 3.2973995271867613, + "grad_norm": 3.17909574508667, + "learning_rate": 2.1400594692379717e-06, + "loss": 0.4514, + "step": 6974 + }, + { + "epoch": 3.297872340425532, + "grad_norm": 3.128613233566284, + "learning_rate": 2.1394421530342207e-06, + "loss": 0.3757, + "step": 6975 + }, + { + "epoch": 3.2983451536643025, + "grad_norm": 3.0247111320495605, + "learning_rate": 2.1388248592808243e-06, + "loss": 0.3881, + "step": 6976 + }, + { + "epoch": 3.2988179669030733, + "grad_norm": 2.8091228008270264, + "learning_rate": 2.1382075880162217e-06, + "loss": 0.3782, + "step": 6977 + }, + { + "epoch": 3.299290780141844, + "grad_norm": 2.985105514526367, + "learning_rate": 2.137590339278846e-06, + "loss": 0.3783, + "step": 6978 + }, + { + "epoch": 3.2997635933806144, + "grad_norm": 3.1862502098083496, + "learning_rate": 2.1369731131071304e-06, + "loss": 0.4776, + "step": 6979 + }, + { + "epoch": 3.300236406619385, + "grad_norm": 3.3138091564178467, + "learning_rate": 2.1363559095395075e-06, + "loss": 0.4056, + "step": 6980 + }, + { + "epoch": 3.300709219858156, + "grad_norm": 3.023695707321167, + "learning_rate": 2.135738728614407e-06, + "loss": 0.3716, + "step": 6981 + }, + { + "epoch": 3.301182033096927, + "grad_norm": 6.149252414703369, + "learning_rate": 2.135121570370259e-06, + "loss": 0.3713, + "step": 6982 + }, + { + "epoch": 3.3016548463356976, + "grad_norm": 2.689671754837036, + "learning_rate": 2.134504434845491e-06, + "loss": 0.3541, + "step": 6983 + }, + { + "epoch": 3.302127659574468, + "grad_norm": 3.241212844848633, + "learning_rate": 2.1338873220785284e-06, + "loss": 0.4328, + "step": 6984 + }, + { + "epoch": 3.3026004728132388, + "grad_norm": 3.6037068367004395, + "learning_rate": 2.133270232107798e-06, + "loss": 0.4091, + "step": 6985 + }, + { + "epoch": 3.3030732860520096, + "grad_norm": 
3.300031900405884, + "learning_rate": 2.1326531649717216e-06, + "loss": 0.3742, + "step": 6986 + }, + { + "epoch": 3.30354609929078, + "grad_norm": 2.82257342338562, + "learning_rate": 2.1320361207087225e-06, + "loss": 0.3622, + "step": 6987 + }, + { + "epoch": 3.3040189125295507, + "grad_norm": 3.297513246536255, + "learning_rate": 2.1314190993572196e-06, + "loss": 0.4606, + "step": 6988 + }, + { + "epoch": 3.3044917257683215, + "grad_norm": 2.676440954208374, + "learning_rate": 2.130802100955634e-06, + "loss": 0.382, + "step": 6989 + }, + { + "epoch": 3.3049645390070923, + "grad_norm": 2.9548017978668213, + "learning_rate": 2.130185125542383e-06, + "loss": 0.3751, + "step": 6990 + }, + { + "epoch": 3.305437352245863, + "grad_norm": 2.800647020339966, + "learning_rate": 2.129568173155882e-06, + "loss": 0.3868, + "step": 6991 + }, + { + "epoch": 3.3059101654846335, + "grad_norm": 3.3789260387420654, + "learning_rate": 2.128951243834546e-06, + "loss": 0.4373, + "step": 6992 + }, + { + "epoch": 3.3063829787234043, + "grad_norm": 2.944807767868042, + "learning_rate": 2.12833433761679e-06, + "loss": 0.4205, + "step": 6993 + }, + { + "epoch": 3.306855791962175, + "grad_norm": 2.577975273132324, + "learning_rate": 2.127717454541025e-06, + "loss": 0.4197, + "step": 6994 + }, + { + "epoch": 3.3073286052009454, + "grad_norm": 3.0542666912078857, + "learning_rate": 2.127100594645661e-06, + "loss": 0.3811, + "step": 6995 + }, + { + "epoch": 3.307801418439716, + "grad_norm": 3.163015842437744, + "learning_rate": 2.1264837579691088e-06, + "loss": 0.415, + "step": 6996 + }, + { + "epoch": 3.308274231678487, + "grad_norm": 2.9161269664764404, + "learning_rate": 2.1258669445497746e-06, + "loss": 0.3714, + "step": 6997 + }, + { + "epoch": 3.308747044917258, + "grad_norm": 2.934483289718628, + "learning_rate": 2.1252501544260657e-06, + "loss": 0.4085, + "step": 6998 + }, + { + "epoch": 3.3092198581560286, + "grad_norm": 3.155613660812378, + "learning_rate": 2.1246333876363852e-06, + "loss": 0.4698, + "step": 6999 + }, + { + "epoch": 3.309692671394799, + "grad_norm": 2.648171901702881, + "learning_rate": 2.124016644219139e-06, + "loss": 0.3091, + "step": 7000 + }, + { + "epoch": 3.3101654846335697, + "grad_norm": 2.908219814300537, + "learning_rate": 2.123399924212728e-06, + "loss": 0.4063, + "step": 7001 + }, + { + "epoch": 3.3106382978723405, + "grad_norm": 3.138749361038208, + "learning_rate": 2.122783227655551e-06, + "loss": 0.4296, + "step": 7002 + }, + { + "epoch": 3.311111111111111, + "grad_norm": 3.044466018676758, + "learning_rate": 2.1221665545860094e-06, + "loss": 0.4424, + "step": 7003 + }, + { + "epoch": 3.3115839243498817, + "grad_norm": 2.6758792400360107, + "learning_rate": 2.121549905042499e-06, + "loss": 0.4073, + "step": 7004 + }, + { + "epoch": 3.3120567375886525, + "grad_norm": 2.8901989459991455, + "learning_rate": 2.1209332790634174e-06, + "loss": 0.3842, + "step": 7005 + }, + { + "epoch": 3.3125295508274233, + "grad_norm": 2.8179712295532227, + "learning_rate": 2.1203166766871582e-06, + "loss": 0.366, + "step": 7006 + }, + { + "epoch": 3.313002364066194, + "grad_norm": 2.6536550521850586, + "learning_rate": 2.1197000979521138e-06, + "loss": 0.3851, + "step": 7007 + }, + { + "epoch": 3.3134751773049644, + "grad_norm": 3.1277682781219482, + "learning_rate": 2.1190835428966775e-06, + "loss": 0.4249, + "step": 7008 + }, + { + "epoch": 3.3139479905437352, + "grad_norm": 2.924666166305542, + "learning_rate": 2.1184670115592383e-06, + "loss": 0.3873, + "step": 7009 + }, + { + "epoch": 
3.314420803782506, + "grad_norm": 2.7921009063720703, + "learning_rate": 2.1178505039781856e-06, + "loss": 0.3754, + "step": 7010 + }, + { + "epoch": 3.3148936170212764, + "grad_norm": 2.5349879264831543, + "learning_rate": 2.1172340201919067e-06, + "loss": 0.3701, + "step": 7011 + }, + { + "epoch": 3.315366430260047, + "grad_norm": 2.849376678466797, + "learning_rate": 2.1166175602387866e-06, + "loss": 0.3963, + "step": 7012 + }, + { + "epoch": 3.315839243498818, + "grad_norm": 3.141280174255371, + "learning_rate": 2.11600112415721e-06, + "loss": 0.4158, + "step": 7013 + }, + { + "epoch": 3.3163120567375888, + "grad_norm": 2.922807455062866, + "learning_rate": 2.11538471198556e-06, + "loss": 0.3667, + "step": 7014 + }, + { + "epoch": 3.3167848699763596, + "grad_norm": 2.770400047302246, + "learning_rate": 2.114768323762216e-06, + "loss": 0.3674, + "step": 7015 + }, + { + "epoch": 3.31725768321513, + "grad_norm": 2.7706570625305176, + "learning_rate": 2.114151959525561e-06, + "loss": 0.3761, + "step": 7016 + }, + { + "epoch": 3.3177304964539007, + "grad_norm": 3.041755437850952, + "learning_rate": 2.1135356193139704e-06, + "loss": 0.4483, + "step": 7017 + }, + { + "epoch": 3.3182033096926715, + "grad_norm": 3.5757904052734375, + "learning_rate": 2.1129193031658227e-06, + "loss": 0.4094, + "step": 7018 + }, + { + "epoch": 3.318676122931442, + "grad_norm": 2.9292917251586914, + "learning_rate": 2.1123030111194936e-06, + "loss": 0.3514, + "step": 7019 + }, + { + "epoch": 3.3191489361702127, + "grad_norm": 3.1443874835968018, + "learning_rate": 2.111686743213355e-06, + "loss": 0.4098, + "step": 7020 + }, + { + "epoch": 3.3196217494089835, + "grad_norm": 2.9738030433654785, + "learning_rate": 2.1110704994857804e-06, + "loss": 0.3584, + "step": 7021 + }, + { + "epoch": 3.3200945626477543, + "grad_norm": 2.8961563110351562, + "learning_rate": 2.1104542799751397e-06, + "loss": 0.3736, + "step": 7022 + }, + { + "epoch": 3.320567375886525, + "grad_norm": 3.9264683723449707, + "learning_rate": 2.1098380847198037e-06, + "loss": 0.457, + "step": 7023 + }, + { + "epoch": 3.3210401891252954, + "grad_norm": 2.8742756843566895, + "learning_rate": 2.109221913758139e-06, + "loss": 0.4252, + "step": 7024 + }, + { + "epoch": 3.321513002364066, + "grad_norm": 3.7229559421539307, + "learning_rate": 2.108605767128512e-06, + "loss": 0.4451, + "step": 7025 + }, + { + "epoch": 3.321985815602837, + "grad_norm": 2.6417593955993652, + "learning_rate": 2.1079896448692884e-06, + "loss": 0.3658, + "step": 7026 + }, + { + "epoch": 3.3224586288416074, + "grad_norm": 2.8780412673950195, + "learning_rate": 2.10737354701883e-06, + "loss": 0.4225, + "step": 7027 + }, + { + "epoch": 3.322931442080378, + "grad_norm": 2.557816505432129, + "learning_rate": 2.1067574736155e-06, + "loss": 0.3812, + "step": 7028 + }, + { + "epoch": 3.323404255319149, + "grad_norm": 2.859062910079956, + "learning_rate": 2.106141424697658e-06, + "loss": 0.3629, + "step": 7029 + }, + { + "epoch": 3.3238770685579198, + "grad_norm": 2.4776878356933594, + "learning_rate": 2.1055254003036607e-06, + "loss": 0.3591, + "step": 7030 + }, + { + "epoch": 3.3243498817966906, + "grad_norm": 3.085066795349121, + "learning_rate": 2.1049094004718687e-06, + "loss": 0.4237, + "step": 7031 + }, + { + "epoch": 3.324822695035461, + "grad_norm": 2.862592935562134, + "learning_rate": 2.1042934252406345e-06, + "loss": 0.3185, + "step": 7032 + }, + { + "epoch": 3.3252955082742317, + "grad_norm": 2.965743064880371, + "learning_rate": 2.1036774746483145e-06, + "loss": 0.4058, + 
"step": 7033 + }, + { + "epoch": 3.3257683215130025, + "grad_norm": 2.7420589923858643, + "learning_rate": 2.103061548733261e-06, + "loss": 0.3566, + "step": 7034 + }, + { + "epoch": 3.326241134751773, + "grad_norm": 2.7824347019195557, + "learning_rate": 2.1024456475338235e-06, + "loss": 0.3553, + "step": 7035 + }, + { + "epoch": 3.3267139479905437, + "grad_norm": 3.0410704612731934, + "learning_rate": 2.1018297710883528e-06, + "loss": 0.3772, + "step": 7036 + }, + { + "epoch": 3.3271867612293144, + "grad_norm": 3.0811562538146973, + "learning_rate": 2.101213919435196e-06, + "loss": 0.3738, + "step": 7037 + }, + { + "epoch": 3.3276595744680852, + "grad_norm": 2.939445734024048, + "learning_rate": 2.100598092612699e-06, + "loss": 0.4107, + "step": 7038 + }, + { + "epoch": 3.3281323877068556, + "grad_norm": 3.05804705619812, + "learning_rate": 2.0999822906592086e-06, + "loss": 0.3972, + "step": 7039 + }, + { + "epoch": 3.3286052009456264, + "grad_norm": 2.803558111190796, + "learning_rate": 2.0993665136130657e-06, + "loss": 0.3487, + "step": 7040 + }, + { + "epoch": 3.329078014184397, + "grad_norm": 2.937675714492798, + "learning_rate": 2.0987507615126147e-06, + "loss": 0.4095, + "step": 7041 + }, + { + "epoch": 3.329550827423168, + "grad_norm": 2.853905439376831, + "learning_rate": 2.098135034396194e-06, + "loss": 0.3775, + "step": 7042 + }, + { + "epoch": 3.3300236406619383, + "grad_norm": 3.3520495891571045, + "learning_rate": 2.097519332302142e-06, + "loss": 0.4065, + "step": 7043 + }, + { + "epoch": 3.330496453900709, + "grad_norm": 2.8787078857421875, + "learning_rate": 2.096903655268797e-06, + "loss": 0.3452, + "step": 7044 + }, + { + "epoch": 3.33096926713948, + "grad_norm": 2.993896007537842, + "learning_rate": 2.096288003334493e-06, + "loss": 0.3814, + "step": 7045 + }, + { + "epoch": 3.3314420803782507, + "grad_norm": 3.5248336791992188, + "learning_rate": 2.0956723765375655e-06, + "loss": 0.3852, + "step": 7046 + }, + { + "epoch": 3.331914893617021, + "grad_norm": 3.2227890491485596, + "learning_rate": 2.0950567749163463e-06, + "loss": 0.3913, + "step": 7047 + }, + { + "epoch": 3.332387706855792, + "grad_norm": 3.390401601791382, + "learning_rate": 2.094441198509165e-06, + "loss": 0.3944, + "step": 7048 + }, + { + "epoch": 3.3328605200945627, + "grad_norm": 3.2057554721832275, + "learning_rate": 2.0938256473543534e-06, + "loss": 0.404, + "step": 7049 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 2.866708755493164, + "learning_rate": 2.0932101214902367e-06, + "loss": 0.4345, + "step": 7050 + }, + { + "epoch": 3.333806146572104, + "grad_norm": 3.4304039478302, + "learning_rate": 2.0925946209551428e-06, + "loss": 0.4209, + "step": 7051 + }, + { + "epoch": 3.3342789598108746, + "grad_norm": 3.996561288833618, + "learning_rate": 2.091979145787395e-06, + "loss": 0.4394, + "step": 7052 + }, + { + "epoch": 3.3347517730496454, + "grad_norm": 3.1932613849639893, + "learning_rate": 2.0913636960253166e-06, + "loss": 0.3837, + "step": 7053 + }, + { + "epoch": 3.3352245862884162, + "grad_norm": 2.908832311630249, + "learning_rate": 2.0907482717072293e-06, + "loss": 0.3526, + "step": 7054 + }, + { + "epoch": 3.3356973995271866, + "grad_norm": 2.7319607734680176, + "learning_rate": 2.090132872871452e-06, + "loss": 0.3686, + "step": 7055 + }, + { + "epoch": 3.3361702127659574, + "grad_norm": 2.9213504791259766, + "learning_rate": 2.0895174995563043e-06, + "loss": 0.4034, + "step": 7056 + }, + { + "epoch": 3.336643026004728, + "grad_norm": 2.8093936443328857, + "learning_rate": 
2.0889021518001017e-06, + "loss": 0.4151, + "step": 7057 + }, + { + "epoch": 3.337115839243499, + "grad_norm": 3.1840829849243164, + "learning_rate": 2.0882868296411594e-06, + "loss": 0.3501, + "step": 7058 + }, + { + "epoch": 3.3375886524822693, + "grad_norm": 2.793567657470703, + "learning_rate": 2.087671533117791e-06, + "loss": 0.3911, + "step": 7059 + }, + { + "epoch": 3.33806146572104, + "grad_norm": 3.0820090770721436, + "learning_rate": 2.0870562622683077e-06, + "loss": 0.432, + "step": 7060 + }, + { + "epoch": 3.338534278959811, + "grad_norm": 2.774630546569824, + "learning_rate": 2.0864410171310213e-06, + "loss": 0.3434, + "step": 7061 + }, + { + "epoch": 3.3390070921985817, + "grad_norm": 2.70447039604187, + "learning_rate": 2.085825797744239e-06, + "loss": 0.3787, + "step": 7062 + }, + { + "epoch": 3.339479905437352, + "grad_norm": 3.1014437675476074, + "learning_rate": 2.0852106041462672e-06, + "loss": 0.4568, + "step": 7063 + }, + { + "epoch": 3.339952718676123, + "grad_norm": 3.312680244445801, + "learning_rate": 2.0845954363754133e-06, + "loss": 0.4285, + "step": 7064 + }, + { + "epoch": 3.3404255319148937, + "grad_norm": 2.7070534229278564, + "learning_rate": 2.0839802944699806e-06, + "loss": 0.4096, + "step": 7065 + }, + { + "epoch": 3.3408983451536645, + "grad_norm": 2.8172531127929688, + "learning_rate": 2.083365178468269e-06, + "loss": 0.3652, + "step": 7066 + }, + { + "epoch": 3.341371158392435, + "grad_norm": 2.896378517150879, + "learning_rate": 2.082750088408582e-06, + "loss": 0.3778, + "step": 7067 + }, + { + "epoch": 3.3418439716312056, + "grad_norm": 2.769805669784546, + "learning_rate": 2.0821350243292175e-06, + "loss": 0.3593, + "step": 7068 + }, + { + "epoch": 3.3423167848699764, + "grad_norm": 2.672520875930786, + "learning_rate": 2.0815199862684728e-06, + "loss": 0.3873, + "step": 7069 + }, + { + "epoch": 3.342789598108747, + "grad_norm": 2.841327428817749, + "learning_rate": 2.0809049742646435e-06, + "loss": 0.41, + "step": 7070 + }, + { + "epoch": 3.3432624113475176, + "grad_norm": 3.0540482997894287, + "learning_rate": 2.080289988356023e-06, + "loss": 0.32, + "step": 7071 + }, + { + "epoch": 3.3437352245862884, + "grad_norm": 3.471684217453003, + "learning_rate": 2.079675028580905e-06, + "loss": 0.3779, + "step": 7072 + }, + { + "epoch": 3.344208037825059, + "grad_norm": 2.8545875549316406, + "learning_rate": 2.07906009497758e-06, + "loss": 0.4645, + "step": 7073 + }, + { + "epoch": 3.34468085106383, + "grad_norm": 2.7771127223968506, + "learning_rate": 2.078445187584337e-06, + "loss": 0.3889, + "step": 7074 + }, + { + "epoch": 3.3451536643026003, + "grad_norm": 2.769188165664673, + "learning_rate": 2.0778303064394647e-06, + "loss": 0.3745, + "step": 7075 + }, + { + "epoch": 3.345626477541371, + "grad_norm": 2.739577531814575, + "learning_rate": 2.0772154515812467e-06, + "loss": 0.4402, + "step": 7076 + }, + { + "epoch": 3.346099290780142, + "grad_norm": 2.6124343872070312, + "learning_rate": 2.0766006230479696e-06, + "loss": 0.3595, + "step": 7077 + }, + { + "epoch": 3.3465721040189127, + "grad_norm": 2.7100563049316406, + "learning_rate": 2.0759858208779136e-06, + "loss": 0.3641, + "step": 7078 + }, + { + "epoch": 3.347044917257683, + "grad_norm": 2.8594000339508057, + "learning_rate": 2.075371045109363e-06, + "loss": 0.402, + "step": 7079 + }, + { + "epoch": 3.347517730496454, + "grad_norm": 3.2045278549194336, + "learning_rate": 2.0747562957805955e-06, + "loss": 0.4719, + "step": 7080 + }, + { + "epoch": 3.3479905437352246, + "grad_norm": 
2.825594663619995, + "learning_rate": 2.0741415729298874e-06, + "loss": 0.4127, + "step": 7081 + }, + { + "epoch": 3.3484633569739954, + "grad_norm": 2.992403984069824, + "learning_rate": 2.0735268765955173e-06, + "loss": 0.3943, + "step": 7082 + }, + { + "epoch": 3.348936170212766, + "grad_norm": 3.0629165172576904, + "learning_rate": 2.072912206815758e-06, + "loss": 0.4132, + "step": 7083 + }, + { + "epoch": 3.3494089834515366, + "grad_norm": 2.7553658485412598, + "learning_rate": 2.0722975636288836e-06, + "loss": 0.3667, + "step": 7084 + }, + { + "epoch": 3.3498817966903074, + "grad_norm": 3.1556780338287354, + "learning_rate": 2.0716829470731647e-06, + "loss": 0.4383, + "step": 7085 + }, + { + "epoch": 3.350354609929078, + "grad_norm": 2.6693310737609863, + "learning_rate": 2.071068357186869e-06, + "loss": 0.363, + "step": 7086 + }, + { + "epoch": 3.3508274231678485, + "grad_norm": 2.738314628601074, + "learning_rate": 2.0704537940082673e-06, + "loss": 0.3493, + "step": 7087 + }, + { + "epoch": 3.3513002364066193, + "grad_norm": 3.2205989360809326, + "learning_rate": 2.069839257575624e-06, + "loss": 0.3802, + "step": 7088 + }, + { + "epoch": 3.35177304964539, + "grad_norm": 2.8969876766204834, + "learning_rate": 2.069224747927203e-06, + "loss": 0.3846, + "step": 7089 + }, + { + "epoch": 3.352245862884161, + "grad_norm": 2.833179473876953, + "learning_rate": 2.0686102651012694e-06, + "loss": 0.3892, + "step": 7090 + }, + { + "epoch": 3.3527186761229313, + "grad_norm": 3.303830623626709, + "learning_rate": 2.067995809136082e-06, + "loss": 0.4009, + "step": 7091 + }, + { + "epoch": 3.353191489361702, + "grad_norm": 3.3684141635894775, + "learning_rate": 2.0673813800699024e-06, + "loss": 0.4434, + "step": 7092 + }, + { + "epoch": 3.353664302600473, + "grad_norm": 2.6549112796783447, + "learning_rate": 2.066766977940987e-06, + "loss": 0.3941, + "step": 7093 + }, + { + "epoch": 3.3541371158392437, + "grad_norm": 2.852935314178467, + "learning_rate": 2.066152602787591e-06, + "loss": 0.4143, + "step": 7094 + }, + { + "epoch": 3.354609929078014, + "grad_norm": 2.9621706008911133, + "learning_rate": 2.0655382546479713e-06, + "loss": 0.4502, + "step": 7095 + }, + { + "epoch": 3.355082742316785, + "grad_norm": 3.2836413383483887, + "learning_rate": 2.064923933560378e-06, + "loss": 0.3993, + "step": 7096 + }, + { + "epoch": 3.3555555555555556, + "grad_norm": 2.8187968730926514, + "learning_rate": 2.0643096395630654e-06, + "loss": 0.3766, + "step": 7097 + }, + { + "epoch": 3.3560283687943264, + "grad_norm": 2.7965118885040283, + "learning_rate": 2.0636953726942803e-06, + "loss": 0.4258, + "step": 7098 + }, + { + "epoch": 3.3565011820330968, + "grad_norm": 3.002030611038208, + "learning_rate": 2.063081132992271e-06, + "loss": 0.3548, + "step": 7099 + }, + { + "epoch": 3.3569739952718676, + "grad_norm": 2.927603006362915, + "learning_rate": 2.0624669204952847e-06, + "loss": 0.3759, + "step": 7100 + }, + { + "epoch": 3.3574468085106384, + "grad_norm": 2.911393165588379, + "learning_rate": 2.061852735241563e-06, + "loss": 0.3599, + "step": 7101 + }, + { + "epoch": 3.357919621749409, + "grad_norm": 3.0596864223480225, + "learning_rate": 2.0612385772693517e-06, + "loss": 0.3557, + "step": 7102 + }, + { + "epoch": 3.3583924349881795, + "grad_norm": 2.7869808673858643, + "learning_rate": 2.0606244466168905e-06, + "loss": 0.3696, + "step": 7103 + }, + { + "epoch": 3.3588652482269503, + "grad_norm": 2.927715539932251, + "learning_rate": 2.060010343322417e-06, + "loss": 0.3309, + "step": 7104 + }, + { + 
"epoch": 3.359338061465721, + "grad_norm": 3.44653058052063, + "learning_rate": 2.059396267424171e-06, + "loss": 0.4453, + "step": 7105 + }, + { + "epoch": 3.359810874704492, + "grad_norm": 3.047652244567871, + "learning_rate": 2.0587822189603873e-06, + "loss": 0.3615, + "step": 7106 + }, + { + "epoch": 3.3602836879432623, + "grad_norm": 2.6640517711639404, + "learning_rate": 2.0581681979693002e-06, + "loss": 0.3716, + "step": 7107 + }, + { + "epoch": 3.360756501182033, + "grad_norm": 2.8253493309020996, + "learning_rate": 2.0575542044891424e-06, + "loss": 0.3485, + "step": 7108 + }, + { + "epoch": 3.361229314420804, + "grad_norm": 3.0512938499450684, + "learning_rate": 2.0569402385581433e-06, + "loss": 0.4582, + "step": 7109 + }, + { + "epoch": 3.3617021276595747, + "grad_norm": 2.935060739517212, + "learning_rate": 2.0563263002145333e-06, + "loss": 0.425, + "step": 7110 + }, + { + "epoch": 3.362174940898345, + "grad_norm": 3.2708780765533447, + "learning_rate": 2.0557123894965396e-06, + "loss": 0.4193, + "step": 7111 + }, + { + "epoch": 3.362647754137116, + "grad_norm": 2.758329391479492, + "learning_rate": 2.055098506442386e-06, + "loss": 0.3754, + "step": 7112 + }, + { + "epoch": 3.3631205673758866, + "grad_norm": 3.0359015464782715, + "learning_rate": 2.0544846510902987e-06, + "loss": 0.4207, + "step": 7113 + }, + { + "epoch": 3.3635933806146574, + "grad_norm": 3.096968412399292, + "learning_rate": 2.0538708234784983e-06, + "loss": 0.4303, + "step": 7114 + }, + { + "epoch": 3.3640661938534278, + "grad_norm": 3.0777673721313477, + "learning_rate": 2.053257023645206e-06, + "loss": 0.3904, + "step": 7115 + }, + { + "epoch": 3.3645390070921986, + "grad_norm": 2.9483232498168945, + "learning_rate": 2.0526432516286394e-06, + "loss": 0.3949, + "step": 7116 + }, + { + "epoch": 3.3650118203309693, + "grad_norm": 2.839067220687866, + "learning_rate": 2.0520295074670154e-06, + "loss": 0.3705, + "step": 7117 + }, + { + "epoch": 3.36548463356974, + "grad_norm": 3.0450778007507324, + "learning_rate": 2.0514157911985506e-06, + "loss": 0.3987, + "step": 7118 + }, + { + "epoch": 3.3659574468085105, + "grad_norm": 3.425318717956543, + "learning_rate": 2.0508021028614564e-06, + "loss": 0.3941, + "step": 7119 + }, + { + "epoch": 3.3664302600472813, + "grad_norm": 2.9509286880493164, + "learning_rate": 2.0501884424939465e-06, + "loss": 0.354, + "step": 7120 + }, + { + "epoch": 3.366903073286052, + "grad_norm": 2.799504518508911, + "learning_rate": 2.0495748101342303e-06, + "loss": 0.3891, + "step": 7121 + }, + { + "epoch": 3.3673758865248224, + "grad_norm": 2.9140994548797607, + "learning_rate": 2.048961205820515e-06, + "loss": 0.3638, + "step": 7122 + }, + { + "epoch": 3.3678486997635932, + "grad_norm": 2.8074216842651367, + "learning_rate": 2.0483476295910077e-06, + "loss": 0.3501, + "step": 7123 + }, + { + "epoch": 3.368321513002364, + "grad_norm": 2.770829677581787, + "learning_rate": 2.0477340814839126e-06, + "loss": 0.3774, + "step": 7124 + }, + { + "epoch": 3.368794326241135, + "grad_norm": 2.581655502319336, + "learning_rate": 2.047120561537434e-06, + "loss": 0.3523, + "step": 7125 + }, + { + "epoch": 3.3692671394799056, + "grad_norm": 3.4234209060668945, + "learning_rate": 2.046507069789772e-06, + "loss": 0.4191, + "step": 7126 + }, + { + "epoch": 3.369739952718676, + "grad_norm": 2.669860601425171, + "learning_rate": 2.045893606279126e-06, + "loss": 0.3542, + "step": 7127 + }, + { + "epoch": 3.370212765957447, + "grad_norm": 3.2426629066467285, + "learning_rate": 2.045280171043694e-06, + "loss": 
0.4416, + "step": 7128 + }, + { + "epoch": 3.3706855791962176, + "grad_norm": 3.1318910121917725, + "learning_rate": 2.044666764121672e-06, + "loss": 0.3999, + "step": 7129 + }, + { + "epoch": 3.371158392434988, + "grad_norm": 2.7044012546539307, + "learning_rate": 2.044053385551254e-06, + "loss": 0.3907, + "step": 7130 + }, + { + "epoch": 3.3716312056737587, + "grad_norm": 2.9429895877838135, + "learning_rate": 2.0434400353706322e-06, + "loss": 0.3827, + "step": 7131 + }, + { + "epoch": 3.3721040189125295, + "grad_norm": 2.7258787155151367, + "learning_rate": 2.0428267136179973e-06, + "loss": 0.3688, + "step": 7132 + }, + { + "epoch": 3.3725768321513003, + "grad_norm": 2.765108108520508, + "learning_rate": 2.042213420331539e-06, + "loss": 0.4078, + "step": 7133 + }, + { + "epoch": 3.373049645390071, + "grad_norm": 3.2951347827911377, + "learning_rate": 2.0416001555494435e-06, + "loss": 0.4259, + "step": 7134 + }, + { + "epoch": 3.3735224586288415, + "grad_norm": 3.3917062282562256, + "learning_rate": 2.040986919309895e-06, + "loss": 0.5094, + "step": 7135 + }, + { + "epoch": 3.3739952718676123, + "grad_norm": 2.746434450149536, + "learning_rate": 2.04037371165108e-06, + "loss": 0.3513, + "step": 7136 + }, + { + "epoch": 3.374468085106383, + "grad_norm": 3.268731117248535, + "learning_rate": 2.0397605326111774e-06, + "loss": 0.3909, + "step": 7137 + }, + { + "epoch": 3.3749408983451534, + "grad_norm": 2.8498165607452393, + "learning_rate": 2.0391473822283692e-06, + "loss": 0.3657, + "step": 7138 + }, + { + "epoch": 3.3754137115839242, + "grad_norm": 2.855966567993164, + "learning_rate": 2.0385342605408325e-06, + "loss": 0.3927, + "step": 7139 + }, + { + "epoch": 3.375886524822695, + "grad_norm": 3.1839048862457275, + "learning_rate": 2.0379211675867438e-06, + "loss": 0.4476, + "step": 7140 + }, + { + "epoch": 3.376359338061466, + "grad_norm": 2.9379947185516357, + "learning_rate": 2.037308103404278e-06, + "loss": 0.3657, + "step": 7141 + }, + { + "epoch": 3.3768321513002366, + "grad_norm": 2.9251210689544678, + "learning_rate": 2.0366950680316073e-06, + "loss": 0.3975, + "step": 7142 + }, + { + "epoch": 3.377304964539007, + "grad_norm": 2.811885118484497, + "learning_rate": 2.036082061506904e-06, + "loss": 0.3064, + "step": 7143 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 2.755229949951172, + "learning_rate": 2.0354690838683363e-06, + "loss": 0.3328, + "step": 7144 + }, + { + "epoch": 3.3782505910165486, + "grad_norm": 3.006819725036621, + "learning_rate": 2.0348561351540706e-06, + "loss": 0.4168, + "step": 7145 + }, + { + "epoch": 3.378723404255319, + "grad_norm": 2.8788509368896484, + "learning_rate": 2.034243215402275e-06, + "loss": 0.4123, + "step": 7146 + }, + { + "epoch": 3.3791962174940897, + "grad_norm": 2.9732980728149414, + "learning_rate": 2.033630324651112e-06, + "loss": 0.3371, + "step": 7147 + }, + { + "epoch": 3.3796690307328605, + "grad_norm": 2.7731754779815674, + "learning_rate": 2.033017462938744e-06, + "loss": 0.382, + "step": 7148 + }, + { + "epoch": 3.3801418439716313, + "grad_norm": 2.766395092010498, + "learning_rate": 2.032404630303331e-06, + "loss": 0.3295, + "step": 7149 + }, + { + "epoch": 3.380614657210402, + "grad_norm": 3.197960138320923, + "learning_rate": 2.03179182678303e-06, + "loss": 0.354, + "step": 7150 + }, + { + "epoch": 3.3810874704491725, + "grad_norm": 3.048553228378296, + "learning_rate": 2.031179052416e-06, + "loss": 0.4027, + "step": 7151 + }, + { + "epoch": 3.3815602836879433, + "grad_norm": 3.1527998447418213, + "learning_rate": 
2.0305663072403934e-06, + "loss": 0.4229, + "step": 7152 + }, + { + "epoch": 3.382033096926714, + "grad_norm": 3.0407028198242188, + "learning_rate": 2.029953591294366e-06, + "loss": 0.4254, + "step": 7153 + }, + { + "epoch": 3.3825059101654844, + "grad_norm": 2.7170357704162598, + "learning_rate": 2.0293409046160673e-06, + "loss": 0.3307, + "step": 7154 + }, + { + "epoch": 3.382978723404255, + "grad_norm": 3.0128726959228516, + "learning_rate": 2.028728247243646e-06, + "loss": 0.3873, + "step": 7155 + }, + { + "epoch": 3.383451536643026, + "grad_norm": 4.861877918243408, + "learning_rate": 2.0281156192152507e-06, + "loss": 0.4371, + "step": 7156 + }, + { + "epoch": 3.383924349881797, + "grad_norm": 2.890249252319336, + "learning_rate": 2.0275030205690257e-06, + "loss": 0.3899, + "step": 7157 + }, + { + "epoch": 3.3843971631205676, + "grad_norm": 3.0774779319763184, + "learning_rate": 2.026890451343117e-06, + "loss": 0.4151, + "step": 7158 + }, + { + "epoch": 3.384869976359338, + "grad_norm": 2.8705947399139404, + "learning_rate": 2.026277911575665e-06, + "loss": 0.4004, + "step": 7159 + }, + { + "epoch": 3.3853427895981087, + "grad_norm": 3.170760154724121, + "learning_rate": 2.0256654013048096e-06, + "loss": 0.4442, + "step": 7160 + }, + { + "epoch": 3.3858156028368795, + "grad_norm": 4.211156368255615, + "learning_rate": 2.0250529205686905e-06, + "loss": 0.4605, + "step": 7161 + }, + { + "epoch": 3.38628841607565, + "grad_norm": 2.513519287109375, + "learning_rate": 2.0244404694054435e-06, + "loss": 0.3506, + "step": 7162 + }, + { + "epoch": 3.3867612293144207, + "grad_norm": 3.1558821201324463, + "learning_rate": 2.023828047853203e-06, + "loss": 0.43, + "step": 7163 + }, + { + "epoch": 3.3872340425531915, + "grad_norm": 3.6770291328430176, + "learning_rate": 2.023215655950102e-06, + "loss": 0.3911, + "step": 7164 + }, + { + "epoch": 3.3877068557919623, + "grad_norm": 2.6544485092163086, + "learning_rate": 2.022603293734271e-06, + "loss": 0.3306, + "step": 7165 + }, + { + "epoch": 3.388179669030733, + "grad_norm": 3.34232759475708, + "learning_rate": 2.0219909612438405e-06, + "loss": 0.4233, + "step": 7166 + }, + { + "epoch": 3.3886524822695034, + "grad_norm": 3.388561725616455, + "learning_rate": 2.0213786585169363e-06, + "loss": 0.4171, + "step": 7167 + }, + { + "epoch": 3.3891252955082742, + "grad_norm": 2.8606953620910645, + "learning_rate": 2.020766385591684e-06, + "loss": 0.3864, + "step": 7168 + }, + { + "epoch": 3.389598108747045, + "grad_norm": 3.0135979652404785, + "learning_rate": 2.020154142506208e-06, + "loss": 0.3933, + "step": 7169 + }, + { + "epoch": 3.3900709219858154, + "grad_norm": 2.5003163814544678, + "learning_rate": 2.0195419292986294e-06, + "loss": 0.3852, + "step": 7170 + }, + { + "epoch": 3.390543735224586, + "grad_norm": 2.8591368198394775, + "learning_rate": 2.0189297460070685e-06, + "loss": 0.3962, + "step": 7171 + }, + { + "epoch": 3.391016548463357, + "grad_norm": 2.8830223083496094, + "learning_rate": 2.0183175926696427e-06, + "loss": 0.3632, + "step": 7172 + }, + { + "epoch": 3.391489361702128, + "grad_norm": 3.3904542922973633, + "learning_rate": 2.0177054693244674e-06, + "loss": 0.4284, + "step": 7173 + }, + { + "epoch": 3.3919621749408986, + "grad_norm": 3.0325920581817627, + "learning_rate": 2.0170933760096585e-06, + "loss": 0.4331, + "step": 7174 + }, + { + "epoch": 3.392434988179669, + "grad_norm": 2.60345196723938, + "learning_rate": 2.016481312763327e-06, + "loss": 0.4077, + "step": 7175 + }, + { + "epoch": 3.3929078014184397, + "grad_norm": 
2.8146891593933105, + "learning_rate": 2.0158692796235845e-06, + "loss": 0.4224, + "step": 7176 + }, + { + "epoch": 3.3933806146572105, + "grad_norm": 2.8158490657806396, + "learning_rate": 2.0152572766285396e-06, + "loss": 0.3454, + "step": 7177 + }, + { + "epoch": 3.393853427895981, + "grad_norm": 3.2753400802612305, + "learning_rate": 2.0146453038162978e-06, + "loss": 0.3615, + "step": 7178 + }, + { + "epoch": 3.3943262411347517, + "grad_norm": 3.0527124404907227, + "learning_rate": 2.0140333612249655e-06, + "loss": 0.415, + "step": 7179 + }, + { + "epoch": 3.3947990543735225, + "grad_norm": 2.6813764572143555, + "learning_rate": 2.0134214488926435e-06, + "loss": 0.3391, + "step": 7180 + }, + { + "epoch": 3.3952718676122933, + "grad_norm": 2.809319496154785, + "learning_rate": 2.0128095668574356e-06, + "loss": 0.3123, + "step": 7181 + }, + { + "epoch": 3.395744680851064, + "grad_norm": 2.6619064807891846, + "learning_rate": 2.0121977151574396e-06, + "loss": 0.4222, + "step": 7182 + }, + { + "epoch": 3.3962174940898344, + "grad_norm": 2.9201200008392334, + "learning_rate": 2.0115858938307516e-06, + "loss": 0.3712, + "step": 7183 + }, + { + "epoch": 3.396690307328605, + "grad_norm": 3.2058637142181396, + "learning_rate": 2.0109741029154696e-06, + "loss": 0.4004, + "step": 7184 + }, + { + "epoch": 3.397163120567376, + "grad_norm": 2.821855306625366, + "learning_rate": 2.0103623424496862e-06, + "loss": 0.4053, + "step": 7185 + }, + { + "epoch": 3.3976359338061464, + "grad_norm": 3.0371549129486084, + "learning_rate": 2.009750612471492e-06, + "loss": 0.4246, + "step": 7186 + }, + { + "epoch": 3.398108747044917, + "grad_norm": 2.8827290534973145, + "learning_rate": 2.009138913018978e-06, + "loss": 0.3256, + "step": 7187 + }, + { + "epoch": 3.398581560283688, + "grad_norm": 3.168039560317993, + "learning_rate": 2.0085272441302305e-06, + "loss": 0.4233, + "step": 7188 + }, + { + "epoch": 3.3990543735224588, + "grad_norm": 3.259723663330078, + "learning_rate": 2.0079156058433374e-06, + "loss": 0.4168, + "step": 7189 + }, + { + "epoch": 3.3995271867612296, + "grad_norm": 2.456231117248535, + "learning_rate": 2.007303998196382e-06, + "loss": 0.3383, + "step": 7190 + }, + { + "epoch": 3.4, + "grad_norm": 2.735180377960205, + "learning_rate": 2.006692421227445e-06, + "loss": 0.3475, + "step": 7191 + }, + { + "epoch": 3.4004728132387707, + "grad_norm": 2.76263427734375, + "learning_rate": 2.006080874974609e-06, + "loss": 0.3651, + "step": 7192 + }, + { + "epoch": 3.4009456264775415, + "grad_norm": 3.36867094039917, + "learning_rate": 2.0054693594759504e-06, + "loss": 0.4479, + "step": 7193 + }, + { + "epoch": 3.401418439716312, + "grad_norm": 2.532167673110962, + "learning_rate": 2.004857874769547e-06, + "loss": 0.3818, + "step": 7194 + }, + { + "epoch": 3.4018912529550827, + "grad_norm": 2.8723537921905518, + "learning_rate": 2.0042464208934724e-06, + "loss": 0.3332, + "step": 7195 + }, + { + "epoch": 3.4023640661938535, + "grad_norm": 2.676460027694702, + "learning_rate": 2.0036349978857987e-06, + "loss": 0.3488, + "step": 7196 + }, + { + "epoch": 3.4028368794326243, + "grad_norm": 2.805851459503174, + "learning_rate": 2.0030236057845983e-06, + "loss": 0.3796, + "step": 7197 + }, + { + "epoch": 3.403309692671395, + "grad_norm": 2.688988447189331, + "learning_rate": 2.0024122446279377e-06, + "loss": 0.3707, + "step": 7198 + }, + { + "epoch": 3.4037825059101654, + "grad_norm": 3.118720293045044, + "learning_rate": 2.0018009144538853e-06, + "loss": 0.4064, + "step": 7199 + }, + { + "epoch": 
3.404255319148936, + "grad_norm": 2.876507520675659, + "learning_rate": 2.001189615300506e-06, + "loss": 0.3543, + "step": 7200 + }, + { + "epoch": 3.404728132387707, + "grad_norm": 3.0043466091156006, + "learning_rate": 2.000578347205861e-06, + "loss": 0.3833, + "step": 7201 + }, + { + "epoch": 3.4052009456264773, + "grad_norm": 3.1057114601135254, + "learning_rate": 1.9999671102080133e-06, + "loss": 0.4154, + "step": 7202 + }, + { + "epoch": 3.405673758865248, + "grad_norm": 2.9791855812072754, + "learning_rate": 1.9993559043450202e-06, + "loss": 0.3865, + "step": 7203 + }, + { + "epoch": 3.406146572104019, + "grad_norm": 3.4403460025787354, + "learning_rate": 1.9987447296549407e-06, + "loss": 0.3883, + "step": 7204 + }, + { + "epoch": 3.4066193853427897, + "grad_norm": 2.9962027072906494, + "learning_rate": 1.998133586175829e-06, + "loss": 0.3796, + "step": 7205 + }, + { + "epoch": 3.40709219858156, + "grad_norm": 3.0613129138946533, + "learning_rate": 1.997522473945737e-06, + "loss": 0.3917, + "step": 7206 + }, + { + "epoch": 3.407565011820331, + "grad_norm": 3.065985679626465, + "learning_rate": 1.996911393002718e-06, + "loss": 0.3521, + "step": 7207 + }, + { + "epoch": 3.4080378250591017, + "grad_norm": 2.976177930831909, + "learning_rate": 1.996300343384821e-06, + "loss": 0.3852, + "step": 7208 + }, + { + "epoch": 3.4085106382978725, + "grad_norm": 3.3587961196899414, + "learning_rate": 1.995689325130092e-06, + "loss": 0.3947, + "step": 7209 + }, + { + "epoch": 3.408983451536643, + "grad_norm": 2.626983165740967, + "learning_rate": 1.995078338276578e-06, + "loss": 0.316, + "step": 7210 + }, + { + "epoch": 3.4094562647754136, + "grad_norm": 3.14713978767395, + "learning_rate": 1.9944673828623217e-06, + "loss": 0.4008, + "step": 7211 + }, + { + "epoch": 3.4099290780141844, + "grad_norm": 2.968918800354004, + "learning_rate": 1.993856458925365e-06, + "loss": 0.439, + "step": 7212 + }, + { + "epoch": 3.4104018912529552, + "grad_norm": 2.7724127769470215, + "learning_rate": 1.9932455665037476e-06, + "loss": 0.3941, + "step": 7213 + }, + { + "epoch": 3.4108747044917256, + "grad_norm": 2.963146448135376, + "learning_rate": 1.9926347056355057e-06, + "loss": 0.3893, + "step": 7214 + }, + { + "epoch": 3.4113475177304964, + "grad_norm": 2.791637420654297, + "learning_rate": 1.9920238763586765e-06, + "loss": 0.4068, + "step": 7215 + }, + { + "epoch": 3.411820330969267, + "grad_norm": 3.030275583267212, + "learning_rate": 1.9914130787112924e-06, + "loss": 0.3828, + "step": 7216 + }, + { + "epoch": 3.412293144208038, + "grad_norm": 3.113128900527954, + "learning_rate": 1.990802312731387e-06, + "loss": 0.3903, + "step": 7217 + }, + { + "epoch": 3.4127659574468083, + "grad_norm": 3.104170322418213, + "learning_rate": 1.9901915784569884e-06, + "loss": 0.4171, + "step": 7218 + }, + { + "epoch": 3.413238770685579, + "grad_norm": 3.1247572898864746, + "learning_rate": 1.989580875926125e-06, + "loss": 0.4022, + "step": 7219 + }, + { + "epoch": 3.41371158392435, + "grad_norm": 2.9487457275390625, + "learning_rate": 1.988970205176822e-06, + "loss": 0.3948, + "step": 7220 + }, + { + "epoch": 3.4141843971631207, + "grad_norm": 2.8763654232025146, + "learning_rate": 1.9883595662471028e-06, + "loss": 0.3588, + "step": 7221 + }, + { + "epoch": 3.414657210401891, + "grad_norm": 2.563152551651001, + "learning_rate": 1.987748959174991e-06, + "loss": 0.3509, + "step": 7222 + }, + { + "epoch": 3.415130023640662, + "grad_norm": 3.148759365081787, + "learning_rate": 1.9871383839985053e-06, + "loss": 0.4364, + "step": 
7223 + }, + { + "epoch": 3.4156028368794327, + "grad_norm": 2.8187363147735596, + "learning_rate": 1.986527840755663e-06, + "loss": 0.3803, + "step": 7224 + }, + { + "epoch": 3.4160756501182035, + "grad_norm": 3.009376287460327, + "learning_rate": 1.985917329484481e-06, + "loss": 0.3841, + "step": 7225 + }, + { + "epoch": 3.416548463356974, + "grad_norm": 2.869291067123413, + "learning_rate": 1.985306850222972e-06, + "loss": 0.3877, + "step": 7226 + }, + { + "epoch": 3.4170212765957446, + "grad_norm": 3.108461856842041, + "learning_rate": 1.9846964030091497e-06, + "loss": 0.3767, + "step": 7227 + }, + { + "epoch": 3.4174940898345154, + "grad_norm": 3.096320629119873, + "learning_rate": 1.9840859878810226e-06, + "loss": 0.4603, + "step": 7228 + }, + { + "epoch": 3.417966903073286, + "grad_norm": 2.8519909381866455, + "learning_rate": 1.983475604876598e-06, + "loss": 0.3263, + "step": 7229 + }, + { + "epoch": 3.4184397163120566, + "grad_norm": 3.192051410675049, + "learning_rate": 1.9828652540338835e-06, + "loss": 0.4132, + "step": 7230 + }, + { + "epoch": 3.4189125295508274, + "grad_norm": 3.0398056507110596, + "learning_rate": 1.9822549353908817e-06, + "loss": 0.4038, + "step": 7231 + }, + { + "epoch": 3.419385342789598, + "grad_norm": 3.12247896194458, + "learning_rate": 1.9816446489855944e-06, + "loss": 0.409, + "step": 7232 + }, + { + "epoch": 3.419858156028369, + "grad_norm": 3.20316481590271, + "learning_rate": 1.9810343948560223e-06, + "loss": 0.4058, + "step": 7233 + }, + { + "epoch": 3.4203309692671393, + "grad_norm": 3.3397457599639893, + "learning_rate": 1.9804241730401625e-06, + "loss": 0.3657, + "step": 7234 + }, + { + "epoch": 3.42080378250591, + "grad_norm": 3.928691864013672, + "learning_rate": 1.979813983576012e-06, + "loss": 0.361, + "step": 7235 + }, + { + "epoch": 3.421276595744681, + "grad_norm": 3.5814051628112793, + "learning_rate": 1.9792038265015635e-06, + "loss": 0.3975, + "step": 7236 + }, + { + "epoch": 3.4217494089834517, + "grad_norm": 2.8578879833221436, + "learning_rate": 1.9785937018548086e-06, + "loss": 0.3915, + "step": 7237 + }, + { + "epoch": 3.422222222222222, + "grad_norm": 3.0343220233917236, + "learning_rate": 1.977983609673738e-06, + "loss": 0.3686, + "step": 7238 + }, + { + "epoch": 3.422695035460993, + "grad_norm": 3.2719056606292725, + "learning_rate": 1.977373549996338e-06, + "loss": 0.3905, + "step": 7239 + }, + { + "epoch": 3.4231678486997636, + "grad_norm": 2.6638169288635254, + "learning_rate": 1.976763522860597e-06, + "loss": 0.3631, + "step": 7240 + }, + { + "epoch": 3.4236406619385344, + "grad_norm": 2.7679927349090576, + "learning_rate": 1.9761535283044967e-06, + "loss": 0.377, + "step": 7241 + }, + { + "epoch": 3.424113475177305, + "grad_norm": 2.774540424346924, + "learning_rate": 1.975543566366019e-06, + "loss": 0.3509, + "step": 7242 + }, + { + "epoch": 3.4245862884160756, + "grad_norm": 2.811659336090088, + "learning_rate": 1.9749336370831438e-06, + "loss": 0.3835, + "step": 7243 + }, + { + "epoch": 3.4250591016548464, + "grad_norm": 2.8533360958099365, + "learning_rate": 1.9743237404938478e-06, + "loss": 0.3765, + "step": 7244 + }, + { + "epoch": 3.425531914893617, + "grad_norm": 2.712301015853882, + "learning_rate": 1.9737138766361084e-06, + "loss": 0.3797, + "step": 7245 + }, + { + "epoch": 3.4260047281323875, + "grad_norm": 2.9763426780700684, + "learning_rate": 1.9731040455478986e-06, + "loss": 0.4223, + "step": 7246 + }, + { + "epoch": 3.4264775413711583, + "grad_norm": 2.8802297115325928, + "learning_rate": 
1.9724942472671882e-06, + "loss": 0.3666, + "step": 7247 + }, + { + "epoch": 3.426950354609929, + "grad_norm": 2.934107542037964, + "learning_rate": 1.9718844818319486e-06, + "loss": 0.3612, + "step": 7248 + }, + { + "epoch": 3.4274231678487, + "grad_norm": 3.0172696113586426, + "learning_rate": 1.9712747492801467e-06, + "loss": 0.3643, + "step": 7249 + }, + { + "epoch": 3.4278959810874703, + "grad_norm": 3.368419647216797, + "learning_rate": 1.970665049649748e-06, + "loss": 0.4511, + "step": 7250 + }, + { + "epoch": 3.428368794326241, + "grad_norm": 3.077819585800171, + "learning_rate": 1.9700553829787162e-06, + "loss": 0.4013, + "step": 7251 + }, + { + "epoch": 3.428841607565012, + "grad_norm": 2.690673828125, + "learning_rate": 1.96944574930501e-06, + "loss": 0.3776, + "step": 7252 + }, + { + "epoch": 3.4293144208037827, + "grad_norm": 3.1122169494628906, + "learning_rate": 1.9688361486665924e-06, + "loss": 0.3802, + "step": 7253 + }, + { + "epoch": 3.429787234042553, + "grad_norm": 2.9874207973480225, + "learning_rate": 1.968226581101417e-06, + "loss": 0.4492, + "step": 7254 + }, + { + "epoch": 3.430260047281324, + "grad_norm": 2.885493278503418, + "learning_rate": 1.967617046647442e-06, + "loss": 0.3958, + "step": 7255 + }, + { + "epoch": 3.4307328605200946, + "grad_norm": 2.953897476196289, + "learning_rate": 1.9670075453426195e-06, + "loss": 0.3973, + "step": 7256 + }, + { + "epoch": 3.4312056737588654, + "grad_norm": 2.685088634490967, + "learning_rate": 1.966398077224899e-06, + "loss": 0.393, + "step": 7257 + }, + { + "epoch": 3.431678486997636, + "grad_norm": 4.035208702087402, + "learning_rate": 1.9657886423322313e-06, + "loss": 0.4263, + "step": 7258 + }, + { + "epoch": 3.4321513002364066, + "grad_norm": 2.942042827606201, + "learning_rate": 1.965179240702562e-06, + "loss": 0.4319, + "step": 7259 + }, + { + "epoch": 3.4326241134751774, + "grad_norm": 3.0794999599456787, + "learning_rate": 1.9645698723738356e-06, + "loss": 0.4199, + "step": 7260 + }, + { + "epoch": 3.433096926713948, + "grad_norm": 3.0653584003448486, + "learning_rate": 1.963960537383996e-06, + "loss": 0.3723, + "step": 7261 + }, + { + "epoch": 3.4335697399527185, + "grad_norm": 3.1571545600891113, + "learning_rate": 1.963351235770983e-06, + "loss": 0.4211, + "step": 7262 + }, + { + "epoch": 3.4340425531914893, + "grad_norm": 2.6681735515594482, + "learning_rate": 1.962741967572736e-06, + "loss": 0.3333, + "step": 7263 + }, + { + "epoch": 3.43451536643026, + "grad_norm": 2.9747934341430664, + "learning_rate": 1.9621327328271907e-06, + "loss": 0.3896, + "step": 7264 + }, + { + "epoch": 3.434988179669031, + "grad_norm": 2.7994508743286133, + "learning_rate": 1.9615235315722814e-06, + "loss": 0.3642, + "step": 7265 + }, + { + "epoch": 3.4354609929078013, + "grad_norm": 2.933928966522217, + "learning_rate": 1.9609143638459405e-06, + "loss": 0.3955, + "step": 7266 + }, + { + "epoch": 3.435933806146572, + "grad_norm": 2.9577367305755615, + "learning_rate": 1.9603052296860983e-06, + "loss": 0.3437, + "step": 7267 + }, + { + "epoch": 3.436406619385343, + "grad_norm": 3.017282009124756, + "learning_rate": 1.959696129130684e-06, + "loss": 0.3784, + "step": 7268 + }, + { + "epoch": 3.4368794326241137, + "grad_norm": 3.2072815895080566, + "learning_rate": 1.959087062217622e-06, + "loss": 0.3901, + "step": 7269 + }, + { + "epoch": 3.437352245862884, + "grad_norm": 2.91153621673584, + "learning_rate": 1.9584780289848358e-06, + "loss": 0.4402, + "step": 7270 + }, + { + "epoch": 3.437825059101655, + "grad_norm": 
2.846842050552368, + "learning_rate": 1.9578690294702495e-06, + "loss": 0.3804, + "step": 7271 + }, + { + "epoch": 3.4382978723404256, + "grad_norm": 3.0958521366119385, + "learning_rate": 1.957260063711781e-06, + "loss": 0.4103, + "step": 7272 + }, + { + "epoch": 3.4387706855791964, + "grad_norm": 2.9808530807495117, + "learning_rate": 1.9566511317473483e-06, + "loss": 0.4127, + "step": 7273 + }, + { + "epoch": 3.4392434988179668, + "grad_norm": 2.725851058959961, + "learning_rate": 1.9560422336148678e-06, + "loss": 0.3493, + "step": 7274 + }, + { + "epoch": 3.4397163120567376, + "grad_norm": 2.7861814498901367, + "learning_rate": 1.9554333693522515e-06, + "loss": 0.3703, + "step": 7275 + }, + { + "epoch": 3.4401891252955084, + "grad_norm": 3.128708839416504, + "learning_rate": 1.954824538997412e-06, + "loss": 0.3917, + "step": 7276 + }, + { + "epoch": 3.440661938534279, + "grad_norm": 3.117403268814087, + "learning_rate": 1.954215742588257e-06, + "loss": 0.3581, + "step": 7277 + }, + { + "epoch": 3.4411347517730495, + "grad_norm": 2.710076093673706, + "learning_rate": 1.9536069801626957e-06, + "loss": 0.3255, + "step": 7278 + }, + { + "epoch": 3.4416075650118203, + "grad_norm": 2.7732627391815186, + "learning_rate": 1.952998251758632e-06, + "loss": 0.375, + "step": 7279 + }, + { + "epoch": 3.442080378250591, + "grad_norm": 2.896050453186035, + "learning_rate": 1.9523895574139673e-06, + "loss": 0.4087, + "step": 7280 + }, + { + "epoch": 3.4425531914893615, + "grad_norm": 2.9051663875579834, + "learning_rate": 1.9517808971666048e-06, + "loss": 0.3423, + "step": 7281 + }, + { + "epoch": 3.4430260047281322, + "grad_norm": 3.0232038497924805, + "learning_rate": 1.9511722710544417e-06, + "loss": 0.364, + "step": 7282 + }, + { + "epoch": 3.443498817966903, + "grad_norm": 2.753870725631714, + "learning_rate": 1.9505636791153744e-06, + "loss": 0.3484, + "step": 7283 + }, + { + "epoch": 3.443971631205674, + "grad_norm": 2.944079637527466, + "learning_rate": 1.9499551213872983e-06, + "loss": 0.3354, + "step": 7284 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 3.1531970500946045, + "learning_rate": 1.949346597908104e-06, + "loss": 0.3394, + "step": 7285 + }, + { + "epoch": 3.444917257683215, + "grad_norm": 3.0357189178466797, + "learning_rate": 1.948738108715683e-06, + "loss": 0.4302, + "step": 7286 + }, + { + "epoch": 3.445390070921986, + "grad_norm": 3.3698086738586426, + "learning_rate": 1.948129653847923e-06, + "loss": 0.419, + "step": 7287 + }, + { + "epoch": 3.4458628841607566, + "grad_norm": 3.343132495880127, + "learning_rate": 1.947521233342709e-06, + "loss": 0.3895, + "step": 7288 + }, + { + "epoch": 3.446335697399527, + "grad_norm": 3.1905252933502197, + "learning_rate": 1.9469128472379257e-06, + "loss": 0.429, + "step": 7289 + }, + { + "epoch": 3.4468085106382977, + "grad_norm": 2.8517212867736816, + "learning_rate": 1.946304495571454e-06, + "loss": 0.3513, + "step": 7290 + }, + { + "epoch": 3.4472813238770685, + "grad_norm": 2.7713496685028076, + "learning_rate": 1.9456961783811735e-06, + "loss": 0.4331, + "step": 7291 + }, + { + "epoch": 3.4477541371158393, + "grad_norm": 2.8258652687072754, + "learning_rate": 1.945087895704962e-06, + "loss": 0.3539, + "step": 7292 + }, + { + "epoch": 3.44822695035461, + "grad_norm": 2.757322072982788, + "learning_rate": 1.9444796475806925e-06, + "loss": 0.3865, + "step": 7293 + }, + { + "epoch": 3.4486997635933805, + "grad_norm": 2.8410696983337402, + "learning_rate": 1.943871434046241e-06, + "loss": 0.3612, + "step": 7294 + }, + { + "epoch": 
3.4491725768321513, + "grad_norm": 3.2297637462615967, + "learning_rate": 1.9432632551394753e-06, + "loss": 0.3956, + "step": 7295 + }, + { + "epoch": 3.449645390070922, + "grad_norm": 2.991351842880249, + "learning_rate": 1.9426551108982666e-06, + "loss": 0.3864, + "step": 7296 + }, + { + "epoch": 3.4501182033096924, + "grad_norm": 2.7942168712615967, + "learning_rate": 1.94204700136048e-06, + "loss": 0.4314, + "step": 7297 + }, + { + "epoch": 3.4505910165484632, + "grad_norm": 2.8188698291778564, + "learning_rate": 1.9414389265639805e-06, + "loss": 0.3585, + "step": 7298 + }, + { + "epoch": 3.451063829787234, + "grad_norm": 3.2826895713806152, + "learning_rate": 1.9408308865466295e-06, + "loss": 0.4614, + "step": 7299 + }, + { + "epoch": 3.451536643026005, + "grad_norm": 3.273867130279541, + "learning_rate": 1.9402228813462865e-06, + "loss": 0.3533, + "step": 7300 + }, + { + "epoch": 3.4520094562647756, + "grad_norm": 3.5334157943725586, + "learning_rate": 1.939614911000811e-06, + "loss": 0.4088, + "step": 7301 + }, + { + "epoch": 3.452482269503546, + "grad_norm": 2.983908176422119, + "learning_rate": 1.9390069755480583e-06, + "loss": 0.3725, + "step": 7302 + }, + { + "epoch": 3.4529550827423168, + "grad_norm": 2.893660306930542, + "learning_rate": 1.93839907502588e-06, + "loss": 0.3746, + "step": 7303 + }, + { + "epoch": 3.4534278959810876, + "grad_norm": 3.1762871742248535, + "learning_rate": 1.9377912094721295e-06, + "loss": 0.446, + "step": 7304 + }, + { + "epoch": 3.453900709219858, + "grad_norm": 3.3231537342071533, + "learning_rate": 1.9371833789246554e-06, + "loss": 0.4837, + "step": 7305 + }, + { + "epoch": 3.4543735224586287, + "grad_norm": 3.548333168029785, + "learning_rate": 1.936575583421304e-06, + "loss": 0.3911, + "step": 7306 + }, + { + "epoch": 3.4548463356973995, + "grad_norm": 3.0627071857452393, + "learning_rate": 1.9359678229999213e-06, + "loss": 0.3751, + "step": 7307 + }, + { + "epoch": 3.4553191489361703, + "grad_norm": 2.797663927078247, + "learning_rate": 1.9353600976983475e-06, + "loss": 0.41, + "step": 7308 + }, + { + "epoch": 3.455791962174941, + "grad_norm": 2.803269624710083, + "learning_rate": 1.9347524075544258e-06, + "loss": 0.3775, + "step": 7309 + }, + { + "epoch": 3.4562647754137115, + "grad_norm": 2.828010320663452, + "learning_rate": 1.934144752605993e-06, + "loss": 0.375, + "step": 7310 + }, + { + "epoch": 3.4567375886524823, + "grad_norm": 3.456477165222168, + "learning_rate": 1.933537132890884e-06, + "loss": 0.4764, + "step": 7311 + }, + { + "epoch": 3.457210401891253, + "grad_norm": 2.723670244216919, + "learning_rate": 1.9329295484469354e-06, + "loss": 0.3581, + "step": 7312 + }, + { + "epoch": 3.4576832151300234, + "grad_norm": 3.9723474979400635, + "learning_rate": 1.9323219993119766e-06, + "loss": 0.3951, + "step": 7313 + }, + { + "epoch": 3.458156028368794, + "grad_norm": 2.951300859451294, + "learning_rate": 1.931714485523838e-06, + "loss": 0.3865, + "step": 7314 + }, + { + "epoch": 3.458628841607565, + "grad_norm": 2.9265835285186768, + "learning_rate": 1.931107007120347e-06, + "loss": 0.3731, + "step": 7315 + }, + { + "epoch": 3.459101654846336, + "grad_norm": 3.271883249282837, + "learning_rate": 1.930499564139327e-06, + "loss": 0.3971, + "step": 7316 + }, + { + "epoch": 3.4595744680851066, + "grad_norm": 2.8716280460357666, + "learning_rate": 1.929892156618603e-06, + "loss": 0.3332, + "step": 7317 + }, + { + "epoch": 3.460047281323877, + "grad_norm": 2.9820191860198975, + "learning_rate": 1.929284784595993e-06, + "loss": 0.3907, + 
"step": 7318 + }, + { + "epoch": 3.4605200945626478, + "grad_norm": 3.313225269317627, + "learning_rate": 1.9286774481093183e-06, + "loss": 0.3678, + "step": 7319 + }, + { + "epoch": 3.4609929078014185, + "grad_norm": 3.365387439727783, + "learning_rate": 1.928070147196394e-06, + "loss": 0.4894, + "step": 7320 + }, + { + "epoch": 3.461465721040189, + "grad_norm": 3.1723599433898926, + "learning_rate": 1.927462881895033e-06, + "loss": 0.4607, + "step": 7321 + }, + { + "epoch": 3.4619385342789597, + "grad_norm": 2.7644999027252197, + "learning_rate": 1.9268556522430483e-06, + "loss": 0.3627, + "step": 7322 + }, + { + "epoch": 3.4624113475177305, + "grad_norm": 2.65572190284729, + "learning_rate": 1.9262484582782483e-06, + "loss": 0.3893, + "step": 7323 + }, + { + "epoch": 3.4628841607565013, + "grad_norm": 2.992037773132324, + "learning_rate": 1.9256413000384415e-06, + "loss": 0.4175, + "step": 7324 + }, + { + "epoch": 3.463356973995272, + "grad_norm": 3.020496368408203, + "learning_rate": 1.925034177561433e-06, + "loss": 0.42, + "step": 7325 + }, + { + "epoch": 3.4638297872340424, + "grad_norm": 2.780334234237671, + "learning_rate": 1.9244270908850236e-06, + "loss": 0.4195, + "step": 7326 + }, + { + "epoch": 3.4643026004728132, + "grad_norm": 2.863028049468994, + "learning_rate": 1.9238200400470166e-06, + "loss": 0.3706, + "step": 7327 + }, + { + "epoch": 3.464775413711584, + "grad_norm": 3.2766900062561035, + "learning_rate": 1.923213025085209e-06, + "loss": 0.4506, + "step": 7328 + }, + { + "epoch": 3.4652482269503544, + "grad_norm": 2.7300634384155273, + "learning_rate": 1.9226060460373975e-06, + "loss": 0.3463, + "step": 7329 + }, + { + "epoch": 3.465721040189125, + "grad_norm": 3.136104106903076, + "learning_rate": 1.921999102941376e-06, + "loss": 0.3839, + "step": 7330 + }, + { + "epoch": 3.466193853427896, + "grad_norm": 2.944932699203491, + "learning_rate": 1.921392195834934e-06, + "loss": 0.432, + "step": 7331 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 3.428375005722046, + "learning_rate": 1.9207853247558647e-06, + "loss": 0.3407, + "step": 7332 + }, + { + "epoch": 3.4671394799054376, + "grad_norm": 3.3732450008392334, + "learning_rate": 1.9201784897419535e-06, + "loss": 0.361, + "step": 7333 + }, + { + "epoch": 3.467612293144208, + "grad_norm": 2.8291900157928467, + "learning_rate": 1.9195716908309836e-06, + "loss": 0.3805, + "step": 7334 + }, + { + "epoch": 3.4680851063829787, + "grad_norm": 3.3229610919952393, + "learning_rate": 1.9189649280607407e-06, + "loss": 0.3756, + "step": 7335 + }, + { + "epoch": 3.4685579196217495, + "grad_norm": 2.949416160583496, + "learning_rate": 1.918358201469004e-06, + "loss": 0.4316, + "step": 7336 + }, + { + "epoch": 3.46903073286052, + "grad_norm": 3.525501251220703, + "learning_rate": 1.9177515110935515e-06, + "loss": 0.4018, + "step": 7337 + }, + { + "epoch": 3.4695035460992907, + "grad_norm": 3.1439104080200195, + "learning_rate": 1.917144856972159e-06, + "loss": 0.4176, + "step": 7338 + }, + { + "epoch": 3.4699763593380615, + "grad_norm": 3.0022377967834473, + "learning_rate": 1.9165382391426006e-06, + "loss": 0.3962, + "step": 7339 + }, + { + "epoch": 3.4704491725768323, + "grad_norm": 3.2174794673919678, + "learning_rate": 1.9159316576426482e-06, + "loss": 0.441, + "step": 7340 + }, + { + "epoch": 3.470921985815603, + "grad_norm": 2.965123414993286, + "learning_rate": 1.9153251125100694e-06, + "loss": 0.4105, + "step": 7341 + }, + { + "epoch": 3.4713947990543734, + "grad_norm": 2.722904920578003, + "learning_rate": 
1.9147186037826333e-06, + "loss": 0.4102, + "step": 7342 + }, + { + "epoch": 3.4718676122931442, + "grad_norm": 3.4894051551818848, + "learning_rate": 1.9141121314981033e-06, + "loss": 0.4225, + "step": 7343 + }, + { + "epoch": 3.472340425531915, + "grad_norm": 2.828497886657715, + "learning_rate": 1.913505695694241e-06, + "loss": 0.374, + "step": 7344 + }, + { + "epoch": 3.4728132387706854, + "grad_norm": 3.3046014308929443, + "learning_rate": 1.9128992964088077e-06, + "loss": 0.3568, + "step": 7345 + }, + { + "epoch": 3.473286052009456, + "grad_norm": 2.927281618118286, + "learning_rate": 1.9122929336795605e-06, + "loss": 0.4308, + "step": 7346 + }, + { + "epoch": 3.473758865248227, + "grad_norm": 2.9569990634918213, + "learning_rate": 1.911686607544256e-06, + "loss": 0.3226, + "step": 7347 + }, + { + "epoch": 3.4742316784869978, + "grad_norm": 3.1061038970947266, + "learning_rate": 1.9110803180406468e-06, + "loss": 0.4426, + "step": 7348 + }, + { + "epoch": 3.4747044917257686, + "grad_norm": 2.9609580039978027, + "learning_rate": 1.9104740652064825e-06, + "loss": 0.3835, + "step": 7349 + }, + { + "epoch": 3.475177304964539, + "grad_norm": 3.1547608375549316, + "learning_rate": 1.9098678490795147e-06, + "loss": 0.3814, + "step": 7350 + }, + { + "epoch": 3.4756501182033097, + "grad_norm": 2.869022846221924, + "learning_rate": 1.909261669697487e-06, + "loss": 0.4048, + "step": 7351 + }, + { + "epoch": 3.4761229314420805, + "grad_norm": 3.0565078258514404, + "learning_rate": 1.908655527098146e-06, + "loss": 0.3736, + "step": 7352 + }, + { + "epoch": 3.476595744680851, + "grad_norm": 2.893603563308716, + "learning_rate": 1.9080494213192317e-06, + "loss": 0.3906, + "step": 7353 + }, + { + "epoch": 3.4770685579196217, + "grad_norm": 2.818938732147217, + "learning_rate": 1.9074433523984844e-06, + "loss": 0.3958, + "step": 7354 + }, + { + "epoch": 3.4775413711583925, + "grad_norm": 2.675461769104004, + "learning_rate": 1.9068373203736419e-06, + "loss": 0.3371, + "step": 7355 + }, + { + "epoch": 3.4780141843971633, + "grad_norm": 2.5831551551818848, + "learning_rate": 1.9062313252824384e-06, + "loss": 0.3365, + "step": 7356 + }, + { + "epoch": 3.478486997635934, + "grad_norm": 3.299736738204956, + "learning_rate": 1.9056253671626054e-06, + "loss": 0.3923, + "step": 7357 + }, + { + "epoch": 3.4789598108747044, + "grad_norm": 2.508787155151367, + "learning_rate": 1.905019446051876e-06, + "loss": 0.3367, + "step": 7358 + }, + { + "epoch": 3.479432624113475, + "grad_norm": 2.980327606201172, + "learning_rate": 1.9044135619879753e-06, + "loss": 0.3842, + "step": 7359 + }, + { + "epoch": 3.479905437352246, + "grad_norm": 3.2114269733428955, + "learning_rate": 1.9038077150086317e-06, + "loss": 0.4625, + "step": 7360 + }, + { + "epoch": 3.4803782505910164, + "grad_norm": 3.2119715213775635, + "learning_rate": 1.9032019051515677e-06, + "loss": 0.4197, + "step": 7361 + }, + { + "epoch": 3.480851063829787, + "grad_norm": 3.2967300415039062, + "learning_rate": 1.9025961324545034e-06, + "loss": 0.4462, + "step": 7362 + }, + { + "epoch": 3.481323877068558, + "grad_norm": 3.132643461227417, + "learning_rate": 1.9019903969551589e-06, + "loss": 0.4355, + "step": 7363 + }, + { + "epoch": 3.4817966903073287, + "grad_norm": 2.9940602779388428, + "learning_rate": 1.9013846986912493e-06, + "loss": 0.3584, + "step": 7364 + }, + { + "epoch": 3.482269503546099, + "grad_norm": 2.901935577392578, + "learning_rate": 1.9007790377004907e-06, + "loss": 0.3987, + "step": 7365 + }, + { + "epoch": 3.48274231678487, + "grad_norm": 
3.143404006958008, + "learning_rate": 1.9001734140205939e-06, + "loss": 0.3642, + "step": 7366 + }, + { + "epoch": 3.4832151300236407, + "grad_norm": 3.6701388359069824, + "learning_rate": 1.899567827689267e-06, + "loss": 0.4111, + "step": 7367 + }, + { + "epoch": 3.4836879432624115, + "grad_norm": 2.6821651458740234, + "learning_rate": 1.8989622787442202e-06, + "loss": 0.3827, + "step": 7368 + }, + { + "epoch": 3.484160756501182, + "grad_norm": 3.3916640281677246, + "learning_rate": 1.8983567672231562e-06, + "loss": 0.4214, + "step": 7369 + }, + { + "epoch": 3.4846335697399526, + "grad_norm": 3.3192927837371826, + "learning_rate": 1.8977512931637788e-06, + "loss": 0.4375, + "step": 7370 + }, + { + "epoch": 3.4851063829787234, + "grad_norm": 3.177884578704834, + "learning_rate": 1.8971458566037877e-06, + "loss": 0.3944, + "step": 7371 + }, + { + "epoch": 3.4855791962174942, + "grad_norm": 3.1554532051086426, + "learning_rate": 1.896540457580879e-06, + "loss": 0.3712, + "step": 7372 + }, + { + "epoch": 3.4860520094562646, + "grad_norm": 2.7773821353912354, + "learning_rate": 1.8959350961327516e-06, + "loss": 0.3534, + "step": 7373 + }, + { + "epoch": 3.4865248226950354, + "grad_norm": 2.4792168140411377, + "learning_rate": 1.8953297722970956e-06, + "loss": 0.3571, + "step": 7374 + }, + { + "epoch": 3.486997635933806, + "grad_norm": 2.676361083984375, + "learning_rate": 1.8947244861116044e-06, + "loss": 0.3365, + "step": 7375 + }, + { + "epoch": 3.487470449172577, + "grad_norm": 2.9340765476226807, + "learning_rate": 1.8941192376139655e-06, + "loss": 0.4656, + "step": 7376 + }, + { + "epoch": 3.4879432624113473, + "grad_norm": 3.7924742698669434, + "learning_rate": 1.8935140268418646e-06, + "loss": 0.3639, + "step": 7377 + }, + { + "epoch": 3.488416075650118, + "grad_norm": 2.798912286758423, + "learning_rate": 1.892908853832986e-06, + "loss": 0.3741, + "step": 7378 + }, + { + "epoch": 3.488888888888889, + "grad_norm": 3.1731197834014893, + "learning_rate": 1.8923037186250112e-06, + "loss": 0.4041, + "step": 7379 + }, + { + "epoch": 3.4893617021276597, + "grad_norm": 2.893725633621216, + "learning_rate": 1.8916986212556182e-06, + "loss": 0.3103, + "step": 7380 + }, + { + "epoch": 3.48983451536643, + "grad_norm": 3.2489001750946045, + "learning_rate": 1.891093561762486e-06, + "loss": 0.328, + "step": 7381 + }, + { + "epoch": 3.490307328605201, + "grad_norm": 2.8076415061950684, + "learning_rate": 1.8904885401832862e-06, + "loss": 0.426, + "step": 7382 + }, + { + "epoch": 3.4907801418439717, + "grad_norm": 3.076544761657715, + "learning_rate": 1.8898835565556938e-06, + "loss": 0.3664, + "step": 7383 + }, + { + "epoch": 3.4912529550827425, + "grad_norm": 2.7615935802459717, + "learning_rate": 1.8892786109173769e-06, + "loss": 0.3718, + "step": 7384 + }, + { + "epoch": 3.491725768321513, + "grad_norm": 2.9050116539001465, + "learning_rate": 1.8886737033060023e-06, + "loss": 0.3456, + "step": 7385 + }, + { + "epoch": 3.4921985815602836, + "grad_norm": 2.4928293228149414, + "learning_rate": 1.8880688337592366e-06, + "loss": 0.3487, + "step": 7386 + }, + { + "epoch": 3.4926713947990544, + "grad_norm": 2.773418426513672, + "learning_rate": 1.88746400231474e-06, + "loss": 0.3771, + "step": 7387 + }, + { + "epoch": 3.493144208037825, + "grad_norm": 2.7137296199798584, + "learning_rate": 1.886859209010175e-06, + "loss": 0.376, + "step": 7388 + }, + { + "epoch": 3.4936170212765956, + "grad_norm": 3.327976942062378, + "learning_rate": 1.886254453883199e-06, + "loss": 0.3481, + "step": 7389 + }, + { + 
"epoch": 3.4940898345153664, + "grad_norm": 3.8637235164642334, + "learning_rate": 1.8856497369714655e-06, + "loss": 0.3726, + "step": 7390 + }, + { + "epoch": 3.494562647754137, + "grad_norm": 3.1517951488494873, + "learning_rate": 1.88504505831263e-06, + "loss": 0.4459, + "step": 7391 + }, + { + "epoch": 3.495035460992908, + "grad_norm": 3.160130262374878, + "learning_rate": 1.884440417944342e-06, + "loss": 0.3918, + "step": 7392 + }, + { + "epoch": 3.4955082742316783, + "grad_norm": 2.6518726348876953, + "learning_rate": 1.8838358159042503e-06, + "loss": 0.3493, + "step": 7393 + }, + { + "epoch": 3.495981087470449, + "grad_norm": 2.7487380504608154, + "learning_rate": 1.8832312522300009e-06, + "loss": 0.3846, + "step": 7394 + }, + { + "epoch": 3.49645390070922, + "grad_norm": 3.062293291091919, + "learning_rate": 1.8826267269592355e-06, + "loss": 0.3792, + "step": 7395 + }, + { + "epoch": 3.4969267139479907, + "grad_norm": 3.3636794090270996, + "learning_rate": 1.8820222401295979e-06, + "loss": 0.4504, + "step": 7396 + }, + { + "epoch": 3.497399527186761, + "grad_norm": 3.230196237564087, + "learning_rate": 1.8814177917787246e-06, + "loss": 0.3953, + "step": 7397 + }, + { + "epoch": 3.497872340425532, + "grad_norm": 2.891002893447876, + "learning_rate": 1.8808133819442541e-06, + "loss": 0.3923, + "step": 7398 + }, + { + "epoch": 3.4983451536643027, + "grad_norm": 2.7478551864624023, + "learning_rate": 1.8802090106638196e-06, + "loss": 0.4115, + "step": 7399 + }, + { + "epoch": 3.4988179669030735, + "grad_norm": 3.0452797412872314, + "learning_rate": 1.8796046779750515e-06, + "loss": 0.4154, + "step": 7400 + }, + { + "epoch": 3.499290780141844, + "grad_norm": 3.0759124755859375, + "learning_rate": 1.87900038391558e-06, + "loss": 0.4277, + "step": 7401 + }, + { + "epoch": 3.4997635933806146, + "grad_norm": 2.7563929557800293, + "learning_rate": 1.8783961285230314e-06, + "loss": 0.3896, + "step": 7402 + }, + { + "epoch": 3.5002364066193854, + "grad_norm": 2.661916494369507, + "learning_rate": 1.87779191183503e-06, + "loss": 0.3625, + "step": 7403 + }, + { + "epoch": 3.500709219858156, + "grad_norm": 2.881241798400879, + "learning_rate": 1.877187733889199e-06, + "loss": 0.3724, + "step": 7404 + }, + { + "epoch": 3.5011820330969265, + "grad_norm": 3.2405693531036377, + "learning_rate": 1.8765835947231554e-06, + "loss": 0.3974, + "step": 7405 + }, + { + "epoch": 3.5016548463356973, + "grad_norm": 2.924288034439087, + "learning_rate": 1.8759794943745184e-06, + "loss": 0.3467, + "step": 7406 + }, + { + "epoch": 3.502127659574468, + "grad_norm": 3.031663656234741, + "learning_rate": 1.8753754328809027e-06, + "loss": 0.3995, + "step": 7407 + }, + { + "epoch": 3.5026004728132385, + "grad_norm": 3.028277635574341, + "learning_rate": 1.874771410279919e-06, + "loss": 0.3741, + "step": 7408 + }, + { + "epoch": 3.5030732860520093, + "grad_norm": 3.0211644172668457, + "learning_rate": 1.8741674266091782e-06, + "loss": 0.4018, + "step": 7409 + }, + { + "epoch": 3.50354609929078, + "grad_norm": 2.732234239578247, + "learning_rate": 1.8735634819062875e-06, + "loss": 0.313, + "step": 7410 + }, + { + "epoch": 3.504018912529551, + "grad_norm": 3.139596939086914, + "learning_rate": 1.8729595762088525e-06, + "loss": 0.4112, + "step": 7411 + }, + { + "epoch": 3.5044917257683217, + "grad_norm": 2.894230365753174, + "learning_rate": 1.8723557095544754e-06, + "loss": 0.3891, + "step": 7412 + }, + { + "epoch": 3.504964539007092, + "grad_norm": 2.850205659866333, + "learning_rate": 1.8717518819807547e-06, + "loss": 
0.424, + "step": 7413 + }, + { + "epoch": 3.505437352245863, + "grad_norm": 3.047736644744873, + "learning_rate": 1.8711480935252907e-06, + "loss": 0.3757, + "step": 7414 + }, + { + "epoch": 3.5059101654846336, + "grad_norm": 3.0174455642700195, + "learning_rate": 1.8705443442256772e-06, + "loss": 0.3625, + "step": 7415 + }, + { + "epoch": 3.506382978723404, + "grad_norm": 2.840681552886963, + "learning_rate": 1.869940634119507e-06, + "loss": 0.3595, + "step": 7416 + }, + { + "epoch": 3.506855791962175, + "grad_norm": 3.067473888397217, + "learning_rate": 1.8693369632443713e-06, + "loss": 0.432, + "step": 7417 + }, + { + "epoch": 3.5073286052009456, + "grad_norm": 2.94655179977417, + "learning_rate": 1.8687333316378572e-06, + "loss": 0.4222, + "step": 7418 + }, + { + "epoch": 3.5078014184397164, + "grad_norm": 2.968548536300659, + "learning_rate": 1.868129739337551e-06, + "loss": 0.4098, + "step": 7419 + }, + { + "epoch": 3.508274231678487, + "grad_norm": 2.70094895362854, + "learning_rate": 1.867526186381034e-06, + "loss": 0.386, + "step": 7420 + }, + { + "epoch": 3.5087470449172575, + "grad_norm": 3.25897216796875, + "learning_rate": 1.8669226728058895e-06, + "loss": 0.4411, + "step": 7421 + }, + { + "epoch": 3.5092198581560283, + "grad_norm": 4.281215667724609, + "learning_rate": 1.866319198649694e-06, + "loss": 0.4011, + "step": 7422 + }, + { + "epoch": 3.509692671394799, + "grad_norm": 2.8394858837127686, + "learning_rate": 1.8657157639500223e-06, + "loss": 0.4162, + "step": 7423 + }, + { + "epoch": 3.5101654846335695, + "grad_norm": 2.732691764831543, + "learning_rate": 1.86511236874445e-06, + "loss": 0.3603, + "step": 7424 + }, + { + "epoch": 3.5106382978723403, + "grad_norm": 3.0152828693389893, + "learning_rate": 1.8645090130705463e-06, + "loss": 0.3811, + "step": 7425 + }, + { + "epoch": 3.511111111111111, + "grad_norm": 3.1762008666992188, + "learning_rate": 1.8639056969658793e-06, + "loss": 0.3985, + "step": 7426 + }, + { + "epoch": 3.511583924349882, + "grad_norm": 3.151123523712158, + "learning_rate": 1.863302420468016e-06, + "loss": 0.3582, + "step": 7427 + }, + { + "epoch": 3.5120567375886527, + "grad_norm": 2.738206386566162, + "learning_rate": 1.862699183614518e-06, + "loss": 0.3768, + "step": 7428 + }, + { + "epoch": 3.512529550827423, + "grad_norm": 3.235212564468384, + "learning_rate": 1.8620959864429487e-06, + "loss": 0.3964, + "step": 7429 + }, + { + "epoch": 3.513002364066194, + "grad_norm": 3.1113579273223877, + "learning_rate": 1.8614928289908648e-06, + "loss": 0.3979, + "step": 7430 + }, + { + "epoch": 3.5134751773049646, + "grad_norm": 2.6802520751953125, + "learning_rate": 1.860889711295822e-06, + "loss": 0.327, + "step": 7431 + }, + { + "epoch": 3.513947990543735, + "grad_norm": 2.9212403297424316, + "learning_rate": 1.860286633395375e-06, + "loss": 0.4104, + "step": 7432 + }, + { + "epoch": 3.5144208037825058, + "grad_norm": 2.868861198425293, + "learning_rate": 1.8596835953270742e-06, + "loss": 0.383, + "step": 7433 + }, + { + "epoch": 3.5148936170212766, + "grad_norm": 2.831655740737915, + "learning_rate": 1.8590805971284686e-06, + "loss": 0.3615, + "step": 7434 + }, + { + "epoch": 3.5153664302600474, + "grad_norm": 3.1540114879608154, + "learning_rate": 1.8584776388371039e-06, + "loss": 0.3914, + "step": 7435 + }, + { + "epoch": 3.515839243498818, + "grad_norm": 3.22031307220459, + "learning_rate": 1.8578747204905223e-06, + "loss": 0.4358, + "step": 7436 + }, + { + "epoch": 3.5163120567375885, + "grad_norm": 3.2922887802124023, + "learning_rate": 
1.8572718421262677e-06, + "loss": 0.3894, + "step": 7437 + }, + { + "epoch": 3.5167848699763593, + "grad_norm": 2.936475992202759, + "learning_rate": 1.856669003781876e-06, + "loss": 0.3748, + "step": 7438 + }, + { + "epoch": 3.51725768321513, + "grad_norm": 3.4542860984802246, + "learning_rate": 1.8560662054948856e-06, + "loss": 0.3362, + "step": 7439 + }, + { + "epoch": 3.5177304964539005, + "grad_norm": 3.1532278060913086, + "learning_rate": 1.8554634473028288e-06, + "loss": 0.411, + "step": 7440 + }, + { + "epoch": 3.5182033096926713, + "grad_norm": 3.1678943634033203, + "learning_rate": 1.854860729243237e-06, + "loss": 0.4357, + "step": 7441 + }, + { + "epoch": 3.518676122931442, + "grad_norm": 2.608930826187134, + "learning_rate": 1.8542580513536385e-06, + "loss": 0.3851, + "step": 7442 + }, + { + "epoch": 3.519148936170213, + "grad_norm": 3.127915143966675, + "learning_rate": 1.853655413671559e-06, + "loss": 0.4227, + "step": 7443 + }, + { + "epoch": 3.5196217494089836, + "grad_norm": 3.0593245029449463, + "learning_rate": 1.8530528162345238e-06, + "loss": 0.4315, + "step": 7444 + }, + { + "epoch": 3.520094562647754, + "grad_norm": 2.7818729877471924, + "learning_rate": 1.852450259080053e-06, + "loss": 0.4018, + "step": 7445 + }, + { + "epoch": 3.520567375886525, + "grad_norm": 3.2635445594787598, + "learning_rate": 1.8518477422456639e-06, + "loss": 0.415, + "step": 7446 + }, + { + "epoch": 3.5210401891252956, + "grad_norm": 2.5713813304901123, + "learning_rate": 1.851245265768875e-06, + "loss": 0.3309, + "step": 7447 + }, + { + "epoch": 3.521513002364066, + "grad_norm": 2.6778969764709473, + "learning_rate": 1.8506428296871982e-06, + "loss": 0.3106, + "step": 7448 + }, + { + "epoch": 3.5219858156028367, + "grad_norm": 2.901095390319824, + "learning_rate": 1.8500404340381455e-06, + "loss": 0.3729, + "step": 7449 + }, + { + "epoch": 3.5224586288416075, + "grad_norm": 3.1000046730041504, + "learning_rate": 1.849438078859225e-06, + "loss": 0.438, + "step": 7450 + }, + { + "epoch": 3.5229314420803783, + "grad_norm": 2.901890993118286, + "learning_rate": 1.8488357641879417e-06, + "loss": 0.3934, + "step": 7451 + }, + { + "epoch": 3.523404255319149, + "grad_norm": 3.2212157249450684, + "learning_rate": 1.8482334900618009e-06, + "loss": 0.4359, + "step": 7452 + }, + { + "epoch": 3.5238770685579195, + "grad_norm": 3.3780901432037354, + "learning_rate": 1.847631256518303e-06, + "loss": 0.4022, + "step": 7453 + }, + { + "epoch": 3.5243498817966903, + "grad_norm": 2.9996445178985596, + "learning_rate": 1.847029063594945e-06, + "loss": 0.3989, + "step": 7454 + }, + { + "epoch": 3.524822695035461, + "grad_norm": 2.8581080436706543, + "learning_rate": 1.8464269113292255e-06, + "loss": 0.3401, + "step": 7455 + }, + { + "epoch": 3.5252955082742314, + "grad_norm": 2.9551661014556885, + "learning_rate": 1.8458247997586354e-06, + "loss": 0.4556, + "step": 7456 + }, + { + "epoch": 3.5257683215130022, + "grad_norm": 2.9672555923461914, + "learning_rate": 1.8452227289206672e-06, + "loss": 0.3575, + "step": 7457 + }, + { + "epoch": 3.526241134751773, + "grad_norm": 3.226273536682129, + "learning_rate": 1.8446206988528087e-06, + "loss": 0.3769, + "step": 7458 + }, + { + "epoch": 3.526713947990544, + "grad_norm": 2.994356155395508, + "learning_rate": 1.8440187095925443e-06, + "loss": 0.3653, + "step": 7459 + }, + { + "epoch": 3.5271867612293146, + "grad_norm": 2.489049196243286, + "learning_rate": 1.8434167611773595e-06, + "loss": 0.3454, + "step": 7460 + }, + { + "epoch": 3.527659574468085, + "grad_norm": 
2.7897472381591797, + "learning_rate": 1.8428148536447333e-06, + "loss": 0.3526, + "step": 7461 + }, + { + "epoch": 3.5281323877068558, + "grad_norm": 2.947746992111206, + "learning_rate": 1.842212987032145e-06, + "loss": 0.3542, + "step": 7462 + }, + { + "epoch": 3.5286052009456266, + "grad_norm": 2.9303736686706543, + "learning_rate": 1.84161116137707e-06, + "loss": 0.3618, + "step": 7463 + }, + { + "epoch": 3.529078014184397, + "grad_norm": 2.81052827835083, + "learning_rate": 1.8410093767169807e-06, + "loss": 0.3833, + "step": 7464 + }, + { + "epoch": 3.5295508274231677, + "grad_norm": 3.4084126949310303, + "learning_rate": 1.840407633089348e-06, + "loss": 0.3868, + "step": 7465 + }, + { + "epoch": 3.5300236406619385, + "grad_norm": 2.8372802734375, + "learning_rate": 1.839805930531639e-06, + "loss": 0.3407, + "step": 7466 + }, + { + "epoch": 3.5304964539007093, + "grad_norm": 2.9218525886535645, + "learning_rate": 1.8392042690813205e-06, + "loss": 0.3772, + "step": 7467 + }, + { + "epoch": 3.53096926713948, + "grad_norm": 3.425274610519409, + "learning_rate": 1.8386026487758552e-06, + "loss": 0.3996, + "step": 7468 + }, + { + "epoch": 3.5314420803782505, + "grad_norm": 3.027423858642578, + "learning_rate": 1.8380010696527015e-06, + "loss": 0.3752, + "step": 7469 + }, + { + "epoch": 3.5319148936170213, + "grad_norm": 2.974896192550659, + "learning_rate": 1.8373995317493193e-06, + "loss": 0.3657, + "step": 7470 + }, + { + "epoch": 3.532387706855792, + "grad_norm": 2.837458610534668, + "learning_rate": 1.8367980351031628e-06, + "loss": 0.3949, + "step": 7471 + }, + { + "epoch": 3.5328605200945624, + "grad_norm": 2.8257288932800293, + "learning_rate": 1.8361965797516844e-06, + "loss": 0.3253, + "step": 7472 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 2.8278095722198486, + "learning_rate": 1.8355951657323351e-06, + "loss": 0.3588, + "step": 7473 + }, + { + "epoch": 3.533806146572104, + "grad_norm": 2.641160249710083, + "learning_rate": 1.8349937930825601e-06, + "loss": 0.3423, + "step": 7474 + }, + { + "epoch": 3.534278959810875, + "grad_norm": 2.6909263134002686, + "learning_rate": 1.8343924618398065e-06, + "loss": 0.3973, + "step": 7475 + }, + { + "epoch": 3.5347517730496456, + "grad_norm": 3.0727429389953613, + "learning_rate": 1.8337911720415157e-06, + "loss": 0.4207, + "step": 7476 + }, + { + "epoch": 3.535224586288416, + "grad_norm": 3.218925714492798, + "learning_rate": 1.8331899237251265e-06, + "loss": 0.3955, + "step": 7477 + }, + { + "epoch": 3.5356973995271868, + "grad_norm": 3.163914918899536, + "learning_rate": 1.832588716928078e-06, + "loss": 0.4655, + "step": 7478 + }, + { + "epoch": 3.5361702127659576, + "grad_norm": 2.8622686862945557, + "learning_rate": 1.831987551687803e-06, + "loss": 0.4084, + "step": 7479 + }, + { + "epoch": 3.536643026004728, + "grad_norm": 2.8534188270568848, + "learning_rate": 1.831386428041734e-06, + "loss": 0.4144, + "step": 7480 + }, + { + "epoch": 3.5371158392434987, + "grad_norm": 2.8138554096221924, + "learning_rate": 1.8307853460273008e-06, + "loss": 0.3835, + "step": 7481 + }, + { + "epoch": 3.5375886524822695, + "grad_norm": 3.061960458755493, + "learning_rate": 1.830184305681929e-06, + "loss": 0.4128, + "step": 7482 + }, + { + "epoch": 3.5380614657210403, + "grad_norm": 2.8524835109710693, + "learning_rate": 1.8295833070430444e-06, + "loss": 0.3372, + "step": 7483 + }, + { + "epoch": 3.538534278959811, + "grad_norm": 3.2567028999328613, + "learning_rate": 1.8289823501480663e-06, + "loss": 0.4533, + "step": 7484 + }, + { + 
"epoch": 3.5390070921985815, + "grad_norm": 2.945634603500366, + "learning_rate": 1.8283814350344158e-06, + "loss": 0.3565, + "step": 7485 + }, + { + "epoch": 3.5394799054373522, + "grad_norm": 2.903287649154663, + "learning_rate": 1.8277805617395089e-06, + "loss": 0.349, + "step": 7486 + }, + { + "epoch": 3.539952718676123, + "grad_norm": 3.249272584915161, + "learning_rate": 1.827179730300757e-06, + "loss": 0.4076, + "step": 7487 + }, + { + "epoch": 3.5404255319148934, + "grad_norm": 2.9591739177703857, + "learning_rate": 1.8265789407555748e-06, + "loss": 0.3439, + "step": 7488 + }, + { + "epoch": 3.540898345153664, + "grad_norm": 3.8527538776397705, + "learning_rate": 1.8259781931413683e-06, + "loss": 0.4684, + "step": 7489 + }, + { + "epoch": 3.541371158392435, + "grad_norm": 2.7392261028289795, + "learning_rate": 1.8253774874955449e-06, + "loss": 0.3494, + "step": 7490 + }, + { + "epoch": 3.541843971631206, + "grad_norm": 2.880993127822876, + "learning_rate": 1.8247768238555069e-06, + "loss": 0.3546, + "step": 7491 + }, + { + "epoch": 3.5423167848699766, + "grad_norm": 2.9944894313812256, + "learning_rate": 1.8241762022586545e-06, + "loss": 0.3594, + "step": 7492 + }, + { + "epoch": 3.542789598108747, + "grad_norm": 3.0084292888641357, + "learning_rate": 1.8235756227423878e-06, + "loss": 0.408, + "step": 7493 + }, + { + "epoch": 3.5432624113475177, + "grad_norm": 2.75227689743042, + "learning_rate": 1.8229750853440998e-06, + "loss": 0.3515, + "step": 7494 + }, + { + "epoch": 3.5437352245862885, + "grad_norm": 3.041893243789673, + "learning_rate": 1.8223745901011856e-06, + "loss": 0.401, + "step": 7495 + }, + { + "epoch": 3.544208037825059, + "grad_norm": 2.8728370666503906, + "learning_rate": 1.8217741370510345e-06, + "loss": 0.3832, + "step": 7496 + }, + { + "epoch": 3.5446808510638297, + "grad_norm": 3.095460891723633, + "learning_rate": 1.8211737262310331e-06, + "loss": 0.3086, + "step": 7497 + }, + { + "epoch": 3.5451536643026005, + "grad_norm": 3.1869826316833496, + "learning_rate": 1.8205733576785678e-06, + "loss": 0.3666, + "step": 7498 + }, + { + "epoch": 3.5456264775413713, + "grad_norm": 3.307560443878174, + "learning_rate": 1.8199730314310204e-06, + "loss": 0.4489, + "step": 7499 + }, + { + "epoch": 3.546099290780142, + "grad_norm": 2.9531142711639404, + "learning_rate": 1.8193727475257697e-06, + "loss": 0.4017, + "step": 7500 + }, + { + "epoch": 3.5465721040189124, + "grad_norm": 3.2969162464141846, + "learning_rate": 1.8187725060001942e-06, + "loss": 0.4179, + "step": 7501 + }, + { + "epoch": 3.5470449172576832, + "grad_norm": 2.9434688091278076, + "learning_rate": 1.818172306891667e-06, + "loss": 0.3562, + "step": 7502 + }, + { + "epoch": 3.547517730496454, + "grad_norm": 3.070732355117798, + "learning_rate": 1.8175721502375616e-06, + "loss": 0.393, + "step": 7503 + }, + { + "epoch": 3.5479905437352244, + "grad_norm": 2.970898389816284, + "learning_rate": 1.8169720360752457e-06, + "loss": 0.4448, + "step": 7504 + }, + { + "epoch": 3.548463356973995, + "grad_norm": 2.7050931453704834, + "learning_rate": 1.8163719644420858e-06, + "loss": 0.3562, + "step": 7505 + }, + { + "epoch": 3.548936170212766, + "grad_norm": 3.0073063373565674, + "learning_rate": 1.8157719353754467e-06, + "loss": 0.4429, + "step": 7506 + }, + { + "epoch": 3.5494089834515368, + "grad_norm": 3.0240445137023926, + "learning_rate": 1.8151719489126874e-06, + "loss": 0.4073, + "step": 7507 + }, + { + "epoch": 3.5498817966903076, + "grad_norm": 3.558763265609741, + "learning_rate": 1.8145720050911695e-06, + 
"loss": 0.4025, + "step": 7508 + }, + { + "epoch": 3.550354609929078, + "grad_norm": 3.637258768081665, + "learning_rate": 1.8139721039482473e-06, + "loss": 0.5074, + "step": 7509 + }, + { + "epoch": 3.5508274231678487, + "grad_norm": 2.804719924926758, + "learning_rate": 1.8133722455212726e-06, + "loss": 0.3727, + "step": 7510 + }, + { + "epoch": 3.5513002364066195, + "grad_norm": 2.9034759998321533, + "learning_rate": 1.8127724298475984e-06, + "loss": 0.3642, + "step": 7511 + }, + { + "epoch": 3.55177304964539, + "grad_norm": 3.1304872035980225, + "learning_rate": 1.8121726569645714e-06, + "loss": 0.434, + "step": 7512 + }, + { + "epoch": 3.5522458628841607, + "grad_norm": 3.019956111907959, + "learning_rate": 1.8115729269095378e-06, + "loss": 0.4235, + "step": 7513 + }, + { + "epoch": 3.5527186761229315, + "grad_norm": 2.7984633445739746, + "learning_rate": 1.810973239719839e-06, + "loss": 0.3344, + "step": 7514 + }, + { + "epoch": 3.5531914893617023, + "grad_norm": 2.839709997177124, + "learning_rate": 1.8103735954328145e-06, + "loss": 0.3708, + "step": 7515 + }, + { + "epoch": 3.553664302600473, + "grad_norm": 2.766819477081299, + "learning_rate": 1.809773994085803e-06, + "loss": 0.3402, + "step": 7516 + }, + { + "epoch": 3.5541371158392434, + "grad_norm": 2.707942247390747, + "learning_rate": 1.8091744357161372e-06, + "loss": 0.4327, + "step": 7517 + }, + { + "epoch": 3.554609929078014, + "grad_norm": 3.512702465057373, + "learning_rate": 1.8085749203611516e-06, + "loss": 0.3965, + "step": 7518 + }, + { + "epoch": 3.555082742316785, + "grad_norm": 2.717024803161621, + "learning_rate": 1.8079754480581738e-06, + "loss": 0.3237, + "step": 7519 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 2.659001350402832, + "learning_rate": 1.8073760188445296e-06, + "loss": 0.3546, + "step": 7520 + }, + { + "epoch": 3.556028368794326, + "grad_norm": 2.615028142929077, + "learning_rate": 1.8067766327575445e-06, + "loss": 0.3232, + "step": 7521 + }, + { + "epoch": 3.556501182033097, + "grad_norm": 2.659428119659424, + "learning_rate": 1.8061772898345386e-06, + "loss": 0.3769, + "step": 7522 + }, + { + "epoch": 3.5569739952718678, + "grad_norm": 3.142369270324707, + "learning_rate": 1.8055779901128296e-06, + "loss": 0.4292, + "step": 7523 + }, + { + "epoch": 3.5574468085106385, + "grad_norm": 3.0832736492156982, + "learning_rate": 1.8049787336297352e-06, + "loss": 0.3871, + "step": 7524 + }, + { + "epoch": 3.557919621749409, + "grad_norm": 2.778411865234375, + "learning_rate": 1.8043795204225664e-06, + "loss": 0.3938, + "step": 7525 + }, + { + "epoch": 3.5583924349881797, + "grad_norm": 3.1651480197906494, + "learning_rate": 1.8037803505286355e-06, + "loss": 0.3315, + "step": 7526 + }, + { + "epoch": 3.5588652482269505, + "grad_norm": 3.266508102416992, + "learning_rate": 1.8031812239852498e-06, + "loss": 0.4156, + "step": 7527 + }, + { + "epoch": 3.559338061465721, + "grad_norm": 3.1345436573028564, + "learning_rate": 1.8025821408297127e-06, + "loss": 0.3813, + "step": 7528 + }, + { + "epoch": 3.5598108747044916, + "grad_norm": 3.1535425186157227, + "learning_rate": 1.8019831010993289e-06, + "loss": 0.3897, + "step": 7529 + }, + { + "epoch": 3.5602836879432624, + "grad_norm": 3.0934345722198486, + "learning_rate": 1.8013841048313952e-06, + "loss": 0.4074, + "step": 7530 + }, + { + "epoch": 3.5607565011820332, + "grad_norm": 3.224876642227173, + "learning_rate": 1.8007851520632108e-06, + "loss": 0.3969, + "step": 7531 + }, + { + "epoch": 3.561229314420804, + "grad_norm": 3.082303285598755, + 
"learning_rate": 1.8001862428320693e-06, + "loss": 0.3559, + "step": 7532 + }, + { + "epoch": 3.5617021276595744, + "grad_norm": 3.5289969444274902, + "learning_rate": 1.7995873771752608e-06, + "loss": 0.3961, + "step": 7533 + }, + { + "epoch": 3.562174940898345, + "grad_norm": 3.1893370151519775, + "learning_rate": 1.7989885551300762e-06, + "loss": 0.3721, + "step": 7534 + }, + { + "epoch": 3.562647754137116, + "grad_norm": 2.6911089420318604, + "learning_rate": 1.7983897767337999e-06, + "loss": 0.3801, + "step": 7535 + }, + { + "epoch": 3.5631205673758863, + "grad_norm": 3.0837483406066895, + "learning_rate": 1.797791042023716e-06, + "loss": 0.3886, + "step": 7536 + }, + { + "epoch": 3.563593380614657, + "grad_norm": 2.973459005355835, + "learning_rate": 1.7971923510371054e-06, + "loss": 0.438, + "step": 7537 + }, + { + "epoch": 3.564066193853428, + "grad_norm": 3.1537392139434814, + "learning_rate": 1.7965937038112435e-06, + "loss": 0.4022, + "step": 7538 + }, + { + "epoch": 3.5645390070921987, + "grad_norm": 3.2339680194854736, + "learning_rate": 1.795995100383409e-06, + "loss": 0.3883, + "step": 7539 + }, + { + "epoch": 3.5650118203309695, + "grad_norm": 2.5029079914093018, + "learning_rate": 1.7953965407908714e-06, + "loss": 0.3522, + "step": 7540 + }, + { + "epoch": 3.56548463356974, + "grad_norm": 3.1560211181640625, + "learning_rate": 1.7947980250709027e-06, + "loss": 0.4024, + "step": 7541 + }, + { + "epoch": 3.5659574468085107, + "grad_norm": 2.950477361679077, + "learning_rate": 1.7941995532607687e-06, + "loss": 0.3598, + "step": 7542 + }, + { + "epoch": 3.5664302600472815, + "grad_norm": 3.1263279914855957, + "learning_rate": 1.793601125397733e-06, + "loss": 0.3535, + "step": 7543 + }, + { + "epoch": 3.566903073286052, + "grad_norm": 2.986631393432617, + "learning_rate": 1.7930027415190587e-06, + "loss": 0.4251, + "step": 7544 + }, + { + "epoch": 3.5673758865248226, + "grad_norm": 2.6882247924804688, + "learning_rate": 1.7924044016620022e-06, + "loss": 0.3584, + "step": 7545 + }, + { + "epoch": 3.5678486997635934, + "grad_norm": 2.9358696937561035, + "learning_rate": 1.791806105863822e-06, + "loss": 0.3671, + "step": 7546 + }, + { + "epoch": 3.568321513002364, + "grad_norm": 2.774198055267334, + "learning_rate": 1.7912078541617704e-06, + "loss": 0.3505, + "step": 7547 + }, + { + "epoch": 3.568794326241135, + "grad_norm": 2.7384231090545654, + "learning_rate": 1.7906096465930964e-06, + "loss": 0.3992, + "step": 7548 + }, + { + "epoch": 3.5692671394799054, + "grad_norm": 2.8625354766845703, + "learning_rate": 1.7900114831950506e-06, + "loss": 0.3858, + "step": 7549 + }, + { + "epoch": 3.569739952718676, + "grad_norm": 2.737884044647217, + "learning_rate": 1.7894133640048761e-06, + "loss": 0.3973, + "step": 7550 + }, + { + "epoch": 3.570212765957447, + "grad_norm": 2.9817614555358887, + "learning_rate": 1.7888152890598154e-06, + "loss": 0.3613, + "step": 7551 + }, + { + "epoch": 3.5706855791962173, + "grad_norm": 2.760956287384033, + "learning_rate": 1.7882172583971081e-06, + "loss": 0.3645, + "step": 7552 + }, + { + "epoch": 3.571158392434988, + "grad_norm": 2.6867735385894775, + "learning_rate": 1.7876192720539908e-06, + "loss": 0.3771, + "step": 7553 + }, + { + "epoch": 3.571631205673759, + "grad_norm": 3.3362443447113037, + "learning_rate": 1.7870213300676986e-06, + "loss": 0.3989, + "step": 7554 + }, + { + "epoch": 3.5721040189125297, + "grad_norm": 2.8359227180480957, + "learning_rate": 1.7864234324754617e-06, + "loss": 0.3645, + "step": 7555 + }, + { + "epoch": 
3.5725768321513005, + "grad_norm": 3.3070647716522217, + "learning_rate": 1.7858255793145076e-06, + "loss": 0.4128, + "step": 7556 + }, + { + "epoch": 3.573049645390071, + "grad_norm": 2.544879913330078, + "learning_rate": 1.7852277706220644e-06, + "loss": 0.3779, + "step": 7557 + }, + { + "epoch": 3.5735224586288417, + "grad_norm": 2.890796661376953, + "learning_rate": 1.7846300064353525e-06, + "loss": 0.373, + "step": 7558 + }, + { + "epoch": 3.5739952718676125, + "grad_norm": 2.9703400135040283, + "learning_rate": 1.7840322867915944e-06, + "loss": 0.3619, + "step": 7559 + }, + { + "epoch": 3.574468085106383, + "grad_norm": 3.0122430324554443, + "learning_rate": 1.7834346117280066e-06, + "loss": 0.4079, + "step": 7560 + }, + { + "epoch": 3.5749408983451536, + "grad_norm": 2.904963493347168, + "learning_rate": 1.7828369812818025e-06, + "loss": 0.38, + "step": 7561 + }, + { + "epoch": 3.5754137115839244, + "grad_norm": 3.0917439460754395, + "learning_rate": 1.7822393954901957e-06, + "loss": 0.383, + "step": 7562 + }, + { + "epoch": 3.575886524822695, + "grad_norm": 2.633920907974243, + "learning_rate": 1.7816418543903935e-06, + "loss": 0.3823, + "step": 7563 + }, + { + "epoch": 3.576359338061466, + "grad_norm": 2.9266390800476074, + "learning_rate": 1.781044358019604e-06, + "loss": 0.4642, + "step": 7564 + }, + { + "epoch": 3.5768321513002364, + "grad_norm": 2.878138780593872, + "learning_rate": 1.7804469064150299e-06, + "loss": 0.4056, + "step": 7565 + }, + { + "epoch": 3.577304964539007, + "grad_norm": 2.949370861053467, + "learning_rate": 1.7798494996138708e-06, + "loss": 0.3863, + "step": 7566 + }, + { + "epoch": 3.5777777777777775, + "grad_norm": 3.1444685459136963, + "learning_rate": 1.7792521376533264e-06, + "loss": 0.3611, + "step": 7567 + }, + { + "epoch": 3.5782505910165483, + "grad_norm": 3.0719716548919678, + "learning_rate": 1.7786548205705906e-06, + "loss": 0.3866, + "step": 7568 + }, + { + "epoch": 3.578723404255319, + "grad_norm": 3.155343770980835, + "learning_rate": 1.7780575484028566e-06, + "loss": 0.3896, + "step": 7569 + }, + { + "epoch": 3.57919621749409, + "grad_norm": 3.0447211265563965, + "learning_rate": 1.7774603211873138e-06, + "loss": 0.3713, + "step": 7570 + }, + { + "epoch": 3.5796690307328607, + "grad_norm": 2.8683619499206543, + "learning_rate": 1.7768631389611471e-06, + "loss": 0.3808, + "step": 7571 + }, + { + "epoch": 3.580141843971631, + "grad_norm": 3.1548070907592773, + "learning_rate": 1.776266001761543e-06, + "loss": 0.3714, + "step": 7572 + }, + { + "epoch": 3.580614657210402, + "grad_norm": 2.8699257373809814, + "learning_rate": 1.7756689096256816e-06, + "loss": 0.3694, + "step": 7573 + }, + { + "epoch": 3.5810874704491726, + "grad_norm": 2.834714412689209, + "learning_rate": 1.7750718625907398e-06, + "loss": 0.3935, + "step": 7574 + }, + { + "epoch": 3.581560283687943, + "grad_norm": 3.3828539848327637, + "learning_rate": 1.7744748606938957e-06, + "loss": 0.4783, + "step": 7575 + }, + { + "epoch": 3.582033096926714, + "grad_norm": 3.3892476558685303, + "learning_rate": 1.7738779039723202e-06, + "loss": 0.41, + "step": 7576 + }, + { + "epoch": 3.5825059101654846, + "grad_norm": 3.014289379119873, + "learning_rate": 1.7732809924631842e-06, + "loss": 0.3516, + "step": 7577 + }, + { + "epoch": 3.5829787234042554, + "grad_norm": 3.477212429046631, + "learning_rate": 1.772684126203654e-06, + "loss": 0.4144, + "step": 7578 + }, + { + "epoch": 3.583451536643026, + "grad_norm": 2.9156792163848877, + "learning_rate": 1.772087305230893e-06, + "loss": 
0.3772, + "step": 7579 + }, + { + "epoch": 3.5839243498817965, + "grad_norm": 2.639169931411743, + "learning_rate": 1.7714905295820651e-06, + "loss": 0.3487, + "step": 7580 + }, + { + "epoch": 3.5843971631205673, + "grad_norm": 3.196894407272339, + "learning_rate": 1.7708937992943263e-06, + "loss": 0.4852, + "step": 7581 + }, + { + "epoch": 3.584869976359338, + "grad_norm": 2.9140779972076416, + "learning_rate": 1.7702971144048347e-06, + "loss": 0.3703, + "step": 7582 + }, + { + "epoch": 3.5853427895981085, + "grad_norm": 3.3844895362854004, + "learning_rate": 1.7697004749507418e-06, + "loss": 0.4227, + "step": 7583 + }, + { + "epoch": 3.5858156028368793, + "grad_norm": 3.080061912536621, + "learning_rate": 1.769103880969198e-06, + "loss": 0.4237, + "step": 7584 + }, + { + "epoch": 3.58628841607565, + "grad_norm": 3.037505865097046, + "learning_rate": 1.7685073324973506e-06, + "loss": 0.3902, + "step": 7585 + }, + { + "epoch": 3.586761229314421, + "grad_norm": 3.6563873291015625, + "learning_rate": 1.7679108295723436e-06, + "loss": 0.3956, + "step": 7586 + }, + { + "epoch": 3.5872340425531917, + "grad_norm": 3.158935546875, + "learning_rate": 1.76731437223132e-06, + "loss": 0.3898, + "step": 7587 + }, + { + "epoch": 3.587706855791962, + "grad_norm": 3.059199571609497, + "learning_rate": 1.7667179605114176e-06, + "loss": 0.4183, + "step": 7588 + }, + { + "epoch": 3.588179669030733, + "grad_norm": 2.8123233318328857, + "learning_rate": 1.7661215944497716e-06, + "loss": 0.3731, + "step": 7589 + }, + { + "epoch": 3.5886524822695036, + "grad_norm": 3.094287633895874, + "learning_rate": 1.7655252740835169e-06, + "loss": 0.4562, + "step": 7590 + }, + { + "epoch": 3.589125295508274, + "grad_norm": 2.886833667755127, + "learning_rate": 1.7649289994497822e-06, + "loss": 0.4178, + "step": 7591 + }, + { + "epoch": 3.5895981087470448, + "grad_norm": 3.3040647506713867, + "learning_rate": 1.764332770585696e-06, + "loss": 0.4311, + "step": 7592 + }, + { + "epoch": 3.5900709219858156, + "grad_norm": 2.7948951721191406, + "learning_rate": 1.7637365875283827e-06, + "loss": 0.3704, + "step": 7593 + }, + { + "epoch": 3.5905437352245864, + "grad_norm": 3.092221975326538, + "learning_rate": 1.7631404503149623e-06, + "loss": 0.4166, + "step": 7594 + }, + { + "epoch": 3.591016548463357, + "grad_norm": 3.6018600463867188, + "learning_rate": 1.7625443589825564e-06, + "loss": 0.4251, + "step": 7595 + }, + { + "epoch": 3.5914893617021275, + "grad_norm": 2.708017110824585, + "learning_rate": 1.7619483135682791e-06, + "loss": 0.3775, + "step": 7596 + }, + { + "epoch": 3.5919621749408983, + "grad_norm": 2.8069381713867188, + "learning_rate": 1.7613523141092438e-06, + "loss": 0.3929, + "step": 7597 + }, + { + "epoch": 3.592434988179669, + "grad_norm": 3.097787380218506, + "learning_rate": 1.7607563606425616e-06, + "loss": 0.3992, + "step": 7598 + }, + { + "epoch": 3.5929078014184395, + "grad_norm": 2.9691715240478516, + "learning_rate": 1.7601604532053385e-06, + "loss": 0.4001, + "step": 7599 + }, + { + "epoch": 3.5933806146572103, + "grad_norm": 2.5511624813079834, + "learning_rate": 1.7595645918346807e-06, + "loss": 0.3136, + "step": 7600 + }, + { + "epoch": 3.593853427895981, + "grad_norm": 2.4688427448272705, + "learning_rate": 1.7589687765676891e-06, + "loss": 0.3922, + "step": 7601 + }, + { + "epoch": 3.594326241134752, + "grad_norm": 3.004023790359497, + "learning_rate": 1.7583730074414613e-06, + "loss": 0.4203, + "step": 7602 + }, + { + "epoch": 3.5947990543735227, + "grad_norm": 2.902641773223877, + 
"learning_rate": 1.7577772844930957e-06, + "loss": 0.3855, + "step": 7603 + }, + { + "epoch": 3.595271867612293, + "grad_norm": 3.851375102996826, + "learning_rate": 1.7571816077596826e-06, + "loss": 0.3769, + "step": 7604 + }, + { + "epoch": 3.595744680851064, + "grad_norm": 3.03249192237854, + "learning_rate": 1.756585977278315e-06, + "loss": 0.3448, + "step": 7605 + }, + { + "epoch": 3.5962174940898346, + "grad_norm": 2.992363214492798, + "learning_rate": 1.7559903930860789e-06, + "loss": 0.3893, + "step": 7606 + }, + { + "epoch": 3.596690307328605, + "grad_norm": 2.9322855472564697, + "learning_rate": 1.7553948552200577e-06, + "loss": 0.4337, + "step": 7607 + }, + { + "epoch": 3.5971631205673757, + "grad_norm": 3.2564096450805664, + "learning_rate": 1.7547993637173347e-06, + "loss": 0.3943, + "step": 7608 + }, + { + "epoch": 3.5976359338061465, + "grad_norm": 2.9988484382629395, + "learning_rate": 1.7542039186149867e-06, + "loss": 0.3421, + "step": 7609 + }, + { + "epoch": 3.5981087470449173, + "grad_norm": 2.8188817501068115, + "learning_rate": 1.7536085199500914e-06, + "loss": 0.3657, + "step": 7610 + }, + { + "epoch": 3.598581560283688, + "grad_norm": 3.0583255290985107, + "learning_rate": 1.7530131677597206e-06, + "loss": 0.4036, + "step": 7611 + }, + { + "epoch": 3.5990543735224585, + "grad_norm": 2.8700921535491943, + "learning_rate": 1.7524178620809435e-06, + "loss": 0.3928, + "step": 7612 + }, + { + "epoch": 3.5995271867612293, + "grad_norm": 3.4497945308685303, + "learning_rate": 1.751822602950829e-06, + "loss": 0.3517, + "step": 7613 + }, + { + "epoch": 3.6, + "grad_norm": 3.334191083908081, + "learning_rate": 1.75122739040644e-06, + "loss": 0.3414, + "step": 7614 + }, + { + "epoch": 3.6004728132387704, + "grad_norm": 3.1435158252716064, + "learning_rate": 1.7506322244848387e-06, + "loss": 0.4075, + "step": 7615 + }, + { + "epoch": 3.6009456264775412, + "grad_norm": 3.178990125656128, + "learning_rate": 1.7500371052230824e-06, + "loss": 0.4688, + "step": 7616 + }, + { + "epoch": 3.601418439716312, + "grad_norm": 2.9292044639587402, + "learning_rate": 1.7494420326582267e-06, + "loss": 0.3882, + "step": 7617 + }, + { + "epoch": 3.601891252955083, + "grad_norm": 2.6899197101593018, + "learning_rate": 1.7488470068273256e-06, + "loss": 0.3916, + "step": 7618 + }, + { + "epoch": 3.6023640661938536, + "grad_norm": 2.8319191932678223, + "learning_rate": 1.7482520277674273e-06, + "loss": 0.3924, + "step": 7619 + }, + { + "epoch": 3.602836879432624, + "grad_norm": 2.74589204788208, + "learning_rate": 1.747657095515578e-06, + "loss": 0.2911, + "step": 7620 + }, + { + "epoch": 3.603309692671395, + "grad_norm": 2.857028007507324, + "learning_rate": 1.7470622101088233e-06, + "loss": 0.3618, + "step": 7621 + }, + { + "epoch": 3.6037825059101656, + "grad_norm": 3.3715617656707764, + "learning_rate": 1.746467371584203e-06, + "loss": 0.4186, + "step": 7622 + }, + { + "epoch": 3.604255319148936, + "grad_norm": 2.839526414871216, + "learning_rate": 1.745872579978755e-06, + "loss": 0.4088, + "step": 7623 + }, + { + "epoch": 3.6047281323877067, + "grad_norm": 3.7689156532287598, + "learning_rate": 1.7452778353295155e-06, + "loss": 0.4748, + "step": 7624 + }, + { + "epoch": 3.6052009456264775, + "grad_norm": 2.9345123767852783, + "learning_rate": 1.7446831376735152e-06, + "loss": 0.4117, + "step": 7625 + }, + { + "epoch": 3.6056737588652483, + "grad_norm": 2.7898924350738525, + "learning_rate": 1.7440884870477845e-06, + "loss": 0.3515, + "step": 7626 + }, + { + "epoch": 3.606146572104019, + 
"grad_norm": 3.4268569946289062, + "learning_rate": 1.7434938834893481e-06, + "loss": 0.4051, + "step": 7627 + }, + { + "epoch": 3.6066193853427895, + "grad_norm": 3.019066095352173, + "learning_rate": 1.7428993270352311e-06, + "loss": 0.4128, + "step": 7628 + }, + { + "epoch": 3.6070921985815603, + "grad_norm": 3.1277568340301514, + "learning_rate": 1.742304817722454e-06, + "loss": 0.37, + "step": 7629 + }, + { + "epoch": 3.607565011820331, + "grad_norm": 2.924818277359009, + "learning_rate": 1.7417103555880318e-06, + "loss": 0.3792, + "step": 7630 + }, + { + "epoch": 3.6080378250591014, + "grad_norm": 2.664699077606201, + "learning_rate": 1.7411159406689821e-06, + "loss": 0.3584, + "step": 7631 + }, + { + "epoch": 3.608510638297872, + "grad_norm": 3.223729133605957, + "learning_rate": 1.7405215730023144e-06, + "loss": 0.3956, + "step": 7632 + }, + { + "epoch": 3.608983451536643, + "grad_norm": 2.934225559234619, + "learning_rate": 1.7399272526250388e-06, + "loss": 0.4179, + "step": 7633 + }, + { + "epoch": 3.609456264775414, + "grad_norm": 2.833798885345459, + "learning_rate": 1.7393329795741603e-06, + "loss": 0.3283, + "step": 7634 + }, + { + "epoch": 3.6099290780141846, + "grad_norm": 3.008798837661743, + "learning_rate": 1.738738753886681e-06, + "loss": 0.3704, + "step": 7635 + }, + { + "epoch": 3.610401891252955, + "grad_norm": 2.8714520931243896, + "learning_rate": 1.7381445755996023e-06, + "loss": 0.3646, + "step": 7636 + }, + { + "epoch": 3.6108747044917258, + "grad_norm": 3.083554267883301, + "learning_rate": 1.7375504447499193e-06, + "loss": 0.3785, + "step": 7637 + }, + { + "epoch": 3.6113475177304966, + "grad_norm": 3.270347833633423, + "learning_rate": 1.7369563613746277e-06, + "loss": 0.4426, + "step": 7638 + }, + { + "epoch": 3.611820330969267, + "grad_norm": 2.7754862308502197, + "learning_rate": 1.7363623255107175e-06, + "loss": 0.3448, + "step": 7639 + }, + { + "epoch": 3.6122931442080377, + "grad_norm": 2.98140025138855, + "learning_rate": 1.7357683371951767e-06, + "loss": 0.4027, + "step": 7640 + }, + { + "epoch": 3.6127659574468085, + "grad_norm": 3.1640074253082275, + "learning_rate": 1.7351743964649908e-06, + "loss": 0.3913, + "step": 7641 + }, + { + "epoch": 3.6132387706855793, + "grad_norm": 2.758202075958252, + "learning_rate": 1.7345805033571417e-06, + "loss": 0.4148, + "step": 7642 + }, + { + "epoch": 3.61371158392435, + "grad_norm": 3.1030571460723877, + "learning_rate": 1.7339866579086074e-06, + "loss": 0.4002, + "step": 7643 + }, + { + "epoch": 3.6141843971631205, + "grad_norm": 3.2414135932922363, + "learning_rate": 1.733392860156366e-06, + "loss": 0.4732, + "step": 7644 + }, + { + "epoch": 3.6146572104018913, + "grad_norm": 2.8720390796661377, + "learning_rate": 1.7327991101373886e-06, + "loss": 0.4112, + "step": 7645 + }, + { + "epoch": 3.615130023640662, + "grad_norm": 3.0104875564575195, + "learning_rate": 1.7322054078886474e-06, + "loss": 0.3934, + "step": 7646 + }, + { + "epoch": 3.6156028368794324, + "grad_norm": 2.8615126609802246, + "learning_rate": 1.7316117534471091e-06, + "loss": 0.3437, + "step": 7647 + }, + { + "epoch": 3.616075650118203, + "grad_norm": 2.8283586502075195, + "learning_rate": 1.7310181468497369e-06, + "loss": 0.374, + "step": 7648 + }, + { + "epoch": 3.616548463356974, + "grad_norm": 3.2289321422576904, + "learning_rate": 1.7304245881334935e-06, + "loss": 0.3899, + "step": 7649 + }, + { + "epoch": 3.617021276595745, + "grad_norm": 3.126882791519165, + "learning_rate": 1.7298310773353356e-06, + "loss": 0.388, + "step": 7650 + }, 
+ { + "epoch": 3.6174940898345156, + "grad_norm": 3.013657569885254, + "learning_rate": 1.7292376144922201e-06, + "loss": 0.379, + "step": 7651 + }, + { + "epoch": 3.617966903073286, + "grad_norm": 3.070192337036133, + "learning_rate": 1.7286441996410989e-06, + "loss": 0.3801, + "step": 7652 + }, + { + "epoch": 3.6184397163120567, + "grad_norm": 2.805380344390869, + "learning_rate": 1.7280508328189199e-06, + "loss": 0.3577, + "step": 7653 + }, + { + "epoch": 3.6189125295508275, + "grad_norm": 3.2853379249572754, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.4168, + "step": 7654 + }, + { + "epoch": 3.619385342789598, + "grad_norm": 3.16316819190979, + "learning_rate": 1.7268642434091761e-06, + "loss": 0.425, + "step": 7655 + }, + { + "epoch": 3.6198581560283687, + "grad_norm": 3.2971179485321045, + "learning_rate": 1.7262710208954947e-06, + "loss": 0.3884, + "step": 7656 + }, + { + "epoch": 3.6203309692671395, + "grad_norm": 3.1823747158050537, + "learning_rate": 1.725677846558524e-06, + "loss": 0.3419, + "step": 7657 + }, + { + "epoch": 3.6208037825059103, + "grad_norm": 3.114654779434204, + "learning_rate": 1.7250847204351973e-06, + "loss": 0.3951, + "step": 7658 + }, + { + "epoch": 3.621276595744681, + "grad_norm": 3.0272440910339355, + "learning_rate": 1.7244916425624482e-06, + "loss": 0.4102, + "step": 7659 + }, + { + "epoch": 3.6217494089834514, + "grad_norm": 2.973611354827881, + "learning_rate": 1.7238986129772035e-06, + "loss": 0.3827, + "step": 7660 + }, + { + "epoch": 3.6222222222222222, + "grad_norm": 3.063713312149048, + "learning_rate": 1.7233056317163894e-06, + "loss": 0.3909, + "step": 7661 + }, + { + "epoch": 3.622695035460993, + "grad_norm": 3.203725576400757, + "learning_rate": 1.7227126988169283e-06, + "loss": 0.3933, + "step": 7662 + }, + { + "epoch": 3.6231678486997634, + "grad_norm": 2.945887327194214, + "learning_rate": 1.7221198143157386e-06, + "loss": 0.3722, + "step": 7663 + }, + { + "epoch": 3.623640661938534, + "grad_norm": 3.042691469192505, + "learning_rate": 1.7215269782497373e-06, + "loss": 0.4108, + "step": 7664 + }, + { + "epoch": 3.624113475177305, + "grad_norm": 2.8496763706207275, + "learning_rate": 1.720934190655837e-06, + "loss": 0.3867, + "step": 7665 + }, + { + "epoch": 3.6245862884160758, + "grad_norm": 2.7017154693603516, + "learning_rate": 1.7203414515709493e-06, + "loss": 0.3246, + "step": 7666 + }, + { + "epoch": 3.6250591016548466, + "grad_norm": 2.66630482673645, + "learning_rate": 1.7197487610319808e-06, + "loss": 0.365, + "step": 7667 + }, + { + "epoch": 3.625531914893617, + "grad_norm": 2.8724591732025146, + "learning_rate": 1.7191561190758348e-06, + "loss": 0.3361, + "step": 7668 + }, + { + "epoch": 3.6260047281323877, + "grad_norm": 3.1413803100585938, + "learning_rate": 1.7185635257394143e-06, + "loss": 0.3949, + "step": 7669 + }, + { + "epoch": 3.6264775413711585, + "grad_norm": 2.9866268634796143, + "learning_rate": 1.7179709810596163e-06, + "loss": 0.3728, + "step": 7670 + }, + { + "epoch": 3.626950354609929, + "grad_norm": 3.003497838973999, + "learning_rate": 1.717378485073336e-06, + "loss": 0.384, + "step": 7671 + }, + { + "epoch": 3.6274231678486997, + "grad_norm": 3.0043468475341797, + "learning_rate": 1.716786037817466e-06, + "loss": 0.3432, + "step": 7672 + }, + { + "epoch": 3.6278959810874705, + "grad_norm": 3.216550827026367, + "learning_rate": 1.7161936393288945e-06, + "loss": 0.3963, + "step": 7673 + }, + { + "epoch": 3.6283687943262413, + "grad_norm": 3.1091387271881104, + "learning_rate": 1.715601289644509e-06, 
+ "loss": 0.4347, + "step": 7674 + }, + { + "epoch": 3.628841607565012, + "grad_norm": 3.2288286685943604, + "learning_rate": 1.7150089888011916e-06, + "loss": 0.4291, + "step": 7675 + }, + { + "epoch": 3.6293144208037824, + "grad_norm": 2.943941831588745, + "learning_rate": 1.7144167368358216e-06, + "loss": 0.3643, + "step": 7676 + }, + { + "epoch": 3.629787234042553, + "grad_norm": 2.819683313369751, + "learning_rate": 1.7138245337852774e-06, + "loss": 0.4051, + "step": 7677 + }, + { + "epoch": 3.630260047281324, + "grad_norm": 2.9988269805908203, + "learning_rate": 1.713232379686432e-06, + "loss": 0.4102, + "step": 7678 + }, + { + "epoch": 3.6307328605200944, + "grad_norm": 3.0041310787200928, + "learning_rate": 1.7126402745761566e-06, + "loss": 0.3854, + "step": 7679 + }, + { + "epoch": 3.631205673758865, + "grad_norm": 2.8700194358825684, + "learning_rate": 1.7120482184913192e-06, + "loss": 0.3441, + "step": 7680 + }, + { + "epoch": 3.631678486997636, + "grad_norm": 3.5275180339813232, + "learning_rate": 1.7114562114687833e-06, + "loss": 0.3808, + "step": 7681 + }, + { + "epoch": 3.6321513002364068, + "grad_norm": 3.182326078414917, + "learning_rate": 1.710864253545412e-06, + "loss": 0.4178, + "step": 7682 + }, + { + "epoch": 3.6326241134751776, + "grad_norm": 3.0514512062072754, + "learning_rate": 1.7102723447580627e-06, + "loss": 0.3527, + "step": 7683 + }, + { + "epoch": 3.633096926713948, + "grad_norm": 2.8293066024780273, + "learning_rate": 1.7096804851435922e-06, + "loss": 0.3723, + "step": 7684 + }, + { + "epoch": 3.6335697399527187, + "grad_norm": 2.9601097106933594, + "learning_rate": 1.709088674738853e-06, + "loss": 0.3704, + "step": 7685 + }, + { + "epoch": 3.6340425531914895, + "grad_norm": 2.8070995807647705, + "learning_rate": 1.7084969135806933e-06, + "loss": 0.346, + "step": 7686 + }, + { + "epoch": 3.63451536643026, + "grad_norm": 3.0162715911865234, + "learning_rate": 1.70790520170596e-06, + "loss": 0.39, + "step": 7687 + }, + { + "epoch": 3.6349881796690307, + "grad_norm": 3.018763780593872, + "learning_rate": 1.7073135391514967e-06, + "loss": 0.4621, + "step": 7688 + }, + { + "epoch": 3.6354609929078014, + "grad_norm": 2.963604688644409, + "learning_rate": 1.706721925954144e-06, + "loss": 0.339, + "step": 7689 + }, + { + "epoch": 3.6359338061465722, + "grad_norm": 2.8532896041870117, + "learning_rate": 1.7061303621507383e-06, + "loss": 0.3915, + "step": 7690 + }, + { + "epoch": 3.636406619385343, + "grad_norm": 3.248006820678711, + "learning_rate": 1.7055388477781133e-06, + "loss": 0.3712, + "step": 7691 + }, + { + "epoch": 3.6368794326241134, + "grad_norm": 3.2195777893066406, + "learning_rate": 1.7049473828731011e-06, + "loss": 0.4358, + "step": 7692 + }, + { + "epoch": 3.637352245862884, + "grad_norm": 2.7190768718719482, + "learning_rate": 1.7043559674725296e-06, + "loss": 0.341, + "step": 7693 + }, + { + "epoch": 3.637825059101655, + "grad_norm": 2.6047232151031494, + "learning_rate": 1.7037646016132223e-06, + "loss": 0.3513, + "step": 7694 + }, + { + "epoch": 3.6382978723404253, + "grad_norm": 3.0824201107025146, + "learning_rate": 1.7031732853320026e-06, + "loss": 0.4097, + "step": 7695 + }, + { + "epoch": 3.638770685579196, + "grad_norm": 2.845461130142212, + "learning_rate": 1.7025820186656883e-06, + "loss": 0.3395, + "step": 7696 + }, + { + "epoch": 3.639243498817967, + "grad_norm": 2.937863826751709, + "learning_rate": 1.7019908016510953e-06, + "loss": 0.395, + "step": 7697 + }, + { + "epoch": 3.6397163120567377, + "grad_norm": 3.349780559539795, + 
"learning_rate": 1.701399634325036e-06, + "loss": 0.3889, + "step": 7698 + }, + { + "epoch": 3.6401891252955085, + "grad_norm": 2.8527066707611084, + "learning_rate": 1.7008085167243187e-06, + "loss": 0.3753, + "step": 7699 + }, + { + "epoch": 3.640661938534279, + "grad_norm": 2.8112385272979736, + "learning_rate": 1.7002174488857517e-06, + "loss": 0.3912, + "step": 7700 + }, + { + "epoch": 3.6411347517730497, + "grad_norm": 2.731933832168579, + "learning_rate": 1.6996264308461363e-06, + "loss": 0.4142, + "step": 7701 + }, + { + "epoch": 3.6416075650118205, + "grad_norm": 3.70465350151062, + "learning_rate": 1.6990354626422744e-06, + "loss": 0.4089, + "step": 7702 + }, + { + "epoch": 3.642080378250591, + "grad_norm": 2.8656258583068848, + "learning_rate": 1.698444544310962e-06, + "loss": 0.3771, + "step": 7703 + }, + { + "epoch": 3.6425531914893616, + "grad_norm": 2.878830671310425, + "learning_rate": 1.697853675888993e-06, + "loss": 0.3754, + "step": 7704 + }, + { + "epoch": 3.6430260047281324, + "grad_norm": 3.440528154373169, + "learning_rate": 1.6972628574131586e-06, + "loss": 0.4543, + "step": 7705 + }, + { + "epoch": 3.6434988179669032, + "grad_norm": 2.70736026763916, + "learning_rate": 1.6966720889202451e-06, + "loss": 0.4049, + "step": 7706 + }, + { + "epoch": 3.643971631205674, + "grad_norm": 2.787992238998413, + "learning_rate": 1.6960813704470391e-06, + "loss": 0.3854, + "step": 7707 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 2.631490707397461, + "learning_rate": 1.6954907020303213e-06, + "loss": 0.3775, + "step": 7708 + }, + { + "epoch": 3.644917257683215, + "grad_norm": 3.052255392074585, + "learning_rate": 1.6949000837068685e-06, + "loss": 0.3873, + "step": 7709 + }, + { + "epoch": 3.645390070921986, + "grad_norm": 2.7443203926086426, + "learning_rate": 1.6943095155134586e-06, + "loss": 0.3362, + "step": 7710 + }, + { + "epoch": 3.6458628841607563, + "grad_norm": 2.931688070297241, + "learning_rate": 1.6937189974868618e-06, + "loss": 0.3839, + "step": 7711 + }, + { + "epoch": 3.646335697399527, + "grad_norm": 2.950242757797241, + "learning_rate": 1.6931285296638479e-06, + "loss": 0.3552, + "step": 7712 + }, + { + "epoch": 3.646808510638298, + "grad_norm": 2.940735340118408, + "learning_rate": 1.6925381120811823e-06, + "loss": 0.3881, + "step": 7713 + }, + { + "epoch": 3.6472813238770687, + "grad_norm": 2.771355390548706, + "learning_rate": 1.6919477447756273e-06, + "loss": 0.3578, + "step": 7714 + }, + { + "epoch": 3.6477541371158395, + "grad_norm": 2.919004201889038, + "learning_rate": 1.6913574277839435e-06, + "loss": 0.3971, + "step": 7715 + }, + { + "epoch": 3.64822695035461, + "grad_norm": 3.293705463409424, + "learning_rate": 1.6907671611428872e-06, + "loss": 0.422, + "step": 7716 + }, + { + "epoch": 3.6486997635933807, + "grad_norm": 2.744239091873169, + "learning_rate": 1.6901769448892103e-06, + "loss": 0.398, + "step": 7717 + }, + { + "epoch": 3.6491725768321515, + "grad_norm": 3.1726129055023193, + "learning_rate": 1.689586779059665e-06, + "loss": 0.39, + "step": 7718 + }, + { + "epoch": 3.649645390070922, + "grad_norm": 3.146743059158325, + "learning_rate": 1.688996663690997e-06, + "loss": 0.4059, + "step": 7719 + }, + { + "epoch": 3.6501182033096926, + "grad_norm": 2.941025495529175, + "learning_rate": 1.688406598819951e-06, + "loss": 0.3479, + "step": 7720 + }, + { + "epoch": 3.6505910165484634, + "grad_norm": 3.3480939865112305, + "learning_rate": 1.6878165844832679e-06, + "loss": 0.4141, + "step": 7721 + }, + { + "epoch": 3.651063829787234, + 
"grad_norm": 2.9145030975341797, + "learning_rate": 1.6872266207176833e-06, + "loss": 0.3497, + "step": 7722 + }, + { + "epoch": 3.651536643026005, + "grad_norm": 3.119502067565918, + "learning_rate": 1.686636707559934e-06, + "loss": 0.424, + "step": 7723 + }, + { + "epoch": 3.6520094562647754, + "grad_norm": 3.0867667198181152, + "learning_rate": 1.6860468450467497e-06, + "loss": 0.3998, + "step": 7724 + }, + { + "epoch": 3.652482269503546, + "grad_norm": 2.9128987789154053, + "learning_rate": 1.6854570332148602e-06, + "loss": 0.4043, + "step": 7725 + }, + { + "epoch": 3.652955082742317, + "grad_norm": 2.9973206520080566, + "learning_rate": 1.6848672721009896e-06, + "loss": 0.3395, + "step": 7726 + }, + { + "epoch": 3.6534278959810873, + "grad_norm": 2.824916124343872, + "learning_rate": 1.6842775617418591e-06, + "loss": 0.4102, + "step": 7727 + }, + { + "epoch": 3.653900709219858, + "grad_norm": 2.7984440326690674, + "learning_rate": 1.6836879021741887e-06, + "loss": 0.3823, + "step": 7728 + }, + { + "epoch": 3.654373522458629, + "grad_norm": 2.8412179946899414, + "learning_rate": 1.6830982934346917e-06, + "loss": 0.3755, + "step": 7729 + }, + { + "epoch": 3.6548463356973997, + "grad_norm": 3.1677138805389404, + "learning_rate": 1.6825087355600836e-06, + "loss": 0.4224, + "step": 7730 + }, + { + "epoch": 3.65531914893617, + "grad_norm": 3.097085475921631, + "learning_rate": 1.6819192285870718e-06, + "loss": 0.4103, + "step": 7731 + }, + { + "epoch": 3.655791962174941, + "grad_norm": 2.9802496433258057, + "learning_rate": 1.6813297725523613e-06, + "loss": 0.4297, + "step": 7732 + }, + { + "epoch": 3.6562647754137116, + "grad_norm": 3.0135059356689453, + "learning_rate": 1.680740367492657e-06, + "loss": 0.4526, + "step": 7733 + }, + { + "epoch": 3.656737588652482, + "grad_norm": 2.7776739597320557, + "learning_rate": 1.6801510134446575e-06, + "loss": 0.3924, + "step": 7734 + }, + { + "epoch": 3.657210401891253, + "grad_norm": 2.7500126361846924, + "learning_rate": 1.6795617104450595e-06, + "loss": 0.3785, + "step": 7735 + }, + { + "epoch": 3.6576832151300236, + "grad_norm": 3.494142770767212, + "learning_rate": 1.6789724585305566e-06, + "loss": 0.3483, + "step": 7736 + }, + { + "epoch": 3.6581560283687944, + "grad_norm": 3.055081605911255, + "learning_rate": 1.6783832577378377e-06, + "loss": 0.4481, + "step": 7737 + }, + { + "epoch": 3.658628841607565, + "grad_norm": 2.781412124633789, + "learning_rate": 1.6777941081035914e-06, + "loss": 0.3969, + "step": 7738 + }, + { + "epoch": 3.6591016548463355, + "grad_norm": 3.1672184467315674, + "learning_rate": 1.677205009664501e-06, + "loss": 0.3959, + "step": 7739 + }, + { + "epoch": 3.6595744680851063, + "grad_norm": 3.0597715377807617, + "learning_rate": 1.6766159624572458e-06, + "loss": 0.418, + "step": 7740 + }, + { + "epoch": 3.660047281323877, + "grad_norm": 3.2906267642974854, + "learning_rate": 1.676026966518505e-06, + "loss": 0.4335, + "step": 7741 + }, + { + "epoch": 3.6605200945626475, + "grad_norm": 3.2519290447235107, + "learning_rate": 1.6754380218849515e-06, + "loss": 0.3786, + "step": 7742 + }, + { + "epoch": 3.6609929078014183, + "grad_norm": 3.24716854095459, + "learning_rate": 1.6748491285932572e-06, + "loss": 0.3599, + "step": 7743 + }, + { + "epoch": 3.661465721040189, + "grad_norm": 3.2940993309020996, + "learning_rate": 1.6742602866800897e-06, + "loss": 0.3934, + "step": 7744 + }, + { + "epoch": 3.66193853427896, + "grad_norm": 2.917409896850586, + "learning_rate": 1.6736714961821124e-06, + "loss": 0.4197, + "step": 7745 + 
}, + { + "epoch": 3.6624113475177307, + "grad_norm": 3.005068063735962, + "learning_rate": 1.6730827571359887e-06, + "loss": 0.4239, + "step": 7746 + }, + { + "epoch": 3.662884160756501, + "grad_norm": 2.751880168914795, + "learning_rate": 1.6724940695783745e-06, + "loss": 0.4257, + "step": 7747 + }, + { + "epoch": 3.663356973995272, + "grad_norm": 3.090670585632324, + "learning_rate": 1.6719054335459273e-06, + "loss": 0.3686, + "step": 7748 + }, + { + "epoch": 3.6638297872340426, + "grad_norm": 3.250251293182373, + "learning_rate": 1.6713168490752974e-06, + "loss": 0.4249, + "step": 7749 + }, + { + "epoch": 3.664302600472813, + "grad_norm": 2.8662827014923096, + "learning_rate": 1.6707283162031335e-06, + "loss": 0.3692, + "step": 7750 + }, + { + "epoch": 3.6647754137115838, + "grad_norm": 2.8709118366241455, + "learning_rate": 1.6701398349660813e-06, + "loss": 0.3929, + "step": 7751 + }, + { + "epoch": 3.6652482269503546, + "grad_norm": 2.992035388946533, + "learning_rate": 1.6695514054007822e-06, + "loss": 0.4131, + "step": 7752 + }, + { + "epoch": 3.6657210401891254, + "grad_norm": 3.0427589416503906, + "learning_rate": 1.668963027543876e-06, + "loss": 0.387, + "step": 7753 + }, + { + "epoch": 3.666193853427896, + "grad_norm": 3.0147807598114014, + "learning_rate": 1.6683747014319987e-06, + "loss": 0.3648, + "step": 7754 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 2.5483829975128174, + "learning_rate": 1.6677864271017811e-06, + "loss": 0.3643, + "step": 7755 + }, + { + "epoch": 3.6671394799054373, + "grad_norm": 2.7661986351013184, + "learning_rate": 1.6671982045898544e-06, + "loss": 0.3731, + "step": 7756 + }, + { + "epoch": 3.667612293144208, + "grad_norm": 2.778036117553711, + "learning_rate": 1.666610033932843e-06, + "loss": 0.3744, + "step": 7757 + }, + { + "epoch": 3.6680851063829785, + "grad_norm": 2.9028329849243164, + "learning_rate": 1.6660219151673712e-06, + "loss": 0.4286, + "step": 7758 + }, + { + "epoch": 3.6685579196217493, + "grad_norm": 2.826687812805176, + "learning_rate": 1.6654338483300575e-06, + "loss": 0.318, + "step": 7759 + }, + { + "epoch": 3.66903073286052, + "grad_norm": 2.7063660621643066, + "learning_rate": 1.6648458334575186e-06, + "loss": 0.3351, + "step": 7760 + }, + { + "epoch": 3.669503546099291, + "grad_norm": 2.708361864089966, + "learning_rate": 1.664257870586368e-06, + "loss": 0.376, + "step": 7761 + }, + { + "epoch": 3.6699763593380617, + "grad_norm": 3.1139161586761475, + "learning_rate": 1.6636699597532141e-06, + "loss": 0.3572, + "step": 7762 + }, + { + "epoch": 3.670449172576832, + "grad_norm": 3.0858285427093506, + "learning_rate": 1.6630821009946658e-06, + "loss": 0.4204, + "step": 7763 + }, + { + "epoch": 3.670921985815603, + "grad_norm": 3.5593984127044678, + "learning_rate": 1.6624942943473252e-06, + "loss": 0.463, + "step": 7764 + }, + { + "epoch": 3.6713947990543736, + "grad_norm": 2.863851308822632, + "learning_rate": 1.6619065398477921e-06, + "loss": 0.4272, + "step": 7765 + }, + { + "epoch": 3.671867612293144, + "grad_norm": 2.833399772644043, + "learning_rate": 1.6613188375326638e-06, + "loss": 0.3509, + "step": 7766 + }, + { + "epoch": 3.6723404255319148, + "grad_norm": 2.988948345184326, + "learning_rate": 1.6607311874385346e-06, + "loss": 0.3572, + "step": 7767 + }, + { + "epoch": 3.6728132387706856, + "grad_norm": 2.7349398136138916, + "learning_rate": 1.6601435896019936e-06, + "loss": 0.3875, + "step": 7768 + }, + { + "epoch": 3.6732860520094563, + "grad_norm": 2.8544445037841797, + "learning_rate": 
1.659556044059629e-06, + "loss": 0.4057, + "step": 7769 + }, + { + "epoch": 3.673758865248227, + "grad_norm": 3.0341904163360596, + "learning_rate": 1.6589685508480235e-06, + "loss": 0.3935, + "step": 7770 + }, + { + "epoch": 3.6742316784869975, + "grad_norm": 2.7495710849761963, + "learning_rate": 1.6583811100037595e-06, + "loss": 0.352, + "step": 7771 + }, + { + "epoch": 3.6747044917257683, + "grad_norm": 3.258525848388672, + "learning_rate": 1.6577937215634133e-06, + "loss": 0.4414, + "step": 7772 + }, + { + "epoch": 3.675177304964539, + "grad_norm": 3.0686328411102295, + "learning_rate": 1.657206385563558e-06, + "loss": 0.353, + "step": 7773 + }, + { + "epoch": 3.6756501182033094, + "grad_norm": 3.1168248653411865, + "learning_rate": 1.6566191020407668e-06, + "loss": 0.4064, + "step": 7774 + }, + { + "epoch": 3.6761229314420802, + "grad_norm": 2.7837352752685547, + "learning_rate": 1.6560318710316053e-06, + "loss": 0.3956, + "step": 7775 + }, + { + "epoch": 3.676595744680851, + "grad_norm": 3.1514039039611816, + "learning_rate": 1.6554446925726391e-06, + "loss": 0.4044, + "step": 7776 + }, + { + "epoch": 3.677068557919622, + "grad_norm": 3.010352611541748, + "learning_rate": 1.6548575667004285e-06, + "loss": 0.4162, + "step": 7777 + }, + { + "epoch": 3.6775413711583926, + "grad_norm": 3.1727633476257324, + "learning_rate": 1.6542704934515308e-06, + "loss": 0.411, + "step": 7778 + }, + { + "epoch": 3.678014184397163, + "grad_norm": 3.6771271228790283, + "learning_rate": 1.6536834728625018e-06, + "loss": 0.4562, + "step": 7779 + }, + { + "epoch": 3.678486997635934, + "grad_norm": 2.9793131351470947, + "learning_rate": 1.6530965049698908e-06, + "loss": 0.4039, + "step": 7780 + }, + { + "epoch": 3.6789598108747046, + "grad_norm": 3.193751096725464, + "learning_rate": 1.6525095898102478e-06, + "loss": 0.4064, + "step": 7781 + }, + { + "epoch": 3.679432624113475, + "grad_norm": 2.6643173694610596, + "learning_rate": 1.6519227274201169e-06, + "loss": 0.3731, + "step": 7782 + }, + { + "epoch": 3.6799054373522457, + "grad_norm": 3.4855685234069824, + "learning_rate": 1.6513359178360384e-06, + "loss": 0.3815, + "step": 7783 + }, + { + "epoch": 3.6803782505910165, + "grad_norm": 3.320537567138672, + "learning_rate": 1.6507491610945514e-06, + "loss": 0.4065, + "step": 7784 + }, + { + "epoch": 3.6808510638297873, + "grad_norm": 3.2793102264404297, + "learning_rate": 1.6501624572321895e-06, + "loss": 0.429, + "step": 7785 + }, + { + "epoch": 3.681323877068558, + "grad_norm": 2.8609631061553955, + "learning_rate": 1.6495758062854854e-06, + "loss": 0.3881, + "step": 7786 + }, + { + "epoch": 3.6817966903073285, + "grad_norm": 2.956533193588257, + "learning_rate": 1.6489892082909675e-06, + "loss": 0.3662, + "step": 7787 + }, + { + "epoch": 3.6822695035460993, + "grad_norm": 2.949092149734497, + "learning_rate": 1.6484026632851591e-06, + "loss": 0.4021, + "step": 7788 + }, + { + "epoch": 3.68274231678487, + "grad_norm": 2.802572250366211, + "learning_rate": 1.6478161713045831e-06, + "loss": 0.3862, + "step": 7789 + }, + { + "epoch": 3.6832151300236404, + "grad_norm": 3.0064797401428223, + "learning_rate": 1.6472297323857578e-06, + "loss": 0.3495, + "step": 7790 + }, + { + "epoch": 3.6836879432624112, + "grad_norm": 2.74037766456604, + "learning_rate": 1.646643346565197e-06, + "loss": 0.362, + "step": 7791 + }, + { + "epoch": 3.684160756501182, + "grad_norm": 3.423326253890991, + "learning_rate": 1.646057013879414e-06, + "loss": 0.4383, + "step": 7792 + }, + { + "epoch": 3.684633569739953, + "grad_norm": 
2.7426443099975586, + "learning_rate": 1.645470734364915e-06, + "loss": 0.3741, + "step": 7793 + }, + { + "epoch": 3.6851063829787236, + "grad_norm": 2.879296064376831, + "learning_rate": 1.6448845080582077e-06, + "loss": 0.4003, + "step": 7794 + }, + { + "epoch": 3.685579196217494, + "grad_norm": 2.6821188926696777, + "learning_rate": 1.6442983349957924e-06, + "loss": 0.3545, + "step": 7795 + }, + { + "epoch": 3.6860520094562648, + "grad_norm": 2.714059829711914, + "learning_rate": 1.6437122152141665e-06, + "loss": 0.3702, + "step": 7796 + }, + { + "epoch": 3.6865248226950356, + "grad_norm": 2.593811511993408, + "learning_rate": 1.6431261487498274e-06, + "loss": 0.3512, + "step": 7797 + }, + { + "epoch": 3.686997635933806, + "grad_norm": 2.687533378601074, + "learning_rate": 1.6425401356392652e-06, + "loss": 0.36, + "step": 7798 + }, + { + "epoch": 3.6874704491725767, + "grad_norm": 3.1675431728363037, + "learning_rate": 1.6419541759189694e-06, + "loss": 0.3349, + "step": 7799 + }, + { + "epoch": 3.6879432624113475, + "grad_norm": 2.777310371398926, + "learning_rate": 1.6413682696254246e-06, + "loss": 0.317, + "step": 7800 + }, + { + "epoch": 3.6884160756501183, + "grad_norm": 3.0121655464172363, + "learning_rate": 1.640782416795112e-06, + "loss": 0.3612, + "step": 7801 + }, + { + "epoch": 3.688888888888889, + "grad_norm": 3.0532145500183105, + "learning_rate": 1.6401966174645113e-06, + "loss": 0.4065, + "step": 7802 + }, + { + "epoch": 3.6893617021276595, + "grad_norm": 2.8221664428710938, + "learning_rate": 1.6396108716700961e-06, + "loss": 0.3669, + "step": 7803 + }, + { + "epoch": 3.6898345153664303, + "grad_norm": 2.966357707977295, + "learning_rate": 1.6390251794483405e-06, + "loss": 0.391, + "step": 7804 + }, + { + "epoch": 3.690307328605201, + "grad_norm": 3.460252046585083, + "learning_rate": 1.6384395408357118e-06, + "loss": 0.429, + "step": 7805 + }, + { + "epoch": 3.6907801418439714, + "grad_norm": 2.8907718658447266, + "learning_rate": 1.637853955868674e-06, + "loss": 0.3761, + "step": 7806 + }, + { + "epoch": 3.691252955082742, + "grad_norm": 3.114612102508545, + "learning_rate": 1.6372684245836912e-06, + "loss": 0.4376, + "step": 7807 + }, + { + "epoch": 3.691725768321513, + "grad_norm": 2.9361326694488525, + "learning_rate": 1.6366829470172191e-06, + "loss": 0.3672, + "step": 7808 + }, + { + "epoch": 3.692198581560284, + "grad_norm": 3.2719476222991943, + "learning_rate": 1.6360975232057156e-06, + "loss": 0.4266, + "step": 7809 + }, + { + "epoch": 3.6926713947990546, + "grad_norm": 2.873952865600586, + "learning_rate": 1.635512153185631e-06, + "loss": 0.4056, + "step": 7810 + }, + { + "epoch": 3.693144208037825, + "grad_norm": 3.0273401737213135, + "learning_rate": 1.634926836993413e-06, + "loss": 0.3947, + "step": 7811 + }, + { + "epoch": 3.6936170212765957, + "grad_norm": 2.868738889694214, + "learning_rate": 1.634341574665509e-06, + "loss": 0.3935, + "step": 7812 + }, + { + "epoch": 3.6940898345153665, + "grad_norm": 3.3080437183380127, + "learning_rate": 1.6337563662383591e-06, + "loss": 0.3606, + "step": 7813 + }, + { + "epoch": 3.694562647754137, + "grad_norm": 2.8339016437530518, + "learning_rate": 1.6331712117484014e-06, + "loss": 0.4019, + "step": 7814 + }, + { + "epoch": 3.6950354609929077, + "grad_norm": 2.666815996170044, + "learning_rate": 1.6325861112320717e-06, + "loss": 0.3502, + "step": 7815 + }, + { + "epoch": 3.6955082742316785, + "grad_norm": 2.7624311447143555, + "learning_rate": 1.6320010647258008e-06, + "loss": 0.3481, + "step": 7816 + }, + { + 
"epoch": 3.6959810874704493, + "grad_norm": 2.7796332836151123, + "learning_rate": 1.6314160722660183e-06, + "loss": 0.3735, + "step": 7817 + }, + { + "epoch": 3.69645390070922, + "grad_norm": 2.954318046569824, + "learning_rate": 1.6308311338891484e-06, + "loss": 0.3933, + "step": 7818 + }, + { + "epoch": 3.6969267139479904, + "grad_norm": 2.821072816848755, + "learning_rate": 1.6302462496316115e-06, + "loss": 0.3437, + "step": 7819 + }, + { + "epoch": 3.6973995271867612, + "grad_norm": 3.436192750930786, + "learning_rate": 1.629661419529828e-06, + "loss": 0.4469, + "step": 7820 + }, + { + "epoch": 3.697872340425532, + "grad_norm": 3.1361067295074463, + "learning_rate": 1.629076643620211e-06, + "loss": 0.3887, + "step": 7821 + }, + { + "epoch": 3.6983451536643024, + "grad_norm": 3.355024576187134, + "learning_rate": 1.6284919219391732e-06, + "loss": 0.424, + "step": 7822 + }, + { + "epoch": 3.698817966903073, + "grad_norm": 2.7671639919281006, + "learning_rate": 1.6279072545231212e-06, + "loss": 0.3765, + "step": 7823 + }, + { + "epoch": 3.699290780141844, + "grad_norm": 2.9509360790252686, + "learning_rate": 1.6273226414084606e-06, + "loss": 0.4057, + "step": 7824 + }, + { + "epoch": 3.699763593380615, + "grad_norm": 2.9852921962738037, + "learning_rate": 1.6267380826315932e-06, + "loss": 0.4238, + "step": 7825 + }, + { + "epoch": 3.7002364066193856, + "grad_norm": 2.826594114303589, + "learning_rate": 1.626153578228915e-06, + "loss": 0.3958, + "step": 7826 + }, + { + "epoch": 3.700709219858156, + "grad_norm": 2.9103410243988037, + "learning_rate": 1.6255691282368228e-06, + "loss": 0.394, + "step": 7827 + }, + { + "epoch": 3.7011820330969267, + "grad_norm": 3.362992525100708, + "learning_rate": 1.6249847326917068e-06, + "loss": 0.4233, + "step": 7828 + }, + { + "epoch": 3.7016548463356975, + "grad_norm": 2.711280107498169, + "learning_rate": 1.624400391629954e-06, + "loss": 0.2977, + "step": 7829 + }, + { + "epoch": 3.702127659574468, + "grad_norm": 2.8354649543762207, + "learning_rate": 1.6238161050879497e-06, + "loss": 0.3549, + "step": 7830 + }, + { + "epoch": 3.7026004728132387, + "grad_norm": 3.096376895904541, + "learning_rate": 1.6232318731020743e-06, + "loss": 0.3486, + "step": 7831 + }, + { + "epoch": 3.7030732860520095, + "grad_norm": 2.918267250061035, + "learning_rate": 1.6226476957087064e-06, + "loss": 0.3659, + "step": 7832 + }, + { + "epoch": 3.7035460992907803, + "grad_norm": 2.705399513244629, + "learning_rate": 1.6220635729442195e-06, + "loss": 0.4301, + "step": 7833 + }, + { + "epoch": 3.704018912529551, + "grad_norm": 2.9192235469818115, + "learning_rate": 1.621479504844983e-06, + "loss": 0.3384, + "step": 7834 + }, + { + "epoch": 3.7044917257683214, + "grad_norm": 2.78623104095459, + "learning_rate": 1.6208954914473669e-06, + "loss": 0.3528, + "step": 7835 + }, + { + "epoch": 3.704964539007092, + "grad_norm": 3.0218069553375244, + "learning_rate": 1.6203115327877333e-06, + "loss": 0.3698, + "step": 7836 + }, + { + "epoch": 3.705437352245863, + "grad_norm": 3.019101619720459, + "learning_rate": 1.6197276289024422e-06, + "loss": 0.4398, + "step": 7837 + }, + { + "epoch": 3.7059101654846334, + "grad_norm": 2.9220848083496094, + "learning_rate": 1.6191437798278531e-06, + "loss": 0.3803, + "step": 7838 + }, + { + "epoch": 3.706382978723404, + "grad_norm": 3.2731969356536865, + "learning_rate": 1.6185599856003181e-06, + "loss": 0.4529, + "step": 7839 + }, + { + "epoch": 3.706855791962175, + "grad_norm": 2.85239577293396, + "learning_rate": 1.617976246256188e-06, + "loss": 
0.3801, + "step": 7840 + }, + { + "epoch": 3.7073286052009458, + "grad_norm": 2.8250765800476074, + "learning_rate": 1.6173925618318092e-06, + "loss": 0.3267, + "step": 7841 + }, + { + "epoch": 3.7078014184397166, + "grad_norm": 2.9152321815490723, + "learning_rate": 1.616808932363525e-06, + "loss": 0.428, + "step": 7842 + }, + { + "epoch": 3.708274231678487, + "grad_norm": 2.912656545639038, + "learning_rate": 1.6162253578876766e-06, + "loss": 0.3802, + "step": 7843 + }, + { + "epoch": 3.7087470449172577, + "grad_norm": 3.0700762271881104, + "learning_rate": 1.6156418384405992e-06, + "loss": 0.377, + "step": 7844 + }, + { + "epoch": 3.7092198581560285, + "grad_norm": 2.873141050338745, + "learning_rate": 1.6150583740586274e-06, + "loss": 0.399, + "step": 7845 + }, + { + "epoch": 3.709692671394799, + "grad_norm": 2.899555206298828, + "learning_rate": 1.6144749647780906e-06, + "loss": 0.402, + "step": 7846 + }, + { + "epoch": 3.7101654846335697, + "grad_norm": 3.257697343826294, + "learning_rate": 1.6138916106353139e-06, + "loss": 0.4193, + "step": 7847 + }, + { + "epoch": 3.7106382978723405, + "grad_norm": 2.6879804134368896, + "learning_rate": 1.613308311666622e-06, + "loss": 0.3474, + "step": 7848 + }, + { + "epoch": 3.7111111111111112, + "grad_norm": 2.712491273880005, + "learning_rate": 1.6127250679083323e-06, + "loss": 0.3315, + "step": 7849 + }, + { + "epoch": 3.711583924349882, + "grad_norm": 2.9762673377990723, + "learning_rate": 1.6121418793967631e-06, + "loss": 0.3953, + "step": 7850 + }, + { + "epoch": 3.7120567375886524, + "grad_norm": 2.743668556213379, + "learning_rate": 1.6115587461682258e-06, + "loss": 0.381, + "step": 7851 + }, + { + "epoch": 3.712529550827423, + "grad_norm": 3.0545318126678467, + "learning_rate": 1.6109756682590288e-06, + "loss": 0.412, + "step": 7852 + }, + { + "epoch": 3.713002364066194, + "grad_norm": 3.0125906467437744, + "learning_rate": 1.61039264570548e-06, + "loss": 0.3931, + "step": 7853 + }, + { + "epoch": 3.7134751773049643, + "grad_norm": 2.809302806854248, + "learning_rate": 1.6098096785438794e-06, + "loss": 0.3943, + "step": 7854 + }, + { + "epoch": 3.713947990543735, + "grad_norm": 3.092452049255371, + "learning_rate": 1.6092267668105276e-06, + "loss": 0.3932, + "step": 7855 + }, + { + "epoch": 3.714420803782506, + "grad_norm": 2.9878969192504883, + "learning_rate": 1.608643910541719e-06, + "loss": 0.4289, + "step": 7856 + }, + { + "epoch": 3.7148936170212767, + "grad_norm": 2.693387508392334, + "learning_rate": 1.6080611097737444e-06, + "loss": 0.373, + "step": 7857 + }, + { + "epoch": 3.7153664302600475, + "grad_norm": 3.4097673892974854, + "learning_rate": 1.6074783645428945e-06, + "loss": 0.4487, + "step": 7858 + }, + { + "epoch": 3.715839243498818, + "grad_norm": 3.1466784477233887, + "learning_rate": 1.6068956748854525e-06, + "loss": 0.3648, + "step": 7859 + }, + { + "epoch": 3.7163120567375887, + "grad_norm": 3.062107563018799, + "learning_rate": 1.6063130408377015e-06, + "loss": 0.3899, + "step": 7860 + }, + { + "epoch": 3.7167848699763595, + "grad_norm": 3.2298364639282227, + "learning_rate": 1.6057304624359188e-06, + "loss": 0.4243, + "step": 7861 + }, + { + "epoch": 3.71725768321513, + "grad_norm": 3.0285773277282715, + "learning_rate": 1.6051479397163784e-06, + "loss": 0.3469, + "step": 7862 + }, + { + "epoch": 3.7177304964539006, + "grad_norm": 2.8438515663146973, + "learning_rate": 1.6045654727153525e-06, + "loss": 0.3363, + "step": 7863 + }, + { + "epoch": 3.7182033096926714, + "grad_norm": 3.1558034420013428, + 
"learning_rate": 1.6039830614691081e-06, + "loss": 0.4326, + "step": 7864 + }, + { + "epoch": 3.7186761229314422, + "grad_norm": 2.438640594482422, + "learning_rate": 1.603400706013909e-06, + "loss": 0.3647, + "step": 7865 + }, + { + "epoch": 3.719148936170213, + "grad_norm": 3.0443127155303955, + "learning_rate": 1.6028184063860168e-06, + "loss": 0.4295, + "step": 7866 + }, + { + "epoch": 3.7196217494089834, + "grad_norm": 2.8009512424468994, + "learning_rate": 1.602236162621688e-06, + "loss": 0.4055, + "step": 7867 + }, + { + "epoch": 3.720094562647754, + "grad_norm": 3.227698802947998, + "learning_rate": 1.6016539747571775e-06, + "loss": 0.4681, + "step": 7868 + }, + { + "epoch": 3.720567375886525, + "grad_norm": 2.8242595195770264, + "learning_rate": 1.601071842828735e-06, + "loss": 0.3707, + "step": 7869 + }, + { + "epoch": 3.7210401891252953, + "grad_norm": 3.602937698364258, + "learning_rate": 1.6004897668726067e-06, + "loss": 0.5201, + "step": 7870 + }, + { + "epoch": 3.721513002364066, + "grad_norm": 3.578422784805298, + "learning_rate": 1.599907746925037e-06, + "loss": 0.4514, + "step": 7871 + }, + { + "epoch": 3.721985815602837, + "grad_norm": 2.7365758419036865, + "learning_rate": 1.5993257830222635e-06, + "loss": 0.356, + "step": 7872 + }, + { + "epoch": 3.7224586288416077, + "grad_norm": 3.125636577606201, + "learning_rate": 1.5987438752005258e-06, + "loss": 0.4277, + "step": 7873 + }, + { + "epoch": 3.7229314420803785, + "grad_norm": 2.7157294750213623, + "learning_rate": 1.5981620234960549e-06, + "loss": 0.363, + "step": 7874 + }, + { + "epoch": 3.723404255319149, + "grad_norm": 2.90950083732605, + "learning_rate": 1.5975802279450793e-06, + "loss": 0.4027, + "step": 7875 + }, + { + "epoch": 3.7238770685579197, + "grad_norm": 2.659787178039551, + "learning_rate": 1.596998488583827e-06, + "loss": 0.3632, + "step": 7876 + }, + { + "epoch": 3.7243498817966905, + "grad_norm": 3.221623182296753, + "learning_rate": 1.5964168054485185e-06, + "loss": 0.4295, + "step": 7877 + }, + { + "epoch": 3.724822695035461, + "grad_norm": 2.6838672161102295, + "learning_rate": 1.595835178575374e-06, + "loss": 0.3413, + "step": 7878 + }, + { + "epoch": 3.7252955082742316, + "grad_norm": 2.804706335067749, + "learning_rate": 1.5952536080006084e-06, + "loss": 0.3801, + "step": 7879 + }, + { + "epoch": 3.7257683215130024, + "grad_norm": 2.7647509574890137, + "learning_rate": 1.5946720937604326e-06, + "loss": 0.3941, + "step": 7880 + }, + { + "epoch": 3.726241134751773, + "grad_norm": 2.8363754749298096, + "learning_rate": 1.5940906358910566e-06, + "loss": 0.3772, + "step": 7881 + }, + { + "epoch": 3.726713947990544, + "grad_norm": 3.4147698879241943, + "learning_rate": 1.5935092344286835e-06, + "loss": 0.399, + "step": 7882 + }, + { + "epoch": 3.7271867612293144, + "grad_norm": 2.984090805053711, + "learning_rate": 1.5929278894095162e-06, + "loss": 0.3373, + "step": 7883 + }, + { + "epoch": 3.727659574468085, + "grad_norm": 3.250173330307007, + "learning_rate": 1.5923466008697521e-06, + "loss": 0.3932, + "step": 7884 + }, + { + "epoch": 3.728132387706856, + "grad_norm": 3.2699649333953857, + "learning_rate": 1.5917653688455848e-06, + "loss": 0.4529, + "step": 7885 + }, + { + "epoch": 3.7286052009456263, + "grad_norm": 3.175934076309204, + "learning_rate": 1.591184193373206e-06, + "loss": 0.3726, + "step": 7886 + }, + { + "epoch": 3.729078014184397, + "grad_norm": 2.8128812313079834, + "learning_rate": 1.5906030744888024e-06, + "loss": 0.3648, + "step": 7887 + }, + { + "epoch": 3.729550827423168, + 
"grad_norm": 3.025012493133545, + "learning_rate": 1.5900220122285564e-06, + "loss": 0.3945, + "step": 7888 + }, + { + "epoch": 3.7300236406619387, + "grad_norm": 3.237680435180664, + "learning_rate": 1.5894410066286512e-06, + "loss": 0.3815, + "step": 7889 + }, + { + "epoch": 3.7304964539007095, + "grad_norm": 3.458033323287964, + "learning_rate": 1.5888600577252605e-06, + "loss": 0.4104, + "step": 7890 + }, + { + "epoch": 3.73096926713948, + "grad_norm": 2.718867540359497, + "learning_rate": 1.58827916555456e-06, + "loss": 0.3243, + "step": 7891 + }, + { + "epoch": 3.7314420803782506, + "grad_norm": 3.047157049179077, + "learning_rate": 1.5876983301527176e-06, + "loss": 0.3689, + "step": 7892 + }, + { + "epoch": 3.731914893617021, + "grad_norm": 3.2904715538024902, + "learning_rate": 1.5871175515558995e-06, + "loss": 0.4045, + "step": 7893 + }, + { + "epoch": 3.732387706855792, + "grad_norm": 2.956467866897583, + "learning_rate": 1.5865368298002692e-06, + "loss": 0.3806, + "step": 7894 + }, + { + "epoch": 3.7328605200945626, + "grad_norm": 3.3309173583984375, + "learning_rate": 1.5859561649219843e-06, + "loss": 0.4011, + "step": 7895 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 2.7853524684906006, + "learning_rate": 1.5853755569572018e-06, + "loss": 0.3239, + "step": 7896 + }, + { + "epoch": 3.733806146572104, + "grad_norm": 2.9832780361175537, + "learning_rate": 1.584795005942073e-06, + "loss": 0.4582, + "step": 7897 + }, + { + "epoch": 3.7342789598108745, + "grad_norm": 3.2866461277008057, + "learning_rate": 1.584214511912745e-06, + "loss": 0.3876, + "step": 7898 + }, + { + "epoch": 3.7347517730496453, + "grad_norm": 3.018526792526245, + "learning_rate": 1.5836340749053646e-06, + "loss": 0.3221, + "step": 7899 + }, + { + "epoch": 3.735224586288416, + "grad_norm": 2.9109885692596436, + "learning_rate": 1.583053694956072e-06, + "loss": 0.4225, + "step": 7900 + }, + { + "epoch": 3.7356973995271865, + "grad_norm": 3.104146718978882, + "learning_rate": 1.5824733721010051e-06, + "loss": 0.3843, + "step": 7901 + }, + { + "epoch": 3.7361702127659573, + "grad_norm": 3.0982813835144043, + "learning_rate": 1.5818931063762989e-06, + "loss": 0.4223, + "step": 7902 + }, + { + "epoch": 3.736643026004728, + "grad_norm": 2.7797579765319824, + "learning_rate": 1.5813128978180819e-06, + "loss": 0.3536, + "step": 7903 + }, + { + "epoch": 3.737115839243499, + "grad_norm": 2.870884656906128, + "learning_rate": 1.5807327464624835e-06, + "loss": 0.3053, + "step": 7904 + }, + { + "epoch": 3.7375886524822697, + "grad_norm": 2.896674633026123, + "learning_rate": 1.5801526523456251e-06, + "loss": 0.3806, + "step": 7905 + }, + { + "epoch": 3.73806146572104, + "grad_norm": 3.009662389755249, + "learning_rate": 1.5795726155036284e-06, + "loss": 0.3568, + "step": 7906 + }, + { + "epoch": 3.738534278959811, + "grad_norm": 2.6860599517822266, + "learning_rate": 1.578992635972609e-06, + "loss": 0.4392, + "step": 7907 + }, + { + "epoch": 3.7390070921985816, + "grad_norm": 2.9046099185943604, + "learning_rate": 1.578412713788679e-06, + "loss": 0.3756, + "step": 7908 + }, + { + "epoch": 3.739479905437352, + "grad_norm": 2.8035101890563965, + "learning_rate": 1.5778328489879488e-06, + "loss": 0.3576, + "step": 7909 + }, + { + "epoch": 3.739952718676123, + "grad_norm": 2.767514228820801, + "learning_rate": 1.5772530416065238e-06, + "loss": 0.4037, + "step": 7910 + }, + { + "epoch": 3.7404255319148936, + "grad_norm": 3.0867795944213867, + "learning_rate": 1.576673291680505e-06, + "loss": 0.4394, + "step": 7911 + }, 
+ { + "epoch": 3.7408983451536644, + "grad_norm": 3.295976161956787, + "learning_rate": 1.5760935992459926e-06, + "loss": 0.3938, + "step": 7912 + }, + { + "epoch": 3.741371158392435, + "grad_norm": 2.725949287414551, + "learning_rate": 1.5755139643390794e-06, + "loss": 0.3633, + "step": 7913 + }, + { + "epoch": 3.7418439716312055, + "grad_norm": 3.0864083766937256, + "learning_rate": 1.5749343869958585e-06, + "loss": 0.3034, + "step": 7914 + }, + { + "epoch": 3.7423167848699763, + "grad_norm": 3.707273244857788, + "learning_rate": 1.5743548672524175e-06, + "loss": 0.4206, + "step": 7915 + }, + { + "epoch": 3.742789598108747, + "grad_norm": 2.9829516410827637, + "learning_rate": 1.573775405144839e-06, + "loss": 0.333, + "step": 7916 + }, + { + "epoch": 3.7432624113475175, + "grad_norm": 3.3303117752075195, + "learning_rate": 1.5731960007092056e-06, + "loss": 0.4558, + "step": 7917 + }, + { + "epoch": 3.7437352245862883, + "grad_norm": 2.63291335105896, + "learning_rate": 1.5726166539815925e-06, + "loss": 0.39, + "step": 7918 + }, + { + "epoch": 3.744208037825059, + "grad_norm": 3.0533673763275146, + "learning_rate": 1.572037364998075e-06, + "loss": 0.3586, + "step": 7919 + }, + { + "epoch": 3.74468085106383, + "grad_norm": 2.9185104370117188, + "learning_rate": 1.5714581337947216e-06, + "loss": 0.3809, + "step": 7920 + }, + { + "epoch": 3.7451536643026007, + "grad_norm": 3.1863298416137695, + "learning_rate": 1.5708789604075975e-06, + "loss": 0.4132, + "step": 7921 + }, + { + "epoch": 3.745626477541371, + "grad_norm": 3.2700514793395996, + "learning_rate": 1.5702998448727674e-06, + "loss": 0.4601, + "step": 7922 + }, + { + "epoch": 3.746099290780142, + "grad_norm": 3.4729206562042236, + "learning_rate": 1.5697207872262886e-06, + "loss": 0.4585, + "step": 7923 + }, + { + "epoch": 3.7465721040189126, + "grad_norm": 3.1432926654815674, + "learning_rate": 1.5691417875042182e-06, + "loss": 0.3128, + "step": 7924 + }, + { + "epoch": 3.747044917257683, + "grad_norm": 3.096121072769165, + "learning_rate": 1.5685628457426066e-06, + "loss": 0.3903, + "step": 7925 + }, + { + "epoch": 3.7475177304964538, + "grad_norm": 2.6897027492523193, + "learning_rate": 1.5679839619775023e-06, + "loss": 0.3707, + "step": 7926 + }, + { + "epoch": 3.7479905437352246, + "grad_norm": 2.8020687103271484, + "learning_rate": 1.5674051362449503e-06, + "loss": 0.3986, + "step": 7927 + }, + { + "epoch": 3.7484633569739954, + "grad_norm": 3.1278326511383057, + "learning_rate": 1.56682636858099e-06, + "loss": 0.3899, + "step": 7928 + }, + { + "epoch": 3.748936170212766, + "grad_norm": 2.907982587814331, + "learning_rate": 1.5662476590216613e-06, + "loss": 0.3422, + "step": 7929 + }, + { + "epoch": 3.7494089834515365, + "grad_norm": 3.1246347427368164, + "learning_rate": 1.5656690076029962e-06, + "loss": 0.452, + "step": 7930 + }, + { + "epoch": 3.7498817966903073, + "grad_norm": 2.9161367416381836, + "learning_rate": 1.565090414361024e-06, + "loss": 0.3284, + "step": 7931 + }, + { + "epoch": 3.750354609929078, + "grad_norm": 2.943183422088623, + "learning_rate": 1.564511879331773e-06, + "loss": 0.3478, + "step": 7932 + }, + { + "epoch": 3.7508274231678485, + "grad_norm": 3.2308566570281982, + "learning_rate": 1.563933402551266e-06, + "loss": 0.4143, + "step": 7933 + }, + { + "epoch": 3.7513002364066192, + "grad_norm": 2.6846251487731934, + "learning_rate": 1.5633549840555206e-06, + "loss": 0.3681, + "step": 7934 + }, + { + "epoch": 3.75177304964539, + "grad_norm": 3.0995283126831055, + "learning_rate": 1.562776623880554e-06, 
+ "loss": 0.4642, + "step": 7935 + }, + { + "epoch": 3.752245862884161, + "grad_norm": 2.7406163215637207, + "learning_rate": 1.562198322062376e-06, + "loss": 0.3823, + "step": 7936 + }, + { + "epoch": 3.7527186761229316, + "grad_norm": 2.85732364654541, + "learning_rate": 1.5616200786369978e-06, + "loss": 0.3053, + "step": 7937 + }, + { + "epoch": 3.753191489361702, + "grad_norm": 2.812526226043701, + "learning_rate": 1.5610418936404223e-06, + "loss": 0.3944, + "step": 7938 + }, + { + "epoch": 3.753664302600473, + "grad_norm": 2.8886849880218506, + "learning_rate": 1.5604637671086499e-06, + "loss": 0.3936, + "step": 7939 + }, + { + "epoch": 3.7541371158392436, + "grad_norm": 2.831774950027466, + "learning_rate": 1.5598856990776801e-06, + "loss": 0.3146, + "step": 7940 + }, + { + "epoch": 3.754609929078014, + "grad_norm": 2.8853790760040283, + "learning_rate": 1.5593076895835052e-06, + "loss": 0.3286, + "step": 7941 + }, + { + "epoch": 3.7550827423167847, + "grad_norm": 3.2724483013153076, + "learning_rate": 1.5587297386621158e-06, + "loss": 0.3396, + "step": 7942 + }, + { + "epoch": 3.7555555555555555, + "grad_norm": 3.5077168941497803, + "learning_rate": 1.5581518463494983e-06, + "loss": 0.4528, + "step": 7943 + }, + { + "epoch": 3.7560283687943263, + "grad_norm": 3.031503915786743, + "learning_rate": 1.5575740126816346e-06, + "loss": 0.3803, + "step": 7944 + }, + { + "epoch": 3.756501182033097, + "grad_norm": 3.0939114093780518, + "learning_rate": 1.556996237694506e-06, + "loss": 0.3931, + "step": 7945 + }, + { + "epoch": 3.7569739952718675, + "grad_norm": 2.9404146671295166, + "learning_rate": 1.556418521424085e-06, + "loss": 0.3608, + "step": 7946 + }, + { + "epoch": 3.7574468085106383, + "grad_norm": 3.4363012313842773, + "learning_rate": 1.5558408639063465e-06, + "loss": 0.4335, + "step": 7947 + }, + { + "epoch": 3.757919621749409, + "grad_norm": 3.2819864749908447, + "learning_rate": 1.5552632651772575e-06, + "loss": 0.4147, + "step": 7948 + }, + { + "epoch": 3.7583924349881794, + "grad_norm": 2.917788505554199, + "learning_rate": 1.554685725272782e-06, + "loss": 0.3516, + "step": 7949 + }, + { + "epoch": 3.7588652482269502, + "grad_norm": 2.8425943851470947, + "learning_rate": 1.5541082442288818e-06, + "loss": 0.3596, + "step": 7950 + }, + { + "epoch": 3.759338061465721, + "grad_norm": 3.087005376815796, + "learning_rate": 1.5535308220815126e-06, + "loss": 0.3968, + "step": 7951 + }, + { + "epoch": 3.759810874704492, + "grad_norm": 2.743110179901123, + "learning_rate": 1.5529534588666298e-06, + "loss": 0.3802, + "step": 7952 + }, + { + "epoch": 3.7602836879432626, + "grad_norm": 2.914424180984497, + "learning_rate": 1.5523761546201825e-06, + "loss": 0.4055, + "step": 7953 + }, + { + "epoch": 3.760756501182033, + "grad_norm": 2.9691991806030273, + "learning_rate": 1.551798909378116e-06, + "loss": 0.3384, + "step": 7954 + }, + { + "epoch": 3.7612293144208038, + "grad_norm": 2.433657646179199, + "learning_rate": 1.5512217231763747e-06, + "loss": 0.3019, + "step": 7955 + }, + { + "epoch": 3.7617021276595746, + "grad_norm": 2.7904880046844482, + "learning_rate": 1.5506445960508957e-06, + "loss": 0.389, + "step": 7956 + }, + { + "epoch": 3.762174940898345, + "grad_norm": 2.9241607189178467, + "learning_rate": 1.5500675280376154e-06, + "loss": 0.4291, + "step": 7957 + }, + { + "epoch": 3.7626477541371157, + "grad_norm": 3.216491222381592, + "learning_rate": 1.549490519172465e-06, + "loss": 0.4065, + "step": 7958 + }, + { + "epoch": 3.7631205673758865, + "grad_norm": 2.8859689235687256, + 
"learning_rate": 1.548913569491371e-06, + "loss": 0.353, + "step": 7959 + }, + { + "epoch": 3.7635933806146573, + "grad_norm": 2.958773136138916, + "learning_rate": 1.5483366790302594e-06, + "loss": 0.3829, + "step": 7960 + }, + { + "epoch": 3.764066193853428, + "grad_norm": 2.868649482727051, + "learning_rate": 1.5477598478250505e-06, + "loss": 0.3591, + "step": 7961 + }, + { + "epoch": 3.7645390070921985, + "grad_norm": 2.6912996768951416, + "learning_rate": 1.5471830759116591e-06, + "loss": 0.3695, + "step": 7962 + }, + { + "epoch": 3.7650118203309693, + "grad_norm": 3.3318257331848145, + "learning_rate": 1.5466063633260004e-06, + "loss": 0.4126, + "step": 7963 + }, + { + "epoch": 3.76548463356974, + "grad_norm": 2.865525007247925, + "learning_rate": 1.5460297101039825e-06, + "loss": 0.4235, + "step": 7964 + }, + { + "epoch": 3.7659574468085104, + "grad_norm": 2.8639180660247803, + "learning_rate": 1.5454531162815123e-06, + "loss": 0.4392, + "step": 7965 + }, + { + "epoch": 3.766430260047281, + "grad_norm": 2.5752499103546143, + "learning_rate": 1.5448765818944902e-06, + "loss": 0.4113, + "step": 7966 + }, + { + "epoch": 3.766903073286052, + "grad_norm": 2.7622742652893066, + "learning_rate": 1.5443001069788155e-06, + "loss": 0.3785, + "step": 7967 + }, + { + "epoch": 3.767375886524823, + "grad_norm": 2.965579032897949, + "learning_rate": 1.5437236915703829e-06, + "loss": 0.335, + "step": 7968 + }, + { + "epoch": 3.7678486997635936, + "grad_norm": 3.0587408542633057, + "learning_rate": 1.5431473357050816e-06, + "loss": 0.4047, + "step": 7969 + }, + { + "epoch": 3.768321513002364, + "grad_norm": 3.2929413318634033, + "learning_rate": 1.5425710394188014e-06, + "loss": 0.4061, + "step": 7970 + }, + { + "epoch": 3.7687943262411348, + "grad_norm": 2.663043975830078, + "learning_rate": 1.541994802747424e-06, + "loss": 0.3478, + "step": 7971 + }, + { + "epoch": 3.7692671394799055, + "grad_norm": 3.0657591819763184, + "learning_rate": 1.5414186257268293e-06, + "loss": 0.3735, + "step": 7972 + }, + { + "epoch": 3.769739952718676, + "grad_norm": 2.963189125061035, + "learning_rate": 1.5408425083928939e-06, + "loss": 0.4743, + "step": 7973 + }, + { + "epoch": 3.7702127659574467, + "grad_norm": 3.1509387493133545, + "learning_rate": 1.540266450781489e-06, + "loss": 0.4164, + "step": 7974 + }, + { + "epoch": 3.7706855791962175, + "grad_norm": 3.4436306953430176, + "learning_rate": 1.539690452928485e-06, + "loss": 0.4583, + "step": 7975 + }, + { + "epoch": 3.7711583924349883, + "grad_norm": 3.1746156215667725, + "learning_rate": 1.5391145148697454e-06, + "loss": 0.4042, + "step": 7976 + }, + { + "epoch": 3.771631205673759, + "grad_norm": 3.531028985977173, + "learning_rate": 1.5385386366411304e-06, + "loss": 0.4304, + "step": 7977 + }, + { + "epoch": 3.7721040189125294, + "grad_norm": 2.867871046066284, + "learning_rate": 1.5379628182785e-06, + "loss": 0.4023, + "step": 7978 + }, + { + "epoch": 3.7725768321513002, + "grad_norm": 3.0504629611968994, + "learning_rate": 1.5373870598177051e-06, + "loss": 0.3785, + "step": 7979 + }, + { + "epoch": 3.773049645390071, + "grad_norm": 2.8188650608062744, + "learning_rate": 1.5368113612945983e-06, + "loss": 0.3808, + "step": 7980 + }, + { + "epoch": 3.7735224586288414, + "grad_norm": 3.0809133052825928, + "learning_rate": 1.5362357227450248e-06, + "loss": 0.3912, + "step": 7981 + }, + { + "epoch": 3.773995271867612, + "grad_norm": 3.223273277282715, + "learning_rate": 1.5356601442048257e-06, + "loss": 0.3802, + "step": 7982 + }, + { + "epoch": 
3.774468085106383, + "grad_norm": 2.7513339519500732, + "learning_rate": 1.535084625709842e-06, + "loss": 0.3822, + "step": 7983 + }, + { + "epoch": 3.774940898345154, + "grad_norm": 3.085592031478882, + "learning_rate": 1.5345091672959074e-06, + "loss": 0.4348, + "step": 7984 + }, + { + "epoch": 3.7754137115839246, + "grad_norm": 3.315108299255371, + "learning_rate": 1.5339337689988525e-06, + "loss": 0.4196, + "step": 7985 + }, + { + "epoch": 3.775886524822695, + "grad_norm": 3.713372230529785, + "learning_rate": 1.533358430854507e-06, + "loss": 0.4292, + "step": 7986 + }, + { + "epoch": 3.7763593380614657, + "grad_norm": 2.7899155616760254, + "learning_rate": 1.532783152898692e-06, + "loss": 0.3874, + "step": 7987 + }, + { + "epoch": 3.7768321513002365, + "grad_norm": 2.918851852416992, + "learning_rate": 1.5322079351672297e-06, + "loss": 0.4073, + "step": 7988 + }, + { + "epoch": 3.777304964539007, + "grad_norm": 3.13395619392395, + "learning_rate": 1.5316327776959361e-06, + "loss": 0.3441, + "step": 7989 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 3.2320916652679443, + "learning_rate": 1.531057680520623e-06, + "loss": 0.372, + "step": 7990 + }, + { + "epoch": 3.7782505910165485, + "grad_norm": 3.1130621433258057, + "learning_rate": 1.5304826436770991e-06, + "loss": 0.3514, + "step": 7991 + }, + { + "epoch": 3.7787234042553193, + "grad_norm": 3.223207712173462, + "learning_rate": 1.5299076672011696e-06, + "loss": 0.44, + "step": 7992 + }, + { + "epoch": 3.77919621749409, + "grad_norm": 3.0757877826690674, + "learning_rate": 1.5293327511286366e-06, + "loss": 0.4051, + "step": 7993 + }, + { + "epoch": 3.7796690307328604, + "grad_norm": 2.936678409576416, + "learning_rate": 1.528757895495297e-06, + "loss": 0.3965, + "step": 7994 + }, + { + "epoch": 3.780141843971631, + "grad_norm": 2.993445873260498, + "learning_rate": 1.5281831003369435e-06, + "loss": 0.4222, + "step": 7995 + }, + { + "epoch": 3.780614657210402, + "grad_norm": 2.9140853881835938, + "learning_rate": 1.5276083656893679e-06, + "loss": 0.3662, + "step": 7996 + }, + { + "epoch": 3.7810874704491724, + "grad_norm": 3.2649893760681152, + "learning_rate": 1.5270336915883549e-06, + "loss": 0.4272, + "step": 7997 + }, + { + "epoch": 3.781560283687943, + "grad_norm": 3.0631372928619385, + "learning_rate": 1.5264590780696887e-06, + "loss": 0.4111, + "step": 7998 + }, + { + "epoch": 3.782033096926714, + "grad_norm": 2.791299343109131, + "learning_rate": 1.5258845251691463e-06, + "loss": 0.416, + "step": 7999 + }, + { + "epoch": 3.7825059101654848, + "grad_norm": 3.262294054031372, + "learning_rate": 1.5253100329225023e-06, + "loss": 0.4236, + "step": 8000 + }, + { + "epoch": 3.7829787234042556, + "grad_norm": 2.574486017227173, + "learning_rate": 1.5247356013655295e-06, + "loss": 0.4089, + "step": 8001 + }, + { + "epoch": 3.783451536643026, + "grad_norm": 3.1566531658172607, + "learning_rate": 1.5241612305339936e-06, + "loss": 0.3955, + "step": 8002 + }, + { + "epoch": 3.7839243498817967, + "grad_norm": 2.5845813751220703, + "learning_rate": 1.5235869204636602e-06, + "loss": 0.3672, + "step": 8003 + }, + { + "epoch": 3.7843971631205675, + "grad_norm": 2.877570629119873, + "learning_rate": 1.5230126711902876e-06, + "loss": 0.3919, + "step": 8004 + }, + { + "epoch": 3.784869976359338, + "grad_norm": 3.183061122894287, + "learning_rate": 1.5224384827496314e-06, + "loss": 0.3291, + "step": 8005 + }, + { + "epoch": 3.7853427895981087, + "grad_norm": 3.0778391361236572, + "learning_rate": 1.5218643551774451e-06, + "loss": 0.3571, 
+ "step": 8006 + }, + { + "epoch": 3.7858156028368795, + "grad_norm": 3.2364399433135986, + "learning_rate": 1.5212902885094762e-06, + "loss": 0.4045, + "step": 8007 + }, + { + "epoch": 3.7862884160756503, + "grad_norm": 3.0571746826171875, + "learning_rate": 1.5207162827814687e-06, + "loss": 0.4181, + "step": 8008 + }, + { + "epoch": 3.786761229314421, + "grad_norm": 2.7215163707733154, + "learning_rate": 1.5201423380291652e-06, + "loss": 0.3328, + "step": 8009 + }, + { + "epoch": 3.7872340425531914, + "grad_norm": 3.0521233081817627, + "learning_rate": 1.5195684542883007e-06, + "loss": 0.4072, + "step": 8010 + }, + { + "epoch": 3.787706855791962, + "grad_norm": 2.541666269302368, + "learning_rate": 1.5189946315946104e-06, + "loss": 0.3293, + "step": 8011 + }, + { + "epoch": 3.788179669030733, + "grad_norm": 3.0041720867156982, + "learning_rate": 1.5184208699838232e-06, + "loss": 0.3998, + "step": 8012 + }, + { + "epoch": 3.7886524822695034, + "grad_norm": 3.0763001441955566, + "learning_rate": 1.5178471694916635e-06, + "loss": 0.38, + "step": 8013 + }, + { + "epoch": 3.789125295508274, + "grad_norm": 3.0788497924804688, + "learning_rate": 1.5172735301538544e-06, + "loss": 0.3986, + "step": 8014 + }, + { + "epoch": 3.789598108747045, + "grad_norm": 2.830225944519043, + "learning_rate": 1.5166999520061127e-06, + "loss": 0.3977, + "step": 8015 + }, + { + "epoch": 3.7900709219858157, + "grad_norm": 3.196078062057495, + "learning_rate": 1.5161264350841543e-06, + "loss": 0.4058, + "step": 8016 + }, + { + "epoch": 3.7905437352245865, + "grad_norm": 9.898200988769531, + "learning_rate": 1.5155529794236884e-06, + "loss": 0.3451, + "step": 8017 + }, + { + "epoch": 3.791016548463357, + "grad_norm": 3.0028066635131836, + "learning_rate": 1.514979585060421e-06, + "loss": 0.4029, + "step": 8018 + }, + { + "epoch": 3.7914893617021277, + "grad_norm": 2.984926223754883, + "learning_rate": 1.5144062520300562e-06, + "loss": 0.3995, + "step": 8019 + }, + { + "epoch": 3.7919621749408985, + "grad_norm": 2.938596487045288, + "learning_rate": 1.5138329803682925e-06, + "loss": 0.386, + "step": 8020 + }, + { + "epoch": 3.792434988179669, + "grad_norm": 2.992565393447876, + "learning_rate": 1.513259770110825e-06, + "loss": 0.3919, + "step": 8021 + }, + { + "epoch": 3.7929078014184396, + "grad_norm": 3.0182361602783203, + "learning_rate": 1.5126866212933453e-06, + "loss": 0.3506, + "step": 8022 + }, + { + "epoch": 3.7933806146572104, + "grad_norm": 3.2039108276367188, + "learning_rate": 1.5121135339515392e-06, + "loss": 0.3807, + "step": 8023 + }, + { + "epoch": 3.7938534278959812, + "grad_norm": 2.9290878772735596, + "learning_rate": 1.5115405081210927e-06, + "loss": 0.3596, + "step": 8024 + }, + { + "epoch": 3.794326241134752, + "grad_norm": 3.106152057647705, + "learning_rate": 1.510967543837683e-06, + "loss": 0.3703, + "step": 8025 + }, + { + "epoch": 3.7947990543735224, + "grad_norm": 2.9752190113067627, + "learning_rate": 1.510394641136989e-06, + "loss": 0.4049, + "step": 8026 + }, + { + "epoch": 3.795271867612293, + "grad_norm": 2.996206283569336, + "learning_rate": 1.5098218000546815e-06, + "loss": 0.4286, + "step": 8027 + }, + { + "epoch": 3.795744680851064, + "grad_norm": 2.9403493404388428, + "learning_rate": 1.5092490206264281e-06, + "loss": 0.3628, + "step": 8028 + }, + { + "epoch": 3.7962174940898343, + "grad_norm": 2.8101110458374023, + "learning_rate": 1.5086763028878943e-06, + "loss": 0.4016, + "step": 8029 + }, + { + "epoch": 3.796690307328605, + "grad_norm": 3.162264108657837, + "learning_rate": 
1.5081036468747401e-06, + "loss": 0.4133, + "step": 8030 + }, + { + "epoch": 3.797163120567376, + "grad_norm": 2.6871988773345947, + "learning_rate": 1.5075310526226223e-06, + "loss": 0.3748, + "step": 8031 + }, + { + "epoch": 3.7976359338061467, + "grad_norm": 2.997924327850342, + "learning_rate": 1.5069585201671944e-06, + "loss": 0.4083, + "step": 8032 + }, + { + "epoch": 3.7981087470449175, + "grad_norm": 2.8266279697418213, + "learning_rate": 1.506386049544104e-06, + "loss": 0.4488, + "step": 8033 + }, + { + "epoch": 3.798581560283688, + "grad_norm": 2.7106378078460693, + "learning_rate": 1.5058136407889985e-06, + "loss": 0.363, + "step": 8034 + }, + { + "epoch": 3.7990543735224587, + "grad_norm": 2.8983304500579834, + "learning_rate": 1.5052412939375183e-06, + "loss": 0.4156, + "step": 8035 + }, + { + "epoch": 3.7995271867612295, + "grad_norm": 3.0333914756774902, + "learning_rate": 1.5046690090253001e-06, + "loss": 0.3694, + "step": 8036 + }, + { + "epoch": 3.8, + "grad_norm": 2.872662305831909, + "learning_rate": 1.5040967860879785e-06, + "loss": 0.3492, + "step": 8037 + }, + { + "epoch": 3.8004728132387706, + "grad_norm": 2.7279646396636963, + "learning_rate": 1.5035246251611835e-06, + "loss": 0.327, + "step": 8038 + }, + { + "epoch": 3.8009456264775414, + "grad_norm": 2.969326972961426, + "learning_rate": 1.5029525262805405e-06, + "loss": 0.3977, + "step": 8039 + }, + { + "epoch": 3.801418439716312, + "grad_norm": 3.073899745941162, + "learning_rate": 1.5023804894816723e-06, + "loss": 0.388, + "step": 8040 + }, + { + "epoch": 3.801891252955083, + "grad_norm": 3.026284694671631, + "learning_rate": 1.5018085148001953e-06, + "loss": 0.3761, + "step": 8041 + }, + { + "epoch": 3.8023640661938534, + "grad_norm": 3.0478618144989014, + "learning_rate": 1.5012366022717262e-06, + "loss": 0.4415, + "step": 8042 + }, + { + "epoch": 3.802836879432624, + "grad_norm": 2.801584005355835, + "learning_rate": 1.500664751931874e-06, + "loss": 0.4079, + "step": 8043 + }, + { + "epoch": 3.803309692671395, + "grad_norm": 3.4839112758636475, + "learning_rate": 1.5000929638162459e-06, + "loss": 0.4391, + "step": 8044 + }, + { + "epoch": 3.8037825059101653, + "grad_norm": 2.6945605278015137, + "learning_rate": 1.4995212379604446e-06, + "loss": 0.3564, + "step": 8045 + }, + { + "epoch": 3.804255319148936, + "grad_norm": 3.0870234966278076, + "learning_rate": 1.4989495744000687e-06, + "loss": 0.3801, + "step": 8046 + }, + { + "epoch": 3.804728132387707, + "grad_norm": 2.975332021713257, + "learning_rate": 1.4983779731707135e-06, + "loss": 0.3408, + "step": 8047 + }, + { + "epoch": 3.8052009456264777, + "grad_norm": 2.9920027256011963, + "learning_rate": 1.497806434307969e-06, + "loss": 0.3875, + "step": 8048 + }, + { + "epoch": 3.8056737588652485, + "grad_norm": 3.1974916458129883, + "learning_rate": 1.4972349578474244e-06, + "loss": 0.4492, + "step": 8049 + }, + { + "epoch": 3.806146572104019, + "grad_norm": 2.839503526687622, + "learning_rate": 1.4966635438246622e-06, + "loss": 0.3785, + "step": 8050 + }, + { + "epoch": 3.8066193853427897, + "grad_norm": 3.274502992630005, + "learning_rate": 1.4960921922752603e-06, + "loss": 0.4404, + "step": 8051 + }, + { + "epoch": 3.8070921985815604, + "grad_norm": 3.0852737426757812, + "learning_rate": 1.4955209032347967e-06, + "loss": 0.4047, + "step": 8052 + }, + { + "epoch": 3.807565011820331, + "grad_norm": 2.9251608848571777, + "learning_rate": 1.4949496767388417e-06, + "loss": 0.3654, + "step": 8053 + }, + { + "epoch": 3.8080378250591016, + "grad_norm": 
2.518220901489258, + "learning_rate": 1.4943785128229635e-06, + "loss": 0.3157, + "step": 8054 + }, + { + "epoch": 3.8085106382978724, + "grad_norm": 3.3993279933929443, + "learning_rate": 1.4938074115227257e-06, + "loss": 0.4204, + "step": 8055 + }, + { + "epoch": 3.808983451536643, + "grad_norm": 3.2847096920013428, + "learning_rate": 1.4932363728736876e-06, + "loss": 0.339, + "step": 8056 + }, + { + "epoch": 3.8094562647754135, + "grad_norm": 2.7779417037963867, + "learning_rate": 1.492665396911407e-06, + "loss": 0.3538, + "step": 8057 + }, + { + "epoch": 3.8099290780141843, + "grad_norm": 2.958131790161133, + "learning_rate": 1.4920944836714353e-06, + "loss": 0.363, + "step": 8058 + }, + { + "epoch": 3.810401891252955, + "grad_norm": 3.1873440742492676, + "learning_rate": 1.491523633189319e-06, + "loss": 0.3785, + "step": 8059 + }, + { + "epoch": 3.8108747044917255, + "grad_norm": 3.132652759552002, + "learning_rate": 1.4909528455006055e-06, + "loss": 0.375, + "step": 8060 + }, + { + "epoch": 3.8113475177304963, + "grad_norm": 2.8598761558532715, + "learning_rate": 1.490382120640833e-06, + "loss": 0.4152, + "step": 8061 + }, + { + "epoch": 3.811820330969267, + "grad_norm": 3.115870952606201, + "learning_rate": 1.4898114586455399e-06, + "loss": 0.4609, + "step": 8062 + }, + { + "epoch": 3.812293144208038, + "grad_norm": 3.347944974899292, + "learning_rate": 1.4892408595502571e-06, + "loss": 0.3836, + "step": 8063 + }, + { + "epoch": 3.8127659574468087, + "grad_norm": 3.1747031211853027, + "learning_rate": 1.4886703233905132e-06, + "loss": 0.374, + "step": 8064 + }, + { + "epoch": 3.813238770685579, + "grad_norm": 2.945139169692993, + "learning_rate": 1.4880998502018345e-06, + "loss": 0.3652, + "step": 8065 + }, + { + "epoch": 3.81371158392435, + "grad_norm": 2.8911492824554443, + "learning_rate": 1.4875294400197403e-06, + "loss": 0.3683, + "step": 8066 + }, + { + "epoch": 3.8141843971631206, + "grad_norm": 3.080268383026123, + "learning_rate": 1.4869590928797491e-06, + "loss": 0.3919, + "step": 8067 + }, + { + "epoch": 3.814657210401891, + "grad_norm": 3.0834288597106934, + "learning_rate": 1.4863888088173734e-06, + "loss": 0.3988, + "step": 8068 + }, + { + "epoch": 3.815130023640662, + "grad_norm": 2.765702724456787, + "learning_rate": 1.4858185878681213e-06, + "loss": 0.3659, + "step": 8069 + }, + { + "epoch": 3.8156028368794326, + "grad_norm": 3.074059247970581, + "learning_rate": 1.4852484300674993e-06, + "loss": 0.3888, + "step": 8070 + }, + { + "epoch": 3.8160756501182034, + "grad_norm": 3.0009944438934326, + "learning_rate": 1.484678335451007e-06, + "loss": 0.417, + "step": 8071 + }, + { + "epoch": 3.816548463356974, + "grad_norm": 2.6661112308502197, + "learning_rate": 1.4841083040541438e-06, + "loss": 0.3544, + "step": 8072 + }, + { + "epoch": 3.8170212765957445, + "grad_norm": 2.7849514484405518, + "learning_rate": 1.4835383359124018e-06, + "loss": 0.3691, + "step": 8073 + }, + { + "epoch": 3.8174940898345153, + "grad_norm": 3.008070707321167, + "learning_rate": 1.4829684310612697e-06, + "loss": 0.4228, + "step": 8074 + }, + { + "epoch": 3.817966903073286, + "grad_norm": 2.649296998977661, + "learning_rate": 1.4823985895362348e-06, + "loss": 0.3642, + "step": 8075 + }, + { + "epoch": 3.8184397163120565, + "grad_norm": 2.6017661094665527, + "learning_rate": 1.4818288113727768e-06, + "loss": 0.3537, + "step": 8076 + }, + { + "epoch": 3.8189125295508273, + "grad_norm": 2.9071972370147705, + "learning_rate": 1.481259096606375e-06, + "loss": 0.3096, + "step": 8077 + }, + { + 
"epoch": 3.819385342789598, + "grad_norm": 3.0866518020629883, + "learning_rate": 1.4806894452725024e-06, + "loss": 0.4148, + "step": 8078 + }, + { + "epoch": 3.819858156028369, + "grad_norm": 3.2099499702453613, + "learning_rate": 1.4801198574066272e-06, + "loss": 0.4058, + "step": 8079 + }, + { + "epoch": 3.8203309692671397, + "grad_norm": 3.0204920768737793, + "learning_rate": 1.4795503330442176e-06, + "loss": 0.3427, + "step": 8080 + }, + { + "epoch": 3.82080378250591, + "grad_norm": 2.88667368888855, + "learning_rate": 1.478980872220734e-06, + "loss": 0.4075, + "step": 8081 + }, + { + "epoch": 3.821276595744681, + "grad_norm": 2.926673173904419, + "learning_rate": 1.4784114749716338e-06, + "loss": 0.3449, + "step": 8082 + }, + { + "epoch": 3.8217494089834516, + "grad_norm": 2.818936347961426, + "learning_rate": 1.4778421413323723e-06, + "loss": 0.3628, + "step": 8083 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 2.960322380065918, + "learning_rate": 1.4772728713383983e-06, + "loss": 0.3669, + "step": 8084 + }, + { + "epoch": 3.8226950354609928, + "grad_norm": 2.940131902694702, + "learning_rate": 1.4767036650251584e-06, + "loss": 0.4357, + "step": 8085 + }, + { + "epoch": 3.8231678486997636, + "grad_norm": 2.9251785278320312, + "learning_rate": 1.4761345224280943e-06, + "loss": 0.4046, + "step": 8086 + }, + { + "epoch": 3.8236406619385344, + "grad_norm": 3.115590810775757, + "learning_rate": 1.475565443582643e-06, + "loss": 0.3712, + "step": 8087 + }, + { + "epoch": 3.824113475177305, + "grad_norm": 2.5968618392944336, + "learning_rate": 1.4749964285242408e-06, + "loss": 0.3432, + "step": 8088 + }, + { + "epoch": 3.8245862884160755, + "grad_norm": 3.195409059524536, + "learning_rate": 1.4744274772883148e-06, + "loss": 0.3717, + "step": 8089 + }, + { + "epoch": 3.8250591016548463, + "grad_norm": 2.8658018112182617, + "learning_rate": 1.4738585899102942e-06, + "loss": 0.3807, + "step": 8090 + }, + { + "epoch": 3.825531914893617, + "grad_norm": 2.9005510807037354, + "learning_rate": 1.4732897664255998e-06, + "loss": 0.3988, + "step": 8091 + }, + { + "epoch": 3.8260047281323875, + "grad_norm": 3.9155731201171875, + "learning_rate": 1.472721006869649e-06, + "loss": 0.3981, + "step": 8092 + }, + { + "epoch": 3.8264775413711583, + "grad_norm": 2.89312744140625, + "learning_rate": 1.4721523112778575e-06, + "loss": 0.3286, + "step": 8093 + }, + { + "epoch": 3.826950354609929, + "grad_norm": 3.006071090698242, + "learning_rate": 1.4715836796856332e-06, + "loss": 0.3901, + "step": 8094 + }, + { + "epoch": 3.8274231678487, + "grad_norm": 3.083411693572998, + "learning_rate": 1.4710151121283845e-06, + "loss": 0.3741, + "step": 8095 + }, + { + "epoch": 3.8278959810874706, + "grad_norm": 2.864989995956421, + "learning_rate": 1.4704466086415131e-06, + "loss": 0.3887, + "step": 8096 + }, + { + "epoch": 3.828368794326241, + "grad_norm": 2.4846417903900146, + "learning_rate": 1.4698781692604158e-06, + "loss": 0.33, + "step": 8097 + }, + { + "epoch": 3.828841607565012, + "grad_norm": 3.2497007846832275, + "learning_rate": 1.4693097940204893e-06, + "loss": 0.4011, + "step": 8098 + }, + { + "epoch": 3.8293144208037826, + "grad_norm": 3.0079777240753174, + "learning_rate": 1.4687414829571218e-06, + "loss": 0.4263, + "step": 8099 + }, + { + "epoch": 3.829787234042553, + "grad_norm": 2.8538410663604736, + "learning_rate": 1.4681732361057005e-06, + "loss": 0.3651, + "step": 8100 + }, + { + "epoch": 3.8302600472813237, + "grad_norm": 3.238163948059082, + "learning_rate": 1.4676050535016076e-06, + 
"loss": 0.392, + "step": 8101 + }, + { + "epoch": 3.8307328605200945, + "grad_norm": 2.9991304874420166, + "learning_rate": 1.46703693518022e-06, + "loss": 0.3643, + "step": 8102 + }, + { + "epoch": 3.8312056737588653, + "grad_norm": 2.9816839694976807, + "learning_rate": 1.466468881176914e-06, + "loss": 0.3803, + "step": 8103 + }, + { + "epoch": 3.831678486997636, + "grad_norm": 3.2009265422821045, + "learning_rate": 1.465900891527059e-06, + "loss": 0.3828, + "step": 8104 + }, + { + "epoch": 3.8321513002364065, + "grad_norm": 2.9479124546051025, + "learning_rate": 1.4653329662660201e-06, + "loss": 0.3683, + "step": 8105 + }, + { + "epoch": 3.8326241134751773, + "grad_norm": 2.938507080078125, + "learning_rate": 1.4647651054291614e-06, + "loss": 0.3703, + "step": 8106 + }, + { + "epoch": 3.833096926713948, + "grad_norm": 2.7777645587921143, + "learning_rate": 1.4641973090518397e-06, + "loss": 0.3982, + "step": 8107 + }, + { + "epoch": 3.8335697399527184, + "grad_norm": 3.2470149993896484, + "learning_rate": 1.4636295771694099e-06, + "loss": 0.3748, + "step": 8108 + }, + { + "epoch": 3.8340425531914892, + "grad_norm": 2.869310140609741, + "learning_rate": 1.4630619098172223e-06, + "loss": 0.3577, + "step": 8109 + }, + { + "epoch": 3.83451536643026, + "grad_norm": 3.1245369911193848, + "learning_rate": 1.4624943070306225e-06, + "loss": 0.4518, + "step": 8110 + }, + { + "epoch": 3.834988179669031, + "grad_norm": 3.0390701293945312, + "learning_rate": 1.4619267688449529e-06, + "loss": 0.5051, + "step": 8111 + }, + { + "epoch": 3.8354609929078016, + "grad_norm": 2.929943799972534, + "learning_rate": 1.4613592952955507e-06, + "loss": 0.4207, + "step": 8112 + }, + { + "epoch": 3.835933806146572, + "grad_norm": 3.17008376121521, + "learning_rate": 1.4607918864177523e-06, + "loss": 0.3836, + "step": 8113 + }, + { + "epoch": 3.8364066193853428, + "grad_norm": 3.0689237117767334, + "learning_rate": 1.460224542246886e-06, + "loss": 0.3413, + "step": 8114 + }, + { + "epoch": 3.8368794326241136, + "grad_norm": 2.9966423511505127, + "learning_rate": 1.4596572628182774e-06, + "loss": 0.4367, + "step": 8115 + }, + { + "epoch": 3.837352245862884, + "grad_norm": 3.0572052001953125, + "learning_rate": 1.45909004816725e-06, + "loss": 0.4089, + "step": 8116 + }, + { + "epoch": 3.8378250591016547, + "grad_norm": 2.911263942718506, + "learning_rate": 1.4585228983291203e-06, + "loss": 0.3848, + "step": 8117 + }, + { + "epoch": 3.8382978723404255, + "grad_norm": 2.9233853816986084, + "learning_rate": 1.4579558133392038e-06, + "loss": 0.4012, + "step": 8118 + }, + { + "epoch": 3.8387706855791963, + "grad_norm": 2.7813868522644043, + "learning_rate": 1.4573887932328097e-06, + "loss": 0.3898, + "step": 8119 + }, + { + "epoch": 3.839243498817967, + "grad_norm": 2.8727006912231445, + "learning_rate": 1.4568218380452436e-06, + "loss": 0.3965, + "step": 8120 + }, + { + "epoch": 3.8397163120567375, + "grad_norm": 3.0381174087524414, + "learning_rate": 1.4562549478118077e-06, + "loss": 0.4304, + "step": 8121 + }, + { + "epoch": 3.8401891252955083, + "grad_norm": 2.7406346797943115, + "learning_rate": 1.4556881225677982e-06, + "loss": 0.3636, + "step": 8122 + }, + { + "epoch": 3.840661938534279, + "grad_norm": 3.3900108337402344, + "learning_rate": 1.4551213623485111e-06, + "loss": 0.3863, + "step": 8123 + }, + { + "epoch": 3.8411347517730494, + "grad_norm": 2.885150909423828, + "learning_rate": 1.4545546671892354e-06, + "loss": 0.3679, + "step": 8124 + }, + { + "epoch": 3.84160756501182, + "grad_norm": 3.3361690044403076, + 
"learning_rate": 1.4539880371252555e-06, + "loss": 0.4333, + "step": 8125 + }, + { + "epoch": 3.842080378250591, + "grad_norm": 3.1547763347625732, + "learning_rate": 1.4534214721918545e-06, + "loss": 0.4477, + "step": 8126 + }, + { + "epoch": 3.842553191489362, + "grad_norm": 3.0337510108947754, + "learning_rate": 1.4528549724243095e-06, + "loss": 0.3647, + "step": 8127 + }, + { + "epoch": 3.8430260047281326, + "grad_norm": 2.8390069007873535, + "learning_rate": 1.452288537857893e-06, + "loss": 0.3698, + "step": 8128 + }, + { + "epoch": 3.843498817966903, + "grad_norm": 2.857513427734375, + "learning_rate": 1.451722168527876e-06, + "loss": 0.3842, + "step": 8129 + }, + { + "epoch": 3.8439716312056738, + "grad_norm": 3.015320062637329, + "learning_rate": 1.451155864469522e-06, + "loss": 0.4058, + "step": 8130 + }, + { + "epoch": 3.8444444444444446, + "grad_norm": 2.923957347869873, + "learning_rate": 1.450589625718094e-06, + "loss": 0.3976, + "step": 8131 + }, + { + "epoch": 3.844917257683215, + "grad_norm": 3.332338571548462, + "learning_rate": 1.4500234523088492e-06, + "loss": 0.4118, + "step": 8132 + }, + { + "epoch": 3.8453900709219857, + "grad_norm": 3.0403711795806885, + "learning_rate": 1.4494573442770381e-06, + "loss": 0.3715, + "step": 8133 + }, + { + "epoch": 3.8458628841607565, + "grad_norm": 3.2310287952423096, + "learning_rate": 1.4488913016579135e-06, + "loss": 0.4587, + "step": 8134 + }, + { + "epoch": 3.8463356973995273, + "grad_norm": 3.091282844543457, + "learning_rate": 1.448325324486718e-06, + "loss": 0.4234, + "step": 8135 + }, + { + "epoch": 3.846808510638298, + "grad_norm": 3.11161208152771, + "learning_rate": 1.4477594127986933e-06, + "loss": 0.4176, + "step": 8136 + }, + { + "epoch": 3.8472813238770684, + "grad_norm": 3.21042537689209, + "learning_rate": 1.4471935666290751e-06, + "loss": 0.4326, + "step": 8137 + }, + { + "epoch": 3.8477541371158392, + "grad_norm": 3.411543846130371, + "learning_rate": 1.4466277860130981e-06, + "loss": 0.4525, + "step": 8138 + }, + { + "epoch": 3.84822695035461, + "grad_norm": 3.0475308895111084, + "learning_rate": 1.4460620709859898e-06, + "loss": 0.3906, + "step": 8139 + }, + { + "epoch": 3.8486997635933804, + "grad_norm": 2.989367723464966, + "learning_rate": 1.4454964215829742e-06, + "loss": 0.3732, + "step": 8140 + }, + { + "epoch": 3.849172576832151, + "grad_norm": 2.8130393028259277, + "learning_rate": 1.4449308378392734e-06, + "loss": 0.3733, + "step": 8141 + }, + { + "epoch": 3.849645390070922, + "grad_norm": 12.2243013381958, + "learning_rate": 1.444365319790103e-06, + "loss": 0.3506, + "step": 8142 + }, + { + "epoch": 3.850118203309693, + "grad_norm": 3.075556516647339, + "learning_rate": 1.4437998674706743e-06, + "loss": 0.376, + "step": 8143 + }, + { + "epoch": 3.8505910165484636, + "grad_norm": 2.765650510787964, + "learning_rate": 1.4432344809161974e-06, + "loss": 0.3865, + "step": 8144 + }, + { + "epoch": 3.851063829787234, + "grad_norm": 3.171588897705078, + "learning_rate": 1.4426691601618747e-06, + "loss": 0.4391, + "step": 8145 + }, + { + "epoch": 3.8515366430260047, + "grad_norm": 2.8378992080688477, + "learning_rate": 1.4421039052429083e-06, + "loss": 0.3984, + "step": 8146 + }, + { + "epoch": 3.8520094562647755, + "grad_norm": 2.6588387489318848, + "learning_rate": 1.4415387161944929e-06, + "loss": 0.3961, + "step": 8147 + }, + { + "epoch": 3.852482269503546, + "grad_norm": 2.919325351715088, + "learning_rate": 1.4409735930518197e-06, + "loss": 0.4058, + "step": 8148 + }, + { + "epoch": 3.8529550827423167, + 
"grad_norm": 3.2239115238189697, + "learning_rate": 1.4404085358500778e-06, + "loss": 0.4018, + "step": 8149 + }, + { + "epoch": 3.8534278959810875, + "grad_norm": 3.2509875297546387, + "learning_rate": 1.4398435446244502e-06, + "loss": 0.4078, + "step": 8150 + }, + { + "epoch": 3.8539007092198583, + "grad_norm": 3.124782085418701, + "learning_rate": 1.4392786194101155e-06, + "loss": 0.4459, + "step": 8151 + }, + { + "epoch": 3.854373522458629, + "grad_norm": 2.924095392227173, + "learning_rate": 1.4387137602422512e-06, + "loss": 0.3686, + "step": 8152 + }, + { + "epoch": 3.8548463356973994, + "grad_norm": 2.9307191371917725, + "learning_rate": 1.4381489671560272e-06, + "loss": 0.4345, + "step": 8153 + }, + { + "epoch": 3.8553191489361702, + "grad_norm": 2.868488073348999, + "learning_rate": 1.4375842401866113e-06, + "loss": 0.366, + "step": 8154 + }, + { + "epoch": 3.855791962174941, + "grad_norm": 2.9893085956573486, + "learning_rate": 1.4370195793691661e-06, + "loss": 0.3401, + "step": 8155 + }, + { + "epoch": 3.8562647754137114, + "grad_norm": 3.0113472938537598, + "learning_rate": 1.4364549847388492e-06, + "loss": 0.4051, + "step": 8156 + }, + { + "epoch": 3.856737588652482, + "grad_norm": 3.4693121910095215, + "learning_rate": 1.4358904563308184e-06, + "loss": 0.4505, + "step": 8157 + }, + { + "epoch": 3.857210401891253, + "grad_norm": 2.9048118591308594, + "learning_rate": 1.4353259941802216e-06, + "loss": 0.3973, + "step": 8158 + }, + { + "epoch": 3.8576832151300238, + "grad_norm": 3.264910936355591, + "learning_rate": 1.434761598322208e-06, + "loss": 0.4317, + "step": 8159 + }, + { + "epoch": 3.8581560283687946, + "grad_norm": 2.973742723464966, + "learning_rate": 1.4341972687919186e-06, + "loss": 0.3896, + "step": 8160 + }, + { + "epoch": 3.858628841607565, + "grad_norm": 2.7802605628967285, + "learning_rate": 1.4336330056244906e-06, + "loss": 0.4063, + "step": 8161 + }, + { + "epoch": 3.8591016548463357, + "grad_norm": 3.1401731967926025, + "learning_rate": 1.433068808855061e-06, + "loss": 0.4068, + "step": 8162 + }, + { + "epoch": 3.8595744680851065, + "grad_norm": 3.132723331451416, + "learning_rate": 1.432504678518757e-06, + "loss": 0.4724, + "step": 8163 + }, + { + "epoch": 3.860047281323877, + "grad_norm": 2.94944167137146, + "learning_rate": 1.4319406146507068e-06, + "loss": 0.3666, + "step": 8164 + }, + { + "epoch": 3.8605200945626477, + "grad_norm": 2.972322463989258, + "learning_rate": 1.4313766172860311e-06, + "loss": 0.4226, + "step": 8165 + }, + { + "epoch": 3.8609929078014185, + "grad_norm": 2.9808123111724854, + "learning_rate": 1.430812686459847e-06, + "loss": 0.4079, + "step": 8166 + }, + { + "epoch": 3.8614657210401893, + "grad_norm": 2.9656291007995605, + "learning_rate": 1.4302488222072698e-06, + "loss": 0.3423, + "step": 8167 + }, + { + "epoch": 3.86193853427896, + "grad_norm": 2.886765241622925, + "learning_rate": 1.4296850245634073e-06, + "loss": 0.3577, + "step": 8168 + }, + { + "epoch": 3.8624113475177304, + "grad_norm": 3.0613043308258057, + "learning_rate": 1.4291212935633653e-06, + "loss": 0.4121, + "step": 8169 + }, + { + "epoch": 3.862884160756501, + "grad_norm": 2.842050313949585, + "learning_rate": 1.4285576292422445e-06, + "loss": 0.373, + "step": 8170 + }, + { + "epoch": 3.863356973995272, + "grad_norm": 3.0604517459869385, + "learning_rate": 1.4279940316351413e-06, + "loss": 0.3938, + "step": 8171 + }, + { + "epoch": 3.8638297872340424, + "grad_norm": 3.9742302894592285, + "learning_rate": 1.42743050077715e-06, + "loss": 0.4463, + "step": 8172 + 
}, + { + "epoch": 3.864302600472813, + "grad_norm": 2.8330607414245605, + "learning_rate": 1.4268670367033572e-06, + "loss": 0.4423, + "step": 8173 + }, + { + "epoch": 3.864775413711584, + "grad_norm": 2.953256607055664, + "learning_rate": 1.4263036394488497e-06, + "loss": 0.3553, + "step": 8174 + }, + { + "epoch": 3.8652482269503547, + "grad_norm": 2.865849018096924, + "learning_rate": 1.4257403090487065e-06, + "loss": 0.3348, + "step": 8175 + }, + { + "epoch": 3.8657210401891255, + "grad_norm": 2.712502956390381, + "learning_rate": 1.4251770455380027e-06, + "loss": 0.3896, + "step": 8176 + }, + { + "epoch": 3.866193853427896, + "grad_norm": 2.798898220062256, + "learning_rate": 1.4246138489518123e-06, + "loss": 0.4275, + "step": 8177 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 2.830899953842163, + "learning_rate": 1.4240507193252023e-06, + "loss": 0.3952, + "step": 8178 + }, + { + "epoch": 3.8671394799054375, + "grad_norm": 2.5789451599121094, + "learning_rate": 1.4234876566932348e-06, + "loss": 0.3483, + "step": 8179 + }, + { + "epoch": 3.867612293144208, + "grad_norm": 2.8513095378875732, + "learning_rate": 1.422924661090972e-06, + "loss": 0.3403, + "step": 8180 + }, + { + "epoch": 3.8680851063829786, + "grad_norm": 3.5031449794769287, + "learning_rate": 1.4223617325534664e-06, + "loss": 0.3964, + "step": 8181 + }, + { + "epoch": 3.8685579196217494, + "grad_norm": 2.7495479583740234, + "learning_rate": 1.4217988711157715e-06, + "loss": 0.3376, + "step": 8182 + }, + { + "epoch": 3.8690307328605202, + "grad_norm": 2.8609421253204346, + "learning_rate": 1.421236076812933e-06, + "loss": 0.3967, + "step": 8183 + }, + { + "epoch": 3.869503546099291, + "grad_norm": 3.0624637603759766, + "learning_rate": 1.420673349679994e-06, + "loss": 0.3764, + "step": 8184 + }, + { + "epoch": 3.8699763593380614, + "grad_norm": 3.3084404468536377, + "learning_rate": 1.4201106897519926e-06, + "loss": 0.4567, + "step": 8185 + }, + { + "epoch": 3.870449172576832, + "grad_norm": 3.164116382598877, + "learning_rate": 1.4195480970639624e-06, + "loss": 0.4217, + "step": 8186 + }, + { + "epoch": 3.870921985815603, + "grad_norm": 2.971390724182129, + "learning_rate": 1.4189855716509355e-06, + "loss": 0.3981, + "step": 8187 + }, + { + "epoch": 3.8713947990543733, + "grad_norm": 3.0537233352661133, + "learning_rate": 1.418423113547937e-06, + "loss": 0.4093, + "step": 8188 + }, + { + "epoch": 3.871867612293144, + "grad_norm": 3.698120594024658, + "learning_rate": 1.4178607227899877e-06, + "loss": 0.3158, + "step": 8189 + }, + { + "epoch": 3.872340425531915, + "grad_norm": 3.0320451259613037, + "learning_rate": 1.417298399412107e-06, + "loss": 0.3903, + "step": 8190 + }, + { + "epoch": 3.8728132387706857, + "grad_norm": 2.913296699523926, + "learning_rate": 1.4167361434493068e-06, + "loss": 0.3396, + "step": 8191 + }, + { + "epoch": 3.8732860520094565, + "grad_norm": 3.011906147003174, + "learning_rate": 1.4161739549365976e-06, + "loss": 0.3915, + "step": 8192 + }, + { + "epoch": 3.873758865248227, + "grad_norm": 3.2707724571228027, + "learning_rate": 1.4156118339089842e-06, + "loss": 0.4466, + "step": 8193 + }, + { + "epoch": 3.8742316784869977, + "grad_norm": 3.036747694015503, + "learning_rate": 1.4150497804014656e-06, + "loss": 0.4095, + "step": 8194 + }, + { + "epoch": 3.8747044917257685, + "grad_norm": 2.8851394653320312, + "learning_rate": 1.4144877944490411e-06, + "loss": 0.4235, + "step": 8195 + }, + { + "epoch": 3.875177304964539, + "grad_norm": 3.099785566329956, + "learning_rate": 
1.4139258760867008e-06, + "loss": 0.4102, + "step": 8196 + }, + { + "epoch": 3.8756501182033096, + "grad_norm": 3.0752081871032715, + "learning_rate": 1.4133640253494347e-06, + "loss": 0.4165, + "step": 8197 + }, + { + "epoch": 3.8761229314420804, + "grad_norm": 2.842257261276245, + "learning_rate": 1.412802242272226e-06, + "loss": 0.3573, + "step": 8198 + }, + { + "epoch": 3.876595744680851, + "grad_norm": 2.93868350982666, + "learning_rate": 1.4122405268900547e-06, + "loss": 0.36, + "step": 8199 + }, + { + "epoch": 3.877068557919622, + "grad_norm": 2.674356460571289, + "learning_rate": 1.411678879237896e-06, + "loss": 0.3763, + "step": 8200 + }, + { + "epoch": 3.8775413711583924, + "grad_norm": 2.710617780685425, + "learning_rate": 1.411117299350721e-06, + "loss": 0.358, + "step": 8201 + }, + { + "epoch": 3.878014184397163, + "grad_norm": 3.0299410820007324, + "learning_rate": 1.4105557872634968e-06, + "loss": 0.3723, + "step": 8202 + }, + { + "epoch": 3.878486997635934, + "grad_norm": 3.1951241493225098, + "learning_rate": 1.4099943430111874e-06, + "loss": 0.4163, + "step": 8203 + }, + { + "epoch": 3.8789598108747043, + "grad_norm": 2.752410411834717, + "learning_rate": 1.4094329666287495e-06, + "loss": 0.3753, + "step": 8204 + }, + { + "epoch": 3.879432624113475, + "grad_norm": 3.1242496967315674, + "learning_rate": 1.40887165815114e-06, + "loss": 0.3694, + "step": 8205 + }, + { + "epoch": 3.879905437352246, + "grad_norm": 5.16750431060791, + "learning_rate": 1.4083104176133079e-06, + "loss": 0.3869, + "step": 8206 + }, + { + "epoch": 3.8803782505910167, + "grad_norm": 3.2995245456695557, + "learning_rate": 1.4077492450501978e-06, + "loss": 0.4194, + "step": 8207 + }, + { + "epoch": 3.8808510638297875, + "grad_norm": 3.506807804107666, + "learning_rate": 1.4071881404967541e-06, + "loss": 0.3873, + "step": 8208 + }, + { + "epoch": 3.881323877068558, + "grad_norm": 3.1201252937316895, + "learning_rate": 1.4066271039879123e-06, + "loss": 0.3625, + "step": 8209 + }, + { + "epoch": 3.8817966903073287, + "grad_norm": 2.870683193206787, + "learning_rate": 1.4060661355586073e-06, + "loss": 0.4039, + "step": 8210 + }, + { + "epoch": 3.8822695035460995, + "grad_norm": 3.177701234817505, + "learning_rate": 1.405505235243767e-06, + "loss": 0.3715, + "step": 8211 + }, + { + "epoch": 3.88274231678487, + "grad_norm": 3.0319771766662598, + "learning_rate": 1.4049444030783157e-06, + "loss": 0.3588, + "step": 8212 + }, + { + "epoch": 3.8832151300236406, + "grad_norm": 2.4598889350891113, + "learning_rate": 1.404383639097176e-06, + "loss": 0.2788, + "step": 8213 + }, + { + "epoch": 3.8836879432624114, + "grad_norm": 2.916987419128418, + "learning_rate": 1.4038229433352623e-06, + "loss": 0.4167, + "step": 8214 + }, + { + "epoch": 3.884160756501182, + "grad_norm": 3.005075216293335, + "learning_rate": 1.4032623158274872e-06, + "loss": 0.4251, + "step": 8215 + }, + { + "epoch": 3.8846335697399526, + "grad_norm": 3.1718621253967285, + "learning_rate": 1.4027017566087591e-06, + "loss": 0.383, + "step": 8216 + }, + { + "epoch": 3.8851063829787233, + "grad_norm": 2.954662322998047, + "learning_rate": 1.402141265713981e-06, + "loss": 0.373, + "step": 8217 + }, + { + "epoch": 3.885579196217494, + "grad_norm": 3.408008337020874, + "learning_rate": 1.4015808431780526e-06, + "loss": 0.4216, + "step": 8218 + }, + { + "epoch": 3.8860520094562645, + "grad_norm": 3.1599369049072266, + "learning_rate": 1.4010204890358675e-06, + "loss": 0.4544, + "step": 8219 + }, + { + "epoch": 3.8865248226950353, + "grad_norm": 
2.8919107913970947, + "learning_rate": 1.4004602033223186e-06, + "loss": 0.3785, + "step": 8220 + }, + { + "epoch": 3.886997635933806, + "grad_norm": 3.522581100463867, + "learning_rate": 1.3998999860722918e-06, + "loss": 0.4276, + "step": 8221 + }, + { + "epoch": 3.887470449172577, + "grad_norm": 2.9278945922851562, + "learning_rate": 1.399339837320668e-06, + "loss": 0.409, + "step": 8222 + }, + { + "epoch": 3.8879432624113477, + "grad_norm": 3.032557725906372, + "learning_rate": 1.398779757102327e-06, + "loss": 0.3973, + "step": 8223 + }, + { + "epoch": 3.888416075650118, + "grad_norm": 2.843118667602539, + "learning_rate": 1.3982197454521423e-06, + "loss": 0.3418, + "step": 8224 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 2.8620638847351074, + "learning_rate": 1.3976598024049815e-06, + "loss": 0.3751, + "step": 8225 + }, + { + "epoch": 3.8893617021276596, + "grad_norm": 2.532327175140381, + "learning_rate": 1.3970999279957124e-06, + "loss": 0.3541, + "step": 8226 + }, + { + "epoch": 3.88983451536643, + "grad_norm": 3.1074535846710205, + "learning_rate": 1.3965401222591935e-06, + "loss": 0.4706, + "step": 8227 + }, + { + "epoch": 3.890307328605201, + "grad_norm": 3.1558735370635986, + "learning_rate": 1.3959803852302839e-06, + "loss": 0.448, + "step": 8228 + }, + { + "epoch": 3.8907801418439716, + "grad_norm": 3.0862064361572266, + "learning_rate": 1.3954207169438344e-06, + "loss": 0.3308, + "step": 8229 + }, + { + "epoch": 3.8912529550827424, + "grad_norm": 2.9246280193328857, + "learning_rate": 1.3948611174346927e-06, + "loss": 0.3771, + "step": 8230 + }, + { + "epoch": 3.891725768321513, + "grad_norm": 2.7959492206573486, + "learning_rate": 1.394301586737704e-06, + "loss": 0.4248, + "step": 8231 + }, + { + "epoch": 3.8921985815602835, + "grad_norm": 2.787670373916626, + "learning_rate": 1.3937421248877075e-06, + "loss": 0.3416, + "step": 8232 + }, + { + "epoch": 3.8926713947990543, + "grad_norm": 3.0775792598724365, + "learning_rate": 1.393182731919538e-06, + "loss": 0.4345, + "step": 8233 + }, + { + "epoch": 3.893144208037825, + "grad_norm": 2.6338887214660645, + "learning_rate": 1.3926234078680268e-06, + "loss": 0.3995, + "step": 8234 + }, + { + "epoch": 3.8936170212765955, + "grad_norm": 2.9975900650024414, + "learning_rate": 1.392064152767999e-06, + "loss": 0.3997, + "step": 8235 + }, + { + "epoch": 3.8940898345153663, + "grad_norm": 2.8615779876708984, + "learning_rate": 1.3915049666542791e-06, + "loss": 0.3687, + "step": 8236 + }, + { + "epoch": 3.894562647754137, + "grad_norm": 3.0132436752319336, + "learning_rate": 1.3909458495616835e-06, + "loss": 0.4085, + "step": 8237 + }, + { + "epoch": 3.895035460992908, + "grad_norm": 3.141291379928589, + "learning_rate": 1.3903868015250278e-06, + "loss": 0.3903, + "step": 8238 + }, + { + "epoch": 3.8955082742316787, + "grad_norm": 2.6998603343963623, + "learning_rate": 1.3898278225791204e-06, + "loss": 0.3576, + "step": 8239 + }, + { + "epoch": 3.895981087470449, + "grad_norm": 3.212578535079956, + "learning_rate": 1.3892689127587656e-06, + "loss": 0.4321, + "step": 8240 + }, + { + "epoch": 3.89645390070922, + "grad_norm": 3.15732741355896, + "learning_rate": 1.3887100720987662e-06, + "loss": 0.4247, + "step": 8241 + }, + { + "epoch": 3.8969267139479906, + "grad_norm": 2.6001040935516357, + "learning_rate": 1.3881513006339168e-06, + "loss": 0.3376, + "step": 8242 + }, + { + "epoch": 3.897399527186761, + "grad_norm": 2.766188859939575, + "learning_rate": 1.3875925983990113e-06, + "loss": 0.3771, + "step": 8243 + }, + { + 
"epoch": 3.8978723404255318, + "grad_norm": 2.7471580505371094, + "learning_rate": 1.3870339654288372e-06, + "loss": 0.3311, + "step": 8244 + }, + { + "epoch": 3.8983451536643026, + "grad_norm": 3.577664375305176, + "learning_rate": 1.3864754017581769e-06, + "loss": 0.3725, + "step": 8245 + }, + { + "epoch": 3.8988179669030734, + "grad_norm": 2.8747243881225586, + "learning_rate": 1.3859169074218116e-06, + "loss": 0.3706, + "step": 8246 + }, + { + "epoch": 3.899290780141844, + "grad_norm": 2.5249671936035156, + "learning_rate": 1.3853584824545152e-06, + "loss": 0.3621, + "step": 8247 + }, + { + "epoch": 3.8997635933806145, + "grad_norm": 2.7290890216827393, + "learning_rate": 1.3848001268910589e-06, + "loss": 0.3209, + "step": 8248 + }, + { + "epoch": 3.9002364066193853, + "grad_norm": 3.0917534828186035, + "learning_rate": 1.3842418407662084e-06, + "loss": 0.3904, + "step": 8249 + }, + { + "epoch": 3.900709219858156, + "grad_norm": 3.099494695663452, + "learning_rate": 1.383683624114725e-06, + "loss": 0.3714, + "step": 8250 + }, + { + "epoch": 3.9011820330969265, + "grad_norm": 3.077505588531494, + "learning_rate": 1.3831254769713687e-06, + "loss": 0.4166, + "step": 8251 + }, + { + "epoch": 3.9016548463356973, + "grad_norm": 2.9983766078948975, + "learning_rate": 1.3825673993708915e-06, + "loss": 0.3909, + "step": 8252 + }, + { + "epoch": 3.902127659574468, + "grad_norm": 2.7958667278289795, + "learning_rate": 1.3820093913480415e-06, + "loss": 0.3966, + "step": 8253 + }, + { + "epoch": 3.902600472813239, + "grad_norm": 3.0938336849212646, + "learning_rate": 1.3814514529375656e-06, + "loss": 0.4118, + "step": 8254 + }, + { + "epoch": 3.9030732860520096, + "grad_norm": 3.2711637020111084, + "learning_rate": 1.3808935841742016e-06, + "loss": 0.4021, + "step": 8255 + }, + { + "epoch": 3.90354609929078, + "grad_norm": 3.23563814163208, + "learning_rate": 1.3803357850926885e-06, + "loss": 0.3679, + "step": 8256 + }, + { + "epoch": 3.904018912529551, + "grad_norm": 2.77942156791687, + "learning_rate": 1.3797780557277563e-06, + "loss": 0.3938, + "step": 8257 + }, + { + "epoch": 3.9044917257683216, + "grad_norm": 3.1273257732391357, + "learning_rate": 1.3792203961141313e-06, + "loss": 0.3579, + "step": 8258 + }, + { + "epoch": 3.904964539007092, + "grad_norm": 3.69164776802063, + "learning_rate": 1.378662806286539e-06, + "loss": 0.3712, + "step": 8259 + }, + { + "epoch": 3.9054373522458627, + "grad_norm": 2.8818306922912598, + "learning_rate": 1.3781052862796957e-06, + "loss": 0.3972, + "step": 8260 + }, + { + "epoch": 3.9059101654846335, + "grad_norm": 2.776651382446289, + "learning_rate": 1.377547836128318e-06, + "loss": 0.3605, + "step": 8261 + }, + { + "epoch": 3.9063829787234043, + "grad_norm": 3.1498706340789795, + "learning_rate": 1.376990455867115e-06, + "loss": 0.3995, + "step": 8262 + }, + { + "epoch": 3.906855791962175, + "grad_norm": 2.777390956878662, + "learning_rate": 1.3764331455307916e-06, + "loss": 0.3463, + "step": 8263 + }, + { + "epoch": 3.9073286052009455, + "grad_norm": 2.9953835010528564, + "learning_rate": 1.3758759051540496e-06, + "loss": 0.3881, + "step": 8264 + }, + { + "epoch": 3.9078014184397163, + "grad_norm": 3.737194538116455, + "learning_rate": 1.375318734771585e-06, + "loss": 0.4456, + "step": 8265 + }, + { + "epoch": 3.908274231678487, + "grad_norm": 3.1575849056243896, + "learning_rate": 1.374761634418092e-06, + "loss": 0.3613, + "step": 8266 + }, + { + "epoch": 3.9087470449172574, + "grad_norm": 3.140662908554077, + "learning_rate": 1.374204604128258e-06, + 
"loss": 0.4462, + "step": 8267 + }, + { + "epoch": 3.9092198581560282, + "grad_norm": 3.2106714248657227, + "learning_rate": 1.3736476439367663e-06, + "loss": 0.3801, + "step": 8268 + }, + { + "epoch": 3.909692671394799, + "grad_norm": 2.888345956802368, + "learning_rate": 1.3730907538782976e-06, + "loss": 0.4209, + "step": 8269 + }, + { + "epoch": 3.91016548463357, + "grad_norm": 2.8903355598449707, + "learning_rate": 1.3725339339875252e-06, + "loss": 0.3612, + "step": 8270 + }, + { + "epoch": 3.9106382978723406, + "grad_norm": 3.2661736011505127, + "learning_rate": 1.371977184299122e-06, + "loss": 0.4151, + "step": 8271 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 3.1532459259033203, + "learning_rate": 1.3714205048477535e-06, + "loss": 0.3706, + "step": 8272 + }, + { + "epoch": 3.911583924349882, + "grad_norm": 2.907306432723999, + "learning_rate": 1.3708638956680804e-06, + "loss": 0.4113, + "step": 8273 + }, + { + "epoch": 3.9120567375886526, + "grad_norm": 2.7301599979400635, + "learning_rate": 1.3703073567947622e-06, + "loss": 0.355, + "step": 8274 + }, + { + "epoch": 3.912529550827423, + "grad_norm": 2.595625877380371, + "learning_rate": 1.3697508882624516e-06, + "loss": 0.3733, + "step": 8275 + }, + { + "epoch": 3.9130023640661937, + "grad_norm": 2.784294366836548, + "learning_rate": 1.369194490105796e-06, + "loss": 0.3366, + "step": 8276 + }, + { + "epoch": 3.9134751773049645, + "grad_norm": 3.0179800987243652, + "learning_rate": 1.3686381623594419e-06, + "loss": 0.3922, + "step": 8277 + }, + { + "epoch": 3.9139479905437353, + "grad_norm": 2.6641111373901367, + "learning_rate": 1.3680819050580291e-06, + "loss": 0.3324, + "step": 8278 + }, + { + "epoch": 3.914420803782506, + "grad_norm": 2.917741060256958, + "learning_rate": 1.3675257182361923e-06, + "loss": 0.3784, + "step": 8279 + }, + { + "epoch": 3.9148936170212765, + "grad_norm": 2.959599018096924, + "learning_rate": 1.3669696019285626e-06, + "loss": 0.3846, + "step": 8280 + }, + { + "epoch": 3.9153664302600473, + "grad_norm": 3.078824043273926, + "learning_rate": 1.3664135561697683e-06, + "loss": 0.4357, + "step": 8281 + }, + { + "epoch": 3.915839243498818, + "grad_norm": 3.0174930095672607, + "learning_rate": 1.3658575809944313e-06, + "loss": 0.3643, + "step": 8282 + }, + { + "epoch": 3.9163120567375884, + "grad_norm": 2.6805408000946045, + "learning_rate": 1.365301676437169e-06, + "loss": 0.3193, + "step": 8283 + }, + { + "epoch": 3.916784869976359, + "grad_norm": 2.6996054649353027, + "learning_rate": 1.3647458425325966e-06, + "loss": 0.3378, + "step": 8284 + }, + { + "epoch": 3.91725768321513, + "grad_norm": 2.7950546741485596, + "learning_rate": 1.3641900793153223e-06, + "loss": 0.3864, + "step": 8285 + }, + { + "epoch": 3.917730496453901, + "grad_norm": 2.9658634662628174, + "learning_rate": 1.363634386819951e-06, + "loss": 0.3452, + "step": 8286 + }, + { + "epoch": 3.9182033096926716, + "grad_norm": 3.0684404373168945, + "learning_rate": 1.363078765081084e-06, + "loss": 0.3278, + "step": 8287 + }, + { + "epoch": 3.918676122931442, + "grad_norm": 3.0293614864349365, + "learning_rate": 1.3625232141333164e-06, + "loss": 0.3827, + "step": 8288 + }, + { + "epoch": 3.9191489361702128, + "grad_norm": 2.9969890117645264, + "learning_rate": 1.3619677340112413e-06, + "loss": 0.3412, + "step": 8289 + }, + { + "epoch": 3.9196217494089836, + "grad_norm": 2.991654396057129, + "learning_rate": 1.3614123247494457e-06, + "loss": 0.3683, + "step": 8290 + }, + { + "epoch": 3.920094562647754, + "grad_norm": 3.032158374786377, + 
"learning_rate": 1.360856986382511e-06, + "loss": 0.421, + "step": 8291 + }, + { + "epoch": 3.9205673758865247, + "grad_norm": 3.1413731575012207, + "learning_rate": 1.3603017189450173e-06, + "loss": 0.3818, + "step": 8292 + }, + { + "epoch": 3.9210401891252955, + "grad_norm": 3.295527219772339, + "learning_rate": 1.3597465224715387e-06, + "loss": 0.4828, + "step": 8293 + }, + { + "epoch": 3.9215130023640663, + "grad_norm": 3.116053581237793, + "learning_rate": 1.359191396996643e-06, + "loss": 0.4108, + "step": 8294 + }, + { + "epoch": 3.921985815602837, + "grad_norm": 2.957446336746216, + "learning_rate": 1.3586363425548975e-06, + "loss": 0.3482, + "step": 8295 + }, + { + "epoch": 3.9224586288416075, + "grad_norm": 2.745471715927124, + "learning_rate": 1.3580813591808627e-06, + "loss": 0.4184, + "step": 8296 + }, + { + "epoch": 3.9229314420803783, + "grad_norm": 3.0920722484588623, + "learning_rate": 1.3575264469090943e-06, + "loss": 0.3826, + "step": 8297 + }, + { + "epoch": 3.923404255319149, + "grad_norm": 2.8719749450683594, + "learning_rate": 1.3569716057741444e-06, + "loss": 0.3953, + "step": 8298 + }, + { + "epoch": 3.9238770685579194, + "grad_norm": 3.1278762817382812, + "learning_rate": 1.3564168358105597e-06, + "loss": 0.3658, + "step": 8299 + }, + { + "epoch": 3.92434988179669, + "grad_norm": 2.7752785682678223, + "learning_rate": 1.3558621370528851e-06, + "loss": 0.3447, + "step": 8300 + }, + { + "epoch": 3.924822695035461, + "grad_norm": 2.948575735092163, + "learning_rate": 1.3553075095356575e-06, + "loss": 0.3803, + "step": 8301 + }, + { + "epoch": 3.925295508274232, + "grad_norm": 2.8164193630218506, + "learning_rate": 1.354752953293413e-06, + "loss": 0.3724, + "step": 8302 + }, + { + "epoch": 3.9257683215130026, + "grad_norm": 3.2431271076202393, + "learning_rate": 1.3541984683606798e-06, + "loss": 0.382, + "step": 8303 + }, + { + "epoch": 3.926241134751773, + "grad_norm": 2.8485286235809326, + "learning_rate": 1.353644054771983e-06, + "loss": 0.3632, + "step": 8304 + }, + { + "epoch": 3.9267139479905437, + "grad_norm": 3.334914445877075, + "learning_rate": 1.3530897125618456e-06, + "loss": 0.5286, + "step": 8305 + }, + { + "epoch": 3.9271867612293145, + "grad_norm": 3.3895132541656494, + "learning_rate": 1.3525354417647815e-06, + "loss": 0.3838, + "step": 8306 + }, + { + "epoch": 3.927659574468085, + "grad_norm": 3.141935110092163, + "learning_rate": 1.351981242415305e-06, + "loss": 0.3928, + "step": 8307 + }, + { + "epoch": 3.9281323877068557, + "grad_norm": 3.3013596534729004, + "learning_rate": 1.3514271145479225e-06, + "loss": 0.4046, + "step": 8308 + }, + { + "epoch": 3.9286052009456265, + "grad_norm": 2.8704745769500732, + "learning_rate": 1.3508730581971363e-06, + "loss": 0.3542, + "step": 8309 + }, + { + "epoch": 3.9290780141843973, + "grad_norm": 3.179405689239502, + "learning_rate": 1.3503190733974472e-06, + "loss": 0.3911, + "step": 8310 + }, + { + "epoch": 3.929550827423168, + "grad_norm": 3.1091885566711426, + "learning_rate": 1.3497651601833481e-06, + "loss": 0.3552, + "step": 8311 + }, + { + "epoch": 3.9300236406619384, + "grad_norm": 2.687678813934326, + "learning_rate": 1.3492113185893288e-06, + "loss": 0.3462, + "step": 8312 + }, + { + "epoch": 3.9304964539007092, + "grad_norm": 3.4954965114593506, + "learning_rate": 1.3486575486498749e-06, + "loss": 0.4358, + "step": 8313 + }, + { + "epoch": 3.93096926713948, + "grad_norm": 2.8652899265289307, + "learning_rate": 1.3481038503994652e-06, + "loss": 0.3434, + "step": 8314 + }, + { + "epoch": 
3.9314420803782504, + "grad_norm": 3.927623748779297, + "learning_rate": 1.3475502238725797e-06, + "loss": 0.4662, + "step": 8315 + }, + { + "epoch": 3.931914893617021, + "grad_norm": 3.1166276931762695, + "learning_rate": 1.346996669103687e-06, + "loss": 0.3953, + "step": 8316 + }, + { + "epoch": 3.932387706855792, + "grad_norm": 3.140003204345703, + "learning_rate": 1.346443186127257e-06, + "loss": 0.3616, + "step": 8317 + }, + { + "epoch": 3.9328605200945628, + "grad_norm": 3.335466146469116, + "learning_rate": 1.3458897749777516e-06, + "loss": 0.3854, + "step": 8318 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 2.8305466175079346, + "learning_rate": 1.3453364356896282e-06, + "loss": 0.374, + "step": 8319 + }, + { + "epoch": 3.933806146572104, + "grad_norm": 2.9511806964874268, + "learning_rate": 1.344783168297343e-06, + "loss": 0.4235, + "step": 8320 + }, + { + "epoch": 3.9342789598108747, + "grad_norm": 3.1868233680725098, + "learning_rate": 1.3442299728353448e-06, + "loss": 0.4384, + "step": 8321 + }, + { + "epoch": 3.9347517730496455, + "grad_norm": 3.1358237266540527, + "learning_rate": 1.3436768493380766e-06, + "loss": 0.4011, + "step": 8322 + }, + { + "epoch": 3.935224586288416, + "grad_norm": 3.126192808151245, + "learning_rate": 1.343123797839982e-06, + "loss": 0.4061, + "step": 8323 + }, + { + "epoch": 3.9356973995271867, + "grad_norm": 2.9724647998809814, + "learning_rate": 1.3425708183754949e-06, + "loss": 0.3859, + "step": 8324 + }, + { + "epoch": 3.9361702127659575, + "grad_norm": 3.1526355743408203, + "learning_rate": 1.3420179109790485e-06, + "loss": 0.3543, + "step": 8325 + }, + { + "epoch": 3.9366430260047283, + "grad_norm": 3.1289172172546387, + "learning_rate": 1.3414650756850695e-06, + "loss": 0.3836, + "step": 8326 + }, + { + "epoch": 3.937115839243499, + "grad_norm": 2.851264715194702, + "learning_rate": 1.34091231252798e-06, + "loss": 0.3294, + "step": 8327 + }, + { + "epoch": 3.9375886524822694, + "grad_norm": 2.921872138977051, + "learning_rate": 1.3403596215421981e-06, + "loss": 0.3698, + "step": 8328 + }, + { + "epoch": 3.93806146572104, + "grad_norm": 2.947258234024048, + "learning_rate": 1.339807002762137e-06, + "loss": 0.3616, + "step": 8329 + }, + { + "epoch": 3.938534278959811, + "grad_norm": 3.011021375656128, + "learning_rate": 1.3392544562222077e-06, + "loss": 0.3387, + "step": 8330 + }, + { + "epoch": 3.9390070921985814, + "grad_norm": 3.5230746269226074, + "learning_rate": 1.3387019819568134e-06, + "loss": 0.4054, + "step": 8331 + }, + { + "epoch": 3.939479905437352, + "grad_norm": 3.120321035385132, + "learning_rate": 1.3381495800003536e-06, + "loss": 0.4389, + "step": 8332 + }, + { + "epoch": 3.939952718676123, + "grad_norm": 3.0090999603271484, + "learning_rate": 1.3375972503872259e-06, + "loss": 0.4158, + "step": 8333 + }, + { + "epoch": 3.9404255319148938, + "grad_norm": 3.4807989597320557, + "learning_rate": 1.3370449931518198e-06, + "loss": 0.4144, + "step": 8334 + }, + { + "epoch": 3.9408983451536646, + "grad_norm": 2.8535733222961426, + "learning_rate": 1.336492808328523e-06, + "loss": 0.4281, + "step": 8335 + }, + { + "epoch": 3.941371158392435, + "grad_norm": 2.9032745361328125, + "learning_rate": 1.3359406959517174e-06, + "loss": 0.3389, + "step": 8336 + }, + { + "epoch": 3.9418439716312057, + "grad_norm": 2.725823163986206, + "learning_rate": 1.3353886560557793e-06, + "loss": 0.369, + "step": 8337 + }, + { + "epoch": 3.9423167848699765, + "grad_norm": 3.1965179443359375, + "learning_rate": 1.3348366886750844e-06, + "loss": 
0.4031, + "step": 8338 + }, + { + "epoch": 3.942789598108747, + "grad_norm": 2.6991076469421387, + "learning_rate": 1.3342847938439985e-06, + "loss": 0.3434, + "step": 8339 + }, + { + "epoch": 3.9432624113475176, + "grad_norm": 4.491400718688965, + "learning_rate": 1.3337329715968877e-06, + "loss": 0.4175, + "step": 8340 + }, + { + "epoch": 3.9437352245862884, + "grad_norm": 4.005452632904053, + "learning_rate": 1.3331812219681112e-06, + "loss": 0.4191, + "step": 8341 + }, + { + "epoch": 3.9442080378250592, + "grad_norm": 3.1575794219970703, + "learning_rate": 1.3326295449920238e-06, + "loss": 0.4135, + "step": 8342 + }, + { + "epoch": 3.94468085106383, + "grad_norm": 3.2383973598480225, + "learning_rate": 1.3320779407029755e-06, + "loss": 0.38, + "step": 8343 + }, + { + "epoch": 3.9451536643026004, + "grad_norm": 2.873703718185425, + "learning_rate": 1.3315264091353119e-06, + "loss": 0.4128, + "step": 8344 + }, + { + "epoch": 3.945626477541371, + "grad_norm": 2.947274923324585, + "learning_rate": 1.330974950323376e-06, + "loss": 0.3342, + "step": 8345 + }, + { + "epoch": 3.946099290780142, + "grad_norm": 3.2874088287353516, + "learning_rate": 1.330423564301504e-06, + "loss": 0.3849, + "step": 8346 + }, + { + "epoch": 3.9465721040189123, + "grad_norm": 2.885772466659546, + "learning_rate": 1.3298722511040275e-06, + "loss": 0.3562, + "step": 8347 + }, + { + "epoch": 3.947044917257683, + "grad_norm": 3.0031309127807617, + "learning_rate": 1.3293210107652753e-06, + "loss": 0.3593, + "step": 8348 + }, + { + "epoch": 3.947517730496454, + "grad_norm": 2.815854549407959, + "learning_rate": 1.3287698433195712e-06, + "loss": 0.3633, + "step": 8349 + }, + { + "epoch": 3.9479905437352247, + "grad_norm": 3.228415012359619, + "learning_rate": 1.328218748801232e-06, + "loss": 0.3869, + "step": 8350 + }, + { + "epoch": 3.9484633569739955, + "grad_norm": 2.8497684001922607, + "learning_rate": 1.3276677272445743e-06, + "loss": 0.3833, + "step": 8351 + }, + { + "epoch": 3.948936170212766, + "grad_norm": 3.2330706119537354, + "learning_rate": 1.3271167786839057e-06, + "loss": 0.4414, + "step": 8352 + }, + { + "epoch": 3.9494089834515367, + "grad_norm": 3.2179152965545654, + "learning_rate": 1.3265659031535332e-06, + "loss": 0.3821, + "step": 8353 + }, + { + "epoch": 3.9498817966903075, + "grad_norm": 3.0861377716064453, + "learning_rate": 1.3260151006877567e-06, + "loss": 0.4014, + "step": 8354 + }, + { + "epoch": 3.950354609929078, + "grad_norm": 3.118872880935669, + "learning_rate": 1.325464371320871e-06, + "loss": 0.3836, + "step": 8355 + }, + { + "epoch": 3.9508274231678486, + "grad_norm": 2.9787702560424805, + "learning_rate": 1.32491371508717e-06, + "loss": 0.3794, + "step": 8356 + }, + { + "epoch": 3.9513002364066194, + "grad_norm": 3.132089853286743, + "learning_rate": 1.3243631320209387e-06, + "loss": 0.3698, + "step": 8357 + }, + { + "epoch": 3.9517730496453902, + "grad_norm": 2.5304882526397705, + "learning_rate": 1.32381262215646e-06, + "loss": 0.3687, + "step": 8358 + }, + { + "epoch": 3.952245862884161, + "grad_norm": 2.9121861457824707, + "learning_rate": 1.3232621855280126e-06, + "loss": 0.3704, + "step": 8359 + }, + { + "epoch": 3.9527186761229314, + "grad_norm": 3.0885608196258545, + "learning_rate": 1.3227118221698688e-06, + "loss": 0.4303, + "step": 8360 + }, + { + "epoch": 3.953191489361702, + "grad_norm": 2.7274837493896484, + "learning_rate": 1.3221615321162979e-06, + "loss": 0.3556, + "step": 8361 + }, + { + "epoch": 3.953664302600473, + "grad_norm": 3.1329922676086426, + 
"learning_rate": 1.3216113154015625e-06, + "loss": 0.4042, + "step": 8362 + }, + { + "epoch": 3.9541371158392433, + "grad_norm": 2.937380313873291, + "learning_rate": 1.3210611720599243e-06, + "loss": 0.3358, + "step": 8363 + }, + { + "epoch": 3.954609929078014, + "grad_norm": 2.939194440841675, + "learning_rate": 1.3205111021256378e-06, + "loss": 0.3885, + "step": 8364 + }, + { + "epoch": 3.955082742316785, + "grad_norm": 2.9151997566223145, + "learning_rate": 1.3199611056329516e-06, + "loss": 0.4094, + "step": 8365 + }, + { + "epoch": 3.9555555555555557, + "grad_norm": 3.029733419418335, + "learning_rate": 1.3194111826161143e-06, + "loss": 0.3999, + "step": 8366 + }, + { + "epoch": 3.9560283687943265, + "grad_norm": 2.7899951934814453, + "learning_rate": 1.3188613331093653e-06, + "loss": 0.321, + "step": 8367 + }, + { + "epoch": 3.956501182033097, + "grad_norm": 3.1109507083892822, + "learning_rate": 1.3183115571469425e-06, + "loss": 0.4266, + "step": 8368 + }, + { + "epoch": 3.9569739952718677, + "grad_norm": 3.085594415664673, + "learning_rate": 1.3177618547630774e-06, + "loss": 0.4412, + "step": 8369 + }, + { + "epoch": 3.9574468085106385, + "grad_norm": 3.0980300903320312, + "learning_rate": 1.3172122259919968e-06, + "loss": 0.3385, + "step": 8370 + }, + { + "epoch": 3.957919621749409, + "grad_norm": 3.103438138961792, + "learning_rate": 1.3166626708679256e-06, + "loss": 0.3887, + "step": 8371 + }, + { + "epoch": 3.9583924349881796, + "grad_norm": 2.8235526084899902, + "learning_rate": 1.3161131894250812e-06, + "loss": 0.3759, + "step": 8372 + }, + { + "epoch": 3.9588652482269504, + "grad_norm": 2.8316404819488525, + "learning_rate": 1.3155637816976762e-06, + "loss": 0.3666, + "step": 8373 + }, + { + "epoch": 3.959338061465721, + "grad_norm": 2.7873756885528564, + "learning_rate": 1.3150144477199218e-06, + "loss": 0.3284, + "step": 8374 + }, + { + "epoch": 3.959810874704492, + "grad_norm": 3.355039119720459, + "learning_rate": 1.3144651875260218e-06, + "loss": 0.4197, + "step": 8375 + }, + { + "epoch": 3.9602836879432624, + "grad_norm": 3.477721929550171, + "learning_rate": 1.3139160011501761e-06, + "loss": 0.3298, + "step": 8376 + }, + { + "epoch": 3.960756501182033, + "grad_norm": 3.557152032852173, + "learning_rate": 1.3133668886265805e-06, + "loss": 0.3788, + "step": 8377 + }, + { + "epoch": 3.961229314420804, + "grad_norm": 3.06707763671875, + "learning_rate": 1.312817849989424e-06, + "loss": 0.3613, + "step": 8378 + }, + { + "epoch": 3.9617021276595743, + "grad_norm": 2.7702202796936035, + "learning_rate": 1.3122688852728956e-06, + "loss": 0.402, + "step": 8379 + }, + { + "epoch": 3.962174940898345, + "grad_norm": 2.8121016025543213, + "learning_rate": 1.3117199945111746e-06, + "loss": 0.3576, + "step": 8380 + }, + { + "epoch": 3.962647754137116, + "grad_norm": 2.809282064437866, + "learning_rate": 1.3111711777384403e-06, + "loss": 0.3741, + "step": 8381 + }, + { + "epoch": 3.9631205673758867, + "grad_norm": 3.1175687313079834, + "learning_rate": 1.3106224349888638e-06, + "loss": 0.3388, + "step": 8382 + }, + { + "epoch": 3.963593380614657, + "grad_norm": 2.930525064468384, + "learning_rate": 1.310073766296612e-06, + "loss": 0.3593, + "step": 8383 + }, + { + "epoch": 3.964066193853428, + "grad_norm": 3.0673177242279053, + "learning_rate": 1.3095251716958501e-06, + "loss": 0.402, + "step": 8384 + }, + { + "epoch": 3.9645390070921986, + "grad_norm": 2.9725706577301025, + "learning_rate": 1.3089766512207347e-06, + "loss": 0.3707, + "step": 8385 + }, + { + "epoch": 
3.965011820330969, + "grad_norm": 2.9790916442871094, + "learning_rate": 1.3084282049054218e-06, + "loss": 0.3292, + "step": 8386 + }, + { + "epoch": 3.96548463356974, + "grad_norm": 3.257035493850708, + "learning_rate": 1.3078798327840598e-06, + "loss": 0.3753, + "step": 8387 + }, + { + "epoch": 3.9659574468085106, + "grad_norm": 3.0534379482269287, + "learning_rate": 1.307331534890792e-06, + "loss": 0.4134, + "step": 8388 + }, + { + "epoch": 3.9664302600472814, + "grad_norm": 2.919243812561035, + "learning_rate": 1.306783311259761e-06, + "loss": 0.4283, + "step": 8389 + }, + { + "epoch": 3.966903073286052, + "grad_norm": 2.7643322944641113, + "learning_rate": 1.306235161925101e-06, + "loss": 0.3454, + "step": 8390 + }, + { + "epoch": 3.9673758865248225, + "grad_norm": 3.0208916664123535, + "learning_rate": 1.3056870869209431e-06, + "loss": 0.385, + "step": 8391 + }, + { + "epoch": 3.9678486997635933, + "grad_norm": 2.8657243251800537, + "learning_rate": 1.3051390862814135e-06, + "loss": 0.3614, + "step": 8392 + }, + { + "epoch": 3.968321513002364, + "grad_norm": 3.2093591690063477, + "learning_rate": 1.3045911600406325e-06, + "loss": 0.3774, + "step": 8393 + }, + { + "epoch": 3.9687943262411345, + "grad_norm": 3.091618537902832, + "learning_rate": 1.3040433082327192e-06, + "loss": 0.4157, + "step": 8394 + }, + { + "epoch": 3.9692671394799053, + "grad_norm": 2.99763560295105, + "learning_rate": 1.3034955308917849e-06, + "loss": 0.4017, + "step": 8395 + }, + { + "epoch": 3.969739952718676, + "grad_norm": 3.063109874725342, + "learning_rate": 1.3029478280519364e-06, + "loss": 0.4568, + "step": 8396 + }, + { + "epoch": 3.970212765957447, + "grad_norm": 3.2660679817199707, + "learning_rate": 1.3024001997472791e-06, + "loss": 0.3999, + "step": 8397 + }, + { + "epoch": 3.9706855791962177, + "grad_norm": 2.860121250152588, + "learning_rate": 1.3018526460119088e-06, + "loss": 0.433, + "step": 8398 + }, + { + "epoch": 3.971158392434988, + "grad_norm": 3.1037673950195312, + "learning_rate": 1.3013051668799216e-06, + "loss": 0.4526, + "step": 8399 + }, + { + "epoch": 3.971631205673759, + "grad_norm": 2.9408578872680664, + "learning_rate": 1.3007577623854053e-06, + "loss": 0.3722, + "step": 8400 + }, + { + "epoch": 3.9721040189125296, + "grad_norm": 3.0684635639190674, + "learning_rate": 1.3002104325624436e-06, + "loss": 0.3789, + "step": 8401 + }, + { + "epoch": 3.9725768321513, + "grad_norm": 2.6469366550445557, + "learning_rate": 1.2996631774451187e-06, + "loss": 0.3409, + "step": 8402 + }, + { + "epoch": 3.9730496453900708, + "grad_norm": 3.3741610050201416, + "learning_rate": 1.2991159970675033e-06, + "loss": 0.3544, + "step": 8403 + }, + { + "epoch": 3.9735224586288416, + "grad_norm": 3.3716588020324707, + "learning_rate": 1.2985688914636701e-06, + "loss": 0.3747, + "step": 8404 + }, + { + "epoch": 3.9739952718676124, + "grad_norm": 3.000469923019409, + "learning_rate": 1.2980218606676837e-06, + "loss": 0.4506, + "step": 8405 + }, + { + "epoch": 3.974468085106383, + "grad_norm": 3.0139408111572266, + "learning_rate": 1.2974749047136057e-06, + "loss": 0.4156, + "step": 8406 + }, + { + "epoch": 3.9749408983451535, + "grad_norm": 2.9494218826293945, + "learning_rate": 1.2969280236354925e-06, + "loss": 0.3378, + "step": 8407 + }, + { + "epoch": 3.9754137115839243, + "grad_norm": 2.6061158180236816, + "learning_rate": 1.2963812174673948e-06, + "loss": 0.3887, + "step": 8408 + }, + { + "epoch": 3.975886524822695, + "grad_norm": 2.873987913131714, + "learning_rate": 1.295834486243362e-06, + "loss": 
0.3202, + "step": 8409 + }, + { + "epoch": 3.9763593380614655, + "grad_norm": 3.0106539726257324, + "learning_rate": 1.2952878299974358e-06, + "loss": 0.4142, + "step": 8410 + }, + { + "epoch": 3.9768321513002363, + "grad_norm": 3.0011982917785645, + "learning_rate": 1.2947412487636527e-06, + "loss": 0.4121, + "step": 8411 + }, + { + "epoch": 3.977304964539007, + "grad_norm": 3.1321003437042236, + "learning_rate": 1.294194742576048e-06, + "loss": 0.4033, + "step": 8412 + }, + { + "epoch": 3.977777777777778, + "grad_norm": 2.812255382537842, + "learning_rate": 1.2936483114686487e-06, + "loss": 0.3414, + "step": 8413 + }, + { + "epoch": 3.9782505910165487, + "grad_norm": 2.9594221115112305, + "learning_rate": 1.2931019554754804e-06, + "loss": 0.3666, + "step": 8414 + }, + { + "epoch": 3.978723404255319, + "grad_norm": 3.119440793991089, + "learning_rate": 1.2925556746305612e-06, + "loss": 0.3902, + "step": 8415 + }, + { + "epoch": 3.97919621749409, + "grad_norm": 3.042102098464966, + "learning_rate": 1.2920094689679047e-06, + "loss": 0.344, + "step": 8416 + }, + { + "epoch": 3.9796690307328606, + "grad_norm": 2.8443872928619385, + "learning_rate": 1.2914633385215225e-06, + "loss": 0.372, + "step": 8417 + }, + { + "epoch": 3.980141843971631, + "grad_norm": 3.483201265335083, + "learning_rate": 1.2909172833254187e-06, + "loss": 0.4028, + "step": 8418 + }, + { + "epoch": 3.9806146572104018, + "grad_norm": 2.966996431350708, + "learning_rate": 1.2903713034135934e-06, + "loss": 0.3527, + "step": 8419 + }, + { + "epoch": 3.9810874704491725, + "grad_norm": 2.7813172340393066, + "learning_rate": 1.2898253988200437e-06, + "loss": 0.3873, + "step": 8420 + }, + { + "epoch": 3.9815602836879433, + "grad_norm": 3.24611234664917, + "learning_rate": 1.2892795695787602e-06, + "loss": 0.4783, + "step": 8421 + }, + { + "epoch": 3.982033096926714, + "grad_norm": 3.345573663711548, + "learning_rate": 1.2887338157237289e-06, + "loss": 0.4179, + "step": 8422 + }, + { + "epoch": 3.9825059101654845, + "grad_norm": 3.1726880073547363, + "learning_rate": 1.288188137288931e-06, + "loss": 0.3725, + "step": 8423 + }, + { + "epoch": 3.9829787234042553, + "grad_norm": 3.398966073989868, + "learning_rate": 1.2876425343083449e-06, + "loss": 0.4117, + "step": 8424 + }, + { + "epoch": 3.983451536643026, + "grad_norm": 2.615680456161499, + "learning_rate": 1.2870970068159423e-06, + "loss": 0.324, + "step": 8425 + }, + { + "epoch": 3.9839243498817964, + "grad_norm": 3.0505547523498535, + "learning_rate": 1.2865515548456893e-06, + "loss": 0.3698, + "step": 8426 + }, + { + "epoch": 3.9843971631205672, + "grad_norm": 3.077404260635376, + "learning_rate": 1.2860061784315514e-06, + "loss": 0.3592, + "step": 8427 + }, + { + "epoch": 3.984869976359338, + "grad_norm": 2.654080390930176, + "learning_rate": 1.2854608776074855e-06, + "loss": 0.3451, + "step": 8428 + }, + { + "epoch": 3.985342789598109, + "grad_norm": 3.1023523807525635, + "learning_rate": 1.284915652407444e-06, + "loss": 0.3809, + "step": 8429 + }, + { + "epoch": 3.9858156028368796, + "grad_norm": 3.0526652336120605, + "learning_rate": 1.2843705028653783e-06, + "loss": 0.3633, + "step": 8430 + }, + { + "epoch": 3.98628841607565, + "grad_norm": 2.7829604148864746, + "learning_rate": 1.2838254290152296e-06, + "loss": 0.3213, + "step": 8431 + }, + { + "epoch": 3.986761229314421, + "grad_norm": 3.2218687534332275, + "learning_rate": 1.28328043089094e-06, + "loss": 0.465, + "step": 8432 + }, + { + "epoch": 3.9872340425531916, + "grad_norm": 2.952998161315918, + "learning_rate": 
1.2827355085264425e-06, + "loss": 0.4405, + "step": 8433 + }, + { + "epoch": 3.987706855791962, + "grad_norm": 2.81211519241333, + "learning_rate": 1.2821906619556667e-06, + "loss": 0.3444, + "step": 8434 + }, + { + "epoch": 3.9881796690307327, + "grad_norm": 3.1707375049591064, + "learning_rate": 1.281645891212539e-06, + "loss": 0.4019, + "step": 8435 + }, + { + "epoch": 3.9886524822695035, + "grad_norm": 2.791504383087158, + "learning_rate": 1.2811011963309788e-06, + "loss": 0.3606, + "step": 8436 + }, + { + "epoch": 3.9891252955082743, + "grad_norm": 2.954782247543335, + "learning_rate": 1.280556577344903e-06, + "loss": 0.3141, + "step": 8437 + }, + { + "epoch": 3.989598108747045, + "grad_norm": 2.718273878097534, + "learning_rate": 1.2800120342882223e-06, + "loss": 0.3715, + "step": 8438 + }, + { + "epoch": 3.9900709219858155, + "grad_norm": 3.2916250228881836, + "learning_rate": 1.2794675671948425e-06, + "loss": 0.4048, + "step": 8439 + }, + { + "epoch": 3.9905437352245863, + "grad_norm": 3.060060977935791, + "learning_rate": 1.2789231760986655e-06, + "loss": 0.4032, + "step": 8440 + }, + { + "epoch": 3.991016548463357, + "grad_norm": 2.8467273712158203, + "learning_rate": 1.2783788610335882e-06, + "loss": 0.4041, + "step": 8441 + }, + { + "epoch": 3.9914893617021274, + "grad_norm": 3.161790132522583, + "learning_rate": 1.2778346220335013e-06, + "loss": 0.4049, + "step": 8442 + }, + { + "epoch": 3.9919621749408982, + "grad_norm": 2.6512296199798584, + "learning_rate": 1.277290459132295e-06, + "loss": 0.3598, + "step": 8443 + }, + { + "epoch": 3.992434988179669, + "grad_norm": 2.792736291885376, + "learning_rate": 1.276746372363849e-06, + "loss": 0.3874, + "step": 8444 + }, + { + "epoch": 3.99290780141844, + "grad_norm": 2.887047052383423, + "learning_rate": 1.2762023617620433e-06, + "loss": 0.4255, + "step": 8445 + }, + { + "epoch": 3.9933806146572106, + "grad_norm": 3.0420780181884766, + "learning_rate": 1.275658427360751e-06, + "loss": 0.4489, + "step": 8446 + }, + { + "epoch": 3.993853427895981, + "grad_norm": 3.107618570327759, + "learning_rate": 1.2751145691938383e-06, + "loss": 0.4354, + "step": 8447 + }, + { + "epoch": 3.9943262411347518, + "grad_norm": 2.656224250793457, + "learning_rate": 1.2745707872951718e-06, + "loss": 0.4188, + "step": 8448 + }, + { + "epoch": 3.9947990543735226, + "grad_norm": 2.9895219802856445, + "learning_rate": 1.2740270816986079e-06, + "loss": 0.391, + "step": 8449 + }, + { + "epoch": 3.995271867612293, + "grad_norm": 2.919255018234253, + "learning_rate": 1.2734834524380025e-06, + "loss": 0.4058, + "step": 8450 + }, + { + "epoch": 3.9957446808510637, + "grad_norm": 3.4418535232543945, + "learning_rate": 1.2729398995472048e-06, + "loss": 0.3977, + "step": 8451 + }, + { + "epoch": 3.9962174940898345, + "grad_norm": 2.980224132537842, + "learning_rate": 1.272396423060058e-06, + "loss": 0.4417, + "step": 8452 + }, + { + "epoch": 3.9966903073286053, + "grad_norm": 3.6488101482391357, + "learning_rate": 1.2718530230104043e-06, + "loss": 0.4472, + "step": 8453 + }, + { + "epoch": 3.997163120567376, + "grad_norm": 2.725437641143799, + "learning_rate": 1.2713096994320774e-06, + "loss": 0.3125, + "step": 8454 + }, + { + "epoch": 3.9976359338061465, + "grad_norm": 3.453794002532959, + "learning_rate": 1.2707664523589076e-06, + "loss": 0.3792, + "step": 8455 + }, + { + "epoch": 3.9981087470449173, + "grad_norm": 2.8443076610565186, + "learning_rate": 1.270223281824721e-06, + "loss": 0.3627, + "step": 8456 + }, + { + "epoch": 3.998581560283688, + "grad_norm": 
3.1851959228515625, + "learning_rate": 1.2696801878633372e-06, + "loss": 0.3745, + "step": 8457 + }, + { + "epoch": 3.9990543735224584, + "grad_norm": 2.897239923477173, + "learning_rate": 1.2691371705085743e-06, + "loss": 0.3817, + "step": 8458 + }, + { + "epoch": 3.999527186761229, + "grad_norm": 2.92111873626709, + "learning_rate": 1.2685942297942416e-06, + "loss": 0.3824, + "step": 8459 + }, + { + "epoch": 4.0, + "grad_norm": 2.9540340900421143, + "learning_rate": 1.268051365754148e-06, + "loss": 0.3656, + "step": 8460 + } + ], + "logging_steps": 1, + "max_steps": 12690, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 2115, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1369638945418117e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-8460/training_args.bin b/checkpoint-8460/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3885e6a27b204c834ff6fbc6f3a5a4c7f69fe2cc --- /dev/null +++ b/checkpoint-8460/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51188d498a3fe1bf9ddf234271786a31e25e89b3aa68721a97f8264e1013c9b6 +size 8056 diff --git a/checkpoint-8460/zero_to_fp32.py b/checkpoint-8460/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-8460/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
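+# Three entry points are defined below: get_fp32_state_dict_from_zero_checkpoint (returns a consolidated
+# fp32 state_dict), convert_zero_checkpoint_to_fp32_state_dict (writes that state_dict to a file), and
+# load_state_dict_from_zero_checkpoint (loads it into an existing model object).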
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + 
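+    # The loop below loads every rank's "*_optim_states.pt" shard onto CPU; parse_optim_states then
+    # returns (zero_stage, world_size, fp32_flat_groups), where fp32_flat_groups holds each rank's flat
+    # fp32 master-weight partition(s) that the merge helpers later stitch back into full parameters.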
state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + 
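+    # Under ZeRO-2 frozen parameters are not partitioned, so rank 0's fragments already hold the
+    # complete tensors and are copied straight into the consolidated state_dict below.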
frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * 
world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. 
i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bc1fab23ab76663db86132a9d92014a99265286d --- /dev/null +++ b/config.json @@ -0,0 +1,52 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": true, + "_load_in_8bit": false, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_storage": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": true, + "load_in_8bit": false, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + 
} +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81dd14db6632ad5b35b9d447732e37ac074873a5 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": 
"<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": 
"<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": 
"<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": 
"<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": 
"<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": 
"<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": 
"<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": 
"<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": 
"<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": 
"<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": 
"<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": 
"<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +}